From ace9429bb58fd418f0c81d4c2835699bddf6bde6 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Thu, 11 Apr 2024 10:27:49 +0200
Subject: Adding upstream version 6.6.15.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 net/ipv4/Kconfig                            |  753 +++
 net/ipv4/Makefile                           |   74 +
 net/ipv4/af_inet.c                          | 2117 ++++++++
 net/ipv4/ah4.c                              |  604 +++
 net/ipv4/arp.c                              | 1472 ++++++
 net/ipv4/bpf_tcp_ca.c                       |  290 ++
 net/ipv4/bpfilter/Makefile                  |    2 +
 net/ipv4/bpfilter/sockopt.c                 |   71 +
 net/ipv4/cipso_ipv4.c                       | 2295 +++++++++
 net/ipv4/datagram.c                         |  129 +
 net/ipv4/devinet.c                          | 2816 +++++++++++
 net/ipv4/esp4.c                             | 1251 +++++
 net/ipv4/esp4_offload.c                     |  386 ++
 net/ipv4/fib_frontend.c                     | 1663 +++++++
 net/ipv4/fib_lookup.h                       |   63 +
 net/ipv4/fib_notifier.c                     |   72 +
 net/ipv4/fib_rules.c                        |  436 ++
 net/ipv4/fib_semantics.c                    | 2275 +++++++++
 net/ipv4/fib_trie.c                         | 3067 ++++++++++++
 net/ipv4/fou_bpf.c                          |  119 +
 net/ipv4/fou_core.c                         | 1266 +++++
 net/ipv4/fou_nl.c                           |   48 +
 net/ipv4/fou_nl.h                           |   25 +
 net/ipv4/gre_demux.c                        |  221 +
 net/ipv4/gre_offload.c                      |  287 ++
 net/ipv4/icmp.c                             | 1510 ++++++
 net/ipv4/igmp.c                             | 3110 ++++++++++++
 net/ipv4/inet_connection_sock.c             | 1501 ++++++
 net/ipv4/inet_diag.c                        | 1485 ++++++
 net/ipv4/inet_fragment.c                    |  608 +++
 net/ipv4/inet_hashtables.c                  | 1277 +++++
 net/ipv4/inet_timewait_sock.c               |  339 ++
 net/ipv4/inetpeer.c                         |  308 ++
 net/ipv4/ip_forward.c                       |  181 +
 net/ipv4/ip_fragment.c                      |  761 +++
 net/ipv4/ip_gre.c                           | 1802 +++++++
 net/ipv4/ip_input.c                         |  675 +++
 net/ipv4/ip_options.c                       |  641 +++
 net/ipv4/ip_output.c                        | 1663 +++++++
 net/ipv4/ip_sockglue.c                      | 1796 +++++++
 net/ipv4/ip_tunnel.c                        | 1301 +++++
 net/ipv4/ip_tunnel_core.c                   | 1148 +++++
 net/ipv4/ip_vti.c                           |  726 +++
 net/ipv4/ipcomp.c                           |  204 +
 net/ipv4/ipconfig.c                         | 1851 +++++++
 net/ipv4/ipip.c                             |  663 +++
 net/ipv4/ipmr.c                             | 3181 ++++++++++++
 net/ipv4/ipmr_base.c                        |  448 ++
 net/ipv4/metrics.c                          |   91 +
 net/ipv4/netfilter.c                        |   95 +
 net/ipv4/netfilter/Kconfig                  |  344 ++
 net/ipv4/netfilter/Makefile                 |   53 +
 net/ipv4/netfilter/arp_tables.c             | 1667 +++++++
 net/ipv4/netfilter/arpt_mangle.c            |   92 +
 net/ipv4/netfilter/arptable_filter.c        |   91 +
 net/ipv4/netfilter/ip_tables.c              | 1886 +++++++
 net/ipv4/netfilter/ipt_ECN.c                |  133 +
 net/ipv4/netfilter/ipt_REJECT.c             |  111 +
 net/ipv4/netfilter/ipt_SYNPROXY.c           |  121 +
 net/ipv4/netfilter/ipt_ah.c                 |   88 +
 net/ipv4/netfilter/ipt_rpfilter.c           |  126 +
 net/ipv4/netfilter/iptable_filter.c         |  110 +
 net/ipv4/netfilter/iptable_mangle.c         |  143 +
 net/ipv4/netfilter/iptable_nat.c            |  172 +
 net/ipv4/netfilter/iptable_raw.c            |  110 +
 net/ipv4/netfilter/iptable_security.c       |   98 +
 net/ipv4/netfilter/nf_defrag_ipv4.c         |  188 +
 net/ipv4/netfilter/nf_dup_ipv4.c            |   99 +
 net/ipv4/netfilter/nf_nat_h323.c            |  567 +++
 net/ipv4/netfilter/nf_nat_pptp.c            |  320 ++
 net/ipv4/netfilter/nf_nat_snmp_basic.asn1   |  177 +
 net/ipv4/netfilter/nf_nat_snmp_basic_main.c |  231 +
 net/ipv4/netfilter/nf_reject_ipv4.c         |  341 ++
 net/ipv4/netfilter/nf_socket_ipv4.c         |  153 +
 net/ipv4/netfilter/nf_tproxy_ipv4.c         |  152 +
 net/ipv4/netfilter/nft_dup_ipv4.c           |  112 +
 net/ipv4/netfilter/nft_fib_ipv4.c           |  222 +
 net/ipv4/netfilter/nft_reject_ipv4.c        |   76 +
 net/ipv4/netlink.c                          |   33 +
 net/ipv4/nexthop.c                          | 3792 ++++++++++++++
 net/ipv4/ping.c                             | 1196 +++++
 net/ipv4/proc.c                             |  560 +++
 net/ipv4/protocol.c                         |   70 +
 net/ipv4/raw.c                              | 1110 +++++
 net/ipv4/raw_diag.c                         |  261 +
 net/ipv4/route.c                            | 3789 ++++++++++++++
 net/ipv4/syncookies.c                       |  449 ++
 net/ipv4/sysctl_net_ipv4.c                  | 1573 ++++++
 net/ipv4/tcp.c                              | 4729 ++++++++++++++++++
 net/ipv4/tcp_bbr.c                          | 1202 +++++
 net/ipv4/tcp_bic.c                          |  229 +
 net/ipv4/tcp_bpf.c                          |  700 +++
 net/ipv4/tcp_cdg.c                          |  428 ++
 net/ipv4/tcp_cong.c                         |  540 ++
 net/ipv4/tcp_cubic.c                        |  557 +++
 net/ipv4/tcp_dctcp.c                        |  306 ++
 net/ipv4/tcp_dctcp.h                        |   40 +
 net/ipv4/tcp_diag.c                         |  250 +
 net/ipv4/tcp_fastopen.c                     |  595 +++
 net/ipv4/tcp_highspeed.c                    |  186 +
 net/ipv4/tcp_htcp.c                         |  317 ++
 net/ipv4/tcp_hybla.c                        |  194 +
 net/ipv4/tcp_illinois.c                     |  360 ++
 net/ipv4/tcp_input.c                        | 7113 +++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c                         | 3411 +++++++++++++
 net/ipv4/tcp_lp.c                           |  354 ++
 net/ipv4/tcp_metrics.c                      | 1050 ++++
 net/ipv4/tcp_minisocks.c                    |  895 ++++
 net/ipv4/tcp_nv.c                           |  501 ++
 net/ipv4/tcp_offload.c                      |  358 ++
 net/ipv4/tcp_output.c                       | 4273 ++++++++++++++++
 net/ipv4/tcp_plb.c                          |  109 +
 net/ipv4/tcp_rate.c                         |  209 +
 net/ipv4/tcp_recovery.c                     |  237 +
 net/ipv4/tcp_scalable.c                     |   65 +
 net/ipv4/tcp_timer.c                        |  841 ++++
 net/ipv4/tcp_ulp.c                          |  168 +
 net/ipv4/tcp_vegas.c                        |  340 ++
 net/ipv4/tcp_vegas.h                        |   26 +
 net/ipv4/tcp_veno.c                         |  238 +
 net/ipv4/tcp_westwood.c                     |  309 ++
 net/ipv4/tcp_yeah.c                         |  239 +
 net/ipv4/tunnel4.c                          |  297 ++
 net/ipv4/udp.c                              | 3634 ++++++++++++++
 net/ipv4/udp_bpf.c                          |  157 +
 net/ipv4/udp_diag.c                         |  300 ++
 net/ipv4/udp_impl.h                         |   27 +
 net/ipv4/udp_offload.c                      |  751 +++
 net/ipv4/udp_tunnel_core.c                  |  207 +
 net/ipv4/udp_tunnel_nic.c                   |  975 ++++
 net/ipv4/udp_tunnel_stub.c                  |    7 +
 net/ipv4/udplite.c                          |  136 +
 net/ipv4/xfrm4_input.c                      |  173 +
 net/ipv4/xfrm4_output.c                     |   46 +
 net/ipv4/xfrm4_policy.c                     |  249 +
 net/ipv4/xfrm4_protocol.c                   |  306 ++
 net/ipv4/xfrm4_state.c                      |   24 +
 net/ipv4/xfrm4_tunnel.c                     |  118 +
 138 files changed, 108759 insertions(+)
 create mode 100644 net/ipv4/Kconfig
 create mode 100644 net/ipv4/Makefile
 create mode 100644 net/ipv4/af_inet.c
 create mode 100644 net/ipv4/ah4.c
 create mode 100644 net/ipv4/arp.c
 create mode 100644 net/ipv4/bpf_tcp_ca.c
 create mode 100644 net/ipv4/bpfilter/Makefile
 create mode 100644 net/ipv4/bpfilter/sockopt.c
 create mode 100644 net/ipv4/cipso_ipv4.c
 create mode 100644 net/ipv4/datagram.c
 create mode 100644 net/ipv4/devinet.c
 create mode 100644 net/ipv4/esp4.c
 create mode 100644 net/ipv4/esp4_offload.c
 create mode 100644 net/ipv4/fib_frontend.c
 create mode 100644 net/ipv4/fib_lookup.h
 create mode 100644 net/ipv4/fib_notifier.c
 create mode 100644 net/ipv4/fib_rules.c
 create mode 100644 net/ipv4/fib_semantics.c
 create mode 100644 net/ipv4/fib_trie.c
 create mode 100644 net/ipv4/fou_bpf.c
 create mode 100644 net/ipv4/fou_core.c
 create mode 100644 net/ipv4/fou_nl.c
 create mode 100644 net/ipv4/fou_nl.h
 create mode 100644 net/ipv4/gre_demux.c
 create mode 100644 net/ipv4/gre_offload.c
 create mode 100644 net/ipv4/icmp.c
 create mode 100644 net/ipv4/igmp.c
 create mode 100644 net/ipv4/inet_connection_sock.c
 create mode 100644 net/ipv4/inet_diag.c
 create mode 100644 net/ipv4/inet_fragment.c
 create mode 100644 net/ipv4/inet_hashtables.c
 create mode 100644 net/ipv4/inet_timewait_sock.c
 create mode 100644 net/ipv4/inetpeer.c
 create mode 100644 net/ipv4/ip_forward.c
 create mode 100644 net/ipv4/ip_fragment.c
 create mode 100644 net/ipv4/ip_gre.c
 create mode 100644 net/ipv4/ip_input.c
 create mode 100644 net/ipv4/ip_options.c
 create mode 100644 net/ipv4/ip_output.c
 create mode 100644 net/ipv4/ip_sockglue.c
 create mode 100644 net/ipv4/ip_tunnel.c
 create mode 100644 net/ipv4/ip_tunnel_core.c
 create mode 100644 net/ipv4/ip_vti.c
 create mode 100644 net/ipv4/ipcomp.c
 create mode 100644 net/ipv4/ipconfig.c
 create mode 100644 net/ipv4/ipip.c
 create mode 100644 net/ipv4/ipmr.c
 create mode 100644 net/ipv4/ipmr_base.c
 create mode 100644 net/ipv4/metrics.c
 create mode 100644 net/ipv4/netfilter.c
 create mode 100644 net/ipv4/netfilter/Kconfig
 create mode 100644 net/ipv4/netfilter/Makefile
 create mode 100644 net/ipv4/netfilter/arp_tables.c
 create mode 100644 net/ipv4/netfilter/arpt_mangle.c
 create mode 100644 net/ipv4/netfilter/arptable_filter.c
 create mode 100644 net/ipv4/netfilter/ip_tables.c
 create mode 100644 net/ipv4/netfilter/ipt_ECN.c
 create mode 100644 net/ipv4/netfilter/ipt_REJECT.c
 create mode 100644 net/ipv4/netfilter/ipt_SYNPROXY.c
 create mode 100644 net/ipv4/netfilter/ipt_ah.c
 create mode 100644 net/ipv4/netfilter/ipt_rpfilter.c
 create mode 100644 net/ipv4/netfilter/iptable_filter.c
 create mode 100644 net/ipv4/netfilter/iptable_mangle.c
 create mode 100644 net/ipv4/netfilter/iptable_nat.c
 create mode 100644 net/ipv4/netfilter/iptable_raw.c
 create mode 100644 net/ipv4/netfilter/iptable_security.c
 create mode 100644 net/ipv4/netfilter/nf_defrag_ipv4.c
 create mode 100644 net/ipv4/netfilter/nf_dup_ipv4.c
 create mode 100644 net/ipv4/netfilter/nf_nat_h323.c
 create mode 100644 net/ipv4/netfilter/nf_nat_pptp.c
 create mode 100644 net/ipv4/netfilter/nf_nat_snmp_basic.asn1
 create mode 100644 net/ipv4/netfilter/nf_nat_snmp_basic_main.c
 create mode 100644 net/ipv4/netfilter/nf_reject_ipv4.c
 create mode 100644 net/ipv4/netfilter/nf_socket_ipv4.c
 create mode 100644 net/ipv4/netfilter/nf_tproxy_ipv4.c
 create mode 100644 net/ipv4/netfilter/nft_dup_ipv4.c
 create mode 100644 net/ipv4/netfilter/nft_fib_ipv4.c
 create mode 100644 net/ipv4/netfilter/nft_reject_ipv4.c
 create mode 100644 net/ipv4/netlink.c
 create mode 100644 net/ipv4/nexthop.c
 create mode 100644 net/ipv4/ping.c
 create mode 100644 net/ipv4/proc.c
 create mode 100644 net/ipv4/protocol.c
 create mode 100644 net/ipv4/raw.c
 create mode 100644 net/ipv4/raw_diag.c
 create mode 100644 net/ipv4/route.c
 create mode 100644 net/ipv4/syncookies.c
 create mode 100644 net/ipv4/sysctl_net_ipv4.c
 create mode 100644 net/ipv4/tcp.c
 create mode 100644 net/ipv4/tcp_bbr.c
 create mode 100644 net/ipv4/tcp_bic.c
 create mode 100644 net/ipv4/tcp_bpf.c
 create mode 100644 net/ipv4/tcp_cdg.c
 create mode 100644 net/ipv4/tcp_cong.c
 create mode 100644 net/ipv4/tcp_cubic.c
 create mode 100644 net/ipv4/tcp_dctcp.c
 create mode 100644 net/ipv4/tcp_dctcp.h
 create mode 100644 net/ipv4/tcp_diag.c
 create mode 100644 net/ipv4/tcp_fastopen.c
 create mode 100644 net/ipv4/tcp_highspeed.c
 create mode 100644 net/ipv4/tcp_htcp.c
 create mode 100644 net/ipv4/tcp_hybla.c
 create mode 100644 net/ipv4/tcp_illinois.c
 create mode 100644 net/ipv4/tcp_input.c
 create mode 100644 net/ipv4/tcp_ipv4.c
 create mode 100644 net/ipv4/tcp_lp.c
 create mode 100644 net/ipv4/tcp_metrics.c
 create mode 100644 net/ipv4/tcp_minisocks.c
 create mode 100644 net/ipv4/tcp_nv.c
 create mode 100644 net/ipv4/tcp_offload.c
 create mode 100644 net/ipv4/tcp_output.c
 create mode 100644 net/ipv4/tcp_plb.c
 create mode 100644 net/ipv4/tcp_rate.c
 create mode 100644 net/ipv4/tcp_recovery.c
 create mode 100644 net/ipv4/tcp_scalable.c
 create mode 100644 net/ipv4/tcp_timer.c
 create mode 100644 net/ipv4/tcp_ulp.c
 create mode 100644 net/ipv4/tcp_vegas.c
 create mode 100644 net/ipv4/tcp_vegas.h
 create mode 100644 net/ipv4/tcp_veno.c
 create mode 100644 net/ipv4/tcp_westwood.c
 create mode 100644 net/ipv4/tcp_yeah.c
 create mode 100644 net/ipv4/tunnel4.c
 create mode 100644 net/ipv4/udp.c
 create mode 100644 net/ipv4/udp_bpf.c
 create mode 100644 net/ipv4/udp_diag.c
 create mode 100644 net/ipv4/udp_impl.h
 create mode 100644 net/ipv4/udp_offload.c
 create mode 100644 net/ipv4/udp_tunnel_core.c
 create mode 100644 net/ipv4/udp_tunnel_nic.c
 create mode 100644 net/ipv4/udp_tunnel_stub.c
 create mode 100644 net/ipv4/udplite.c
 create mode 100644 net/ipv4/xfrm4_input.c
 create mode 100644 net/ipv4/xfrm4_output.c
 create mode 100644 net/ipv4/xfrm4_policy.c
 create mode 100644 net/ipv4/xfrm4_protocol.c
 create mode 100644 net/ipv4/xfrm4_state.c
 create mode 100644 net/ipv4/xfrm4_tunnel.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
new file mode 100644
index 0000000000..2dfb12230f
--- /dev/null
+++ b/net/ipv4/Kconfig
@@ -0,0 +1,753 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# IP configuration
+#
+config IP_MULTICAST
+	bool "IP: multicasting"
+	help
+	  This is code for addressing several networked computers at once,
+	  enlarging your kernel by about 2 KB. You need multicasting if you
+	  intend to participate in the MBONE, a high bandwidth network on top
+	  of the Internet which carries audio and video broadcasts. More
+	  information about the MBONE is on the WWW at
+	  <https://www.savetz.com/mbone/>. For most people, it's safe to say N.
+
+config IP_ADVANCED_ROUTER
+	bool "IP: advanced router"
+	help
+	  If you intend to run your Linux box mostly as a router, i.e. as a
+	  computer that forwards and redistributes network packets, say Y; you
+	  will then be presented with several options that allow more precise
+	  control over the routing process.
+
+	  The answer to this question won't directly affect the kernel:
+	  answering N will just cause the configurator to skip all the
+	  questions about advanced routing.
+
+	  Note that your box can only act as a router if you enable IP
+	  forwarding in your kernel; you can do that by saying Y to "/proc
+	  file system support" and "Sysctl support" below and executing the
+	  line
+
+	  echo "1" > /proc/sys/net/ipv4/ip_forward
+
+	  at boot time after the /proc file system has been mounted.
+
+	  If you turn on IP forwarding, you should consider the rp_filter, which
+	  automatically rejects incoming packets if the routing table entry
+	  for their source address doesn't match the network interface they're
+	  arriving on. This has security advantages because it prevents the
+	  so-called IP spoofing, however it can pose problems if you use
+	  asymmetric routing (packets from you to a host take a different path
+	  than packets from that host to you) or if you operate a non-routing
+	  host which has several IP addresses on different interfaces. To turn
+	  rp_filter on use:
+
+	  echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
+	   or
+	  echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
+
+	  Note that some distributions enable it in startup scripts.
+	  For details about rp_filter strict and loose mode read
+	  <file:Documentation/networking/ip-sysctl.rst>.
+
+	  If unsure, say N here.
+
+config IP_FIB_TRIE_STATS
+	bool "FIB TRIE statistics"
+	depends on IP_ADVANCED_ROUTER
+	help
+	  Keep track of statistics on structure of FIB TRIE table.
+	  Useful for testing and measuring TRIE performance.
+
+config IP_MULTIPLE_TABLES
+	bool "IP: policy routing"
+	depends on IP_ADVANCED_ROUTER
+	select FIB_RULES
+	help
+	  Normally, a router decides what to do with a received packet based
+	  solely on the packet's final destination address. If you say Y here,
+	  the Linux router will also be able to take the packet's source
+	  address into account. Furthermore, the TOS (Type-Of-Service) field
+	  of the packet can be used for routing decisions as well.
+
+	  If you need more information, see the Linux Advanced
+	  Routing and Traffic Control documentation at
+	  <https://lartc.org/howto/lartc.rpdb.html>
+
+	  If unsure, say N.
+
+config IP_ROUTE_MULTIPATH
+	bool "IP: equal cost multipath"
+	depends on IP_ADVANCED_ROUTER
+	help
+	  Normally, the routing tables specify a single action to be taken in
+	  a deterministic manner for a given packet. If you say Y here
+	  however, it becomes possible to attach several actions to a packet
+	  pattern, in effect specifying several alternative paths to travel
+	  for those packets. The router considers all these paths to be of
+	  equal "cost" and chooses one of them in a non-deterministic fashion
+	  if a matching packet arrives.
+
+config IP_ROUTE_VERBOSE
+	bool "IP: verbose route monitoring"
+	depends on IP_ADVANCED_ROUTER
+	help
+	  If you say Y here, which is recommended, then the kernel will print
+	  verbose messages regarding the routing, for example warnings about
+	  received packets which look strange and could be evidence of an
+	  attack or a misconfigured system somewhere. The information is
+	  handled by the klogd daemon which is responsible for kernel messages
+	  ("man klogd").
+
+config IP_ROUTE_CLASSID
+	bool
+
+config IP_PNP
+	bool "IP: kernel level autoconfiguration"
+	help
+	  This enables automatic configuration of IP addresses of devices and
+	  of the routing table during kernel boot, based on either information
+	  supplied on the kernel command line or by BOOTP or RARP protocols.
+	  You need to say Y only for diskless machines requiring network
+	  access to boot (in which case you want to say Y to "Root file system
+	  on NFS" as well), because all other machines configure the network
+	  in their startup scripts.
+
+config IP_PNP_DHCP
+	bool "IP: DHCP support"
+	depends on IP_PNP
+	help
+	  If you want your Linux box to mount its whole root file system (the
+	  one containing the directory /) from some other computer over the
+	  net via NFS and you want the IP address of your computer to be
+	  discovered automatically at boot time using the DHCP protocol (a
+	  special protocol designed for doing this job), say Y here. In case
+	  the boot ROM of your network card was designed for booting Linux and
+	  does DHCP itself, providing all necessary information on the kernel
+	  command line, you can say N here.
+
+	  If unsure, say Y. Note that if you want to use DHCP, a DHCP server
+	  must be operating on your network.  Read
+	  <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
+
+config IP_PNP_BOOTP
+	bool "IP: BOOTP support"
+	depends on IP_PNP
+	help
+	  If you want your Linux box to mount its whole root file system (the
+	  one containing the directory /) from some other computer over the
+	  net via NFS and you want the IP address of your computer to be
+	  discovered automatically at boot time using the BOOTP protocol (a
+	  special protocol designed for doing this job), say Y here. In case
+	  the boot ROM of your network card was designed for booting Linux and
+	  does BOOTP itself, providing all necessary information on the kernel
+	  command line, you can say N here. If unsure, say Y. Note that if you
+	  want to use BOOTP, a BOOTP server must be operating on your network.
+	  Read <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
+
+config IP_PNP_RARP
+	bool "IP: RARP support"
+	depends on IP_PNP
+	help
+	  If you want your Linux box to mount its whole root file system (the
+	  one containing the directory /) from some other computer over the
+	  net via NFS and you want the IP address of your computer to be
+	  discovered automatically at boot time using the RARP protocol (an
+	  older protocol which is being obsoleted by BOOTP and DHCP), say Y
+	  here. Note that if you want to use RARP, a RARP server must be
+	  operating on your network. Read
+	  <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
+
+config NET_IPIP
+	tristate "IP: tunneling"
+	select INET_TUNNEL
+	select NET_IP_TUNNEL
+	help
+	  Tunneling means encapsulating data of one protocol type within
+	  another protocol and sending it over a channel that understands the
+	  encapsulating protocol. This particular tunneling driver implements
+	  encapsulation of IP within IP, which sounds kind of pointless, but
+	  can be useful if you want to make your (or some other) machine
+	  appear on a different network than it physically is, or to use
+	  mobile-IP facilities (allowing laptops to seamlessly move between
+	  networks without changing their IP addresses).
+
+	  Saying Y to this option will produce two modules ( = code which can
+	  be inserted in and removed from the running kernel whenever you
+	  want). Most people won't need this and can say N.
+
+config NET_IPGRE_DEMUX
+	tristate "IP: GRE demultiplexer"
+	help
+	  This is a helper module to demultiplex GRE packets on GRE version field criteria.
+	  Required by ip_gre and pptp modules.
+
+config NET_IP_TUNNEL
+	tristate
+	select DST_CACHE
+	select GRO_CELLS
+	default n
+
+config NET_IPGRE
+	tristate "IP: GRE tunnels over IP"
+	depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
+	select NET_IP_TUNNEL
+	help
+	  Tunneling means encapsulating data of one protocol type within
+	  another protocol and sending it over a channel that understands the
+	  encapsulating protocol. This particular tunneling driver implements
+	  GRE (Generic Routing Encapsulation) and at this time allows
+	  encapsulating of IPv4 or IPv6 over existing IPv4 infrastructure.
+	  This driver is useful if the other endpoint is a Cisco router: Cisco
+	  likes GRE much better than the other Linux tunneling driver ("IP
+	  tunneling" above). In addition, GRE allows multicast redistribution
+	  through the tunnel.
+
+config NET_IPGRE_BROADCAST
+	bool "IP: broadcast GRE over IP"
+	depends on IP_MULTICAST && NET_IPGRE
+	help
+	  One application of GRE/IP is to construct a broadcast WAN (Wide Area
+	  Network), which looks like a normal Ethernet LAN (Local Area
+	  Network), but can be distributed all over the Internet. If you want
+	  to do that, say Y here and to "IP multicast routing" below.
+
+config IP_MROUTE_COMMON
+	bool
+	depends on IP_MROUTE || IPV6_MROUTE
+
+config IP_MROUTE
+	bool "IP: multicast routing"
+	depends on IP_MULTICAST
+	select IP_MROUTE_COMMON
+	help
+	  This is used if you want your machine to act as a router for IP
+	  packets that have several destination addresses. It is needed on the
+	  MBONE, a high bandwidth network on top of the Internet which carries
+	  audio and video broadcasts. In order to do that, you would most
+	  likely run the program mrouted. If you haven't heard about it, you
+	  don't need it.
+
+config IP_MROUTE_MULTIPLE_TABLES
+	bool "IP: multicast policy routing"
+	depends on IP_MROUTE && IP_ADVANCED_ROUTER
+	select FIB_RULES
+	help
+	  Normally, a multicast router runs a userspace daemon and decides
+	  what to do with a multicast packet based on the source and
+	  destination addresses. If you say Y here, the multicast router
+	  will also be able to take interfaces and packet marks into
+	  account and run multiple instances of userspace daemons
+	  simultaneously, each one handling a single table.
+
+	  If unsure, say N.
+
+config IP_PIMSM_V1
+	bool "IP: PIM-SM version 1 support"
+	depends on IP_MROUTE
+	help
+	  Kernel side support for Sparse Mode PIM (Protocol Independent
+	  Multicast) version 1. This multicast routing protocol is used widely
+	  because Cisco supports it. You need special software to use it
+	  (pimd-v1). Please see <http://netweb.usc.edu/pim/> for more
+	  information about PIM.
+
+	  Say Y if you want to use PIM-SM v1. Note that you can say N here if
+	  you just want to use Dense Mode PIM.
+
+config IP_PIMSM_V2
+	bool "IP: PIM-SM version 2 support"
+	depends on IP_MROUTE
+	help
+	  Kernel side support for Sparse Mode PIM version 2. In order to use
+	  this, you need an experimental routing daemon supporting it (pimd or
+	  gated-5). This routing protocol is not used widely, so say N unless
+	  you want to play with it.
+
+config SYN_COOKIES
+	bool "IP: TCP syncookie support"
+	help
+	  Normal TCP/IP networking is open to an attack known as "SYN
+	  flooding". This denial-of-service attack prevents legitimate remote
+	  users from being able to connect to your computer during an ongoing
+	  attack and requires very little work from the attacker, who can
+	  operate from anywhere on the Internet.
+
+	  SYN cookies provide protection against this type of attack. If you
+	  say Y here, the TCP/IP stack will use a cryptographic challenge
+	  protocol known as "SYN cookies" to enable legitimate users to
+	  continue to connect, even when your machine is under attack. There
+	  is no need for the legitimate users to change their TCP/IP software;
+	  SYN cookies work transparently to them. For technical information
+	  about SYN cookies, check out <https://cr.yp.to/syncookies.html>.
+
+	  If you are SYN flooded, the source address reported by the kernel is
+	  likely to have been forged by the attacker; it is only reported as
+	  an aid in tracing the packets to their actual source and should not
+	  be taken as absolute truth.
+
+	  SYN cookies may prevent correct error reporting on clients when the
+	  server is really overloaded. If this happens frequently, better turn
+	  them off.
+
+	  If you say Y here, you can disable SYN cookies at run time by
+	  saying Y to "/proc file system support" and
+	  "Sysctl support" below and executing the command
+
+	  echo 0 > /proc/sys/net/ipv4/tcp_syncookies
+
+	  after the /proc file system has been mounted.
+
+	  If unsure, say N.
+
+config NET_IPVTI
+	tristate "Virtual (secure) IP: tunneling"
+	depends on IPV6 || IPV6=n
+	select INET_TUNNEL
+	select NET_IP_TUNNEL
+	select XFRM
+	help
+	  Tunneling means encapsulating data of one protocol type within
+	  another protocol and sending it over a channel that understands the
+	  encapsulating protocol. This can be used with xfrm mode tunnel to give
+	  the notion of a secure tunnel for IPSEC and then use routing protocol
+	  on top.
+
+config NET_UDP_TUNNEL
+	tristate
+	select NET_IP_TUNNEL
+	default n
+
+config NET_FOU
+	tristate "IP: Foo (IP protocols) over UDP"
+	select NET_UDP_TUNNEL
+	help
+	  Foo over UDP allows any IP protocol to be directly encapsulated
+	  over UDP, including tunnels (IPIP, GRE, SIT). By encapsulating in UDP,
+	  network mechanisms and optimizations for UDP (such as ECMP
+	  and RSS) can be leveraged to provide better service.
+
+config NET_FOU_IP_TUNNELS
+	bool "IP: FOU encapsulation of IP tunnels"
+	depends on NET_IPIP || NET_IPGRE || IPV6_SIT
+	select NET_FOU
+	help
+	  Allow configuration of FOU or GUE encapsulation for IP tunnels.
+	  When this option is enabled IP tunnels can be configured to use
+	  FOU or GUE encapsulation.
+
+config INET_AH
+	tristate "IP: AH transformation"
+	select XFRM_AH
+	help
+	  Support for IPsec AH (Authentication Header).
+
+	  AH can be used with various authentication algorithms.  Besides
+	  enabling AH support itself, this option enables the generic
+	  implementations of the algorithms that RFC 8221 lists as MUST be
+	  implemented.  If you need any other algorithms, you'll need to enable
+	  them in the crypto API.  You should also enable accelerated
+	  implementations of any needed algorithms when available.
+
+	  If unsure, say Y.
+
+config INET_ESP
+	tristate "IP: ESP transformation"
+	select XFRM_ESP
+	help
+	  Support for IPsec ESP (Encapsulating Security Payload).
+
+	  ESP can be used with various encryption and authentication algorithms.
+	  Besides enabling ESP support itself, this option enables the generic
+	  implementations of the algorithms that RFC 8221 lists as MUST be
+	  implemented.  If you need any other algorithms, you'll need to enable
+	  them in the crypto API.  You should also enable accelerated
+	  implementations of any needed algorithms when available.
+
+	  If unsure, say Y.
+
+config INET_ESP_OFFLOAD
+	tristate "IP: ESP transformation offload"
+	depends on INET_ESP
+	select XFRM_OFFLOAD
+	default n
+	help
+	  Support for ESP transformation offload. This makes sense
+	  only if this system really does IPsec and wants to do it
+	  with high throughput. A typical desktop system does not
+	  need it, even if it does IPsec.
+
+	  If unsure, say N.
+
+config INET_ESPINTCP
+	bool "IP: ESP in TCP encapsulation (RFC 8229)"
+	depends on XFRM && INET_ESP
+	select STREAM_PARSER
+	select NET_SOCK_MSG
+	select XFRM_ESPINTCP
+	help
+	  Support for RFC 8229 encapsulation of ESP and IKE over
+	  TCP/IPv4 sockets.
+
+	  If unsure, say N.
+
+config INET_IPCOMP
+	tristate "IP: IPComp transformation"
+	select INET_XFRM_TUNNEL
+	select XFRM_IPCOMP
+	help
+	  Support for IP Payload Compression Protocol (IPComp) (RFC3173),
+	  typically needed for IPsec.
+
+	  If unsure, say Y.
+
+config INET_TABLE_PERTURB_ORDER
+	int "INET: Source port perturbation table size (as power of 2)" if EXPERT
+	default 16
+	help
+	  Source port perturbation table size (as power of 2) for
+	  RFC 6056 3.3.4.  Algorithm 4: Double-Hash Port Selection Algorithm.
+
+	  The default is almost always what you want.
+	  Only change this if you know what you are doing.
+
+config INET_XFRM_TUNNEL
+	tristate
+	select INET_TUNNEL
+	default n
+
+config INET_TUNNEL
+	tristate
+	default n
+
+config INET_DIAG
+	tristate "INET: socket monitoring interface"
+	default y
+	help
+	  Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+	  native Linux tools such as ss. ss is included in iproute2, currently
+	  downloadable at:
+
+	    http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
+
+	  If unsure, say Y.
+
+config INET_TCP_DIAG
+	depends on INET_DIAG
+	def_tristate INET_DIAG
+
+config INET_UDP_DIAG
+	tristate "UDP: socket monitoring interface"
+	depends on INET_DIAG && (IPV6 || IPV6=n)
+	default n
+	help
+	  Support for UDP socket monitoring interface used by the ss tool.
+	  If unsure, say Y.
+
+config INET_RAW_DIAG
+	tristate "RAW: socket monitoring interface"
+	depends on INET_DIAG && (IPV6 || IPV6=n)
+	default n
+	help
+	  Support for RAW socket monitoring interface used by the ss tool.
+	  If unsure, say Y.
+
+config INET_DIAG_DESTROY
+	bool "INET: allow privileged process to administratively close sockets"
+	depends on INET_DIAG
+	default n
+	help
+	  Provides a SOCK_DESTROY operation that allows privileged processes
+	  (e.g., a connection manager or a network administration tool such as
+	  ss) to close sockets opened by other processes. Closing a socket in
+	  this way interrupts any blocking read/write/connect operations on
+	  the socket and causes future socket calls to behave as if the socket
+	  had been disconnected.
+	  If unsure, say N.
+
+menuconfig TCP_CONG_ADVANCED
+	bool "TCP: advanced congestion control"
+	help
+	  Support for selection of various TCP congestion control
+	  modules.
+
+	  Nearly all users can safely say no here, and a safe default
+	  selection will be made (CUBIC with new Reno as a fallback).
+
+	  If unsure, say N.
+
+if TCP_CONG_ADVANCED
+
+config TCP_CONG_BIC
+	tristate "Binary Increase Congestion (BIC) control"
+	default m
+	help
+	  BIC-TCP is a sender-side only change that ensures a linear RTT
+	  fairness under large windows while offering both scalability and
+	  bounded TCP-friendliness. The protocol combines two schemes
+	  called additive increase and binary search increase. When the
+	  congestion window is large, additive increase with a large
+	  increment ensures linear RTT fairness as well as good
+	  scalability. Under small congestion windows, binary search
+	  increase provides TCP friendliness.
+	  See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+
+config TCP_CONG_CUBIC
+	tristate "CUBIC TCP"
+	default y
+	help
+	  This is version 2.0 of BIC-TCP which uses a cubic growth function
+	  among other techniques.
+	  See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+
+config TCP_CONG_WESTWOOD
+	tristate "TCP Westwood+"
+	default m
+	help
+	  TCP Westwood+ is a sender-side only modification of the TCP Reno
+	  protocol stack that optimizes the performance of TCP congestion
+	  control. It is based on end-to-end bandwidth estimation to set
+	  congestion window and slow start threshold after a congestion
+	  episode. Using this estimation, TCP Westwood+ adaptively sets a
+	  slow start threshold and a congestion window which takes into
+	  account the bandwidth used at the time congestion is experienced.
+	  TCP Westwood+ significantly increases fairness wrt TCP Reno in
+	  wired networks and throughput over wireless links.
+
+config TCP_CONG_HTCP
+	tristate "H-TCP"
+	default m
+	help
+	  H-TCP is a sender-side only modification of the TCP Reno
+	  protocol stack that optimizes the performance of TCP
+	  congestion control for high speed network links. It uses a
+	  modeswitch to change the alpha and beta parameters of TCP Reno
+	  based on network conditions and in a way so as to be fair with
+	  other Reno and H-TCP flows.
+
+config TCP_CONG_HSTCP
+	tristate "High Speed TCP"
+	default n
+	help
+	  Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+	  A modification to TCP's congestion control mechanism for use
+	  with large congestion windows. A table indicates how much to
+	  increase the congestion window by when an ACK is received.
+	  For more detail see https://www.icir.org/floyd/hstcp.html
+
+config TCP_CONG_HYBLA
+	tristate "TCP-Hybla congestion control algorithm"
+	default n
+	help
+	  TCP-Hybla is a sender-side only change that eliminates penalization of
+	  long-RTT, large-bandwidth connections, like when satellite legs are
+	  involved, especially when sharing a common bottleneck with normal
+	  terrestrial connections.
+
+config TCP_CONG_VEGAS
+	tristate "TCP Vegas"
+	default n
+	help
+	  TCP Vegas is a sender-side only change to TCP that anticipates
+	  the onset of congestion by estimating the bandwidth. TCP Vegas
+	  adjusts the sending rate by modifying the congestion
+	  window. TCP Vegas should provide less packet loss, but it is
+	  not as aggressive as TCP Reno.
+
+config TCP_CONG_NV
+	tristate "TCP NV"
+	default n
+	help
+	  TCP NV is a follow up to TCP Vegas. It has been modified to deal with
+	  10G networks, measurement noise introduced by LRO, GRO and interrupt
+	  coalescence. In addition, it will decrease its cwnd multiplicatively
+	  instead of linearly.
+
+	  Note that in general congestion avoidance (cwnd decreased when # packets
+	  queued grows) cannot coexist with congestion control (cwnd decreased only
+	  when there is packet loss) due to fairness issues. One scenario when they
+	  can coexist safely is when the CA flows have RTTs << CC flows RTTs.
+
+	  For further details see http://www.brakmo.org/networking/tcp-nv/
+
+config TCP_CONG_SCALABLE
+	tristate "Scalable TCP"
+	default n
+	help
+	  Scalable TCP is a sender-side only change to TCP which uses a
+	  MIMD congestion control algorithm which has some nice scaling
+	  properties, though is known to have fairness issues.
+	  See http://www.deneholme.net/tom/scalable/
+
+config TCP_CONG_LP
+	tristate "TCP Low Priority"
+	default n
+	help
+	  TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+	  to utilize only the excess network bandwidth as compared to the
+	  ``fair share`` of bandwidth as targeted by TCP.
+	  See http://www-ece.rice.edu/networks/TCP-LP/
+
+config TCP_CONG_VENO
+	tristate "TCP Veno"
+	default n
+	help
+	  TCP Veno is a sender-side only enhancement of TCP to obtain better
+	  throughput over wireless networks. TCP Veno makes use of state
+	  distinguishing to circumvent the difficult judgment of the packet loss
+	  type. TCP Veno cuts down less congestion window in response to random
+	  loss packets.
+	  See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
+
+config TCP_CONG_YEAH
+	tristate "YeAH TCP"
+	select TCP_CONG_VEGAS
+	default n
+	help
+	  YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+	  algorithm, which uses a mixed loss/delay approach to compute the
+	  congestion window. Its design goals target high efficiency,
+	  internal, RTT and Reno fairness, resilience to link loss while
+	  keeping network elements load as low as possible.
+
+	  For further details look here:
+	    http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+
+config TCP_CONG_ILLINOIS
+	tristate "TCP Illinois"
+	default n
+	help
+	  TCP-Illinois is a sender-side modification of TCP Reno for
+	  high speed long delay links. It uses round-trip-time to
+	  adjust the alpha and beta parameters to achieve a higher average
+	  throughput and maintain fairness.
+
+	  For further details see:
+	    http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+
+config TCP_CONG_DCTCP
+	tristate "DataCenter TCP (DCTCP)"
+	default n
+	help
+	  DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+	  provide multi-bit feedback to the end hosts. It is designed to provide:
+
+	  - High burst tolerance (incast due to partition/aggregate),
+	  - Low latency (short flows, queries),
+	  - High throughput (continuous data updates, large file transfers) with
+	    commodity, shallow-buffered switches.
+
+	  All switches in the data center network running DCTCP must support
+	  ECN marking and be configured for marking when reaching defined switch
+	  buffer thresholds. The default ECN marking threshold heuristic for
+	  DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+	  (~100KB) at 10Gbps, but might need further careful tweaking.
+
+	  For further details see:
+	    http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+
+config TCP_CONG_CDG
+	tristate "CAIA Delay-Gradient (CDG)"
+	default n
+	help
+	  CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
+	  the TCP sender in order to:
+
+	  o Use the delay gradient as a congestion signal.
+	  o Back off with an average probability that is independent of the RTT.
+	  o Coexist with flows that use loss-based congestion control.
+	  o Tolerate packet loss unrelated to congestion.
+
+	  For further details see:
+	    D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+	    delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+
+config TCP_CONG_BBR
+	tristate "BBR TCP"
+	default n
+	help
+
+	  BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+	  maximize network utilization and minimize queues. It builds an explicit
+	  model of the bottleneck delivery rate and path round-trip propagation
+	  delay. It tolerates packet loss and delay unrelated to congestion. It
+	  can operate over LAN, WAN, cellular, wifi, or cable modem links. It can
+	  coexist with flows that use loss-based congestion control, and can
+	  operate with shallow buffers, deep buffers, bufferbloat, policers, or
+	  AQM schemes that do not provide a delay signal. It requires the fq
+	  ("Fair Queue") pacing packet scheduler.
+
+choice
+	prompt "Default TCP congestion control"
+	default DEFAULT_CUBIC
+	help
+	  Select the TCP congestion control that will be used by default
+	  for all connections.
+
+	config DEFAULT_BIC
+		bool "Bic" if TCP_CONG_BIC=y
+
+	config DEFAULT_CUBIC
+		bool "Cubic" if TCP_CONG_CUBIC=y
+
+	config DEFAULT_HTCP
+		bool "Htcp" if TCP_CONG_HTCP=y
+
+	config DEFAULT_HYBLA
+		bool "Hybla" if TCP_CONG_HYBLA=y
+
+	config DEFAULT_VEGAS
+		bool "Vegas" if TCP_CONG_VEGAS=y
+
+	config DEFAULT_VENO
+		bool "Veno" if TCP_CONG_VENO=y
+
+	config DEFAULT_WESTWOOD
+		bool "Westwood" if TCP_CONG_WESTWOOD=y
+
+	config DEFAULT_DCTCP
+		bool "DCTCP" if TCP_CONG_DCTCP=y
+
+	config DEFAULT_CDG
+		bool "CDG" if TCP_CONG_CDG=y
+
+	config DEFAULT_BBR
+		bool "BBR" if TCP_CONG_BBR=y
+
+	config DEFAULT_RENO
+		bool "Reno"
+endchoice
+
+endif
+# Promptless fallback: without TCP_CONG_ADVANCED, CUBIC defaults to built-in.
+config TCP_CONG_CUBIC
+	tristate
+	depends on !TCP_CONG_ADVANCED
+	default y
+
+config DEFAULT_TCP_CONG
+	string
+	default "bic" if DEFAULT_BIC
+	default "cubic" if DEFAULT_CUBIC
+	default "htcp" if DEFAULT_HTCP
+	default "hybla" if DEFAULT_HYBLA
+	default "vegas" if DEFAULT_VEGAS
+	default "westwood" if DEFAULT_WESTWOOD
+	default "veno" if DEFAULT_VENO
+	default "reno" if DEFAULT_RENO
+	default "dctcp" if DEFAULT_DCTCP
+	default "cdg" if DEFAULT_CDG
+	default "bbr" if DEFAULT_BBR
+	default "cubic"
+
+config TCP_MD5SIG
+	bool "TCP: MD5 Signature Option support (RFC2385)"
+	select CRYPTO
+	select CRYPTO_MD5
+	help
+	  RFC2385 specifies a method of giving MD5 protection to TCP sessions.
+	  Its main (only?) use is to protect BGP sessions between core routers
+	  on the Internet.
+
+	  If unsure, say N.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
new file mode 100644
index 0000000000..b18ba8ef93
--- /dev/null
+++ b/net/ipv4/Makefile
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux TCP/IP (INET) layer.
+#
+
+obj-y     := route.o inetpeer.o protocol.o \
+	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
+	     ip_output.o ip_sockglue.o inet_hashtables.o \
+	     inet_timewait_sock.o inet_connection_sock.o \
+	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+	     tcp_rate.o tcp_recovery.o tcp_ulp.o \
+	     tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \
+	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+	     fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
+	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
+	     metrics.o netlink.o nexthop.o udp_tunnel_stub.o
+
+obj-$(CONFIG_BPFILTER) += bpfilter/
+
+obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
+obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
+obj-$(CONFIG_IP_MROUTE) += ipmr.o
+obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o
+obj-$(CONFIG_NET_IPIP) += ipip.o
+gre-y := gre_demux.o
+fou-y := fou_core.o fou_nl.o fou_bpf.o
+obj-$(CONFIG_NET_FOU) += fou.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
+obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+udp_tunnel-y := udp_tunnel_core.o udp_tunnel_nic.o
+obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
+obj-$(CONFIG_NET_IPVTI) += ip_vti.o
+obj-$(CONFIG_SYN_COOKIES) += syncookies.o
+obj-$(CONFIG_INET_AH) += ah4.o
+obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o
+obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
+obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
+obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
+obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_NETFILTER)	+= netfilter.o netfilter/
+obj-$(CONFIG_INET_DIAG) += inet_diag.o
+obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
+obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
+obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
+obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
+obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
+obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
+obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
+obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o
+obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
+obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
+obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
+obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
+obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o
+obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+
+obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
+		      xfrm4_output.o xfrm4_protocol.o
+# bpf_tcp_ca.o is only considered when CONFIG_BPF_JIT=y (see the ifeq).
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o
+endif
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
new file mode 100644
index 0000000000..1c58bd72e1
--- /dev/null
+++ b/net/ipv4/af_inet.c
@@ -0,0 +1,2117 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		PF_INET protocol family socket handler.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Changes (see also sock.c)
+ *
+ *		piggy,
+ *		Karl Knutson	:	Socket protocol table
+ *		A.N.Kuznetsov	:	Socket death error in accept().
+ *		John Richardson :	Fix non blocking error in connect()
+ *					so sockets that fail to connect
+ *					don't return -EINPROGRESS.
+ *		Alan Cox	:	Asynchronous I/O support
+ *		Alan Cox	:	Keep correct socket pointer on sock
+ *					structures
+ *					when accept() ed
+ *		Alan Cox	:	Semantics of SO_LINGER aren't state
+ *					moved to close when you look carefully.
+ *					With this fixed and the accept bug fixed
+ *					some RPC stuff seems happier.
+ *		Niibe Yutaka	:	4.4BSD style write async I/O
+ *		Alan Cox,
+ *		Tony Gale 	:	Fixed reuse semantics.
+ *		Alan Cox	:	bind() shouldn't abort existing but dead
+ *					sockets. Stops FTP netin:.. I hope.
+ *		Alan Cox	:	bind() works correctly for RAW sockets.
+ *					Note that FreeBSD at least was broken
+ *					in this respect so be careful with
+ *					compatibility tests...
+ *		Alan Cox	:	routing cache support
+ *		Alan Cox	:	memzero the socket structure for
+ *					compactness.
+ *		Matt Day	:	nonblock connect error handler
+ *		Alan Cox	:	Allow large numbers of pending sockets
+ *					(eg for big web sites), but only if
+ *					specifically application requested.
+ *		Alan Cox	:	New buffering throughout IP. Used
+ *					dumbly.
+ *		Alan Cox	:	New buffering now used smartly.
+ *		Alan Cox	:	BSD rather than common sense
+ *					interpretation of listen.
+ *		Germano Caronni	:	Assorted small races.
+ *		Alan Cox	:	sendmsg/recvmsg basic support.
+ *		Alan Cox	:	Only sendmsg/recvmsg now supported.
+ *		Alan Cox	:	Locked down bind (see security list).
+ *		Alan Cox	:	Loosened bind a little.
+ *		Mike McLagan	:	ADD/DEL DLCI Ioctls
+ *	Willy Konynenberg	:	Transparent proxying support.
+ *		David S. Miller	:	New socket lookup architecture.
+ *					Some other random speedups.
+ *		Cyrus Durgin	:	Cleaned up file for kmod hacks.
+ *		Andi Kleen	:	Fix inet_stream_connect TCP race.
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/capability.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/uaccess.h>
+
+#include <linux/inet.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/inet_connection_sock.h>
+#include <net/gro.h>
+#include <net/gso.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/ping.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/raw.h>
+#include <net/icmp.h>
+#include <net/inet_common.h>
+#include <net/ip_tunnels.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/secure_seq.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+#include <net/l3mdev.h>
+#include <net/compat.h>
+
+#include <trace/events/sock.h>
+
+/* inetsw[] is indexed by socket type; inet_create() walks the list for
+ * sock->type to find the protocol entry it needs to build a new socket.
+ */
+static struct list_head inetsw[SOCK_MAX];
+static DEFINE_SPINLOCK(inetsw_lock);	/* NOTE(review): presumably serialises inetsw[] writers - confirm */
+
+/* sk->sk_destruct callback for AF_INET sockets; inet_create() installs it. */
+
+void inet_sock_destruct(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	__skb_queue_purge(&sk->sk_receive_queue);
+	__skb_queue_purge(&sk->sk_error_queue);
+
+	sk_mem_reclaim_final(sk);
+	/* Log and bail out if the socket is not closed or not marked dead. */
+	if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
+		pr_err("Attempt to release TCP socket in state %d %p\n",
+		       sk->sk_state, sk);
+		return;
+	}
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		pr_err("Attempt to release alive inet socket %p\n", sk);
+		return;
+	}
+	/* Warn (once) about any memory accounting that has not been drained. */
+	WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc));
+	WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
+	WARN_ON_ONCE(sk->sk_wmem_queued);
+	WARN_ON_ONCE(sk_forward_alloc_get(sk));
+	/* Free the IP options and release the sk_dst_cache/sk_rx_dst routes. */
+	kfree(rcu_dereference_protected(inet->inet_opt, 1));
+	dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
+	dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1));
+}
+EXPORT_SYMBOL(inet_sock_destruct);
+
+/*
+ *	The routines beyond this point handle the behaviour of an AF_INET
+ *	socket object. Mostly it punts to the subprotocols of IP to do
+ *	the work.
+ */
+
+/*
+ *	Give a still-unbound socket a local port obtained from its protocol.
+ *	Returns 0 if the socket is (now) bound, -EAGAIN if get_port() failed.
+ */
+static int inet_autobind(struct sock *sk)
+{
+	struct inet_sock *isk = inet_sk(sk);
+	int rc = 0;
+
+	lock_sock(sk);
+	/* A non-zero inet_num means a port is already assigned. */
+	if (!isk->inet_num) {
+		if (sk->sk_prot->get_port(sk, 0))
+			rc = -EAGAIN;
+		else
+			isk->inet_sport = htons(isk->inet_num);
+	}
+	release_sock(sk);
+	return rc;
+}
+
+/* Put @sk into listening state, or only update the backlog if it already is. */
+int __inet_listen_sk(struct sock *sk, int backlog)
+{
+	const unsigned char prev_state = sk->sk_state;
+	int tfo, rc;
+
+	if (!((1 << prev_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+		return -EINVAL;
+
+	WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
+
+	/* A socket that is already listening only gets its backlog adjusted. */
+	if (prev_state == TCP_LISTEN)
+		return 0;
+
+	/* Enable TFO without requiring the TCP_FASTOPEN socket option.
+	 * Only TCP (SOCK_STREAM) sockets reach this point.  The fastopen
+	 * backlog may already have been set through the option if the socket
+	 * was listening before and was shutdown() rather than close()d.
+	 */
+	tfo = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen);
+	if ((tfo & TFO_SERVER_WO_SOCKOPT1) && (tfo & TFO_SERVER_ENABLE) &&
+	    !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
+		fastopen_queue_tune(sk, backlog);
+		tcp_fastopen_init_key_once(sock_net(sk));
+	}
+
+	rc = inet_csk_listen_start(sk);
+	if (rc)
+		return rc;
+
+	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
+
+	return 0;
+}
+
+/*
+ *	listen() entry point: validate the socket, then start listening.
+ */
+int inet_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int rc;
+
+	lock_sock(sk);
+
+	/* Only an unconnected SOCK_STREAM socket may listen. */
+	if (sock->state == SS_UNCONNECTED && sock->type == SOCK_STREAM)
+		rc = __inet_listen_sk(sk, backlog);
+	else
+		rc = -EINVAL;
+	release_sock(sk);
+
+	return rc;
+}
+EXPORT_SYMBOL(inet_listen);
+
+/*
+ *	Create an inet socket: find the inetsw[] entry matching the socket
+ *	type and @protocol, allocate the sock and set up its IPv4 defaults.
+ */
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
+{
+	struct sock *sk;
+	struct inet_protosw *answer;
+	struct inet_sock *inet;
+	struct proto *answer_prot;
+	unsigned char answer_flags;
+	int try_loading_module = 0;
+	int err;
+
+	if (protocol < 0 || protocol >= IPPROTO_MAX)
+		return -EINVAL;
+
+	sock->state = SS_UNCONNECTED;
+
+	/* Look for the requested type/protocol pair. */
+lookup_protocol:
+	err = -ESOCKTNOSUPPORT;
+	rcu_read_lock();
+	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
+		/* Assume a match; reset to -EPROTONOSUPPORT below if not. */
+		err = 0;
+		/* Check the non-wild match. */
+		if (protocol == answer->protocol) {
+			if (protocol != IPPROTO_IP)
+				break;
+		} else {
+			/* Check for the two wild cases. */
+			if (IPPROTO_IP == protocol) {
+				protocol = answer->protocol;
+				break;
+			}
+			if (IPPROTO_IP == answer->protocol)
+				break;
+		}
+		err = -EPROTONOSUPPORT;
+	}
+
+	/* No usable entry: try to load a protocol module, at most twice. */
+	if (unlikely(err)) {
+		if (try_loading_module < 2) {
+			rcu_read_unlock();
+			/*
+			 * Be more specific, e.g. net-pf-2-proto-132-type-1
+			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+			 */
+			if (++try_loading_module == 1)
+				request_module("net-pf-%d-proto-%d-type-%d",
+					       PF_INET, protocol, sock->type);
+			/*
+			 * Fall back to generic, e.g. net-pf-2-proto-132
+			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+			 */
+			else
+				request_module("net-pf-%d-proto-%d",
+					       PF_INET, protocol);
+			goto lookup_protocol;
+		} else
+			goto out_rcu_unlock;
+	}
+
+	/* Userspace needs CAP_NET_RAW in the netns' user_ns for raw sockets. */
+	err = -EPERM;
+	if (sock->type == SOCK_RAW && !kern &&
+	    !ns_capable(net->user_ns, CAP_NET_RAW))
+		goto out_rcu_unlock;
+
+	sock->ops = answer->ops;
+	answer_prot = answer->prot;
+	answer_flags = answer->flags;
+	rcu_read_unlock();
+
+	WARN_ON(!answer_prot->slab);
+
+	err = -ENOMEM;
+	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
+	if (!sk)
+		goto out;
+
+	err = 0;
+	if (INET_PROTOSW_REUSE & answer_flags)
+		sk->sk_reuse = SK_CAN_REUSE;
+
+	if (INET_PROTOSW_ICSK & answer_flags)
+		inet_init_csk_locks(sk);
+
+	inet = inet_sk(sk);
+	inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
+
+	inet_clear_bit(NODEFRAG, sk);
+
+	if (SOCK_RAW == sock->type) {
+		inet->inet_num = protocol;
+		if (IPPROTO_RAW == protocol)
+			inet_set_bit(HDRINCL, sk);
+	}
+
+	if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
+		inet->pmtudisc = IP_PMTUDISC_DONT;
+	else
+		inet->pmtudisc = IP_PMTUDISC_WANT;
+
+	atomic_set(&inet->inet_id, 0);
+
+	sock_init_data(sock, sk);
+
+	sk->sk_destruct	   = inet_sock_destruct;
+	sk->sk_protocol	   = protocol;
+	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+	sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
+
+	inet->uc_ttl	= -1;
+	inet_set_bit(MC_LOOP, sk);
+	inet->mc_ttl	= 1;
+	inet_set_bit(MC_ALL, sk);
+	inet->mc_index	= 0;
+	inet->mc_list	= NULL;
+	inet->rcv_tos	= 0;
+
+	if (inet->inet_num) {
+		/* It assumes that any protocol which allows the user to
+		 * assign a number at socket creation time automatically shares.
+		 */
+		inet->inet_sport = htons(inet->inet_num);
+		/* Add to protocol hash chains. */
+		err = sk->sk_prot->hash(sk);
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
+
+	if (sk->sk_prot->init) {
+		err = sk->sk_prot->init(sk);
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
+
+	if (!kern) {
+		err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
+out:
+	return err;
+out_rcu_unlock:
+	rcu_read_unlock();
+	goto out;
+}
+
+
+/*
+ *	Tear down the sock attached to @sock.  Once ->close() has been called
+ *	the sock must no longer be referenced through @sock, so sock->sk is
+ *	cleared.  Always returns 0.
+ */
+int inet_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	long linger = 0;
+
+	/* Nothing attached, nothing to release. */
+	if (!sk)
+		return 0;
+
+	/* The cgroup release hook only runs for non-kernel sockets. */
+	if (!sk->sk_kern_sock)
+		BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk);
+
+	/* Applications forget to leave multicast groups before exiting. */
+	ip_mc_drop_socket(sk);
+
+	/* With SO_LINGER set we do not return until the close is complete;
+	 * otherwise we return immediately.  The close itself is performed
+	 * the same way in both cases.  A process that is exiting never
+	 * lingers.
+	 */
+	if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
+		linger = sk->sk_lingertime;
+
+	sk->sk_prot->close(sk, linger);
+	sock->sk = NULL;
+
+	return 0;
+}
+EXPORT_SYMBOL(inet_release);
+
+/* Bind @sk to @uaddr; a protocol-specific ->bind() (e.g. RAW) wins if set. */
+int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	u32 bind_flags = BIND_WITH_LOCK;
+	int rc;
+
+	if (sk->sk_prot->bind)
+		return sk->sk_prot->bind(sk, uaddr, addr_len);
+
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	/* The BPF program runs before any other check, so a program that
+	 * rewrites the context in a wrong way is still caught afterwards.
+	 */
+	rc = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
+						CGROUP_INET4_BIND, &bind_flags);
+	if (!rc)
+		rc = __inet_bind(sk, uaddr, addr_len, bind_flags);
+	return rc;
+}
+
+/* struct socket flavour of inet_bind_sk(). */
+int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	return inet_bind_sk(sock->sk, uaddr, addr_len);
+}
+EXPORT_SYMBOL(inet_bind);
+
+/* Core of bind(): validate @uaddr, obtain the port through ->get_port() and
+ * store the local address/port in @sk.  @flags is a mask of BIND_* bits.
+ */
+int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+		u32 flags)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	unsigned short snum;
+	int chk_addr_ret;
+	u32 tb_id = RT_TABLE_LOCAL;
+	int err;
+
+	if (addr->sin_family != AF_INET) {
+		/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
+		 * only if s_addr is INADDR_ANY.
+		 */
+		err = -EAFNOSUPPORT;
+		if (addr->sin_family != AF_UNSPEC ||
+		    addr->sin_addr.s_addr != htonl(INADDR_ANY))
+			goto out;
+	}
+
+	tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
+	chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
+
+	/* Not specified by any standard per-se, however it breaks too many
+	 * applications when removed.  It is unfortunate since allowing
+	 * applications to make a non-local bind solves several problems with
+	 * systems using dynamic addressing (i.e. your servers still start up
+	 * even if your ISDN link is temporarily down).
+	 */
+	err = -EADDRNOTAVAIL;
+	if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
+	                                 chk_addr_ret))
+		goto out;
+
+	snum = ntohs(addr->sin_port);
+	err = -EACCES;
+	if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+	    snum && inet_port_requires_bind_service(net, snum) &&
+	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+		goto out;
+
+	/* We keep a pair of addresses: rcv_saddr is used by hash lookups and
+	 * saddr for transmit.  In the BSD API these are the same except where
+	 * it would be illegal to use them (multicast/broadcast), in which
+	 * case the sending device address is used.
+	 */
+	if (flags & BIND_WITH_LOCK)
+		lock_sock(sk);
+
+	/* Check these errors (active socket, double bind). */
+	err = -EINVAL;
+	if (sk->sk_state != TCP_CLOSE || inet->inet_num)
+		goto out_release_sock;
+
+	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
+	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+		inet->inet_saddr = 0;  /* Use device */
+
+	/* Make sure we are allowed to bind here. */
+	if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) ||
+		      (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
+		err = sk->sk_prot->get_port(sk, snum);
+		if (err) {
+			inet->inet_saddr = inet->inet_rcv_saddr = 0;
+			goto out_release_sock;
+		}
+		if (!(flags & BIND_FROM_BPF)) {
+			err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
+			if (err) {
+				inet->inet_saddr = inet->inet_rcv_saddr = 0;
+				if (sk->sk_prot->put_port)
+					sk->sk_prot->put_port(sk);
+				goto out_release_sock;
+			}
+		}
+	}
+
+	if (inet->inet_rcv_saddr)
+		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+	if (snum)
+		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+	inet->inet_sport = htons(inet->inet_num);
+	inet->inet_daddr = 0;
+	inet->inet_dport = 0;
+	sk_dst_reset(sk);
+	err = 0;
+out_release_sock:
+	if (flags & BIND_WITH_LOCK)
+		release_sock(sk);
+out:
+	return err;
+}
+
+/* Datagram connect(); an AF_UNSPEC address disconnects the socket. */
+int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
+		       int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	const struct proto *ops;
+	int rc;
+
+	if (addr_len < sizeof(uaddr->sa_family))
+		return -EINVAL;
+
+	/* Snapshot sk_prot: IPV6_ADDRFORM can change it under us. */
+	ops = READ_ONCE(sk->sk_prot);
+	if (uaddr->sa_family == AF_UNSPEC)
+		return ops->disconnect(sk, flags);
+
+	rc = BPF_CGROUP_PRE_CONNECT_ENABLED(sk) ?
+	     ops->pre_connect(sk, uaddr, addr_len) : 0;
+	if (rc)
+		return rc;
+
+	/* No local port yet: autobind before handing over to ->connect(). */
+	if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk))
+		return -EAGAIN;
+	return ops->connect(sk, uaddr, addr_len);
+}
+EXPORT_SYMBOL(inet_dgram_connect);
+
+/* Wait for @sk to leave SYN_SENT/SYN_RECV; returns what is left of @timeo. */
+static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk->sk_write_pending += writebias;
+
+	/* Basic assumption: whoever sets sk->sk_err _must_ also move the
+	 * socket out of the TCP_SYN_* states.  Connect() does not allow to
+	 * get error notifications without closing the socket.
+	 */
+	while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		release_sock(sk);
+		timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+		lock_sock(sk);
+		if (signal_pending(current) || !timeo)
+			break;
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+	sk->sk_write_pending -= writebias;
+	return timeo;
+}
+
+/*
+ *	Connect to a remote host. There is regrettably still a little
+ *	TCP 'magic' in here.
+ *
+ *	@sock:		socket to connect; sock->state is updated here
+ *	@uaddr:		destination address, or NULL (see the comment in the body)
+ *	@addr_len:	length of @uaddr in bytes
+ *	@flags:		file flags; O_NONBLOCK is forwarded to sock_sndtimeo() and
+ *			the full value is handed to the protocol's ->disconnect()
+ *	@is_sendmsg:	non-zero when called from a sendmsg path; it only changes
+ *			the errno reported while a deferred connect is pending
+ *
+ *	This function does not take the socket lock itself;
+ *	inet_stream_connect() wraps it in lock_sock()/release_sock().
+ *
+ *	Returns 0 on success (connected, disconnected via AF_UNSPEC, or connect
+ *	deferred via DEFER_CONNECT), otherwise a negative errno.
+ */
+int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+			  int addr_len, int flags, int is_sendmsg)
+{
+	struct sock *sk = sock->sk;
+	int err;
+	long timeo;
+
+	/*
+	 * uaddr can be NULL and addr_len can be 0 if:
+	 * sk is a TCP fastopen active socket and
+	 * TCP_FASTOPEN_CONNECT sockopt is set and
+	 * we already have a valid cookie for this socket.
+	 * In this case, user can call write() after connect().
+	 * write() will invoke tcp_sendmsg_fastopen() which calls
+	 * __inet_stream_connect().
+	 */
+	if (uaddr) {
+		if (addr_len < sizeof(uaddr->sa_family))
+			return -EINVAL;
+
+		/* AF_UNSPEC requests a disconnect rather than a connect. */
+		if (uaddr->sa_family == AF_UNSPEC) {
+			/* Let a sleeper in the wait below notice the disconnect
+			 * (it compares sk_disconnects before and after sleeping).
+			 */
+			sk->sk_disconnects++;
+			err = sk->sk_prot->disconnect(sk, flags);
+			sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+			goto out;
+		}
+	}
+
+	switch (sock->state) {
+	default:
+		err = -EINVAL;
+		goto out;
+	case SS_CONNECTED:
+		err = -EISCONN;
+		goto out;
+	case SS_CONNECTING:
+		if (inet_test_bit(DEFER_CONNECT, sk))
+			err = is_sendmsg ? -EINPROGRESS : -EISCONN;
+		else
+			err = -EALREADY;
+		/* Fall out of switch with err, set for this state */
+		break;
+	case SS_UNCONNECTED:
+		/* The socket layer says unconnected, but the sk is not closed. */
+		err = -EISCONN;
+		if (sk->sk_state != TCP_CLOSE)
+			goto out;
+
+		if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
+			err = sk->sk_prot->pre_connect(sk, uaddr, addr_len);
+			if (err)
+				goto out;
+		}
+
+		err = sk->sk_prot->connect(sk, uaddr, addr_len);
+		if (err < 0)
+			goto out;
+
+		sock->state = SS_CONNECTING;
+
+		/* Deferred connect: report success (0) now without waiting. */
+		if (!err && inet_test_bit(DEFER_CONNECT, sk))
+			goto out;
+
+		/* Just entered SS_CONNECTING state; the only
+		 * difference is that return value in non-blocking
+		 * case is EINPROGRESS, rather than EALREADY.
+		 */
+		err = -EINPROGRESS;
+		break;
+	}
+
+	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		/* 1 only for a TCP socket carrying fastopen request data. */
+		int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
+				tcp_sk(sk)->fastopen_req &&
+				tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+		/* Snapshot, to detect a disconnect that happens while we wait. */
+		int dis = sk->sk_disconnects;
+
+		/* Error code is set above */
+		if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
+			goto out;
+
+		err = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			goto out;
+
+		/* sk_disconnects moved while we slept: the socket was disconnected. */
+		if (dis != sk->sk_disconnects) {
+			err = -EPIPE;
+			goto out;
+		}
+	}
+
+	/* Connection was closed by RST, timeout, ICMP error
+	 * or another process disconnected us.
+	 */
+	if (sk->sk_state == TCP_CLOSE)
+		goto sock_error;
+
+	/* sk->sk_err may be not zero now, if RECVERR was ordered by user
+	 * and error was received after socket entered established state.
+	 * Hence, it is handled normally after connect() return successfully.
+	 */
+
+	sock->state = SS_CONNECTED;
+	err = 0;
+out:
+	return err;
+
+sock_error:
+	/* Report the pending socket error, or -ECONNABORTED if there is none,
+	 * then tear the connection state down.
+	 */
+	err = sock_error(sk) ? : -ECONNABORTED;
+	sock->state = SS_UNCONNECTED;
+	sk->sk_disconnects++;
+	if (sk->sk_prot->disconnect(sk, flags))
+		sock->state = SS_DISCONNECTING;
+	goto out;
+}
+EXPORT_SYMBOL(__inet_stream_connect);
+
+/*
+ *	Locked front end for __inet_stream_connect(): takes the socket lock,
+ *	runs the connect with is_sendmsg == 0 and drops the lock again.
+ *	Returns whatever __inet_stream_connect() returned.
+ */
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+			int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	int ret;
+
+	lock_sock(sk);
+	ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
+	release_sock(sk);
+
+	return ret;
+}
+EXPORT_SYMBOL(inet_stream_connect);
+
+/*
+ *	Attach an accepted sock (@newsk) to its struct socket (@newsock).
+ *	Warns if @newsk is in a state other than ESTABLISHED, SYN_RECV,
+ *	CLOSE_WAIT or CLOSE, copies the SOCK_SUPPORT_ZC flag from @sock and
+ *	marks @newsock as SS_CONNECTED.
+ */
+void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk)
+{
+	const int expected_states = TCPF_ESTABLISHED | TCPF_SYN_RECV |
+				    TCPF_CLOSE_WAIT | TCPF_CLOSE;
+
+	sock_rps_record_flow(newsk);
+	WARN_ON(((1 << newsk->sk_state) & expected_states) == 0);
+
+	/* Propagate SOCK_SUPPORT_ZC from @sock to the new socket. */
+	if (test_bit(SOCK_SUPPORT_ZC, &sock->flags))
+		set_bit(SOCK_SUPPORT_ZC, &newsock->flags);
+
+	sock_graft(newsk, newsock);
+	newsock->state = SS_CONNECTED;
+}
+
+/*
+ *	Accept a pending connection. The TCP layer now gives BSD semantics.
+ *
+ *	Asks the protocol's ->accept() for a new sock; when one is returned it
+ *	is grafted onto @newsock with the new sock locked.  Returns 0 on
+ *	success, otherwise the error left in place by ->accept() (-EINVAL if it
+ *	did not set one).
+ */
+
+int inet_accept(struct socket *sock, struct socket *newsock, int flags,
+		bool kern)
+{
+	struct sock *listener = sock->sk;
+	struct sock *child;
+	int err = -EINVAL;
+
+	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
+	child = READ_ONCE(listener->sk_prot)->accept(listener, flags, &err,
+						     kern);
+	if (child) {
+		lock_sock(child);
+		__inet_accept(sock, newsock, child);
+		release_sock(child);
+		return 0;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(inet_accept);
+
+/*
+ *	This does both peername and sockname.
+ *
+ *	@peer == 0 fills @uaddr with the local address/port, any other value
+ *	with the remote address/port.  A peer lookup fails with -ENOTCONN when
+ *	no destination port is set; with @peer == 1 it also fails while the
+ *	socket is in CLOSE or SYN_SENT.  On success the address length is
+ *	returned (its address is handed to the cgroup BPF hook, which receives
+ *	it by pointer).
+ */
+int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+		 int peer)
+{
+	struct sock *sk = sock->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
+	int sin_addr_len = sizeof(*sin);
+
+	sin->sin_family = AF_INET;
+
+	lock_sock(sk);
+	if (!peer) {
+		/* Prefer the receive address, fall back to the source address. */
+		__be32 addr = inet->inet_rcv_saddr ? : inet->inet_saddr;
+
+		sin->sin_port = inet->inet_sport;
+		sin->sin_addr.s_addr = addr;
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+				       CGROUP_INET4_GETSOCKNAME);
+	} else {
+		if (!inet->inet_dport)
+			goto not_connected;
+		if (peer == 1 &&
+		    ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)))
+			goto not_connected;
+
+		sin->sin_port = inet->inet_dport;
+		sin->sin_addr.s_addr = inet->inet_daddr;
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+				       CGROUP_INET4_GETPEERNAME);
+	}
+	release_sock(sk);
+
+	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	return sin_addr_len;
+
+not_connected:
+	release_sock(sk);
+	return -ENOTCONN;
+}
+EXPORT_SYMBOL(inet_getname);
+
+/*
+ *	Common preparation before sending on @sk: records the RPS flow and, if
+ *	the socket has no local port yet and the protocol allows autobind,
+ *	binds it.  Returns 0 on success or -EAGAIN when inet_autobind() fails.
+ */
+int inet_send_prepare(struct sock *sk)
+{
+	sock_rps_record_flow(sk);
+
+	/* We may need to bind the socket. */
+	if (data_race(!inet_sk(sk)->inet_num)) {
+		if (!sk->sk_prot->no_autobind && inet_autobind(sk))
+			return -EAGAIN;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inet_send_prepare);
+
+/*
+ *	proto_ops ->sendmsg for INET sockets: runs inet_send_prepare() and then
+ *	hands the message to the protocol's ->sendmsg().  Returns -EAGAIN if
+ *	preparation fails, otherwise the protocol's return value.
+ */
+int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+	struct sock *sk = sock->sk;
+
+	if (likely(!inet_send_prepare(sk)))
+		return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg,
+				       udp_sendmsg, sk, msg, size);
+
+	return -EAGAIN;
+}
+EXPORT_SYMBOL(inet_sendmsg);
+
+/*
+ *	proto_ops ->splice_eof for INET sockets: after inet_send_prepare()
+ *	succeeds, forwards to the protocol's ->splice_eof() if it has one.
+ */
+void inet_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	const struct proto *prot;
+
+	if (unlikely(inet_send_prepare(sk)))
+		return;
+
+	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
+	prot = READ_ONCE(sk->sk_prot);
+	if (!prot->splice_eof)
+		return;
+
+	prot->splice_eof(sock);
+}
+EXPORT_SYMBOL_GPL(inet_splice_eof);
+
+INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *,
+					  size_t, int, int *));
+/*
+ *	proto_ops ->recvmsg for INET sockets: forwards to the protocol's
+ *	->recvmsg() and, when that does not fail, stores the address length it
+ *	reported in msg->msg_namelen.  The RPS flow is recorded unless
+ *	MSG_ERRQUEUE is set in @flags.
+ */
+int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+		 int flags)
+{
+	struct sock *sk = sock->sk;
+	int addr_len = 0;
+	int ret;
+
+	if (likely(!(flags & MSG_ERRQUEUE)))
+		sock_rps_record_flow(sk);
+
+	ret = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
+			      sk, msg, size, flags, &addr_len);
+	if (ret < 0)
+		return ret;
+
+	msg->msg_namelen = addr_len;
+	return ret;
+}
+EXPORT_SYMBOL(inet_recvmsg);
+
+/*
+ *	proto_ops ->shutdown for INET sockets.
+ *
+ *	@sock:	socket to shut down
+ *	@how:	0, 1 or 2; it is incremented below so that it becomes a bitmask
+ *		that must fit in SHUTDOWN_MASK
+ *
+ *	Returns 0 on success, -EINVAL for a bad @how, -ENOTCONN when the sock is
+ *	in TCP_CLOSE, or the result of the protocol's ->disconnect() for the
+ *	LISTEN / SYN_SENT cases.
+ */
+int inet_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	/* This should really check to make sure
+	 * the socket is a TCP socket. (WHY AC...)
+	 */
+	how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
+		       1->2 bit 2 snds.
+		       2->3 */
+	if ((how & ~SHUTDOWN_MASK) || !how)	/* MAXINT->0 */
+		return -EINVAL;
+
+	lock_sock(sk);
+	/* Resolve a pending SS_CONNECTING according to the current sk state. */
+	if (sock->state == SS_CONNECTING) {
+		if ((1 << sk->sk_state) &
+		    (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
+			sock->state = SS_DISCONNECTING;
+		else
+			sock->state = SS_CONNECTED;
+	}
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+		err = -ENOTCONN;
+		/* Hack to wake up other listeners, who can poll for
+		   EPOLLHUP, even on eg. unconnected UDP sockets -- RR */
+		fallthrough;
+	default:
+		/* Record the requested directions and let the protocol act. */
+		WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how);
+		if (sk->sk_prot->shutdown)
+			sk->sk_prot->shutdown(sk, how);
+		break;
+
+	/* Remaining two branches are temporary solution for missing
+	 * close() in multithreaded environment. It is _not_ a good idea,
+	 * but we have no choice until close() is repaired at VFS level.
+	 */
+	case TCP_LISTEN:
+		/* A listener is only torn down when receive is being shut. */
+		if (!(how & RCV_SHUTDOWN))
+			break;
+		fallthrough;
+	case TCP_SYN_SENT:
+		err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
+		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+		break;
+	}
+
+	/* Wake up anyone sleeping in poll. */
+	sk->sk_state_change(sk);
+	release_sock(sk);
+	return err;
+}
+EXPORT_SYMBOL(inet_shutdown);
+
+/*
+ *	ioctl() calls you can issue on an INET socket. Most of these are
+ *	device configuration and stuff and very rarely used. Some ioctls
+ *	pass on to the socket itself.
+ *
+ *	NOTE: I like the idea of a module for the config stuff. ie ifconfig
+ *	loads the devconfigure module does its configuring and unloads it.
+ *	There's a good 20K of config code hanging around the kernel.
+ *
+ *	Routing commands go to ip_rt_ioctl(), ARP commands to arp_ioctl(),
+ *	interface address/flag commands to devinet_ioctl(); anything else is
+ *	offered to the protocol via sk_ioctl(), or rejected with -ENOIOCTLCMD
+ *	when the protocol has no ->ioctl.  -EFAULT is returned when the
+ *	argument cannot be copied from or back to user space.
+ */
+
+int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	void __user *p = (void __user *)arg;
+	struct rtentry rt;
+	struct ifreq ifr;
+	int err;
+
+	switch (cmd) {
+	case SIOCADDRT:
+	case SIOCDELRT:
+		if (copy_from_user(&rt, p, sizeof(rt)))
+			return -EFAULT;
+		return ip_rt_ioctl(net, cmd, &rt);
+
+	case SIOCRTMSG:
+		return -EINVAL;
+
+	case SIOCDARP:
+	case SIOCGARP:
+	case SIOCSARP:
+		return arp_ioctl(net, cmd, p);
+
+	/* "Get" requests: the ifreq is copied back on success. */
+	case SIOCGIFADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCGIFNETMASK:
+	case SIOCGIFDSTADDR:
+	case SIOCGIFPFLAGS:
+		if (get_user_ifreq(&ifr, NULL, p))
+			return -EFAULT;
+		err = devinet_ioctl(net, cmd, &ifr);
+		if (!err && put_user_ifreq(&ifr, p))
+			err = -EFAULT;
+		return err;
+
+	/* "Set" requests: nothing is copied back. */
+	case SIOCSIFADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCSIFNETMASK:
+	case SIOCSIFDSTADDR:
+	case SIOCSIFPFLAGS:
+	case SIOCSIFFLAGS:
+		if (get_user_ifreq(&ifr, NULL, p))
+			return -EFAULT;
+		return devinet_ioctl(net, cmd, &ifr);
+
+	default:
+		if (!sk->sk_prot->ioctl)
+			return -ENOIOCTLCMD;
+		return sk_ioctl(sk, cmd, p);
+	}
+}
+EXPORT_SYMBOL(inet_ioctl);
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat path for SIOCADDRT/SIOCDELRT: rebuilds a native struct rtentry
+ * from the user's struct compat_rtentry and passes it to ip_rt_ioctl().
+ * Returns -EFAULT if any field cannot be read from user space.
+ */
+static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd,
+		struct compat_rtentry __user *ur)
+{
+	struct rtentry rt;
+	compat_uptr_t dev_uptr;
+
+	/* One copy covers the three struct sockaddr members from rt_dst on. */
+	if (copy_from_user(&rt.rt_dst, &ur->rt_dst,
+			   3 * sizeof(struct sockaddr)))
+		return -EFAULT;
+
+	if (get_user(rt.rt_flags, &ur->rt_flags) ||
+	    get_user(rt.rt_metric, &ur->rt_metric) ||
+	    get_user(rt.rt_mtu, &ur->rt_mtu) ||
+	    get_user(rt.rt_window, &ur->rt_window) ||
+	    get_user(rt.rt_irtt, &ur->rt_irtt) ||
+	    get_user(dev_uptr, &ur->rt_dev))
+		return -EFAULT;
+
+	rt.rt_dev = compat_ptr(dev_uptr);
+
+	return ip_rt_ioctl(sock_net(sk), cmd, &rt);
+}
+
+/*
+ * proto_ops ->compat_ioctl for INET sockets.  Routing add/delete requests
+ * are translated by inet_compat_routing_ioctl(); everything else goes to
+ * the protocol's ->compat_ioctl(), or fails with -ENOIOCTLCMD if the
+ * protocol has none.
+ */
+static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	void __user *argp = compat_ptr(arg);
+
+	if (cmd == SIOCADDRT || cmd == SIOCDELRT)
+		return inet_compat_routing_ioctl(sk, cmd, argp);
+
+	if (!sk->sk_prot->compat_ioctl)
+		return -ENOIOCTLCMD;
+
+	return sk->sk_prot->compat_ioctl(sk, cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
+
+/* Socket-layer operations for the SOCK_STREAM entry of inetsw_array[]. */
+const struct proto_ops inet_stream_ops = {
+	.family = PF_INET,
+	.owner = THIS_MODULE,
+	.release = inet_release,
+	.bind = inet_bind,
+	.connect = inet_stream_connect,
+	.socketpair = sock_no_socketpair,
+	.accept = inet_accept,
+	.getname = inet_getname,
+	.poll = tcp_poll,
+	.ioctl = inet_ioctl,
+	.gettstamp = sock_gettstamp,
+	.listen = inet_listen,
+	.shutdown = inet_shutdown,
+	.setsockopt = sock_common_setsockopt,
+	.getsockopt = sock_common_getsockopt,
+	.sendmsg = inet_sendmsg,
+	.recvmsg = inet_recvmsg,
+#ifdef CONFIG_MMU
+	.mmap = tcp_mmap,
+#endif
+	.splice_eof = inet_splice_eof,
+	.splice_read = tcp_splice_read,
+	.read_sock = tcp_read_sock,
+	.read_skb = tcp_read_skb,
+	.sendmsg_locked = tcp_sendmsg_locked,
+	.peek_len = tcp_peek_len,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = inet_compat_ioctl,
+#endif
+	.set_rcvlowat = tcp_set_rcvlowat,
+};
+EXPORT_SYMBOL(inet_stream_ops);
+
+/* Socket-layer operations for the UDP (SOCK_DGRAM) entry of inetsw_array[]. */
+const struct proto_ops inet_dgram_ops = {
+	.family = PF_INET,
+	.owner = THIS_MODULE,
+	.release = inet_release,
+	.bind = inet_bind,
+	.connect = inet_dgram_connect,
+	.socketpair = sock_no_socketpair,
+	.accept = sock_no_accept,
+	.getname = inet_getname,
+	.poll = udp_poll,
+	.ioctl = inet_ioctl,
+	.gettstamp = sock_gettstamp,
+	.listen = sock_no_listen,
+	.shutdown = inet_shutdown,
+	.setsockopt = sock_common_setsockopt,
+	.getsockopt = sock_common_getsockopt,
+	.sendmsg = inet_sendmsg,
+	.read_skb = udp_read_skb,
+	.recvmsg = inet_recvmsg,
+	.mmap = sock_no_mmap,
+	.splice_eof = inet_splice_eof,
+	.set_peek_off = sk_set_peek_off,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = inet_compat_ioctl,
+#endif
+};
+EXPORT_SYMBOL(inet_dgram_ops);
+
+/*
+ * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
+ * udp_poll (datagram_poll is used instead).  inetsw_array[] uses this table
+ * for both its IPPROTO_ICMP datagram entry and its SOCK_RAW entry.
+ */
+static const struct proto_ops inet_sockraw_ops = {
+	.family = PF_INET,
+	.owner = THIS_MODULE,
+	.release = inet_release,
+	.bind = inet_bind,
+	.connect = inet_dgram_connect,
+	.socketpair = sock_no_socketpair,
+	.accept = sock_no_accept,
+	.getname = inet_getname,
+	.poll = datagram_poll,
+	.ioctl = inet_ioctl,
+	.gettstamp = sock_gettstamp,
+	.listen = sock_no_listen,
+	.shutdown = inet_shutdown,
+	.setsockopt = sock_common_setsockopt,
+	.getsockopt = sock_common_getsockopt,
+	.sendmsg = inet_sendmsg,
+	.recvmsg = inet_recvmsg,
+	.mmap = sock_no_mmap,
+	.splice_eof = inet_splice_eof,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = inet_compat_ioctl,
+#endif
+};
+
+/* PF_INET family descriptor: new sockets are created by inet_create(). */
+static const struct net_proto_family inet_family_ops = {
+	.family = PF_INET,
+	.create = inet_create,
+	.owner	= THIS_MODULE,
+};
+
+/* Upon startup we insert all the elements in inetsw_array[] into
+ * the linked list inetsw.
+ *
+ * Each entry ties a (socket type, protocol) pair to its struct proto and
+ * its proto_ops table.
+ */
+static struct inet_protosw inetsw_array[] = {
+	{
+		.type = SOCK_STREAM,
+		.protocol = IPPROTO_TCP,
+		.prot = &tcp_prot,
+		.ops = &inet_stream_ops,
+		.flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
+	},
+	{
+		.type = SOCK_DGRAM,
+		.protocol = IPPROTO_UDP,
+		.prot = &udp_prot,
+		.ops = &inet_dgram_ops,
+		.flags = INET_PROTOSW_PERMANENT,
+	},
+	{
+		.type = SOCK_DGRAM,
+		.protocol = IPPROTO_ICMP,
+		.prot = &ping_prot,
+		.ops = &inet_sockraw_ops,
+		.flags = INET_PROTOSW_REUSE,
+	},
+	{
+		.type = SOCK_RAW,
+		.protocol = IPPROTO_IP,	/* wild card */
+		.prot = &raw_prot,
+		.ops = &inet_sockraw_ops,
+		.flags = INET_PROTOSW_REUSE,
+	}
+};
+
+#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
+
+/*
+ * Register a protocol switch entry on the inetsw list for its socket type.
+ *
+ * The request is logged and ignored when p->type is out of range or when
+ * p->protocol matches an existing permanent entry.  All work, including
+ * the logging, happens under inetsw_lock.
+ */
+void inet_register_protosw(struct inet_protosw *p)
+{
+	struct list_head *insert_after;
+	struct inet_protosw *entry;
+	int protocol = p->protocol;
+
+	spin_lock_bh(&inetsw_lock);
+
+	if (p->type >= SOCK_MAX) {
+		pr_err("Ignoring attempt to register invalid socket type %d\n",
+		       p->type);
+		goto unlock;
+	}
+
+	/* If we are trying to override a permanent protocol, bail. */
+	insert_after = &inetsw[p->type];
+	list_for_each_entry(entry, &inetsw[p->type], list) {
+		/* Check only the non-wild match. */
+		if (!(entry->flags & INET_PROTOSW_PERMANENT))
+			break;
+		if (entry->protocol == protocol) {
+			pr_err("Attempt to override permanent protocol %d\n",
+			       protocol);
+			goto unlock;
+		}
+		insert_after = &entry->list;
+	}
+
+	/* Add the new entry after the last permanent entry if any, so that
+	 * the new entry does not override a permanent entry when matched with
+	 * a wild-card protocol. But it is allowed to override any existing
+	 * non-permanent entry.  This means that when we remove this entry, the
+	 * system automatically returns to the old behavior.
+	 */
+	list_add_rcu(&p->list, insert_after);
+
+unlock:
+	spin_unlock_bh(&inetsw_lock);
+}
+EXPORT_SYMBOL(inet_register_protosw);
+
+/*
+ * Remove a protocol switch entry from its inetsw list.  Permanent entries
+ * are refused with an error message.  After unlinking, synchronize_net()
+ * is called before returning.
+ */
+void inet_unregister_protosw(struct inet_protosw *p)
+{
+	if (p->flags & INET_PROTOSW_PERMANENT) {
+		pr_err("Attempt to unregister permanent protocol %d\n",
+		       p->protocol);
+		return;
+	}
+
+	spin_lock_bh(&inetsw_lock);
+	list_del_rcu(&p->list);
+	spin_unlock_bh(&inetsw_lock);
+
+	synchronize_net();
+}
+EXPORT_SYMBOL(inet_unregister_protosw);
+
+/*
+ * Look up a fresh route for @sk and, if the lookup yields a different
+ * source address than the one currently stored in inet_saddr, update the
+ * bhash2 entry and rehash the socket.
+ *
+ * Returns 0 on success, or a negative errno from the route lookup,
+ * inet_bhash2_update_saddr() or __sk_prot_rehash().
+ */
+static int inet_sk_reselect_saddr(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	__be32 old_saddr = inet->inet_saddr;
+	__be32 daddr = inet->inet_daddr;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	__be32 new_saddr;
+	struct ip_options_rcu *inet_opt;
+	int err;
+
+	/* If the IP options have srr set, route to opt.faddr instead of
+	 * inet_daddr.
+	 */
+	inet_opt = rcu_dereference_protected(inet->inet_opt,
+					     lockdep_sock_is_held(sk));
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+
+	/* Query new route. */
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if,
+			      sk->sk_protocol, inet->inet_sport,
+			      inet->inet_dport, sk);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+
+	/* The lookup leaves the selected source address in the flow. */
+	new_saddr = fl4->saddr;
+
+	/* Same source address: just install the new route. */
+	if (new_saddr == old_saddr) {
+		sk_setup_caps(sk, &rt->dst);
+		return 0;
+	}
+
+	err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET);
+	if (err) {
+		/* The route was never installed on the socket; drop it. */
+		ip_rt_put(rt);
+		return err;
+	}
+
+	sk_setup_caps(sk, &rt->dst);
+
+	/* Only log the address change when ip_dynaddr is greater than 1. */
+	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) {
+		pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
+			__func__, &old_saddr, &new_saddr);
+	}
+
+	/*
+	 * XXX The only one ugly spot where we need to
+	 * XXX really change the sockets identity after
+	 * XXX it has entered the hashes. -DaveM
+	 *
+	 * Besides that, it does not check for connection
+	 * uniqueness. Wait for troubles.
+	 */
+	return __sk_prot_rehash(sk);
+}
+
+/*
+ * Make sure @sk has a usable cached route.
+ *
+ * If __sk_dst_check() still returns a route nothing is done.  Otherwise a
+ * new output route is looked up and installed.  When that lookup fails the
+ * route capabilities are cleared and, under the conditions spelled out
+ * below, a new source address is tried via inet_sk_reselect_saddr().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int inet_sk_rebuild_header(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+	__be32 daddr;
+	struct ip_options_rcu *inet_opt;
+	struct flowi4 *fl4;
+	int err;
+
+	/* Route is OK, nothing to do. */
+	if (rt)
+		return 0;
+
+	/* Reroute. */
+	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
+	daddr = inet->inet_daddr;
+	/* If the IP options have srr set, route to opt.faddr instead. */
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+	rcu_read_unlock();
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
+				   inet->inet_dport, inet->inet_sport,
+				   sk->sk_protocol, RT_CONN_FLAGS(sk),
+				   sk->sk_bound_dev_if);
+	if (!IS_ERR(rt)) {
+		err = 0;
+		sk_setup_caps(sk, &rt->dst);
+	} else {
+		err = PTR_ERR(rt);
+
+		/* Routing failed... */
+		sk->sk_route_caps = 0;
+		/*
+		 * Other protocols have to map its equivalent state to TCP_SYN_SENT.
+		 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
+		 */
+		/* The short-circuit chain below only reaches
+		 * inet_sk_reselect_saddr() when ip_dynaddr is non-zero, the
+		 * socket is in SYN_SENT and SOCK_BINDADDR_LOCK is clear.  The
+		 * call overwrites err; if it returns 0 nothing is recorded in
+		 * sk_err_soft and 0 is returned.
+		 */
+		if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) ||
+		    sk->sk_state != TCP_SYN_SENT ||
+		    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+		    (err = inet_sk_reselect_saddr(sk)) != 0)
+			WRITE_ONCE(sk->sk_err_soft, -err);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(inet_sk_rebuild_header);
+
+/*
+ * Set sk->sk_state to @state with a plain store, emitting the
+ * inet_sock_set_state tracepoint with the old and new state first.
+ */
+void inet_sk_set_state(struct sock *sk, int state)
+{
+	const int old_state = sk->sk_state;
+
+	trace_inet_sock_set_state(sk, old_state, state);
+	sk->sk_state = state;
+}
+EXPORT_SYMBOL(inet_sk_set_state);
+
+/*
+ * Like inet_sk_set_state(), but the new state is published with
+ * smp_store_release() instead of a plain store.
+ */
+void inet_sk_state_store(struct sock *sk, int newstate)
+{
+	const int old_state = sk->sk_state;
+
+	trace_inet_sock_set_state(sk, old_state, newstate);
+	smp_store_release(&sk->sk_state, newstate);
+}
+
+/*
+ * GSO segmentation for IPv4.
+ *
+ * Pulls the IP header off @skb, lets the gso_segment callback registered in
+ * inet_offloads[] for the packet's protocol do the splitting, and then
+ * rewrites the IP header of every resulting segment (id, frag_off for UDP
+ * fragmentation, tot_len, checksum).
+ *
+ * Returns the list of segments, NULL if the callback returned NULL, or an
+ * ERR_PTR(): -EINVAL for a malformed header or an invalid fixed-ID request,
+ * -EPROTONOSUPPORT when no usable callback is registered.
+ */
+struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+				 netdev_features_t features)
+{
+	bool udpfrag = false, fixedid = false, gso_partial, encap;
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	const struct net_offload *ops;
+	unsigned int offset = 0;
+	struct iphdr *iph;
+	int proto, tot_len;
+	int nhoff;
+	int ihl;
+	int id;
+
+	skb_reset_network_header(skb);
+	/* Offset of the IP header from the MAC header; reused per segment. */
+	nhoff = skb_network_header(skb) - skb_mac_header(skb);
+	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+		goto out;
+
+	iph = ip_hdr(skb);
+	ihl = iph->ihl * 4;
+	if (ihl < sizeof(*iph))
+		goto out;
+
+	/* Copy what we need out of the header before it can move. */
+	id = ntohs(iph->id);
+	proto = iph->protocol;
+
+	/* Warning: after this point, iph might be no longer valid */
+	if (unlikely(!pskb_may_pull(skb, ihl)))
+		goto out;
+	__skb_pull(skb, ihl);
+
+	/* A non-zero encap_level means an outer layer was already handled. */
+	encap = SKB_GSO_CB(skb)->encap_level > 0;
+	if (encap)
+		features &= skb->dev->hw_enc_features;
+	SKB_GSO_CB(skb)->encap_level += ihl;
+
+	skb_reset_transport_header(skb);
+
+	segs = ERR_PTR(-EPROTONOSUPPORT);
+
+	if (!skb->encapsulation || encap) {
+		udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+		fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
+
+		/* fixed ID is invalid if DF bit is not set */
+		if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
+			goto out;
+	}
+
+	ops = rcu_dereference(inet_offloads[proto]);
+	if (likely(ops && ops->callbacks.gso_segment)) {
+		segs = ops->callbacks.gso_segment(skb, features);
+		/* NULL result: put the network header offset back. */
+		if (!segs)
+			skb->network_header = skb_mac_header(skb) + nhoff - skb->head;
+	}
+
+	if (IS_ERR_OR_NULL(segs))
+		goto out;
+
+	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
+	/* Fix up the IP header of each segment in the returned list. */
+	skb = segs;
+	do {
+		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
+		if (udpfrag) {
+			/* Fragment offset is stored in units of 8 bytes; every
+			 * fragment but the last gets IP_MF.
+			 */
+			iph->frag_off = htons(offset >> 3);
+			if (skb->next)
+				iph->frag_off |= htons(IP_MF);
+			offset += skb->len - nhoff - ihl;
+			tot_len = skb->len - nhoff;
+		} else if (skb_is_gso(skb)) {
+			/* Segment is itself still GSO: advance the ID by the
+			 * number of segments it stands for.
+			 */
+			if (!fixedid) {
+				iph->id = htons(id);
+				id += skb_shinfo(skb)->gso_segs;
+			}
+
+			if (gso_partial)
+				tot_len = skb_shinfo(skb)->gso_size +
+					  SKB_GSO_CB(skb)->data_offset +
+					  skb->head - (unsigned char *)iph;
+			else
+				tot_len = skb->len - nhoff;
+		} else {
+			if (!fixedid)
+				iph->id = htons(id++);
+			tot_len = skb->len - nhoff;
+		}
+		iph->tot_len = htons(tot_len);
+		ip_send_check(iph);
+		if (encap)
+			skb_reset_inner_headers(skb);
+		skb->network_header = (u8 *)iph - skb->head;
+		skb_reset_mac_len(skb);
+	} while ((skb = skb->next));
+
+out:
+	return segs;
+}
+
+/*
+ * gso_segment callback of ipip_offload: only packets whose gso_type has
+ * SKB_GSO_IPXIP4 set are passed on to inet_gso_segment(); anything else
+ * yields ERR_PTR(-EINVAL).
+ */
+static struct sk_buff *ipip_gso_segment(struct sk_buff *skb,
+					netdev_features_t features)
+{
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)
+		return inet_gso_segment(skb, features);
+
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * GRO receive handler for IPv4.
+ *
+ * Validates the IP header of @skb, compares it with every packet in @head
+ * that is still marked same_flow, records mismatches in the per-packet
+ * flush / flush_id fields, and then hands @skb to the gro_receive callback
+ * registered in inet_offloads[] for its protocol.
+ *
+ * Returns the callback's result, or NULL when the packet was rejected here
+ * (in which case it is flagged for flushing through skb_gro_flush_final()).
+ */
+struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
+{
+	const struct net_offload *ops;
+	struct sk_buff *pp = NULL;
+	const struct iphdr *iph;
+	struct sk_buff *p;
+	unsigned int hlen;
+	unsigned int off;
+	unsigned int id;
+	int flush = 1;
+	int proto;
+
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*iph);
+	iph = skb_gro_header(skb, hlen, off);
+	if (unlikely(!iph))
+		goto out;
+
+	proto = iph->protocol;
+
+	ops = rcu_dereference(inet_offloads[proto]);
+	if (!ops || !ops->callbacks.gro_receive)
+		goto out;
+
+	/* First header byte must be 0x45: version 4, 5-word header (so no
+	 * IP options).
+	 */
+	if (*(u8 *)iph != 0x45)
+		goto out;
+
+	if (ip_is_fragment(iph))
+		goto out;
+
+	if (unlikely(ip_fast_csum((u8 *)iph, 5)))
+		goto out;
+
+	NAPI_GRO_CB(skb)->proto = proto;
+	/* Load id and frag_off as one 32-bit word: id ends up in the upper
+	 * half, frag_off in the lower half.
+	 */
+	id = ntohl(*(__be32 *)&iph->id);
+	/* Non-zero if the low 16 bits of the first header word differ from
+	 * the GRO length, or if any frag_off bit other than IP_DF is set.
+	 */
+	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
+	id >>= 16;
+
+	list_for_each_entry(p, head, list) {
+		struct iphdr *iph2;
+		u16 flush_id;
+
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		iph2 = (struct iphdr *)(p->data + off);
+		/* The above works because, with the exception of the top
+		 * (inner most) layer, we only aggregate pkts with the same
+		 * hdr length so all the hdrs we'll need to verify will start
+		 * at the same offset.
+		 */
+		if ((iph->protocol ^ iph2->protocol) |
+		    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
+		    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		/* All fields must match except length and checksum. */
+		NAPI_GRO_CB(p)->flush |=
+			(iph->ttl ^ iph2->ttl) |
+			(iph->tos ^ iph2->tos) |
+			((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
+
+		NAPI_GRO_CB(p)->flush |= flush;
+
+		/* We need to store the result of the IP ID check so that it
+		 * can be included later, once we can verify that this packet
+		 * does in fact belong to a given flow.
+		 */
+		flush_id = (u16)(id - ntohs(iph2->id));
+
+		/* This bit of code makes it much easier for us to identify
+		 * the cases where we are doing atomic vs non-atomic IP ID
+		 * checks.  Specifically an atomic check can return IP ID
+		 * values 0 - 0xFFFF, while a non-atomic check can only
+		 * return 0 or 0xFFFF.
+		 */
+		if (!NAPI_GRO_CB(p)->is_atomic ||
+		    !(iph->frag_off & htons(IP_DF))) {
+			flush_id ^= NAPI_GRO_CB(p)->count;
+			flush_id = flush_id ? 0xFFFF : 0;
+		}
+
+		/* If the previous IP ID value was based on an atomic
+		 * datagram we can overwrite the value and ignore it.
+		 */
+		if (NAPI_GRO_CB(skb)->is_atomic)
+			NAPI_GRO_CB(p)->flush_id = flush_id;
+		else
+			NAPI_GRO_CB(p)->flush_id |= flush_id;
+	}
+
+	NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
+	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_set_network_header(skb, off);
+	/* The above will be needed by the transport layer if there is one
+	 * immediately following this IP hdr.
+	 */
+
+	/* Note : No need to call skb_gro_postpull_rcsum() here,
+	 * as we already checked checksum over ipv4 header was 0
+	 */
+	skb_gro_pull(skb, sizeof(*iph));
+	skb_set_transport_header(skb, skb_gro_offset(skb));
+
+	pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive,
+				       ops->callbacks.gro_receive, head, skb);
+
+out:
+	skb_gro_flush_final(skb, pp, flush);
+
+	return pp;
+}
+
+/*
+ * gro_receive callback of ipip_offload.  A packet that already carries
+ * encap_mark is flagged for flushing and not aggregated; otherwise the
+ * mark is set and the packet is handed to inet_gro_receive().
+ */
+static struct sk_buff *ipip_gro_receive(struct list_head *head,
+					struct sk_buff *skb)
+{
+	if (!NAPI_GRO_CB(skb)->encap_mark) {
+		NAPI_GRO_CB(skb)->encap_mark = 1;
+		return inet_gro_receive(head, skb);
+	}
+
+	NAPI_GRO_CB(skb)->flush = 1;
+	return NULL;
+}
+
+#define SECONDS_PER_DAY	86400
+
+/* inet_current_timestamp - Return IP network timestamp
+ *
+ * Return milliseconds since midnight in network byte order.
+ * The value is derived from the real-time clock read via
+ * ktime_get_real_ts64().
+ */
+__be32 inet_current_timestamp(void)
+{
+	struct timespec64 now;
+	u32 secs_today;
+	u32 msecs;
+
+	ktime_get_real_ts64(&now);
+
+	/* Only the remainder (seconds since midnight) is of interest. */
+	(void)div_u64_rem(now.tv_sec, SECONDS_PER_DAY, &secs_today);
+
+	/* Whole seconds plus the sub-second part, both in milliseconds. */
+	msecs = secs_today * MSEC_PER_SEC +
+		(u32)now.tv_nsec / NSEC_PER_MSEC;
+
+	/* Convert to network byte order. */
+	return htonl(msecs);
+}
+EXPORT_SYMBOL(inet_current_timestamp);
+
+/*
+ * Dispatch an error-queue receive by address family: AF_INET goes to
+ * ip_recv_error(), AF_INET6 (when IPv6 is enabled) to the handler in
+ * pingv6_ops.  Any other family yields -EINVAL.
+ */
+int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
+{
+	switch (sk->sk_family) {
+	case AF_INET:
+		return ip_recv_error(sk, msg, len, addr_len);
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len);
+#endif
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(inet_recv_error);
+
+/*
+ * GRO completion for IPv4: rewrites the total length of the IP header at
+ * @nhoff to cover the merged packet, patches the header checksum to match,
+ * and calls the gro_complete callback registered for the packet's
+ * protocol.  Returns that callback's result, or -ENOSYS (with a warning)
+ * when no callback is registered.
+ */
+int inet_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
+	const __be16 old_totlen = iph->tot_len;
+	const int proto = iph->protocol;
+	const struct net_offload *ops;
+
+	if (skb->encapsulation) {
+		skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP));
+		skb_set_inner_network_header(skb, nhoff);
+	}
+
+	/* Update the length first, then fold the change into the checksum. */
+	iph_set_totlen(iph, skb->len - nhoff);
+	csum_replace2(&iph->check, old_totlen, iph->tot_len);
+
+	ops = rcu_dereference(inet_offloads[proto]);
+	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+		return -ENOSYS;
+
+	/* Only need to add sizeof(*iph) to get to the next hdr below
+	 * because any hdr with option will have been flushed in
+	 * inet_gro_receive().
+	 */
+	return INDIRECT_CALL_2(ops->callbacks.gro_complete,
+			       tcp4_gro_complete, udp4_gro_complete,
+			       skb, nhoff + sizeof(*iph));
+}
+
+/*
+ * gro_complete callback of ipip_offload: marks @skb as encapsulated, adds
+ * SKB_GSO_IPXIP4 to its gso_type and finishes with inet_gro_complete().
+ */
+static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct skb_shared_info *shinfo;
+
+	skb->encapsulation = 1;
+
+	shinfo = skb_shinfo(skb);
+	shinfo->gso_type |= SKB_GSO_IPXIP4;
+
+	return inet_gro_complete(skb, nhoff);
+}
+
+/*
+ * Create a kernel control socket and return its struct sock through @sk.
+ *
+ * On success the sock is set to allocate with GFP_ATOMIC, has
+ * sk_use_task_frag cleared and is unhashed.  Returns 0, or the error from
+ * sock_create_kern() (in which case *@sk is left untouched).
+ */
+int inet_ctl_sock_create(struct sock **sk, unsigned short family,
+			 unsigned short type, unsigned char protocol,
+			 struct net *net)
+{
+	struct socket *sock;
+	struct sock *ctl_sk;
+	int rc;
+
+	rc = sock_create_kern(net, family, type, protocol, &sock);
+	if (rc != 0)
+		return rc;
+
+	ctl_sk = sock->sk;
+	*sk = ctl_sk;
+	ctl_sk->sk_allocation = GFP_ATOMIC;
+	ctl_sk->sk_use_task_frag = false;
+	/*
+	 * Unhash it so that IP input processing does not even see it,
+	 * we do not wish this socket to see incoming packets.
+	 */
+	ctl_sk->sk_prot->unhash(ctl_sk);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
+
+/*
+ * Sum one per-CPU MIB counter (selected by @offt) over all possible CPUs.
+ */
+unsigned long snmp_fold_field(void __percpu *mib, int offt)
+{
+	unsigned long total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		total += snmp_get_cpu_field(mib, cpu, offt);
+
+	return total;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field);
+
+#if BITS_PER_LONG==32
+
+/*
+ * Read one 64-bit MIB counter of a given CPU (32-bit builds only).
+ *
+ * @offt indexes the per-CPU area as an array of u64; @syncp_offset is the
+ * byte offset of the struct u64_stats_sync inside that area.  The read is
+ * retried until u64_stats_fetch_retry() reports a consistent snapshot.
+ */
+u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt,
+			 size_t syncp_offset)
+{
+	void *base = per_cpu_ptr(mib, cpu);
+	struct u64_stats_sync *syncp;
+	const u64 *counters = base;
+	unsigned int seq;
+	u64 val;
+
+	syncp = (struct u64_stats_sync *)(base + syncp_offset);
+	do {
+		seq = u64_stats_fetch_begin(syncp);
+		val = counters[offt];
+	} while (u64_stats_fetch_retry(syncp, seq));
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(snmp_get_cpu_field64);
+
+/*
+ * Sum one 64-bit per-CPU MIB counter over all possible CPUs, reading each
+ * CPU's value through snmp_get_cpu_field64().
+ */
+u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
+{
+	u64 total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		total += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset);
+
+	return total;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
+
+#ifdef CONFIG_IP_MULTICAST
+/* Receive handler for IGMP; only built with CONFIG_IP_MULTICAST. */
+static const struct net_protocol igmp_protocol = {
+	.handler = igmp_rcv,
+};
+#endif
+
+static const struct net_protocol tcp_protocol = {
+	.handler	=	tcp_v4_rcv,
+	.err_handler	=	tcp_v4_err,
+	.no_policy	=	1,
+	.icmp_strict_tag_validation = 1,
+};
+
+static const struct net_protocol udp_protocol = {
+	.handler =	udp_rcv,
+	.err_handler =	udp_err,
+	.no_policy =	1,
+};
+
+static const struct net_protocol icmp_protocol = {
+	.handler =	icmp_rcv,
+	.err_handler =	icmp_err,
+	.no_policy =	1,
+};
+
+/*
+ * Per-namespace init: allocate the IPv4 MIB (statistics) areas of @net.
+ *
+ * All areas are per-CPU except icmpmsg_statistics, which is a single
+ * zeroed allocation.  On any allocation failure everything allocated so
+ * far is released in reverse order and -ENOMEM is returned; otherwise
+ * tcp_mib_init() is run and 0 is returned.
+ */
+static __net_init int ipv4_mib_init_net(struct net *net)
+{
+	int i;
+
+	net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
+	if (!net->mib.tcp_statistics)
+		goto err_tcp_mib;
+	net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
+	if (!net->mib.ip_statistics)
+		goto err_ip_mib;
+
+	/* Initialise the u64 stats sync object of every CPU's IP stats. */
+	for_each_possible_cpu(i) {
+		struct ipstats_mib *af_inet_stats;
+		af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
+		u64_stats_init(&af_inet_stats->syncp);
+	}
+
+	net->mib.net_statistics = alloc_percpu(struct linux_mib);
+	if (!net->mib.net_statistics)
+		goto err_net_mib;
+	net->mib.udp_statistics = alloc_percpu(struct udp_mib);
+	if (!net->mib.udp_statistics)
+		goto err_udp_mib;
+	net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
+	if (!net->mib.udplite_statistics)
+		goto err_udplite_mib;
+	net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
+	if (!net->mib.icmp_statistics)
+		goto err_icmp_mib;
+	net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
+					      GFP_KERNEL);
+	if (!net->mib.icmpmsg_statistics)
+		goto err_icmpmsg_mib;
+
+	tcp_mib_init(net);
+	return 0;
+
+	/* Unwind ladder: each label frees what was allocated before the
+	 * allocation that failed, then falls through to the earlier ones.
+	 */
+err_icmpmsg_mib:
+	free_percpu(net->mib.icmp_statistics);
+err_icmp_mib:
+	free_percpu(net->mib.udplite_statistics);
+err_udplite_mib:
+	free_percpu(net->mib.udp_statistics);
+err_udp_mib:
+	free_percpu(net->mib.net_statistics);
+err_net_mib:
+	free_percpu(net->mib.ip_statistics);
+err_ip_mib:
+	free_percpu(net->mib.tcp_statistics);
+err_tcp_mib:
+	return -ENOMEM;
+}
+
+/*
+ * Per-namespace exit: release every MIB area set up by ipv4_mib_init_net(),
+ * plus the MPTCP statistics when CONFIG_MPTCP is enabled.
+ */
+static __net_exit void ipv4_mib_exit_net(struct net *net)
+{
+	kfree(net->mib.icmpmsg_statistics);
+	free_percpu(net->mib.icmp_statistics);
+	free_percpu(net->mib.udplite_statistics);
+	free_percpu(net->mib.udp_statistics);
+	free_percpu(net->mib.net_statistics);
+	free_percpu(net->mib.ip_statistics);
+	free_percpu(net->mib.tcp_statistics);
+#ifdef CONFIG_MPTCP
+	/* allocated on demand, see mptcp_init_sock() */
+	free_percpu(net->mib.mptcp_statistics);
+#endif
+}
+
+/* Per-namespace hooks that allocate and free the IPv4 MIB areas. */
+static __net_initdata struct pernet_operations ipv4_mib_ops = {
+	.init = ipv4_mib_init_net,
+	.exit = ipv4_mib_exit_net,
+};
+
+/* Register ipv4_mib_ops; returns register_pernet_subsys()'s result. */
+static int __init init_ipv4_mibs(void)
+{
+	return register_pernet_subsys(&ipv4_mib_ops);
+}
+
+/*
+ * Per-namespace init: set the default local port range, the default ping
+ * group range and the defaults of several IPv4 sysctl values in @net.
+ * Always returns 0.
+ */
+static __net_init int inet_init_net(struct net *net)
+{
+	/*
+	 * Set defaults for local port range
+	 */
+	seqlock_init(&net->ipv4.ip_local_ports.lock);
+	net->ipv4.ip_local_ports.range[0] =  32768;
+	net->ipv4.ip_local_ports.range[1] =  60999;
+
+	seqlock_init(&net->ipv4.ping_group_range.lock);
+	/*
+	 * Sane defaults - nobody may create ping sockets.
+	 * Boot scripts should set this to distro-specific group.
+	 */
+	/* The lower bound (1) is above the upper bound (0). */
+	net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
+	net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+
+	/* Default values for sysctl-controlled parameters.
+	 * We set them here, in case sysctl is not compiled.
+	 */
+	net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+	net->ipv4.sysctl_ip_fwd_update_priority = 1;
+	net->ipv4.sysctl_ip_dynaddr = 0;
+	net->ipv4.sysctl_ip_early_demux = 1;
+	net->ipv4.sysctl_udp_early_demux = 1;
+	net->ipv4.sysctl_tcp_early_demux = 1;
+	net->ipv4.sysctl_nexthop_compat_mode = 1;
+#ifdef CONFIG_SYSCTL
+	net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
+#endif
+
+	/* Some igmp sysctl, whose values are always used */
+	net->ipv4.sysctl_igmp_max_memberships = 20;
+	net->ipv4.sysctl_igmp_max_msf = 10;
+	/* IGMP reports for link-local multicast groups are enabled by default */
+	net->ipv4.sysctl_igmp_llm_reports = 1;
+	net->ipv4.sysctl_igmp_qrv = 2;
+
+	net->ipv4.sysctl_fib_notify_on_flag_change = 0;
+
+	return 0;
+}
+
+/* Per-namespace hook applying the defaults in inet_init_net(); no .exit. */
+static __net_initdata struct pernet_operations af_inet_ops = {
+	.init = inet_init_net,
+};
+
+/* Register af_inet_ops; returns register_pernet_subsys()'s result. */
+static int __init init_inet_pernet_ops(void)
+{
+	return register_pernet_subsys(&af_inet_ops);
+}
+
+static int ipv4_proc_init(void);
+
+/*
+ *	IP protocol layer initialiser
+ */
+
+static struct packet_offload ip_packet_offload __read_mostly = {
+	.type = cpu_to_be16(ETH_P_IP),	/* registered by ipv4_offload_init() */
+	.callbacks = {
+		.gso_segment = inet_gso_segment,
+		.gro_receive = inet_gro_receive,
+		.gro_complete = inet_gro_complete,
+	},
+};
+
+static const struct net_offload ipip_offload = {	/* added for IPPROTO_IPIP */
+	.callbacks = {
+		.gso_segment	= ipip_gso_segment,
+		.gro_receive	= ipip_gro_receive,
+		.gro_complete	= ipip_gro_complete,
+	},
+};
+
+static int __init ipip_offload_init(void)
+{
+	return inet_add_offload(&ipip_offload, IPPROTO_IPIP);	/* caller only logs a failure */
+}
+
+static int __init ipv4_offload_init(void)
+{
+	/*
+	 * Add offloads; a failure is logged but never fails the initcall.
+	 */
+	if (udpv4_offload_init() < 0)
+		pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
+	if (tcpv4_offload_init() < 0)
+		pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
+	if (ipip_offload_init() < 0)
+		pr_crit("%s: Cannot add IPIP protocol offload\n", __func__);
+
+	dev_add_offload(&ip_packet_offload);
+	return 0;
+}
+
+fs_initcall(ipv4_offload_init);
+
+static struct packet_type ip_packet_type __read_mostly = {
+	.type = cpu_to_be16(ETH_P_IP),	/* installed by dev_add_pack() in inet_init() */
+	.func = ip_rcv,
+	.list_func = ip_list_rcv,
+};
+
+static int __init inet_init(void)
+{
+	struct inet_protosw *q;
+	struct list_head *r;
+	int rc;
+
+	sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
+
+	raw_hashinfo_init(&raw_v4_hashinfo);
+	/* Only the four proto registrations below are unwound and reported on failure. */
+	rc = proto_register(&tcp_prot, 1);
+	if (rc)
+		goto out;
+
+	rc = proto_register(&udp_prot, 1);
+	if (rc)
+		goto out_unregister_tcp_proto;
+
+	rc = proto_register(&raw_prot, 1);
+	if (rc)
+		goto out_unregister_udp_proto;
+
+	rc = proto_register(&ping_prot, 1);
+	if (rc)
+		goto out_unregister_raw_proto;
+
+	/*
+	 *	Tell SOCKET that we are alive (the result is deliberately ignored).
+	 */
+
+	(void)sock_register(&inet_family_ops);
+
+#ifdef CONFIG_SYSCTL
+	ip_static_sysctl_init();
+#endif
+
+	/*
+	 *	Add all the base protocols.  Failures are logged, not fatal.
+	 */
+
+	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
+		pr_crit("%s: Cannot add ICMP protocol\n", __func__);
+	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+		pr_crit("%s: Cannot add UDP protocol\n", __func__);
+	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+		pr_crit("%s: Cannot add TCP protocol\n", __func__);
+#ifdef CONFIG_IP_MULTICAST
+	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
+		pr_crit("%s: Cannot add IGMP protocol\n", __func__);
+#endif
+
+	/* Register the socket-side information for inet_create. */
+	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
+		INIT_LIST_HEAD(r);
+
+	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
+		inet_register_protosw(q);
+
+	/*
+	 *	Set the ARP module up
+	 */
+
+	arp_init();
+
+	/*
+	 *	Set the IP module up
+	 */
+
+	ip_init();
+
+	/* Initialise per-cpu ipv4 mibs; a failure here is fatal */
+	if (init_ipv4_mibs())
+		panic("%s: Cannot init ipv4 mibs\n", __func__);
+
+	/* Setup TCP slab cache for open requests. */
+	tcp_init();
+
+	/* Setup UDP memory threshold */
+	udp_init();
+
+	/* Add UDP-Lite (RFC 3828) */
+	udplite4_register();
+
+	raw_init();
+
+	ping_init();
+
+	/*
+	 *	Set the ICMP layer up; a failure here is fatal
+	 */
+
+	if (icmp_init() < 0)
+		panic("Failed to create the ICMP control socket.\n");
+
+	/*
+	 *	Initialise the multicast router
+	 */
+#if defined(CONFIG_IP_MROUTE)
+	if (ip_mr_init())
+		pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
+#endif
+
+	if (init_inet_pernet_ops())
+		pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
+
+	ipv4_proc_init();
+
+	ipfrag_init();
+
+	dev_add_pack(&ip_packet_type);
+
+	ip_tunnel_core_init();
+
+	rc = 0;
+out:
+	return rc;
+out_unregister_raw_proto:	/* unwind in reverse order of registration */
+	proto_unregister(&raw_prot);
+out_unregister_udp_proto:
+	proto_unregister(&udp_prot);
+out_unregister_tcp_proto:
+	proto_unregister(&tcp_prot);
+	goto out;	/* rc still holds the proto_register() error */
+}
+
+fs_initcall(inet_init);
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Create the IPv4 proc entries; unwind in reverse and return -ENOMEM on error.
+ */
+static int __init ipv4_proc_init(void)
+{
+	if (raw_proc_init())
+		goto err_raw;
+	if (tcp4_proc_init())
+		goto err_tcp;
+	if (udp4_proc_init())
+		goto err_udp;
+	if (ping_proc_init())
+		goto err_ping;
+	if (ip_misc_proc_init())
+		goto err_misc;
+	return 0;
+
+err_misc:
+	ping_proc_exit();
+err_ping:
+	udp4_proc_exit();
+err_udp:
+	tcp4_proc_exit();
+err_tcp:
+	raw_proc_exit();
+err_raw:
+	return -ENOMEM;
+}
+
+#else /* CONFIG_PROC_FS */
+static int __init ipv4_proc_init(void)
+{
+	return 0;
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
new file mode 100644
index 0000000000..015c0f4ec5
--- /dev/null
+++ b/net/ipv4/ah4.c
@@ -0,0 +1,604 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <crypto/algapi.h>
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ah.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/scatterlist.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+
+struct ah_skb_cb {
+	struct xfrm_skb_cb xfrm;
+	void *tmp;	/* ah_alloc_tmp() buffer, read back and freed by the *_done callbacks */
+};
+
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+			  unsigned int size)
+{
+	unsigned int total;
+
+	/* caller's area, then the digest plus worst-case alignment padding */
+	total = size + crypto_ahash_digestsize(ahash);
+	total += crypto_ahash_alignmask(ahash) & ~(crypto_tfm_ctx_alignment() - 1);
+	total = ALIGN(total, crypto_tfm_ctx_alignment());
+
+	/* the hash request followed by its request context */
+	total += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+	total = ALIGN(total, __alignof__(struct scatterlist));
+	/* one scatterlist entry per requested fragment */
+	total += sizeof(struct scatterlist) * nfrags;
+
+	return kmalloc(total, GFP_ATOMIC);
+}
+
+static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
+{
+	return (u8 *)tmp + offset;	/* byte offset into the scratch buffer */
+}
+
+/* Locate the ICV slot: tmp + offset, rounded up to the hash's alignment. */
+static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp, unsigned int offset)
+{
+	return PTR_ALIGN(ah_tmp_auth(tmp, offset), crypto_ahash_alignmask(ahash) + 1);
+}
+
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+					       u8 *icv)
+{
+	u8 *past_digest = icv + crypto_ahash_digestsize(ahash);
+	struct ahash_request *req;
+
+	/* the request sits behind the digest, at tfm-context alignment */
+	req = (void *)PTR_ALIGN(past_digest, crypto_tfm_ctx_alignment());
+	ahash_request_set_tfm(req, ahash);
+
+	return req;
+}
+
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+					     struct ahash_request *req)
+{
+	unsigned long past_req_ctx = (unsigned long)(req + 1) + crypto_ahash_reqsize(ahash);
+
+	return (void *)ALIGN(past_req_ctx, __alignof__(struct scatterlist));
+}
+
+/* Clear the mutable options in place and, for source routing, report the
+ * final destination through *daddr so it can be substituted into the IP
+ * header for the ICV calculation.  Options were validated earlier, so only
+ * the length checks below remain; a bad length yields -EINVAL. */
+static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
+{
+	unsigned char *opt = (unsigned char *)(iph + 1);
+	int remaining = iph->ihl * 4 - sizeof(struct iphdr);
+	int optlen;
+
+	while (remaining > 0) {
+		if (*opt == IPOPT_END)
+			return 0;
+		if (*opt == IPOPT_NOOP) {
+			remaining--;
+			opt++;
+			continue;
+		}
+
+		optlen = opt[1];
+		if (optlen < 2 || optlen > remaining)
+			return -EINVAL;
+		switch (*opt) {
+		case IPOPT_SEC:
+		case 0x85:	/* extended security */
+		case IPOPT_CIPSO:
+		case IPOPT_RA:
+		case 0x80|21:	/* RFC1770 */
+			break;	/* left untouched */
+		case IPOPT_LSRR:
+		case IPOPT_SSRR:
+			if (optlen < 6)
+				return -EINVAL;
+			memcpy(daddr, opt + optlen - 4, 4);
+			fallthrough;
+		default:
+			memset(opt, 0, optlen);
+		}
+		remaining -= optlen;
+		opt += optlen;
+	}
+	return 0;
+}
+
+static void ah_output_done(void *data, int err)
+{
+	struct sk_buff *skb = data;
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+	struct ah_data *ahp = x->data;
+	struct iphdr *top_iph = ip_hdr(skb);
+	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+	struct iphdr *iph = AH_SKB_CB(skb)->tmp;
+	int ihl = ip_hdrlen(skb);
+	/* with ESN, ah_output() stored seqhi between the saved header and the ICV */
+	int seqhi_len = (x->props.flags & XFRM_STATE_ESN) ? sizeof(__be32) : 0;
+	u8 *icv = ah_tmp_icv(ahp->ahash, iph, ihl + seqhi_len);
+
+	memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+	top_iph->tos = iph->tos;
+	top_iph->ttl = iph->ttl;
+	top_iph->frag_off = iph->frag_off;
+	if (top_iph->ihl != 5) {
+		top_iph->daddr = iph->daddr;
+		memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+	}
+
+	kfree(AH_SKB_CB(skb)->tmp);
+	xfrm_output_resume(skb->sk, skb, err);
+}
+
+static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+	int nfrags;
+	int ihl;
+	u8 *icv;
+	struct sk_buff *trailer;
+	struct crypto_ahash *ahash;
+	struct ahash_request *req;
+	struct scatterlist *sg;
+	struct iphdr *iph, *top_iph;
+	struct ip_auth_hdr *ah;
+	struct ah_data *ahp;
+	int seqhi_len = 0;
+	__be32 *seqhi;
+	int sglists = 0;
+	struct scatterlist *seqhisg;
+
+	ahp = x->data;
+	ahash = ahp->ahash;
+
+	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+		goto out;
+	nfrags = err;	/* non-negative result sizes the sg table below */
+
+	skb_push(skb, -skb_network_offset(skb));
+	ah = ip_auth_hdr(skb);
+	ihl = ip_hdrlen(skb);
+	/* scratch buffer layout: saved header | seqhi (ESN only) | icv | request | sg[] */
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists = 1;
+		seqhi_len = sizeof(*seqhi);
+	}
+	err = -ENOMEM;
+	iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
+	if (!iph)
+		goto out;
+	seqhi = (__be32 *)((char *)iph + ihl);
+	icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
+	req = ah_tmp_req(ahash, icv);
+	sg = ah_req_sg(ahash, req);
+	seqhisg = sg + nfrags;
+
+	memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+	top_iph = ip_hdr(skb);
+	/* stash the mutable fields; they are zeroed for the digest and restored after */
+	iph->tos = top_iph->tos;
+	iph->ttl = top_iph->ttl;
+	iph->frag_off = top_iph->frag_off;
+
+	if (top_iph->ihl != 5) {
+		iph->daddr = top_iph->daddr;
+		memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+		err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
+		if (err)
+			goto out_free;
+	}
+
+	ah->nexthdr = *skb_mac_header(skb);
+	*skb_mac_header(skb) = IPPROTO_AH;
+
+	top_iph->tos = 0;
+	top_iph->tot_len = htons(skb->len);
+	top_iph->frag_off = 0;
+	top_iph->ttl = 0;
+	top_iph->check = 0;
+
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		ah->hdrlen  = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	else
+		ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+
+	ah->reserved = 0;
+	ah->spi = x->id.spi;
+	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+	sg_init_table(sg, nfrags + sglists);
+	err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	if (unlikely(err < 0))
+		goto out_free;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		/* Attach seqhi sg right after packet payload */
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_set_buf(seqhisg, seqhi, seqhi_len);
+	}
+	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+	ahash_request_set_callback(req, 0, ah_output_done, skb);
+
+	AH_SKB_CB(skb)->tmp = iph;
+
+	err = crypto_ahash_digest(req);
+	if (err) {
+		if (err == -EINPROGRESS)
+			goto out;	/* ah_output_done() completes and frees tmp */
+
+		if (err == -ENOSPC)
+			err = NET_XMIT_DROP;
+		goto out_free;
+	}
+
+	memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+	top_iph->tos = iph->tos;
+	top_iph->ttl = iph->ttl;
+	top_iph->frag_off = iph->frag_off;
+	if (top_iph->ihl != 5) {
+		top_iph->daddr = iph->daddr;
+		memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+	}
+
+out_free:
+	kfree(iph);
+out:
+	return err;
+}
+
+static void ah_input_done(void *data, int err)
+{
+	struct sk_buff *skb = data;
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ah_data *ahp = x->data;
+	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+	struct iphdr *work_iph = AH_SKB_CB(skb)->tmp;
+	int ihl = ip_hdrlen(skb);
+	int ah_hlen = (ah->hdrlen + 2) << 2;
+	int seqhi_len = (x->props.flags & XFRM_STATE_ESN) ? sizeof(__be32) : 0;
+	u8 *auth_data, *icv;
+
+	if (err)
+		goto out;
+
+	/* mirror ah_input(): with ESN, seqhi sits between the saved header and auth_data */
+	auth_data = ah_tmp_auth(work_iph, ihl + seqhi_len);
+	icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+
+	err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
+	if (err)
+		goto out;
+
+	err = ah->nexthdr;
+
+	skb->network_header += ah_hlen;
+	memcpy(skb_network_header(skb), work_iph, ihl);
+	__skb_pull(skb, ah_hlen + ihl);
+
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		skb_reset_transport_header(skb);
+	else
+		skb_set_transport_header(skb, -ihl);
+out:
+	kfree(AH_SKB_CB(skb)->tmp);
+	xfrm_input_resume(skb, err);
+}
+
+static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int ah_hlen;
+	int ihl;
+	int nexthdr;
+	int nfrags;
+	u8 *auth_data;
+	u8 *icv;
+	struct sk_buff *trailer;
+	struct crypto_ahash *ahash;
+	struct ahash_request *req;
+	struct scatterlist *sg;
+	struct iphdr *iph, *work_iph;
+	struct ip_auth_hdr *ah;
+	struct ah_data *ahp;
+	int err = -ENOMEM;
+	int seqhi_len = 0;
+	__be32 *seqhi;
+	int sglists = 0;
+	struct scatterlist *seqhisg;
+
+	if (!pskb_may_pull(skb, sizeof(*ah)))
+		goto out;
+
+	ah = (struct ip_auth_hdr *)skb->data;
+	ahp = x->data;
+	ahash = ahp->ahash;
+
+	nexthdr = ah->nexthdr;
+	ah_hlen = (ah->hdrlen + 2) << 2;
+
+	if (x->props.flags & XFRM_STATE_ALIGN4) {
+		if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	} else {
+		if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	}
+
+	if (!pskb_may_pull(skb, ah_hlen))
+		goto out;
+
+	/* We are going to _remove_ AH header to keep sockets happy,
+	 * so... Later this can change. */
+	if (skb_unclone(skb, GFP_ATOMIC))
+		goto out;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* on success skb_cow_data() yields the count used to size the sg table */
+	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+		goto out;
+	nfrags = err;
+
+	ah = (struct ip_auth_hdr *)skb->data;
+	iph = ip_hdr(skb);
+	ihl = ip_hdrlen(skb);
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists = 1;
+		seqhi_len = sizeof(*seqhi);
+	}
+
+	work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
+				ahp->icv_trunc_len + seqhi_len);
+	if (!work_iph) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	seqhi = (__be32 *)((char *)work_iph + ihl);
+	auth_data = ah_tmp_auth(seqhi, seqhi_len);
+	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
+	req = ah_tmp_req(ahash, icv);
+	sg = ah_req_sg(ahash, req);
+	seqhisg = sg + nfrags;
+
+	memcpy(work_iph, iph, ihl);
+	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+	memset(ah->auth_data, 0, ahp->icv_trunc_len);
+	/* zero the mutable fields for the digest; work_iph keeps the original header */
+	iph->ttl = 0;
+	iph->tos = 0;
+	iph->frag_off = 0;
+	iph->check = 0;
+	if (ihl > sizeof(*iph)) {
+		__be32 dummy;
+		err = ip_clear_mutable_options(iph, &dummy);
+		if (err)
+			goto out_free;
+	}
+
+	skb_push(skb, ihl);
+
+	sg_init_table(sg, nfrags + sglists);
+	err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	if (unlikely(err < 0))
+		goto out_free;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		/* Attach seqhi sg right after packet payload */
+		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+		sg_set_buf(seqhisg, seqhi, seqhi_len);
+	}
+	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+	ahash_request_set_callback(req, 0, ah_input_done, skb);
+
+	AH_SKB_CB(skb)->tmp = work_iph;
+
+	err = crypto_ahash_digest(req);
+	if (err) {
+		if (err == -EINPROGRESS)
+			goto out;	/* ah_input_done() completes and frees work_iph */
+
+		goto out_free;
+	}
+
+	err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
+	if (err)
+		goto out_free;
+
+	skb->network_header += ah_hlen;
+	memcpy(skb_network_header(skb), work_iph, ihl);
+	__skb_pull(skb, ah_hlen + ihl);
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		skb_reset_transport_header(skb);
+	else
+		skb_set_transport_header(skb, -ihl);
+
+	err = nexthdr;
+
+out_free:
+	kfree (work_iph);
+out:
+	return err;
+}
+
+static int ah4_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	const struct icmphdr *icmph = icmp_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+	bool frag_needed = icmph->type == ICMP_DEST_UNREACH;
+	struct ip_auth_hdr *ah;
+	struct xfrm_state *x;
+
+	/* Only "fragmentation needed" errors and redirects are acted upon. */
+	if (frag_needed) {
+		if (icmph->code != ICMP_FRAG_NEEDED)
+			return 0;
+	} else if (icmph->type != ICMP_REDIRECT) {
+		return 0;
+	}
+
+	ah = (struct ip_auth_hdr *)(skb->data + (iph->ihl << 2));
+	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+			      ah->spi, IPPROTO_AH, AF_INET);
+	if (!x)
+		return 0;
+
+	if (frag_needed)
+		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH);
+	else
+		ipv4_redirect(skb, net, 0, IPPROTO_AH);
+	xfrm_state_put(x);
+
+	return 0;
+}
+
+static int ah_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
+{
+	struct ah_data *ahp = NULL;	/* stays NULL until allocated so "error" can tell */
+	struct xfrm_algo_desc *aalg_desc;
+	struct crypto_ahash *ahash;
+
+	if (!x->aalg) {
+		NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm");
+		goto error;
+	}
+
+	if (x->encap) {
+		NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation");
+		goto error;
+	}
+
+	ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
+	if (!ahp)
+		return -ENOMEM;
+
+	ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+	if (IS_ERR(ahash)) {
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+		goto error;
+	}
+
+	ahp->ahash = ahash;
+	if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+				(x->aalg->alg_key_len + 7) / 8)) {	/* alg_key_len / 8, rounded up */
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+		goto error;
+	}
+
+	/*
+	 * Lookup the algorithm description maintained by xfrm_algo,
+	 * verify crypto transform properties, and store information
+	 * we need for AH processing.  This lookup cannot fail here
+	 * after a successful crypto_alloc_ahash().
+	 */
+	aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+	BUG_ON(!aalg_desc);
+
+	if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+	    crypto_ahash_digestsize(ahash)) {
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+		goto error;
+	}
+	/* both ICV lengths are stored as the configured values divided by 8 */
+	ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+	ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
+
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
+	else
+		x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		x->props.header_len += sizeof(struct iphdr);
+	x->data = ahp;
+
+	return 0;
+
+error:
+	if (ahp) {
+		/* NOTE(review): ahash is still NULL if crypto_alloc_ahash() failed — confirm NULL is accepted */
+		crypto_free_ahash(ahp->ahash);
+		kfree(ahp);
+	}
+	return -EINVAL;	/* every failure except the kzalloc() one maps to -EINVAL */
+}
+
+static void ah_destroy(struct xfrm_state *x)
+{
+	struct ah_data *ahp = x->data;
+
+	/* x->data is only set once ah_init_state() has fully succeeded */
+	if (ahp) {
+		crypto_free_ahash(ahp->ahash);
+		kfree(ahp);
+	}
+}
+
+static int ah4_rcv_cb(struct sk_buff *skb, int err)
+{
+	return 0;	/* nothing to do; installed as ah4_protocol.cb_handler */
+}
+
+/* AH transform descriptor; registered for AF_INET by ah4_init(). */
+static const struct xfrm_type ah_type = {
+	.owner		= THIS_MODULE,
+	.proto		= IPPROTO_AH,
+	.flags		= XFRM_TYPE_REPLAY_PROT,
+	.init_state	= ah_init_state,
+	.destructor	= ah_destroy,
+	.input		= ah_input,
+	.output		= ah_output,
+};
+
+static struct xfrm4_protocol ah4_protocol = {
+	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	ah4_rcv_cb,	/* no-op, always returns 0 */
+	.err_handler	=	ah4_err,	/* acts on frag-needed and redirect ICMP only */
+	.priority	=	0,
+};
+
+static int __init ah4_init(void)
+{
+	if (xfrm_register_type(&ah_type, AF_INET) < 0) {
+		pr_info("%s: can't add xfrm type\n", __func__);
+		return -EAGAIN;
+	}
+	if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) >= 0)
+		return 0;
+	/* undo the type registration so a failed load leaves nothing behind */
+	pr_info("%s: can't add protocol\n", __func__);
+	xfrm_unregister_type(&ah_type, AF_INET);
+	return -EAGAIN;
+}
+
+static void __exit ah4_fini(void)
+{
+	if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
+		pr_info("%s: can't remove protocol\n", __func__);
+	xfrm_unregister_type(&ah_type, AF_INET);	/* reverse order of ah4_init() */
+}
+
+module_init(ah4_init);
+module_exit(ah4_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
new file mode 100644
index 0000000000..9456f5bb35
--- /dev/null
+++ b/net/ipv4/arp.c
@@ -0,0 +1,1472 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* linux/net/ipv4/arp.c
+ *
+ * Copyright (C) 1994 by Florian  La Roche
+ *
+ * This module implements the Address Resolution Protocol ARP (RFC 826),
+ * which is used to convert IP addresses (or in the future maybe other
+ * high-level addresses) into a low-level hardware address (like an Ethernet
+ * address).
+ *
+ * Fixes:
+ *		Alan Cox	:	Removed the Ethernet assumptions in
+ *					Florian's code
+ *		Alan Cox	:	Fixed some small errors in the ARP
+ *					logic
+ *		Alan Cox	:	Allow >4K in /proc
+ *		Alan Cox	:	Make ARP add its own protocol entry
+ *		Ross Martin     :       Rewrote arp_rcv() and arp_get_info()
+ *		Stephen Henson	:	Add AX25 support to arp_get_info()
+ *		Alan Cox	:	Drop data when a device is downed.
+ *		Alan Cox	:	Use init_timer().
+ *		Alan Cox	:	Double lock fixes.
+ *		Martin Seine	:	Move the arphdr structure
+ *					to if_arp.h for compatibility.
+ *					with BSD based programs.
+ *		Andrew Tridgell :       Added ARP netmask code and
+ *					re-arranged proxy handling.
+ *		Alan Cox	:	Changed to use notifiers.
+ *		Niibe Yutaka	:	Reply for this device or proxies only.
+ *		Alan Cox	:	Don't proxy across hardware types!
+ *		Jonathan Naylor :	Added support for NET/ROM.
+ *		Mike Shaver     :       RFC1122 checks.
+ *		Jonathan Naylor :	Only lookup the hardware address for
+ *					the correct hardware type.
+ *		Germano Caronni	:	Assorted subtle races.
+ *		Craig Schlenter :	Don't modify permanent entry
+ *					during arp_rcv.
+ *		Russ Nelson	:	Tidied up a few bits.
+ *		Alexey Kuznetsov:	Major changes to caching and behaviour,
+ *					eg intelligent arp probing and
+ *					generation
+ *					of host down events.
+ *		Alan Cox	:	Missing unlock in device events.
+ *		Eckes		:	ARP ioctl control errors.
+ *		Alexey Kuznetsov:	Arp free fix.
+ *		Manuel Rodriguez:	Gratuitous ARP.
+ *              Jonathan Layes  :       Added arpd support through kerneld
+ *                                      message queue (960314)
+ *		Mike Shaver	:	/proc/sys/net/ipv4/arp_* support
+ *		Mike McLagan    :	Routing by source
+ *		Stuart Cheshire	:	Metricom and grat arp fixes
+ *					*** FOR 2.1 clean this up ***
+ *		Lawrence V. Stefani: (08/12/96) Added FDDI support.
+ *		Alan Cox	:	Took the AP1000 nasty FDDI hack and
+ *					folded into the mainstream FDDI code.
+ *					Ack spit, Linus how did you allow that
+ *					one in...
+ *		Jes Sorensen	:	Make FDDI work again in 2.1.x and
+ *					clean up the APFDDI & gen. FDDI bits.
+ *		Alexey Kuznetsov:	new arp state machine;
+ *					now it is in net/core/neighbour.c.
+ *		Krzysztof Halasa:	Added Frame Relay ARP support.
+ *		Arnaldo C. Melo :	convert /proc/net/arp to seq_file
+ *		Shmulik Hen:		Split arp_send to arp_create and
+ *					arp_xmit so intermediate drivers like
+ *					bonding can change the skb before
+ *					sending (e.g. insert 8021q tag).
+ *		Harald Welte	:	convert to make use of jenkins hash
+ *		Jesper D. Brouer:       Proxy ARP PVLAN RFC 3069 support.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/fddidevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/ax25.h>
+#include <net/netrom.h>
+#include <net/dst_metadata.h>
+#include <net/ip_tunnels.h>
+
+#include <linux/uaccess.h>
+
+#include <linux/netfilter_arp.h>
+
+/*
+ *	Interface to generic neighbour cache.
+ */
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
+static bool arp_key_eq(const struct neighbour *n, const void *pkey);
+static int arp_constructor(struct neighbour *neigh);
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
+static void parp_redo(struct sk_buff *skb);
+static int arp_is_multicast(const void *pkey);
+
+static const struct neigh_ops arp_generic_ops = {	/* header_ops without ->cache */
+	.family =		AF_INET,
+	.solicit =		arp_solicit,
+	.error_report =		arp_error_report,
+	.output =		neigh_resolve_output,
+	.connected_output =	neigh_connected_output,
+};
+
+static const struct neigh_ops arp_hh_ops = {	/* chosen when header_ops->cache is set */
+	.family =		AF_INET,
+	.solicit =		arp_solicit,
+	.error_report =		arp_error_report,
+	.output =		neigh_resolve_output,
+	.connected_output =	neigh_resolve_output,
+};
+
+static const struct neigh_ops arp_direct_ops = {	/* devices without header_ops */
+	.family =		AF_INET,
+	.output =		neigh_direct_output,
+	.connected_output =	neigh_direct_output,
+};
+
+struct neigh_table arp_tbl = {
+	.family		= AF_INET,
+	.key_len	= 4,	/* keys are read as __be32 by arp_constructor() */
+	.protocol	= cpu_to_be16(ETH_P_IP),
+	.hash		= arp_hash,
+	.key_eq		= arp_key_eq,
+	.constructor	= arp_constructor,
+	.proxy_redo	= parp_redo,
+	.is_multicast	= arp_is_multicast,
+	.id		= "arp_cache",
+	.parms		= {
+		.tbl			= &arp_tbl,
+		.reachable_time		= 30 * HZ,
+		.data	= {
+			[NEIGH_VAR_MCAST_PROBES] = 3,
+			[NEIGH_VAR_UCAST_PROBES] = 3,
+			[NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
+			[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
+			[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+			[NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
+			[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
+			[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
+			[NEIGH_VAR_PROXY_QLEN] = 64,
+			[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
+			[NEIGH_VAR_PROXY_DELAY]	= (8 * HZ) / 10,
+			[NEIGH_VAR_LOCKTIME] = 1 * HZ,
+		},
+	},
+	.gc_interval	= 30 * HZ,
+	.gc_thresh1	= 128,
+	.gc_thresh2	= 512,
+	.gc_thresh3	= 1024,
+};
+EXPORT_SYMBOL(arp_tbl);
+
+int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
+{
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+	case ARPHRD_FDDI:
+	case ARPHRD_IEEE802:
+		ip_eth_mc_map(addr, haddr);
+		break;
+	case ARPHRD_INFINIBAND:
+		ip_ib_mc_map(addr, dev->broadcast, haddr);
+		break;
+	case ARPHRD_IPGRE:
+		ip_ipgre_mc_map(addr, dev->broadcast, haddr);
+		break;
+	default:	/* other link types: broadcast address, only if dir is non-zero */
+		if (!dir)
+			return -EINVAL;
+		memcpy(haddr, dev->broadcast, dev->addr_len);
+		break;
+	}
+	return 0;
+}
+
+
+/* neigh_table hash hook: thin wrapper around arp_hashfn(). */
+static u32 arp_hash(const void *pkey, const struct net_device *dev,
+		    __u32 *hash_rnd)
+{
+	return arp_hashfn(pkey, dev, hash_rnd);
+}
+
+static bool arp_key_eq(const struct neighbour *neigh, const void *pkey)
+{
+	return neigh_key_eq32(neigh, pkey);	/* arp_tbl.key_len is 4 */
+}
+
+static int arp_constructor(struct neighbour *neigh)
+{
+	__be32 addr;
+	struct net_device *dev = neigh->dev;
+	struct in_device *in_dev;
+	struct neigh_parms *parms;
+	u32 inaddr_any = INADDR_ANY;
+
+	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
+		memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len);
+	/* for loopback/point-to-point devices the key was just forced to INADDR_ANY */
+	addr = *(__be32 *)neigh->primary_key;
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr);
+	/* adopt the device's ARP parms in place of the ones neigh came with */
+	parms = in_dev->arp_parms;
+	__neigh_parms_put(neigh->parms);
+	neigh->parms = neigh_parms_clone(parms);
+	rcu_read_unlock();
+
+	if (!dev->header_ops) {
+		neigh->nud_state = NUD_NOARP;
+		neigh->ops = &arp_direct_ops;
+		neigh->output = neigh_direct_output;
+	} else {
+		/* Good devices (checked by reading texts, but only Ethernet is
+		   tested)
+
+		   ARPHRD_ETHER: (ethernet, apfddi)
+		   ARPHRD_FDDI: (fddi)
+		   ARPHRD_IEEE802: (tr)
+		   ARPHRD_METRICOM: (strip)
+		   ARPHRD_ARCNET:
+		   etc. etc. etc.
+
+		   ARPHRD_IPDDP will also work, if its author repairs it.
+		   I did not do it, because this driver does not work even
+		   in the old paradigm.
+		 */
+
+		if (neigh->type == RTN_MULTICAST) {
+			neigh->nud_state = NUD_NOARP;
+			arp_mc_map(addr, neigh->ha, dev, 1);
+		} else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
+			neigh->nud_state = NUD_NOARP;
+			memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
+		} else if (neigh->type == RTN_BROADCAST ||
+			   (dev->flags & IFF_POINTOPOINT)) {
+			neigh->nud_state = NUD_NOARP;
+			memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+		}
+
+		if (dev->header_ops->cache)
+			neigh->ops = &arp_hh_ops;
+		else
+			neigh->ops = &arp_generic_ops;
+
+		if (neigh->nud_state & NUD_VALID)
+			neigh->output = neigh->ops->connected_output;
+		else
+			neigh->output = neigh->ops->output;
+	}
+	return 0;
+}
+
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+	dst_link_failure(skb);
+	kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED);	/* the skb is consumed here */
+}
+
+/* Build an ARP packet, attach a clone of @dst to it and transmit it. */
+static void arp_send_dst(int type, int ptype, __be32 dest_ip,
+			 struct net_device *dev, __be32 src_ip,
+			 const unsigned char *dest_hw,
+			 const unsigned char *src_hw,
+			 const unsigned char *target_hw,
+			 struct dst_entry *dst)
+{
+	struct sk_buff *skb;
+
+	/* Interfaces flagged IFF_NOARP never emit ARP. */
+	if (dev->flags & IFF_NOARP)
+		return;
+
+	skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+			 dest_hw, src_hw, target_hw);
+	/* a NULL skb means nothing was built, so nothing is sent */
+	if (skb) {
+		skb_dst_set(skb, dst_clone(dst));
+		arp_xmit(skb);
+	}
+}
+
+/* Exported variant of arp_send_dst() that attaches no dst to the packet. */
+void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev,
+	      __be32 src_ip, const unsigned char *dest_hw,
+	      const unsigned char *src_hw, const unsigned char *target_hw)
+{
+	arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw,
+		     target_hw, NULL);
+}
+EXPORT_SYMBOL(arp_send);
+
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+	__be32 saddr = 0;
+	u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
+	struct net_device *dev = neigh->dev;
+	__be32 target = *(__be32 *)neigh->primary_key;
+	int probes = atomic_read(&neigh->probes);
+	struct in_device *in_dev;
+	struct dst_entry *dst = NULL;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev) {
+		rcu_read_unlock();
+		return;
+	}
+	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
+	default:
+	case 0:		/* By default announce any local IP */
+		if (skb && inet_addr_type_dev_table(dev_net(dev), dev,
+					  ip_hdr(skb)->saddr) == RTN_LOCAL)
+			saddr = ip_hdr(skb)->saddr;
+		break;
+	case 1:		/* Restrict announcements of saddr in same subnet */
+		if (!skb)
+			break;
+		saddr = ip_hdr(skb)->saddr;
+		if (inet_addr_type_dev_table(dev_net(dev), dev,
+					     saddr) == RTN_LOCAL) {
+			/* saddr should be known to target */
+			if (inet_addr_onlink(in_dev, target, saddr))
+				break;
+		}
+		saddr = 0;
+		break;
+	case 2:		/* Avoid secondary IPs, get a primary/preferred one */
+		break;
+	}
+	rcu_read_unlock();
+	/* no source taken from the triggering packet: select one for this target */
+	if (!saddr)
+		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
+	/* the first UCAST_PROBES attempts are addressed to the cached hardware address */
+	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
+	if (probes < 0) {
+		if (!(READ_ONCE(neigh->nud_state) & NUD_VALID))
+			pr_debug("trying to ucast probe in NUD_INVALID\n");
+		neigh_ha_snapshot(dst_ha, neigh, dev);
+		dst_hw = dst_ha;
+	} else {
+		probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
+		if (probes < 0) {
+			neigh_app_ns(neigh);
+			return;
+		}
+	}
+
+	if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE))
+		dst = skb_dst(skb);
+	arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+		     dst_hw, dev->dev_addr, NULL, dst);
+}
+
+static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
+{
+	struct net *net = dev_net(in_dev->dev);
+	int scope = RT_SCOPE_HOST;
+
+	/*
+	 * arp_ignore levels:
+	 *  0       reply, the tip is already validated
+	 *  1       reply only if tip is configured on the incoming interface
+	 *  2       as 1, and tip must be in the same subnet as sip
+	 *  3       do not reply for scope host addresses
+	 *  4-7     reserved, treated like 0
+	 *  8       do not reply
+	 *  other   treated like 0
+	 * Returns non-zero when the request must be ignored.
+	 */
+	switch (IN_DEV_ARP_IGNORE(in_dev)) {
+	case 1:
+		sip = 0;
+		break;
+	case 2:
+		break;
+	case 3:
+		sip = 0;
+		scope = RT_SCOPE_LINK;
+		in_dev = NULL;
+		break;
+	case 8:
+		return 1;
+	default:
+		return 0;
+	}
+
+	return !inet_confirm_addr(net, in_dev, sip, tip, scope);
+}
+
+static int arp_accept(struct in_device *in_dev, __be32 sip)
+{
+	struct net *net = dev_net(in_dev->dev);
+	int mode = IN_DEV_ARP_ACCEPT(in_dev);
+
+	/* Mode 1: always create new entries from garp. */
+	if (mode == 1)
+		return 1;
+
+	/* Mode 2: create a neighbour entry only if sip is in the same
+	 * subnet as an address configured on the interface that received
+	 * the garp message.
+	 */
+	if (mode == 2)
+		return !!inet_confirm_addr(net, in_dev, sip, 0, RT_SCOPE_LINK);
+
+	/* Mode 0 and any other value: don't create new entries from garp. */
+	return 0;
+}
+
+static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct rtable *rt;
+	bool other_dev;
+
+	/* A failed route lookup means the request is filtered (1). */
+	rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev));
+	if (IS_ERR(rt))
+		return 1;
+	/* So does a route that uses a different device; that one is counted. */
+	other_dev = rt->dst.dev != dev;
+	if (other_dev)
+		__NET_INC_STATS(net, LINUX_MIB_ARPFILTER);
+	ip_rt_put(rt);
+	return other_dev;
+}
+
+/*
+ * Decide whether a request arriving on @dev may be proxied along route @rt.
+ */
+static inline int arp_fwd_proxy(struct in_device *in_dev,
+				struct net_device *dev,	struct rtable *rt)
+{
+	struct in_device *egress;
+	int in_medium, out_medium;
+
+	/* Never proxy back out of the receiving device, or when disabled. */
+	if (rt->dst.dev == dev || !IN_DEV_PROXY_ARP(in_dev))
+		return 0;
+
+	in_medium = IN_DEV_MEDIUM_ID(in_dev);
+	switch (in_medium) {
+	case 0:		/* medium id 0: proxy unconditionally */
+		return 1;
+	case -1:	/* medium id -1: never proxy */
+		return 0;
+	}
+
+	/* place to check for proxy_arp for routes */
+
+	/* An egress device without an in_device counts as medium id -1. */
+	egress = __in_dev_get_rcu(rt->dst.dev);
+	out_medium = egress ? IN_DEV_MEDIUM_ID(egress) : -1;
+	return out_medium != in_medium && out_medium != -1;
+}
+
+/*
+ * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev)
+ *
+ * RFC3069 supports proxy arp replies back to the same interface.  This
+ * is done to support (ethernet) switch features, like RFC 3069, where
+ * the individual ports are not allowed to communicate with each
+ * other, BUT they are allowed to talk to the upstream router.  As
+ * described in RFC 3069, it is possible to allow these hosts to
+ * communicate through the upstream router, by proxy_arp'ing.
+ *
+ * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation"
+ *
+ *  This technology is known by different names:
+ *    In RFC 3069 it is called VLAN Aggregation.
+ *    Cisco and Allied Telesyn call it Private VLAN.
+ *    Hewlett-Packard call it Source-Port filtering or port-isolation.
+ *    Ericsson call it MAC-Forced Forwarding (RFC Draft).
+ *
+ */
+static inline int arp_fwd_pvlan(struct in_device *in_dev,
+				struct net_device *dev,	struct rtable *rt,
+				__be32 sip, __be32 tip)
+{
+	/*
+	 * Private VLAN proxying only applies when the route stays on the
+	 * receiving device, i.e. the reply goes back to the same segment.
+	 */
+	bool same_segment = rt->dst.dev == dev;
+
+	/* Self probes (sip == tip, often sent by Windows hosts) get no reply */
+	if (!same_segment || sip == tip)
+		return 0;
+
+	/* Otherwise IN_DEV_PROXY_ARP_PVLAN() decides */
+	return IN_DEV_PROXY_ARP_PVLAN(in_dev) ? 1 : 0;
+}
+
+/*
+ *	Interface to link layer: send routine and receive handler.
+ */
+
+/*
+ *	Create an ARP packet (the caller sends it). A NULL dest_hw selects the
+ *	device broadcast address; a NULL src_hw selects dev->dev_addr.
+ */
+struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
+			   struct net_device *dev, __be32 src_ip,
+			   const unsigned char *dest_hw,
+			   const unsigned char *src_hw,
+			   const unsigned char *target_hw)
+{
+	struct sk_buff *skb;
+	struct arphdr *arp;
+	unsigned char *arp_ptr;
+	int hlen = LL_RESERVED_SPACE(dev);
+	int tlen = dev->needed_tailroom;
+
+	/*
+	 *	Allocate a buffer; on any failure below NULL is returned
+	 */
+
+	skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, hlen);
+	skb_reset_network_header(skb);
+	arp = skb_put(skb, arp_hdr_len(dev));
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_ARP);
+	if (!src_hw)
+		src_hw = dev->dev_addr;
+	if (!dest_hw)
+		dest_hw = dev->broadcast;
+
+	/*
+	 *	Fill the device header for the ARP frame
+	 */
+	if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0)
+		goto out;
+
+	/*
+	 * Fill out the arp protocol part.
+	 *
+	 * The arp hardware type should match the device type, except for FDDI,
+	 * which (according to RFC 1390) should always equal 1 (Ethernet).
+	 */
+	/*
+	 *	Exceptions everywhere. AX.25 uses the AX.25 PID value not the
+	 *	DIX code for the protocol. Make these device structure fields.
+	 */
+	switch (dev->type) {
+	default:
+		arp->ar_hrd = htons(dev->type);
+		arp->ar_pro = htons(ETH_P_IP);
+		break;
+
+#if IS_ENABLED(CONFIG_AX25)
+	case ARPHRD_AX25:
+		arp->ar_hrd = htons(ARPHRD_AX25);
+		arp->ar_pro = htons(AX25_P_IP);
+		break;
+
+#if IS_ENABLED(CONFIG_NETROM)
+	case ARPHRD_NETROM:
+		arp->ar_hrd = htons(ARPHRD_NETROM);
+		arp->ar_pro = htons(AX25_P_IP);
+		break;
+#endif
+#endif
+
+#if IS_ENABLED(CONFIG_FDDI)
+	case ARPHRD_FDDI:
+		arp->ar_hrd = htons(ARPHRD_ETHER);
+		arp->ar_pro = htons(ETH_P_IP);
+		break;
+#endif
+	}
+
+	arp->ar_hln = dev->addr_len;
+	arp->ar_pln = 4;
+	arp->ar_op = htons(type);
+
+	arp_ptr = (unsigned char *)(arp + 1);	/* payload follows the header */
+
+	memcpy(arp_ptr, src_hw, dev->addr_len);
+	arp_ptr += dev->addr_len;
+	memcpy(arp_ptr, &src_ip, 4);
+	arp_ptr += 4;
+
+	switch (dev->type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+	case ARPHRD_IEEE1394:
+		break;	/* no target hardware address is written here */
+#endif
+	default:
+		if (target_hw)
+			memcpy(arp_ptr, target_hw, dev->addr_len);
+		else
+			memset(arp_ptr, 0, dev->addr_len);	/* not given */
+		arp_ptr += dev->addr_len;
+	}
+	memcpy(arp_ptr, &dest_ip, 4);
+
+	return skb;
+
+out:
+	kfree_skb(skb);
+	return NULL;
+}
+EXPORT_SYMBOL(arp_create);
+
+static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	return dev_queue_xmit(skb);	/* net and sk are not used */
+}
+
+/*
+ *	Send an arp packet via the NF_ARP_OUT hook on skb->dev.
+ */
+void arp_xmit(struct sk_buff *skb)
+{
+	/* Send it off, maybe filter it using firewalling first.  */
+	NF_HOOK(NFPROTO_ARP, NF_ARP_OUT,
+		dev_net(skb->dev), NULL, skb, NULL, skb->dev,
+		arp_xmit_finish);
+}
+EXPORT_SYMBOL(arp_xmit);
+
+static bool arp_is_garp(struct net *net, struct net_device *dev,
+			int *addr_type, __be16 ar_op,
+			__be32 sip, __be32 tip,
+			unsigned char *sha, unsigned char *tha)
+{
+	/* A gratuitous ARP has identical sender and target IP addresses. */
+	if (tip != sip)
+		return false;
+
+	if (ar_op == htons(ARPOP_REPLY)) {
+		/* Gratuitous ARP _replies_ also require target hwaddr to be
+		 * the same as source.  IPv4 over IEEE 1394 doesn't provide
+		 * a target hardware address field in its ARP payload, so
+		 * a missing tha disqualifies the reply.
+		 */
+		if (!tha || memcmp(tha, sha, dev->addr_len))
+			return false;
+	}
+
+	/* Only now is *addr_type written; earlier exits leave it untouched.
+	 * The sender address has to be of type RTN_UNICAST.
+	 */
+	*addr_type = inet_addr_type_dev_table(net, dev, sip);
+	return *addr_type == RTN_UNICAST;
+}
+
+/*
+ *	Process a received ARP packet (request or reply).
+ */
+
+static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	struct arphdr *arp;
+	unsigned char *arp_ptr;
+	struct rtable *rt;
+	unsigned char *sha;
+	unsigned char *tha = NULL;
+	__be32 sip, tip;
+	u16 dev_type = dev->type;
+	int addr_type;
+	struct neighbour *n;
+	struct dst_entry *reply_dst = NULL;
+	bool is_garp = false;
+
+	/* arp_rcv below verifies the ARP header and verifies the device
+	 * is ARP'able.
+	 */
+
+	if (!in_dev)
+		goto out_free_skb;
+
+	arp = arp_hdr(skb);
+
+	switch (dev_type) {
+	default:
+		if (arp->ar_pro != htons(ETH_P_IP) ||
+		    htons(dev_type) != arp->ar_hrd)
+			goto out_free_skb;
+		break;
+	case ARPHRD_ETHER:
+	case ARPHRD_FDDI:
+	case ARPHRD_IEEE802:
+		/*
+		 * ETHERNET, and Fibre Channel (which are IEEE 802
+		 * devices, according to RFC 2625) devices will accept ARP
+		 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
+		 * This is the case also of FDDI, where the RFC 1390 says that
+		 * FDDI devices should accept ARP hardware of (1) Ethernet,
+		 * however, to be more robust, we'll accept both 1 (Ethernet)
+		 * or 6 (IEEE 802.2)
+		 */
+		if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
+		     arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
+		    arp->ar_pro != htons(ETH_P_IP))
+			goto out_free_skb;
+		break;
+	case ARPHRD_AX25:
+		if (arp->ar_pro != htons(AX25_P_IP) ||
+		    arp->ar_hrd != htons(ARPHRD_AX25))
+			goto out_free_skb;
+		break;
+	case ARPHRD_NETROM:
+		if (arp->ar_pro != htons(AX25_P_IP) ||
+		    arp->ar_hrd != htons(ARPHRD_NETROM))
+			goto out_free_skb;
+		break;
+	}
+
+	/* Understand only these message types */
+
+	if (arp->ar_op != htons(ARPOP_REPLY) &&
+	    arp->ar_op != htons(ARPOP_REQUEST))
+		goto out_free_skb;
+
+/*
+ *	Extract fields: sha, sip, then (except on IEEE 1394) tha, and tip
+ */
+	arp_ptr = (unsigned char *)(arp + 1);
+	sha	= arp_ptr;
+	arp_ptr += dev->addr_len;
+	memcpy(&sip, arp_ptr, 4);
+	arp_ptr += 4;
+	switch (dev_type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+	case ARPHRD_IEEE1394:
+		break;
+#endif
+	default:
+		tha = arp_ptr;
+		arp_ptr += dev->addr_len;
+	}
+	memcpy(&tip, arp_ptr, 4);
+/*
+ *	Drop packets whose target is a multicast address, or a 127.x.x.x
+ *	one unless IN_DEV_ROUTE_LOCALNET() is set on the receiving device.
+ */
+	if (ipv4_is_multicast(tip) ||
+	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
+		goto out_free_skb;
+
+ /*
+  *	For some 802.11 wireless deployments (and possibly other networks),
+  *	there will be an ARP proxy and gratuitous ARP frames are attacks
+  *	and thus should not be accepted.
+  */
+	if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
+		goto out_free_skb;
+
+/*
+ *     Special case: We must set Frame Relay source Q.922 address
+ */
+	if (dev_type == ARPHRD_DLCI)
+		sha = dev->broadcast;
+
+/*
+ *  Process entry.  The idea here is we want to send a reply if it is a
+ *  request for us or if it is a request for someone else that we hold
+ *  a proxy for.  We want to add an entry to our cache if it is a reply
+ *  to us or if it is a request for our address.
+ *  (The assumption for this last is that if someone is requesting our
+ *  address, they are probably intending to talk to us, so it saves time
+ *  if we cache their address.  Their address is also probably not in
+ *  our cache, since ours is not in their cache.)
+ *
+ *  Putting this another way, we only care about replies if they are to
+ *  us, in which case we add them to the cache.  For requests, we care
+ *  about those for us and those for our proxies.  We reply to both,
+ *  and in the case of requests for us we add the requester to the arp
+ *  cache.
+ */
+
+	if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb))
+		reply_dst = (struct dst_entry *)
+			    iptunnel_metadata_reply(skb_metadata_dst(skb),
+						    GFP_ATOMIC);
+
+	/* Special case: IPv4 duplicate address detection packet (RFC2131) */
+	if (sip == 0) {
+		if (arp->ar_op == htons(ARPOP_REQUEST) &&
+		    inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
+		    !arp_ignore(in_dev, sip, tip))
+			arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
+				     sha, dev->dev_addr, sha, reply_dst);
+		goto out_consume_skb;
+	}
+
+	if (arp->ar_op == htons(ARPOP_REQUEST) &&
+	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+
+		rt = skb_rtable(skb);
+		addr_type = rt->rt_type;
+
+		if (addr_type == RTN_LOCAL) {
+			int dont_send;
+
+			dont_send = arp_ignore(in_dev, sip, tip);
+			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+				dont_send = arp_filter(sip, tip, dev);
+			if (!dont_send) {
+				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+				if (n) {
+					arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
+						     sip, dev, tip, sha,
+						     dev->dev_addr, sha,
+						     reply_dst);
+					neigh_release(n);
+				}
+			}
+			goto out_consume_skb;
+		} else if (IN_DEV_FORWARD(in_dev)) {
+			if (addr_type == RTN_UNICAST  &&
+			    (arp_fwd_proxy(in_dev, dev, rt) ||
+			     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
+			     (rt->dst.dev != dev &&
+			      pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
+				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+				if (n)
+					neigh_release(n);
+
+				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
+				    skb->pkt_type == PACKET_HOST ||
+				    NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
+					arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
+						     sip, dev, tip, sha,
+						     dev->dev_addr, sha,
+						     reply_dst);
+				} else {
+					pneigh_enqueue(&arp_tbl,
+						       in_dev->arp_parms, skb);
+					goto out_free_dst; /* skb not consumed here */
+				}
+				goto out_consume_skb;
+			}
+		}
+	}
+
+	/* Update our ARP tables */
+
+	n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
+
+	addr_type = -1;	/* -1: address type of sip not determined yet */
+	if (n || arp_accept(in_dev, sip)) {
+		is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
+				      sip, tip, sha, tha);
+	}
+
+	if (arp_accept(in_dev, sip)) {
+		/* Unsolicited ARP is not accepted by default.
+		   It is possible, that this option should be enabled for some
+		   devices (strip is candidate)
+		 */
+		if (!n &&
+		    (is_garp ||
+		     (arp->ar_op == htons(ARPOP_REPLY) &&
+		      (addr_type == RTN_UNICAST ||
+		       (addr_type < 0 &&
+			/* postpone calculation to as late as possible */
+			inet_addr_type_dev_table(net, dev, sip) ==
+				RTN_UNICAST)))))
+			n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
+	}
+
+	if (n) {
+		int state = NUD_REACHABLE;
+		int override;
+
+		/* If several different ARP replies follow back-to-back,
+		   use the FIRST one. It is possible, if several proxy
+		   agents are active. Taking the first reply prevents
+		   arp thrashing and chooses the fastest router.
+		 */
+		override = time_after(jiffies,
+				      n->updated +
+				      NEIGH_VAR(n->parms, LOCKTIME)) ||
+			   is_garp;
+
+		/* Broadcast replies and request packets
+		   do not assert neighbour reachability.
+		 */
+		if (arp->ar_op != htons(ARPOP_REPLY) ||
+		    skb->pkt_type != PACKET_HOST)
+			state = NUD_STALE;
+		neigh_update(n, sha, state,
+			     override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0);
+		neigh_release(n);
+	}
+
+out_consume_skb:
+	consume_skb(skb);
+
+out_free_dst:
+	dst_release(reply_dst);
+	return NET_RX_SUCCESS;
+
+out_free_skb:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+static void parp_redo(struct sk_buff *skb)
+{
+	arp_process(dev_net(skb->dev), NULL, skb);	/* sk is NULL here */
+}
+
+static int arp_is_multicast(const void *pkey)
+{
+	return ipv4_is_multicast(*((__be32 *)pkey));	/* pkey is read as a __be32 */
+}
+
+/*
+ *	Receive an ARP packet from the device layer (see arp_packet_type).
+ */
+
+static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
+		   struct packet_type *pt, struct net_device *orig_dev)
+{
+	const struct arphdr *arp;
+
+	/* do not tweak dropwatch on an ARP we will ignore */
+	if (dev->flags & IFF_NOARP ||
+	    skb->pkt_type == PACKET_OTHERHOST ||
+	    skb->pkt_type == PACKET_LOOPBACK)
+		goto consumeskb;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb)
+		goto out_of_mem;	/* no skb is freed on this path */
+
+	/* ARP header, plus 2 device addresses, plus 2 IP addresses.  */
+	if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+		goto freeskb;
+
+	arp = arp_hdr(skb);
+	if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
+		goto freeskb;	/* address lengths must match dev and IPv4 */
+
+	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
+	return NF_HOOK(NFPROTO_ARP, NF_ARP_IN,
+		       dev_net(dev), NULL, skb, dev, NULL,
+		       arp_process);
+
+consumeskb:
+	consume_skb(skb);
+	return NET_RX_SUCCESS;
+freeskb:
+	kfree_skb(skb);
+out_of_mem:
+	return NET_RX_DROP;
+}
+
+/*
+ *	User level interface (ioctl)
+ */
+
+/*
+ *	Set (create) an ARP cache entry.
+ */
+
+static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
+{
+	struct in_device *in_dev = dev ? __in_dev_get_rtnl(dev) : NULL;
+
+	if (!dev)	/* no device given: set the "all" devconf value */
+		IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
+	else if (in_dev)
+		IN_DEV_CONF_SET(in_dev, PROXY_ARP, on);
+	else		/* device without an in_device */
+		return -ENXIO;
+	return 0;
+}
+
+static int arp_req_set_public(struct net *net, struct arpreq *r,
+		struct net_device *dev)
+{
+	__be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+	__be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
+	/* Only "no mask" and an all-ones host mask are accepted. */
+	if (mask && mask != htonl(0xFFFFFFFF))
+		return -EINVAL;
+	/* Without a given device, ATF_COM means: find it by hw address. */
+	if (!dev && (r->arp_flags & ATF_COM)) {
+		dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
+				      r->arp_ha.sa_data);
+		if (!dev)
+			return -ENODEV;
+	}
+
+	/* Host mask: per-address proxy entry; no mask: device proxy_arp. */
+	if (mask)
+		return pneigh_lookup(&arp_tbl, net, &ip, dev, 1) ? 0 : -ENOBUFS;
+	return arp_req_set_proxy(net, dev, 1);
+}
+
+static int arp_req_set(struct net *net, struct arpreq *r,
+		       struct net_device *dev)
+{
+	__be32 ip;
+	struct neighbour *neigh;
+	int err;
+
+	if (r->arp_flags & ATF_PUBL)	/* published (proxy) entry */
+		return arp_req_set_public(net, r, dev);
+
+	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+	if (r->arp_flags & ATF_PERM)
+		r->arp_flags |= ATF_COM;	/* permanent implies complete */
+	if (!dev) {	/* no device given: use the one the route to ip uses */
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
+		dev = rt->dst.dev;
+		ip_rt_put(rt);
+		if (!dev)
+			return -EINVAL;
+	}
+	switch (dev->type) {
+#if IS_ENABLED(CONFIG_FDDI)
+	case ARPHRD_FDDI:
+		/*
+		 * According to RFC 1390, FDDI devices should accept ARP
+		 * hardware types of 1 (Ethernet).  However, to be more
+		 * robust, we'll accept hardware types of either 1 (Ethernet)
+		 * or 6 (IEEE 802.2).
+		 */
+		if (r->arp_ha.sa_family != ARPHRD_FDDI &&
+		    r->arp_ha.sa_family != ARPHRD_ETHER &&
+		    r->arp_ha.sa_family != ARPHRD_IEEE802)
+			return -EINVAL;
+		break;
+#endif
+	default:
+		if (r->arp_ha.sa_family != dev->type)
+			return -EINVAL;	/* hw type must match the device */
+		break;
+	}
+
+	neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
+	err = PTR_ERR(neigh);	/* kept only if the lookup failed */
+	if (!IS_ERR(neigh)) {
+		unsigned int state = NUD_STALE;
+		if (r->arp_flags & ATF_PERM)
+			state = NUD_PERMANENT;
+		err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
+				   r->arp_ha.sa_data : NULL, state,
+				   NEIGH_UPDATE_F_OVERRIDE |
+				   NEIGH_UPDATE_F_ADMIN, 0);
+		neigh_release(neigh);
+	}
+	return err;
+}
+
+static unsigned int arp_state_to_flags(struct neighbour *neigh)
+{
+	/* Permanent entries are reported as complete as well. */
+	if (neigh->nud_state & NUD_PERMANENT)
+		return ATF_PERM | ATF_COM;
+
+	/* Any other valid state is complete; everything else has no flags. */
+	return (neigh->nud_state & NUD_VALID) ? ATF_COM : 0;
+}
+
+/*
+ *	Get an ARP cache entry (-ENXIO if none, or if it is a NUD_NOARP one).
+ */
+static int arp_req_get(struct arpreq *r, struct net_device *dev)
+{
+	__be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+	struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
+	int err = -ENXIO;
+
+	if (neigh) {
+		if (!(READ_ONCE(neigh->nud_state) & NUD_NOARP)) {
+			read_lock_bh(&neigh->lock);
+			/* clamp: addr_len can exceed the room in sa_data */
+			memcpy(r->arp_ha.sa_data, neigh->ha,
+			       min_t(size_t, dev->addr_len, sizeof(r->arp_ha.sa_data_min)));
+			r->arp_flags = arp_state_to_flags(neigh);
+			read_unlock_bh(&neigh->lock);
+			r->arp_ha.sa_family = dev->type;
+			strscpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+			err = 0;
+		}
+		neigh_release(neigh);
+	}
+	return err;
+}
+
+int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
+{
+	struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
+	int err = -ENXIO;	/* default result, e.g. when no entry is found */
+	struct neigh_table *tbl = &arp_tbl;
+
+	if (neigh) {
+		if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) {
+			neigh_release(neigh);
+			return 0;	/* valid entries survive unless forced */
+		}
+
+		if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP)
+			err = neigh_update(neigh, NULL, NUD_FAILED,
+					   NEIGH_UPDATE_F_OVERRIDE|
+					   NEIGH_UPDATE_F_ADMIN, 0);
+		write_lock_bh(&tbl->lock);
+		neigh_release(neigh);	/* drop our lookup reference first */
+		neigh_remove_one(neigh, tbl);
+		write_unlock_bh(&tbl->lock);
+	}
+
+	return err;
+}
+
+static int arp_req_delete_public(struct net *net, struct arpreq *r,
+		struct net_device *dev)
+{
+	const struct sockaddr_in *pa = (struct sockaddr_in *)&r->arp_pa;
+	const struct sockaddr_in *nm = (struct sockaddr_in *)&r->arp_netmask;
+	__be32 ip = pa->sin_addr.s_addr;
+
+	if (!nm->sin_addr.s_addr)	/* no mask: turn proxy_arp off */
+		return arp_req_set_proxy(net, dev, 0);
+	if (nm->sin_addr.s_addr != htonl(0xFFFFFFFF))
+		return -EINVAL;		/* partial masks are rejected */
+
+	return pneigh_delete(&arp_tbl, net, &ip, dev);	/* host entry */
+}
+
+static int arp_req_delete(struct net *net, struct arpreq *r,
+			  struct net_device *dev)
+{
+	__be32 ip;
+
+	if (r->arp_flags & ATF_PUBL)	/* published (proxy) entry */
+		return arp_req_delete_public(net, r, dev);
+
+	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+	if (!dev) {	/* no device given: use the one the route to ip uses */
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
+		dev = rt->dst.dev;
+		ip_rt_put(rt);
+		if (!dev)
+			return -EINVAL;
+	}
+	return arp_invalidate(dev, ip, true);	/* forced removal */
+}
+
+/*
+ *	Handle an ARP layer I/O control request (SIOCDARP/SIOCSARP/SIOCGARP).
+ */
+
+int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+	int err;
+	struct arpreq r;
+	struct net_device *dev = NULL;
+
+	switch (cmd) {
+	case SIOCDARP:
+	case SIOCSARP:
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			return -EPERM;	/* delete and set are privileged */
+		fallthrough;
+	case SIOCGARP:
+		err = copy_from_user(&r, arg, sizeof(struct arpreq));
+		if (err)
+			return -EFAULT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (r.arp_pa.sa_family != AF_INET)
+		return -EPFNOSUPPORT;
+
+	if (!(r.arp_flags & ATF_PUBL) &&
+	    (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
+		return -EINVAL;	/* these flags require ATF_PUBL */
+	if (!(r.arp_flags & ATF_NETMASK))	/* default to a host mask */
+		((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
+							   htonl(0xFFFFFFFFUL);
+	rtnl_lock();
+	if (r.arp_dev[0]) {
+		err = -ENODEV;
+		dev = __dev_get_by_name(net, r.arp_dev);
+		if (!dev)
+			goto out;
+
+		/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
+		if (!r.arp_ha.sa_family)
+			r.arp_ha.sa_family = dev->type;
+		err = -EINVAL;
+		if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
+			goto out;
+	} else if (cmd == SIOCGARP) {
+		err = -ENODEV;	/* a lookup always needs a device name */
+		goto out;
+	}
+
+	switch (cmd) {
+	case SIOCDARP:
+		err = arp_req_delete(net, &r, dev);
+		break;
+	case SIOCSARP:
+		err = arp_req_set(net, &r, dev);
+		break;
+	case SIOCGARP:
+		err = arp_req_get(&r, dev);
+		break;
+	}
+out:
+	rtnl_unlock();
+	if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
+		err = -EFAULT;
+	return err;
+}
+
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
+			    void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_notifier_change_info *change_info;
+	struct in_device *in_dev;
+	bool evict_nocarrier;
+
+	switch (event) {
+	case NETDEV_CHANGEADDR:
+		neigh_changeaddr(&arp_tbl, dev);
+		rt_cache_flush(dev_net(dev));
+		break;
+	case NETDEV_CHANGE:
+		change_info = ptr;
+		if (change_info->flags_changed & IFF_NOARP)
+			neigh_changeaddr(&arp_tbl, dev);
+
+		in_dev = __in_dev_get_rtnl(dev);
+		if (!in_dev)
+			evict_nocarrier = true;	/* default without an in_device */
+		else
+			evict_nocarrier = IN_DEV_ARP_EVICT_NOCARRIER(in_dev);
+
+		if (evict_nocarrier && !netif_carrier_ok(dev))
+			neigh_carrier_down(&arp_tbl, dev);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_DONE;	/* returned for every event */
+}
+
+static struct notifier_block arp_netdev_notifier = {
+	.notifier_call = arp_netdev_event,	/* registered by arp_init() */
+};
+
+/* Note that this is not on the notifier chain.
+   This routine must be called only after the route cache has been
+   flushed.
+ */
+void arp_ifdown(struct net_device *dev)
+{
+	neigh_ifdown(&arp_tbl, dev);
+}
+
+
+/*
+ *	ETH_P_ARP packet handler; arp_init() adds it once on startup.
+ */
+
+static struct packet_type arp_packet_type __read_mostly = {
+	.type =	cpu_to_be16(ETH_P_ARP),
+	.func =	arp_rcv,
+};
+
+#ifdef CONFIG_PROC_FS
+#if IS_ENABLED(CONFIG_AX25)
+
+/*
+ *	ax25 -> ASCII conversion: "CALL-SSID", or "*" if the callsign is empty
+ */
+static void ax2asc2(ax25_address *a, char *buf)
+{
+	char *out = buf;
+	int ssid, i;
+
+	/* Six callsign characters, each stored shifted left by one bit. */
+	for (i = 0; i < 6; i++) {
+		char ch = (a->ax25_call[i] >> 1) & 0x7F;
+
+		if (ch != ' ')	/* blanks are dropped */
+			*out++ = ch;
+	}
+
+	*out++ = '-';
+	ssid = (a->ax25_call[6] >> 1) & 0x0F;	/* 0..15 */
+	if (ssid >= 10) {
+		*out++ = '1';
+		ssid -= 10;
+	}
+	*out++ = ssid + '0';
+	*out++ = '\0';
+
+	if (buf[0] == '\0' || buf[0] == '-') {
+		buf[0] = '*';
+		buf[1] = '\0';
+	}
+}
+#endif /* CONFIG_AX25 */
+
+#define HBUFFERLEN 30
+
+static void arp_format_neigh_entry(struct seq_file *seq,
+				   struct neighbour *n)
+{
+	char hbuffer[HBUFFERLEN];
+	int k, j;
+	char tbuf[16];	/* room for "255.255.255.255" plus the NUL */
+	struct net_device *dev = n->dev;
+	int hatype = dev->type;
+
+	read_lock(&n->lock);
+	/* Convert hardware address to XX:XX:XX:XX ... form. */
+#if IS_ENABLED(CONFIG_AX25)
+	if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
+		ax2asc2((ax25_address *)n->ha, hbuffer);
+	else {
+#endif
+	for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
+		hbuffer[k++] = hex_asc_hi(n->ha[j]);
+		hbuffer[k++] = hex_asc_lo(n->ha[j]);
+		hbuffer[k++] = ':';
+	}
+	if (k != 0)
+		--k;	/* overwrite the trailing ':' with the terminator */
+	hbuffer[k] = 0;
+#if IS_ENABLED(CONFIG_AX25)
+	}
+#endif
+	sprintf(tbuf, "%pI4", n->primary_key);
+	seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s     *        %s\n",
+		   tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
+	read_unlock(&n->lock);
+}
+
+static void arp_format_pneigh_entry(struct seq_file *seq,
+				    struct pneigh_entry *n)
+{
+	/* Entries without a device are shown with type 0 and name "*". */
+	const char *ifname = n->dev ? n->dev->name : "*";
+	int hatype = n->dev ? n->dev->type : 0;
+	char ipstr[16];
+
+	sprintf(ipstr, "%pI4", n->key);
+	seq_printf(seq, "%-16s 0x%-10x0x%-10x%s     *        %s\n", ipstr,
+		   hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", ifname);
+}
+
+static int arp_seq_show(struct seq_file *seq, void *v)
+{
+	struct neigh_seq_state *state = seq->private;
+
+	/* The start token stands for the table header line. */
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "IP address       HW type     Flags       "
+			      "HW address            Mask     Device\n");
+		return 0;
+	}
+	if (state->flags & NEIGH_SEQ_IS_PNEIGH)
+		arp_format_pneigh_entry(seq, v);	/* proxy entry */
+	else
+		arp_format_neigh_entry(seq, v);
+	return 0;
+}
+
+static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* Magic NUD_NOARP entries would only confuse "arp -a",
+	 * so we tell the generic iterator to skip them.
+	 */
+	return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
+}
+
+static const struct seq_operations arp_seq_ops = {
+	.start	= arp_seq_start,
+	.next	= neigh_seq_next,	/* generic neighbour iterator */
+	.stop	= neigh_seq_stop,
+	.show	= arp_seq_show,
+};
+#endif /* CONFIG_PROC_FS */
+
+static int __net_init arp_net_init(struct net *net)
+{
+	if (!proc_create_net("arp", 0444, net->proc_net, &arp_seq_ops,
+			sizeof(struct neigh_seq_state)))
+		return -ENOMEM;	/* the "arp" proc entry was not created */
+	return 0;
+}
+
+static void __net_exit arp_net_exit(struct net *net)
+{
+	remove_proc_entry("arp", net->proc_net);	/* undo arp_net_init() */
+}
+
+static struct pernet_operations arp_net_ops = {
+	.init = arp_net_init,	/* creates the per-netns "arp" proc entry */
+	.exit = arp_net_exit,
+};
+
+void __init arp_init(void)
+{
+	neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
+
+	dev_add_pack(&arp_packet_type);	/* arp_rcv() handles ETH_P_ARP */
+	register_pernet_subsys(&arp_net_ops);	/* return value not checked */
+#ifdef CONFIG_SYSCTL
+	neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
+#endif
+	register_netdevice_notifier(&arp_netdev_notifier);
+}
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
new file mode 100644
index 0000000000..39dcccf0f1
--- /dev/null
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <net/tcp.h>
+#include <net/bpf_sk_storage.h>
+
+/* "extern" is to avoid sparse warning.  It is only used in bpf_struct_ops.c. */
+extern struct bpf_struct_ops bpf_tcp_congestion_ops;
+
+static u32 unsupported_ops[] = {	/* offsets is_unsupported() matches */
+	offsetof(struct tcp_congestion_ops, get_info),
+};
+
+static const struct btf_type *tcp_sock_type;
+static u32 tcp_sock_id, sock_id;
+
+static int bpf_tcp_ca_init(struct btf *btf)
+{
+	s32 sock_type_id, tcp_type_id;
+
+	/* Cache the BTF ids of struct sock and struct tcp_sock. */
+	sock_type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT);
+	if (sock_type_id < 0)
+		return -EINVAL;
+	sock_id = sock_type_id;
+
+	tcp_type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT);
+	if (tcp_type_id < 0)
+		return -EINVAL;
+	tcp_sock_id = tcp_type_id;
+	tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);
+	return 0;
+}
+
+static bool is_unsupported(u32 member_offset)
+{
+	const u32 *op = unsupported_ops;
+	const u32 *end = unsupported_ops + ARRAY_SIZE(unsupported_ops);
+
+	/* Linear scan of the table of unsupported member offsets. */
+	for (; op < end; op++)
+		if (*op == member_offset)
+			return true;
+	return false;
+}
+
+static bool bpf_tcp_ca_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       const struct bpf_prog *prog,
+				       struct bpf_insn_access_aux *info)
+{
+	if (!bpf_tracing_btf_ctx_access(off, size, type, prog, info))
+		return false;	/* the generic BTF ctx check comes first */
+
+	if (base_type(info->reg_type) == PTR_TO_BTF_ID &&
+	    !bpf_type_has_unsafe_modifiers(info->reg_type) &&
+	    info->btf_id == sock_id)
+		/* promote a struct sock pointer to struct tcp_sock */
+		info->btf_id = tcp_sock_id;
+
+	return true;
+}
+
+static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
+					const struct bpf_reg_state *reg,
+					int off, int size)
+{
+	const struct btf_type *t;
+	size_t end;	/* end offset of the member that starts at off */
+
+	t = btf_type_by_id(reg->btf, reg->btf_id);
+	if (t != tcp_sock_type) {	/* only struct tcp_sock is accepted */
+		bpf_log(log, "only read is supported\n");
+		return -EACCES;
+	}
+
+	switch (off) {
+	case offsetof(struct sock, sk_pacing_rate):
+		end = offsetofend(struct sock, sk_pacing_rate);
+		break;
+	case offsetof(struct sock, sk_pacing_status):
+		end = offsetofend(struct sock, sk_pacing_status);
+		break;
+	case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv):
+		end = offsetofend(struct inet_connection_sock, icsk_ca_priv);
+		break;
+	case offsetof(struct inet_connection_sock, icsk_ack.pending):
+		end = offsetofend(struct inet_connection_sock,
+				  icsk_ack.pending);
+		break;
+	case offsetof(struct tcp_sock, snd_cwnd):
+		end = offsetofend(struct tcp_sock, snd_cwnd);
+		break;
+	case offsetof(struct tcp_sock, snd_cwnd_cnt):
+		end = offsetofend(struct tcp_sock, snd_cwnd_cnt);
+		break;
+	case offsetof(struct tcp_sock, snd_ssthresh):
+		end = offsetofend(struct tcp_sock, snd_ssthresh);
+		break;
+	case offsetof(struct tcp_sock, ecn_flags):
+		end = offsetofend(struct tcp_sock, ecn_flags);
+		break;
+	case offsetof(struct tcp_sock, app_limited):
+		end = offsetofend(struct tcp_sock, app_limited);
+		break;
+	default:	/* any other offset is rejected */
+		bpf_log(log, "no write support to tcp_sock at off %d\n", off);
+		return -EACCES;
+	}
+
+	if (off + size > end) {	/* must stay within that one member */
+		bpf_log(log,
+			"write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n",
+			off, size, end);
+		return -EACCES;
+	}
+
+	return 0;
+}
+
+BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt)
+{
+	/* bpf_tcp_ca prog cannot have NULL tp, so it is not checked here */
+	__tcp_send_ack((struct sock *)tp, rcv_nxt);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
+	.func		= bpf_tcp_send_ack,
+	.gpl_only	= false,
+	/* In case we want to report error later (it returns 0 for now) */
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id	= &tcp_sock_id,
+	.arg2_type	= ARG_ANYTHING,
+};
+
+static u32 prog_ops_moff(const struct bpf_prog *prog)
+{
+	const struct btf_type *ops_type = bpf_tcp_congestion_ops.type;
+	u32 midx = prog->expected_attach_type;
+	const struct btf_member *member;
+
+	/* expected_attach_type serves as the index of the ops member */
+	member = btf_type_member(ops_type) + midx;
+
+	/* the member offset is obtained in bits: convert it to bytes */
+	return __btf_member_bit_offset(ops_type, member) / 8;
+}
+
+static const struct bpf_func_proto *
+bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
+			  const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_tcp_send_ack:
+		return &bpf_tcp_send_ack_proto;
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
+	case BPF_FUNC_setsockopt:
+		/* Does not allow release() to call setsockopt.
+		 * release() is called when the current bpf-tcp-cc
+		 * is retiring.  It is not allowed to call
+		 * setsockopt() to make further changes which
+		 * may potentially allocate new resources.
+		 */
+		if (prog_ops_moff(prog) !=
+		    offsetof(struct tcp_congestion_ops, release))
+			return &bpf_sk_setsockopt_proto;
+		return NULL;
+	case BPF_FUNC_getsockopt:
+		/* Since get/setsockopt is usually expected to
+		 * be available together, disable getsockopt for
+		 * release also to avoid usage surprise.
+		 * The bpf-tcp-cc already has a more powerful way
+		 * to read tcp_sock from the PTR_TO_BTF_ID.
+		 */
+		if (prog_ops_moff(prog) !=
+		    offsetof(struct tcp_congestion_ops, release))
+			return &bpf_sk_getsockopt_proto;
+		return NULL;
+	case BPF_FUNC_ktime_get_coarse_ns:
+		return &bpf_ktime_get_coarse_ns_proto;
+	default:	/* everything else: the base helper set */
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+BTF_SET8_START(bpf_tcp_ca_check_kfunc_ids) /* see bpf_tcp_ca_kfunc_set */
+BTF_ID_FLAGS(func, tcp_reno_ssthresh)
+BTF_ID_FLAGS(func, tcp_reno_cong_avoid)
+BTF_ID_FLAGS(func, tcp_reno_undo_cwnd)
+BTF_ID_FLAGS(func, tcp_slow_start)
+BTF_ID_FLAGS(func, tcp_cong_avoid_ai)
+BTF_SET8_END(bpf_tcp_ca_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &bpf_tcp_ca_check_kfunc_ids,	/* the id set defined above */
+};
+
+static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
+	.get_func_proto		= bpf_tcp_ca_get_func_proto,
+	.is_valid_access	= bpf_tcp_ca_is_valid_access,
+	.btf_struct_access	= bpf_tcp_ca_btf_struct_access,	/* write checks */
+};
+
+static int bpf_tcp_ca_init_member(const struct btf_type *t,
+				  const struct btf_member *member,
+				  void *kdata, const void *udata)
+{
+	const struct tcp_congestion_ops *utcp_ca;
+	struct tcp_congestion_ops *tcp_ca;
+	u32 moff;
+
+	utcp_ca = (const struct tcp_congestion_ops *)udata;
+	tcp_ca = (struct tcp_congestion_ops *)kdata;
+
+	moff = __btf_member_bit_offset(t, member) / 8;	/* byte offset */
+	switch (moff) {
+	case offsetof(struct tcp_congestion_ops, flags):
+		if (utcp_ca->flags & ~TCP_CONG_MASK)
+			return -EINVAL;	/* bits outside TCP_CONG_MASK */
+		tcp_ca->flags = utcp_ca->flags;
+		return 1;	/* presumably "member handled here"; confirm */
+	case offsetof(struct tcp_congestion_ops, name):
+		if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
+				     sizeof(tcp_ca->name)) <= 0)
+			return -EINVAL;
+		return 1;
+	}
+
+	return 0;	/* flags and name are the only members copied here */
+}
+
+static int bpf_tcp_ca_check_member(const struct btf_type *t,
+				   const struct btf_member *member,
+				   const struct bpf_prog *prog)
+{
+	u32 moff = __btf_member_bit_offset(t, member) / 8;	/* in bytes */
+
+	return is_unsupported(moff) ? -ENOTSUPP : 0;
+}
+
+static int bpf_tcp_ca_reg(void *kdata)
+{
+	return tcp_register_congestion_control(kdata);	/* .reg hook */
+}
+
+static void bpf_tcp_ca_unreg(void *kdata)
+{
+	tcp_unregister_congestion_control(kdata);	/* .unreg hook */
+}
+
+static int bpf_tcp_ca_update(void *kdata, void *old_kdata)
+{
+	return tcp_update_congestion_control(kdata, old_kdata);	/* .update */
+}
+
+static int bpf_tcp_ca_validate(void *kdata)
+{
+	return tcp_validate_congestion_control(kdata);	/* .validate hook */
+}
+
+struct bpf_struct_ops bpf_tcp_congestion_ops = {
+	.verifier_ops = &bpf_tcp_ca_verifier_ops,
+	.reg = bpf_tcp_ca_reg,
+	.unreg = bpf_tcp_ca_unreg,
+	.update = bpf_tcp_ca_update,
+	.check_member = bpf_tcp_ca_check_member,
+	.init_member = bpf_tcp_ca_init_member,
+	.init = bpf_tcp_ca_init,	/* caches the sock/tcp_sock BTF ids */
+	.validate = bpf_tcp_ca_validate,
+	.name = "tcp_congestion_ops",
+};
+
+static int __init bpf_tcp_ca_kfunc_init(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
+}
+late_initcall(bpf_tcp_ca_kfunc_init);	/* registers the set for struct_ops progs */
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
new file mode 100644
index 0000000000..00af5305e0
--- /dev/null
+++ b/net/ipv4/bpfilter/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_BPFILTER) += sockopt.o
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
new file mode 100644
index 0000000000..193bcc2acc
--- /dev/null
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/bpfilter.h>
+#include <uapi/linux/bpf.h>
+#include <linux/wait.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+
+struct bpfilter_umh_ops bpfilter_ops;	/* .lock is held around every request in bpfilter_mbox_request() */
+EXPORT_SYMBOL_GPL(bpfilter_ops);
+
+static int bpfilter_mbox_request(struct sock *sk, int optname, sockptr_t optval,
+				 unsigned int optlen, bool is_set)
+{
+	struct bpfilter_umh_ops *ops = &bpfilter_ops;
+	int err = -ENOPROTOOPT;
+
+	mutex_lock(&ops->lock);
+	if (!ops->sockopt) {
+		/* no handler installed yet: try loading the module, unlocked */
+		mutex_unlock(&ops->lock);
+		request_module("bpfilter");
+		mutex_lock(&ops->lock);
+		if (!ops->sockopt)
+			goto out;
+	}
+	/* run the cleanup helper if the recorded thread group has exited */
+	if (ops->info.tgid && thread_group_exited(ops->info.tgid))
+		umd_cleanup_helper(&ops->info);
+
+	if (!ops->info.tgid) {
+		err = ops->start();
+		if (err)
+			goto out;
+	}
+	err = ops->sockopt(sk, optname, optval, optlen, is_set);
+out:
+	mutex_unlock(&ops->lock);
+	return err;
+}
+
+int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval,
+			    unsigned int optlen)
+{
+	return bpfilter_mbox_request(sk, optname, optval, optlen, true); /* is_set = true */
+}
+
+int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
+			    int __user *optlen)
+{
+	sockptr_t uptr = USER_SOCKPTR(optval);
+	int ulen;
+
+	/* fetch the caller-supplied length from user space */
+	if (get_user(ulen, optlen) != 0)
+		return -EFAULT;
+	return bpfilter_mbox_request(sk, optname, uptr, ulen, false);
+}
+
+static int __init bpfilter_sockopt_init(void)
+{
+	/* start out with no helper thread group recorded */
+	bpfilter_ops.info.tgid = NULL;
+	bpfilter_ops.info.driver_name = "bpfilter_umh";
+	mutex_init(&bpfilter_ops.lock);
+	return 0;
+}
+device_initcall(bpfilter_sockopt_init);
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
new file mode 100644
index 0000000000..d048aa8332
--- /dev/null
+++ b/net/ipv4/cipso_ipv4.c
@@ -0,0 +1,2295 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CIPSO - Commercial IP Security Option
+ *
+ * This is an implementation of the CIPSO 2.2 protocol as specified in
+ * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
+ * FIPS-188.  While CIPSO never became a full IETF RFC standard many vendors
+ * have chosen to adopt the protocol and over the years it has become a
+ * de-facto standard for labeled networking.
+ *
+ * The CIPSO draft specification can be found in the kernel's Documentation
+ * directory as well as the following URL:
+ *   https://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
+ * The FIPS-188 specification can be found at the following URL:
+ *   https://www.itl.nist.gov/fipspubs/fip188.htm
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/jhash.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <asm/unaligned.h>
+
+/* List of available DOI definitions */
+/* XXX - This currently assumes a minimal number of different DOIs in use,
+ * if in practice there are a lot of different DOIs this list should
+ * probably be turned into a hash table or something similar so we
+ * can do quick lookups. */
+static DEFINE_SPINLOCK(cipso_v4_doi_list_lock);	/* taken by cipso_v4_doi_add()/cipso_v4_doi_remove() around list changes */
+static LIST_HEAD(cipso_v4_doi_list);		/* traversed with list_for_each_entry_rcu() */
+
+/* Label mapping cache */
+int cipso_v4_cache_enabled = 1;		/* when 0, lookups return -ENOENT and adds are skipped */
+int cipso_v4_cache_bucketsize = 10;	/* per-bucket entry limit checked in cipso_v4_cache_add() */
+#define CIPSO_V4_CACHE_BUCKETBITS     7
+#define CIPSO_V4_CACHE_BUCKETS        (1 << CIPSO_V4_CACHE_BUCKETBITS) /* bucket index = hash & (BUCKETS - 1) */
+#define CIPSO_V4_CACHE_REORDERLIMIT   10 /* activity gap that must be exceeded before two entries swap */
+struct cipso_v4_map_cache_bkt {
+	spinlock_t lock;	/* taken with spin_lock_bh() by the lookup, add and invalidate paths */
+	u32 size;		/* entry count maintained by cipso_v4_cache_add()/_invalidate() */
+	struct list_head list;	/* entries; new ones go to the head, evictions come from the tail */
+};
+
+struct cipso_v4_map_cache_entry {
+	u32 hash;		/* cipso_v4_map_cache_hash() of @key */
+	unsigned char *key;	/* kmemdup()'d copy of the CIPSO option bytes */
+	size_t key_len;		/* length of @key in bytes */
+
+	struct netlbl_lsm_cache *lsm_data;	/* refcounted; handed to callers on a cache hit */
+
+	u32 activity;		/* hit counter used to reorder entries within a bucket */
+	struct list_head list;	/* link in cipso_v4_map_cache_bkt.list */
+};
+
+static struct cipso_v4_map_cache_bkt *cipso_v4_cache;	/* CIPSO_V4_CACHE_BUCKETS buckets, allocated in cipso_v4_cache_init() */
+
+/* Restricted bitmap (tag #1) flags */
+int cipso_v4_rbm_optfmt;		/* starts at 0; NOTE(review): users are outside this part of the file */
+int cipso_v4_rbm_strictvalid = 1;	/* starts at 1; NOTE(review): users are outside this part of the file */
+
+/*
+ * Protocol Constants
+ */
+
+/* Maximum size of the CIPSO IP option, derived from the fact that the maximum
+ * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */
+#define CIPSO_V4_OPT_LEN_MAX          40
+
+/* Length of the base CIPSO option, this includes the option type (1 byte), the
+ * option length (1 byte), and the DOI (4 bytes). */
+#define CIPSO_V4_HDR_LEN              6
+
+/* Base length of the restrictive category bitmap tag (tag #1). */
+#define CIPSO_V4_TAG_RBM_BLEN         4
+
+/* Base length of the enumerated category tag (tag #2). */
+#define CIPSO_V4_TAG_ENUM_BLEN        4
+
+/* Base length of the ranged categories bitmap tag (tag #5). */
+#define CIPSO_V4_TAG_RNG_BLEN         4
+/* The maximum number of category ranges permitted in the ranged category tag
+ * (tag #5).  You may note that the IETF draft states that the maximum number
+ * of category ranges is 7, but if the low end of the last category range is
+ * zero then it is possible to fit 8 category ranges because the zero should
+ * be omitted. */
+#define CIPSO_V4_TAG_RNG_CAT_MAX      8
+
+/* Base length of the local tag (non-standard tag).
+ *  Tag definition (may change between kernel versions)
+ *
+ * 0          8          16         24         32
+ * +----------+----------+----------+----------+
+ * | 10000000 | 00000110 | 32-bit secid value  |
+ * +----------+----------+----------+----------+
+ * | in (host byte order)|
+ * +----------+----------+
+ *
+ */
+#define CIPSO_V4_TAG_LOC_BLEN         6
+
+/*
+ * Helper Functions
+ */
+
+/**
+ * cipso_v4_cache_entry_free - Frees a cache entry
+ * @entry: the entry to free
+ *
+ * Description:
+ * This function frees the memory associated with a cache entry including the
+ * LSM cache data if there are no longer any users, i.e. reference count == 0.
+ *
+ */
+static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry)
+{
+	kfree(entry->key);
+	if (entry->lsm_data != NULL)
+		netlbl_secattr_cache_free(entry->lsm_data);
+	kfree(entry);
+}
+
+/**
+ * cipso_v4_map_cache_hash - Hashing function for the CIPSO cache
+ * @key: the hash key
+ * @key_len: the length of the key in bytes
+ *
+ * Description:
+ * The CIPSO tag hashing function.  Returns a 32-bit hash value.
+ *
+ */
+static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);	/* third jhash() argument is fixed at 0 */
+}
+
+/*
+ * Label Mapping Cache Functions
+ */
+
+/**
+ * cipso_v4_cache_init - Initialize the CIPSO cache
+ *
+ * Description:
+ * Initializes the CIPSO label mapping cache, this function should be called
+ * before any of the other functions defined in this file.  Returns zero on
+ * success, negative values on error.
+ *
+ */
+static int __init cipso_v4_cache_init(void)
+{
+	u32 iter;
+
+	cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS, sizeof(*cipso_v4_cache), GFP_KERNEL);
+	if (cipso_v4_cache == NULL)
+		return -ENOMEM;
+
+	for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
+		struct cipso_v4_map_cache_bkt *bkt = &cipso_v4_cache[iter];
+
+		spin_lock_init(&cipso_v4_cache[iter].lock); /* argument kept verbatim; the macro may use its text */
+		bkt->size = 0;
+		INIT_LIST_HEAD(&bkt->list);
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CIPSO cache.
+ *
+ */
+void cipso_v4_cache_invalidate(void)
+{
+	struct cipso_v4_map_cache_entry *cur, *next;
+	struct cipso_v4_map_cache_bkt *bkt;
+	u32 idx;
+
+	for (idx = 0; idx < CIPSO_V4_CACHE_BUCKETS; idx++) {
+		bkt = &cipso_v4_cache[idx];
+		spin_lock_bh(&bkt->lock);
+		list_for_each_entry_safe(cur, next, &bkt->list, list) {
+			list_del(&cur->list);	/* unlink before the entry is freed */
+			cipso_v4_cache_entry_free(cur);
+		}
+		bkt->size = 0;
+		spin_unlock_bh(&bkt->lock);
+	}
+}
+
+/**
+ * cipso_v4_cache_check - Check the CIPSO cache for a label mapping
+ * @key: the buffer to check
+ * @key_len: buffer length in bytes
+ * @secattr: the security attribute struct to use
+ *
+ * Description:
+ * This function checks the cache to see if a label mapping already exists for
+ * the given key.  If there is a match then the cache is adjusted and the
+ * @secattr struct is populated with the correct LSM security attributes.  The
+ * cache is adjusted in the following manner if the entry is not already the
+ * first in the cache bucket:
+ *
+ *  1. The cache entry's activity counter is incremented
+ *  2. The previous (higher ranking) entry's activity counter is decremented
+ *  3. If the difference between the two activity counters is greater than
+ *     CIPSO_V4_CACHE_REORDERLIMIT the two entries are swapped
+ *
+ * Returns zero on success, -ENOENT for a cache miss, and other negative values
+ * on error.
+ *
+ */
+static int cipso_v4_cache_check(const unsigned char *key,
+				u32 key_len,
+				struct netlbl_lsm_secattr *secattr)
+{
+	struct cipso_v4_map_cache_bkt *bucket;
+	struct cipso_v4_map_cache_entry *cur;
+	struct cipso_v4_map_cache_entry *above = NULL;
+	u32 hash;
+
+	if (!READ_ONCE(cipso_v4_cache_enabled))
+		return -ENOENT;
+
+	hash = cipso_v4_map_cache_hash(key, key_len);
+	bucket = &cipso_v4_cache[hash & (CIPSO_V4_CACHE_BUCKETS - 1)];
+	spin_lock_bh(&bucket->lock);
+	list_for_each_entry(cur, &bucket->list, list) {
+		if (cur->hash != hash || cur->key_len != key_len ||
+		    memcmp(cur->key, key, key_len) != 0) {
+			/* no match: remember it as the next higher ranked entry */
+			above = cur;
+			continue;
+		}
+
+		/* hit: hand a new reference on the LSM data to the caller */
+		cur->activity += 1;
+		refcount_inc(&cur->lsm_data->refcount);
+		secattr->cache = cur->lsm_data;
+		secattr->flags |= NETLBL_SECATTR_CACHE;
+		secattr->type = NETLBL_NLTYPE_CIPSOV4;
+
+		/* the first entry in the bucket has nothing to be promoted past */
+		if (above) {
+			if (above->activity > 0)
+				above->activity -= 1;
+			if (cur->activity > above->activity &&
+			    cur->activity - above->activity >
+			    CIPSO_V4_CACHE_REORDERLIMIT) {
+				/* relink the entry directly in front of 'above' */
+				__list_del(cur->list.prev, cur->list.next);
+				__list_add(&cur->list, above->list.prev, &above->list);
+			}
+		}
+		spin_unlock_bh(&bucket->lock);
+		return 0;
+	}
+	spin_unlock_bh(&bucket->lock);
+
+	return -ENOENT;
+}
+
+/**
+ * cipso_v4_cache_add - Add an entry to the CIPSO cache
+ * @cipso_ptr: pointer to CIPSO IP option
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CIPSO label mapping cache.  Add the new entry to
+ * head of the cache bucket's list, if the cache bucket is out of room remove
+ * the last entry in the list first.  It is important to note that there is
+ * currently no checking for duplicate keys.  Returns zero on success,
+ * negative values on failure.
+ *
+ */
+int cipso_v4_cache_add(const unsigned char *cipso_ptr,
+		       const struct netlbl_lsm_secattr *secattr)
+{
+	int bkt_size = READ_ONCE(cipso_v4_cache_bucketsize);
+	struct cipso_v4_map_cache_bkt *bucket;
+	struct cipso_v4_map_cache_entry *fresh;
+	struct cipso_v4_map_cache_entry *victim;
+	u32 opt_len;
+
+	if (!READ_ONCE(cipso_v4_cache_enabled) || bkt_size <= 0)
+		return 0;
+
+	/* the byte after the option type is used as the key length */
+	opt_len = cipso_ptr[1];
+
+	fresh = kzalloc(sizeof(*fresh), GFP_ATOMIC);
+	if (!fresh)
+		return -ENOMEM;
+
+	fresh->key = kmemdup(cipso_ptr, opt_len, GFP_ATOMIC);
+	if (!fresh->key) {
+		/* lsm_data is still NULL here, so only the entry is released */
+		cipso_v4_cache_entry_free(fresh);
+		return -ENOMEM;
+	}
+
+	fresh->key_len = opt_len;
+	fresh->hash = cipso_v4_map_cache_hash(cipso_ptr, opt_len);
+	/* the cache entry holds its own reference on the LSM data */
+	refcount_inc(&secattr->cache->refcount);
+	fresh->lsm_data = secattr->cache;
+
+	bucket = &cipso_v4_cache[fresh->hash & (CIPSO_V4_CACHE_BUCKETS - 1)];
+	spin_lock_bh(&bucket->lock);
+	if (bucket->size < bkt_size) {
+		bucket->size += 1;
+	} else {
+		/* bucket full: evict the entry at the tail of the list */
+		victim = list_entry(bucket->list.prev,
+				    struct cipso_v4_map_cache_entry, list);
+		list_del(&victim->list);
+		cipso_v4_cache_entry_free(victim);
+	}
+
+	list_add(&fresh->list, &bucket->list);
+	spin_unlock_bh(&bucket->lock);
+
+	return 0;
+}
+
+/*
+ * DOI List Functions
+ */
+
+/**
+ * cipso_v4_doi_search - Searches for a DOI definition
+ * @doi: the DOI to search for
+ *
+ * Description:
+ * Search the DOI definition list for a DOI definition with a DOI value that
+ * matches @doi.  The caller is responsible for calling rcu_read_[un]lock().
+ * Returns a pointer to the DOI definition on success and NULL on failure.
+ */
+static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
+{
+	struct cipso_v4_doi *def;
+
+	list_for_each_entry_rcu(def, &cipso_v4_doi_list, list)
+		if (def->doi == doi && refcount_read(&def->refcount) != 0)
+			return def;	/* matching DOI that still has references */
+	return NULL;
+}
+
+/**
+ * cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine
+ * @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CIPSO engine and calls this
+ * function to add it to the list of acceptable domains.  The caller must
+ * ensure that the mapping table specified in @doi_def->map meets all of the
+ * requirements of the mapping type (see cipso_ipv4.h for details).  Returns
+ * zero on success and non-zero on failure.
+ *
+ */
+int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
+		     struct netlbl_audit *audit_info)
+{
+	int ret_val = -EINVAL;	/* result of every validation failure below */
+	u32 iter;
+	u32 doi;
+	u32 doi_type;
+	struct audit_buffer *audit_buf;
+
+	doi = doi_def->doi;		/* copied up front for the audit record */
+	doi_type = doi_def->type;	/* copied up front for the audit record */
+
+	if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
+		goto doi_add_return;
+	for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) {
+		switch (doi_def->tags[iter]) {
+		case CIPSO_V4_TAG_RBITMAP:	/* accepted for any map type */
+			break;
+		case CIPSO_V4_TAG_RANGE:
+		case CIPSO_V4_TAG_ENUM:		/* both require a CIPSO_V4_MAP_PASS definition */
+			if (doi_def->type != CIPSO_V4_MAP_PASS)
+				goto doi_add_return;
+			break;
+		case CIPSO_V4_TAG_LOCAL:	/* requires a CIPSO_V4_MAP_LOCAL definition */
+			if (doi_def->type != CIPSO_V4_MAP_LOCAL)
+				goto doi_add_return;
+			break;
+		case CIPSO_V4_TAG_INVALID:	/* tolerated in any slot except the first */
+			if (iter == 0)
+				goto doi_add_return;
+			break;
+		default:
+			goto doi_add_return;
+		}
+	}
+
+	refcount_set(&doi_def->refcount, 1);	/* initial reference, dropped in cipso_v4_doi_remove() */
+
+	spin_lock(&cipso_v4_doi_list_lock);
+	if (cipso_v4_doi_search(doi_def->doi)) {	/* a definition with this DOI is already listed */
+		spin_unlock(&cipso_v4_doi_list_lock);
+		ret_val = -EEXIST;
+		goto doi_add_return;
+	}
+	list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list);
+	spin_unlock(&cipso_v4_doi_list_lock);
+	ret_val = 0;
+
+doi_add_return:	/* reached on success and on failure: the attempt is always audited */
+	audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info);
+	if (audit_buf) {
+		const char *type_str;
+		switch (doi_type) {
+		case CIPSO_V4_MAP_TRANS:
+			type_str = "trans";
+			break;
+		case CIPSO_V4_MAP_PASS:
+			type_str = "pass";
+			break;
+		case CIPSO_V4_MAP_LOCAL:
+			type_str = "local";
+			break;
+		default:
+			type_str = "(unknown)";
+		}
+		audit_log_format(audit_buf,
+				 " cipso_doi=%u cipso_type=%s res=%u",
+				 doi, type_str, ret_val == 0 ? 1 : 0);	/* res is 1 only on success */
+		audit_log_end(audit_buf);
+	}
+
+	return ret_val;
+}
+
+/**
+ * cipso_v4_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
+{
+	struct cipso_v4_std_map_tbl *tbl;
+
+	if (doi_def == NULL)
+		return;
+	if (doi_def->type == CIPSO_V4_MAP_TRANS) {
+		tbl = doi_def->map.std;	/* tables freed only for this map type */
+		kfree(tbl->lvl.cipso);
+		kfree(tbl->lvl.local);
+		kfree(tbl->cat.cipso);
+		kfree(tbl->cat.local);
+		kfree(tbl);
+	}
+	kfree(doi_def);
+}
+
+/**
+ * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to the DOI definition can be released
+ * safely.
+ *
+ */
+static void cipso_v4_doi_free_rcu(struct rcu_head *entry)
+{
+	/* @entry is the 'rcu' member of a struct cipso_v4_doi; step back to
+	 * the containing definition and release it.
+	 */
+	cipso_v4_doi_free(container_of(entry, struct cipso_v4_doi, rcu));
+}
+
+/**
+ * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine
+ * @doi: the DOI value
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes a DOI definition from the CIPSO engine.  The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list.  Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
+{
+	int ret_val = -ENOENT;
+	struct cipso_v4_doi *doi_def;
+	struct audit_buffer *audit_buf;
+
+	/* unlink under the list lock; list readers iterate under RCU */
+	spin_lock(&cipso_v4_doi_list_lock);
+	doi_def = cipso_v4_doi_search(doi);
+	if (doi_def)
+		list_del_rcu(&doi_def->list);
+	spin_unlock(&cipso_v4_doi_list_lock);
+
+	if (doi_def) {
+		/* drop the initial reference set in cipso_v4_doi_add() */
+		cipso_v4_doi_putdef(doi_def);
+		ret_val = 0;
+	}
+
+	/* the attempt is audited whether or not a definition was found */
+	audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info);
+	if (audit_buf) {
+		audit_log_format(audit_buf,
+				 " cipso_doi=%u res=%u",
+				 doi, ret_val == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+
+	return ret_val;
+}
+
+/**
+ * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Searches for a valid DOI definition and if one is found it is returned to
+ * the caller.  Otherwise NULL is returned.  The caller must ensure that
+ * rcu_read_lock() is held while accessing the returned definition and the DOI
+ * definition reference count is decremented when the caller is done.
+ *
+ */
+struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
+{
+	struct cipso_v4_doi *found;
+
+	rcu_read_lock();
+	found = cipso_v4_doi_search(doi);
+	/* The refcount may already have hit zero; only hand the definition
+	 * back if a reference could still be taken.
+	 */
+	if (found && !refcount_inc_not_zero(&found->refcount))
+		found = NULL;
+	rcu_read_unlock();
+
+	return found;
+}
+
+/**
+ * cipso_v4_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from cipso_v4_doi_getdef().
+ *
+ */
+void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
+{
+	/* NULL is tolerated; otherwise act only on the final reference */
+	if (doi_def == NULL || !refcount_dec_and_test(&doi_def->refcount))
+		return;
+
+	/* last user gone: flush cached mappings and free the definition
+	 * after an RCU grace period */
+	cipso_v4_cache_invalidate();
+	call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+}
+
+/**
+ * cipso_v4_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * For each entry call @callback, if @callback returns a negative value stop
+ * 'walking' through the list and return.  Updates the value in @skip_cnt upon
+ * return.  Returns zero on success, negative values on failure.
+ *
+ */
+int cipso_v4_doi_walk(u32 *skip_cnt,
+		     int (*callback) (struct cipso_v4_doi *doi_def, void *arg),
+		     void *cb_arg)
+{
+	struct cipso_v4_doi *def;
+	u32 seen = 0;
+	int ret_val = -ENOENT;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(def, &cipso_v4_doi_list, list) {
+		if (refcount_read(&def->refcount) == 0)
+			continue;
+		if (seen++ < *skip_cnt)
+			continue;
+		ret_val = callback(def, cb_arg);
+		if (ret_val < 0) {
+			/* the entry whose callback failed is not counted */
+			seen--;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	*skip_cnt = seen;
+	return ret_val;
+}
+
+/*
+ * Label Mapping Functions
+ */
+
+/**
+ * cipso_v4_map_lvl_valid - Checks to see if the given level is understood
+ * @doi_def: the DOI definition
+ * @level: the level to check
+ *
+ * Description:
+ * Checks the given level against the given DOI definition and returns a
+ * negative value if the level does not have a valid mapping and a zero value
+ * if the level is defined by the DOI.
+ *
+ */
+static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level)
+{
+	const struct cipso_v4_std_map_tbl *tbl;
+
+	if (doi_def->type == CIPSO_V4_MAP_PASS)
+		return 0;
+	if (doi_def->type != CIPSO_V4_MAP_TRANS)
+		return -EFAULT;
+
+	tbl = doi_def->map.std;
+	if (level >= tbl->lvl.cipso_size || tbl->lvl.cipso[level] >= CIPSO_V4_INV_LVL)
+		return -EFAULT;
+	return 0;
+}
+
+/**
+ * cipso_v4_map_lvl_hton - Perform a level mapping from the host to the network
+ * @doi_def: the DOI definition
+ * @host_lvl: the host MLS level
+ * @net_lvl: the network/CIPSO MLS level
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS level to the correct
+ * CIPSO level using the given DOI definition.  Returns zero on success,
+ * negative values otherwise.
+ *
+ */
+static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def,
+				 u32 host_lvl,
+				 u32 *net_lvl)
+{
+	const struct cipso_v4_std_map_tbl *tbl;
+
+	if (doi_def->type == CIPSO_V4_MAP_PASS) {
+		*net_lvl = host_lvl;
+		return 0;
+	}
+	if (doi_def->type != CIPSO_V4_MAP_TRANS)
+		return -EINVAL;
+
+	tbl = doi_def->map.std;
+	if (host_lvl >= tbl->lvl.local_size || tbl->lvl.local[host_lvl] >= CIPSO_V4_INV_LVL)
+		return -EPERM;
+	*net_lvl = tbl->lvl.local[host_lvl];
+	return 0;
+}
+
+/**
+ * cipso_v4_map_lvl_ntoh - Perform a level mapping from the network to the host
+ * @doi_def: the DOI definition
+ * @net_lvl: the network/CIPSO MLS level
+ * @host_lvl: the host MLS level
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO level to the correct local MLS
+ * level using the given DOI definition.  Returns zero on success, negative
+ * values otherwise.
+ *
+ */
+static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def,
+				 u32 net_lvl,
+				 u32 *host_lvl)
+{
+	const struct cipso_v4_std_map_tbl *tbl;
+
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_PASS:
+		*host_lvl = net_lvl;
+		return 0;
+	case CIPSO_V4_MAP_TRANS:
+		/* look the level up in the CIPSO-indexed level table */
+		tbl = doi_def->map.std;
+		if (net_lvl >= tbl->lvl.cipso_size ||
+		    tbl->lvl.cipso[net_lvl] >= CIPSO_V4_INV_LVL)
+			return -EPERM;
+		*host_lvl = tbl->lvl.cipso[net_lvl];
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/**
+ * cipso_v4_map_cat_rbm_valid - Checks to see if the category bitmap is valid
+ * @doi_def: the DOI definition
+ * @bitmap: category bitmap
+ * @bitmap_len: bitmap length in bytes
+ *
+ * Description:
+ * Checks the given category bitmap against the given DOI definition and
+ * returns a negative value if any of the categories in the bitmap do not have
+ * a valid mapping and a zero value if all of the categories are valid.
+ *
+ */
+static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
+				      const unsigned char *bitmap,
+				      u32 bitmap_len)
+{
+	const u32 nbits = bitmap_len * 8;
+	const u32 *tbl;
+	u32 tbl_size;
+	int bit;
+
+	if (doi_def->type == CIPSO_V4_MAP_PASS)
+		return 0;
+	if (doi_def->type != CIPSO_V4_MAP_TRANS)
+		return -EFAULT;
+
+	tbl_size = doi_def->map.std->cat.cipso_size;
+	tbl = doi_def->map.std->cat.cipso;
+
+	/* Visit each set bit in turn; every one of them has to fall inside
+	 * the CIPSO category table and map to a usable entry.
+	 */
+	bit = netlbl_bitmap_walk(bitmap, nbits, 0, 1);
+	while (bit >= 0) {
+		if (bit >= tbl_size)
+			return -EFAULT;
+		if (tbl[bit] >= CIPSO_V4_INV_CAT)
+			return -EFAULT;
+		bit = netlbl_bitmap_walk(bitmap, nbits, bit + 1, 1);
+	}
+
+	/* The walk ended on a negative value: -1 is accepted, any other
+	 * result is reported as a failure.
+	 */
+	return bit == -1 ? 0 : -EFAULT;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category bitmap in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO bitmap using the given DOI definition.  Returns the minimum
+ * size in bytes of the network bitmap on success, negative values otherwise.
+ *
+ */
+static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
+				     const struct netlbl_lsm_secattr *secattr,
+				     unsigned char *net_cat,
+				     u32 net_cat_len)
+{
+	const bool translate = (doi_def->type == CIPSO_V4_MAP_TRANS);
+	const u32 out_bits = net_cat_len * 8;
+	const u32 *local_tbl = NULL;
+	u32 local_tbl_size = 0;
+	u32 out_bit = CIPSO_V4_INV_CAT;
+	u32 out_bit_max = 0;
+	u32 nbits;
+	int in_bit;
+
+	if (translate) {
+		local_tbl_size = doi_def->map.std->cat.local_size;
+		local_tbl = doi_def->map.std->cat.local;
+	}
+
+	/* walk every category set in the local MLS category map */
+	for (in_bit = netlbl_catmap_walk(secattr->attr.mls.cat, 0);
+	     in_bit >= 0;
+	     in_bit = netlbl_catmap_walk(secattr->attr.mls.cat, in_bit + 1)) {
+		if (translate) {
+			/* the local table must cover and map this bit */
+			if (in_bit >= local_tbl_size)
+				return -EPERM;
+			out_bit = local_tbl[in_bit];
+			if (out_bit >= CIPSO_V4_INV_CAT)
+				return -EPERM;
+		} else if (doi_def->type == CIPSO_V4_MAP_PASS) {
+			out_bit = in_bit;
+		}
+
+		/* for any other map type out_bit keeps its previous value */
+		if (out_bit >= out_bits)
+			return -ENOSPC;
+		netlbl_bitmap_setbit(net_cat, out_bit, 1);
+
+		if (out_bit > out_bit_max)
+			out_bit_max = out_bit;
+	}
+
+	/* bytes needed to hold bits 0..out_bit_max, rounded up */
+	nbits = out_bit_max + 1;
+	return nbits / 8 + (nbits % 8 ? 1 : 0);
+}
+
+/**
+ * cipso_v4_map_cat_rbm_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category bitmap in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO bitmap to the correct local
+ * MLS category bitmap using the given DOI definition.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
+				     const unsigned char *net_cat,
+				     u32 net_cat_len,
+				     struct netlbl_lsm_secattr *secattr)
+{
+	const bool translate = (doi_def->type == CIPSO_V4_MAP_TRANS);
+	const u32 in_bits = net_cat_len * 8;
+	const u32 *cipso_tbl = NULL;
+	u32 cipso_tbl_size = 0;
+	u32 out_bit = CIPSO_V4_INV_CAT;
+	int in_bit;
+	int ret_val;
+
+	if (translate) {
+		cipso_tbl_size = doi_def->map.std->cat.cipso_size;
+		cipso_tbl = doi_def->map.std->cat.cipso;
+	}
+
+	/* walk every bit set in the CIPSO bitmap and mirror it into the
+	 * local category map */
+	for (in_bit = netlbl_bitmap_walk(net_cat, in_bits, 0, 1);
+	     in_bit >= 0;
+	     in_bit = netlbl_bitmap_walk(net_cat, in_bits, in_bit + 1, 1)) {
+		if (translate) {
+			/* the CIPSO table must cover and map this bit */
+			if (in_bit >= cipso_tbl_size)
+				return -EPERM;
+			out_bit = cipso_tbl[in_bit];
+			if (out_bit >= CIPSO_V4_INV_CAT)
+				return -EPERM;
+		} else if (doi_def->type == CIPSO_V4_MAP_PASS) {
+			out_bit = in_bit;
+		}
+
+		/* a non-zero result from the setter is passed straight back */
+		ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
+					       out_bit,
+					       GFP_ATOMIC);
+		if (ret_val != 0)
+			return ret_val;
+	}
+
+	/* The walk ended on a negative value: -2 is reported as -EFAULT,
+	 * anything else counts as a normal end of the bitmap.
+	 */
+	if (in_bit == -2)
+		return -EFAULT;
+	return 0;
+}
+
+/**
+ * cipso_v4_map_cat_enum_valid - Checks to see if the categories are valid
+ * @doi_def: the DOI definition
+ * @enumcat: category list
+ * @enumcat_len: length of the category list in bytes
+ *
+ * Description:
+ * Checks the given categories against the given DOI definition and returns a
+ * negative value if any of the categories do not have a valid mapping and a
+ * zero value if all of the categories are valid.
+ *
+ */
+static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def,
+				       const unsigned char *enumcat,
+				       u32 enumcat_len)
+{
+	int prev = -1;
+	int cur;
+	u32 off;
+
+	if (enumcat_len % 2 != 0 || doi_def->type != CIPSO_V4_MAP_PASS)
+		return -EFAULT;
+
+	/* 16-bit big-endian categories must be strictly increasing */
+	for (off = 0; off < enumcat_len; off += 2) {
+		cur = get_unaligned_be16(&enumcat[off]);
+		if (cur <= prev)
+			return -EFAULT;
+		prev = cur;
+	}
+	return 0;
+}
+
+/**
+ * cipso_v4_map_cat_enum_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO category list using the given DOI definition.   Returns the
+ * size in bytes of the network category bitmap on success, negative values
+ * otherwise.
+ *
+ */
+static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def,
+				      const struct netlbl_lsm_secattr *secattr,
+				      unsigned char *net_cat,
+				      u32 net_cat_len)
+{
+	int cat = -1;
+	u32 cat_iter = 0;	/* bytes written so far; also the return value */
+
+	for (;;) {
+		cat = netlbl_catmap_walk(secattr->attr.mls.cat, cat + 1);
+		if (cat < 0)
+			break;
+		if ((cat_iter + 2) > net_cat_len)
+			return -ENOSPC;
+		/* @net_cat is a byte buffer with no alignment guarantee */
+		put_unaligned_be16(cat, &net_cat[cat_iter]);
+		cat_iter += 2;
+	}
+
+	return cat_iter;
+}
+
+/**
+ * cipso_v4_map_cat_enum_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO category list to the correct
+ * local MLS category bitmap using the given DOI definition.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def,
+				      const unsigned char *net_cat,
+				      u32 net_cat_len,
+				      struct netlbl_lsm_secattr *secattr)
+{
+	u32 off = 0;
+	int ret_val = 0;
+
+	/* each 16-bit big-endian value names one category bit to set */
+	while (ret_val == 0 && off < net_cat_len) {
+		ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
+					       get_unaligned_be16(&net_cat[off]),
+					       GFP_ATOMIC);
+		off += 2;
+	}
+
+	return ret_val;
+}
+
+/**
+ * cipso_v4_map_cat_rng_valid - Checks to see if the categories are valid
+ * @doi_def: the DOI definition
+ * @rngcat: category list
+ * @rngcat_len: length of the category list in bytes
+ *
+ * Description:
+ * Checks the given categories against the given DOI definition and returns a
+ * negative value if any of the categories do not have a valid mapping and a
+ * zero value if all of the categories are valid.
+ *
+ */
+static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def,
+				      const unsigned char *rngcat,
+				      u32 rngcat_len)
+{
+	u32 limit = CIPSO_V4_MAX_REM_CATS + 1;
+	u32 off;
+	u16 high;
+
+	if (doi_def->type != CIPSO_V4_MAP_PASS)
+		return -EFAULT;
+	if (rngcat_len % 2 != 0)
+		return -EFAULT;
+
+	for (off = 0; off < rngcat_len; off += 4) {
+		/* the top of this range may not exceed the previous bottom */
+		high = get_unaligned_be16(&rngcat[off]);
+		if (high > limit)
+			return -EFAULT;
+		/* a final range may omit its low end, which then reads as 0 */
+		if (off + 4 <= rngcat_len)
+			limit = get_unaligned_be16(&rngcat[off + 2]);
+		else
+			limit = 0;
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rng_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO category list using the given DOI definition.   Returns the
+ * size in bytes of the network category bitmap on success, negative values
+ * otherwise.
+ *
+ */
+static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
+				     const struct netlbl_lsm_secattr *secattr,
+				     unsigned char *net_cat,
+				     u32 net_cat_len)
+{
+	int iter = -1;
+	u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2];
+	u32 array_cnt = 0;
+	u32 cat_size = 0;
+
+	/* make sure we don't overflow the 'array[]' variable */
+	if (net_cat_len >
+	    (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN))
+		return -ENOSPC;
+
+	for (;;) {
+		iter = netlbl_catmap_walk(secattr->attr.mls.cat, iter + 1);
+		if (iter < 0)
+			break;
+		cat_size += (iter == 0 ? 0 : sizeof(u16));
+		if (cat_size > net_cat_len)
+			return -ENOSPC;
+		array[array_cnt++] = iter;
+		/* presumably yields the last bit of the run starting at iter - confirm */
+		iter = netlbl_catmap_walkrng(secattr->attr.mls.cat, iter);
+		if (iter < 0)
+			return -EFAULT;
+		cat_size += sizeof(u16);
+		if (cat_size > net_cat_len)
+			return -ENOSPC;
+		array[array_cnt++] = iter;
+	}
+	/* emit the pairs last-collected first; a first value of 0 is left out */
+	for (iter = 0; array_cnt > 0;) {
+		put_unaligned_be16(array[--array_cnt], &net_cat[iter]);
+		iter += 2;
+		array_cnt--;
+		if (array[array_cnt] != 0) {
+			put_unaligned_be16(array[array_cnt], &net_cat[iter]);
+			iter += 2;
+		}
+	}
+
+	return cat_size;
+}
+
+/**
+ * cipso_v4_map_cat_rng_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO category list to the correct
+ * local MLS category bitmap using the given DOI definition.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
+				     const unsigned char *net_cat,
+				     u32 net_cat_len,
+				     struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	u32 net_iter;
+	u16 cat_low;
+	u16 cat_high;
+
+	for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) {
+		cat_high = get_unaligned_be16(&net_cat[net_iter]);
+		if ((net_iter + 4) <= net_cat_len)
+			cat_low = get_unaligned_be16(&net_cat[net_iter + 2]);
+		else
+			cat_low = 0;	/* lone final value: range starts at 0 */
+
+		ret_val = netlbl_catmap_setrng(&secattr->attr.mls.cat,
+					       cat_low,
+					       cat_high,
+					       GFP_ATOMIC);
+		if (ret_val != 0)
+			return ret_val;
+	}
+
+	return 0;
+}
+
+/*
+ * Protocol Handling Functions
+ */
+
+/**
+ * cipso_v4_gentag_hdr - Generate a CIPSO option header
+ * @doi_def: the DOI definition
+ * @buf: the CIPSO option buffer
+ * @len: the total tag length in bytes, not including this header
+ *
+ * Description:
+ * Write a CIPSO header into the beginning of @buf.
+ *
+ */
+static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def,
+				unsigned char *buf,
+				u32 len)
+{
+	buf[0] = IPOPT_CIPSO;
+	buf[1] = CIPSO_V4_HDR_LEN + len;	/* total option length */
+	put_unaligned_be32(doi_def->doi, &buf[2]);
+}
+
+/**
+ * cipso_v4_gentag_rbm - Generate a CIPSO restricted bitmap tag (type #1)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the restricted bitmap tag, tag type #1.  The
+ * actual buffer length may be larger than the indicated size due to
+ * translation between host and network category bitmaps.  Returns the size of
+ * the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
+			       const struct netlbl_lsm_secattr *secattr,
+			       unsigned char *buffer,
+			       u32 buffer_len)
+{
+	int ret_val;
+	u32 tag_len;
+	u32 level;
+
+	if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
+		return -EPERM;
+
+	ret_val = cipso_v4_map_lvl_hton(doi_def,
+					secattr->attr.mls.lvl,
+					&level);
+	if (ret_val != 0)
+		return ret_val;
+
+	if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+		ret_val = cipso_v4_map_cat_rbm_hton(doi_def,
+						    secattr,
+						    &buffer[4],
+						    buffer_len - 4);
+		if (ret_val < 0)
+			return ret_val;
+
+		/* This will send packets using the "optimized" format when
+		 * possible as specified in  section 3.4.2.6 of the
+		 * CIPSO draft. */
+		if (READ_ONCE(cipso_v4_rbm_optfmt) && ret_val > 0 &&
+		    ret_val <= 10)
+			tag_len = 14;	/* 4 byte tag header + 10 bytes */
+		else
+			tag_len = 4 + ret_val;
+	} else
+		tag_len = 4;	/* level only, no categories */
+
+	buffer[0] = CIPSO_V4_TAG_RBITMAP;
+	buffer[1] = tag_len;
+	buffer[3] = level;
+
+	return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_rbm - Parse a CIPSO restricted bitmap tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO restricted bitmap tag (tag type #1) and return the security
+ * attributes in @secattr.  Return zero on success, negative values on
+ * failure.
+ *
+ */
+static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
+				 const unsigned char *tag,
+				 struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	u8 tag_len = tag[1];
+	u32 level;
+
+	ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+	if (ret_val != 0)
+		return ret_val;
+	secattr->attr.mls.lvl = level;
+	secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+	if (tag_len > 4) {	/* category data follows the level */
+		ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def,
+						    &tag[4],
+						    tag_len - 4,
+						    secattr);
+		if (ret_val != 0) {
+			netlbl_catmap_free(secattr->attr.mls.cat);
+			return ret_val;
+		}
+
+		if (secattr->attr.mls.cat)	/* flag only a non-NULL map */
+			secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_gentag_enum - Generate a CIPSO enumerated tag (type #2)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the enumerated tag, tag type #2.  Returns the
+ * size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def,
+				const struct netlbl_lsm_secattr *secattr,
+				unsigned char *buffer,
+				u32 buffer_len)
+{
+	int ret_val;
+	u32 tag_len;
+	u32 level;
+
+	if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
+		return -EPERM;
+
+	ret_val = cipso_v4_map_lvl_hton(doi_def,
+					secattr->attr.mls.lvl,
+					&level);
+	if (ret_val != 0)
+		return ret_val;
+
+	if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+		ret_val = cipso_v4_map_cat_enum_hton(doi_def,
+						     secattr,
+						     &buffer[4],
+						     buffer_len - 4);
+		if (ret_val < 0)
+			return ret_val;
+
+		tag_len = 4 + ret_val;	/* 4 byte tag header + categories */
+	} else
+		tag_len = 4;	/* level only, no categories */
+
+	buffer[0] = CIPSO_V4_TAG_ENUM;
+	buffer[1] = tag_len;
+	buffer[3] = level;
+
+	return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_enum - Parse a CIPSO enumerated tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO enumerated tag (tag type #2) and return the security
+ * attributes in @secattr.  Return zero on success, negative values on
+ * failure.
+ *
+ */
+static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
+				  const unsigned char *tag,
+				  struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	u8 tag_len = tag[1];
+	u32 level;
+
+	ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+	if (ret_val != 0)
+		return ret_val;
+	secattr->attr.mls.lvl = level;
+	secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+	if (tag_len > 4) {	/* category data follows the level */
+		ret_val = cipso_v4_map_cat_enum_ntoh(doi_def,
+						     &tag[4],
+						     tag_len - 4,
+						     secattr);
+		if (ret_val != 0) {
+			netlbl_catmap_free(secattr->attr.mls.cat);
+			return ret_val;
+		}
+
+		secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_gentag_rng - Generate a CIPSO ranged tag (type #5)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the ranged tag, tag type #5.  Returns the
+ * size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def,
+			       const struct netlbl_lsm_secattr *secattr,
+			       unsigned char *buffer,
+			       u32 buffer_len)
+{
+	int ret_val;
+	u32 tag_len;
+	u32 level;
+
+	if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
+		return -EPERM;
+
+	ret_val = cipso_v4_map_lvl_hton(doi_def,
+					secattr->attr.mls.lvl,
+					&level);
+	if (ret_val != 0)
+		return ret_val;
+
+	if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+		ret_val = cipso_v4_map_cat_rng_hton(doi_def,
+						    secattr,
+						    &buffer[4],
+						    buffer_len - 4);
+		if (ret_val < 0)
+			return ret_val;
+
+		tag_len = 4 + ret_val;	/* 4 byte tag header + categories */
+	} else
+		tag_len = 4;	/* level only, no categories */
+
+	buffer[0] = CIPSO_V4_TAG_RANGE;
+	buffer[1] = tag_len;
+	buffer[3] = level;
+
+	return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_rng - Parse a CIPSO ranged tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO ranged tag (tag type #5) and return the security attributes
+ * in @secattr.  Return zero on success, negative values on failure.
+ *
+ */
+static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
+				 const unsigned char *tag,
+				 struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	u8 tag_len = tag[1];
+	u32 level;
+
+	ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+	if (ret_val != 0)
+		return ret_val;
+	secattr->attr.mls.lvl = level;
+	secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+	if (tag_len > 4) {	/* category data follows the level */
+		ret_val = cipso_v4_map_cat_rng_ntoh(doi_def,
+						    &tag[4],
+						    tag_len - 4,
+						    secattr);
+		if (ret_val != 0) {
+			netlbl_catmap_free(secattr->attr.mls.cat);
+			return ret_val;
+		}
+
+		if (secattr->attr.mls.cat)	/* flag only a non-NULL map */
+			secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the local tag, storing the secid in host byte
+ * order.  Returns the size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def,
+			       const struct netlbl_lsm_secattr *secattr,
+			       unsigned char *buffer,
+			       u32 buffer_len)
+{
+	if (!(secattr->flags & NETLBL_SECATTR_SECID))
+		return -EPERM;
+
+	buffer[0] = CIPSO_V4_TAG_LOCAL;
+	buffer[1] = CIPSO_V4_TAG_LOC_BLEN;
+	memcpy(&buffer[2], &secattr->attr.secid, sizeof(secattr->attr.secid));
+
+	return CIPSO_V4_TAG_LOC_BLEN;
+}
+
+/**
+ * cipso_v4_parsetag_loc - Parse a CIPSO local tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO local tag and store its host byte order secid in @secattr.
+ * The tag may sit at any alignment.  Always returns zero.
+ *
+ */
+static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
+				 const unsigned char *tag,
+				 struct netlbl_lsm_secattr *secattr)
+{
+	memcpy(&secattr->attr.secid, &tag[2], sizeof(secattr->attr.secid));
+	secattr->flags |= NETLBL_SECATTR_SECID;
+
+	return 0;
+}
+
+/**
+ * cipso_v4_optptr - Find the CIPSO option in the packet
+ * @skb: the packet
+ *
+ * Description:
+ * Parse the packet's IP header looking for a CIPSO option.  Returns a pointer
+ * to the start of the CIPSO option on success, NULL if one is not found.
+ *
+ */
+unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	unsigned char *optptr = (unsigned char *)&(ip_hdr(skb)[1]);
+	int optlen;
+	int taglen;
+
+	for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) {
+		switch (optptr[0]) {
+		case IPOPT_END:
+			return NULL;
+		case IPOPT_NOOP:
+			taglen = 1;	/* NOOP is a single byte */
+			break;
+		default:
+			taglen = optptr[1];	/* option's own length byte */
+		}
+		if (!taglen || taglen > optlen)	/* malformed option */
+			return NULL;
+		if (optptr[0] == IPOPT_CIPSO)
+			return optptr;
+
+		optlen -= taglen;
+		optptr += taglen;
+	}
+
+	return NULL;
+}
+
+/**
+ * cipso_v4_validate - Validate a CIPSO option
+ * @skb: the packet
+ * @option: the start of the option, on error it is set to point to the error
+ *
+ * Description:
+ * This routine is called to validate a CIPSO option, it checks all of the
+ * fields to ensure that they are at least valid, see the draft snippet below
+ * for details.  If the option is valid then a zero value is returned and
+ * the value of @option is unchanged.  If the option is invalid then a
+ * non-zero offset is returned and @option is advanced by that offset to the
+ * offending portion of the option.  From the IETF draft ...
+ *
+ *  "If any field within the CIPSO options, such as the DOI identifier, is not
+ *   recognized the IP datagram is discarded and an ICMP 'parameter problem'
+ *   (type 12) is generated and returned.  The ICMP code field is set to 'bad
+ *   parameter' (code 0) and the pointer is set to the start of the CIPSO field
+ *   that is unrecognized."
+ *
+ */
+int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
+{
+	unsigned char *opt = *option;
+	unsigned char *tag;
+	unsigned char opt_iter;
+	unsigned char err_offset = 0;
+	u8 opt_len;
+	u8 tag_len;
+	struct cipso_v4_doi *doi_def = NULL;
+	u32 tag_iter;
+
+	/* caller already checks for length values that are too large */
+	opt_len = opt[1];
+	if (opt_len < 8) {
+		err_offset = 1;
+		goto validate_return;
+	}
+
+	rcu_read_lock();
+	doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2]));
+	if (!doi_def) {
+		err_offset = 2;
+		goto validate_return_locked;
+	}
+
+	opt_iter = CIPSO_V4_HDR_LEN;
+	tag = opt + opt_iter;
+	while (opt_iter < opt_len) {
+		for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];)
+			if (doi_def->tags[tag_iter] == CIPSO_V4_TAG_INVALID ||
+			    ++tag_iter == CIPSO_V4_TAG_MAXCNT) {
+				err_offset = opt_iter;
+				goto validate_return_locked;
+			}
+
+		if (opt_iter + 1 == opt_len) {	/* no tag length byte */
+			err_offset = opt_iter;
+			goto validate_return_locked;
+		}
+		tag_len = tag[1];
+		if (tag_len > (opt_len - opt_iter)) {
+			err_offset = opt_iter + 1;
+			goto validate_return_locked;
+		}
+
+		switch (tag[0]) {
+		case CIPSO_V4_TAG_RBITMAP:
+			if (tag_len < CIPSO_V4_TAG_RBM_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+
+			/* We are already going to do all the verification
+			 * necessary at the socket layer so from our point of
+			 * view it is safe to turn these checks off (and less
+			 * work), however, the CIPSO draft says we should do
+			 * all the CIPSO validations here but it doesn't
+			 * really specify _exactly_ what we need to validate
+			 * ... so, just make it a sysctl tunable. */
+			if (READ_ONCE(cipso_v4_rbm_strictvalid)) {
+				if (cipso_v4_map_lvl_valid(doi_def,
+							   tag[3]) < 0) {
+					err_offset = opt_iter + 3;
+					goto validate_return_locked;
+				}
+				if (tag_len > CIPSO_V4_TAG_RBM_BLEN &&
+				    cipso_v4_map_cat_rbm_valid(doi_def,
+							    &tag[4],
+							    tag_len - 4) < 0) {
+					err_offset = opt_iter + 4;
+					goto validate_return_locked;
+				}
+			}
+			break;
+		case CIPSO_V4_TAG_ENUM:
+			if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+
+			if (cipso_v4_map_lvl_valid(doi_def,
+						   tag[3]) < 0) {
+				err_offset = opt_iter + 3;
+				goto validate_return_locked;
+			}
+			if (tag_len > CIPSO_V4_TAG_ENUM_BLEN &&
+			    cipso_v4_map_cat_enum_valid(doi_def,
+							&tag[4],
+							tag_len - 4) < 0) {
+				err_offset = opt_iter + 4;
+				goto validate_return_locked;
+			}
+			break;
+		case CIPSO_V4_TAG_RANGE:
+			if (tag_len < CIPSO_V4_TAG_RNG_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+
+			if (cipso_v4_map_lvl_valid(doi_def,
+						   tag[3]) < 0) {
+				err_offset = opt_iter + 3;
+				goto validate_return_locked;
+			}
+			if (tag_len > CIPSO_V4_TAG_RNG_BLEN &&
+			    cipso_v4_map_cat_rng_valid(doi_def,
+						       &tag[4],
+						       tag_len - 4) < 0) {
+				err_offset = opt_iter + 4;
+				goto validate_return_locked;
+			}
+			break;
+		case CIPSO_V4_TAG_LOCAL:
+			/* This is a non-standard tag that we only allow for
+			 * local connections, so if the incoming interface is
+			 * not the loopback device drop the packet. Further,
+			 * there is no legitimate reason for setting this from
+			 * userspace so reject it if skb is NULL. */
+			if (!skb || !(skb->dev->flags & IFF_LOOPBACK)) {
+				err_offset = opt_iter;
+				goto validate_return_locked;
+			}
+			if (tag_len != CIPSO_V4_TAG_LOC_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+			break;
+		default:
+			err_offset = opt_iter;
+			goto validate_return_locked;
+		}
+
+		tag += tag_len;
+		opt_iter += tag_len;
+	}
+
+validate_return_locked:
+	rcu_read_unlock();
+validate_return:
+	*option = opt + err_offset;	/* unchanged when err_offset is 0 */
+	return err_offset;
+}
+
+/**
+ * cipso_v4_error - Send the correct response for a bad packet
+ * @skb: the packet
+ * @error: the error code, only -EACCES generates a response
+ * @gateway: CIPSO gateway flag
+ *
+ * Description:
+ * Based on the error code given in @error, send an ICMP error message back to
+ * the originating host.  From the IETF draft ...
+ *
+ *  "If the contents of the CIPSO [option] are valid but the security label is
+ *   outside of the configured host or port label range, the datagram is
+ *   discarded and an ICMP 'destination unreachable' (type 3) is generated and
+ *   returned.  The code field of the ICMP is set to 'communication with
+ *   destination network administratively prohibited' (code 9) or to
+ *   'communication with destination host administratively prohibited'
+ *   (code 10).  The value of the code is dependent on whether the originator
+ *   of the ICMP message is acting as a CIPSO host or a CIPSO gateway.  The
+ *   recipient of the ICMP message MUST be able to handle either value.  The
+ *   same procedure is performed if a CIPSO [option] can not be added to an
+ *   IP packet because it is too large to fit in the IP options area."
+ *
+ *  "If the error is triggered by receipt of an ICMP message, the message is
+ *   discarded and no response is permitted (consistent with general ICMP
+ *   processing rules)."
+ *
+ */
+void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
+{
+	unsigned char optbuf[sizeof(struct ip_options) + 40];
+	struct ip_options *opt = (struct ip_options *)optbuf;
+	int res;
+
+	if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
+		return;
+
+	/*
+	 * We might be called above the IP layer,
+	 * so we can not use icmp_send and IPCB here.
+	 */
+
+	memset(opt, 0, sizeof(struct ip_options));
+	opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr);
+	rcu_read_lock();
+	res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL);
+	rcu_read_unlock();
+
+	if (res)	/* options did not compile, send nothing */
+		return;
+
+	if (gateway)
+		__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt);
+	else
+		__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt);
+}
+
+/**
+ * cipso_v4_genopt - Generate a CIPSO option
+ * @buf: the option buffer
+ * @buf_len: the size of @buf in bytes
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Generate a CIPSO option using the DOI definition and security attributes
+ * passed to the function.  Returns the length of the option on success and
+ * negative values on failure.
+ *
+ */
+static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
+			   const struct cipso_v4_doi *doi_def,
+			   const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	u32 iter;
+
+	if (buf_len <= CIPSO_V4_HDR_LEN)
+		return -ENOSPC;
+
+	/* XXX - This code assumes only one tag per CIPSO option which isn't
+	 * really a good assumption to make but since we only support the MAC
+	 * tags right now it is a safe assumption. */
+	iter = 0;
+	do {
+		memset(buf, 0, buf_len);	/* start clean for each tag tried */
+		switch (doi_def->tags[iter]) {
+		case CIPSO_V4_TAG_RBITMAP:
+			ret_val = cipso_v4_gentag_rbm(doi_def,
+						   secattr,
+						   &buf[CIPSO_V4_HDR_LEN],
+						   buf_len - CIPSO_V4_HDR_LEN);
+			break;
+		case CIPSO_V4_TAG_ENUM:
+			ret_val = cipso_v4_gentag_enum(doi_def,
+						   secattr,
+						   &buf[CIPSO_V4_HDR_LEN],
+						   buf_len - CIPSO_V4_HDR_LEN);
+			break;
+		case CIPSO_V4_TAG_RANGE:
+			ret_val = cipso_v4_gentag_rng(doi_def,
+						   secattr,
+						   &buf[CIPSO_V4_HDR_LEN],
+						   buf_len - CIPSO_V4_HDR_LEN);
+			break;
+		case CIPSO_V4_TAG_LOCAL:
+			ret_val = cipso_v4_gentag_loc(doi_def,
+						   secattr,
+						   &buf[CIPSO_V4_HDR_LEN],
+						   buf_len - CIPSO_V4_HDR_LEN);
+			break;
+		default:
+			return -EPERM;
+		}
+
+		iter++;
+	} while (ret_val < 0 &&
+		 iter < CIPSO_V4_TAG_MAXCNT &&
+		 doi_def->tags[iter] != CIPSO_V4_TAG_INVALID);
+	if (ret_val < 0)
+		return ret_val;
+	cipso_v4_gentag_hdr(doi_def, buf, ret_val);
+	return CIPSO_V4_HDR_LEN + ret_val;
+}
+
+/**
+ * cipso_v4_sock_setattr - Add a CIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function.  This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked.  Returns zero on success and negative
+ * values on failure.
+ *
+ */
+int cipso_v4_sock_setattr(struct sock *sk,
+			  const struct cipso_v4_doi *doi_def,
+			  const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -EPERM;
+	unsigned char *buf = NULL;
+	u32 buf_len;
+	u32 opt_len;
+	struct ip_options_rcu *old, *opt = NULL;
+	struct inet_sock *sk_inet;
+	struct inet_connection_sock *sk_conn;
+
+	/* In the case of sock_create_lite(), the sock->sk field is not
+	 * defined yet but it is not a problem as the only users of these
+	 * "lite" PF_INET sockets are functions which do an accept() call
+	 * afterwards so we will label the socket as part of the accept(). */
+	if (!sk)
+		return 0;
+
+	/* We allocate the maximum CIPSO option size here so we are probably
+	 * being a little wasteful, but it makes our life _much_ easier later
+	 * on and after all we are only talking about 40 bytes. */
+	buf_len = CIPSO_V4_OPT_LEN_MAX;
+	buf = kmalloc(buf_len, GFP_ATOMIC);
+	if (!buf) {
+		ret_val = -ENOMEM;
+		goto socket_setattr_failure;
+	}
+
+	ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+	if (ret_val < 0)
+		goto socket_setattr_failure;
+	buf_len = ret_val;
+
+	/* We can't use ip_options_get() directly because it makes a call to
+	 * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+	 * we won't always have CAP_NET_RAW even though we _always_ want to
+	 * set the IPOPT_CIPSO option. */
+	opt_len = (buf_len + 3) & ~3;	/* round up to a multiple of 4 */
+	opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+	if (!opt) {
+		ret_val = -ENOMEM;
+		goto socket_setattr_failure;
+	}
+	memcpy(opt->opt.__data, buf, buf_len);
+	opt->opt.optlen = opt_len;
+	opt->opt.cipso = sizeof(struct iphdr);	/* CIPSO is the first option */
+	kfree(buf);
+	buf = NULL;
+
+	sk_inet = inet_sk(sk);
+
+	old = rcu_dereference_protected(sk_inet->inet_opt,
+					lockdep_sock_is_held(sk));
+	if (inet_test_bit(IS_ICSK, sk)) {
+		sk_conn = inet_csk(sk);
+		if (old)
+			sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
+		sk_conn->icsk_ext_hdr_len += opt->opt.optlen;
+		sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+	}
+	rcu_assign_pointer(sk_inet->inet_opt, opt);
+	if (old)	/* the replaced options are freed via RCU */
+		kfree_rcu(old, rcu);
+
+	return 0;
+
+socket_setattr_failure:
+	kfree(buf);
+	kfree(opt);
+	return ret_val;
+}
+
+/**
+ * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function.  Returns zero on success and
+ * negative values on failure.
+ *
+ */
+int cipso_v4_req_setattr(struct request_sock *req,
+			 const struct cipso_v4_doi *doi_def,
+			 const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -EPERM;
+	unsigned char *buf = NULL;
+	u32 buf_len;
+	u32 opt_len;
+	struct ip_options_rcu *opt = NULL;
+	struct inet_request_sock *req_inet;
+
+	/* We allocate the maximum CIPSO option size here so we are probably
+	 * being a little wasteful, but it makes our life _much_ easier later
+	 * on and after all we are only talking about 40 bytes. */
+	buf_len = CIPSO_V4_OPT_LEN_MAX;
+	buf = kmalloc(buf_len, GFP_ATOMIC);
+	if (!buf) {
+		ret_val = -ENOMEM;
+		goto req_setattr_failure;
+	}
+
+	ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+	if (ret_val < 0)
+		goto req_setattr_failure;
+	buf_len = ret_val;
+
+	/* We can't use ip_options_get() directly because it makes a call to
+	 * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+	 * we won't always have CAP_NET_RAW even though we _always_ want to
+	 * set the IPOPT_CIPSO option. */
+	opt_len = (buf_len + 3) & ~3;	/* round up to a multiple of 4 */
+	opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+	if (!opt) {
+		ret_val = -ENOMEM;
+		goto req_setattr_failure;
+	}
+	memcpy(opt->opt.__data, buf, buf_len);
+	opt->opt.optlen = opt_len;
+	opt->opt.cipso = sizeof(struct iphdr);	/* CIPSO is the first option */
+	kfree(buf);
+	buf = NULL;
+
+	req_inet = inet_rsk(req);
+	opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt);
+	if (opt)	/* xchg() handed back the previous options */
+		kfree_rcu(opt, rcu);
+
+	return 0;
+
+req_setattr_failure:
+	kfree(buf);
+	kfree(opt);
+	return ret_val;
+}
+
+/**
+ * cipso_v4_delopt - Delete the CIPSO option from a set of IP options
+ * @opt_ptr: IP option pointer
+ *
+ * Description:
+ * Deletes the CIPSO IP option from a set of IP options and makes the necessary
+ * adjustments to the IP option structure.  Returns the number of bytes by
+ * which the padded option length shrank, zero if nothing was removed.
+ *
+ */
+static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr)
+{
+	struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1);
+	int hdr_delta = 0;
+
+	if (!opt || opt->opt.cipso == 0)
+		return 0;
+	if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
+		u8 cipso_len;
+		u8 cipso_off;
+		unsigned char *cipso_ptr;
+		int iter = 0;
+		int optlen_new = 0;
+
+		cipso_off = opt->opt.cipso - sizeof(struct iphdr);
+		cipso_ptr = &opt->opt.__data[cipso_off];
+		cipso_len = cipso_ptr[1];
+
+		if (opt->opt.srr > opt->opt.cipso)
+			opt->opt.srr -= cipso_len;
+		if (opt->opt.rr > opt->opt.cipso)
+			opt->opt.rr -= cipso_len;
+		if (opt->opt.ts > opt->opt.cipso)
+			opt->opt.ts -= cipso_len;
+		if (opt->opt.router_alert > opt->opt.cipso)
+			opt->opt.router_alert -= cipso_len;
+		opt->opt.cipso = 0;
+
+		memmove(cipso_ptr, cipso_ptr + cipso_len,
+			opt->opt.optlen - cipso_off - cipso_len);
+
+		/* walk the remaining options one-by-one to find where the
+		 * last real option ends; IPOPT_NOP is skipped and IPOPT_END
+		 * marks the start of the trailing padding */
+		while (iter < opt->opt.optlen) {
+			if (opt->opt.__data[iter] == IPOPT_END)
+				break;
+			if (opt->opt.__data[iter] == IPOPT_NOP) {
+				iter++;
+				continue;
+			}
+			iter += opt->opt.__data[iter + 1];
+			optlen_new = iter;
+		}
+		hdr_delta = opt->opt.optlen;
+		opt->opt.optlen = (optlen_new + 3) & ~3;
+		hdr_delta -= opt->opt.optlen;
+	} else {
+		/* only the cipso option was present on the socket so we can
+		 * remove the entire option struct */
+		*opt_ptr = NULL;
+		hdr_delta = opt->opt.optlen;
+		kfree_rcu(opt, rcu);
+	}
+
+	return hdr_delta;
+}
+
+/**
+ * cipso_v4_sock_delattr - Delete the CIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CIPSO option from a socket, if present.
+ *
+ */
+void cipso_v4_sock_delattr(struct sock *sk)
+{
+	struct inet_sock *sk_inet;
+	int hdr_delta;
+
+	sk_inet = inet_sk(sk);
+
+	hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
+	if (inet_test_bit(IS_ICSK, sk) && hdr_delta > 0) {
+		struct inet_connection_sock *sk_conn = inet_csk(sk);
+		sk_conn->icsk_ext_hdr_len -= hdr_delta;	/* options shrank */
+		sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+	}
+}
+
+/**
+ * cipso_v4_req_delattr - Delete the CIPSO option from a request socket
+ * @req: the request socket
+ *
+ * Description:
+ * Removes the CIPSO option from a request socket, if present.
+ *
+ */
+void cipso_v4_req_delattr(struct request_sock *req)
+{
+	cipso_v4_delopt(&inet_rsk(req)->ireq_opt);	/* delta is unused */
+}
+
+/**
+ * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions
+ * @cipso: the CIPSO v4 option
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Inspect @cipso and return the security attributes in @secattr.  Returns zero
+ * on success and negative values on failure.
+ *
+ */
+int cipso_v4_getattr(const unsigned char *cipso,
+		     struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -ENOMSG;
+	u32 doi;
+	struct cipso_v4_doi *doi_def;
+
+	if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0)
+		return 0;
+
+	doi = get_unaligned_be32(&cipso[2]);
+	rcu_read_lock();
+	doi_def = cipso_v4_doi_search(doi);
+	if (!doi_def)
+		goto getattr_return;
+	/* XXX - This code assumes only one tag per CIPSO option which isn't
+	 * really a good assumption to make but since we only support the MAC
+	 * tags right now it is a safe assumption. */
+	switch (cipso[6]) {	/* type of the first tag */
+	case CIPSO_V4_TAG_RBITMAP:
+		ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr);
+		break;
+	case CIPSO_V4_TAG_ENUM:
+		ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr);
+		break;
+	case CIPSO_V4_TAG_RANGE:
+		ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr);
+		break;
+	case CIPSO_V4_TAG_LOCAL:
+		ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr);
+		break;
+	}
+	if (ret_val == 0)
+		secattr->type = NETLBL_NLTYPE_CIPSOV4;
+
+getattr_return:
+	rcu_read_unlock();
+	return ret_val;	/* still -ENOMSG for an unknown DOI or tag type */
+}
+
+/**
+ * cipso_v4_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CIPSO option attached to the sock and if
+ * there is return the CIPSO security attributes in @secattr.  This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself.  Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+{
+	struct ip_options_rcu *opt;
+	int res = -ENOMSG;	/* returned when no CIPSO option is set */
+
+	rcu_read_lock();
+	opt = rcu_dereference(inet_sk(sk)->inet_opt);
+	if (opt && opt->opt.cipso)
+		res = cipso_v4_getattr(opt->opt.__data +
+						opt->opt.cipso -
+						sizeof(struct iphdr),
+				       secattr);
+	rcu_read_unlock();
+	return res;
+}
+
+/**
+ * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet
+ * @skb: the packet
+ * @doi_def: the DOI structure
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CIPSO option on the given packet based on the security attributes.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_setattr(struct sk_buff *skb,
+			    const struct cipso_v4_doi *doi_def,
+			    const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	struct iphdr *iph;
+	struct ip_options *opt = &IPCB(skb)->opt;
+	unsigned char buf[CIPSO_V4_OPT_LEN_MAX];
+	u32 buf_len = CIPSO_V4_OPT_LEN_MAX;
+	u32 opt_len;
+	int len_delta;
+
+	ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+	if (ret_val < 0)
+		return ret_val;
+	buf_len = ret_val;
+	opt_len = (buf_len + 3) & ~3;	/* round up to a multiple of 4 */
+
+	/* we overwrite any existing options to ensure that we have enough
+	 * room for the CIPSO option, the reason is that we _need_ to guarantee
+	 * that the security label is applied to the packet - we do the same
+	 * thing when using the socket options and it hasn't caused a problem,
+	 * if we need to we can always revisit this choice later */
+
+	len_delta = opt_len - opt->optlen;
+	/* if we don't ensure enough headroom we could panic on the skb_push()
+	 * call below so make sure we have enough, we are also "mangling" the
+	 * packet so we should probably do a copy-on-write call anyway */
+	ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+	if (ret_val < 0)
+		return ret_val;
+
+	if (len_delta > 0) {
+		/* we assume that the header + opt->optlen have already been
+		 * "pushed" in ip_options_build() or similar */
+		iph = ip_hdr(skb);
+		skb_push(skb, len_delta);
+		memmove((char *)iph - len_delta, iph, iph->ihl << 2);
+		skb_reset_network_header(skb);
+		iph = ip_hdr(skb);
+	} else if (len_delta < 0) {
+		iph = ip_hdr(skb);
+		memset(iph + 1, IPOPT_NOP, opt->optlen);
+	} else
+		iph = ip_hdr(skb);
+
+	if (opt->optlen > 0)
+		memset(opt, 0, sizeof(*opt));
+	opt->optlen = opt_len;
+	opt->cipso = sizeof(struct iphdr);
+	opt->is_changed = 1;
+
+	/* we have to do the following because we are being called from a
+	 * netfilter hook which means the packet already has had the header
+	 * fields populated and the checksum calculated - yes this means we
+	 * are doing more work than needed but we do it to keep the core
+	 * stack clean and tidy */
+	memcpy(iph + 1, buf, buf_len);
+	if (opt_len > buf_len)
+		memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len);
+	if (len_delta != 0) {
+		iph->ihl = 5 + (opt_len >> 2);
+		iph_set_totlen(iph, skb->len);
+	}
+	ip_send_check(iph);
+
+	return 0;
+}
+
+/**
+ * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CIPSO options from the given packet.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_delattr(struct sk_buff *skb)
+{
+	int ret_val;
+	struct iphdr *iph;
+	struct ip_options *opt = &IPCB(skb)->opt;
+	unsigned char *cipso_ptr;
+
+	if (opt->cipso == 0)
+		return 0;
+
+	/* since we are changing the packet we should make a copy */
+	ret_val = skb_cow(skb, skb_headroom(skb));
+	if (ret_val < 0)
+		return ret_val;
+
+	/* the easiest thing to do is just replace the cipso option with noop
+	 * options since we don't change the size of the packet, although we
+	 * still need to recalculate the checksum */
+
+	iph = ip_hdr(skb);
+	cipso_ptr = (unsigned char *)iph + opt->cipso;
+	memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]);
+	opt->cipso = 0;	/* no CIPSO option recorded any more */
+	opt->is_changed = 1;
+
+	ip_send_check(iph);
+
+	return 0;
+}
+
+/*
+ * Setup Functions
+ */
+
+/**
+ * cipso_v4_init - Initialize the CIPSO module
+ *
+ * Description:
+ * Initialize the CIPSO module and prepare it for use.  Always returns zero; a
+ * cache initialization failure panics instead of returning an error.
+ *
+ */
+static int __init cipso_v4_init(void)
+{
+	int ret_val;
+
+	ret_val = cipso_v4_cache_init();
+	if (ret_val != 0)
+		panic("Failed to initialize the CIPSO/IPv4 cache (%d)\n",
+		      ret_val);
+
+	return 0;
+}
+
+subsys_initcall(cipso_v4_init);
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
new file mode 100644
index 0000000000..cb5dbee9e0
--- /dev/null
+++ b/net/ipv4/datagram.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *	common UDP/RAW code
+ *	Linux INET implementation
+ *
+ * Authors:
+ * 	Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+#include <net/sock_reuseport.h>
+
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	__be32 saddr;
+	int oif;
+	int err;
+
+
+	if (addr_len < sizeof(*usin))	/* too short for a sockaddr_in */
+		return -EINVAL;
+
+	if (usin->sin_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	sk_dst_reset(sk);
+
+	oif = sk->sk_bound_dev_if;
+	saddr = inet->inet_saddr;
+	if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
+		if (!oif || netif_index_is_l3_master(sock_net(sk), oif))
+			oif = inet->mc_index;	/* multicast output interface */
+		if (!saddr)
+			saddr = inet->mc_addr;	/* multicast source address */
+	} else if (!oif) {	/* unicast and no bound device */
+		oif = inet->uc_index;
+	}
+	fl4 = &inet->cork.fl.u.ip4;	/* flow key is kept in the socket's cork */
+	rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, oif,
+			      sk->sk_protocol, inet->inet_sport,
+			      usin->sin_port, sk);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		if (err == -ENETUNREACH)	/* account the missing route */
+			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+		goto out;
+	}
+
+	if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
+		ip_rt_put(rt);
+		err = -EACCES;	/* broadcast route without SOCK_BROADCAST */
+		goto out;
+	}
+	if (!inet->inet_saddr)
+		inet->inet_saddr = fl4->saddr;	/* Update source address */
+	if (!inet->inet_rcv_saddr) {
+		inet->inet_rcv_saddr = fl4->saddr;
+		if (sk->sk_prot->rehash)	/* receive address changed */
+			sk->sk_prot->rehash(sk);
+	}
+	inet->inet_daddr = fl4->daddr;
+	inet->inet_dport = usin->sin_port;
+	reuseport_has_conns_set(sk);
+	sk->sk_state = TCP_ESTABLISHED;
+	sk_set_txhash(sk);
+	atomic_set(&inet->inet_id, get_random_u16());
+
+	sk_dst_set(sk, &rt->dst);	/* hand the route to the socket */
+	err = 0;
+out:
+	return err;
+}
+EXPORT_SYMBOL(__ip4_datagram_connect);
+
+int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	int res;
+
+	lock_sock(sk);	/* the __ variant runs with the socket locked */
+	res = __ip4_datagram_connect(sk, uaddr, addr_len);
+	release_sock(sk);
+	return res;
+}
+EXPORT_SYMBOL(ip4_datagram_connect);
+
+/* Re-resolve the socket's cached route when it is obsolete and fails ->check().
+ * The UDP xmit path can manipulate sk_dst_cache without holding the
+ * socket lock, so sk_dst_set() must be used here even if we own the lock.
+ */
+void ip4_datagram_release_cb(struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ip_options_rcu *inet_opt;
+	__be32 daddr = inet->inet_daddr;
+	struct dst_entry *dst;
+	struct flowi4 fl4;
+	struct rtable *rt;
+
+	rcu_read_lock();
+
+	dst = __sk_dst_get(sk);
+	if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) {
+		rcu_read_unlock();	/* no dst, or it is still usable */
+		return;
+	}
+	inet_opt = rcu_dereference(inet->inet_opt);
+	if (inet_opt && inet_opt->opt.srr)	/* source route: use opt.faddr */
+		daddr = inet_opt->opt.faddr;
+	rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr,
+				   inet->inet_saddr, inet->inet_dport,
+				   inet->inet_sport, sk->sk_protocol,
+				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+
+	dst = !IS_ERR(rt) ? &rt->dst : NULL;	/* NULL when the lookup failed */
+	sk_dst_set(sk, dst);
+
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
new file mode 100644
index 0000000000..ca0ff15dc8
--- /dev/null
+++ b/net/ipv4/devinet.c
@@ -0,0 +1,2816 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *	NET3	IP device support routines.
+ *
+ *	Derived from the IP parts of dev.c 1.0.19
+ * 		Authors:	Ross Biro
+ *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *				Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ *	Additional Authors:
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *	Changes:
+ *		Alexey Kuznetsov:	pa_* fields are replaced with ifaddr
+ *					lists.
+ *		Cyrus Durgin:		updated for kmod
+ *		Matthias Andree:	in devinet_ioctl, compare label and
+ *					address (4.4BSD alias style support),
+ *					fall back to comparing just the label
+ *					if no match found.
+ */
+
+
+#include <linux/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched/signal.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_addr.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/kmod.h>
+#include <linux/netconf.h>
+
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/addrconf.h>
+
+#define IPV6ONLY_FLAGS	\
+		(IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \
+		 IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \
+		 IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY)
+
+static struct ipv4_devconf ipv4_devconf = {
+	.data = {	/* attribute X is stored at index IPV4_DEVCONF_X - 1 */
+		[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+		[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+		[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] =  1000 /*ms*/,
+		[IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1,
+	},
+};
+
+static struct ipv4_devconf ipv4_devconf_dflt = {
+	.data = {	/* ipv4_devconf's values plus ACCEPT_SOURCE_ROUTE */
+		[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+		[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+		[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+		[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] =  1000 /*ms*/,
+		[IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1,
+	},
+};
+
+#define IPV4_DEVCONF_DFLT(net, attr) \
+	IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr)
+
+static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {	/* used when parsing RTM_*ADDR requests */
+	[IFA_LOCAL]     	= { .type = NLA_U32 },
+	[IFA_ADDRESS]   	= { .type = NLA_U32 },
+	[IFA_BROADCAST] 	= { .type = NLA_U32 },
+	[IFA_LABEL]     	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+	[IFA_CACHEINFO]		= { .len = sizeof(struct ifa_cacheinfo) },
+	[IFA_FLAGS]		= { .type = NLA_U32 },
+	[IFA_RT_PRIORITY]	= { .type = NLA_U32 },
+	[IFA_TARGET_NETNSID]	= { .type = NLA_S32 },
+	[IFA_PROTO]		= { .type = NLA_U8 },
+};
+
+struct inet_fill_args {
+	u32 portid;
+	u32 seq;
+	int event;
+	unsigned int flags;
+	int netnsid;
+	int ifindex;
+};
+
+#define IN4_ADDR_HSIZE_SHIFT	8
+#define IN4_ADDR_HSIZE		(1U << IN4_ADDR_HSIZE_SHIFT)
+
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+
+static u32 inet_addr_hash(const struct net *net, __be32 addr)
+{
+	u32 val = (__force u32) addr ^ net_hash_mix(net);	/* mix in the netns */
+
+	return hash_32(val, IN4_ADDR_HSIZE_SHIFT);	/* index into inet_addr_lst[] */
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+	u32 hash = inet_addr_hash(net, ifa->ifa_local);	/* keyed by local address */
+
+	ASSERT_RTNL();
+	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+	ASSERT_RTNL();
+	hlist_del_init_rcu(&ifa->hash);	/* unlink from its inet_addr_lst[] bucket */
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * Returns NULL if none; with devref=false the caller must hold RCU or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+	struct net_device *result = NULL;
+	struct in_ifaddr *ifa;
+
+	rcu_read_lock();
+	ifa = inet_lookup_ifaddr_rcu(net, addr);
+	if (!ifa) {
+		struct flowi4 fl4 = { .daddr = addr };
+		struct fib_result res = { 0 };
+		struct fib_table *local;
+
+		/* Fallback to FIB local table so that communication
+		 * over loopback subnets works.
+		 */
+		local = fib_get_table(net, RT_TABLE_LOCAL);
+		if (local &&
+		    !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
+		    res.type == RTN_LOCAL)
+			result = FIB_RES_DEV(res);
+	} else {
+		result = ifa->ifa_dev->dev;	/* address hash hit */
+	}
+	if (result && devref)
+		dev_hold(result);	/* taken before leaving the RCU section */
+	rcu_read_unlock();
+	return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
+/* Find the ifaddr whose local address is @addr in @net; called under RCU lock */
+struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr)
+{
+	u32 hash = inet_addr_hash(net, addr);
+	struct in_ifaddr *ifa;
+
+	hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash)
+		if (ifa->ifa_local == addr &&
+		    net_eq(dev_net(ifa->ifa_dev->dev), net))	/* table is shared by all netns */
+			return ifa;
+
+	return NULL;
+}
+
+static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
+
+static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
+static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain);
+static void inet_del_ifa(struct in_device *in_dev,
+			 struct in_ifaddr __rcu **ifap,
+			 int destroy);
+#ifdef CONFIG_SYSCTL
+static int devinet_sysctl_register(struct in_device *idev);
+static void devinet_sysctl_unregister(struct in_device *idev);
+#else	/* !CONFIG_SYSCTL: no-op stubs */
+static int devinet_sysctl_register(struct in_device *idev)
+{
+	return 0;
+}
+static void devinet_sysctl_unregister(struct in_device *idev)
+{
+}
+#endif	/* CONFIG_SYSCTL */
+
+/* Locks all the inet devices. */
+
+static struct in_ifaddr *inet_alloc_ifa(void)
+{
+	return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_ACCOUNT);	/* NULL on failure */
+}
+
+static void inet_rcu_free_ifa(struct rcu_head *head)
+{
+	struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
+	if (ifa->ifa_dev)
+		in_dev_put(ifa->ifa_dev);	/* drop the ifaddr's hold on its in_device */
+	kfree(ifa);
+}
+
+static void inet_free_ifa(struct in_ifaddr *ifa)
+{
+	call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);	/* actual free is deferred to the RCU callback */
+}
+
+static void in_dev_free_rcu(struct rcu_head *head)
+{
+	struct in_device *idev = container_of(head, struct in_device, rcu_head);
+
+	kfree(rcu_dereference_protected(idev->mc_hash, 1));	/* mc_hash is freed with the device */
+	kfree(idev);
+}
+
+void in_dev_finish_destroy(struct in_device *idev)
+{
+	struct net_device *dev = idev->dev;
+
+	WARN_ON(idev->ifa_list);	/* both lists are expected to be empty by now */
+	WARN_ON(idev->mc_list);
+#ifdef NET_REFCNT_DEBUG
+	pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
+#endif
+	netdev_put(dev, &idev->dev_tracker);
+	if (!idev->dead)	/* not marked dead: complain and do not free */
+		pr_err("Freeing alive in_device %p\n", idev);
+	else
+		call_rcu(&idev->rcu_head, in_dev_free_rcu);
+}
+EXPORT_SYMBOL(in_dev_finish_destroy);
+
+static struct in_device *inetdev_init(struct net_device *dev)
+{
+	struct in_device *in_dev;
+	int err = -ENOMEM;
+
+	ASSERT_RTNL();
+
+	in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL);
+	if (!in_dev)
+		goto out;
+	memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt,
+			sizeof(in_dev->cnf));	/* start from the netns defaults */
+	in_dev->cnf.sysctl = NULL;
+	in_dev->dev = dev;
+	in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
+	if (!in_dev->arp_parms)
+		goto out_kfree;
+	if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
+		dev_disable_lro(dev);
+	/* Reference in_dev->dev */
+	netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL);
+	/* Account for reference dev->ip_ptr (below) */
+	refcount_set(&in_dev->refcnt, 1);
+
+	err = devinet_sysctl_register(in_dev);
+	if (err) {
+		in_dev->dead = 1;	/* mark dead before dropping the only reference */
+		neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+		in_dev_put(in_dev);
+		in_dev = NULL;
+		goto out;
+	}
+	ip_mc_init_dev(in_dev);
+	if (dev->flags & IFF_UP)
+		ip_mc_up(in_dev);
+
+	/* we can receive as soon as ip_ptr is set -- do this last */
+	rcu_assign_pointer(dev->ip_ptr, in_dev);
+out:
+	return in_dev ?: ERR_PTR(err);	/* the new in_device, or ERR_PTR(err) */
+out_kfree:	/* arp_parms allocation failed: only in_dev itself to free */
+	kfree(in_dev);
+	in_dev = NULL;
+	goto out;
+}
+
+static void inetdev_destroy(struct in_device *in_dev)
+{
+	struct net_device *dev;
+	struct in_ifaddr *ifa;
+
+	ASSERT_RTNL();
+
+	dev = in_dev->dev;
+
+	in_dev->dead = 1;	/* __inet_del_ifa() skips promotion on a dead device */
+
+	ip_mc_destroy_dev(in_dev);
+
+	while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) {
+		inet_del_ifa(in_dev, &in_dev->ifa_list, 0);	/* unlink only ... */
+		inet_free_ifa(ifa);	/* ... and free it here */
+	}
+
+	RCU_INIT_POINTER(dev->ip_ptr, NULL);
+
+	devinet_sysctl_unregister(in_dev);
+	neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+	arp_ifdown(dev);
+
+	in_dev_put(in_dev);
+}
+
+int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
+{
+	const struct in_ifaddr *ifa;
+	int found = 0;
+
+	rcu_read_lock();
+	in_dev_for_each_ifa_rcu(ifa, in_dev) {
+		/* @a must match this ifaddr; @b as well unless it is zero */
+		if (inet_ifa_match(a, ifa) && (!b || inet_ifa_match(b, ifa))) {
+			found = 1;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return found;
+}
+
+static void __inet_del_ifa(struct in_device *in_dev,
+			   struct in_ifaddr __rcu **ifap,
+			   int destroy, struct nlmsghdr *nlh, u32 portid)
+{
+	struct in_ifaddr *promote = NULL;
+	struct in_ifaddr *ifa, *ifa1;
+	struct in_ifaddr __rcu **last_prim;
+	struct in_ifaddr *prev_prom = NULL;
+	int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
+
+	ASSERT_RTNL();
+
+	ifa1 = rtnl_dereference(*ifap);	/* the address being deleted */
+	last_prim = ifap;
+	if (in_dev->dead)
+		goto no_promotions;
+
+	/* 1. Deleting a primary ifaddr forces deletion of all its secondaries
+	 * unless alias promotion is set
+	 */
+
+	if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
+		struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next;
+
+		while ((ifa = rtnl_dereference(*ifap1)) != NULL) {
+			if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
+			    ifa1->ifa_scope <= ifa->ifa_scope)
+				last_prim = &ifa->ifa_next;
+
+			if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
+			    ifa1->ifa_mask != ifa->ifa_mask ||
+			    !inet_ifa_match(ifa1->ifa_address, ifa)) {
+				ifap1 = &ifa->ifa_next;	/* not a secondary of ifa1 */
+				prev_prom = ifa;
+				continue;
+			}
+
+			if (!do_promote) {
+				inet_hash_remove(ifa);
+				*ifap1 = ifa->ifa_next;
+
+				rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
+				blocking_notifier_call_chain(&inetaddr_chain,
+						NETDEV_DOWN, ifa);
+				inet_free_ifa(ifa);
+			} else {
+				promote = ifa;	/* first matching secondary */
+				break;
+			}
+		}
+	}
+
+	/* On promotion all secondaries from the subnet are changing
+	 * the primary IP, we must remove all their routes silently
+	 * and later add them back with the new prefsrc. Do this
+	 * while all addresses are on the device list.
+	 */
+	for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) {
+		if (ifa1->ifa_mask == ifa->ifa_mask &&
+		    inet_ifa_match(ifa1->ifa_address, ifa))
+			fib_del_ifaddr(ifa, ifa1);
+	}
+
+no_promotions:
+	/* 2. Unlink it */
+
+	*ifap = ifa1->ifa_next;
+	inet_hash_remove(ifa1);
+
+	/* 3. Announce address deletion */
+
+	/* Send message first, then call notifier.
+	   At first sight, the FIB update triggered by the notifier
+	   refers to an already deleted ifaddr, which could confuse
+	   netlink listeners. It does not: if gated saw the routes
+	   deleted while still believing the ifaddr to be valid, it
+	   would try to restore the deleted routes... Grr.
+	   So this order is correct.
+	 */
+	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
+	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+
+	if (promote) {
+		struct in_ifaddr *next_sec;
+
+		next_sec = rtnl_dereference(promote->ifa_next);
+		if (prev_prom) {	/* relink promote right after the last primary */
+			struct in_ifaddr *last_sec;
+
+			rcu_assign_pointer(prev_prom->ifa_next, next_sec);
+
+			last_sec = rtnl_dereference(*last_prim);
+			rcu_assign_pointer(promote->ifa_next, last_sec);
+			rcu_assign_pointer(*last_prim, promote);
+		}
+
+		promote->ifa_flags &= ~IFA_F_SECONDARY;	/* now a primary */
+		rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
+		blocking_notifier_call_chain(&inetaddr_chain,
+				NETDEV_UP, promote);
+		for (ifa = next_sec; ifa;
+		     ifa = rtnl_dereference(ifa->ifa_next)) {
+			if (ifa1->ifa_mask != ifa->ifa_mask ||
+			    !inet_ifa_match(ifa1->ifa_address, ifa))
+					continue;
+			fib_add_ifaddr(ifa);	/* re-add what fib_del_ifaddr() removed above */
+		}
+
+	}
+	if (destroy)	/* caller wants ifa1 freed as well as unlinked */
+		inet_free_ifa(ifa1);
+}
+
+static void inet_del_ifa(struct in_device *in_dev,
+			 struct in_ifaddr __rcu **ifap,
+			 int destroy)
+{
+	__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);	/* no netlink request: NULL nlh, portid 0 */
+}
+
+static void check_lifetime(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
+
+static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+			     u32 portid, struct netlink_ext_ack *extack)
+{
+	struct in_ifaddr __rcu **last_primary, **ifap;
+	struct in_device *in_dev = ifa->ifa_dev;
+	struct in_validator_info ivi;
+	struct in_ifaddr *ifa1;
+	int ret;
+
+	ASSERT_RTNL();
+
+	if (!ifa->ifa_local) {	/* zero address: dropped, yet reported as success */
+		inet_free_ifa(ifa);
+		return 0;
+	}
+
+	ifa->ifa_flags &= ~IFA_F_SECONDARY;
+	last_primary = &in_dev->ifa_list;
+
+	/* Don't set IPv6 only flags to IPv4 addresses */
+	ifa->ifa_flags &= ~IPV6ONLY_FLAGS;
+
+	ifap = &in_dev->ifa_list;
+	ifa1 = rtnl_dereference(*ifap);
+
+	while (ifa1) {
+		if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
+		    ifa->ifa_scope <= ifa1->ifa_scope)
+			last_primary = &ifa1->ifa_next;
+		if (ifa1->ifa_mask == ifa->ifa_mask &&
+		    inet_ifa_match(ifa1->ifa_address, ifa)) {
+			if (ifa1->ifa_local == ifa->ifa_local) {
+				inet_free_ifa(ifa);
+				return -EEXIST;
+			}
+			if (ifa1->ifa_scope != ifa->ifa_scope) {
+				NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value");
+				inet_free_ifa(ifa);
+				return -EINVAL;
+			}
+			ifa->ifa_flags |= IFA_F_SECONDARY;	/* same subnet already present */
+		}
+
+		ifap = &ifa1->ifa_next;
+		ifa1 = rtnl_dereference(*ifap);
+	}
+
+	/* Allow any devices that wish to register ifaddr validators to weigh
+	 * in now, before changes are committed.  The rtnl lock is serializing
+	 * access here, so the state should not change between a validator call
+	 * and a final notify on commit.  This isn't invoked on promotion under
+	 * the assumption that validators are checking the address itself, and
+	 * not the flags.
+	 */
+	ivi.ivi_addr = ifa->ifa_address;
+	ivi.ivi_dev = ifa->ifa_dev;
+	ivi.extack = extack;
+	ret = blocking_notifier_call_chain(&inetaddr_validator_chain,
+					   NETDEV_UP, &ivi);
+	ret = notifier_to_errno(ret);
+	if (ret) {
+		inet_free_ifa(ifa);
+		return ret;
+	}
+
+	if (!(ifa->ifa_flags & IFA_F_SECONDARY))
+		ifap = last_primary;	/* primaries go at last_primary, secondaries at the tail */
+
+	rcu_assign_pointer(ifa->ifa_next, *ifap);
+	rcu_assign_pointer(*ifap, ifa);
+
+	inet_hash_insert(dev_net(in_dev->dev), ifa);
+
+	cancel_delayed_work(&check_lifetime_work);
+	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
+
+	/* Send message first, then call notifier.
+	   Notifier will trigger FIB update, so that
+	   listeners of netlink will know about new ifaddr */
+	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);
+	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+
+	return 0;
+}
+
+static int inet_insert_ifa(struct in_ifaddr *ifa)
+{
+	return __inet_insert_ifa(ifa, NULL, 0, NULL);	/* no netlink request or extack */
+}
+
+static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
+{
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+
+	ASSERT_RTNL();
+
+	if (!in_dev) {	/* no in_device on @dev: free @ifa and fail */
+		inet_free_ifa(ifa);
+		return -ENOBUFS;
+	}
+	ipv4_devconf_setall(in_dev);
+	neigh_parms_data_state_setall(in_dev->arp_parms);
+	if (ifa->ifa_dev != in_dev) {
+		WARN_ON(ifa->ifa_dev);	/* expected to be unattached here */
+		in_dev_hold(in_dev);
+		ifa->ifa_dev = in_dev;
+	}
+	if (ipv4_is_loopback(ifa->ifa_local))	/* loopback addresses get host scope */
+		ifa->ifa_scope = RT_SCOPE_HOST;
+	return inet_insert_ifa(ifa);
+}
+
+/* Caller must hold RCU or RTNL:
+ * we don't take a reference on the in_device that is returned.
+ */
+struct in_device *inetdev_by_index(struct net *net, int ifindex)
+{
+	struct net_device *dev;
+	struct in_device *in_dev = NULL;
+
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifindex);
+	if (dev)
+		in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+	rcu_read_unlock();
+	return in_dev;	/* NULL if no such device or it has no in_device */
+}
+EXPORT_SYMBOL(inetdev_by_index);
+
+/* First ifaddr on @in_dev matching @prefix/@mask, or NULL. Caller holds RTNL. */
+
+struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
+				    __be32 mask)
+{
+	struct in_ifaddr *ifa;
+
+	ASSERT_RTNL();
+
+	in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+		if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
+			return ifa;
+	}
+	return NULL;
+}
+
+static int ip_mc_autojoin_config(struct net *net, bool join,
+				 const struct in_ifaddr *ifa)
+{
+#if defined(CONFIG_IP_MULTICAST)
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = ifa->ifa_address,
+		.imr_ifindex = ifa->ifa_dev->dev->ifindex,
+	};
+	struct sock *sk = net->ipv4.mc_autojoin_sk;	/* per-netns socket used for autojoin */
+	int ret;
+
+	ASSERT_RTNL();
+
+	lock_sock(sk);
+	if (join)
+		ret = ip_mc_join_group(sk, &mreq);
+	else
+		ret = ip_mc_leave_group(sk, &mreq);
+	release_sock(sk);
+
+	return ret;
+#else	/* !CONFIG_IP_MULTICAST */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
+			    struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct in_ifaddr __rcu **ifap;
+	struct nlattr *tb[IFA_MAX+1];
+	struct in_device *in_dev;
+	struct ifaddrmsg *ifm;
+	struct in_ifaddr *ifa;
+	int err;
+
+	ASSERT_RTNL();
+
+	err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+				     ifa_ipv4_policy, extack);
+	if (err < 0)
+		goto errout;
+
+	ifm = nlmsg_data(nlh);
+	in_dev = inetdev_by_index(net, ifm->ifa_index);
+	if (!in_dev) {
+		NL_SET_ERR_MSG(extack, "ipv4: Device not found");
+		err = -ENODEV;
+		goto errout;
+	}
+
+	for (ifap = &in_dev->ifa_list; (ifa = rtnl_dereference(*ifap)) != NULL;
+	     ifap = &ifa->ifa_next) {
+		if (tb[IFA_LOCAL] &&	/* each attribute supplied narrows the match */
+		    ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL]))
+			continue;
+
+		if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label))
+			continue;
+
+		if (tb[IFA_ADDRESS] &&
+		    (ifm->ifa_prefixlen != ifa->ifa_prefixlen ||
+		    !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa)))
+			continue;
+
+		if (ipv4_is_multicast(ifa->ifa_address))	/* leave result is ignored */
+			ip_mc_autojoin_config(net, false, ifa);
+		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
+		return 0;	/* only the first match is deleted */
+	}
+
+	NL_SET_ERR_MSG(extack, "ipv4: Address not found");
+	err = -EADDRNOTAVAIL;
+errout:
+	return err;
+}
+
+#define INFINITY_LIFE_TIME	0xFFFFFFFF
+
+static void check_lifetime(struct work_struct *work)
+{
+	unsigned long now, next, next_sec, next_sched;
+	struct in_ifaddr *ifa;
+	struct hlist_node *n;
+	int i;
+
+	now = jiffies;
+	next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
+
+	for (i = 0; i < IN4_ADDR_HSIZE; i++) {
+		bool change_needed = false;
+
+		rcu_read_lock();	/* pass 1: read-only scan of the bucket */
+		hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) {
+			unsigned long age;
+
+			if (ifa->ifa_flags & IFA_F_PERMANENT)
+				continue;
+
+			/* We try to batch several events at once. */
+			age = (now - ifa->ifa_tstamp +
+			       ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+			if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+			    age >= ifa->ifa_valid_lft) {
+				change_needed = true;	/* valid lifetime is over */
+			} else if (ifa->ifa_preferred_lft ==
+				   INFINITY_LIFE_TIME) {
+				continue;
+			} else if (age >= ifa->ifa_preferred_lft) {
+				if (time_before(ifa->ifa_tstamp +
+						ifa->ifa_valid_lft * HZ, next))
+					next = ifa->ifa_tstamp +
+					       ifa->ifa_valid_lft * HZ;
+
+				if (!(ifa->ifa_flags & IFA_F_DEPRECATED))
+					change_needed = true;	/* needs the DEPRECATED flag */
+			} else if (time_before(ifa->ifa_tstamp +
+					       ifa->ifa_preferred_lft * HZ,
+					       next)) {
+				next = ifa->ifa_tstamp +
+				       ifa->ifa_preferred_lft * HZ;
+			}
+		}
+		rcu_read_unlock();
+		if (!change_needed)
+			continue;
+		rtnl_lock();	/* pass 2: re-check and modify under RTNL */
+		hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) {
+			unsigned long age;
+
+			if (ifa->ifa_flags & IFA_F_PERMANENT)
+				continue;
+
+			/* We try to batch several events at once. */
+			age = (now - ifa->ifa_tstamp +
+			       ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+			if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+			    age >= ifa->ifa_valid_lft) {
+				struct in_ifaddr __rcu **ifap;
+				struct in_ifaddr *tmp;
+
+				ifap = &ifa->ifa_dev->ifa_list;	/* find the link that points at ifa */
+				tmp = rtnl_dereference(*ifap);
+				while (tmp) {
+					if (tmp == ifa) {
+						inet_del_ifa(ifa->ifa_dev,
+							     ifap, 1);
+						break;
+					}
+					ifap = &tmp->ifa_next;
+					tmp = rtnl_dereference(*ifap);
+				}
+			} else if (ifa->ifa_preferred_lft !=
+				   INFINITY_LIFE_TIME &&
+				   age >= ifa->ifa_preferred_lft &&
+				   !(ifa->ifa_flags & IFA_F_DEPRECATED)) {
+				ifa->ifa_flags |= IFA_F_DEPRECATED;
+				rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+			}
+		}
+		rtnl_unlock();
+	}
+
+	next_sec = round_jiffies_up(next);
+	next_sched = next;
+
+	/* If rounded timeout is accurate enough, accept it. */
+	if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
+		next_sched = next_sec;
+
+	now = jiffies;
+	/* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
+	if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))
+		next_sched = now + ADDRCONF_TIMER_FUZZ_MAX;
+
+	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work,
+			next_sched - now);	/* the work re-arms itself */
+}
+
+static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
+			     __u32 prefered_lft)
+{
+	unsigned long timeout;
+
+	ifa->ifa_flags &= ~(IFA_F_PERMANENT | IFA_F_DEPRECATED);	/* recomputed below */
+
+	timeout = addrconf_timeout_fixup(valid_lft, HZ);
+	if (addrconf_finite_timeout(timeout))
+		ifa->ifa_valid_lft = timeout;
+	else
+		ifa->ifa_flags |= IFA_F_PERMANENT;	/* non-finite valid lifetime */
+
+	timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+	if (addrconf_finite_timeout(timeout)) {
+		if (timeout == 0)	/* zero preferred lifetime: deprecated at once */
+			ifa->ifa_flags |= IFA_F_DEPRECATED;
+		ifa->ifa_preferred_lft = timeout;
+	}
+	ifa->ifa_tstamp = jiffies;
+	if (!ifa->ifa_cstamp)	/* first call also sets the creation stamp */
+		ifa->ifa_cstamp = ifa->ifa_tstamp;
+}
+
+static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
+				       __u32 *pvalid_lft, __u32 *pprefered_lft,
+				       struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[IFA_MAX+1];
+	struct in_ifaddr *ifa;
+	struct ifaddrmsg *ifm;
+	struct net_device *dev;
+	struct in_device *in_dev;
+	int err;
+
+	err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+				     ifa_ipv4_policy, extack);
+	if (err < 0)
+		goto errout;
+
+	ifm = nlmsg_data(nlh);
+	err = -EINVAL;
+
+	if (ifm->ifa_prefixlen > 32) {
+		NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length");
+		goto errout;
+	}
+
+	if (!tb[IFA_LOCAL]) {
+		NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied");
+		goto errout;
+	}
+
+	dev = __dev_get_by_index(net, ifm->ifa_index);
+	err = -ENODEV;
+	if (!dev) {
+		NL_SET_ERR_MSG(extack, "ipv4: Device not found");
+		goto errout;
+	}
+
+	in_dev = __in_dev_get_rtnl(dev);
+	err = -ENOBUFS;	/* also the error for the allocation failure below */
+	if (!in_dev)
+		goto errout;
+
+	ifa = inet_alloc_ifa();
+	if (!ifa)
+		/*
+		 * A potential indev allocation can be left alive, it stays
+		 * assigned to its device and is destroyed with it.
+		 */
+		goto errout;
+
+	ipv4_devconf_setall(in_dev);
+	neigh_parms_data_state_setall(in_dev->arp_parms);
+	in_dev_hold(in_dev);	/* reference kept in ifa->ifa_dev, set below */
+
+	if (!tb[IFA_ADDRESS])	/* IFA_ADDRESS defaults to IFA_LOCAL */
+		tb[IFA_ADDRESS] = tb[IFA_LOCAL];
+
+	INIT_HLIST_NODE(&ifa->hash);
+	ifa->ifa_prefixlen = ifm->ifa_prefixlen;
+	ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
+	ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) :
+					 ifm->ifa_flags;
+	ifa->ifa_scope = ifm->ifa_scope;
+	ifa->ifa_dev = in_dev;
+
+	ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]);
+	ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]);
+
+	if (tb[IFA_BROADCAST])
+		ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]);
+
+	if (tb[IFA_LABEL])
+		nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
+	else
+		memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);	/* label defaults to the device name */
+
+	if (tb[IFA_RT_PRIORITY])
+		ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
+	if (tb[IFA_PROTO])
+		ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]);
+
+	if (tb[IFA_CACHEINFO]) {
+		struct ifa_cacheinfo *ci;
+
+		ci = nla_data(tb[IFA_CACHEINFO]);
+		if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) {
+			NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid");
+			err = -EINVAL;
+			goto errout_free;
+		}
+		*pvalid_lft = ci->ifa_valid;	/* outputs written only when IFA_CACHEINFO is given */
+		*pprefered_lft = ci->ifa_prefered;
+	}
+
+	return ifa;
+
+errout_free:
+	inet_free_ifa(ifa);
+errout:
+	return ERR_PTR(err);
+}
+
+static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
+{
+	struct in_device *in_dev = ifa->ifa_dev;
+	struct in_ifaddr *ifa1;
+
+	if (!ifa->ifa_local)	/* a zero local address never matches */
+		return NULL;
+
+	in_dev_for_each_ifa_rtnl(ifa1, in_dev) {
+		if (ifa1->ifa_mask == ifa->ifa_mask &&
+		    inet_ifa_match(ifa1->ifa_address, ifa) &&
+		    ifa1->ifa_local == ifa->ifa_local)
+			return ifa1;	/* same mask, same subnet, same local address */
+	}
+	return NULL;
+}
+
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
+			    struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct in_ifaddr *ifa;
+	struct in_ifaddr *ifa_existing;
+	__u32 valid_lft = INFINITY_LIFE_TIME;
+	__u32 prefered_lft = INFINITY_LIFE_TIME;
+
+	ASSERT_RTNL();
+
+	ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft, extack);
+	if (IS_ERR(ifa))
+		return PTR_ERR(ifa);
+
+	ifa_existing = find_matching_ifa(ifa);
+	if (!ifa_existing) {
+		/* It would be best to check for !NLM_F_CREATE here but
+		 * userspace already relies on not having to provide this.
+		 */
+		set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+		if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) {
+			int ret = ip_mc_autojoin_config(net, true, ifa);
+
+			if (ret < 0) {
+				NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed");
+				inet_free_ifa(ifa);
+				return ret;
+			}
+		}
+		return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid,
+					 extack);
+	} else {
+		u32 new_metric = ifa->ifa_rt_priority;
+		u8 new_proto = ifa->ifa_proto;
+
+		inet_free_ifa(ifa);	/* only metric and proto are taken from the parsed copy */
+
+		if (nlh->nlmsg_flags & NLM_F_EXCL ||
+		    !(nlh->nlmsg_flags & NLM_F_REPLACE)) {
+			NL_SET_ERR_MSG(extack, "ipv4: Address already assigned");
+			return -EEXIST;
+		}
+		ifa = ifa_existing;	/* update the installed address in place */
+
+		if (ifa->ifa_rt_priority != new_metric) {
+			fib_modify_prefix_metric(ifa, new_metric);
+			ifa->ifa_rt_priority = new_metric;
+		}
+
+		ifa->ifa_proto = new_proto;
+
+		set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+		cancel_delayed_work(&check_lifetime_work);
+		queue_delayed_work(system_power_efficient_wq,
+				&check_lifetime_work, 0);
+		rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
+	}
+	return 0;
+}
+
+/*
+ *	Determine a default network mask, based on the IP address.
+ */
+
+static int inet_abc_len(__be32 addr)
+{
+	__u32 host_order;
+
+	/* Zeronet and limited-broadcast addresses get a zero-length mask. */
+	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
+		return 0;
+
+	host_order = ntohl(addr);
+	if (IN_CLASSA(host_order))
+		return 8;
+	if (IN_CLASSB(host_order))
+		return 16;
+	if (IN_CLASSC(host_order))
+		return 24;
+	if (IN_CLASSE(host_order))
+		return 32;
+
+	return -1;	/* Something else, probably a multicast. */
+}
+
+/* Serve the IPv4 address/netmask/flags ioctls listed in the switch below. */
+int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
+{
+	struct sockaddr_in sin_orig;
+	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr;
+	struct in_ifaddr __rcu **ifap = NULL;
+	struct in_device *in_dev;
+	struct in_ifaddr *ifa = NULL;
+	struct net_device *dev;
+	char *colon;
+	int ret = -EFAULT;
+	int tryaddrmatch = 0;
+
+	ifr->ifr_name[IFNAMSIZ - 1] = 0;	/* force NUL termination */
+
+	/* keep the caller's address: the Get ioctls clear *sin below */
+	memcpy(&sin_orig, sin, sizeof(*sin));
+
+	colon = strchr(ifr->ifr_name, ':');
+	if (colon)
+		*colon = 0;	/* device lookup uses the name before ':' */
+
+	dev_load(net, ifr->ifr_name);
+
+	switch (cmd) {
+	case SIOCGIFADDR:	/* Get interface address */
+	case SIOCGIFBRDADDR:	/* Get the broadcast address */
+	case SIOCGIFDSTADDR:	/* Get the destination address */
+	case SIOCGIFNETMASK:	/* Get the netmask for the interface */
+		/* Historical note: these ioctls do not sleep, so originally
+		   no lock was imposed here ("one day we will be forced to
+		   put shlock here (I mean SMP)"); rtnl_lock() is taken below.
+		 */
+		tryaddrmatch = (sin_orig.sin_family == AF_INET);
+		memset(sin, 0, sizeof(*sin));
+		sin->sin_family = AF_INET;
+		break;
+
+	case SIOCSIFFLAGS:
+		ret = -EPERM;
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			goto out;
+		break;
+	case SIOCSIFADDR:	/* Set interface address (and family) */
+	case SIOCSIFBRDADDR:	/* Set the broadcast address */
+	case SIOCSIFDSTADDR:	/* Set the destination address */
+	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */
+		ret = -EPERM;
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			goto out;
+		ret = -EINVAL;
+		if (sin->sin_family != AF_INET)
+			goto out;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rtnl_lock();
+
+	ret = -ENODEV;
+	dev = __dev_get_by_name(net, ifr->ifr_name);
+	if (!dev)
+		goto done;
+
+	if (colon)
+		*colon = ':';	/* restore the full label for the matching below */
+
+	in_dev = __in_dev_get_rtnl(dev);
+	if (in_dev) {
+		if (tryaddrmatch) {
+			/* Matthias Andree */
+			/* compare label and address (4.4BSD style) */
+			/* note: we only do this for a limited set of ioctls
+			   and only if the original address family was AF_INET.
+			   This is checked above. */
+
+			for (ifap = &in_dev->ifa_list;
+			     (ifa = rtnl_dereference(*ifap)) != NULL;
+			     ifap = &ifa->ifa_next) {
+				if (!strcmp(ifr->ifr_name, ifa->ifa_label) &&
+				    sin_orig.sin_addr.s_addr ==
+							ifa->ifa_local) {
+					break; /* found */
+				}
+			}
+		}
+		/* we didn't get a match, maybe the application is
+		   4.3BSD-style and passed in junk, so we fall back to
+		   comparing just the label */
+		if (!ifa) {
+			for (ifap = &in_dev->ifa_list;
+			     (ifa = rtnl_dereference(*ifap)) != NULL;
+			     ifap = &ifa->ifa_next)
+				if (!strcmp(ifr->ifr_name, ifa->ifa_label))
+					break;
+		}
+	}
+
+	ret = -EADDRNOTAVAIL;
+	if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
+		goto done;	/* every other command needs a matched ifa */
+
+	switch (cmd) {
+	case SIOCGIFADDR:	/* Get interface address */
+		ret = 0;
+		sin->sin_addr.s_addr = ifa->ifa_local;
+		break;
+
+	case SIOCGIFBRDADDR:	/* Get the broadcast address */
+		ret = 0;
+		sin->sin_addr.s_addr = ifa->ifa_broadcast;
+		break;
+
+	case SIOCGIFDSTADDR:	/* Get the destination address */
+		ret = 0;
+		sin->sin_addr.s_addr = ifa->ifa_address;
+		break;
+
+	case SIOCGIFNETMASK:	/* Get the netmask for the interface */
+		ret = 0;
+		sin->sin_addr.s_addr = ifa->ifa_mask;
+		break;
+
+	case SIOCSIFFLAGS:
+		if (colon) {	/* "name:suffix" label: act on that address */
+			ret = -EADDRNOTAVAIL;
+			if (!ifa)
+				break;
+			ret = 0;
+			if (!(ifr->ifr_flags & IFF_UP))
+				inet_del_ifa(in_dev, ifap, 1);
+			break;
+		}
+		ret = dev_change_flags(dev, ifr->ifr_flags, NULL);
+		break;
+
+	case SIOCSIFADDR:	/* Set interface address (and family) */
+		ret = -EINVAL;
+		if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+			break;	/* no classful default for this address */
+
+		if (!ifa) {
+			ret = -ENOBUFS;
+			ifa = inet_alloc_ifa();
+			if (!ifa)
+				break;
+			INIT_HLIST_NODE(&ifa->hash);
+			if (colon)
+				memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ);
+			else
+				memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+		} else {
+			ret = 0;
+			if (ifa->ifa_local == sin->sin_addr.s_addr)
+				break;	/* address unchanged: nothing to do */
+			inet_del_ifa(in_dev, ifap, 0);	/* ifa is reused below */
+			ifa->ifa_broadcast = 0;
+			ifa->ifa_scope = 0;
+		}
+
+		ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;
+
+		if (!(dev->flags & IFF_POINTOPOINT)) {	/* classful defaults */
+			ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
+			ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
+			if ((dev->flags & IFF_BROADCAST) &&
+			    ifa->ifa_prefixlen < 31)
+				ifa->ifa_broadcast = ifa->ifa_address |
+						     ~ifa->ifa_mask;
+		} else {
+			ifa->ifa_prefixlen = 32;
+			ifa->ifa_mask = inet_make_mask(32);
+		}
+		set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
+		ret = inet_set_ifa(dev, ifa);
+		break;
+
+	case SIOCSIFBRDADDR:	/* Set the broadcast address */
+		ret = 0;
+		if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
+			inet_del_ifa(in_dev, ifap, 0);
+			ifa->ifa_broadcast = sin->sin_addr.s_addr;
+			inet_insert_ifa(ifa);
+		}
+		break;
+
+	case SIOCSIFDSTADDR:	/* Set the destination address */
+		ret = 0;
+		if (ifa->ifa_address == sin->sin_addr.s_addr)
+			break;
+		ret = -EINVAL;
+		if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+			break;
+		ret = 0;
+		inet_del_ifa(in_dev, ifap, 0);
+		ifa->ifa_address = sin->sin_addr.s_addr;
+		inet_insert_ifa(ifa);
+		break;
+
+	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */
+
+		/*
+		 *	The mask we set must be legal.
+		 */
+		ret = -EINVAL;
+		if (bad_mask(sin->sin_addr.s_addr, 0))
+			break;
+		ret = 0;
+		if (ifa->ifa_mask != sin->sin_addr.s_addr) {
+			__be32 old_mask = ifa->ifa_mask;
+			inet_del_ifa(in_dev, ifap, 0);
+			ifa->ifa_mask = sin->sin_addr.s_addr;
+			ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
+
+			/* If the current broadcast address matches the
+			 * old netmask, recalculate it for the new
+			 * netmask. Otherwise it's a funny address, so
+			 * don't touch it since the user seems to know
+			 * what (s)he's doing...
+			 */
+			if ((dev->flags & IFF_BROADCAST) &&
+			    (ifa->ifa_prefixlen < 31) &&
+			    (ifa->ifa_broadcast ==
+			     (ifa->ifa_local|~old_mask))) {
+				ifa->ifa_broadcast = (ifa->ifa_local |
+						      ~sin->sin_addr.s_addr);
+			}
+			inet_insert_ifa(ifa);
+		}
+		break;
+	}
+done:
+	rtnl_unlock();
+out:
+	return ret;
+}
+
+int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
+{
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	const struct in_ifaddr *ifa;
+	struct sockaddr_in *sin;
+	struct ifreq ifr;
+	int done = 0;
+
+	/* An oversized record or a device without in_device yields 0. */
+	if (WARN_ON(size > sizeof(struct ifreq)) || !in_dev)
+		return 0;
+
+	in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+		/* No buffer: only add up the space that would be needed. */
+		if (!buf) {
+			done += size;
+			continue;
+		}
+		if (len < size)
+			break;
+
+		/* Build one ifreq: label plus AF_INET local address. */
+		memset(&ifr, 0, sizeof(struct ifreq));
+		strcpy(ifr.ifr_name, ifa->ifa_label);
+		sin = (struct sockaddr_in *)&ifr.ifr_addr;
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = ifa->ifa_local;
+
+		if (copy_to_user(buf + done, &ifr, size))
+			return -EFAULT;
+
+		len -= size;
+		done += size;
+	}
+
+	return done;
+}
+
+static __be32 in_dev_select_addr(const struct in_device *in_dev,
+				 int scope)
+{
+	const struct in_ifaddr *ifa;
+
+	/* First primary, non-link-scope address within @scope wins. */
+	in_dev_for_each_ifa_rcu(ifa, in_dev) {
+		if ((ifa->ifa_flags & IFA_F_SECONDARY) ||
+		    ifa->ifa_scope == RT_SCOPE_LINK ||
+		    ifa->ifa_scope > scope)
+			continue;
+		return ifa->ifa_local;
+	}
+	return 0;
+}
+/* Pick a primary local address within @scope, trying @dev itself first. */
+__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
+{
+	const struct in_ifaddr *ifa;
+	__be32 addr = 0;
+	unsigned char localnet_scope = RT_SCOPE_HOST;
+	struct in_device *in_dev;
+	struct net *net = dev_net(dev);
+	int master_idx;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev)
+		goto no_in_dev;
+
+	if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
+		localnet_scope = RT_SCOPE_LINK;
+
+	in_dev_for_each_ifa_rcu(ifa, in_dev) {
+		if (ifa->ifa_flags & IFA_F_SECONDARY)
+			continue;
+		if (min(ifa->ifa_scope, localnet_scope) > scope)
+			continue;
+		if (!dst || inet_ifa_match(dst, ifa)) {	/* no dst, or it matches */
+			addr = ifa->ifa_local;
+			break;
+		}
+		if (!addr)	/* otherwise remember the first candidate */
+			addr = ifa->ifa_local;
+	}
+
+	if (addr)
+		goto out_unlock;
+no_in_dev:
+	master_idx = l3mdev_master_ifindex_rcu(dev);
+
+	/* For VRFs, the VRF device takes the place of the loopback device,
+	 * with addresses on it being preferred.  Note in such cases the
+	 * loopback device will be among the devices that fail the master_idx
+	 * equality check in the loop below.
+	 */
+	if (master_idx &&
+	    (dev = dev_get_by_index_rcu(net, master_idx)) &&
+	    (in_dev = __in_dev_get_rcu(dev))) {
+		addr = in_dev_select_addr(in_dev, scope);
+		if (addr)
+			goto out_unlock;
+	}
+
+	/* Non-loopback addresses configured on loopback should be preferred
+	   in this case. It is important that lo is the first interface
+	   in the dev_base list.
+	 */
+	for_each_netdev_rcu(net, dev) {
+		if (l3mdev_master_ifindex_rcu(dev) != master_idx)
+			continue;
+
+		in_dev = __in_dev_get_rcu(dev);
+		if (!in_dev)
+			continue;
+
+		addr = in_dev_select_addr(in_dev, scope);
+		if (addr)
+			goto out_unlock;
+	}
+out_unlock:
+	rcu_read_unlock();
+	return addr;	/* 0 when no suitable address was found */
+}
+EXPORT_SYMBOL(inet_select_addr);
+/* Per-device worker for inet_confirm_addr(); returns 0 if nothing matches. */
+static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
+			      __be32 local, int scope)
+{
+	unsigned char localnet_scope = RT_SCOPE_HOST;
+	const struct in_ifaddr *ifa;
+	__be32 addr = 0;
+	int same = 0;	/* set once an ifa matches both @local and @dst */
+
+	if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
+		localnet_scope = RT_SCOPE_LINK;
+
+	in_dev_for_each_ifa_rcu(ifa, in_dev) {
+		unsigned char min_scope = min(ifa->ifa_scope, localnet_scope);
+
+		if (!addr &&
+		    (local == ifa->ifa_local || !local) &&
+		    min_scope <= scope) {
+			addr = ifa->ifa_local;	/* candidate address */
+			if (same)
+				break;
+		}
+		if (!same) {
+			same = (!local || inet_ifa_match(local, ifa)) &&
+				(!dst || inet_ifa_match(dst, ifa));
+			if (same && addr) {
+				if (local || !dst)
+					break;
+				/* Is the selected addr in the dst subnet? */
+				if (inet_ifa_match(addr, ifa))
+					break;
+				/* No, then can we use new local src? */
+				if (min_scope <= scope) {
+					addr = ifa->ifa_local;
+					break;
+				}
+				/* search for large dst subnet for addr */
+				same = 0;
+			}
+		}
+	}
+
+	return same ? addr : 0;	/* a candidate alone is not enough */
+}
+
+/*
+ * Check, with wildcard support, that a suitable local IP address exists:
+ * - net: network namespace to search; must not be NULL
+ * - in_dev: restrict the search to this interface, NULL = any interface
+ * - dst: require the same subnet as dst, 0 = any destination
+ * - local: address to confirm, 0 = let the function pick one
+ * - scope: largest scope value acceptable for the local address
+ */
+__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,
+			 __be32 dst, __be32 local, int scope)
+{
+	struct net_device *dev;
+	__be32 addr = 0;
+
+	if (in_dev)
+		return confirm_addr_indev(in_dev, dst, local, scope);
+
+	/* No interface given: the first device that confirms wins. */
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		in_dev = __in_dev_get_rcu(dev);
+		if (!in_dev)
+			continue;
+		addr = confirm_addr_indev(in_dev, dst, local, scope);
+		if (addr)
+			break;
+	}
+	rcu_read_unlock();
+	return addr;
+}
+EXPORT_SYMBOL(inet_confirm_addr);
+
+/* Address notifiers: thin wrappers that add or remove @nb on the blocking
+ * chains inetaddr_chain and inetaddr_validator_chain; each returns the
+ * result of the blocking_notifier_chain_{register,unregister}() call. */
+
+int register_inetaddr_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&inetaddr_chain, nb);
+}
+EXPORT_SYMBOL(register_inetaddr_notifier);
+
+int unregister_inetaddr_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
+}
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
+
+int register_inetaddr_validator_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&inetaddr_validator_chain, nb);
+}
+EXPORT_SYMBOL(register_inetaddr_validator_notifier);
+
+int unregister_inetaddr_validator_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&inetaddr_validator_chain,
+	    nb);
+}
+EXPORT_SYMBOL(unregister_inetaddr_validator_notifier);
+
+/* Rename ifa_labels after a device name change. Try to preserve existing
+ * ":suffix" alias numbering; every label is announced via RTM_NEWADDR.
+*/
+static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
+{
+	struct in_ifaddr *ifa;
+	int named = 0;	/* number of addresses relabelled so far */
+
+	in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+		char old[IFNAMSIZ], *dot;
+
+		memcpy(old, ifa->ifa_label, IFNAMSIZ);
+		memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+		if (named++ == 0)
+			goto skip;	/* first address gets the bare name */
+		dot = strchr(old, ':');
+		if (!dot) {
+			sprintf(old, ":%d", named);	/* invent a suffix */
+			dot = old;
+		}
+		if (strlen(dot) + strlen(dev->name) < IFNAMSIZ)
+			strcat(ifa->ifa_label, dot);
+		else	/* no room: write the suffix over the end of the label */
+			strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
+skip:
+		rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+	}
+}
+
+static void inetdev_send_gratuitous_arp(struct net_device *dev,
+					struct in_device *in_dev)
+{
+	const struct in_ifaddr *ifa;
+
+	/* One ARP request per address; it fills both IP arguments. */
+	in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+		__be32 ip = ifa->ifa_local;
+
+		arp_send(ARPOP_REQUEST, ETH_P_ARP, ip, dev, ip, NULL,
+			 dev->dev_addr, NULL);
+	}
+}
+
+/* Netdevice notifier callback for IPv4; called only under RTNL semaphore */
+
+static int inetdev_event(struct notifier_block *this, unsigned long event,
+			 void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+
+	ASSERT_RTNL();
+
+	if (!in_dev) {	/* no in_device yet: only two events create one */
+		if (event == NETDEV_REGISTER) {
+			in_dev = inetdev_init(dev);
+			if (IS_ERR(in_dev))
+				return notifier_from_errno(PTR_ERR(in_dev));
+			if (dev->flags & IFF_LOOPBACK) {
+				IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
+				IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
+			}
+		} else if (event == NETDEV_CHANGEMTU) {
+			/* Re-enabling IP */
+			if (inetdev_valid_mtu(dev->mtu))
+				in_dev = inetdev_init(dev);
+		}
+		goto out;
+	}
+
+	switch (event) {
+	case NETDEV_REGISTER:	/* in_dev already set at register time */
+		pr_debug("%s: bug\n", __func__);
+		RCU_INIT_POINTER(dev->ip_ptr, NULL);
+		break;
+	case NETDEV_UP:
+		if (!inetdev_valid_mtu(dev->mtu))
+			break;
+		if (dev->flags & IFF_LOOPBACK) {	/* add INADDR_LOOPBACK/8 */
+			struct in_ifaddr *ifa = inet_alloc_ifa();
+
+			if (ifa) {
+				INIT_HLIST_NODE(&ifa->hash);
+				ifa->ifa_local =
+				  ifa->ifa_address = htonl(INADDR_LOOPBACK);
+				ifa->ifa_prefixlen = 8;
+				ifa->ifa_mask = inet_make_mask(8);
+				in_dev_hold(in_dev);
+				ifa->ifa_dev = in_dev;
+				ifa->ifa_scope = RT_SCOPE_HOST;
+				memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+				set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,
+						 INFINITY_LIFE_TIME);
+				ipv4_devconf_setall(in_dev);
+				neigh_parms_data_state_setall(in_dev->arp_parms);
+				inet_insert_ifa(ifa);
+			}
+		}
+		ip_mc_up(in_dev);
+		fallthrough;
+	case NETDEV_CHANGEADDR:
+		if (!IN_DEV_ARP_NOTIFY(in_dev))
+			break;
+		fallthrough;
+	case NETDEV_NOTIFY_PEERS:
+		/* Send gratuitous ARP to notify of link change */
+		inetdev_send_gratuitous_arp(dev, in_dev);
+		break;
+	case NETDEV_DOWN:
+		ip_mc_down(in_dev);
+		break;
+	case NETDEV_PRE_TYPE_CHANGE:
+		ip_mc_unmap(in_dev);
+		break;
+	case NETDEV_POST_TYPE_CHANGE:
+		ip_mc_remap(in_dev);
+		break;
+	case NETDEV_CHANGEMTU:
+		if (inetdev_valid_mtu(dev->mtu))
+			break;
+		/* disable IP when MTU is not enough */
+		fallthrough;
+	case NETDEV_UNREGISTER:
+		inetdev_destroy(in_dev);
+		break;
+	case NETDEV_CHANGENAME:
+		/* Historically label changes were not announced; note
+		 * that inetdev_changename() sends RTM_NEWADDR per address.
+		 */
+		inetdev_changename(dev, in_dev);
+
+		devinet_sysctl_unregister(in_dev);
+		devinet_sysctl_register(in_dev);
+		break;
+	}
+out:
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ip_netdev_notifier = {
+	.notifier_call = inetdev_event,	/* IPv4 netdevice event handler */
+};
+
+static size_t inet_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +	/* message header */
+	       nla_total_size(4) +			/* IFA_ADDRESS */
+	       nla_total_size(4) +			/* IFA_LOCAL */
+	       nla_total_size(4) +			/* IFA_BROADCAST */
+	       nla_total_size(IFNAMSIZ) +		/* IFA_LABEL */
+	       nla_total_size(4) +			/* IFA_FLAGS */
+	       nla_total_size(1) +			/* IFA_PROTO */
+	       nla_total_size(4) +			/* IFA_RT_PRIORITY */
+	       nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
+}
+
+static inline u32 cstamp_delta(unsigned long cstamp)
+{
+	return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;	/* jiffies -> 1/100 s */
+}
+
+static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
+			 unsigned long tstamp, u32 preferred, u32 valid)
+{
+	struct ifa_cacheinfo ci = {	/* stamps go out via cstamp_delta() */
+		.cstamp		= cstamp_delta(cstamp),
+		.tstamp		= cstamp_delta(tstamp),
+		.ifa_prefered	= preferred,
+		.ifa_valid	= valid,
+	};
+
+	return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
+}
+/* Append one address message describing @ifa to @skb; 0 or -EMSGSIZE. */
+static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
+			    struct inet_fill_args *args)
+{
+	struct ifaddrmsg *ifm;
+	struct nlmsghdr  *nlh;
+	u32 preferred, valid;
+
+	nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm),
+			args->flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	ifm = nlmsg_data(nlh);
+	ifm->ifa_family = AF_INET;
+	ifm->ifa_prefixlen = ifa->ifa_prefixlen;
+	ifm->ifa_flags = ifa->ifa_flags;
+	ifm->ifa_scope = ifa->ifa_scope;
+	ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+
+	if (args->netnsid >= 0 &&
+	    nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+		goto nla_put_failure;
+
+	if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {	/* remaining lifetimes */
+		preferred = ifa->ifa_preferred_lft;
+		valid = ifa->ifa_valid_lft;
+		if (preferred != INFINITY_LIFE_TIME) {
+			long tval = (jiffies - ifa->ifa_tstamp) / HZ;
+
+			if (preferred > tval)
+				preferred -= tval;
+			else
+				preferred = 0;
+			if (valid != INFINITY_LIFE_TIME) {
+				if (valid > tval)
+					valid -= tval;
+				else
+					valid = 0;
+			}
+		}
+	} else {
+		preferred = INFINITY_LIFE_TIME;
+		valid = INFINITY_LIFE_TIME;
+	}
+	if ((ifa->ifa_address &&	/* zero/empty optional fields are skipped */
+	     nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) ||
+	    (ifa->ifa_local &&
+	     nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) ||
+	    (ifa->ifa_broadcast &&
+	     nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
+	    (ifa->ifa_label[0] &&
+	     nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
+	    (ifa->ifa_proto &&
+	     nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) ||
+	    nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
+	    (ifa->ifa_rt_priority &&
+	     nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
+	    put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
+			  preferred, valid))
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);	/* drop the partially built message */
+	return -EMSGSIZE;
+}
+/* Strict validation of an address dump request; fills in @fillargs. */
+static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
+				      struct inet_fill_args *fillargs,
+				      struct net **tgt_net, struct sock *sk,
+				      struct netlink_callback *cb)
+{
+	struct netlink_ext_ack *extack = cb->extack;
+	struct nlattr *tb[IFA_MAX+1];
+	struct ifaddrmsg *ifm;
+	int err, i;
+
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+		NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request");
+		return -EINVAL;
+	}
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
+		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request");
+		return -EINVAL;
+	}
+
+	fillargs->ifindex = ifm->ifa_index;
+	if (fillargs->ifindex) {	/* non-zero index: dump is filtered */
+		cb->answer_flags |= NLM_F_DUMP_FILTERED;
+		fillargs->flags |= NLM_F_DUMP_FILTERED;
+	}
+
+	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
+					    ifa_ipv4_policy, extack);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= IFA_MAX; ++i) {
+		if (!tb[i])
+			continue;
+
+		if (i == IFA_TARGET_NETNSID) {	/* the only attribute accepted */
+			struct net *net;
+
+			fillargs->netnsid = nla_get_s32(tb[i]);
+
+			net = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
+			if (IS_ERR(net)) {
+				fillargs->netnsid = -1;
+				NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id");
+				return PTR_ERR(net);
+			}
+			*tgt_net = net;
+		} else {
+			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
+			    struct netlink_callback *cb, int s_ip_idx,
+			    struct inet_fill_args *fillargs)
+{
+	struct in_ifaddr *ifa;
+	int ip_idx = 0;
+	int err;
+
+	in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+		/* Entries below @s_ip_idx are counted but not emitted. */
+		if (ip_idx >= s_ip_idx) {
+			err = inet_fill_ifaddr(skb, ifa, fillargs);
+			if (err < 0)
+				goto save;
+
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+		}
+		ip_idx++;
+	}
+	err = 0;
+
+save:
+	/* On failure ip_idx is the index of the entry that did not fit. */
+	cb->args[2] = ip_idx;
+
+	return err;
+}
+/* Netlink dump callback emitting RTM_NEWADDR messages for IPv4 addresses. */
+static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const struct nlmsghdr *nlh = cb->nlh;
+	struct inet_fill_args fillargs = {
+		.portid = NETLINK_CB(cb->skb).portid,
+		.seq = nlh->nlmsg_seq,
+		.event = RTM_NEWADDR,
+		.flags = NLM_F_MULTI,
+		.netnsid = -1,
+	};
+	struct net *net = sock_net(skb->sk);
+	struct net *tgt_net = net;
+	int h, s_h;
+	int idx, s_idx;
+	int s_ip_idx;
+	struct net_device *dev;
+	struct in_device *in_dev;
+	struct hlist_head *head;
+	int err = 0;
+
+	s_h = cb->args[0];		/* resume state: hash bucket, */
+	s_idx = idx = cb->args[1];	/* device index in that bucket, */
+	s_ip_idx = cb->args[2];		/* address index on that device */
+
+	if (cb->strict_check) {
+		err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
+						 skb->sk, cb);
+		if (err < 0)
+			goto put_tgt_net;
+
+		err = 0;
+		if (fillargs.ifindex) {	/* filtered dump: a single device */
+			dev = __dev_get_by_index(tgt_net, fillargs.ifindex);
+			if (!dev) {
+				err = -ENODEV;
+				goto put_tgt_net;
+			}
+
+			in_dev = __in_dev_get_rtnl(dev);
+			if (in_dev) {
+				err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx,
+						       &fillargs);
+			}
+			goto put_tgt_net;
+		}
+	}
+
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+		idx = 0;
+		head = &tgt_net->dev_index_head[h];
+		rcu_read_lock();
+		cb->seq = atomic_read(&tgt_net->ipv4.dev_addr_genid) ^
+			  tgt_net->dev_base_seq;
+		hlist_for_each_entry_rcu(dev, head, index_hlist) {
+			if (idx < s_idx)
+				goto cont;
+			if (h > s_h || idx > s_idx)	/* past the resume device */
+				s_ip_idx = 0;
+			in_dev = __in_dev_get_rcu(dev);
+			if (!in_dev)
+				goto cont;
+
+			err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx,
+					       &fillargs);
+			if (err < 0) {
+				rcu_read_unlock();
+				goto done;
+			}
+cont:
+			idx++;
+		}
+		rcu_read_unlock();
+	}
+
+done:
+	cb->args[0] = h;
+	cb->args[1] = idx;
+put_tgt_net:
+	if (fillargs.netnsid >= 0)	/* set only with IFA_TARGET_NETNSID */
+		put_net(tgt_net);
+
+	return skb->len ? : err;	/* skb length if non-zero, else err */
+}
+/* Build an @event message for @ifa and notify RTNLGRP_IPV4_IFADDR. */
+static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+		      u32 portid)
+{
+	struct inet_fill_args fillargs = {
+		.portid = portid,
+		.seq = nlh ? nlh->nlmsg_seq : 0,
+		.event = event,
+		.flags = 0,
+		.netnsid = -1,
+	};
+	struct sk_buff *skb;
+	int err = -ENOBUFS;	/* reported if the allocation below fails */
+	struct net *net;
+
+	net = dev_net(ifa->ifa_dev->dev);
+	skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL);
+	if (!skb)
+		goto errout;
+
+	err = inet_fill_ifaddr(skb, ifa, &fillargs);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
+}
+
+static size_t inet_get_link_af_size(const struct net_device *dev,
+				    u32 ext_filter_mask)
+{
+	struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+
+	/* Without an in_device nothing is emitted; otherwise reserve
+	 * IFLA_INET_CONF with 4 bytes per devconf entry.
+	 */
+	return in_dev ? nla_total_size(IPV4_DEVCONF_MAX * 4) : 0;
+}
+
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
+			     u32 ext_filter_mask)
+{
+	struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+	struct nlattr *nla;
+	u32 *vals;
+	int i;
+
+	if (!in_dev)
+		return -ENODATA;
+	nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
+	if (!nla)
+		return -EMSGSIZE;
+
+	vals = nla_data(nla);	/* payload is written as a flat u32 array */
+	for (i = 0; i < IPV4_DEVCONF_MAX; i++)
+		vals[i] = in_dev->cnf.data[i];
+	return 0;
+}
+
+static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
+	[IFLA_INET_CONF]	= { .type = NLA_NESTED },	/* devconf entries */
+};
+
+static int inet_validate_link_af(const struct net_device *dev,
+				 const struct nlattr *nla,
+				 struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[IFLA_INET_MAX+1];
+	struct nlattr *a;
+	int err, rem;
+
+	/* When a device is given it must already have an in_device. */
+	if (dev && !__in_dev_get_rtnl(dev))
+		return -EAFNOSUPPORT;
+
+	err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla,
+					  inet_af_policy, extack);
+	if (err < 0)
+		return err;
+	if (!tb[IFLA_INET_CONF])
+		return 0;
+
+	/* Each entry: at least 4 payload bytes, type in 1..IPV4_DEVCONF_MAX. */
+	nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) {
+		int cfgid = nla_type(a);
+
+		if (nla_len(a) < 4 || cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla,
+			    struct netlink_ext_ack *extack)
+{
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct nlattr *tb[IFLA_INET_MAX+1];
+	struct nlattr *a;
+	int rem;
+
+	if (!in_dev)
+		return -EAFNOSUPPORT;
+	if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0)
+		return -EINVAL;
+	if (!tb[IFLA_INET_CONF])
+		return 0;
+
+	/* Attribute type selects the devconf entry, payload is its value. */
+	nla_for_each_nested(a, tb[IFLA_INET_CONF], rem)
+		ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a));
+	return 0;
+}
+
+static int inet_netconf_msgsize_devconf(int type)
+{
+	const bool all = (type == NETCONFA_ALL);
+	int nattrs = 1;		/* NETCONFA_IFINDEX is always present */
+
+	/* Count the optional attributes selected by @type; NETCONFA_ALL
+	 * selects every one of them. */
+	if (all || type == NETCONFA_FORWARDING)
+		nattrs++;
+	if (all || type == NETCONFA_RP_FILTER)
+		nattrs++;
+	if (all || type == NETCONFA_MC_FORWARDING)
+		nattrs++;
+	if (all || type == NETCONFA_BC_FORWARDING)
+		nattrs++;
+	if (all || type == NETCONFA_PROXY_NEIGH)
+		nattrs++;
+	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+		nattrs++;
+
+	/* Every attribute carries a 4-byte payload. */
+	return NLMSG_ALIGN(sizeof(struct netconfmsg)) +
+	       nattrs * nla_total_size(4);
+}
+/* Add one netconf message for @ifindex to @skb; @type selects attributes. */
+static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
+				     struct ipv4_devconf *devconf, u32 portid,
+				     u32 seq, int event, unsigned int flags,
+				     int type)
+{
+	struct nlmsghdr  *nlh;
+	struct netconfmsg *ncm;
+	bool all = false;
+
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
+			flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	ncm = nlmsg_data(nlh);
+	ncm->ncm_family = AF_INET;
+
+	if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
+		goto nla_put_failure;
+
+	if (!devconf)
+		goto out;	/* no table given: header and ifindex only */
+
+	if ((all || type == NETCONFA_FORWARDING) &&
+	    nla_put_s32(skb, NETCONFA_FORWARDING,
+			IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
+		goto nla_put_failure;
+	if ((all || type == NETCONFA_RP_FILTER) &&
+	    nla_put_s32(skb, NETCONFA_RP_FILTER,
+			IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
+		goto nla_put_failure;
+	if ((all || type == NETCONFA_MC_FORWARDING) &&
+	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
+			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
+		goto nla_put_failure;
+	if ((all || type == NETCONFA_BC_FORWARDING) &&
+	    nla_put_s32(skb, NETCONFA_BC_FORWARDING,
+			IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+		goto nla_put_failure;
+	if ((all || type == NETCONFA_PROXY_NEIGH) &&
+	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
+			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
+		goto nla_put_failure;
+	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+			IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
+		goto nla_put_failure;
+
+out:
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);	/* drop the partially built message */
+	return -EMSGSIZE;
+}
+/* Send a netconf @event message to the RTNLGRP_IPV4_NETCONF group. */
+void inet_netconf_notify_devconf(struct net *net, int event, int type,
+				 int ifindex, struct ipv4_devconf *devconf)
+{
+	struct sk_buff *skb;
+	int err = -ENOBUFS;	/* reported if the allocation below fails */
+
+	skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL);
+	if (!skb)
+		goto errout;
+
+	err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
+					event, 0, type);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
+}
+
+static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
+	[NETCONFA_IFINDEX]	= { .len = sizeof(int) },	/* read as s32 */
+	[NETCONFA_FORWARDING]	= { .len = sizeof(int) },
+	[NETCONFA_RP_FILTER]	= { .len = sizeof(int) },
+	[NETCONFA_PROXY_NEIGH]	= { .len = sizeof(int) },
+	[NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN]	= { .len = sizeof(int) },
+};
+
+/* Parse a netconf get request into @tb. Returns 0 on success or a
+ * negative error; some failures also set a message in @extack.
+ */
+static int inet_netconf_valid_get_req(struct sk_buff *skb,
+				      const struct nlmsghdr *nlh,
+				      struct nlattr **tb,
+				      struct netlink_ext_ack *extack)
+{
+	const int hdrlen = sizeof(struct netconfmsg);
+	int i, err;
+
+	/* The fixed netconfmsg header must be fully present. */
+	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
+		NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request");
+		return -EINVAL;
+	}
+
+	/* Without strict checking, parse leniently and stop there. */
+	if (!netlink_strict_get_check(skb))
+		return nlmsg_parse_deprecated(nlh, hdrlen, tb, NETCONFA_MAX,
+					      devconf_ipv4_policy, extack);
+
+	err = nlmsg_parse_deprecated_strict(nlh, hdrlen, tb, NETCONFA_MAX,
+					    devconf_ipv4_policy, extack);
+	if (err)
+		return err;
+
+	/* Strict mode: NETCONFA_IFINDEX is the only attribute allowed. */
+	for (i = 0; i <= NETCONFA_MAX; i++) {
+		if (!tb[i] || i == NETCONFA_IFINDEX)
+			continue;
+
+		NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+/* Answer a netconf get request with the full settings of one ifindex. */
+static int inet_netconf_get_devconf(struct sk_buff *in_skb,
+				    struct nlmsghdr *nlh,
+				    struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct nlattr *tb[NETCONFA_MAX+1];
+	struct sk_buff *skb;
+	struct ipv4_devconf *devconf;
+	struct in_device *in_dev;
+	struct net_device *dev;
+	int ifindex;
+	int err;
+
+	err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack);
+	if (err)
+		goto errout;
+
+	err = -EINVAL;	/* also returned when the device lookup fails */
+	if (!tb[NETCONFA_IFINDEX])
+		goto errout;
+
+	ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
+	switch (ifindex) {
+	case NETCONFA_IFINDEX_ALL:
+		devconf = net->ipv4.devconf_all;
+		break;
+	case NETCONFA_IFINDEX_DEFAULT:
+		devconf = net->ipv4.devconf_dflt;
+		break;
+	default:	/* a real device index */
+		dev = __dev_get_by_index(net, ifindex);
+		if (!dev)
+			goto errout;
+		in_dev = __in_dev_get_rtnl(dev);
+		if (!in_dev)
+			goto errout;
+		devconf = &in_dev->cnf;
+		break;
+	}
+
+	err = -ENOBUFS;
+	skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL);
+	if (!skb)
+		goto errout;
+
+	err = inet_netconf_fill_devconf(skb, ifindex, devconf,
+					NETLINK_CB(in_skb).portid,
+					nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
+					NETCONFA_ALL);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+	return err;
+}
+/* Dump netconf for every device, then the "all" and "default" tables. */
+static int inet_netconf_dump_devconf(struct sk_buff *skb,
+				     struct netlink_callback *cb)
+{
+	const struct nlmsghdr *nlh = cb->nlh;
+	struct net *net = sock_net(skb->sk);
+	int h, s_h;
+	int idx, s_idx;
+	struct net_device *dev;
+	struct in_device *in_dev;
+	struct hlist_head *head;
+
+	if (cb->strict_check) {
+		struct netlink_ext_ack *extack = cb->extack;
+		struct netconfmsg *ncm;
+
+		if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
+			NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request");
+			return -EINVAL;
+		}
+
+		if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
+			NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request");
+			return -EINVAL;
+		}
+	}
+
+	s_h = cb->args[0];		/* resume state: hash bucket and */
+	s_idx = idx = cb->args[1];	/* device index within that bucket */
+
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+		idx = 0;
+		head = &net->dev_index_head[h];
+		rcu_read_lock();
+		cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+			  net->dev_base_seq;
+		hlist_for_each_entry_rcu(dev, head, index_hlist) {
+			if (idx < s_idx)
+				goto cont;
+			in_dev = __in_dev_get_rcu(dev);
+			if (!in_dev)
+				goto cont;
+
+			if (inet_netconf_fill_devconf(skb, dev->ifindex,
+						      &in_dev->cnf,
+						      NETLINK_CB(cb->skb).portid,
+						      nlh->nlmsg_seq,
+						      RTM_NEWNETCONF,
+						      NLM_F_MULTI,
+						      NETCONFA_ALL) < 0) {
+				rcu_read_unlock();
+				goto done;
+			}
+			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+			idx++;
+		}
+		rcu_read_unlock();
+	}
+	if (h == NETDEV_HASHENTRIES) {	/* devices done: the "all" entry */
+		if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+					      net->ipv4.devconf_all,
+					      NETLINK_CB(cb->skb).portid,
+					      nlh->nlmsg_seq,
+					      RTM_NEWNETCONF, NLM_F_MULTI,
+					      NETCONFA_ALL) < 0)
+			goto done;
+		else
+			h++;
+	}
+	if (h == NETDEV_HASHENTRIES + 1) {	/* then the "default" entry */
+		if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+					      net->ipv4.devconf_dflt,
+					      NETLINK_CB(cb->skb).portid,
+					      nlh->nlmsg_seq,
+					      RTM_NEWNETCONF, NLM_F_MULTI,
+					      NETCONFA_ALL) < 0)
+			goto done;
+		else
+			h++;
+	}
+done:
+	cb->args[0] = h;	/* saved so a later call resumes from here */
+	cb->args[1] = idx;
+
+	return skb->len;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static void devinet_copy_dflt_conf(struct net *net, int i)
+{
+	struct net_device *dev;
+
+	/* Copy default entry @i to devices whose state bit @i is clear. */
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+		if (in_dev && !test_bit(i, in_dev->cnf.state))
+			in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
+	}
+	rcu_read_unlock();
+}
+
+/* Propagate the "all" FORWARDING value to "default" and every device; called with RTNL locked */
+static void inet_forward_change(struct net *net)
+{
+	struct net_device *dev;
+	int on = IPV4_DEVCONF_ALL(net, FORWARDING);
+
+	IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
+	IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
+	inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+				    NETCONFA_FORWARDING,
+				    NETCONFA_IFINDEX_ALL,
+				    net->ipv4.devconf_all);
+	inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+				    NETCONFA_FORWARDING,
+				    NETCONFA_IFINDEX_DEFAULT,
+				    net->ipv4.devconf_dflt);
+	/* Then each device: LRO is disabled when forwarding is being switched on. */
+	for_each_netdev(net, dev) {
+		struct in_device *in_dev;
+
+		if (on)
+			dev_disable_lro(dev);
+
+		in_dev = __in_dev_get_rtnl(dev);
+		if (in_dev) {
+			IN_DEV_CONF_SET(in_dev, FORWARDING, on);
+			inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+						    NETCONFA_FORWARDING,
+						    dev->ifindex, &in_dev->cnf);
+		}
+	}
+}
+/* Translate a devconf block into its netconf ifindex (real, or the ALL/DEFAULT pseudo-index). */
+static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf)
+{
+	struct in_device *idev;
+
+	if (cnf == net->ipv4.devconf_dflt)
+		return NETCONFA_IFINDEX_DEFAULT;
+	if (cnf == net->ipv4.devconf_all)
+		return NETCONFA_IFINDEX_ALL;
+	/* Neither per-netns block: treat cnf as the member embedded in an in_device. */
+	idev = container_of(cnf, struct in_device, cnf);
+	return idev->dev->ifindex;
+}
+/* sysctl handler for the plain devconf integers: store the value, then run the write side effects. */
+static int devinet_conf_proc(struct ctl_table *ctl, int write,
+			     void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int old_value = *(int *)ctl->data;
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+	int new_value = *(int *)ctl->data;
+	/* NOTE(review): the block below runs for every write, whatever ret is; ret is only returned. */
+	if (write) {
+		struct ipv4_devconf *cnf = ctl->extra1;
+		struct net *net = ctl->extra2;
+		int i = (int *)ctl->data - cnf->data;	/* slot index within cnf->data[] */
+		int ifindex;
+
+		set_bit(i, cnf->state);
+		/* A write to "default" is copied to devices whose state bit for this slot is clear. */
+		if (cnf == net->ipv4.devconf_dflt)
+			devinet_copy_dflt_conf(net, i);
+		if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
+		    i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
+			if ((new_value == 0) && (old_value != 0))
+				rt_cache_flush(net);
+
+		if (i == IPV4_DEVCONF_BC_FORWARDING - 1 &&
+		    new_value != old_value)
+			rt_cache_flush(net);
+
+		if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
+		    new_value != old_value) {
+			ifindex = devinet_conf_ifindex(net, cnf);
+			inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+						    NETCONFA_RP_FILTER,
+						    ifindex, cnf);
+		}
+		if (i == IPV4_DEVCONF_PROXY_ARP - 1 &&
+		    new_value != old_value) {
+			ifindex = devinet_conf_ifindex(net, cnf);
+			inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+						    NETCONFA_PROXY_NEIGH,
+						    ifindex, cnf);
+		}
+		if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 &&
+		    new_value != old_value) {
+			ifindex = devinet_conf_ifindex(net, cnf);
+			inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+						    NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+						    ifindex, cnf);
+		}
+	}
+
+	return ret;
+}
+/* sysctl handler shared by ip_forward and the "forwarding" entry of each devconf block. */
+static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
+				  void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = ctl->data;
+	int val = *valp;
+	loff_t pos = *ppos;
+	struct net *net = ctl->extra2;
+	int ret;
+
+	if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+	/* Only a changed value is acted on; the "default" slot takes no RTNL and is just announced. */
+	if (write && *valp != val) {
+		if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
+			if (!rtnl_trylock()) {
+				/* Restore the original values before restarting */
+				*valp = val;
+				*ppos = pos;
+				return restart_syscall();
+			}
+			if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
+				inet_forward_change(net);
+			} else {
+				struct ipv4_devconf *cnf = ctl->extra1;
+				struct in_device *idev =
+					container_of(cnf, struct in_device, cnf);
+				if (*valp)
+					dev_disable_lro(idev->dev);
+				inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+							    NETCONFA_FORWARDING,
+							    idev->dev->ifindex,
+							    cnf);
+			}
+			rtnl_unlock();
+			rt_cache_flush(net);
+		} else
+			inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+						    NETCONFA_FORWARDING,
+						    NETCONFA_IFINDEX_DEFAULT,
+						    net->ipv4.devconf_dflt);
+	}
+
+	return ret;
+}
+/* proc_dointvec() wrapper that flushes the route cache when a write changed the value. */
+static int ipv4_doint_and_flush(struct ctl_table *ctl, int write,
+				void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *slot = ctl->data;
+	int before = *slot;
+	int rc;
+
+	rc = proc_dointvec(ctl, write, buffer, lenp, ppos);
+	if (write && before != *slot)
+		rt_cache_flush(ctl->extra2);
+
+	return rc;
+}
+/* Entry templates: .data/.extra1 point into the global ipv4_devconf and are rebased at registration. */
+#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
+	{ \
+		.procname	= name, \
+		.data		= ipv4_devconf.data + \
+				  IPV4_DEVCONF_ ## attr - 1, \
+		.maxlen		= sizeof(int), \
+		.mode		= mval, \
+		.proc_handler	= proc, \
+		.extra1		= &ipv4_devconf, \
+	}
+
+#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
+	DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)
+
+#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \
+	DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)
+
+#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \
+	DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)
+
+#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
+	DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
+/* Template table; __devinet_sysctl_register() kmemdup()s it and retargets it per devconf block. */
+static struct devinet_sysctl_table {
+	struct ctl_table_header *sysctl_header;
+	struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
+} devinet_sysctl = {
+	.devinet_vars = {
+		DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
+					     devinet_sysctl_forward),
+		DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+		DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
+
+		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
+		DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
+		DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
+		DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
+		DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
+		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
+					"accept_source_route"),
+		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
+		DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
+		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
+		DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
+		DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
+		DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
+		DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
+		DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
+		DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
+		DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
+		DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
+		DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
+		DEVINET_SYSCTL_RW_ENTRY(ARP_EVICT_NOCARRIER,
+					"arp_evict_nocarrier"),
+		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+		DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
+					"force_igmp_version"),
+		DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL,
+					"igmpv2_unsolicited_report_interval"),
+		DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
+					"igmpv3_unsolicited_report_interval"),
+		DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
+					"ignore_routes_with_linkdown"),
+		DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP,
+					"drop_gratuitous_arp"),
+		/* Entries below use ipv4_doint_and_flush(): a changed value flushes the route cache. */
+		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
+					      "promote_secondaries"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
+					      "route_localnet"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST,
+					      "drop_unicast_in_l2_multicast"),
+	},
+};
+/* Clone the template table for devconf block p and register it under net/ipv4/conf/<dev_name>. */
+static int __devinet_sysctl_register(struct net *net, char *dev_name,
+				     int ifindex, struct ipv4_devconf *p)
+{
+	int i;
+	struct devinet_sysctl_table *t;
+	char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];
+
+	t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT);
+	if (!t)
+		goto out;
+	/* Shift each .data from the global ipv4_devconf onto p; record p and net for the handlers. */
+	for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
+		t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
+		t->devinet_vars[i].extra1 = p;
+		t->devinet_vars[i].extra2 = net;
+	}
+
+	snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name);
+
+	t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars);
+	if (!t->sysctl_header)
+		goto free;
+
+	p->sysctl = t;
+
+	inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL,
+				    ifindex, p);
+	return 0;
+
+free:
+	kfree(t);
+out:
+	return -ENOMEM;
+}
+/* Tear down cnf's sysctl table, if it has one, and announce RTM_DELNETCONF either way. */
+static void __devinet_sysctl_unregister(struct net *net,
+					struct ipv4_devconf *cnf, int ifindex)
+{
+	struct devinet_sysctl_table *tbl = cnf->sysctl;
+
+	if (tbl) {
+		/* Clear the back-pointer before the table it refers to is freed. */
+		cnf->sysctl = NULL;
+		unregister_net_sysctl_table(tbl->sysctl_header);
+		kfree(tbl);
+	}
+	inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL);
+}
+/* Register a device's neigh and devconf sysctl trees; the first is undone if the second fails. */
+static int devinet_sysctl_register(struct in_device *idev)
+{
+	struct net_device *dev = idev->dev;
+	int rc;
+
+	if (!sysctl_dev_name_is_allowed(dev->name))
+		return -EINVAL;
+	rc = neigh_sysctl_register(dev, idev->arp_parms, NULL);
+	if (rc)
+		return rc;
+	rc = __devinet_sysctl_register(dev_net(dev), dev->name, dev->ifindex,
+				       &idev->cnf);
+	if (rc)
+		neigh_sysctl_unregister(idev->arp_parms);
+	return rc;
+}
+/* Counterpart of devinet_sysctl_register(): devconf table first, then the neigh one. */
+static void devinet_sysctl_unregister(struct in_device *idev)
+{
+	struct net_device *dev = idev->dev;
+
+	__devinet_sysctl_unregister(dev_net(dev), &idev->cnf, dev->ifindex);
+	neigh_sysctl_unregister(idev->arp_parms);
+}
+/* Template for the "ip_forward" entry; devinet_init_net() clones and retargets it per netns. */
+static struct ctl_table ctl_forward_entry[] = {
+	{
+		.procname	= "ip_forward",
+		.data		= &ipv4_devconf.data[
+					IPV4_DEVCONF_FORWARDING - 1],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= devinet_sysctl_forward,
+		.extra1		= &ipv4_devconf,
+		.extra2		= &init_net,
+	},
+	{ },
+};
+#endif
+/* Per-netns init: allocate the "all" and "default" devconf blocks and register their sysctls. */
+static __net_init int devinet_init_net(struct net *net)
+{
+	int err;
+	struct ipv4_devconf *all, *dflt;
+#ifdef CONFIG_SYSCTL
+	struct ctl_table *tbl;
+	struct ctl_table_header *forw_hdr;
+#endif
+
+	err = -ENOMEM;
+	all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL);
+	if (!all)
+		goto err_alloc_all;
+
+	dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
+	if (!dflt)
+		goto err_alloc_dflt;
+
+#ifdef CONFIG_SYSCTL
+	tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL);
+	if (!tbl)
+		goto err_alloc_ctl;
+	/* Point the cloned ip_forward entry at this netns' "all" block. */
+	tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
+	tbl[0].extra1 = all;
+	tbl[0].extra2 = net;
+#endif
+	/* Only non-init namespaces reseed; init_net keeps the compiled-in copies made above. */
+	if (!net_eq(net, &init_net)) {
+		switch (net_inherit_devconf()) {
+		case 3:
+			/* copy from the current netns */
+			memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all,
+			       sizeof(ipv4_devconf));
+			memcpy(dflt,
+			       current->nsproxy->net_ns->ipv4.devconf_dflt,
+			       sizeof(ipv4_devconf_dflt));
+			break;
+		case 0:
+		case 1:
+			/* copy from init_net */
+			memcpy(all, init_net.ipv4.devconf_all,
+			       sizeof(ipv4_devconf));
+			memcpy(dflt, init_net.ipv4.devconf_dflt,
+			       sizeof(ipv4_devconf_dflt));
+			break;
+		case 2:
+			/* use compiled values */
+			break;
+		}
+	}
+
+#ifdef CONFIG_SYSCTL
+	err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all);
+	if (err < 0)
+		goto err_reg_all;
+
+	err = __devinet_sysctl_register(net, "default",
+					NETCONFA_IFINDEX_DEFAULT, dflt);
+	if (err < 0)
+		goto err_reg_dflt;
+
+	err = -ENOMEM;
+	forw_hdr = register_net_sysctl_sz(net, "net/ipv4", tbl,
+					  ARRAY_SIZE(ctl_forward_entry));
+	if (!forw_hdr)
+		goto err_reg_ctl;
+	net->ipv4.forw_hdr = forw_hdr;
+#endif
+
+	net->ipv4.devconf_all = all;
+	net->ipv4.devconf_dflt = dflt;
+	return 0;
+
+#ifdef CONFIG_SYSCTL
+err_reg_ctl:
+	__devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT);
+err_reg_dflt:
+	__devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL);
+err_reg_all:
+	kfree(tbl);
+err_alloc_ctl:
+#endif
+	kfree(dflt);
+err_alloc_dflt:
+	kfree(all);
+err_alloc_all:
+	return err;
+}
+/* Per-netns teardown: unregister the sysctls registered at init and free both devconf blocks. */
+static __net_exit void devinet_exit_net(struct net *net)
+{
+#ifdef CONFIG_SYSCTL
+	struct ctl_table *tbl;
+	/* Fetch the table pointer before the header that carries it is unregistered. */
+	tbl = net->ipv4.forw_hdr->ctl_table_arg;
+	unregister_net_sysctl_table(net->ipv4.forw_hdr);
+	__devinet_sysctl_unregister(net, net->ipv4.devconf_dflt,
+				    NETCONFA_IFINDEX_DEFAULT);
+	__devinet_sysctl_unregister(net, net->ipv4.devconf_all,
+				    NETCONFA_IFINDEX_ALL);
+	kfree(tbl);
+#endif
+	kfree(net->ipv4.devconf_dflt);
+	kfree(net->ipv4.devconf_all);
+}
+/* Per-netns init/exit pair, registered from devinet_init(). */
+static __net_initdata struct pernet_operations devinet_ops = {
+	.init = devinet_init_net,
+	.exit = devinet_exit_net,
+};
+/* AF_INET callbacks handed to rtnl_af_register() in devinet_init(). */
+static struct rtnl_af_ops inet_af_ops __read_mostly = {
+	.family		  = AF_INET,
+	.fill_link_af	  = inet_fill_link_af,
+	.get_link_af_size = inet_get_link_af_size,
+	.validate_link_af = inet_validate_link_af,
+	.set_link_af	  = inet_set_link_af,
+};
+/* Boot-time init: address hash heads, pernet ops, netdev notifier, lifetime work, rtnetlink handlers. */
+void __init devinet_init(void)
+{
+	int i;
+
+	for (i = 0; i < IN4_ADDR_HSIZE; i++)
+		INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
+	register_pernet_subsys(&devinet_ops);
+	register_netdevice_notifier(&ip_netdev_notifier);
+
+	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
+
+	rtnl_af_register(&inet_af_ops);
+
+	rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0);
+	rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0);
+	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, 0);
+	rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
+		      inet_netconf_dump_devconf, 0);
+}
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
new file mode 100644
index 0000000000..4ccfc104f1
--- /dev/null
+++ b/net/ipv4/esp4.c
@@ -0,0 +1,1251 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/pfkeyv2.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/in6.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/espintcp.h>
+
+#include <linux/highmem.h>
+/* Per-skb control block: xfrm's own cb followed by the request scratch buffer pointer. */
+struct esp_skb_cb {
+	struct xfrm_skb_cb xfrm;
+	void *tmp;
+};
+/* Extra state kept at the front of the scratch buffer when the state uses ESN. */
+struct esp_output_extra {
+	__be32 seqhi;
+	u32 esphoff;
+};
+/* View skb->cb as a struct esp_skb_cb. */
+#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
+
+/*
+ * Allocate an AEAD request structure with extra space for SG and IV.
+ *
+ * For alignment considerations the extralen bytes and the IV come first,
+ * followed by the request and finally the nfrags-entry SG list.
+ *
+ * TODO: Use spare space in skb for this where possible.
+ */
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int extralen)
+{
+	unsigned int len;
+
+	len = extralen;
+
+	len += crypto_aead_ivsize(aead);
+
+	if (len) {
+		len += crypto_aead_alignmask(aead) &
+		       ~(crypto_tfm_ctx_alignment() - 1);
+		len = ALIGN(len, crypto_tfm_ctx_alignment());
+	}
+
+	len += sizeof(struct aead_request) + crypto_aead_reqsize(aead);
+	len = ALIGN(len, __alignof__(struct scatterlist));
+
+	len += sizeof(struct scatterlist) * nfrags;
+
+	return kmalloc(len, GFP_ATOMIC);
+}
+/* Start of the extra area inside tmp, aligned for struct esp_output_extra. */
+static inline void *esp_tmp_extra(void *tmp)
+{
+	return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));
+}
+/* IV slot inside tmp: just past the extra area, aligned only when the AEAD has an IV. */
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int extralen)
+{
+	u8 *iv = (u8 *)tmp + extralen;
+
+	return crypto_aead_ivsize(aead) ? PTR_ALIGN(iv, crypto_aead_alignmask(aead) + 1) : iv;
+}
+/* Request slot: first ctx-aligned address after the IV; also binds the request to aead. */
+static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
+{
+	u8 *iv_end = iv + crypto_aead_ivsize(aead);
+	struct aead_request *req;
+
+	req = (void *)PTR_ALIGN(iv_end, crypto_tfm_ctx_alignment());
+	aead_request_set_tfm(req, aead);
+	return req;
+}
+/* Scatterlist array: follows the request plus its reqsize bytes, aligned for scatterlist. */
+static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
+					     struct aead_request *req)
+{
+	unsigned long end = (unsigned long)(req + 1) + crypto_aead_reqsize(aead);
+
+	return (void *)ALIGN(end, __alignof__(struct scatterlist));
+}
+/* put_page() the pages of the request's source scatterlist when src and dst lists differ. */
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+{
+	struct crypto_aead *aead = x->data;
+	int extralen = 0;
+	u8 *iv;
+	struct aead_request *req;
+	struct scatterlist *sg;
+
+	if (x->props.flags & XFRM_STATE_ESN)
+		extralen += sizeof(struct esp_output_extra);
+
+	iv = esp_tmp_iv(aead, tmp, extralen);
+	req = esp_tmp_req(aead, iv);
+
+	/* Unref skb_frag_pages in the src scatterlist if necessary.
+	 * Skip the first sg which comes from skb->data.
+	 */
+	if (req->src != req->dst)
+		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+			put_page(sg_page(sg));
+}
+
+#ifdef CONFIG_INET_ESPINTCP
+struct esp_tcp_sk {
+	struct sock *sk;	/* socket that esp_free_tcp_sk() will sock_put() */
+	struct rcu_head rcu;	/* passed to call_rcu() by esp_find_tcp_sk() */
+};
+/* RCU callback: drop the deferred socket reference, then free the carrier struct. */
+static void esp_free_tcp_sk(struct rcu_head *head)
+{
+	struct esp_tcp_sk *deferred = container_of(head, struct esp_tcp_sk, rcu);
+
+	sock_put(deferred->sk);
+	kfree(deferred);
+}
+/* Return x's ESP-in-TCP socket: the cached one if established, else the result of a new lookup. */
+static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
+{
+	struct xfrm_encap_tmpl *encap = x->encap;
+	struct net *net = xs_net(x);
+	struct esp_tcp_sk *esk;
+	__be16 sport, dport;
+	struct sock *nsk;
+	struct sock *sk;
+
+	sk = rcu_dereference(x->encap_sk);
+	if (sk && sk->sk_state == TCP_ESTABLISHED)
+		return sk;
+	/* Snapshot the ports; if the cache still holds the stale sk, clear it and defer its put to RCU. */
+	spin_lock_bh(&x->lock);
+	sport = encap->encap_sport;
+	dport = encap->encap_dport;
+	nsk = rcu_dereference_protected(x->encap_sk,
+					lockdep_is_held(&x->lock));
+	if (sk && sk == nsk) {
+		esk = kmalloc(sizeof(*esk), GFP_ATOMIC);
+		if (!esk) {
+			spin_unlock_bh(&x->lock);
+			return ERR_PTR(-ENOMEM);
+		}
+		RCU_INIT_POINTER(x->encap_sk, NULL);
+		esk->sk = sk;
+		call_rcu(&esk->rcu, esp_free_tcp_sk);
+	}
+	spin_unlock_bh(&x->lock);
+
+	sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4,
+				     dport, x->props.saddr.a4, sport, 0);
+	if (!sk)
+		return ERR_PTR(-ENOENT);
+
+	if (!tcp_is_ulp_esp(sk)) {
+		sock_put(sk);
+		return ERR_PTR(-EINVAL);
+	}
+	/* Re-check under the lock: the ports may have changed, or this socket may be cached already. */
+	spin_lock_bh(&x->lock);
+	nsk = rcu_dereference_protected(x->encap_sk,
+					lockdep_is_held(&x->lock));
+	if (encap->encap_sport != sport ||
+	    encap->encap_dport != dport) {
+		sock_put(sk);
+		sk = nsk ?: ERR_PTR(-EREMCHG);
+	} else if (sk == nsk) {
+		sock_put(sk);
+	} else {
+		rcu_assign_pointer(x->encap_sk, sk);
+	}
+	spin_unlock_bh(&x->lock);
+
+	return sk;
+}
+/* Hand skb to x's ESP-in-TCP socket (queue or push); the skb is freed if no socket is found. */
+static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct sock *sk;
+	int err;
+
+	rcu_read_lock();
+	sk = esp_find_tcp_sk(x);
+	err = PTR_ERR_OR_ZERO(sk);
+	if (err) {
+		kfree_skb(skb);	/* skb was handed over for good; dropping it here avoids a leak */
+		goto out;
+	}
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk))
+		err = espintcp_queue_out(sk, skb);
+	else
+		err = espintcp_push_skb(sk, skb);
+	bh_unlock_sock(sk);
+out:
+	rcu_read_unlock();
+	return err;
+}
+/* Callback given to xfrm_trans_queue_net(): take the state from skb's dst and finish output. */
+static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
+				   struct sk_buff *skb)
+{
+	/* net and sk are not used; only the callback signature requires them. */
+	struct xfrm_state *state = skb_dst(skb)->xfrm;
+
+	return esp_output_tcp_finish(state, skb);
+}
+/* Queue skb for ESP-in-TCP transmission via xfrm_trans_queue_net(); never returns 0. */
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+
+	local_bh_disable();
+	err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
+	local_bh_enable();
+
+	/* EINPROGRESS just happens to do the right thing.  It
+	 * actually means that the skb has been consumed and
+	 * isn't coming back.
+	 */
+	return err ?: -EINPROGRESS;
+}
+#else
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+	kfree_skb(skb);
+	/* Built without CONFIG_INET_ESPINTCP: the skb is dropped and the caller told so. */
+	return -EOPNOTSUPP;
+}
+#endif
+/* AEAD completion callback for output: free the scratch buffer, then resume or drop the skb. */
+static void esp_output_done(void *data, int err)
+{
+	struct sk_buff *skb = data;
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	void *tmp;
+	struct xfrm_state *x;
+
+	if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+		struct sec_path *sp = skb_sec_path(skb);
+
+		x = sp->xvec[sp->len - 1];
+	} else {
+		x = skb_dst(skb)->xfrm;
+	}
+	/* With x known, release the source page references and the scratch buffer. */
+	tmp = ESP_SKB_CB(skb)->tmp;
+	esp_ssg_unref(x, tmp);
+	kfree(tmp);
+
+	if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+		if (err) {
+			XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+			kfree_skb(skb);
+			return;
+		}
+
+		skb_push(skb, skb->data - skb_mac_header(skb));
+		secpath_reset(skb);
+		xfrm_dev_resume(skb);
+	} else {
+		if (!err &&
+		    x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+			esp_output_tail_tcp(x, skb);
+		else
+			xfrm_output_resume(skb->sk, skb, err);
+	}
+}
+
+/* Move ESP header back into place; offset locates the shifted header from skb->data. */
+static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
+{
+	struct ip_esp_hdr *esph = (void *)(skb->data + offset);
+	void *tmp = ESP_SKB_CB(skb)->tmp;
+	__be32 *seqhi = esp_tmp_extra(tmp);
+	/* Copy the SPI one word up, then put back the word stored at the start of the extra area. */
+	esph->seq_no = esph->spi;
+	esph->spi = *seqhi;
+}
+/* Output side: undo the ESN header shift, using the offset recorded in the extra area. */
+static void esp_output_restore_header(struct sk_buff *skb)
+{
+	struct esp_output_extra *extra = esp_tmp_extra(ESP_SKB_CB(skb)->tmp);
+	unsigned int shifted;
+
+	shifted = skb_transport_offset(skb) + extra->esphoff - sizeof(__be32);
+	esp_restore_header(skb, shifted);
+}
+/* Set the SPI; for ESN first shift the header 4 bytes and record what is needed to undo that. */
+static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
+					       struct xfrm_state *x,
+					       struct ip_esp_hdr *esph,
+					       struct esp_output_extra *extra)
+{
+	/* For ESN we move the header forward by 4 bytes to
+	 * accommodate the high bits.  We will move it back after
+	 * encryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		__u32 seqhi;
+		struct xfrm_offload *xo = xfrm_offload(skb);
+
+		if (xo)
+			seqhi = xo->seq.hi;
+		else
+			seqhi = XFRM_SKB_CB(skb)->seq.output.hi;
+		/* Record the header's offset, step back 4 bytes, and save the word found there. */
+		extra->esphoff = (unsigned char *)esph -
+				 skb_transport_header(skb);
+		esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+		extra->seqhi = esph->spi;
+		esph->seq_no = htonl(seqhi);
+	}
+
+	esph->spi = x->id.spi;
+
+	return esph;
+}
+/* ESN flavour of esp_output_done(): restore the header first, then complete as usual. */
+static void esp_output_done_esn(void *data, int err)
+{
+	struct sk_buff *pkt = data;
+
+	esp_output_restore_header(pkt);
+	esp_output_done(pkt, err);
+}
+/* Write the UDP encapsulation header at esp->esph; returns where the ESP header then goes. */
+static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
+					       int encap_type,
+					       struct esp_info *esp,
+					       __be16 sport,
+					       __be16 dport)
+{
+	struct udphdr *uh;
+	__be32 *udpdata32;
+	unsigned int len;
+
+	len = skb->len + esp->tailen - skb_transport_offset(skb);
+	if (len + sizeof(struct iphdr) > IP_MAX_MTU)
+		return ERR_PTR(-EMSGSIZE);
+	/* The UDP header takes the position esp->esph currently points at. */
+	uh = (struct udphdr *)esp->esph;
+	uh->source = sport;
+	uh->dest = dport;
+	uh->len = htons(len);
+	uh->check = 0;
+
+	*skb_mac_header(skb) = IPPROTO_UDP;
+	/* NON_IKE variant: two zeroed 32-bit words sit between the UDP and ESP headers. */
+	if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) {
+		udpdata32 = (__be32 *)(uh + 1);
+		udpdata32[0] = udpdata32[1] = 0;
+		return (struct ip_esp_hdr *)(udpdata32 + 2);
+	}
+
+	return (struct ip_esp_hdr *)(uh + 1);
+}
+/* TCP encapsulation: a 2-byte length prefix at esp->esph (or -EOPNOTSUPP without ESPINTCP). */
+#ifdef CONFIG_INET_ESPINTCP
+static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
+						    struct sk_buff *skb,
+						    struct esp_info *esp)
+{
+	__be16 *lenp = (void *)esp->esph;
+	struct ip_esp_hdr *esph;
+	unsigned int len;
+	struct sock *sk;
+
+	len = skb->len + esp->tailen - skb_transport_offset(skb);
+	if (len > IP_MAX_MTU)
+		return ERR_PTR(-EMSGSIZE);
+	/* The lookup result is only tested for an error here; the socket itself is not used. */
+	rcu_read_lock();
+	sk = esp_find_tcp_sk(x);
+	rcu_read_unlock();
+
+	if (IS_ERR(sk))
+		return ERR_CAST(sk);
+	/* Length prefix first; the ESP header starts immediately after it. */
+	*lenp = htons(len);
+	esph = (struct ip_esp_hdr *)(lenp + 1);
+
+	return esph;
+}
+#else
+static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
+						    struct sk_buff *skb,
+						    struct esp_info *esp)
+{
+	return ERR_PTR(-EOPNOTSUPP);	/* built without CONFIG_INET_ESPINTCP */
+}
+#endif
+/* Write the UDP or TCP encapsulation header for x and move esp->esph past it; 0 or -errno. */
+static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb,
+			    struct esp_info *esp)
+{
+	struct xfrm_encap_tmpl *encap = x->encap;
+	struct ip_esp_hdr *esph;
+	__be16 sport, dport;
+	int encap_type;
+	/* Read ports and type together under x->lock. */
+	spin_lock_bh(&x->lock);
+	sport = encap->encap_sport;
+	dport = encap->encap_dport;
+	encap_type = encap->encap_type;
+	spin_unlock_bh(&x->lock);
+
+	switch (encap_type) {
+	default:
+	case UDP_ENCAP_ESPINUDP:
+	case UDP_ENCAP_ESPINUDP_NON_IKE:
+		esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport);
+		break;
+	case TCP_ENCAP_ESPINTCP:
+		esph = esp_output_tcp_encap(x, skb, esp);
+		break;
+	}
+
+	if (IS_ERR(esph))
+		return PTR_ERR(esph);
+
+	esp->esph = esph;
+
+	return 0;
+}
+/* Make room for and fill the ESP trailer; returns an sg entry count or a negative errno. */
+int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+{
+	u8 *tail;
+	int nfrags;
+	int esph_offset;
+	struct page *page;
+	struct sk_buff *trailer;
+	int tailen = esp->tailen;
+
+	/* this is non-NULL only with TCP/UDP Encapsulation */
+	if (x->encap) {
+		int err = esp_output_encap(x, skb, esp);
+
+		if (err < 0)
+			return err;
+	}
+	/* Trailer or paged data above a page after cache-line rounding: use the skb_cow_data() path. */
+	if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
+	    ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
+		goto cow;
+
+	if (!skb_cloned(skb)) {
+		if (tailen <= skb_tailroom(skb)) {
+			nfrags = 1;
+			trailer = skb;
+			tail = skb_tail_pointer(trailer);
+
+			goto skip_cow;
+		} else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+			   && !skb_has_frag_list(skb)) {
+			int allocsize;
+			struct sock *sk = skb->sk;
+			struct page_frag *pfrag = &x->xfrag;
+
+			esp->inplace = false;
+
+			allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+			spin_lock_bh(&x->lock);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				goto cow;
+			}
+
+			page = pfrag->page;
+			get_page(page);
+
+			tail = page_address(page) + pfrag->offset;
+
+			esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
+
+			nfrags = skb_shinfo(skb)->nr_frags;
+
+			__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+					     tailen);
+			skb_shinfo(skb)->nr_frags = ++nfrags;
+
+			pfrag->offset = pfrag->offset + allocsize;
+
+			spin_unlock_bh(&x->lock);
+			/* One more than the frag count, presumably for the linear area — confirm. */
+			nfrags++;
+
+			skb_len_add(skb, tailen);
+			if (sk && sk_fullsock(sk))
+				refcount_add(tailen, &sk->sk_wmem_alloc);
+
+			goto out;
+		}
+	}
+
+cow:
+	esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);
+
+	nfrags = skb_cow_data(skb, tailen, &trailer);
+	if (nfrags < 0)
+		goto out;
+	tail = skb_tail_pointer(trailer);
+	esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);
+
+skip_cow:
+	esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
+	pskb_put(skb, trailer, tailen);
+
+out:
+	return nfrags;
+}
+EXPORT_SYMBOL_GPL(esp_output_head);
+
+int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+{
+	u8 *iv;
+	int alen;
+	void *tmp;
+	int ivlen;
+	int assoclen;
+	int extralen;
+	struct page *page;
+	struct ip_esp_hdr *esph;
+	struct crypto_aead *aead;
+	struct aead_request *req;
+	struct scatterlist *sg, *dsg;
+	struct esp_output_extra *extra;
+	int err = -ENOMEM;
+
+	assoclen = sizeof(struct ip_esp_hdr);
+	extralen = 0;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		extralen += sizeof(*extra);
+		assoclen += sizeof(__be32);
+	}
+
+	aead = x->data;
+	alen = crypto_aead_authsize(aead);
+	ivlen = crypto_aead_ivsize(aead);
+
+	tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen);
+	if (!tmp)
+		goto error;
+
+	extra = esp_tmp_extra(tmp);
+	iv = esp_tmp_iv(aead, tmp, extralen);
+	req = esp_tmp_req(aead, iv);
+	sg = esp_req_sg(aead, req);
+
+	if (esp->inplace)
+		dsg = sg;
+	else
+		dsg = &sg[esp->nfrags];
+
+	esph = esp_output_set_extra(skb, x, esp->esph, extra);
+	esp->esph = esph;
+
+	sg_init_table(sg, esp->nfrags);
+	err = skb_to_sgvec(skb, sg,
+		           (unsigned char *)esph - skb->data,
+		           assoclen + ivlen + esp->clen + alen);
+	if (unlikely(err < 0))
+		goto error_free;
+
+	if (!esp->inplace) {
+		int allocsize;
+		struct page_frag *pfrag = &x->xfrag;
+
+		allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+		spin_lock_bh(&x->lock);
+		if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+			spin_unlock_bh(&x->lock);
+			goto error_free;
+		}
+
+		skb_shinfo(skb)->nr_frags = 1;
+
+		page = pfrag->page;
+		get_page(page);
+		/* replace page frags in skb with new page */
+		__skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+		pfrag->offset = pfrag->offset + allocsize;
+		spin_unlock_bh(&x->lock);
+
+		sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+		err = skb_to_sgvec(skb, dsg,
+			           (unsigned char *)esph - skb->data,
+			           assoclen + ivlen + esp->clen + alen);
+		if (unlikely(err < 0))
+			goto error_free;
+	}
+
+	if ((x->props.flags & XFRM_STATE_ESN))
+		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+	else
+		aead_request_set_callback(req, 0, esp_output_done, skb);
+
+	aead_request_set_crypt(req, sg, dsg, ivlen + esp->clen, iv);
+	aead_request_set_ad(req, assoclen);
+
+	memset(iv, 0, ivlen);
+	memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&esp->seqno + 8 - min(ivlen, 8),
+	       min(ivlen, 8));
+
+	ESP_SKB_CB(skb)->tmp = tmp;
+	err = crypto_aead_encrypt(req);
+
+	switch (err) {
+	case -EINPROGRESS:
+		goto error;
+
+	case -ENOSPC:
+		err = NET_XMIT_DROP;
+		break;
+
+	case 0:
+		if ((x->props.flags & XFRM_STATE_ESN))
+			esp_output_restore_header(skb);
+	}
+
+	if (sg != dsg)
+		esp_ssg_unref(x, tmp);
+
+	if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+		err = esp_output_tail_tcp(x, skb);
+
+error_free:
+	kfree(tmp);
+error:
+	return err;
+}
+EXPORT_SYMBOL_GPL(esp_output_tail);
+/* Compute padding and trailer sizes, fill in the ESP header fields and run the encryption. */
+static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int alen;
+	int blksize;
+	struct ip_esp_hdr *esph;
+	struct crypto_aead *aead;
+	struct esp_info esp;
+
+	esp.inplace = true;
+
+	esp.proto = *skb_mac_header(skb);
+	*skb_mac_header(skb) = IPPROTO_ESP;
+
+	/* skb is pure payload to encrypt */
+
+	aead = x->data;
+	alen = crypto_aead_authsize(aead);
+
+	esp.tfclen = 0;
+	if (x->tfcpad) {
+		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+		u32 padto;
+
+		padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
+		if (skb->len < padto)
+			esp.tfclen = padto - skb->len;
+	}
+	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+	esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);	/* 2: pad-length and next-header bytes */
+	esp.plen = esp.clen - skb->len - esp.tfclen;
+	esp.tailen = esp.tfclen + esp.plen + alen;
+
+	esp.esph = ip_esp_hdr(skb);
+
+	esp.nfrags = esp_output_head(x, skb, &esp);
+	if (esp.nfrags < 0)
+		return esp.nfrags;
+
+	esph = esp.esph;
+	esph->spi = x->id.spi;
+
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+	esp.seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
+				 ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+
+	skb_push(skb, -skb_network_offset(skb));
+
+	return esp_output_tail(x, skb, &esp);
+}
+/* Trim padding, the two trailer bytes and the ICV; returns the next-header value or -errno. */
+static inline int esp_remove_trailer(struct sk_buff *skb)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct crypto_aead *aead = x->data;
+	int alen, hlen, elen;
+	int padlen, trimlen;
+	__wsum csumdiff;
+	u8 nexthdr[2];
+	int ret;
+
+	alen = crypto_aead_authsize(aead);
+	hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+	elen = skb->len - hlen;
+	/* The two bytes just before the ICV: pad length, then next header. */
+	if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
+		BUG();
+
+	ret = -EINVAL;
+	padlen = nexthdr[0];
+	if (padlen + 2 + alen >= elen) {
+		net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
+				    padlen + 2, elen - alen);
+		goto out;
+	}
+	/* For CHECKSUM_COMPLETE, take the bytes about to be trimmed out of skb->csum first. */
+	trimlen = alen + padlen + 2;
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
+		skb->csum = csum_block_sub(skb->csum, csumdiff,
+					   skb->len - trimlen);
+	}
+	ret = pskb_trim(skb, skb->len - trimlen);
+	if (unlikely(ret))
+		return ret;
+
+	ret = nexthdr[1];
+
+out:
+	return ret;
+}
+/* Post-decryption input processing; returns the next-header value or a negative errno. */
+int esp_input_done2(struct sk_buff *skb, int err)
+{
+	const struct iphdr *iph;
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	struct crypto_aead *aead = x->data;
+	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+	int ihl;
+
+	if (!xo || !(xo->flags & CRYPTO_DONE))
+		kfree(ESP_SKB_CB(skb)->tmp);
+
+	if (unlikely(err))
+		goto out;
+
+	err = esp_remove_trailer(skb);
+	if (unlikely(err < 0))
+		goto out;
+
+	iph = ip_hdr(skb);
+	ihl = iph->ihl * 4;
+
+	if (x->encap) {
+		struct xfrm_encap_tmpl *encap = x->encap;
+		struct tcphdr *th = (void *)(skb_network_header(skb) + ihl);
+		struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
+		__be16 source;
+
+		switch (x->encap->encap_type) {
+		case TCP_ENCAP_ESPINTCP:
+			source = th->source;
+			break;
+		case UDP_ENCAP_ESPINUDP:
+		case UDP_ENCAP_ESPINUDP_NON_IKE:
+			source = uh->source;
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			err = -EINVAL;
+			goto out;
+		}
+
+		/*
+		 * 1) if the NAT-T peer's IP or port changed then
+		 *    advertise the change to the keying daemon.
+		 *    This is an inbound SA, so just compare
+		 *    SRC ports.
+		 */
+		if (iph->saddr != x->props.saddr.a4 ||
+		    source != encap->encap_sport) {
+			xfrm_address_t ipaddr;
+
+			ipaddr.a4 = iph->saddr;
+			km_new_mapping(x, &ipaddr, source);
+
+			/* XXX: perhaps add an extra
+			 * policy check here, to see
+			 * if we should allow or
+			 * reject a packet from a
+			 * different source
+			 * address/port.
+			 */
+		}
+
+		/*
+		 * 2) ignore UDP/TCP checksums in case
+		 *    of NAT-T in Transport Mode, or
+		 *    perform other post-processing fixes
+		 *    as per draft-ietf-ipsec-udp-encaps-06,
+		 *    section 3.1.2
+		 */
+		if (x->props.mode == XFRM_MODE_TRANSPORT)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	/* Pull the ESP header and IV (hlen bytes) off the front. */
+	skb_pull_rcsum(skb, hlen);
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		skb_reset_transport_header(skb);
+	else
+		skb_set_transport_header(skb, -ihl);
+
+	/* RFC4303: Drop dummy packets without any error */
+	if (err == IPPROTO_NONE)
+		err = -EINVAL;
+
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(esp_input_done2);
+
+/*
+ * AEAD completion callback for the non-ESN case: @data is the skb that was
+ * being decrypted.  Finishes ESP input processing and resumes xfrm input
+ * with the outcome.
+ */
+static void esp_input_done(void *data, int err)
+{
+	struct sk_buff *skb = data;
+	int result = esp_input_done2(skb, err);
+
+	xfrm_input_resume(skb, result);
+}
+
+/*
+ * Counterpart of esp_input_set_header(): calls esp_restore_header() and
+ * then pulls the 4 bytes that were pushed in front of the ESP header for
+ * the ESN high sequence bits.
+ */
+static void esp_input_restore_header(struct sk_buff *skb)
+{
+	esp_restore_header(skb, 0);
+	__skb_pull(skb, 4);
+}
+
+/*
+ * Prepare the ESP header for decryption when the state uses extended
+ * sequence numbers (ESN).  The four bytes overwritten by the shifted
+ * header are saved in *@seqhi.  Without ESN this is a no-op.
+ */
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ip_esp_hdr *hdr;
+
+	if (!(x->props.flags & XFRM_STATE_ESN))
+		return;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accommodate the high bits.  We will move it back after
+	 * decryption.
+	 */
+	hdr = skb_push(skb, 4);
+	*seqhi = hdr->spi;
+	hdr->spi = hdr->seq_no;
+	hdr->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+}
+
+/*
+ * AEAD completion callback for the ESN case: undo the header shift done by
+ * esp_input_set_header() before running the common completion path.
+ */
+static void esp_input_done_esn(void *data, int err)
+{
+	struct sk_buff *skb = data;
+
+	esp_input_restore_header(skb);
+	esp_input_done(data, err);
+}
+
+/*
+ * Note: detecting truncated vs. non-truncated authentication data is very
+ * expensive, so we only support truncated data, which is the recommended
+ * and common case.
+ *
+ * Decrypt an inbound ESP packet in place (the same scatterlist is used as
+ * source and destination).  Returns -EINPROGRESS when the AEAD request
+ * completes asynchronously, in which case the completion callback finishes
+ * the packet; otherwise returns the result of esp_input_done2() or a
+ * negative errno from one of the setup steps.
+ */
+static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct crypto_aead *aead = x->data;
+	struct aead_request *req;
+	struct sk_buff *trailer;
+	int ivlen = crypto_aead_ivsize(aead);
+	int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
+	int nfrags;
+	int assoclen;
+	int seqhilen;
+	__be32 *seqhi;
+	void *tmp;
+	u8 *iv;
+	struct scatterlist *sg;
+	int err = -EINVAL;
+
+	if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen))
+		goto out;
+
+	/* Nothing left after the ESP header and IV: reject. */
+	if (elen <= 0)
+		goto out;
+
+	assoclen = sizeof(struct ip_esp_hdr);
+	seqhilen = 0;
+
+	/* With ESN the high sequence bits are part of the associated data. */
+	if (x->props.flags & XFRM_STATE_ESN) {
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
+	/* An uncloned skb that is linear, or that has page frags but no
+	 * frag list, is used as is; everything else goes through
+	 * skb_cow_data() below.
+	 */
+	if (!skb_cloned(skb)) {
+		if (!skb_is_nonlinear(skb)) {
+			nfrags = 1;
+
+			goto skip_cow;
+		} else if (!skb_has_frag_list(skb)) {
+			nfrags = skb_shinfo(skb)->nr_frags;
+			nfrags++;
+
+			goto skip_cow;
+		}
+	}
+
+	err = skb_cow_data(skb, 0, &trailer);
+	if (err < 0)
+		goto out;
+
+	/* A non-negative return value is used as the scatterlist size. */
+	nfrags = err;
+
+skip_cow:
+	err = -ENOMEM;
+	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
+	if (!tmp)
+		goto out;
+
+	/* Remember the scratch buffer so the completion path can free it;
+	 * seqhi, iv, req and sg are all obtained from it.
+	 */
+	ESP_SKB_CB(skb)->tmp = tmp;
+	seqhi = esp_tmp_extra(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_req(aead, iv);
+	sg = esp_req_sg(aead, req);
+
+	esp_input_set_header(skb, seqhi);
+
+	sg_init_table(sg, nfrags);
+	err = skb_to_sgvec(skb, sg, 0, skb->len);
+	if (unlikely(err < 0)) {
+		kfree(tmp);
+		goto out;
+	}
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* The ESN callback additionally restores the shifted header. */
+	if ((x->props.flags & XFRM_STATE_ESN))
+		aead_request_set_callback(req, 0, esp_input_done_esn, skb);
+	else
+		aead_request_set_callback(req, 0, esp_input_done, skb);
+
+	aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
+	aead_request_set_ad(req, assoclen);
+
+	err = crypto_aead_decrypt(req);
+	if (err == -EINPROGRESS)
+		goto out;
+
+	/* Synchronous completion: do here what the callback would do. */
+	if ((x->props.flags & XFRM_STATE_ESN))
+		esp_input_restore_header(skb);
+
+	err = esp_input_done2(skb, err);
+
+out:
+	return err;
+}
+
+/*
+ * ICMP error handler for ESP.  Only "fragmentation needed" destination
+ * unreachable messages and redirects are acted on, and only if an ESP
+ * state matching the embedded header's destination address and SPI
+ * exists.  Always returns 0.
+ */
+static int esp4_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct ip_esp_hdr *esph =
+		(struct ip_esp_hdr *)(skb->data + (iph->ihl << 2));
+	struct net *net = dev_net(skb->dev);
+	struct xfrm_state *x;
+
+	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) {
+		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+			return 0;
+	} else if (icmp_hdr(skb)->type != ICMP_REDIRECT) {
+		return 0;
+	}
+
+	x = xfrm_state_lookup(net, skb->mark,
+			      (const xfrm_address_t *)&iph->daddr,
+			      esph->spi, IPPROTO_ESP, AF_INET);
+	if (!x)
+		return 0;
+
+	if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH)
+		ipv4_redirect(skb, net, 0, IPPROTO_ESP);
+	else
+		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ESP);
+
+	/* Drop the reference taken by the lookup. */
+	xfrm_state_put(x);
+
+	return 0;
+}
+
+/*
+ * State destructor: release the AEAD transform stored in x->data, if one
+ * was ever allocated.
+ */
+static void esp_destroy(struct xfrm_state *x)
+{
+	struct crypto_aead *tfm = x->data;
+
+	if (tfm)
+		crypto_free_aead(tfm);
+}
+
+/*
+ * Set up the transform for a state configured with a combined-mode (AEAD)
+ * algorithm: allocate "<geniv>(<alg_name>)", store it in x->data, then set
+ * the key and the ICV length.
+ *
+ * Returns 0 on success or a negative errno, with a message in @extack.
+ * On failure after allocation the transform is left in x->data.
+ */
+static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
+{
+	char aead_name[CRYPTO_MAX_ALG_NAME];
+	struct crypto_aead *aead;
+	int err;
+
+	/* snprintf() returning >= the buffer size means truncation. */
+	if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
+		     x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) {
+		NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+		return -ENAMETOOLONG;
+	}
+
+	aead = crypto_alloc_aead(aead_name, 0, 0);
+	err = PTR_ERR(aead);
+	if (IS_ERR(aead))
+		goto error;
+
+	x->data = aead;
+
+	/* alg_key_len is in bits; round up to whole bytes. */
+	err = crypto_aead_setkey(aead, x->aead->alg_key,
+				 (x->aead->alg_key_len + 7) / 8);
+	if (err)
+		goto error;
+
+	/* alg_icv_len is in bits as well. */
+	err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8);
+	if (err)
+		goto error;
+
+	return 0;
+
+error:
+	NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+	return err;
+}
+
+/*
+ * Set up the transform for a state configured with separate encryption and
+ * (optional) authentication algorithms.  An "authenc" or, with ESN,
+ * "authencesn" template is allocated, optionally wrapped in the state's
+ * geniv, and keyed with a blob made of an rtattr-encoded parameter header
+ * followed by the authentication key and the encryption key.
+ *
+ * Returns 0 on success or a negative errno.  On failure after allocation
+ * the transform is left in x->data.
+ */
+static int esp_init_authenc(struct xfrm_state *x,
+			    struct netlink_ext_ack *extack)
+{
+	struct crypto_aead *aead;
+	struct crypto_authenc_key_param *param;
+	struct rtattr *rta;
+	char *key;
+	char *p;
+	char authenc_name[CRYPTO_MAX_ALG_NAME];
+	unsigned int keylen;
+	int err;
+
+	err = -ENAMETOOLONG;
+
+	/* "digest_null" stands in when no authentication algorithm is set. */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "%s%sauthencesn(%s,%s)%s",
+			     x->geniv ?: "", x->geniv ? "(" : "",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name,
+			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+			NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+			goto error;
+		}
+	} else {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "%s%sauthenc(%s,%s)%s",
+			     x->geniv ?: "", x->geniv ? "(" : "",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name,
+			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+			NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+			goto error;
+		}
+	}
+
+	aead = crypto_alloc_aead(authenc_name, 0, 0);
+	err = PTR_ERR(aead);
+	if (IS_ERR(aead)) {
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+		goto error;
+	}
+
+	x->data = aead;
+
+	/* Key blob size: parameter attribute + auth key + encryption key,
+	 * with the bit lengths rounded up to bytes.
+	 */
+	keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
+		 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
+	err = -ENOMEM;
+	key = kmalloc(keylen, GFP_KERNEL);
+	if (!key)
+		goto error;
+
+	/* The blob starts with an rtattr carrying the key parameters. */
+	p = key;
+	rta = (void *)p;
+	rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+	rta->rta_len = RTA_LENGTH(sizeof(*param));
+	param = RTA_DATA(rta);
+	p += RTA_SPACE(sizeof(*param));
+
+	if (x->aalg) {
+		struct xfrm_algo_desc *aalg_desc;
+
+		memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
+		p += (x->aalg->alg_key_len + 7) / 8;
+
+		aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+		BUG_ON(!aalg_desc);
+
+		/* The transform's ICV size must match the algorithm
+		 * description before the truncated length is applied.
+		 */
+		err = -EINVAL;
+		if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
+		    crypto_aead_authsize(aead)) {
+			NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+			goto free_key;
+		}
+
+		err = crypto_aead_setauthsize(
+			aead, x->aalg->alg_trunc_len / 8);
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+			goto free_key;
+		}
+	}
+
+	/* The encryption key is the last element of the blob. */
+	param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
+	memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);
+
+	err = crypto_aead_setkey(aead, key, keylen);
+
+free_key:
+	/* The blob holds key material, hence kfree_sensitive(). */
+	kfree_sensitive(key);
+
+error:
+	return err;
+}
+
+/*
+ * xfrm_type init_state hook for ESP.  Allocates the AEAD transform (either
+ * from the AEAD or from the encryption/authentication configuration) and
+ * computes the header and trailer length properties of the state.
+ *
+ * Returns 0 on success or a negative errno, with a message in @extack.
+ */
+static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
+{
+	struct crypto_aead *aead;
+	u32 align;
+	int err;
+
+	x->data = NULL;
+
+	if (x->aead) {
+		err = esp_init_aead(x, extack);
+	} else if (x->ealg) {
+		err = esp_init_authenc(x, extack);
+	} else {
+		NL_SET_ERR_MSG(extack, "ESP: AEAD or CRYPT must be provided");
+		err = -EINVAL;
+	}
+
+	if (err)
+		goto error;
+
+	aead = x->data;
+
+	/* Base header: ESP header plus IV, then mode specific additions. */
+	x->props.header_len = sizeof(struct ip_esp_hdr) +
+			      crypto_aead_ivsize(aead);
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		x->props.header_len += sizeof(struct iphdr);
+	else if (x->props.mode == XFRM_MODE_BEET && x->sel.family != AF_INET6)
+		x->props.header_len += IPV4_BEET_PHMAXLEN;
+	if (x->encap) {
+		struct xfrm_encap_tmpl *encap = x->encap;
+
+		/* Account for the encapsulation header in front of ESP. */
+		switch (encap->encap_type) {
+		default:
+			NL_SET_ERR_MSG(extack, "Unsupported encapsulation type for ESP");
+			err = -EINVAL;
+			goto error;
+		case UDP_ENCAP_ESPINUDP:
+			x->props.header_len += sizeof(struct udphdr);
+			break;
+		case UDP_ENCAP_ESPINUDP_NON_IKE:
+			x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
+			break;
+#ifdef CONFIG_INET_ESPINTCP
+		case TCP_ENCAP_ESPINTCP:
+			/* only the length field, TCP encap is done by
+			 * the socket
+			 */
+			x->props.header_len += 2;
+			break;
+#endif
+		}
+	}
+
+	/* Trailer: block size rounded up to a multiple of 4, plus one byte,
+	 * plus the ICV.
+	 */
+	align = ALIGN(crypto_aead_blocksize(aead), 4);
+	x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
+
+error:
+	return err;
+}
+
+/* Receive callback for the ESP protocol handler: nothing to do, returns 0. */
+static int esp4_rcv_cb(struct sk_buff *skb, int err)
+{
+	return 0;
+}
+
+/* xfrm type descriptor wiring the ESP state and packet hooks defined above. */
+static const struct xfrm_type esp_type =
+{
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_ESP,
+	.flags		= XFRM_TYPE_REPLAY_PROT,
+	.init_state	= esp_init_state,
+	.destructor	= esp_destroy,
+	.input		= esp_input,
+	.output		= esp_output,
+};
+
+/* IPv4 protocol handler registered for IPPROTO_ESP in esp4_init(). */
+static struct xfrm4_protocol esp4_protocol = {
+	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	esp4_rcv_cb,
+	.err_handler	=	esp4_err,
+	.priority	=	0,
+};
+
+/*
+ * Module init: register the ESP xfrm type and then the IPv4 protocol
+ * handler.  If the second step fails the first one is undone.  Any failure
+ * is reported as -EAGAIN.
+ */
+static int __init esp4_init(void)
+{
+	int rc;
+
+	rc = xfrm_register_type(&esp_type, AF_INET);
+	if (rc < 0) {
+		pr_info("%s: can't add xfrm type\n", __func__);
+		return -EAGAIN;
+	}
+
+	rc = xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP);
+	if (rc >= 0)
+		return 0;
+
+	pr_info("%s: can't add protocol\n", __func__);
+	xfrm_unregister_type(&esp_type, AF_INET);
+	return -EAGAIN;
+}
+
+/*
+ * Module exit: undo esp4_init() in reverse order.  A failure to remove the
+ * protocol handler is only logged; the xfrm type is unregistered anyway.
+ */
+static void __exit esp4_fini(void)
+{
+	if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
+		pr_info("%s: can't remove protocol\n", __func__);
+	xfrm_unregister_type(&esp_type, AF_INET);
+}
+
+module_init(esp4_init);
+module_exit(esp4_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_ESP);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
new file mode 100644
index 0000000000..10e96ed6c9
--- /dev/null
+++ b/net/ipv4/esp4_offload.c
@@ -0,0 +1,386 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * Copyright (C) 2016 secunet Security Networks AG
+ * Author: Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * ESP GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/gro.h>
+#include <net/gso.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/udp.h>
+
+/*
+ * GRO receive hook for ESP.  Unless the packet is already marked
+ * CRYPTO_DONE, the xfrm state matching its SPI is looked up and appended
+ * to the packet's secpath; the packet is then handed to xfrm_input().
+ *
+ * Returns ERR_PTR(-EINPROGRESS) once xfrm_input() has been called, or NULL
+ * when the packet cannot be handled here.
+ */
+static struct sk_buff *esp4_gro_receive(struct list_head *head,
+					struct sk_buff *skb)
+{
+	int offset = skb_gro_offset(skb);
+	struct xfrm_offload *xo;
+	struct xfrm_state *x;
+	__be32 seq;
+	__be32 spi;
+
+	/* Advance skb->data by the GRO offset. */
+	if (!pskb_pull(skb, offset))
+		return NULL;
+
+	if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0)
+		goto out;
+
+	xo = xfrm_offload(skb);
+	if (!xo || !(xo->flags & CRYPTO_DONE)) {
+		struct sec_path *sp = secpath_set(skb);
+
+		if (!sp)
+			goto out;
+
+		/* No room left for another state in the secpath. */
+		if (sp->len == XFRM_MAX_DEPTH)
+			goto out_reset;
+
+		x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
+				      (xfrm_address_t *)&ip_hdr(skb)->daddr,
+				      spi, IPPROTO_ESP, AF_INET);
+		if (!x)
+			goto out_reset;
+
+		skb->mark = xfrm_smark_get(skb->mark, x);
+
+		/* The secpath now holds the state found by the lookup. */
+		sp->xvec[sp->len++] = x;
+		sp->olen++;
+
+		xo = xfrm_offload(skb);
+		if (!xo)
+			goto out_reset;
+	}
+
+	xo->flags |= XFRM_GRO;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+	XFRM_SPI_SKB_CB(skb)->seq = seq;
+
+	/* We don't need to handle errors from xfrm_input, it does all
+	 * the error handling and frees the resources on error. */
+	xfrm_input(skb, IPPROTO_ESP, spi, -2);
+
+	return ERR_PTR(-EINPROGRESS);
+out_reset:
+	secpath_reset(skb);
+out:
+	/* Undo the initial pull and flag the packet for flush. */
+	skb_push(skb, offset);
+	NAPI_GRO_CB(skb)->same_flow = 0;
+	NAPI_GRO_CB(skb)->flush = 1;
+
+	return NULL;
+}
+
+/*
+ * xfrm_type_offload encap hook: fill in the SPI and low sequence number of
+ * the ESP header, write IPPROTO_ESP to the byte at skb_mac_header(), and
+ * record the original IP protocol in the offload context.
+ */
+static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ip_esp_hdr *esph;
+	struct iphdr *iph = ip_hdr(skb);
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	int proto = iph->protocol;
+
+	/* Pushing by the negated network offset moves skb->data to the
+	 * network header.
+	 */
+	skb_push(skb, -skb_network_offset(skb));
+	esph = ip_esp_hdr(skb);
+	*skb_mac_header(skb) = IPPROTO_ESP;
+
+	esph->spi = x->id.spi;
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+	xo->proto = proto;
+}
+
+/*
+ * Segment a tunnel mode packet: the inner packet is segmented as an
+ * Ethernet payload whose type follows the inner address family.
+ */
+static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x,
+						struct sk_buff *skb,
+						netdev_features_t features)
+{
+	__be16 type = htons(ETH_P_IP);
+
+	if (x->inner_mode.family == AF_INET6)
+		type = htons(ETH_P_IPV6);
+
+	return skb_eth_gso_segment(skb, features, type);
+}
+
+/*
+ * Segment a transport mode packet: move the transport header past the
+ * xfrm header and delegate to the GSO handler of the protocol recorded in
+ * the offload context.  Returns ERR_PTR(-EINVAL) if there is no handler.
+ */
+static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
+						   struct sk_buff *skb,
+						   netdev_features_t features)
+{
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	const struct net_offload *ops;
+
+	skb->transport_header += x->props.header_len;
+
+	ops = rcu_dereference(inet_offloads[xo->proto]);
+	if (unlikely(!ops || !ops->callbacks.gso_segment))
+		return ERR_PTR(-EINVAL);
+
+	return ops->callbacks.gso_segment(skb, features);
+}
+
+/*
+ * Segment a BEET mode packet.  The transport header is moved past the
+ * xfrm header and then adjusted for the BEET pseudo header (IPv4 selector)
+ * or the IPv6 extension headers (IPv6 selector) before the inner
+ * protocol's GSO handler is invoked.
+ *
+ * Returns the segment list or ERR_PTR(-EINVAL) if no handler is found.
+ */
+static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x,
+					      struct sk_buff *skb,
+					      netdev_features_t features)
+{
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	const struct net_offload *ops;
+	u8 proto = xo->proto;
+
+	skb->transport_header += x->props.header_len;
+
+	if (x->sel.family != AF_INET6) {
+		if (proto == IPPROTO_BEETPH) {
+			struct ip_beet_phdr *ph =
+				(struct ip_beet_phdr *)skb->data;
+
+			/* hdrlen counts 8-byte units; the real protocol is
+			 * taken from the pseudo header.
+			 */
+			skb->transport_header += ph->hdrlen * 8;
+			proto = ph->nexthdr;
+		} else {
+			/* No pseudo header present: take back the room that
+			 * header_len accounts for it.
+			 */
+			skb->transport_header -= IPV4_BEET_PHMAXLEN;
+		}
+	} else {
+		__be16 frag;
+
+		skb->transport_header +=
+			ipv6_skip_exthdr(skb, 0, &proto, &frag);
+		if (proto == IPPROTO_TCP)
+			skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+	}
+
+	if (proto == IPPROTO_IPV6)
+		skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
+
+	/* Advance skb->data to the transport header for the handler. */
+	__skb_pull(skb, skb_transport_offset(skb));
+	ops = rcu_dereference(inet_offloads[proto]);
+	if (likely(ops && ops->callbacks.gso_segment))
+		segs = ops->callbacks.gso_segment(skb, features);
+
+	return segs;
+}
+
+/*
+ * Dispatch GSO segmentation according to the state's outer mode.
+ * Returns ERR_PTR(-EOPNOTSUPP) for any mode other than tunnel, transport
+ * or BEET.
+ */
+static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x,
+						    struct sk_buff *skb,
+						    netdev_features_t features)
+{
+	switch (x->outer_mode.encap) {
+	case XFRM_MODE_BEET:
+		return xfrm4_beet_gso_segment(x, skb, features);
+	case XFRM_MODE_TRANSPORT:
+		return xfrm4_transport_gso_segment(x, skb, features);
+	case XFRM_MODE_TUNNEL:
+		return xfrm4_tunnel_gso_segment(x, skb, features);
+	default:
+		break;
+	}
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+/*
+ * GSO segment hook for ESP.  Validates that the packet carries an offload
+ * context for ESP GSO and that its SPI matches the last state in the
+ * secpath, pulls the ESP header and IV, trims the feature set to what the
+ * device can do for ESP and hands over to the outer-mode segmenter.
+ *
+ * Returns the segment list or an ERR_PTR() on failure.
+ */
+static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
+					netdev_features_t features)
+{
+	struct xfrm_state *x;
+	struct ip_esp_hdr *esph;
+	struct crypto_aead *aead;
+	netdev_features_t esp_features = features;
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	struct sec_path *sp;
+	unsigned int hdr_len;
+
+	if (!xo)
+		return ERR_PTR(-EINVAL);
+
+	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP))
+		return ERR_PTR(-EINVAL);
+
+	sp = skb_sec_path(skb);
+	x = sp->xvec[sp->len - 1];
+	aead = x->data;
+	hdr_len = sizeof(*esph) + crypto_aead_ivsize(aead);
+
+	/* Make sure the ESP header and IV are in the linear area before
+	 * any header field is read.
+	 */
+	if (!pskb_may_pull(skb, hdr_len))
+		return ERR_PTR(-EINVAL);
+
+	/* Fetch the header pointer only after the pull, which may have
+	 * reallocated the skb head.
+	 */
+	esph = ip_esp_hdr(skb);
+	if (esph->spi != x->id.spi)
+		return ERR_PTR(-EINVAL);
+
+	__skb_pull(skb, hdr_len);
+
+	skb->encap_hdr_csum = 1;
+
+	/* Without ESP support for this state on this device, drop
+	 * scatter-gather and all checksum offloads; with ESP but no ESP TX
+	 * checksum support, drop only the checksum offloads.
+	 */
+	if ((!(skb->dev->gso_partial_features & NETIF_F_HW_ESP) &&
+	     !(features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev)
+		esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK |
+					    NETIF_F_SCTP_CRC);
+	else if (!(features & NETIF_F_HW_ESP_TX_CSUM) &&
+		 !(skb->dev->gso_partial_features & NETIF_F_HW_ESP_TX_CSUM))
+		esp_features = features & ~(NETIF_F_CSUM_MASK |
+					    NETIF_F_SCTP_CRC);
+
+	xo->flags |= XFRM_GSO_SEGMENT;
+
+	return xfrm4_outer_mode_gso_segment(x, skb, esp_features);
+}
+
+/*
+ * xfrm_type_offload input_tail hook: make sure the ESP header and IV are
+ * in the linear area, then run the post-decryption input stage.
+ *
+ * Returns the result of esp_input_done2(), or -EINVAL if the header cannot
+ * be pulled.
+ */
+static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	struct crypto_aead *aead = x->data;
+	unsigned int hdr_len = sizeof(struct ip_esp_hdr) +
+			       crypto_aead_ivsize(aead);
+
+	if (!pskb_may_pull(skb, hdr_len))
+		return -EINVAL;
+
+	/* Reset the checksum state unless the packet is marked CRYPTO_DONE. */
+	if (!(xo->flags & CRYPTO_DONE))
+		skb->ip_summed = CHECKSUM_NONE;
+
+	return esp_input_done2(skb, 0);
+}
+
+/*
+ * xfrm_type_offload xmit hook for ESP.
+ *
+ * Software crypto (CRYPTO_FALLBACK) is used when neither @features nor the
+ * device's gso_partial_features advertise NETIF_F_HW_ESP, or when the
+ * state is offloaded to a device other than skb->dev.  Otherwise the
+ * packet is only prepared and marked XFRM_XMIT.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features_t features)
+{
+	int err;
+	int alen;
+	int blksize;
+	struct xfrm_offload *xo;
+	struct ip_esp_hdr *esph;
+	struct crypto_aead *aead;
+	struct esp_info esp;
+	bool hw_offload = true;
+	__u32 seq;
+
+	esp.inplace = true;
+
+	xo = xfrm_offload(skb);
+
+	if (!xo)
+		return -EINVAL;
+
+	if ((!(features & NETIF_F_HW_ESP) &&
+	     !(skb->dev->gso_partial_features & NETIF_F_HW_ESP)) ||
+	    x->xso.dev != skb->dev) {
+		xo->flags |= CRYPTO_FALLBACK;
+		hw_offload = false;
+	}
+
+	esp.proto = xo->proto;
+
+	/* skb is pure payload to encrypt */
+
+	aead = x->data;
+	alen = crypto_aead_authsize(aead);
+
+	esp.tfclen = 0;
+	/* XXX: Add support for tfc padding here. */
+
+	/* clen: payload plus the two trailer bytes, rounded up to the block
+	 * size; plen: the resulting padding including those two bytes.
+	 */
+	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+	esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);
+	esp.plen = esp.clen - skb->len - esp.tfclen;
+	esp.tailen = esp.tfclen + esp.plen + alen;
+
+	esp.esph = ip_esp_hdr(skb);
+
+
+	/* esp_output_head() is skipped only for GSO packets handled by
+	 * hardware.
+	 */
+	if (!hw_offload || !skb_is_gso(skb)) {
+		esp.nfrags = esp_output_head(x, skb, &esp);
+		if (esp.nfrags < 0)
+			return esp.nfrags;
+	}
+
+	seq = xo->seq.low;
+
+	esph = esp.esph;
+	esph->spi = x->id.spi;
+
+	skb_push(skb, -skb_network_offset(skb));
+
+	if (xo->flags & XFRM_GSO_SEGMENT) {
+		esph->seq_no = htonl(seq);
+
+		/* Advance by one packet, or by the number of GSO segments. */
+		if (!skb_is_gso(skb))
+			xo->seq.low++;
+		else
+			xo->seq.low += skb_shinfo(skb)->gso_segs;
+	}
+
+	/* The low half wrapped around: carry into the high half. */
+	if (xo->seq.low < seq)
+		xo->seq.hi++;
+
+	esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32));
+
+	ip_hdr(skb)->tot_len = htons(skb->len);
+	ip_send_check(ip_hdr(skb));
+
+	if (hw_offload) {
+		if (!skb_ext_add(skb, SKB_EXT_SEC_PATH))
+			return -ENOMEM;
+
+		/* Look the offload context up again after skb_ext_add(). */
+		xo = xfrm_offload(skb);
+		if (!xo)
+			return -EINVAL;
+
+		xo->flags |= XFRM_XMIT;
+		return 0;
+	}
+
+	/* Software fallback: encrypt now. */
+	err = esp_output_tail(x, skb, &esp);
+	if (err)
+		return err;
+
+	secpath_reset(skb);
+
+	if (skb_needs_linearize(skb, skb->dev->features) &&
+	    __skb_linearize(skb))
+		return -ENOMEM;
+	return 0;
+}
+
+/* GRO/GSO callbacks registered for IPPROTO_ESP in esp4_offload_init(). */
+static const struct net_offload esp4_offload = {
+	.callbacks = {
+		.gro_receive = esp4_gro_receive,
+		.gso_segment = esp4_gso_segment,
+	},
+};
+
+/* xfrm type offload descriptor wiring the ESP offload hooks defined above. */
+static const struct xfrm_type_offload esp_type_offload = {
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_ESP,
+	.input_tail	= esp_input_tail,
+	.xmit		= esp_xmit,
+	.encap		= esp4_gso_encap,
+};
+
+/*
+ * Module init: register the ESP xfrm type offload and then the GRO/GSO
+ * callbacks for IPPROTO_ESP.  If the second registration fails, the first
+ * one is undone so that no reference to this module is left behind when
+ * init returns an error.
+ *
+ * Returns 0 on success, -EAGAIN if the type offload cannot be registered,
+ * or the error returned by inet_add_offload().
+ */
+static int __init esp4_offload_init(void)
+{
+	int err;
+
+	if (xfrm_register_type_offload(&esp_type_offload, AF_INET) < 0) {
+		pr_info("%s: can't add xfrm type offload\n", __func__);
+		return -EAGAIN;
+	}
+
+	err = inet_add_offload(&esp4_offload, IPPROTO_ESP);
+	if (err < 0) {
+		pr_info("%s: can't add protocol offload\n", __func__);
+		xfrm_unregister_type_offload(&esp_type_offload, AF_INET);
+	}
+
+	return err;
+}
+
+/* Module exit: remove both registrations made by esp4_offload_init(). */
+static void __exit esp4_offload_exit(void)
+{
+	xfrm_unregister_type_offload(&esp_type_offload, AF_INET);
+	inet_del_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+module_init(esp4_offload_init);
+module_exit(esp4_offload_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
+MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP);
+MODULE_DESCRIPTION("IPV4 GSO/GRO offload support");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
new file mode 100644
index 0000000000..390f4be7f7
--- /dev/null
+++ b/net/ipv4/fib_frontend.c
@@ -0,0 +1,1663 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 Forwarding Information Base: FIB frontend.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <net/inet_dscp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/ip_fib.h>
+#include <net/nexthop.h>
+#include <net/rtnetlink.h>
+#include <net/xfrm.h>
+#include <net/l3mdev.h>
+#include <net/lwtunnel.h>
+#include <trace/events/fib.h>
+
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+
+/*
+ * Per-netns setup when multiple tables are not configured: create the main
+ * table, create the local table with the main table passed as its alias,
+ * and hash both.  Returns 0 or -ENOMEM.
+ */
+static int __net_init fib4_rules_init(struct net *net)
+{
+	struct fib_table *main_tb, *local_tb;
+
+	main_tb = fib_trie_table(RT_TABLE_MAIN, NULL);
+	if (!main_tb)
+		return -ENOMEM;
+
+	local_tb = fib_trie_table(RT_TABLE_LOCAL, main_tb);
+	if (!local_tb) {
+		fib_free_table(main_tb);
+		return -ENOMEM;
+	}
+
+	hlist_add_head_rcu(&local_tb->tb_hlist,
+			   &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
+	hlist_add_head_rcu(&main_tb->tb_hlist,
+			   &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
+	return 0;
+}
+#else
+
+/*
+ * Return the FIB table with the given @id, creating and hashing it if it
+ * does not exist yet.  An @id of 0 selects RT_TABLE_MAIN.
+ *
+ * Returns NULL if the table cannot be allocated.
+ */
+struct fib_table *fib_new_table(struct net *net, u32 id)
+{
+	struct fib_table *tb, *alias = NULL;
+	unsigned int h;
+
+	if (id == 0)
+		id = RT_TABLE_MAIN;
+	tb = fib_get_table(net, id);
+	if (tb)
+		return tb;
+
+	/* As long as no custom rules exist, the local table is created
+	 * with the main table passed as alias (created on demand here).
+	 */
+	if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
+		alias = fib_new_table(net, RT_TABLE_MAIN);
+
+	tb = fib_trie_table(id, alias);
+	if (!tb)
+		return NULL;
+
+	/* Main and default tables also have dedicated pointers in the netns. */
+	switch (id) {
+	case RT_TABLE_MAIN:
+		rcu_assign_pointer(net->ipv4.fib_main, tb);
+		break;
+	case RT_TABLE_DEFAULT:
+		rcu_assign_pointer(net->ipv4.fib_default, tb);
+		break;
+	default:
+		break;
+	}
+
+	h = id & (FIB_TABLE_HASHSZ - 1);
+	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
+	return tb;
+}
+EXPORT_SYMBOL_GPL(fib_new_table);
+
+/*
+ * Look up the FIB table with the given @id; 0 selects RT_TABLE_MAIN.
+ * Returns NULL if no such table has been created.
+ *
+ * The caller must hold either rtnl or the rcu read lock.
+ */
+struct fib_table *fib_get_table(struct net *net, u32 id)
+{
+	struct hlist_head *bucket;
+	struct fib_table *tb;
+
+	if (!id)
+		id = RT_TABLE_MAIN;
+
+	bucket = &net->ipv4.fib_table_hash[id & (FIB_TABLE_HASHSZ - 1)];
+	hlist_for_each_entry_rcu(tb, bucket, tb_hlist,
+				 lockdep_rtnl_is_held()) {
+		if (tb->tb_id == id)
+			return tb;
+	}
+
+	return NULL;
+}
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+/*
+ * Substitute @new for @old in the table hash.  With multiple tables
+ * configured, the dedicated main/default table pointers are updated as
+ * well when @new is one of those tables.  @old is not freed here.
+ */
+static void fib_replace_table(struct net *net, struct fib_table *old,
+			      struct fib_table *new)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	switch (new->tb_id) {
+	case RT_TABLE_MAIN:
+		rcu_assign_pointer(net->ipv4.fib_main, new);
+		break;
+	case RT_TABLE_DEFAULT:
+		rcu_assign_pointer(net->ipv4.fib_default, new);
+		break;
+	default:
+		break;
+	}
+
+#endif
+	/* replace the old table in the hlist */
+	hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
+}
+
+/*
+ * Split the local table from the main table if the two are still merged.
+ * A standalone local table is built with fib_trie_unmerge() and swapped in
+ * for the old one, after which the main table is flushed with
+ * fib_table_flush_external().
+ *
+ * Returns 0 on success, including when there is nothing to do, or -ENOMEM.
+ */
+int fib_unmerge(struct net *net)
+{
+	struct fib_table *old, *new, *main_table;
+
+	/* attempt to fetch local table if it has been allocated */
+	old = fib_get_table(net, RT_TABLE_LOCAL);
+	if (!old)
+		return 0;
+
+	new = fib_trie_unmerge(old);
+	if (!new)
+		return -ENOMEM;
+
+	/* table is already unmerged */
+	if (new == old)
+		return 0;
+
+	/* replace merged table with clean table */
+	fib_replace_table(net, old, new);
+	fib_free_table(old);
+
+	/* attempt to fetch main table if it has been allocated */
+	main_table = fib_get_table(net, RT_TABLE_MAIN);
+	if (!main_table)
+		return 0;
+
+	/* flush local entries from main table */
+	fib_table_flush_external(main_table);
+
+	return 0;
+}
+
+/*
+ * Flush every FIB table of @net and, if anything was flushed, the routing
+ * cache as well.
+ */
+void fib_flush(struct net *net)
+{
+	unsigned int bucket;
+	int nflushed = 0;
+
+	for (bucket = 0; bucket < FIB_TABLE_HASHSZ; bucket++) {
+		struct hlist_head *head = &net->ipv4.fib_table_hash[bucket];
+		struct hlist_node *next;
+		struct fib_table *tb;
+
+		hlist_for_each_entry_safe(tb, next, head, tb_hlist)
+			nflushed += fib_table_flush(net, tb, false);
+	}
+
+	if (!nflushed)
+		return;
+
+	rt_cache_flush(net);
+}
+
+/*
+ * Find address type as if only "dev" was present in the system. If
+ * dev is NULL then all interfaces are taken into consideration.
+ *
+ * The lookup is done in table @tb_id.  Returns RTN_BROADCAST for zeronet
+ * and limited-broadcast addresses and when the table does not exist,
+ * RTN_MULTICAST for multicast addresses, the type of the matching route
+ * if one is found (and its first nexthop uses @dev when @dev is given),
+ * and RTN_UNICAST otherwise.
+ */
+static inline unsigned int __inet_dev_addr_type(struct net *net,
+						const struct net_device *dev,
+						__be32 addr, u32 tb_id)
+{
+	struct flowi4		fl4 = { .daddr = addr };
+	struct fib_result	res;
+	unsigned int ret = RTN_BROADCAST;
+	struct fib_table *table;
+
+	/* These classes are decided from the address alone. */
+	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
+		return RTN_BROADCAST;
+	if (ipv4_is_multicast(addr))
+		return RTN_MULTICAST;
+
+	rcu_read_lock();
+
+	table = fib_get_table(net, tb_id);
+	if (table) {
+		ret = RTN_UNICAST;
+		if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
+			struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0);
+
+			/* Only the first nexthop is compared against dev. */
+			if (!dev || dev == nhc->nhc_dev)
+				ret = res.type;
+		}
+	}
+
+	rcu_read_unlock();
+	return ret;
+}
+
+/* Address type of @addr in table @tb_id, considering all interfaces. */
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
+{
+	return __inet_dev_addr_type(net, NULL, addr, tb_id);
+}
+EXPORT_SYMBOL(inet_addr_type_table);
+
+/* Address type of @addr in the local table, considering all interfaces. */
+unsigned int inet_addr_type(struct net *net, __be32 addr)
+{
+	return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
+}
+EXPORT_SYMBOL(inet_addr_type);
+
+/*
+ * Address type of @addr as seen from @dev only.  The lookup uses the table
+ * returned by l3mdev_fib_table() for @dev, or the local table when that
+ * returns 0.
+ */
+unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
+				__be32 addr)
+{
+	u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
+
+	return __inet_dev_addr_type(net, dev, addr, rt_table);
+}
+EXPORT_SYMBOL(inet_dev_addr_type);
+
+/* inet_addr_type with dev == NULL but using the table from a dev
+ * if one is associated: @dev only selects the table (via
+ * l3mdev_fib_table(), falling back to the local table) and is not used
+ * to restrict the match.
+ */
+unsigned int inet_addr_type_dev_table(struct net *net,
+				      const struct net_device *dev,
+				      __be32 addr)
+{
+	u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
+
+	return __inet_dev_addr_type(net, NULL, addr, rt_table);
+}
+EXPORT_SYMBOL(inet_addr_type_dev_table);
+
+/*
+ * Compute the "specific destination" address for a received packet.
+ *
+ * If the packet's route is local and neither broadcast nor multicast, this
+ * is the packet's own destination address.  Otherwise a route back to the
+ * packet's source is looked up and its preferred source is returned; if
+ * that lookup fails, or the source is in the zero network, an address is
+ * selected on the receiving device with inet_select_addr().
+ */
+__be32 fib_compute_spec_dst(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
+	struct fib_result res;
+	struct rtable *rt;
+	struct net *net;
+	int scope;
+
+	rt = skb_rtable(skb);
+	if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
+	    RTCF_LOCAL)
+		return ip_hdr(skb)->daddr;
+
+	in_dev = __in_dev_get_rcu(dev);
+
+	net = dev_net(dev);
+
+	scope = RT_SCOPE_UNIVERSE;
+	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+		/* The packet mark is used for the lookup only when the
+		 * device has src_valid_mark enabled.
+		 */
+		bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
+		struct flowi4 fl4 = {
+			.flowi4_iif = LOOPBACK_IFINDEX,
+			.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
+			.daddr = ip_hdr(skb)->saddr,
+			.flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK,
+			.flowi4_scope = scope,
+			.flowi4_mark = vmark ? skb->mark : 0,
+		};
+		if (!fib_lookup(net, &fl4, &res, 0))
+			return fib_result_prefsrc(net, &res);
+	} else {
+		scope = RT_SCOPE_LINK;
+	}
+
+	return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
+}
+
+/*
+ * Return true if a nexthop of @fi uses @dev.
+ *
+ * With CONFIG_IP_ROUTE_MULTIPATH every path of @fi is checked (or the
+ * nexthop object when fi->nh is set); without it only the device of the
+ * first nexthop is compared.
+ */
+bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
+{
+	bool dev_match = false;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (unlikely(fi->nh)) {
+		dev_match = nexthop_uses_dev(fi->nh, dev);
+	} else {
+		int ret;
+
+		/* ret is used as the path index here. */
+		for (ret = 0; ret < fib_info_num_path(fi); ret++) {
+			const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
+
+			if (nhc_l3mdev_matches_dev(nhc, dev)) {
+				dev_match = true;
+				break;
+			}
+		}
+	}
+#else
+	if (fib_info_nhc(fi, 0)->nhc_dev == dev)
+		dev_match = true;
+#endif
+
+	return dev_match;
+}
+EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
+
+/* Given (packet source, input interface) and optional (dst, oif, tos):
+ * - (main) check, that source is valid i.e. not broadcast or our local
+ *   address.
+ * - figure out what "logical" interface this packet arrived
+ *   and calculate "specific destination" address.
+ * - check, that packet arrived from expected physical interface.
+ * called with rcu_read_lock()
+ *
+ * Returns 1 or 0 (whether the scope of the nexthop used to reach the
+ * source is at least RT_SCOPE_HOST) when the source is accepted, -EINVAL
+ * when the route to the source has an unacceptable type, and -EXDEV when
+ * reverse path filtering (@rpf) rejects the packet.
+ */
+static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+				 u8 tos, int oif, struct net_device *dev,
+				 int rpf, struct in_device *idev, u32 *itag)
+{
+	struct net *net = dev_net(dev);
+	struct flow_keys flkeys;
+	int ret, no_addr;
+	struct fib_result res;
+	struct flowi4 fl4;
+	bool dev_match;
+
+	/* Build the reverse flow: the packet's source becomes the lookup
+	 * destination and vice versa.
+	 */
+	fl4.flowi4_oif = 0;
+	fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev);
+	fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+	fl4.daddr = src;
+	fl4.saddr = dst;
+	fl4.flowi4_tos = tos;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_tun_key.tun_id = 0;
+	fl4.flowi4_flags = 0;
+	fl4.flowi4_uid = sock_net_uid(net, NULL);
+	fl4.flowi4_multipath_hash = 0;
+
+	/* Remember whether the input device has no address configured. */
+	no_addr = idev->ifa_list == NULL;
+
+	fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
+	if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) {
+		fl4.flowi4_proto = 0;
+		fl4.fl4_sport = 0;
+		fl4.fl4_dport = 0;
+	} else {
+		/* Dissected ports are reversed along with the addresses. */
+		swap(fl4.fl4_sport, fl4.fl4_dport);
+	}
+
+	if (fib_lookup(net, &fl4, &res, 0))
+		goto last_resort;
+	/* Local routes are only acceptable with accept_local enabled. */
+	if (res.type != RTN_UNICAST &&
+	    (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
+		goto e_inval;
+	fib_combine_itag(itag, &res);
+
+	dev_match = fib_info_nh_uses_dev(res.fi, dev);
+	/* This is not common, loopback packets retain skb_dst so normally they
+	 * would not even hit this slow path.
+	 */
+	dev_match = dev_match || (res.type == RTN_LOCAL &&
+				  dev == net->loopback_dev);
+	if (dev_match) {
+		ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
+		return ret;
+	}
+	if (no_addr)
+		goto last_resort;
+	/* rpf == 1 fails here; other non-zero values go on to the second
+	 * lookup below.
+	 */
+	if (rpf == 1)
+		goto e_rpf;
+	/* Retry with the lookup bound to the input device. */
+	fl4.flowi4_oif = dev->ifindex;
+
+	ret = 0;
+	if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
+		if (res.type == RTN_UNICAST)
+			ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
+	}
+	return ret;
+
+last_resort:
+	if (rpf)
+		goto e_rpf;
+	*itag = 0;
+	return 0;
+
+e_inval:
+	return -EINVAL;
+e_rpf:
+	return -EXDEV;
+}
+
+/* Validate the source address of @skb.  rp_filter is ignored (treated as 0)
+ * for packets protected by IPsec.
+ */
+int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+			u8 tos, int oif, struct net_device *dev,
+			struct in_device *idev, u32 *itag)
+{
+	struct net *net = dev_net(dev);
+	int rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
+
+	/* The full lookup is needed when rp_filter is on, a tclassid user
+	 * exists, or @oif is @dev itself with TX redirects enabled.
+	 */
+	if (rpf || fib_num_tclassid_users(net) ||
+	    (dev->ifindex == oif && IN_DEV_TX_REDIRECTS(idev)))
+		goto full_check;
+	if (!IN_DEV_ACCEPT_LOCAL(idev)) {
+		/* with custom local routes in place, checking local addresses
+		 * only will be too optimistic, with custom rules, checking
+		 * local addresses only can be too strict, e.g. due to vrf
+		 */
+		if (net->ipv4.fib_has_custom_local_routes ||
+		    fib4_has_custom_rules(net))
+			goto full_check;
+		/* A source that is an address of this netns is martian. */
+		if (inet_lookup_ifaddr_rcu(net, src))
+			return -EINVAL;
+	}
+	*itag = 0;
+	return 0;
+full_check:
+	return __fib_validate_source(skb, src, dst, tos, oif, dev, rpf, idev, itag);
+}
+
+static inline __be32 sk_extract_addr(struct sockaddr *addr)
+{	/* Treats @addr as a struct sockaddr_in; the family is not checked here. */
+	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+}
+
+/* Put u32 attr @type=@value at byte offset @len of @mx; return next offset. */
+static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
+{
+	struct nlattr *attr = (struct nlattr *)((char *)mx + len);
+
+	attr->nla_len = nla_attr_size(sizeof(u32));
+	attr->nla_type = type;
+	*(u32 *)nla_data(attr) = value;
+	/* The returned offset is @len plus nla_total_size() of one u32. */
+	return len + nla_total_size(sizeof(u32));
+}
+
+static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
+				 struct fib_config *cfg)
+{	/* Translate an ioctl struct rtentry into @cfg; returns 0 or -errno. */
+	__be32 addr;
+	int plen;
+
+	memset(cfg, 0, sizeof(*cfg));
+	cfg->fc_nlinfo.nl_net = net;
+
+	if (rt->rt_dst.sa_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	/*
+	 * Check mask for validity:
+	 * a) it must be contiguous.
+	 * b) destination must have all host bits clear.
+	 * c) if application forgot to set correct family (AF_INET),
+	 *    reject request unless it is absolutely clear i.e.
+	 *    both family and mask are zero.
+	 */
+	plen = 32;	/* kept for RTF_HOST; otherwise derived from the mask */
+	addr = sk_extract_addr(&rt->rt_dst);
+	if (!(rt->rt_flags & RTF_HOST)) {
+		__be32 mask = sk_extract_addr(&rt->rt_genmask);
+
+		if (rt->rt_genmask.sa_family != AF_INET) {
+			if (mask || rt->rt_genmask.sa_family)
+				return -EAFNOSUPPORT;
+		}
+
+		if (bad_mask(mask, addr))
+			return -EINVAL;
+
+		plen = inet_mask_len(mask);
+	}
+
+	cfg->fc_dst_len = plen;
+	cfg->fc_dst = addr;
+
+	if (cmd != SIOCDELRT) {
+		cfg->fc_nlflags = NLM_F_CREATE;
+		cfg->fc_protocol = RTPROT_BOOT;
+	}
+
+	if (rt->rt_metric)	/* a non-zero ioctl metric is stored minus one */
+		cfg->fc_priority = rt->rt_metric - 1;
+
+	if (rt->rt_flags & RTF_REJECT) {	/* reject routes need nothing further */
+		cfg->fc_scope = RT_SCOPE_HOST;
+		cfg->fc_type = RTN_UNREACHABLE;
+		return 0;
+	}
+
+	cfg->fc_scope = RT_SCOPE_NOWHERE;	/* placeholder, resolved further down */
+	cfg->fc_type = RTN_UNICAST;
+
+	if (rt->rt_dev) {
+		char *colon;
+		struct net_device *dev;
+		char devname[IFNAMSIZ];
+
+		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
+			return -EFAULT;
+
+		devname[IFNAMSIZ-1] = 0;	/* only IFNAMSIZ-1 bytes were copied */
+		colon = strchr(devname, ':');
+		if (colon)
+			*colon = 0;	/* "dev:label": look the device up by "dev" */
+		dev = __dev_get_by_name(net, devname);
+		if (!dev)
+			return -ENODEV;
+		cfg->fc_oif = dev->ifindex;
+		cfg->fc_table = l3mdev_fib_table(dev);
+		if (colon) {
+			const struct in_ifaddr *ifa;
+			struct in_device *in_dev;
+
+			in_dev = __in_dev_get_rtnl(dev);
+			if (!in_dev)
+				return -ENODEV;
+
+			*colon = ':';	/* restore "dev:label" for the label match */
+
+			rcu_read_lock();
+			in_dev_for_each_ifa_rcu(ifa, in_dev) {
+				if (strcmp(ifa->ifa_label, devname) == 0)
+					break;
+			}
+			rcu_read_unlock();
+
+			if (!ifa)	/* the loop ended without a matching label */
+				return -ENODEV;
+			cfg->fc_prefsrc = ifa->ifa_local;
+		}
+	}
+
+	addr = sk_extract_addr(&rt->rt_gateway);
+	if (rt->rt_gateway.sa_family == AF_INET && addr) {
+		unsigned int addr_type;
+
+		cfg->fc_gw4 = addr;
+		cfg->fc_gw_family = AF_INET;
+		addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
+		if (rt->rt_flags & RTF_GATEWAY &&
+		    addr_type == RTN_UNICAST)
+			cfg->fc_scope = RT_SCOPE_UNIVERSE;
+	}
+
+	if (!cfg->fc_table)
+		cfg->fc_table = RT_TABLE_MAIN;
+
+	if (cmd == SIOCDELRT)	/* deletes stop here: no gateway check, no metrics */
+		return 0;
+
+	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
+		return -EINVAL;
+
+	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
+		cfg->fc_scope = RT_SCOPE_LINK;
+
+	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
+		struct nlattr *mx;
+		int len = 0;
+
+		mx = kcalloc(3, nla_total_size(4), GFP_KERNEL);	/* room for all 3 */
+		if (!mx)
+			return -ENOMEM;
+
+		if (rt->rt_flags & RTF_MTU)
+			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
+
+		if (rt->rt_flags & RTF_WINDOW)
+			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
+
+		if (rt->rt_flags & RTF_IRTT)
+			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
+
+		cfg->fc_mx = mx;	/* heap buffer; ip_rt_ioctl() kfree()s it */
+		cfg->fc_mx_len = len;
+	}
+
+	return 0;
+}
+
+/*
+ * Handle the IP routing ioctls SIOCADDRT and SIOCDELRT, which add or
+ * delete a route described by a struct rtentry.
+ *
+ * Returns -EINVAL for any other @cmd and -EPERM without CAP_NET_ADMIN in
+ * the netns' user namespace; otherwise 0 or a negative errno.
+ */
+int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
+{
+	struct fib_table *table;
+	struct fib_config cfg;
+	int err;
+
+	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
+		return -EINVAL;
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* Conversion and table update both run under RTNL. */
+	rtnl_lock();
+	err = rtentry_to_fib_config(net, cmd, rt, &cfg);
+	if (err)
+		goto unlock;
+
+	if (cmd == SIOCDELRT) {
+		table = fib_get_table(net, cfg.fc_table);
+		if (!table)
+			err = -ESRCH;
+		else
+			err = fib_table_delete(net, table, &cfg, NULL);
+	} else {
+		table = fib_new_table(net, cfg.fc_table);
+		if (!table)
+			err = -ENOBUFS;
+		else
+			err = fib_table_insert(net, table, &cfg, NULL);
+	}
+
+	/* allocated by rtentry_to_fib_config() */
+	kfree(cfg.fc_mx);
+unlock:
+	rtnl_unlock();
+	return err;
+}
+
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {	/* indexed by RTA_* type */
+	[RTA_UNSPEC]		= { .strict_start_type = RTA_DPORT + 1 },	/* NOTE(review): presumably types above RTA_DPORT get strict checks - confirm */
+	[RTA_DST]		= { .type = NLA_U32 },
+	[RTA_SRC]		= { .type = NLA_U32 },
+	[RTA_IIF]		= { .type = NLA_U32 },
+	[RTA_OIF]		= { .type = NLA_U32 },
+	[RTA_GATEWAY]		= { .type = NLA_U32 },
+	[RTA_PRIORITY]		= { .type = NLA_U32 },
+	[RTA_PREFSRC]		= { .type = NLA_U32 },
+	[RTA_METRICS]		= { .type = NLA_NESTED },
+	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },	/* length only, no .type */
+	[RTA_FLOW]		= { .type = NLA_U32 },
+	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
+	[RTA_ENCAP]		= { .type = NLA_NESTED },
+	[RTA_UID]		= { .type = NLA_U32 },
+	[RTA_MARK]		= { .type = NLA_U32 },
+	[RTA_TABLE]		= { .type = NLA_U32 },
+	[RTA_IP_PROTO]		= { .type = NLA_U8 },
+	[RTA_SPORT]		= { .type = NLA_U16 },
+	[RTA_DPORT]		= { .type = NLA_U16 },
+	[RTA_NH_ID]		= { .type = NLA_U32 },
+};	/* used by rtm_to_fib_config() and ip_valid_fib_dump_req() */
+
+int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
+		    struct netlink_ext_ack *extack)
+{	/* Parse RTA_VIA into cfg->fc_gw_family plus fc_gw4 or fc_gw6; 0 or -EINVAL. */
+	struct rtvia *via;
+	int alen;
+
+	if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {	/* header must fit */
+		NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
+		return -EINVAL;
+	}
+
+	via = nla_data(nla);
+	alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);	/* address bytes */
+
+	switch (via->rtvia_family) {
+	case AF_INET:
+		if (alen != sizeof(__be32)) {
+			NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
+			return -EINVAL;
+		}
+		cfg->fc_gw_family = AF_INET;
+		cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
+		break;
+	case AF_INET6:
+#if IS_ENABLED(CONFIG_IPV6)
+		if (alen != sizeof(struct in6_addr)) {
+			NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
+			return -EINVAL;
+		}
+		cfg->fc_gw_family = AF_INET6;
+		cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
+#else
+		NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
+		return -EINVAL;
+#endif
+		break;
+	default:	/* any family other than AF_INET / AF_INET6 */
+		NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
+			     struct nlmsghdr *nlh, struct fib_config *cfg,
+			     struct netlink_ext_ack *extack)
+{	/* Parse an rtnetlink route message into @cfg; returns 0 or -errno. */
+	bool has_gw = false, has_via = false;
+	struct nlattr *attr;
+	int err, remaining;
+	struct rtmsg *rtm;
+
+	err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
+					rtm_ipv4_policy, extack);
+	if (err < 0)
+		goto errout;
+
+	memset(cfg, 0, sizeof(*cfg));
+
+	rtm = nlmsg_data(nlh);
+
+	if (!inet_validate_dscp(rtm->rtm_tos)) {
+		NL_SET_ERR_MSG(extack,
+			       "Invalid dsfield (tos): ECN bits must be 0");
+		err = -EINVAL;
+		goto errout;
+	}
+	cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
+
+	cfg->fc_dst_len = rtm->rtm_dst_len;
+	cfg->fc_table = rtm->rtm_table;	/* may be overridden by RTA_TABLE below */
+	cfg->fc_protocol = rtm->rtm_protocol;
+	cfg->fc_scope = rtm->rtm_scope;
+	cfg->fc_type = rtm->rtm_type;
+	cfg->fc_flags = rtm->rtm_flags;
+	cfg->fc_nlflags = nlh->nlmsg_flags;
+
+	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
+	cfg->fc_nlinfo.nlh = nlh;
+	cfg->fc_nlinfo.nl_net = net;
+
+	if (cfg->fc_type > RTN_MAX) {
+		NL_SET_ERR_MSG(extack, "Invalid route type");
+		err = -EINVAL;
+		goto errout;
+	}
+
+	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
+		switch (nla_type(attr)) {	/* types without a case are skipped */
+		case RTA_DST:
+			cfg->fc_dst = nla_get_be32(attr);
+			break;
+		case RTA_OIF:
+			cfg->fc_oif = nla_get_u32(attr);
+			break;
+		case RTA_GATEWAY:
+			has_gw = true;
+			cfg->fc_gw4 = nla_get_be32(attr);
+			if (cfg->fc_gw4)	/* a zero gateway leaves fc_gw_family unset */
+				cfg->fc_gw_family = AF_INET;
+			break;
+		case RTA_VIA:
+			has_via = true;
+			err = fib_gw_from_via(cfg, attr, extack);
+			if (err)
+				goto errout;
+			break;
+		case RTA_PRIORITY:
+			cfg->fc_priority = nla_get_u32(attr);
+			break;
+		case RTA_PREFSRC:
+			cfg->fc_prefsrc = nla_get_be32(attr);
+			break;
+		case RTA_METRICS:
+			cfg->fc_mx = nla_data(attr);	/* points into the message, no copy */
+			cfg->fc_mx_len = nla_len(attr);
+			break;
+		case RTA_MULTIPATH:
+			err = lwtunnel_valid_encap_type_attr(nla_data(attr),
+							     nla_len(attr),
+							     extack);
+			if (err < 0)
+				goto errout;
+			cfg->fc_mp = nla_data(attr);	/* points into the message, no copy */
+			cfg->fc_mp_len = nla_len(attr);
+			break;
+		case RTA_FLOW:
+			cfg->fc_flow = nla_get_u32(attr);
+			break;
+		case RTA_TABLE:
+			cfg->fc_table = nla_get_u32(attr);
+			break;
+		case RTA_ENCAP:
+			cfg->fc_encap = attr;	/* the attribute itself, not its payload */
+			break;
+		case RTA_ENCAP_TYPE:
+			cfg->fc_encap_type = nla_get_u16(attr);
+			err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
+							extack);
+			if (err < 0)
+				goto errout;
+			break;
+		case RTA_NH_ID:
+			cfg->fc_nh_id = nla_get_u32(attr);
+			break;
+		}
+	}
+
+	if (cfg->fc_nh_id) {	/* a nexthop id excludes any inline nexthop data */
+		if (cfg->fc_oif || cfg->fc_gw_family ||
+		    cfg->fc_encap || cfg->fc_mp) {
+			NL_SET_ERR_MSG(extack,
+				       "Nexthop specification and nexthop id are mutually exclusive");
+			return -EINVAL;
+		}
+	}
+
+	if (has_gw && has_via) {
+		NL_SET_ERR_MSG(extack,
+			       "Nexthop configuration can not contain both GATEWAY and VIA");
+		return -EINVAL;
+	}
+
+	if (!cfg->fc_table)	/* neither the header nor RTA_TABLE named a table */
+		cfg->fc_table = RT_TABLE_MAIN;
+
+	return 0;
+errout:	/* nothing to release here; the label only returns err */
+	return err;
+}
+
+/* RTM_DELROUTE handler: build a fib_config from @nlh and delete the route
+ * from the table it names.  Returns 0 or a negative errno.
+ */
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
+			     struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_table *table;
+	struct fib_config cfg;
+	int rc;
+
+	rc = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
+	if (rc < 0)
+		return rc;
+
+	/* A request that names a nexthop id needs that nexthop to exist. */
+	if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
+		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+		return -EINVAL;
+	}
+
+	table = fib_get_table(net, cfg.fc_table);
+	if (!table) {
+		NL_SET_ERR_MSG(extack, "FIB table does not exist");
+		return -ESRCH;
+	}
+
+	return fib_table_delete(net, table, &cfg, extack);
+}
+
+/* RTM_NEWROUTE handler: build a fib_config from @nlh and insert the route.
+ * A successful RTN_LOCAL insert sets net->ipv4.fib_has_custom_local_routes.
+ */
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
+			     struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_table *table;
+	struct fib_config cfg;
+	int rc;
+
+	rc = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
+	if (rc < 0)
+		return rc;
+
+	table = fib_new_table(net, cfg.fc_table);
+	if (!table)
+		return -ENOBUFS;
+
+	rc = fib_table_insert(net, table, &cfg, extack);
+	if (rc == 0 && cfg.fc_type == RTN_LOCAL)
+		net->ipv4.fib_has_custom_local_routes = true;
+	return rc;
+}
+
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+			  struct fib_dump_filter *filter,
+			  struct netlink_callback *cb)
+{	/* Validate a FIB dump request and fill @filter from it; 0 or -errno. */
+	struct netlink_ext_ack *extack = cb->extack;
+	struct nlattr *tb[RTA_MAX + 1];
+	struct rtmsg *rtm;
+	int err, i;
+
+	ASSERT_RTNL();
+
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+		NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
+		return -EINVAL;
+	}
+
+	rtm = nlmsg_data(nlh);
+	if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
+	    rtm->rtm_scope) {	/* these header fields must all be zero */
+		NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
+		return -EINVAL;
+	}
+
+	if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
+		NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
+		return -EINVAL;
+	}
+	if (rtm->rtm_flags & RTM_F_CLONED)	/* exactly one of the two is cleared */
+		filter->dump_routes = false;
+	else
+		filter->dump_exceptions = false;
+
+	filter->flags    = rtm->rtm_flags;
+	filter->protocol = rtm->rtm_protocol;
+	filter->rt_type  = rtm->rtm_type;
+	filter->table_id = rtm->rtm_table;	/* RTA_TABLE below takes precedence */
+
+	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+					    rtm_ipv4_policy, extack);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= RTA_MAX; ++i) {	/* only RTA_TABLE and RTA_OIF are accepted */
+		int ifindex;
+
+		if (!tb[i])
+			continue;
+
+		switch (i) {
+		case RTA_TABLE:
+			filter->table_id = nla_get_u32(tb[i]);
+			break;
+		case RTA_OIF:
+			ifindex = nla_get_u32(tb[i]);
+			filter->dev = __dev_get_by_index(net, ifindex);
+			if (!filter->dev)	/* note: no extack message on this path */
+				return -ENODEV;
+			break;
+		default:
+			NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+			return -EINVAL;
+		}
+	}
+
+	if (filter->flags || filter->protocol || filter->rt_type ||
+	    filter->table_id || filter->dev) {	/* any criterion marks the dump filtered */
+		filter->filter_set = 1;
+		cb->answer_flags = NLM_F_DUMP_FILTERED;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
+
+static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+{	/* RTM_GETROUTE dump callback: dump one named table or walk all tables. */
+	struct fib_dump_filter filter = { .dump_routes = true,
+					  .dump_exceptions = true };
+	const struct nlmsghdr *nlh = cb->nlh;
+	struct net *net = sock_net(skb->sk);
+	unsigned int h, s_h;
+	unsigned int e = 0, s_e;
+	struct fib_table *tb;
+	struct hlist_head *head;
+	int dumped = 0, err;
+
+	if (cb->strict_check) {
+		err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
+		if (err < 0)
+			return err;
+	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+		struct rtmsg *rtm = nlmsg_data(nlh);
+		/* non-strict requests only contribute these two header flags */
+		filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
+	}
+
+	/* ipv4 does not use prefix flag */
+	if (filter.flags & RTM_F_PREFIX)
+		return skb->len;
+
+	if (filter.table_id) {	/* a single table was requested */
+		tb = fib_get_table(net, filter.table_id);
+		if (!tb) {
+			if (rtnl_msg_family(cb->nlh) != PF_INET)
+				return skb->len;
+
+			NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
+			return -ENOENT;
+		}
+
+		rcu_read_lock();
+		err = fib_table_dump(tb, skb, cb, &filter);
+		rcu_read_unlock();
+		return skb->len ? : err;
+	}
+
+	s_h = cb->args[0];	/* resume point: hash bucket ... */
+	s_e = cb->args[1];	/* ... and table index within that bucket */
+
+	rcu_read_lock();
+
+	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
+		e = 0;
+		head = &net->ipv4.fib_table_hash[h];
+		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+			if (e < s_e)
+				goto next;
+			if (dumped)	/* after the first table, clear args[2] onwards */
+				memset(&cb->args[2], 0, sizeof(cb->args) -
+						 2 * sizeof(cb->args[0]));
+			err = fib_table_dump(tb, skb, cb, &filter);
+			if (err < 0) {
+				if (likely(skb->len))	/* partial data: return its length */
+					goto out;
+
+				goto out_err;	/* nothing in skb: return the error */
+			}
+			dumped = 1;
+next:
+			e++;
+		}
+	}
+out:
+	err = skb->len;
+out_err:
+	rcu_read_unlock();
+
+	cb->args[1] = e;	/* save the position for the next call */
+	cb->args[0] = h;
+
+	return err;
+}
+
+/* Prepare and feed an intra-kernel routing request.
+ * Really, it should be a netlink message, but :-( netlink
+ * can be not configured, so we feed it directly
+ * to the fib engine. It is legal, because all events occur
+ * only when netlink is already locked.
+ *
+ * @cmd is RTM_NEWROUTE to insert; any other value deletes.
+ */
+static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
+		      struct in_ifaddr *ifa, u32 rt_priority)
+{
+	struct net_device *dev = ifa->ifa_dev->dev;
+	struct net *net = dev_net(dev);
+	u32 tb_id = l3mdev_fib_table(dev);
+	struct fib_config cfg = {
+		.fc_nlinfo = {
+			.nl_net = net,
+		},
+		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
+		.fc_protocol = RTPROT_KERNEL,
+		.fc_type = type,
+		.fc_dst = dst,
+		.fc_dst_len = dst_len,
+		.fc_oif = dev->ifindex,
+		.fc_prefsrc = ifa->ifa_local,
+		.fc_priority = rt_priority,
+	};
+	struct fib_table *tb;
+
+	/* No table from l3mdev: unicast uses main, other types use local. */
+	if (!tb_id)
+		tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
+
+	tb = fib_new_table(net, tb_id);
+	if (!tb)
+		return;
+
+	cfg.fc_table = tb->tb_id;
+	cfg.fc_scope = (type == RTN_LOCAL) ? RT_SCOPE_HOST : RT_SCOPE_LINK;
+
+	if (cmd != RTM_NEWROUTE)
+		fib_table_delete(net, tb, &cfg, NULL);
+	else
+		fib_table_insert(net, tb, &cfg, NULL);
+}
+
+void fib_add_ifaddr(struct in_ifaddr *ifa)
+{	/* Install the routes implied by address @ifa: local, broadcast, prefix. */
+	struct in_device *in_dev = ifa->ifa_dev;
+	struct net_device *dev = in_dev->dev;
+	struct in_ifaddr *prim = ifa;
+	__be32 mask = ifa->ifa_mask;
+	__be32 addr = ifa->ifa_local;
+	__be32 prefix = ifa->ifa_address & mask;
+
+	if (ifa->ifa_flags & IFA_F_SECONDARY) {	/* secondaries use their primary */
+		prim = inet_ifa_byprefix(in_dev, prefix, mask);
+		if (!prim) {
+			pr_warn("%s: bug: prim == NULL\n", __func__);
+			return;
+		}
+	}
+
+	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);	/* even if dev is down */
+
+	if (!(dev->flags & IFF_UP))
+		return;
+
+	/* Add broadcast address, if it is explicitly assigned. */
+	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) {
+		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+			  prim, 0);
+		arp_invalidate(dev, ifa->ifa_broadcast, false);
+	}
+
+	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
+	    (prefix != addr || ifa->ifa_prefixlen < 32)) {	/* primary, not a bare /32 */
+		if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
+			fib_magic(RTM_NEWROUTE,
+				  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+				  prefix, ifa->ifa_prefixlen, prim,
+				  ifa->ifa_rt_priority);
+
+		/* Add the network broadcast address, when it makes sense */
+		if (ifa->ifa_prefixlen < 31) {
+			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
+				  32, prim, 0);
+			arp_invalidate(dev, prefix | ~mask, false);
+		}
+	}
+}
+
+/* Change the metric of @ifa's prefix route to @new_metric: the route with
+ * the new metric is inserted first, then the old one is removed.
+ */
+void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
+{
+	struct net_device *dev = ifa->ifa_dev->dev;
+	int rt_type = dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST;
+	__be32 prefix = ifa->ifa_address & ifa->ifa_mask;
+	int plen = ifa->ifa_prefixlen;
+
+	if (!(dev->flags & IFF_UP) || ipv4_is_zeronet(prefix))
+		return;
+	if (ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE))
+		return;
+	if (prefix == ifa->ifa_local && plen == 32)
+		return;
+
+	/* add the new */
+	fib_magic(RTM_NEWROUTE, rt_type, prefix, plen, ifa, new_metric);
+	/* delete the old */
+	fib_magic(RTM_DELROUTE, rt_type, prefix, plen, ifa, ifa->ifa_rt_priority);
+}
+
+/* Delete primary or secondary address.
+ * Optionally, on secondary address promotion consider the addresses
+ * from subnet iprim as deleted, even if they are in device list.
+ * In this case the secondary ifa can be in device list.
+ */
+void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
+{
+	struct in_device *in_dev = ifa->ifa_dev;
+	struct net_device *dev = in_dev->dev;
+	struct in_ifaddr *ifa1;
+	struct in_ifaddr *prim = ifa, *prim1 = NULL;
+	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+	__be32 any = ifa->ifa_address & ifa->ifa_mask;
+#define LOCAL_OK	1	/* ifa_local is still used by a remaining address */
+#define BRD_OK		2	/* ifa_broadcast is still needed */
+#define BRD0_OK		4	/* "any" (address & mask) is still needed */
+#define BRD1_OK		8	/* "brd" (address | ~mask) is still needed */
+	unsigned int ok = 0;
+	int subnet = 0;		/* Primary network */
+	int gone = 1;		/* Address is missing */
+	int same_prefsrc = 0;	/* Another primary with same IP */
+
+	if (ifa->ifa_flags & IFA_F_SECONDARY) {
+		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
+		if (!prim) {
+			/* if the device has been deleted, we don't perform
+			 * address promotion
+			 */
+			if (!in_dev->dead)
+				pr_warn("%s: bug: prim == NULL\n", __func__);
+			return;
+		}
+		if (iprim && iprim != prim) {
+			pr_warn("%s: bug: iprim != prim\n", __func__);
+			return;
+		}
+	} else if (!ipv4_is_zeronet(any) &&
+		   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
+		if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
+			fib_magic(RTM_DELROUTE,
+				  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+				  any, ifa->ifa_prefixlen, prim, 0);
+		subnet = 1;
+	}
+
+	if (in_dev->dead)	/* skip the scan: every *_OK bit stays clear */
+		goto no_promotions;
+
+	/* Deletion is more complicated than add.
+	 * We should take care of not to delete too much :-)
+	 *
+	 * Scan address list to be sure that addresses are really gone.
+	 */
+	rcu_read_lock();
+	in_dev_for_each_ifa_rcu(ifa1, in_dev) {
+		if (ifa1 == ifa) {
+			/* promotion, keep the IP */
+			gone = 0;
+			continue;
+		}
+		/* Ignore IFAs from our subnet */
+		if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
+		    inet_ifa_match(ifa1->ifa_address, iprim))
+			continue;
+
+		/* Ignore ifa1 if it uses different primary IP (prefsrc) */
+		if (ifa1->ifa_flags & IFA_F_SECONDARY) {
+			/* Another address from our subnet? */
+			if (ifa1->ifa_mask == prim->ifa_mask &&
+			    inet_ifa_match(ifa1->ifa_address, prim))
+				prim1 = prim;
+			else {
+				/* We reached the secondaries, so
+				 * same_prefsrc should be determined.
+				 */
+				if (!same_prefsrc)
+					continue;
+				/* Search new prim1 if ifa1 is not
+				 * using the current prim1
+				 */
+				if (!prim1 ||
+				    ifa1->ifa_mask != prim1->ifa_mask ||
+				    !inet_ifa_match(ifa1->ifa_address, prim1))
+					prim1 = inet_ifa_byprefix(in_dev,
+							ifa1->ifa_address,
+							ifa1->ifa_mask);
+				if (!prim1)
+					continue;
+				if (prim1->ifa_local != prim->ifa_local)
+					continue;
+			}
+		} else {
+			if (prim->ifa_local != ifa1->ifa_local)
+				continue;
+			prim1 = ifa1;
+			if (prim != prim1)
+				same_prefsrc = 1;
+		}
+		if (ifa->ifa_local == ifa1->ifa_local)	/* ifa1 passed the filters above */
+			ok |= LOCAL_OK;
+		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
+			ok |= BRD_OK;
+		if (brd == ifa1->ifa_broadcast)
+			ok |= BRD1_OK;
+		if (any == ifa1->ifa_broadcast)
+			ok |= BRD0_OK;
+		/* primary has network specific broadcasts */
+		if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
+			__be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
+			__be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
+
+			if (!ipv4_is_zeronet(any1)) {
+				if (ifa->ifa_broadcast == brd1 ||
+				    ifa->ifa_broadcast == any1)
+					ok |= BRD_OK;
+				if (brd == brd1 || brd == any1)
+					ok |= BRD1_OK;
+				if (any == brd1 || any == any1)
+					ok |= BRD0_OK;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+no_promotions:	/* delete each route whose *_OK bit was not set above */
+	if (!(ok & BRD_OK))
+		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+			  prim, 0);
+	if (subnet && ifa->ifa_prefixlen < 31) {
+		if (!(ok & BRD1_OK))
+			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
+				  prim, 0);
+		if (!(ok & BRD0_OK))
+			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
+				  prim, 0);
+	}
+	if (!(ok & LOCAL_OK)) {
+		unsigned int addr_type;
+
+		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
+
+		/* Check, that this local address finally disappeared. */
+		addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
+						     ifa->ifa_local);
+		if (gone && addr_type != RTN_LOCAL) {
+			/* And the last, but not the least thing.
+			 * We must flush stray FIB entries.
+			 *
+			 * First of all, we scan fib_info list searching
+			 * for stray nexthop entries, then ignite fib_flush.
+			 */
+			if (fib_sync_down_addr(dev, ifa->ifa_local))
+				fib_flush(dev_net(dev));
+		}
+	}
+#undef LOCAL_OK
+#undef BRD_OK
+#undef BRD0_OK
+#undef BRD1_OK
+}
+
+/* Look up the flow described by @frn in table frn->tb_id_in and write the
+ * outcome back: frn->err always, the result fields only on success.
+ */
+static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
+{
+	struct flowi4 fl4 = {
+		.daddr = frn->fl_addr,
+		.flowi4_mark = frn->fl_mark,
+		.flowi4_scope = frn->fl_scope,
+		.flowi4_tos = frn->fl_tos,
+	};
+	struct fib_result res;
+	struct fib_table *tb;
+
+	rcu_read_lock();
+	tb = fib_get_table(net, frn->tb_id_in);
+	if (!tb) {
+		frn->err = -ENOENT;
+		goto unlock;
+	}
+
+	local_bh_disable();
+	frn->tb_id = tb->tb_id;
+	frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+	if (frn->err == 0) {
+		frn->prefixlen = res.prefixlen;
+		frn->nh_sel = res.nh_sel;
+		frn->type = res.type;
+		frn->scope = res.scope;
+	}
+	local_bh_enable();
+unlock:
+	rcu_read_unlock();
+}
+
+static void nl_fib_input(struct sk_buff *skb)
+{	/* Input handler of the NETLINK_FIB_LOOKUP socket: look up, then reply. */
+	struct net *net;
+	struct fib_result_nl *frn;
+	struct nlmsghdr *nlh;
+	u32 portid;
+
+	net = sock_net(skb->sk);
+	nlh = nlmsg_hdr(skb);
+	if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
+	    skb->len < nlh->nlmsg_len ||
+	    nlmsg_len(nlh) < sizeof(*frn))	/* too short for one fib_result_nl */
+		return;
+
+	skb = netlink_skb_clone(skb, GFP_KERNEL);	/* the clone becomes the reply */
+	if (!skb)
+		return;
+	nlh = nlmsg_hdr(skb);
+
+	frn = nlmsg_data(nlh);
+	nl_fib_lookup(net, frn);	/* results are written into the clone's payload */
+
+	portid = NETLINK_CB(skb).portid;      /* netlink portid */
+	NETLINK_CB(skb).portid = 0;        /* from kernel */
+	NETLINK_CB(skb).dst_group = 0;  /* unicast */
+	nlmsg_unicast(net->ipv4.fibnl, skb, portid);
+}
+
+/* Per-netns init: create the NETLINK_FIB_LOOKUP kernel socket, whose input
+ * callback is nl_fib_input(), and store it in net->ipv4.fibnl.
+ */
+static int __net_init nl_fib_lookup_init(struct net *net)
+{
+	struct netlink_kernel_cfg cfg = { .input = nl_fib_input };
+	struct sock *nlsk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
+
+	if (!nlsk)
+		return -EAFNOSUPPORT;
+	net->ipv4.fibnl = nlsk;
+	return 0;
+}
+
+static void nl_fib_lookup_exit(struct net *net)
+{	/* Release the socket set up by nl_fib_lookup_init() and clear the pointer. */
+	netlink_kernel_release(net->ipv4.fibnl);
+	net->ipv4.fibnl = NULL;
+}
+
+static void fib_disable_ip(struct net_device *dev, unsigned long event,
+			   bool force)
+{	/* Sync FIB state for @dev going down, flush, then run arp_ifdown(). */
+	if (fib_sync_down_dev(dev, event, force))	/* non-zero: full fib_flush() */
+		fib_flush(dev_net(dev));
+	else
+		rt_cache_flush(dev_net(dev));
+	arp_ifdown(dev);
+}
+
+static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+{	/* inetaddr notifier callback: @ptr is the in_ifaddr the event is about. */
+	struct in_ifaddr *ifa = ptr;
+	struct net_device *dev = ifa->ifa_dev->dev;
+	struct net *net = dev_net(dev);
+
+	switch (event) {	/* other events fall through to NOTIFY_DONE untouched */
+	case NETDEV_UP:
+		fib_add_ifaddr(ifa);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		fib_sync_up(dev, RTNH_F_DEAD);
+#endif
+		atomic_inc(&net->ipv4.dev_addr_genid);
+		rt_cache_flush(dev_net(dev));
+		break;
+	case NETDEV_DOWN:
+		fib_del_ifaddr(ifa, NULL);
+		atomic_inc(&net->ipv4.dev_addr_genid);
+		if (!ifa->ifa_dev->ifa_list) {
+			/* Last address was deleted from this interface.
+			 * Disable IP.
+			 */
+			fib_disable_ip(dev, event, true);
+		} else {
+			rt_cache_flush(dev_net(dev));
+		}
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{	/* netdevice notifier callback: keep FIB state in step with @dev events. */
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_notifier_changeupper_info *upper_info = ptr;
+	struct netdev_notifier_info_ext *info_ext = ptr;
+	struct in_device *in_dev;
+	struct net *net = dev_net(dev);
+	struct in_ifaddr *ifa;
+	unsigned int flags;
+
+	if (event == NETDEV_UNREGISTER) {	/* handled before the in_dev check below */
+		fib_disable_ip(dev, event, true);
+		rt_flush_dev(dev);
+		return NOTIFY_DONE;
+	}
+
+	in_dev = __in_dev_get_rtnl(dev);
+	if (!in_dev)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UP:
+		in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+			fib_add_ifaddr(ifa);
+		}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		fib_sync_up(dev, RTNH_F_DEAD);
+#endif
+		atomic_inc(&net->ipv4.dev_addr_genid);
+		rt_cache_flush(net);
+		break;
+	case NETDEV_DOWN:
+		fib_disable_ip(dev, event, false);
+		break;
+	case NETDEV_CHANGE:
+		flags = dev_get_flags(dev);
+		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+			fib_sync_up(dev, RTNH_F_LINKDOWN);
+		else
+			fib_sync_down_dev(dev, event, false);
+		rt_cache_flush(net);
+		break;
+	case NETDEV_CHANGEMTU:
+		fib_sync_mtu(dev, info_ext->ext.mtu);
+		rt_cache_flush(net);
+		break;
+	case NETDEV_CHANGEUPPER:
+		upper_info = ptr;	/* NOTE(review): redundant, already set at declaration */
+		/* flush all routes if dev is linked to or unlinked from
+		 * an L3 master device (e.g., VRF)
+		 */
+		if (upper_info->upper_dev &&
+		    netif_is_l3_master(upper_info->upper_dev))
+			fib_disable_ip(dev, NETDEV_DOWN, true);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block fib_inetaddr_notifier = {	/* registered in ip_fib_init() */
+	.notifier_call = fib_inetaddr_event,
+};
+
+static struct notifier_block fib_netdev_notifier = {	/* registered in ip_fib_init() */
+	.notifier_call = fib_netdev_event,
+};
+
+/* Per-netns FIB setup: notifier state, the table hash and the rules.
+ * Returns 0 or a negative errno; on failure the earlier steps are undone.
+ */
+static int __net_init ip_fib_net_init(struct net *net)
+{
+	size_t hash_bytes = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
+	int rc;
+
+	rc = fib4_notifier_init(net);
+	if (rc)
+		return rc;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	/* Default to 3-tuple */
+	net->ipv4.sysctl_fib_multipath_hash_fields =
+		FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
+#endif
+
+	/* Avoid false sharing : Use at least a full cache line */
+	hash_bytes = max_t(size_t, hash_bytes, L1_CACHE_BYTES);
+	net->ipv4.fib_table_hash = kzalloc(hash_bytes, GFP_KERNEL);
+	if (!net->ipv4.fib_table_hash) {
+		rc = -ENOMEM;
+		goto undo_notifier;
+	}
+
+	rc = fib4_rules_init(net);
+	if (rc >= 0)
+		return 0;
+
+	kfree(net->ipv4.fib_table_hash);
+undo_notifier:
+	fib4_notifier_exit(net);
+	return rc;
+}
+
+static void ip_fib_net_exit(struct net *net)
+{	/* Flush and free every FIB table of @net; RTNL must be held (asserted). */
+	int i;
+
+	ASSERT_RTNL();
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
+	RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
+#endif
+	/* Destroy the tables in reverse order to guarantee that the
+	 * local table, ID 255, is destroyed before the main table, ID
+	 * 254. This is necessary as the local table may contain
+	 * references to data contained in the main table.
+	 */
+	for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
+		struct hlist_head *head = &net->ipv4.fib_table_hash[i];
+		struct hlist_node *tmp;
+		struct fib_table *tb;
+
+		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {	/* tb is unlinked inside */
+			hlist_del(&tb->tb_hlist);
+			fib_table_flush(net, tb, true);
+			fib_free_table(tb);
+		}
+	}
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	fib4_rules_exit(net);
+#endif
+
+	kfree(net->ipv4.fib_table_hash);	/* allocated in ip_fib_net_init() */
+	fib4_notifier_exit(net);
+}
+
+/* Per-netns FIB init; each failure unwinds the steps already completed. */
+static int __net_init fib_net_init(struct net *net)
+{
+	int rc;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	atomic_set(&net->ipv4.fib_num_tclassid_users, 0);
+#endif
+	rc = ip_fib_net_init(net);
+	if (rc < 0)
+		return rc;
+	rc = nl_fib_lookup_init(net);
+	if (rc < 0)
+		goto undo_fib;
+	rc = fib_proc_init(net);
+	if (rc < 0)
+		goto undo_nlfl;
+	return rc;
+
+undo_nlfl:
+	nl_fib_lookup_exit(net);
+undo_fib:
+	rtnl_lock();
+	ip_fib_net_exit(net);
+	rtnl_unlock();
+	return rc;
+}
+
+static void __net_exit fib_net_exit(struct net *net)
+{	/* Per-netns exit; the tables are torn down in fib_net_exit_batch(). */
+	fib_proc_exit(net);
+	nl_fib_lookup_exit(net);
+}
+
+static void __net_exit fib_net_exit_batch(struct list_head *net_list)
+{	/* Run ip_fib_net_exit() for every netns on @net_list under one RTNL hold. */
+	struct net *net;
+
+	rtnl_lock();
+	list_for_each_entry(net, net_list, exit_list)
+		ip_fib_net_exit(net);
+
+	rtnl_unlock();
+}
+
+static struct pernet_operations fib_net_ops = {	/* registered in ip_fib_init() */
+	.init = fib_net_init,
+	.exit = fib_net_exit,
+	.exit_batch = fib_net_exit_batch,
+};
+
+void __init ip_fib_init(void)
+{	/* One-time setup: trie init, pernet ops, notifiers, rtnetlink handlers. */
+	fib_trie_init();
+
+	register_pernet_subsys(&fib_net_ops);	/* note: return value is not checked */
+
+	register_netdevice_notifier(&fib_netdev_notifier);
+	register_inetaddr_notifier(&fib_inetaddr_notifier);
+
+	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
+	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
+	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0);	/* dump callback only */
+}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
new file mode 100644
index 0000000000..f9b9e26c32
--- /dev/null
+++ b/net/ipv4/fib_lookup.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FIB_LOOKUP_H
+#define _FIB_LOOKUP_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <net/inet_dscp.h>
+#include <net/ip_fib.h>
+#include <net/nexthop.h>
+
+struct fib_alias {
+	struct hlist_node	fa_list;
+	struct fib_info		*fa_info;
+	dscp_t			fa_dscp;
+	u8			fa_type;
+	u8			fa_state;	/* FA_S_* bits, see fib_alias_accessed() */
+	u8			fa_slen;
+	u32			tb_id;
+	s16			fa_default;
+	u8			offload;	/* read with READ_ONCE() by rtmsg_fib() */
+	u8			trap;		/* read with READ_ONCE() by rtmsg_fib() */
+	u8			offload_failed;	/* read with READ_ONCE() by rtmsg_fib() */
+	struct rcu_head		rcu;
+};
+
+#define FA_S_ACCESSED	0x01
+
+/* Only store FA_S_ACCESSED if clear (keeps fa_state shared); callers may race, hence *_ONCE() */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+	if (!(READ_ONCE(fa->fa_state) & FA_S_ACCESSED))
+		WRITE_ONCE(fa->fa_state, READ_ONCE(fa->fa_state) | FA_S_ACCESSED);
+}
+
+/* Exported by fib_semantics.c */
+void fib_release_info(struct fib_info *);
+struct fib_info *fib_create_info(struct fib_config *cfg,
+				 struct netlink_ext_ack *extack);
+int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
+		 struct netlink_ext_ack *extack);
+bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+		  const struct fib_rt_info *fri, unsigned int flags);
+void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
+	       u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
+size_t fib_nlmsg_size(struct fib_info *fi);
+
+static inline void fib_result_assign(struct fib_result *res,
+				     struct fib_info *fi)
+{
+	/* no refcount is touched here; per the original note RCU is used */
+	res->nhc = fib_info_nhc(fi, 0);
+	res->fi = fi;
+}
+
+struct fib_prop {
+	int	error;	/* 0 or a negative errno, per route type (see fib_props[]) */
+	u8	scope;	/* RT_SCOPE_* value, per route type (see fib_props[]) */
+};
+
+extern const struct fib_prop fib_props[RTN_MAX + 1];
+
+#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c
new file mode 100644
index 0000000000..0e23ade744
--- /dev/null
+++ b/net/ipv4/fib_notifier.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/fib_notifier.h>
+#include <net/ip_fib.h>
+
+int call_fib4_notifier(struct notifier_block *nb,
+		       enum fib_event_type event_type,
+		       struct fib_notifier_info *info)
+{
+	info->family = AF_INET;	/* stamp the event as IPv4 before passing it to nb */
+	return call_fib_notifier(nb, event_type, info);
+}
+
+int call_fib4_notifiers(struct net *net, enum fib_event_type event_type,
+			struct fib_notifier_info *info)
+{
+	ASSERT_RTNL();
+	/* each call bumps the per-netns IPv4 FIB sequence counter */
+	net->ipv4.fib_seq++;
+	info->family = AF_INET;
+	return call_fib_notifiers(net, event_type, info);
+}
+
+static unsigned int fib4_seq_read(struct net *net)
+{
+	ASSERT_RTNL();
+	/* IPv4 rules sequence plus the counter bumped by call_fib4_notifiers() */
+	return fib4_rules_seq_read(net) + net->ipv4.fib_seq;
+}
+
+static int fib4_dump(struct net *net, struct notifier_block *nb,
+		     struct netlink_ext_ack *extack)
+{
+	int rc = fib4_rules_dump(net, nb, extack);
+
+	/* rules go first; fib_notify() only runs if that step returned 0 */
+	if (rc)
+		return rc;
+
+	return fib_notify(net, nb, extack);
+}
+
+static const struct fib_notifier_ops fib4_notifier_ops_template = {
+	.family		= AF_INET,
+	.fib_seq_read	= fib4_seq_read,	/* fib_seq + rules sequence */
+	.fib_dump	= fib4_dump,		/* rules first, then fib_notify() */
+	.owner		= THIS_MODULE,
+};
+
+int __net_init fib4_notifier_init(struct net *net)
+{
+	struct fib_notifier_ops *reg;
+
+	net->ipv4.fib_seq = 0;
+
+	reg = fib_notifier_ops_register(&fib4_notifier_ops_template, net);
+	if (IS_ERR(reg))
+		return PTR_ERR(reg);
+	/* remembered so that fib4_notifier_exit() can unregister it */
+	net->ipv4.notifier_ops = reg;
+	return 0;
+}
+
+void __net_exit fib4_notifier_exit(struct net *net)
+{
+	fib_notifier_ops_unregister(net->ipv4.notifier_ops); /* set by fib4_notifier_init() */
+}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
new file mode 100644
index 0000000000..513f475c6a
--- /dev/null
+++ b/net/ipv4/fib_rules.c
@@ -0,0 +1,436 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 Forwarding Information Base: policy rules.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *		Thomas Graf <tgraf@suug.ch>
+ *
+ * Fixes:
+ *		Rani Assaf	:	local_rule cannot be deleted
+ *		Marc Boucher	:	routing by fwmark
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/inetdevice.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <net/inet_dscp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/ip_fib.h>
+#include <net/nexthop.h>
+#include <net/fib_rules.h>
+#include <linux/indirect_call_wrapper.h>
+
+struct fib4_rule {
+	struct fib_rule		common;		/* first member: code casts fib_rule * to fib4_rule * */
+	u8			dst_len;
+	u8			src_len;
+	dscp_t			dscp;		/* derived from frh->tos in fib4_rule_configure() */
+	__be32			src;
+	__be32			srcmask;	/* inet_make_mask(src_len) */
+	__be32			dst;
+	__be32			dstmask;	/* inet_make_mask(dst_len) */
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	u32			tclassid;	/* taken from the FRA_FLOW attribute */
+#endif
+};
+
+static bool fib4_rule_matchall(const struct fib_rule *rule)
+{
+	struct fib4_rule *r4 = container_of(rule, struct fib4_rule, common);
+
+	/* any IPv4-specific selector rules out "match all" before the generic test */
+	return !r4->dst_len && !r4->src_len && !r4->dscp &&
+	       fib_rule_matchall(rule);
+}
+
+bool fib4_rule_default(const struct fib_rule *rule)
+{
+	/* must be a match-all, non-l3mdev rule whose action is a table lookup */
+	if (rule->l3mdev || rule->action != FR_ACT_TO_TBL ||
+	    !fib4_rule_matchall(rule))
+		return false;
+	/* and it has to point at the local, main or default table */
+	return rule->table == RT_TABLE_LOCAL || rule->table == RT_TABLE_MAIN ||
+	       rule->table == RT_TABLE_DEFAULT;
+}
+EXPORT_SYMBOL_GPL(fib4_rule_default);
+
+int fib4_rules_dump(struct net *net, struct notifier_block *nb,
+		    struct netlink_ext_ack *extack)
+{
+	return fib_rules_dump(net, nb, AF_INET, extack);	/* AF_INET rules only */
+}
+
+unsigned int fib4_rules_seq_read(struct net *net)
+{
+	return fib_rules_seq_read(net, AF_INET);	/* AF_INET rules only */
+}
+
+int __fib_lookup(struct net *net, struct flowi4 *flp,
+		 struct fib_result *res, unsigned int flags)
+{
+	struct fib_lookup_arg lookup = {
+		.result = res,
+		.flags = flags,
+	};
+	struct flowi *fl = flowi4_to_flowi(flp);
+	int rc;
+
+	/* update flow if oif or iif point to device enslaved to l3mdev */
+	l3mdev_update_flow(net, fl);
+
+	rc = fib_rules_lookup(net->ipv4.rules_ops, fl, 0, &lookup);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	/* classid comes from the rule recorded in the lookup argument, else 0 */
+	if (!lookup.rule)
+		res->tclassid = 0;
+	else
+		res->tclassid = ((struct fib4_rule *)lookup.rule)->tclassid;
+#endif
+
+	/* -ESRCH from the rules walk is reported as -ENETUNREACH */
+	return rc == -ESRCH ? -ENETUNREACH : rc;
+}
+EXPORT_SYMBOL_GPL(__fib_lookup);
+
+INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule,
+					     struct flowi *flp, int flags,
+					     struct fib_lookup_arg *arg)
+{
+	struct fib_table *table;
+	int rc = -EAGAIN;
+
+	/*
+	 * Only FR_ACT_TO_TBL consults a table.  Every other action ends
+	 * here and is translated directly into an errno.
+	 */
+	if (rule->action == FR_ACT_UNREACHABLE)
+		return -ENETUNREACH;
+
+	if (rule->action == FR_ACT_PROHIBIT)
+		return -EACCES;
+
+	/* FR_ACT_BLACKHOLE and any unrecognised action */
+	if (rule->action != FR_ACT_TO_TBL)
+		return -EINVAL;
+
+	rcu_read_lock();
+
+	table = fib_get_table(rule->fr_net, fib_rule_get_table(rule, arg));
+
+	/* no such table: fall through with the initial -EAGAIN */
+	if (table)
+		rc = fib_table_lookup(table, &flp->u.ip4,
+				      (struct fib_result *)arg->result,
+				      arg->flags);
+
+	rcu_read_unlock();
+
+	return rc;
+}
+
+INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule,
+						int flags,
+						struct fib_lookup_arg *arg)
+{
+	struct fib_result *res = arg->result;
+	struct net_device *out_dev = NULL;
+	bool reject;
+
+	/* device of nexthop 0, when the lookup produced a fib_info */
+	if (res->fi)
+		out_dev = fib_info_nhc(res->fi, 0)->nhc_dev;
+
+	/* reject a route whose prefix length does not exceed suppress_prefixlen */
+	reject = res->prefixlen <= rule->suppress_prefixlen;
+
+	/*
+	 * ... or one that uses a device belonging to the interface group
+	 * the rule suppresses (-1 disables this check).
+	 */
+	if (!reject)
+		reject = rule->suppress_ifgroup != -1 && out_dev &&
+			 out_dev->group == rule->suppress_ifgroup;
+
+	if (!reject)
+		return false;
+
+	/* drop the fib_info reference unless the lookup ran with FIB_LOOKUP_NOREF */
+	if (!(arg->flags & FIB_LOOKUP_NOREF))
+		fib_info_put(res->fi);
+
+	return true;
+}
+
+INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
+					    struct flowi *fl, int flags)
+{
+	struct fib4_rule *r4 = (struct fib4_rule *) rule;
+	struct flowi4 *flow = &fl->u.ip4;
+
+	/* any differing bit under the rule's mask means the prefix misses */
+	if ((flow->saddr ^ r4->src) & r4->srcmask)
+		return 0;
+	if ((flow->daddr ^ r4->dst) & r4->dstmask)
+		return 0;
+
+	/* zero dscp / ip_proto in the rule are not compared at all */
+	if (r4->dscp && inet_dsfield_to_dscp(flow->flowi4_tos) != r4->dscp)
+		return 0;
+
+	if (rule->ip_proto && flow->flowi4_proto != rule->ip_proto)
+		return 0;
+
+	if (fib_rule_port_range_set(&rule->sport_range) &&
+	    !fib_rule_port_inrange(&rule->sport_range, flow->fl4_sport))
+		return 0;
+
+	if (fib_rule_port_range_set(&rule->dport_range) &&
+	    !fib_rule_port_inrange(&rule->dport_range, flow->fl4_dport))
+		return 0;
+	return 1;
+}
+
+static struct fib_table *fib_empty_table(struct net *net)
+{
+	u32 id = 0;
+
+	/* probe ids 1..RT_TABLE_MAX in order; the first unused id gets a table */
+	do {
+		id++;
+		if (!fib_get_table(net, id))
+			return fib_new_table(net, id);
+	} while (id != RT_TABLE_MAX);
+
+	return NULL;
+}
+
+static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+			       struct fib_rule_hdr *frh,
+			       struct nlattr **tb,
+			       struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	int err = -EINVAL;	/* returned as-is by the two tos checks below */
+	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+	if (!inet_validate_dscp(frh->tos)) {
+		NL_SET_ERR_MSG(extack,
+			       "Invalid dsfield (tos): ECN bits must be 0");
+		goto errout;
+	}
+	/* IPv4 currently doesn't handle high order DSCP bits correctly */
+	if (frh->tos & ~IPTOS_TOS_MASK) {
+		NL_SET_ERR_MSG(extack, "Invalid tos");
+		goto errout;
+	}
+	rule4->dscp = inet_dsfield_to_dscp(frh->tos);
+
+	/* split local/main if they are not already split */
+	err = fib_unmerge(net);
+	if (err)
+		goto errout;
+	/* table lookup rule without a table and not l3mdev: pick an unused table */
+	if (rule->table == RT_TABLE_UNSPEC && !rule->l3mdev) {
+		if (rule->action == FR_ACT_TO_TBL) {
+			struct fib_table *table;
+
+			table = fib_empty_table(net);
+			if (!table) {
+				err = -ENOBUFS;
+				goto errout;
+			}
+
+			rule->table = table->tb_id;
+		}
+	}
+	/* NOTE(review): assumes tb[FRA_SRC]/tb[FRA_DST] are set when the length is non-zero */
+	if (frh->src_len)
+		rule4->src = nla_get_in_addr(tb[FRA_SRC]);
+
+	if (frh->dst_len)
+		rule4->dst = nla_get_in_addr(tb[FRA_DST]);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (tb[FRA_FLOW]) {
+		rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
+		if (rule4->tclassid)	/* fib4_rule_delete() drops this count again */
+			atomic_inc(&net->ipv4.fib_num_tclassid_users);
+	}
+#endif
+	/* counted here, decremented again in fib4_rule_delete() */
+	if (fib_rule_requires_fldissect(rule))
+		net->ipv4.fib_rules_require_fldissect++;
+
+	rule4->src_len = frh->src_len;
+	rule4->srcmask = inet_make_mask(rule4->src_len);
+	rule4->dst_len = frh->dst_len;
+	rule4->dstmask = inet_make_mask(rule4->dst_len);
+	/* fib4_rules_init() starts this flag at false; any configured rule sets it */
+	net->ipv4.fib_has_custom_rules = true;
+
+	err = 0;
+errout:
+	return err;
+}
+
+static int fib4_rule_delete(struct fib_rule *rule)
+{
+	struct net *net = rule->fr_net;
+	int err;
+
+	/* split local/main if they are not already split */
+	err = fib_unmerge(net);
+	if (err)
+		goto errout;
+	/* undo the classid user count taken in fib4_rule_configure() */
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (((struct fib4_rule *)rule)->tclassid)
+		atomic_dec(&net->ipv4.fib_num_tclassid_users);
+#endif
+	net->ipv4.fib_has_custom_rules = true;	/* set on delete as well as on add */
+	/* guarded decrement: only when the counter is currently non-zero */
+	if (net->ipv4.fib_rules_require_fldissect &&
+	    fib_rule_requires_fldissect(rule))
+		net->ipv4.fib_rules_require_fldissect--;
+errout:
+	return err;
+}
+
+static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+			     struct nlattr **tb)
+{
+	struct fib4_rule *r4 = (struct fib4_rule *) rule;
+
+	/* a zero field (or absent attribute) in the request is not compared */
+	if (frh->src_len && frh->src_len != r4->src_len)
+		return 0;
+	if (frh->dst_len && frh->dst_len != r4->dst_len)
+		return 0;
+	if (frh->tos && frh->tos != inet_dscp_to_dsfield(r4->dscp))
+		return 0;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (tb[FRA_FLOW] && nla_get_u32(tb[FRA_FLOW]) != r4->tclassid)
+		return 0;
+#endif
+
+	/* prefix lengths agreed above; now compare the addresses themselves */
+	if (frh->src_len && nla_get_in_addr(tb[FRA_SRC]) != r4->src)
+		return 0;
+
+	if (frh->dst_len && nla_get_in_addr(tb[FRA_DST]) != r4->dst)
+		return 0;
+
+	return 1;
+}
+
+static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+			  struct fib_rule_hdr *frh)
+{
+	struct fib4_rule *r4 = (struct fib4_rule *) rule;
+
+	frh->tos = inet_dscp_to_dsfield(r4->dscp);
+	frh->src_len = r4->src_len;
+	frh->dst_len = r4->dst_len;
+
+	/* addresses are only emitted for non-zero prefix lengths, dst first */
+	if (r4->dst_len && nla_put_in_addr(skb, FRA_DST, r4->dst))
+		return -ENOBUFS;
+
+	if (r4->src_len && nla_put_in_addr(skb, FRA_SRC, r4->src))
+		return -ENOBUFS;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	/* a zero classid is left out of the message */
+	if (r4->tclassid && nla_put_u32(skb, FRA_FLOW, r4->tclassid))
+		return -ENOBUFS;
+#endif
+
+	return 0;
+}
+
+static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
+{
+	size_t one_attr = nla_total_size(4);
+	/* one 4-byte attribute each for dst, src and flow */
+	return one_attr + one_attr + one_attr;
+}
+
+static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
+{
+	rt_cache_flush(ops->fro_net);	/* flush for the namespace these ops belong to */
+}
+
+static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
+	.family		= AF_INET,
+	.rule_size	= sizeof(struct fib4_rule),	/* fib_rule plus the IPv4 fields */
+	.addr_size	= sizeof(u32),
+	.action		= fib4_rule_action,
+	.suppress	= fib4_rule_suppress,
+	.match		= fib4_rule_match,
+	.configure	= fib4_rule_configure,
+	.delete		= fib4_rule_delete,
+	.compare	= fib4_rule_compare,
+	.fill		= fib4_rule_fill,
+	.nlmsg_payload	= fib4_rule_nlmsg_payload,
+	.flush_cache	= fib4_rule_flush_cache,
+	.nlgroup	= RTNLGRP_IPV4_RULE,
+	.owner		= THIS_MODULE,
+};
+
+static int fib_default_rules_init(struct fib_rules_ops *ops)
+{
+	int rc;
+
+	/* three built-in rules: local, main and default table, in that order; */
+	/* the second argument is presumably the rule preference - confirm */
+	rc = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
+	if (rc >= 0)
+		rc = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
+	if (rc >= 0)
+		rc = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
+
+	/* the first negative result stops the chain; success is always 0 */
+	return rc < 0 ? rc : 0;
+}
+
+int __net_init fib4_rules_init(struct net *net)
+{
+	struct fib_rules_ops *ops;
+	int err;
+
+	ops = fib_rules_register(&fib4_rules_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+
+	err = fib_default_rules_init(ops);
+	if (err < 0) {
+		/* also cleans all rules already added */
+		fib_rules_unregister(ops);
+		return err;
+	}
+
+	/* initial state: no custom rules, nothing needing flow dissection */
+	net->ipv4.fib_rules_require_fldissect = 0;
+	net->ipv4.fib_has_custom_rules = false;
+	net->ipv4.rules_ops = ops;
+	return 0;
+}
+
+void __net_exit fib4_rules_exit(struct net *net)
+{
+	fib_rules_unregister(net->ipv4.rules_ops);	/* ops stored by fib4_rules_init() */
+}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
new file mode 100644
index 0000000000..5eb1b8d302
--- /dev/null
+++ b/net/ipv4/fib_semantics.c
@@ -0,0 +1,2275 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 Forwarding Information Base: semantics.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/netlink.h>
+#include <linux/hash.h>
+#include <linux/nospec.h>
+
+#include <net/arp.h>
+#include <net/inet_dscp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/ip6_fib.h>
+#include <net/nexthop.h>
+#include <net/netlink.h>
+#include <net/rtnh.h>
+#include <net/lwtunnel.h>
+#include <net/fib_notifier.h>
+#include <net/addrconf.h>
+
+#include "fib_lookup.h"
+
+static DEFINE_SPINLOCK(fib_info_lock);	/* held in fib_release_info(), ip_fib_check_default() */
+static struct hlist_head *fib_info_hash;	/* buckets indexed by fib_info_hashfn*() */
+static struct hlist_head *fib_info_laddrhash;
+static unsigned int fib_info_hash_size;	/* (size - 1) is used as a bit mask */
+static unsigned int fib_info_hash_bits;
+static unsigned int fib_info_cnt;	/* updated with WRITE_ONCE() in fib_release_info() */
+
+#define DEVINDEX_HASHBITS 8
+#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
+static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];	/* see fib_info_devhash_bucket() */
+
+/* for_nexthops() and change_nexthops() are only used when a fib_info has
+ * no nexthop object set, so the code inside may reference fib_nh directly.
+ */
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+/* Read-only walk: opens a block that declares 'nhsel' and 'nh' */
+#define for_nexthops(fi) {						\
+	int nhsel; const struct fib_nh *nh;				\
+	for (nhsel = 0, nh = (fi)->fib_nh;				\
+	     nhsel < fib_info_num_path((fi));				\
+	     nh++, nhsel++)
+/* Writable walk: opens a block that declares 'nhsel' and 'nexthop_nh' */
+#define change_nexthops(fi) {						\
+	int nhsel; struct fib_nh *nexthop_nh;				\
+	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
+	     nhsel < fib_info_num_path((fi));				\
+	     nexthop_nh++, nhsel++)
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+/* The loop body runs exactly once; gcc is hoped to drop the dummy loop */
+
+#define for_nexthops(fi) {						\
+	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
+	for (nhsel = 0; nhsel < 1; nhsel++)
+
+#define change_nexthops(fi) {						\
+	int nhsel;							\
+	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
+	for (nhsel = 0; nhsel < 1; nhsel++)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+/* Closes the block opened by for_nexthops() / change_nexthops() */
+#define endfor_nexthops(fi) }
+
+
+const struct fib_prop fib_props[RTN_MAX + 1] = {	/* indexed by RTN_* route type */
+	[RTN_UNSPEC] = {
+		.error	= 0,
+		.scope	= RT_SCOPE_NOWHERE,
+	},
+	[RTN_UNICAST] = {
+		.error	= 0,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},
+	[RTN_LOCAL] = {
+		.error	= 0,
+		.scope	= RT_SCOPE_HOST,
+	},
+	[RTN_BROADCAST] = {
+		.error	= 0,
+		.scope	= RT_SCOPE_LINK,
+	},
+	[RTN_ANYCAST] = {
+		.error	= 0,
+		.scope	= RT_SCOPE_LINK,
+	},
+	[RTN_MULTICAST] = {
+		.error	= 0,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},
+	[RTN_BLACKHOLE] = {
+		.error	= -EINVAL,	/* same errno fib4_rule_action() uses for FR_ACT_BLACKHOLE */
+		.scope	= RT_SCOPE_UNIVERSE,
+	},
+	[RTN_UNREACHABLE] = {
+		.error	= -EHOSTUNREACH,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},
+	[RTN_PROHIBIT] = {
+		.error	= -EACCES,	/* same errno fib4_rule_action() uses for FR_ACT_PROHIBIT */
+		.scope	= RT_SCOPE_UNIVERSE,
+	},
+	[RTN_THROW] = {
+		.error	= -EAGAIN,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},
+	[RTN_NAT] = {
+		.error	= -EINVAL,
+		.scope	= RT_SCOPE_NOWHERE,
+	},
+	[RTN_XRESOLVE] = {
+		.error	= -EINVAL,
+		.scope	= RT_SCOPE_NOWHERE,
+	},
+};
+
+static void rt_fibinfo_free(struct rtable __rcu **rtp)
+{
+	struct rtable *cached;
+
+	cached = rcu_dereference_protected(*rtp, 1);
+
+	/* The slot itself is left untouched: as the original note puts it,
+	 * RCU_INIT_POINTER(*rtp, NULL) is not needed because an RCU grace
+	 * period has passed before free_fib_info_rcu() runs.
+	 */
+	if (cached) {
+		dst_dev_put(&cached->dst);
+		dst_release_immediate(&cached->dst);
+	}
+}
+
+static void free_nh_exceptions(struct fib_nh_common *nhc)
+{
+	struct fib_nh_exception *cur, *next;
+	struct fnhe_hash_bucket *buckets;
+	int b;
+
+	buckets = rcu_dereference_protected(nhc->nhc_exceptions, 1);
+	if (!buckets)
+		return;
+
+	/*
+	 * Walk every bucket chain, releasing both cached routes of each
+	 * exception before the exception itself.
+	 */
+	for (b = 0; b < FNHE_HASH_SIZE; b++) {
+		for (cur = rcu_dereference_protected(buckets[b].chain, 1);
+		     cur; cur = next) {
+			/* read the link before 'cur' is freed */
+			next = rcu_dereference_protected(cur->fnhe_next, 1);
+
+			rt_fibinfo_free(&cur->fnhe_rth_input);
+			rt_fibinfo_free(&cur->fnhe_rth_output);
+			kfree(cur);
+		}
+	}
+	kfree(buckets);
+}
+
+static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
+{
+	int cpu;
+
+	if (!rtp)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct rtable *cached =
+			rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
+
+		if (!cached)
+			continue;
+		dst_dev_put(&cached->dst);
+		dst_release_immediate(&cached->dst);
+	}
+	free_percpu(rtp);
+}
+
+void fib_nh_common_release(struct fib_nh_common *nhc)
+{
+	netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker);
+	lwtstate_put(nhc->nhc_lwtstate);
+	rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);	/* per-cpu slots, then the percpu area */
+	rt_fibinfo_free(&nhc->nhc_rth_input);		/* single input-route slot */
+	free_nh_exceptions(nhc);			/* exception hash and its entries */
+}
+EXPORT_SYMBOL_GPL(fib_nh_common_release);
+
+void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
+{
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (fib_nh->nh_tclassid)	/* mirrors the atomic_inc() in fib_nh_init() */
+		atomic_dec(&net->ipv4.fib_num_tclassid_users);
+#endif
+	fib_nh_common_release(&fib_nh->nh_common);
+}
+
+/* RCU callback: release nexthop state, metrics and memory of a fib_info */
+static void free_fib_info_rcu(struct rcu_head *head)
+{
+	struct fib_info *info = container_of(head, struct fib_info, rcu);
+
+	if (!info->nh) {
+		/* no nexthop object: release each embedded fib_nh */
+		change_nexthops(info) {
+			fib_nh_release(info->fib_net, nexthop_nh);
+		} endfor_nexthops(info);
+	} else {
+		nexthop_put(info->nh);
+	}
+
+	ip_fib_metrics_put(info->fib_metrics);
+	kfree(info);
+}
+
+void free_fib_info(struct fib_info *fi)
+{
+	/* only a fib_info already marked dead is queued for RCU freeing */
+	if (fi->fib_dead) {
+		call_rcu(&fi->rcu, free_fib_info_rcu);
+		return;
+	}
+	pr_warn("Freeing alive fib_info %p\n", fi);
+}
+EXPORT_SYMBOL_GPL(free_fib_info);
+
+void fib_release_info(struct fib_info *fi)
+{
+	spin_lock_bh(&fib_info_lock);	/* taken even when fi is NULL */
+	if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
+		hlist_del(&fi->fib_hash);	/* last tree reference: start unlinking */
+
+		/* Paired with READ_ONCE() in fib_create_info(). */
+		WRITE_ONCE(fib_info_cnt, fib_info_cnt - 1);
+		/* fib_lhash linkage is only undone when a preferred source is set */
+		if (fi->fib_prefsrc)
+			hlist_del(&fi->fib_lhash);
+		if (fi->nh) {
+			list_del(&fi->nh_list);
+		} else {
+			change_nexthops(fi) {
+				if (!nexthop_nh->fib_nh_dev)
+					continue;	/* no device: nh_hash is not unlinked */
+				hlist_del(&nexthop_nh->nh_hash);
+			} endfor_nexthops(fi)
+		}
+		/* Paired with READ_ONCE() from fib_table_lookup() */
+		WRITE_ONCE(fi->fib_dead, 1);
+		fib_info_put(fi);	/* free_fib_info() refuses a fib_info with fib_dead == 0 */
+	}
+	spin_unlock_bh(&fib_info_lock);
+}
+
+static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi)
+{
+	const struct fib_nh *onh;
+
+	if (fi->nh || ofi->nh)	/* either side uses a nexthop object: compare those only */
+		return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1;
+	/* nothing to compare when ofi carries no embedded nexthops */
+	if (ofi->fib_nhs == 0)
+		return 0;
+	/* pairwise comparison by index; any differing attribute returns -1 */
+	for_nexthops(fi) {
+		onh = fib_info_nh(ofi, nhsel);
+
+		if (nh->fib_nh_oif != onh->fib_nh_oif ||
+		    nh->fib_nh_gw_family != onh->fib_nh_gw_family ||
+		    nh->fib_nh_scope != onh->fib_nh_scope ||
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		    nh->fib_nh_weight != onh->fib_nh_weight ||
+#endif
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		    nh->nh_tclassid != onh->nh_tclassid ||
+#endif
+		    lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) ||
+		    ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK))
+			return -1;
+		/* gateway addresses are compared according to the address family */
+		if (nh->fib_nh_gw_family == AF_INET &&
+		    nh->fib_nh_gw4 != onh->fib_nh_gw4)
+			return -1;
+
+		if (nh->fib_nh_gw_family == AF_INET6 &&
+		    ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6))
+			return -1;
+	} endfor_nexthops(fi);
+	return 0;	/* 0: no difference found; -1 above: mismatch */
+}
+
+static inline unsigned int fib_devindex_hashfn(unsigned int val)
+{
+	return hash_32(val, DEVINDEX_HASHBITS);	/* result indexes fib_info_devhash[] */
+}
+
+static struct hlist_head *
+fib_info_devhash_bucket(const struct net_device *dev)
+{
+	u32 key = dev->ifindex ^ net_hash_mix(dev_net(dev));
+	/* key combines the device's ifindex with its namespace hash mix */
+	return &fib_info_devhash[fib_devindex_hashfn(key)];
+}
+
+static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
+				      u32 prefsrc, u32 priority)
+{
+	unsigned int proto_scope = (protocol << 8) | scope; /* bits 8-15 | 0-7 */
+	unsigned int seed = init_val;
+
+	/* Plain XOR fold of every input; reduction to a bucket index is
+	 * left to fib_info_hashfn_result().
+	 */
+	return seed ^ proto_scope ^ prefsrc ^ priority;
+}
+
+static unsigned int fib_info_hashfn_result(unsigned int val)
+{
+	unsigned int folded = val ^ (val >> 7) ^ (val >> 12);
+	/* fib_info_hash_size - 1 is applied as a bit mask */
+	return folded & (fib_info_hash_size - 1);
+}
+
+static inline unsigned int fib_info_hashfn(struct fib_info *fi)
+{
+	unsigned int val;
+
+	val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol,
+				fi->fib_scope, (__force u32)fi->fib_prefsrc,
+				fi->fib_priority);
+	/* NOTE(review): matches fib_find_info_nh() for fi->nh only if fib_nhs is 0 */
+	if (fi->nh) {
+		val ^= fib_devindex_hashfn(fi->nh->id);
+	} else {
+		for_nexthops(fi) {
+			val ^= fib_devindex_hashfn(nh->fib_nh_oif);
+		} endfor_nexthops(fi)
+	}
+	/* reduce to an index into fib_info_hash[] */
+	return fib_info_hashfn_result(val);
+}
+
+/* Look up a fib_info by nexthop object id; metrics are not compared */
+static struct fib_info *fib_find_info_nh(struct net *net,
+					 const struct fib_config *cfg)
+{
+	struct fib_info *cand;
+	unsigned int key;
+
+	key = fib_info_hashfn_1(fib_devindex_hashfn(cfg->fc_nh_id),
+				cfg->fc_protocol, cfg->fc_scope,
+				(__force u32)cfg->fc_prefsrc,
+				cfg->fc_priority);
+	key = fib_info_hashfn_result(key);
+
+	/* first entry in the bucket that passes every filter wins */
+	hlist_for_each_entry(cand, &fib_info_hash[key], fib_hash) {
+		if (!net_eq(cand->fib_net, net))
+			continue;
+		if (!cand->nh || cand->nh->id != cfg->fc_nh_id)
+			continue;
+		if (cfg->fc_protocol != cand->fib_protocol ||
+		    cfg->fc_scope != cand->fib_scope ||
+		    cfg->fc_prefsrc != cand->fib_prefsrc ||
+		    cfg->fc_priority != cand->fib_priority ||
+		    cfg->fc_type != cand->fib_type ||
+		    cfg->fc_table != cand->fib_tb_id ||
+		    ((cfg->fc_flags ^ cand->fib_flags) & ~RTNH_COMPARE_MASK))
+			continue;
+		return cand;
+	}
+
+	return NULL;
+}
+
+static struct fib_info *fib_find_info(struct fib_info *nfi)
+{
+	struct hlist_head *head;
+	struct fib_info *fi;
+	unsigned int hash;
+
+	hash = fib_info_hashfn(nfi);
+	head = &fib_info_hash[hash];
+	/* return the first entry in nfi's bucket that is equivalent to nfi */
+	hlist_for_each_entry(fi, head, fib_hash) {
+		if (!net_eq(fi->fib_net, nfi->fib_net))
+			continue;
+		if (fi->fib_nhs != nfi->fib_nhs)
+			continue;
+		if (nfi->fib_protocol == fi->fib_protocol &&
+		    nfi->fib_scope == fi->fib_scope &&
+		    nfi->fib_prefsrc == fi->fib_prefsrc &&
+		    nfi->fib_priority == fi->fib_priority &&
+		    nfi->fib_type == fi->fib_type &&
+		    nfi->fib_tb_id == fi->fib_tb_id &&
+		    memcmp(nfi->fib_metrics, fi->fib_metrics,
+			   sizeof(u32) * RTAX_MAX) == 0 &&
+		    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
+		    nh_comp(fi, nfi) == 0)	/* nexthops compared last */
+			return fi;
+	}
+
+	return NULL;
+}
+
+/* Check whether gw is already configured as the IPv4 gateway of a nexthop
+ * on dev: 0 if so, -1 otherwise.  Used only by redirect accept routine.
+ */
+int ip_fib_check_default(__be32 gw, struct net_device *dev)
+{
+	struct fib_nh *nh;
+	int ret = -1;
+
+	spin_lock(&fib_info_lock);
+
+	/* scan the device's hash bucket, ignoring nexthops flagged RTNH_F_DEAD */
+	hlist_for_each_entry(nh, fib_info_devhash_bucket(dev), nh_hash) {
+		if (nh->fib_nh_dev != dev || nh->fib_nh_gw4 != gw)
+			continue;
+		if (nh->fib_nh_flags & RTNH_F_DEAD)
+			continue;
+		/* first hit is enough */
+		ret = 0;
+		break;
+	}
+
+	spin_unlock(&fib_info_lock);
+
+	return ret;
+}
+
+size_t fib_nlmsg_size(struct fib_info *fi)
+{
+	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
+			 + nla_total_size(4) /* RTA_TABLE */
+			 + nla_total_size(4) /* RTA_DST */
+			 + nla_total_size(4) /* RTA_PRIORITY */
+			 + nla_total_size(4) /* RTA_PREFSRC */
+			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
+	unsigned int nhs = fib_info_num_path(fi);
+
+	/* space for nested metrics */
+	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
+
+	if (fi->nh)
+		payload += nla_total_size(4); /* RTA_NH_ID */
+
+	if (nhs) {
+		size_t nh_encapsize = 0;
+		/* Also handles the special case nhs == 1 */
+
+		/* each nexthop is packed in an attribute */
+		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
+		unsigned int i;
+
+		/* may contain flow and gateway attribute */
+		nhsize += 2 * nla_total_size(4);
+
+		/* grab encap info */
+		for (i = 0; i < fib_info_num_path(fi); i++) {
+			struct fib_nh_common *nhc = fib_info_nhc(fi, i);
+
+			if (nhc->nhc_lwtstate) {
+				/* RTA_ENCAP */
+				nh_encapsize += lwtunnel_get_encap_size(
+						nhc->nhc_lwtstate);
+				/* RTA_ENCAP_TYPE */
+				nh_encapsize +=  nla_total_size(2);
+			}
+		}
+
+		/* all nexthops are packed in a nested attribute */
+		payload += nla_total_size((nhs * nhsize) + nh_encapsize);
+
+	}
+
+	return payload;
+}
+
+void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
+	       int dst_len, u32 tb_id, const struct nl_info *info,
+	       unsigned int nlm_flags)
+{
+	struct fib_rt_info fri;
+	struct sk_buff *skb;
+	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+	int err = -ENOBUFS;	/* reported if the skb allocation fails */
+	/* the message buffer is sized by fib_nlmsg_size() for this fib_info */
+	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
+	if (!skb)
+		goto errout;
+	/* copy the alias and route key into a fib_rt_info for fib_dump_info() */
+	fri.fi = fa->fa_info;
+	fri.tb_id = tb_id;
+	fri.dst = key;
+	fri.dst_len = dst_len;
+	fri.dscp = fa->fa_dscp;
+	fri.type = fa->fa_type;
+	fri.offload = READ_ONCE(fa->offload);
+	fri.trap = READ_ONCE(fa->trap);
+	fri.offload_failed = READ_ONCE(fa->offload_failed);
+	err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
+		    info->nlh, GFP_KERNEL);
+	return;		/* skb was handed to rtnl_notify(); it is not freed here */
+errout:
+	if (err < 0)	/* record the failure on the RTNLGRP_IPV4_ROUTE group */
+		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
+}
+
+static int fib_detect_death(struct fib_info *fi, int order,
+			    struct fib_info **last_resort, int *last_idx,
+			    int dflt)
+{
+	const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+	struct neighbour *n;
+	int state = NUD_NONE;
+	/* look up the neighbour entry of nexthop 0's gateway, per address family */
+	if (likely(nhc->nhc_gw_family == AF_INET))
+		n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev);
+	else if (nhc->nhc_gw_family == AF_INET6)
+		n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6,
+				 nhc->nhc_dev);
+	else
+		n = NULL;
+	/* no neighbour entry at all: return 0 without touching last_resort */
+	if (n) {
+		state = READ_ONCE(n->nud_state);
+		neigh_release(n);	/* only the state snapshot is used below */
+	} else {
+		return 0;
+	}
+	if (state == NUD_REACHABLE)
+		return 0;
+	if ((state & NUD_VALID) && order != dflt)
+		return 0;
+	if ((state & NUD_VALID) ||
+	    (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
+		*last_resort = fi;	/* remember fi and its position as fallback */
+		*last_idx = order;
+	}
+	return 1;	/* 1 here, 0 on every earlier exit */
+}
+
+int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
+		       struct nlattr *encap, u16 encap_type,
+		       void *cfg, gfp_t gfp_flags,
+		       struct netlink_ext_ack *extack)
+{
+	int err;
+	/* per-cpu route slots; freed again by rt_fibinfo_free_cpus() */
+	nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *,
+						    gfp_flags);
+	if (!nhc->nhc_pcpu_rth_output)
+		return -ENOMEM;
+	/* encap state is only built when an encap attribute was supplied */
+	if (encap) {
+		struct lwtunnel_state *lwtstate;
+
+		if (encap_type == LWTUNNEL_ENCAP_NONE) {
+			NL_SET_ERR_MSG(extack, "LWT encap type not specified");
+			err = -EINVAL;
+			goto lwt_failure;
+		}
+		err = lwtunnel_build_state(net, encap_type, encap,
+					   nhc->nhc_family, cfg, &lwtstate,
+					   extack);
+		if (err)
+			goto lwt_failure;
+
+		nhc->nhc_lwtstate = lwtstate_get(lwtstate);
+	}
+
+	return 0;
+
+lwt_failure:
+	rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);	/* undo the allocation above */
+	nhc->nhc_pcpu_rth_output = NULL;
+	return err;
+}
+EXPORT_SYMBOL_GPL(fib_nh_common_init);
+
+int fib_nh_init(struct net *net, struct fib_nh *nh,
+		struct fib_config *cfg, int nh_weight,
+		struct netlink_ext_ack *extack)
+{
+	int err;
+
+	nh->fib_nh_family = AF_INET;
+	/* common part first; on failure nothing below has been touched */
+	err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap,
+				 cfg->fc_encap_type, cfg, GFP_KERNEL, extack);
+	if (err)
+		return err;
+	/* copy the nexthop description out of cfg */
+	nh->fib_nh_oif = cfg->fc_oif;
+	nh->fib_nh_gw_family = cfg->fc_gw_family;
+	if (cfg->fc_gw_family == AF_INET)
+		nh->fib_nh_gw4 = cfg->fc_gw4;
+	else if (cfg->fc_gw_family == AF_INET6)
+		nh->fib_nh_gw6 = cfg->fc_gw6;
+
+	nh->fib_nh_flags = cfg->fc_flags;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	nh->nh_tclassid = cfg->fc_flow;
+	if (nh->nh_tclassid)	/* fib_nh_release() performs the matching atomic_dec() */
+		atomic_inc(&net->ipv4.fib_num_tclassid_users);
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	nh->fib_nh_weight = nh_weight;
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
+			      struct netlink_ext_ack *extack)
+{
+	int count;
+
+	/* count records for as long as rtnh_ok() accepts the next one */
+	for (count = 0; rtnh_ok(rtnh, remaining); count++)
+		rtnh = rtnh_next(rtnh, &remaining);
+
+	/* bytes left over mean the nexthop configuration is invalid */
+	if (remaining > 0) {
+		NL_SET_ERR_MSG(extack,
+			       "Invalid nexthop configuration - extra data after nexthops");
+		/* the partial count is thrown away */
+		return 0;
+	}
+
+	return count;
+}
+
+static int fib_gw_from_attr(__be32 *gw, struct nlattr *nla,
+			    struct netlink_ext_ack *extack)
+{
+	/* the payload must be at least as large as the address read from it */
+	if (nla_len(nla) >= sizeof(*gw)) {
+		*gw = nla_get_in_addr(nla);
+		return 0;
+	}
+
+	NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_GATEWAY");
+	return -EINVAL;
+}
+
+/* only called when fib_nh is integrated into fib_info
+ *
+ * Initialize every nexthop embedded in @fi from the rtnexthop records that
+ * start at @rtnh and span @remaining bytes, then verify that the first
+ * nexthop does not contradict the route-level attributes in @cfg (oif,
+ * gateway and, with CONFIG_IP_ROUTE_CLASSID, the flow/class id).
+ *
+ * Returns 0 on success or a negative errno. The checks performed directly
+ * in this function set an extack message before failing.
+ */
+static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
+		       int remaining, struct fib_config *cfg,
+		       struct netlink_ext_ack *extack)
+{
+	struct net *net = fi->fib_net;
+	struct fib_config fib_cfg;
+	struct fib_nh *nh;
+	int ret;
+
+	change_nexthops(fi) {
+		int attrlen;
+
+		/* each nexthop is described by its own, freshly zeroed config */
+		memset(&fib_cfg, 0, sizeof(fib_cfg));
+
+		/* the record for this nexthop slot must pass rtnh_ok() */
+		if (!rtnh_ok(rtnh, remaining)) {
+			NL_SET_ERR_MSG(extack,
+				       "Invalid nexthop configuration - extra data after nexthop");
+			return -EINVAL;
+		}
+
+		if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
+			NL_SET_ERR_MSG(extack,
+				       "Invalid flags for nexthop - can not contain DEAD or LINKDOWN");
+			return -EINVAL;
+		}
+
+		/* route-level flags with the low 8 bits cleared, OR'ed with
+		 * the flags carried by this nexthop record
+		 */
+		fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
+		fib_cfg.fc_oif = rtnh->rtnh_ifindex;
+
+		attrlen = rtnh_attrlen(rtnh);
+		if (attrlen > 0) {
+			struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
+
+			/* RTA_GATEWAY and RTA_VIA are mutually exclusive */
+			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+			nlav = nla_find(attrs, attrlen, RTA_VIA);
+			if (nla && nlav) {
+				NL_SET_ERR_MSG(extack,
+					       "Nexthop configuration can not contain both GATEWAY and VIA");
+				return -EINVAL;
+			}
+			if (nla) {
+				ret = fib_gw_from_attr(&fib_cfg.fc_gw4, nla,
+						       extack);
+				if (ret)
+					goto errout;
+
+				/* a zero gateway leaves fc_gw_family unset */
+				if (fib_cfg.fc_gw4)
+					fib_cfg.fc_gw_family = AF_INET;
+			} else if (nlav) {
+				ret = fib_gw_from_via(&fib_cfg, nlav, extack);
+				if (ret)
+					goto errout;
+			}
+
+			nla = nla_find(attrs, attrlen, RTA_FLOW);
+			if (nla) {
+				if (nla_len(nla) < sizeof(u32)) {
+					NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
+					return -EINVAL;
+				}
+				fib_cfg.fc_flow = nla_get_u32(nla);
+			}
+
+			fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+			/* RTA_ENCAP_TYPE length checked in
+			 * lwtunnel_valid_encap_type_attr
+			 */
+			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+			if (nla)
+				fib_cfg.fc_encap_type = nla_get_u16(nla);
+		}
+
+		/* the weight handed to fib_nh_init() is rtnh_hops + 1 */
+		ret = fib_nh_init(net, nexthop_nh, &fib_cfg,
+				  rtnh->rtnh_hops + 1, extack);
+		if (ret)
+			goto errout;
+
+		rtnh = rtnh_next(rtnh, &remaining);
+	} endfor_nexthops(fi);
+
+	/* Route-level oif/gateway/flow, when given, must agree with the
+	 * first nexthop.
+	 */
+	ret = -EINVAL;
+	nh = fib_info_nh(fi, 0);
+	if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) {
+		NL_SET_ERR_MSG(extack,
+			       "Nexthop device index does not match RTA_OIF");
+		goto errout;
+	}
+	if (cfg->fc_gw_family) {
+		if (cfg->fc_gw_family != nh->fib_nh_gw_family ||
+		    (cfg->fc_gw_family == AF_INET &&
+		     nh->fib_nh_gw4 != cfg->fc_gw4) ||
+		    (cfg->fc_gw_family == AF_INET6 &&
+		     ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) {
+			NL_SET_ERR_MSG(extack,
+				       "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA");
+			goto errout;
+		}
+	}
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) {
+		NL_SET_ERR_MSG(extack,
+			       "Nexthop class id does not match RTA_FLOW");
+		goto errout;
+	}
+#endif
+	ret = 0;
+errout:
+	return ret;
+}
+
+/* only called when fib_nh is integrated into fib_info
+ *
+ * Recompute fib_nh_upper_bound for every nexthop of a multipath @fi.
+ * Nexthops that are dead, or link-down on a device for which
+ * ip_ignore_linkdown() is true, get an upper bound of -1 and do not
+ * contribute to the total weight. Every other nexthop gets the cumulative
+ * weight up to and including itself, scaled so that the full total maps to
+ * 2^31, minus one.
+ *
+ * Does nothing when @fi has fewer than two paths.
+ */
+static void fib_rebalance(struct fib_info *fi)
+{
+	int total;
+	int w;
+
+	if (fib_info_num_path(fi) < 2)
+		return;
+
+	/* first pass: sum the weights of the usable nexthops */
+	total = 0;
+	for_nexthops(fi) {
+		if (nh->fib_nh_flags & RTNH_F_DEAD)
+			continue;
+
+		if (ip_ignore_linkdown(nh->fib_nh_dev) &&
+		    nh->fib_nh_flags & RTNH_F_LINKDOWN)
+			continue;
+
+		total += nh->fib_nh_weight;
+	} endfor_nexthops(fi);
+
+	/* second pass: assign the upper bounds; the division is only
+	 * reached for nexthops that were counted into total above
+	 */
+	w = 0;
+	change_nexthops(fi) {
+		int upper_bound;
+
+		if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) {
+			upper_bound = -1;
+		} else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) &&
+			   nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
+			upper_bound = -1;
+		} else {
+			w += nexthop_nh->fib_nh_weight;
+			upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+							    total) - 1;
+		}
+
+		atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound);
+	} endfor_nexthops(fi);
+}
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+/* Variant used when CONFIG_IP_ROUTE_MULTIPATH is not set: multipath
+ * nexthop descriptions are always rejected with -EINVAL and an extack
+ * message.
+ */
+static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
+		       int remaining, struct fib_config *cfg,
+		       struct netlink_ext_ack *extack)
+{
+	NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel");
+
+	return -EINVAL;
+}
+
+/* without multipath support there is nothing to rebalance */
+#define fib_rebalance(fi) do { } while (0)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
+/* Compare the encap described by @encap_type/@encap with the lwtunnel
+ * state attached to @nh.
+ *
+ * A temporary lwtunnel state is built from the attributes, compared with
+ * lwtunnel_cmp_encap() and freed again. Returns the comparison result.
+ * Returns 0 when @encap_type is LWTUNNEL_ENCAP_NONE, and also when the
+ * temporary state cannot be built.
+ */
+static int fib_encap_match(struct net *net, u16 encap_type,
+			   struct nlattr *encap,
+			   const struct fib_nh *nh,
+			   const struct fib_config *cfg,
+			   struct netlink_ext_ack *extack)
+{
+	struct lwtunnel_state *new_lws;
+	int cmp;
+
+	if (encap_type == LWTUNNEL_ENCAP_NONE)
+		return 0;
+
+	/* a build failure is reported as 0, not as an error */
+	if (lwtunnel_build_state(net, encap_type, encap, AF_INET,
+				 cfg, &new_lws, extack))
+		return 0;
+
+	cmp = lwtunnel_cmp_encap(new_lws, nh->fib_nh_lws);
+	lwtstate_free(new_lws);
+
+	return cmp;
+}
+
+/* Check whether the nexthop description in @cfg matches @fi.
+ *
+ * Returns 0 when @fi matches, 1 when it does not, and a negative errno
+ * when a multipath record or one of its attributes in @cfg is malformed.
+ */
+int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
+		 struct netlink_ext_ack *extack)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	struct rtnexthop *rtnh;
+	int remaining;
+#endif
+
+	/* a priority of 0 in @cfg is not compared */
+	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
+		return 1;
+
+	/* a nexthop id in @cfg only matches a fib_info using that nexthop
+	 * object
+	 */
+	if (cfg->fc_nh_id) {
+		if (fi->nh && cfg->fc_nh_id == fi->nh->id)
+			return 0;
+		return 1;
+	}
+
+	/* fib_info uses a nexthop object: any explicit oif, gateway or
+	 * multipath description in @cfg is a mismatch
+	 */
+	if (fi->nh) {
+		if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp)
+			return 1;
+		return 0;
+	}
+
+	/* single nexthop given by oif and/or gateway: compare with the
+	 * first nexthop of @fi
+	 */
+	if (cfg->fc_oif || cfg->fc_gw_family) {
+		struct fib_nh *nh;
+
+		nh = fib_info_nh(fi, 0);
+		if (cfg->fc_encap) {
+			if (fib_encap_match(net, cfg->fc_encap_type,
+					    cfg->fc_encap, nh, cfg, extack))
+				return 1;
+		}
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		if (cfg->fc_flow &&
+		    cfg->fc_flow != nh->nh_tclassid)
+			return 1;
+#endif
+		if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) ||
+		    (cfg->fc_gw_family &&
+		     cfg->fc_gw_family != nh->fib_nh_gw_family))
+			return 1;
+
+		if (cfg->fc_gw_family == AF_INET &&
+		    cfg->fc_gw4 != nh->fib_nh_gw4)
+			return 1;
+
+		if (cfg->fc_gw_family == AF_INET6 &&
+		    ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6))
+			return 1;
+
+		return 0;
+	}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	/* no nexthop description at all in @cfg: treat as a match */
+	if (!cfg->fc_mp)
+		return 0;
+
+	rtnh = cfg->fc_mp;
+	remaining = cfg->fc_mp_len;
+
+	/* walk the nexthops of @fi and the records of @cfg in lock-step */
+	for_nexthops(fi) {
+		int attrlen;
+
+		if (!rtnh_ok(rtnh, remaining))
+			return -EINVAL;
+
+		/* an ifindex of 0 in the record is not compared */
+		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif)
+			return 1;
+
+		attrlen = rtnh_attrlen(rtnh);
+		if (attrlen > 0) {
+			struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
+			int err;
+
+			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+			nlav = nla_find(attrs, attrlen, RTA_VIA);
+			if (nla && nlav) {
+				NL_SET_ERR_MSG(extack,
+					       "Nexthop configuration can not contain both GATEWAY and VIA");
+				return -EINVAL;
+			}
+
+			if (nla) {
+				__be32 gw;
+
+				err = fib_gw_from_attr(&gw, nla, extack);
+				if (err)
+					return err;
+
+				if (nh->fib_nh_gw_family != AF_INET ||
+				    gw != nh->fib_nh_gw4)
+					return 1;
+			} else if (nlav) {
+				/* only the gateway fields of cfg2 are read
+				 * below; presumably fib_gw_from_via() fills
+				 * them on success - confirm
+				 */
+				struct fib_config cfg2;
+
+				err = fib_gw_from_via(&cfg2, nlav, extack);
+				if (err)
+					return err;
+
+				/* a nexthop without a gateway family is not
+				 * compared against RTA_VIA here
+				 */
+				switch (nh->fib_nh_gw_family) {
+				case AF_INET:
+					if (cfg2.fc_gw_family != AF_INET ||
+					    cfg2.fc_gw4 != nh->fib_nh_gw4)
+						return 1;
+					break;
+				case AF_INET6:
+					if (cfg2.fc_gw_family != AF_INET6 ||
+					    ipv6_addr_cmp(&cfg2.fc_gw6,
+							  &nh->fib_nh_gw6))
+						return 1;
+					break;
+				}
+			}
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+			nla = nla_find(attrs, attrlen, RTA_FLOW);
+			if (nla) {
+				if (nla_len(nla) < sizeof(u32)) {
+					NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
+					return -EINVAL;
+				}
+				if (nla_get_u32(nla) != nh->nh_tclassid)
+					return 1;
+			}
+#endif
+		}
+
+		rtnh = rtnh_next(rtnh, &remaining);
+	} endfor_nexthops(fi);
+#endif
+	return 0;
+}
+
+/* Check whether every metric listed in @cfg has the same value in @fi.
+ *
+ * Returns true when @cfg carries no metrics or when all listed metrics
+ * equal the ones stored in @fi; false on the first mismatch or on a
+ * malformed metric attribute.
+ */
+bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
+{
+	struct nlattr *nla;
+	int remaining;
+
+	if (!cfg->fc_mx)
+		return true;
+
+	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+		int type = nla_type(nla);
+		u32 fi_val, val;
+
+		/* attribute type 0 is skipped, types above RTAX_MAX fail */
+		if (!type)
+			continue;
+		if (type > RTAX_MAX)
+			return false;
+
+		/* NOTE(review): presumably guards the array index below
+		 * against speculative out-of-bounds use - confirm
+		 */
+		type = array_index_nospec(type, RTAX_MAX + 1);
+		if (type == RTAX_CC_ALGO) {
+			char tmp[TCP_CA_NAME_MAX];
+			bool ecn_ca = false;
+
+			/* the algorithm is given by name; compare by key */
+			nla_strscpy(tmp, nla, sizeof(tmp));
+			val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
+		} else {
+			/* every other metric must be exactly one u32 */
+			if (nla_len(nla) != sizeof(u32))
+				return false;
+			val = nla_get_u32(nla);
+		}
+
+		/* metrics[] is indexed by type - 1 */
+		fi_val = fi->fib_metrics->metrics[type - 1];
+		/* DST_FEATURE_ECN_CA in the stored value is not compared */
+		if (type == RTAX_FEATURES)
+			fi_val &= ~DST_FEATURE_ECN_CA;
+
+		if (fi_val != val)
+			return false;
+	}
+
+	return true;
+}
+
+/* Validate an IPv4 nexthop that uses an IPv6 gateway.
+ *
+ * A temporary fib6_nh is initialized through ipv6_stub->fib6_nh_init()
+ * from the nexthop's flags, oif and gateway. On success the device it
+ * picked is copied into @nh (with a tracked reference), the oif is set to
+ * that device's ifindex, the scope is set to RT_SCOPE_LINK and the
+ * temporary fib6_nh is released.
+ *
+ * Returns 0 on success or the error from fib6_nh_init().
+ */
+static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh,
+			      u32 table, struct netlink_ext_ack *extack)
+{
+	struct fib6_config cfg6 = {
+		.fc_table = table,
+		.fc_flags = nh->fib_nh_flags | RTF_GATEWAY,
+		.fc_ifindex = nh->fib_nh_oif,
+		.fc_gateway = nh->fib_nh_gw6,
+	};
+	struct fib6_nh tmp_nh = {};
+	int rc;
+
+	rc = ipv6_stub->fib6_nh_init(net, &tmp_nh, &cfg6, GFP_KERNEL, extack);
+	if (rc)
+		return rc;
+
+	nh->fib_nh_dev = tmp_nh.fib_nh_dev;
+	netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL);
+	nh->fib_nh_oif = nh->fib_nh_dev->ifindex;
+	nh->fib_nh_scope = RT_SCOPE_LINK;
+
+	ipv6_stub->fib6_nh_release(&tmp_nh);
+
+	return 0;
+}
+
+/*
+ * Picture
+ * -------
+ *
+ * Semantics of nexthop are very messy for historical reasons.
+ * We have to take into account, that:
+ * a) gateway can be actually local interface address,
+ *    so that gatewayed route is direct.
+ * b) gateway must be on-link address, possibly
+ *    described not by an ifaddr, but also by a direct route.
+ * c) If both gateway and interface are specified, they should not
+ *    contradict.
+ * d) If we use tunnel routes, gateway could be not on-link.
+ *
+ * Attempt to reconcile all of these (alas, self-contradictory) conditions
+ * results in pretty ugly and hairy code with obscure logic.
+ *
+ * I chose to generalize it instead, so that the size
+ * of the code practically does not increase, but it becomes
+ * much more general.
+ * Every prefix is assigned a "scope" value: "host" is local address,
+ * "link" is direct route,
+ * [ ... "site" ... "interior" ... ]
+ * and "universe" is true gateway route with global meaning.
+ *
+ * Every prefix refers to a set of "nexthop"s (gw, oif),
+ * where gw must have narrower scope. This recursion stops
+ * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ * which means that gw is forced to be on link.
+ *
+ * Code is still hairy, but now it is apparently logically
+ * consistent and very flexible. E.g. as a by-product it allows
+ * independent exterior and interior routing processes to
+ * co-exist in peace.
+ *
+ * Normally it looks as follows.
+ *
+ * {universe prefix}  -> (gw, oif) [scope link]
+ *		  |
+ *		  |-> {link prefix} -> (gw, oif) [scope local]
+ *					|
+ *					|-> {local prefix} (terminal node)
+ */
+/* Validate an IPv4 nexthop with an IPv4 gateway and fill in its device,
+ * oif and scope.
+ *
+ * With RTNH_F_ONLINK the device is taken from the nexthop's oif and the
+ * gateway only has to be a unicast address with respect to that device.
+ * Otherwise the gateway is looked up in the FIB and the device, oif and
+ * scope come from the lookup result.
+ *
+ * Returns 0 on success or a negative errno. On the paths that reach
+ * netdev_hold() a tracked device reference is stored in @nh; that includes
+ * the -ENETDOWN return of the lookup path.
+ */
+static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
+			      u8 scope, struct netlink_ext_ack *extack)
+{
+	struct net_device *dev;
+	struct fib_result res;
+	int err = 0;
+
+	if (nh->fib_nh_flags & RTNH_F_ONLINK) {
+		unsigned int addr_type;
+
+		if (scope >= RT_SCOPE_LINK) {
+			NL_SET_ERR_MSG(extack, "Nexthop has invalid scope");
+			return -EINVAL;
+		}
+		dev = __dev_get_by_index(net, nh->fib_nh_oif);
+		if (!dev) {
+			NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
+			return -ENODEV;
+		}
+		if (!(dev->flags & IFF_UP)) {
+			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+			return -ENETDOWN;
+		}
+		addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4);
+		if (addr_type != RTN_UNICAST) {
+			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+			return -EINVAL;
+		}
+		/* carrier down is recorded on the nexthop, not rejected */
+		if (!netif_carrier_ok(dev))
+			nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+		nh->fib_nh_dev = dev;
+		netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+		nh->fib_nh_scope = RT_SCOPE_LINK;
+		return 0;
+	}
+	rcu_read_lock();
+	{
+		struct fib_table *tbl = NULL;
+		/* look up the gateway with a scope one above the route's */
+		struct flowi4 fl4 = {
+			.daddr = nh->fib_nh_gw4,
+			.flowi4_scope = scope + 1,
+			.flowi4_oif = nh->fib_nh_oif,
+			.flowi4_iif = LOOPBACK_IFINDEX,
+		};
+
+		/* It is not necessary, but requires a bit of thinking */
+		if (fl4.flowi4_scope < RT_SCOPE_LINK)
+			fl4.flowi4_scope = RT_SCOPE_LINK;
+
+		/* try the route's own table first, unless it is unset or
+		 * the main table
+		 */
+		if (table && table != RT_TABLE_MAIN)
+			tbl = fib_get_table(net, table);
+
+		if (tbl)
+			err = fib_table_lookup(tbl, &fl4, &res,
+					       FIB_LOOKUP_IGNORE_LINKSTATE |
+					       FIB_LOOKUP_NOREF);
+
+		/* on error or if no table given do full lookup. This
+		 * is needed for example when nexthops are in the local
+		 * table rather than the given table
+		 */
+		if (!tbl || err) {
+			err = fib_lookup(net, &fl4, &res,
+					 FIB_LOOKUP_IGNORE_LINKSTATE);
+		}
+
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+			goto out;
+		}
+	}
+
+	/* from here on, failures default to -EINVAL */
+	err = -EINVAL;
+	if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
+		NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+		goto out;
+	}
+	nh->fib_nh_scope = res.scope;
+	nh->fib_nh_oif = FIB_RES_OIF(res);
+	nh->fib_nh_dev = dev = FIB_RES_DEV(res);
+	if (!dev) {
+		NL_SET_ERR_MSG(extack,
+			       "No egress device for nexthop gateway");
+		goto out;
+	}
+	netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+	if (!netif_carrier_ok(dev))
+		nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+	/* the reference taken above is kept even when the device is down */
+	err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
+out:
+	rcu_read_unlock();
+	return err;
+}
+
+/* Validate an IPv4 nexthop that has no gateway.
+ *
+ * PERVASIVE and ONLINK flags are rejected. The device is resolved from the
+ * nexthop's oif and must be up; it is then stored in @nh with a tracked
+ * reference, the scope is set to RT_SCOPE_HOST and RTNH_F_LINKDOWN is set
+ * when the carrier is down.
+ *
+ * Returns 0 on success, -EINVAL for invalid flags, -ENODEV when no
+ * in_device is found for the oif, or -ENETDOWN when the device is not up.
+ */
+static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
+			      struct netlink_ext_ack *extack)
+{
+	struct net_device *dev;
+	struct in_device *idev;
+	int rc;
+
+	if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
+		NL_SET_ERR_MSG(extack,
+			       "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
+		return -EINVAL;
+	}
+
+	rcu_read_lock();
+
+	idev = inetdev_by_index(net, nh->fib_nh_oif);
+	if (!idev) {
+		rc = -ENODEV;
+		goto unlock;
+	}
+
+	dev = idev->dev;
+	if (!(dev->flags & IFF_UP)) {
+		NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
+		rc = -ENETDOWN;
+		goto unlock;
+	}
+
+	nh->fib_nh_dev = dev;
+	netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+	nh->fib_nh_scope = RT_SCOPE_HOST;
+	if (!netif_carrier_ok(dev))
+		nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+	rc = 0;
+unlock:
+	rcu_read_unlock();
+	return rc;
+}
+
+/* Validate the nexthop @nh, dispatching on its gateway family: IPv4
+ * gateway, IPv6 gateway, or no gateway for any other value.
+ *
+ * Returns the result of the selected helper.
+ */
+int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
+		 struct netlink_ext_ack *extack)
+{
+	switch (nh->fib_nh_gw_family) {
+	case AF_INET:
+		return fib_check_nh_v4_gw(net, nh, table, scope, extack);
+	case AF_INET6:
+		return fib_check_nh_v6_gw(net, nh, table, extack);
+	default:
+		return fib_check_nh_nongw(net, nh, extack);
+	}
+}
+
+/* Return the fib_info_laddrhash bucket for the address @val in @net.
+ * The slot is hash_32() of the per-net hash mix XORed with the address,
+ * reduced to fib_info_hash_bits bits.
+ */
+static struct hlist_head *
+fib_info_laddrhash_bucket(const struct net *net, __be32 val)
+{
+	u32 key = net_hash_mix(net) ^ (__force u32)val;
+
+	return &fib_info_laddrhash[hash_32(key, fib_info_hash_bits)];
+}
+
+/* Switch the global fib_info hash tables over to the newly allocated
+ * @new_info_hash and @new_laddrhash, each with @new_size buckets.
+ *
+ * All entries are re-hashed into the new tables under fib_info_lock; the
+ * old tables are freed after the lock is dropped.
+ */
+static void fib_info_hash_move(struct hlist_head *new_info_hash,
+			       struct hlist_head *new_laddrhash,
+			       unsigned int new_size)
+{
+	struct hlist_head *old_info_hash, *old_laddrhash;
+	unsigned int old_size = fib_info_hash_size;
+	unsigned int i;
+
+	spin_lock_bh(&fib_info_lock);
+	old_info_hash = fib_info_hash;
+	old_laddrhash = fib_info_laddrhash;
+	/* size and bits are updated before re-hashing; presumably so that
+	 * fib_info_hashfn() computes slots for the new size - confirm
+	 */
+	fib_info_hash_size = new_size;
+	fib_info_hash_bits = ilog2(new_size);
+
+	/* fib_info_hash still points at the old table during this loop */
+	for (i = 0; i < old_size; i++) {
+		struct hlist_head *head = &fib_info_hash[i];
+		struct hlist_node *n;
+		struct fib_info *fi;
+
+		/* entries are not unlinked from the old bucket first; the
+		 * old table is freed below
+		 */
+		hlist_for_each_entry_safe(fi, n, head, fib_hash) {
+			struct hlist_head *dest;
+			unsigned int new_hash;
+
+			new_hash = fib_info_hashfn(fi);
+			dest = &new_info_hash[new_hash];
+			hlist_add_head(&fi->fib_hash, dest);
+		}
+	}
+	fib_info_hash = new_info_hash;
+
+	/* switch the global first: fib_info_laddrhash_bucket() indexes
+	 * fib_info_laddrhash, so it now returns buckets of the new table
+	 */
+	fib_info_laddrhash = new_laddrhash;
+	for (i = 0; i < old_size; i++) {
+		struct hlist_head *lhead = &old_laddrhash[i];
+		struct hlist_node *n;
+		struct fib_info *fi;
+
+		hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
+			struct hlist_head *ldest;
+
+			ldest = fib_info_laddrhash_bucket(fi->fib_net,
+							  fi->fib_prefsrc);
+			hlist_add_head(&fi->fib_lhash, ldest);
+		}
+	}
+
+	spin_unlock_bh(&fib_info_lock);
+
+	kvfree(old_info_hash);
+	kvfree(old_laddrhash);
+}
+
+/* Select a source address for the nexthop @nhc at the given @scope.
+ *
+ * For an AF_INET nexthop the address is selected towards the IPv4 gateway
+ * and cached in the fib_nh together with the current dev_addr_genid of
+ * @net. For any other family the address is selected on the nexthop
+ * device and nothing is cached.
+ *
+ * Returns the selected address.
+ */
+__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
+				 unsigned char scope)
+{
+	if (nhc->nhc_family == AF_INET) {
+		struct fib_nh *nh = container_of(nhc, struct fib_nh,
+						 nh_common);
+		__be32 saddr;
+
+		saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4,
+					 scope);
+
+		WRITE_ONCE(nh->nh_saddr, saddr);
+		WRITE_ONCE(nh->nh_saddr_genid,
+			   atomic_read(&net->ipv4.dev_addr_genid));
+
+		return saddr;
+	}
+
+	return inet_select_addr(nhc->nhc_dev, 0, scope);
+}
+
+/* Return the preferred source address for the lookup result @res.
+ *
+ * The route's own fib_prefsrc wins when it is set. Otherwise, for an
+ * AF_INET nexthop, the cached nh_saddr is returned as long as its genid
+ * equals the current dev_addr_genid of @net. In every other case the
+ * address is (re)selected through fib_info_update_nhc_saddr().
+ */
+__be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
+{
+	struct fib_nh_common *nhc = res->nhc;
+
+	if (res->fi->fib_prefsrc)
+		return res->fi->fib_prefsrc;
+
+	if (nhc->nhc_family == AF_INET) {
+		struct fib_nh *nh;
+
+		nh = container_of(nhc, struct fib_nh, nh_common);
+		/* cached address is only used while the genid still matches */
+		if (READ_ONCE(nh->nh_saddr_genid) ==
+		    atomic_read(&net->ipv4.dev_addr_genid))
+			return READ_ONCE(nh->nh_saddr);
+	}
+
+	return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope);
+}
+
+/* Check whether @fib_prefsrc is acceptable as preferred source for @cfg.
+ *
+ * An RTN_LOCAL route with a non-zero destination equal to @fib_prefsrc is
+ * accepted without a lookup. Otherwise the address must be of type
+ * RTN_LOCAL in the route's table (RT_TABLE_LOCAL is used in place of
+ * RT_TABLE_MAIN) or, failing that, in RT_TABLE_LOCAL.
+ */
+static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
+{
+	u32 tb_id;
+	int addr_type;
+
+	if (cfg->fc_type == RTN_LOCAL && cfg->fc_dst &&
+	    fib_prefsrc == cfg->fc_dst)
+		return true;
+
+	tb_id = cfg->fc_table;
+	if (tb_id == RT_TABLE_MAIN)
+		tb_id = RT_TABLE_LOCAL;
+
+	addr_type = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+					 fib_prefsrc, tb_id);
+
+	/* second chance in the local table when another table was tried */
+	if (addr_type != RTN_LOCAL && tb_id != RT_TABLE_LOCAL)
+		addr_type = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+						 fib_prefsrc, RT_TABLE_LOCAL);
+
+	return addr_type == RTN_LOCAL;
+}
+
+/* Create a fib_info for the route described by @cfg, or reuse an
+ * equivalent existing one.
+ *
+ * The route type, scope and flags are validated, the global fib_info hash
+ * tables are grown when they are full, and the nexthop part is set up from
+ * either a nexthop object (@cfg->fc_nh_id), a multipath description
+ * (@cfg->fc_mp) or a single embedded nexthop. If fib_find_info_nh() or
+ * fib_find_info() finds an equivalent fib_info, its fib_treeref is
+ * incremented and it is returned instead of a new one. Otherwise the new
+ * fib_info is linked into the global hashes under fib_info_lock.
+ *
+ * Returns the fib_info on success or an ERR_PTR() on failure.
+ */
+struct fib_info *fib_create_info(struct fib_config *cfg,
+				 struct netlink_ext_ack *extack)
+{
+	int err;
+	struct fib_info *fi = NULL;
+	struct nexthop *nh = NULL;
+	struct fib_info *ofi;
+	int nhs = 1;
+	struct net *net = cfg->fc_nlinfo.nl_net;
+
+	if (cfg->fc_type > RTN_MAX)
+		goto err_inval;
+
+	/* Fast check to catch the most weird cases */
+	if (fib_props[cfg->fc_type].scope > cfg->fc_scope) {
+		NL_SET_ERR_MSG(extack, "Invalid scope");
+		goto err_inval;
+	}
+
+	if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
+		NL_SET_ERR_MSG(extack,
+			       "Invalid rtm_flags - can not contain DEAD or LINKDOWN");
+		goto err_inval;
+	}
+
+	if (cfg->fc_nh_id) {
+		/* without metrics an existing fib_info using this nexthop
+		 * object can be shared directly
+		 */
+		if (!cfg->fc_mx) {
+			fi = fib_find_info_nh(net, cfg);
+			if (fi) {
+				refcount_inc(&fi->fib_treeref);
+				return fi;
+			}
+		}
+
+		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+		if (!nh) {
+			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+			goto err_inval;
+		}
+		/* no embedded fib_nh is needed with a nexthop object */
+		nhs = 0;
+	}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (cfg->fc_mp) {
+		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack);
+		if (nhs == 0)
+			goto err_inval;
+	}
+#endif
+
+	err = -ENOBUFS;
+
+	/* Paired with WRITE_ONCE() in fib_release_info() */
+	if (READ_ONCE(fib_info_cnt) >= fib_info_hash_size) {
+		unsigned int new_size = fib_info_hash_size << 1;
+		struct hlist_head *new_info_hash;
+		struct hlist_head *new_laddrhash;
+		size_t bytes;
+
+		if (!new_size)
+			new_size = 16;
+		/* size by the bucket type, not by a pointer to it */
+		bytes = (size_t)new_size * sizeof(*new_info_hash);
+		new_info_hash = kvzalloc(bytes, GFP_KERNEL);
+		new_laddrhash = kvzalloc(bytes, GFP_KERNEL);
+		if (!new_info_hash || !new_laddrhash) {
+			kvfree(new_info_hash);
+			kvfree(new_laddrhash);
+		} else {
+			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
+		}
+		/* a failed resize is only fatal when there is no table yet */
+		if (!fib_info_hash_size)
+			goto failure;
+	}
+
+	fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
+	if (!fi)
+		goto failure;
+	/* fi was just zero-allocated, so fi->fib_net is still NULL here;
+	 * pass the namespace taken from @cfg instead
+	 */
+	fi->fib_metrics = ip_fib_metrics_init(net, cfg->fc_mx,
+					      cfg->fc_mx_len, extack);
+	if (IS_ERR(fi->fib_metrics)) {
+		err = PTR_ERR(fi->fib_metrics);
+		kfree(fi);
+		return ERR_PTR(err);
+	}
+
+	fi->fib_net = net;
+	fi->fib_protocol = cfg->fc_protocol;
+	fi->fib_scope = cfg->fc_scope;
+	fi->fib_flags = cfg->fc_flags;
+	fi->fib_priority = cfg->fc_priority;
+	fi->fib_prefsrc = cfg->fc_prefsrc;
+	fi->fib_type = cfg->fc_type;
+	fi->fib_tb_id = cfg->fc_table;
+
+	fi->fib_nhs = nhs;
+	if (nh) {
+		if (!nexthop_get(nh)) {
+			NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+			err = -EINVAL;
+		} else {
+			err = 0;
+			fi->nh = nh;
+		}
+	} else {
+		change_nexthops(fi) {
+			nexthop_nh->nh_parent = fi;
+		} endfor_nexthops(fi)
+
+		if (cfg->fc_mp)
+			err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg,
+					  extack);
+		else
+			err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
+	}
+
+	if (err != 0)
+		goto failure;
+
+	if (fib_props[cfg->fc_type].error) {
+		if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) {
+			NL_SET_ERR_MSG(extack,
+				       "Gateway, device and multipath can not be specified for this route type");
+			goto err_inval;
+		}
+		/* such route types skip the nexthop validation below */
+		goto link_it;
+	} else {
+		switch (cfg->fc_type) {
+		case RTN_UNICAST:
+		case RTN_LOCAL:
+		case RTN_BROADCAST:
+		case RTN_ANYCAST:
+		case RTN_MULTICAST:
+			break;
+		default:
+			NL_SET_ERR_MSG(extack, "Invalid route type");
+			goto err_inval;
+		}
+	}
+
+	if (cfg->fc_scope > RT_SCOPE_HOST) {
+		NL_SET_ERR_MSG(extack, "Invalid scope");
+		goto err_inval;
+	}
+
+	if (fi->nh) {
+		err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack);
+		if (err)
+			goto failure;
+	} else if (cfg->fc_scope == RT_SCOPE_HOST) {
+		/* named fnh so it does not shadow the nexthop object 'nh' */
+		struct fib_nh *fnh = fi->fib_nh;
+
+		/* Local address is added. */
+		if (nhs != 1) {
+			NL_SET_ERR_MSG(extack,
+				       "Route with host scope can not have multiple nexthops");
+			goto err_inval;
+		}
+		if (fnh->fib_nh_gw_family) {
+			NL_SET_ERR_MSG(extack,
+				       "Route with host scope can not have a gateway");
+			goto err_inval;
+		}
+		fnh->fib_nh_scope = RT_SCOPE_NOWHERE;
+		fnh->fib_nh_dev = dev_get_by_index(net, fnh->fib_nh_oif);
+		err = -ENODEV;
+		if (!fnh->fib_nh_dev)
+			goto failure;
+		netdev_tracker_alloc(fnh->fib_nh_dev, &fnh->fib_nh_dev_tracker,
+				     GFP_KERNEL);
+	} else {
+		int linkdown = 0;
+
+		change_nexthops(fi) {
+			err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh,
+					   cfg->fc_table, cfg->fc_scope,
+					   extack);
+			if (err != 0)
+				goto failure;
+			if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN)
+				linkdown++;
+		} endfor_nexthops(fi)
+		/* the route is link-down only if every nexthop is */
+		if (linkdown == fi->fib_nhs)
+			fi->fib_flags |= RTNH_F_LINKDOWN;
+	}
+
+	if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) {
+		NL_SET_ERR_MSG(extack, "Invalid prefsrc address");
+		goto err_inval;
+	}
+
+	if (!fi->nh) {
+		change_nexthops(fi) {
+			fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common,
+						  fi->fib_scope);
+			if (nexthop_nh->fib_nh_gw_family == AF_INET6)
+				fi->fib_nh_is_v6 = true;
+		} endfor_nexthops(fi)
+
+		fib_rebalance(fi);
+	}
+
+link_it:
+	/* reuse an equivalent fib_info and drop the one just built */
+	ofi = fib_find_info(fi);
+	if (ofi) {
+		/* fib_table_lookup() should not see @fi yet. */
+		fi->fib_dead = 1;
+		free_fib_info(fi);
+		refcount_inc(&ofi->fib_treeref);
+		return ofi;
+	}
+
+	refcount_set(&fi->fib_treeref, 1);
+	refcount_set(&fi->fib_clntref, 1);
+	spin_lock_bh(&fib_info_lock);
+	fib_info_cnt++;
+	hlist_add_head(&fi->fib_hash,
+		       &fib_info_hash[fib_info_hashfn(fi)]);
+	if (fi->fib_prefsrc) {
+		struct hlist_head *head;
+
+		head = fib_info_laddrhash_bucket(net, fi->fib_prefsrc);
+		hlist_add_head(&fi->fib_lhash, head);
+	}
+	if (fi->nh) {
+		list_add(&fi->nh_list, &nh->fi_list);
+	} else {
+		/* embedded nexthops with a device go into the device hash */
+		change_nexthops(fi) {
+			struct hlist_head *head;
+
+			if (!nexthop_nh->fib_nh_dev)
+				continue;
+			head = fib_info_devhash_bucket(nexthop_nh->fib_nh_dev);
+			hlist_add_head(&nexthop_nh->nh_hash, head);
+		} endfor_nexthops(fi)
+	}
+	spin_unlock_bh(&fib_info_lock);
+	return fi;
+
+err_inval:
+	err = -EINVAL;
+
+failure:
+	if (fi) {
+		/* fib_table_lookup() should not see @fi yet. */
+		fi->fib_dead = 1;
+		free_fib_info(fi);
+	}
+
+	return ERR_PTR(err);
+}
+
+/* Add the netlink attributes describing nexthop @nhc to @skb and
+ * accumulate its flags into *@flags.
+ *
+ * @rt_family is the address family of the route being dumped; it decides
+ * whether an IPv6 gateway is encoded as RTA_GATEWAY or RTA_VIA. RTA_OIF is
+ * omitted when @skip_oif is true or the nexthop has no device.
+ *
+ * Returns 0 on success or -EMSGSIZE when an attribute does not fit.
+ */
+int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
+		     u8 rt_family, unsigned char *flags, bool skip_oif)
+{
+	if (nhc->nhc_flags & RTNH_F_DEAD)
+		*flags |= RTNH_F_DEAD;
+
+	if (nhc->nhc_flags & RTNH_F_LINKDOWN) {
+		*flags |= RTNH_F_LINKDOWN;
+
+		/* a link-down nexthop is additionally reported as dead when
+		 * the per-family ignore-linkdown check is true for its device
+		 */
+		rcu_read_lock();
+		switch (nhc->nhc_family) {
+		case AF_INET:
+			if (ip_ignore_linkdown(nhc->nhc_dev))
+				*flags |= RTNH_F_DEAD;
+			break;
+		case AF_INET6:
+			if (ip6_ignore_linkdown(nhc->nhc_dev))
+				*flags |= RTNH_F_DEAD;
+			break;
+		}
+		rcu_read_unlock();
+	}
+
+	switch (nhc->nhc_gw_family) {
+	case AF_INET:
+		if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4))
+			goto nla_put_failure;
+		break;
+	case AF_INET6:
+		/* if gateway family does not match nexthop family
+		 * gateway is encoded as RTA_VIA
+		 */
+		if (rt_family != nhc->nhc_gw_family) {
+			int alen = sizeof(struct in6_addr);
+			struct nlattr *nla;
+			struct rtvia *via;
+
+			/* the 2 extra bytes presumably cover rtvia_family
+			 * ahead of the address - confirm against struct rtvia
+			 */
+			nla = nla_reserve(skb, RTA_VIA, alen + 2);
+			if (!nla)
+				goto nla_put_failure;
+
+			via = nla_data(nla);
+			via->rtvia_family = AF_INET6;
+			memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen);
+		} else if (nla_put_in6_addr(skb, RTA_GATEWAY,
+					    &nhc->nhc_gw.ipv6) < 0) {
+			goto nla_put_failure;
+		}
+		break;
+	}
+
+	/* only these nexthop flags are passed through unchanged */
+	*flags |= (nhc->nhc_flags &
+		   (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP));
+
+	if (!skip_oif && nhc->nhc_dev &&
+	    nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex))
+		goto nla_put_failure;
+
+	if (nhc->nhc_lwtstate &&
+	    lwtunnel_fill_encap(skb, nhc->nhc_lwtstate,
+				RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(fib_nexthop_info);
+
+#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6)
+/* Append one rtnexthop record for @nhc, followed by its attributes, to
+ * @skb.
+ *
+ * rtnh_hops is written as @nh_weight - 1, rtnh_ifindex is the nexthop
+ * device's ifindex (0 without a device), and RTA_FLOW is added when
+ * @nh_tclassid is non-zero.
+ *
+ * Returns 0 on success or -EMSGSIZE when @skb runs out of room.
+ */
+int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
+		    int nh_weight, u8 rt_family, u32 nh_tclassid)
+{
+	const struct net_device *dev = nhc->nhc_dev;
+	struct rtnexthop *rtnh;
+	unsigned char flags = 0;
+
+	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+	if (!rtnh)
+		goto nla_put_failure;
+
+	rtnh->rtnh_hops = nh_weight - 1;
+	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
+
+	/* RTA_OIF is skipped: the ifindex is already in the record header */
+	if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0)
+		goto nla_put_failure;
+
+	rtnh->rtnh_flags = flags;
+
+	if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid))
+		goto nla_put_failure;
+
+	/* length of rtnetlink header + attributes */
+	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(fib_add_nexthop);
+#endif
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+/* Add an RTA_MULTIPATH nest describing all nexthops of @fi to @skb.
+ *
+ * When @fi uses a nexthop object the nest is filled by
+ * nexthop_mpath_fill_node(); otherwise one record per embedded nexthop is
+ * added with fib_add_nexthop().
+ *
+ * Returns 0 on success or -EMSGSIZE when @skb runs out of room.
+ */
+static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
+{
+	struct nlattr *mp;
+
+	mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
+	if (!mp)
+		goto nla_put_failure;
+
+	if (unlikely(fi->nh)) {
+		if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0)
+			goto nla_put_failure;
+		goto mp_end;
+	}
+
+	for_nexthops(fi) {
+		/* class id stays 0 without CONFIG_IP_ROUTE_CLASSID */
+		u32 nh_tclassid = 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		nh_tclassid = nh->nh_tclassid;
+#endif
+		if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight,
+				    AF_INET, nh_tclassid) < 0)
+			goto nla_put_failure;
+	} endfor_nexthops(fi);
+
+mp_end:
+	nla_nest_end(skb, mp);
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+#else
+/* Without CONFIG_IP_ROUTE_MULTIPATH nothing is added to @skb. */
+static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
+{
+	return 0;
+}
+#endif
+
+/* Build one route netlink message for the route described by @fri in
+ * @skb.
+ *
+ * @portid, @seq, @event and @flags are passed to nlmsg_put() for the
+ * message header. The rtmsg header and the table, destination, priority,
+ * metrics, prefsrc and nexthop attributes are filled from @fri and its
+ * fib_info.
+ *
+ * Returns 0 on success. On lack of room the partially built message is
+ * cancelled and -EMSGSIZE is returned.
+ */
+int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
+		  const struct fib_rt_info *fri, unsigned int flags)
+{
+	unsigned int nhs = fib_info_num_path(fri->fi);
+	struct fib_info *fi = fri->fi;
+	u32 tb_id = fri->tb_id;
+	struct nlmsghdr *nlh;
+	struct rtmsg *rtm;
+
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	rtm = nlmsg_data(nlh);
+	rtm->rtm_family = AF_INET;
+	rtm->rtm_dst_len = fri->dst_len;
+	rtm->rtm_src_len = 0;
+	rtm->rtm_tos = inet_dscp_to_dsfield(fri->dscp);
+	/* table ids of 256 and above are reported as RT_TABLE_COMPAT in the
+	 * header; the full id always goes into RTA_TABLE
+	 */
+	if (tb_id < 256)
+		rtm->rtm_table = tb_id;
+	else
+		rtm->rtm_table = RT_TABLE_COMPAT;
+	if (nla_put_u32(skb, RTA_TABLE, tb_id))
+		goto nla_put_failure;
+	rtm->rtm_type = fri->type;
+	rtm->rtm_flags = fi->fib_flags;
+	rtm->rtm_scope = fi->fib_scope;
+	rtm->rtm_protocol = fi->fib_protocol;
+
+	if (rtm->rtm_dst_len &&
+	    nla_put_in_addr(skb, RTA_DST, fri->dst))
+		goto nla_put_failure;
+	if (fi->fib_priority &&
+	    nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
+		goto nla_put_failure;
+	if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0)
+		goto nla_put_failure;
+
+	if (fi->fib_prefsrc &&
+	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
+		goto nla_put_failure;
+
+	if (fi->nh) {
+		if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id))
+			goto nla_put_failure;
+		if (nexthop_is_blackhole(fi->nh))
+			rtm->rtm_type = RTN_BLACKHOLE;
+		/* with nexthop compat mode off, only the id is dumped */
+		if (!READ_ONCE(fi->fib_net->ipv4.sysctl_nexthop_compat_mode))
+			goto offload;
+	}
+
+	if (nhs == 1) {
+		const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+		/* nexthop flags; shadows the @flags parameter in this scope */
+		unsigned char flags = 0;
+
+		if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0)
+			goto nla_put_failure;
+
+		/* replaces the fib_flags value stored in rtm_flags above */
+		rtm->rtm_flags = flags;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		if (nhc->nhc_family == AF_INET) {
+			struct fib_nh *nh;
+
+			nh = container_of(nhc, struct fib_nh, nh_common);
+			if (nh->nh_tclassid &&
+			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
+				goto nla_put_failure;
+		}
+#endif
+	} else {
+		if (fib_add_multipath(skb, fi) < 0)
+			goto nla_put_failure;
+	}
+
+offload:
+	if (fri->offload)
+		rtm->rtm_flags |= RTM_F_OFFLOAD;
+	if (fri->trap)
+		rtm->rtm_flags |= RTM_F_TRAP;
+	if (fri->offload_failed)
+		rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Update FIB if:
+ * - local address disappeared -> we must delete all the entries
+ *   referring to it.
+ * - device went down -> we must shutdown all nexthops going via it.
+ */
+/* Mark as dead every fib_info in @dev's namespace and FIB table whose
+ * preferred source address is @local.
+ *
+ * The table is the device's l3mdev table, or RT_TABLE_MAIN when that is 0.
+ * Returns the number of fib_infos marked; 0 when the address hash does not
+ * exist yet or @local is 0.
+ */
+int fib_sync_down_addr(struct net_device *dev, __be32 local)
+{
+	int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
+	struct net *net = dev_net(dev);
+	struct hlist_head *bucket;
+	struct fib_info *fi;
+	int marked = 0;
+
+	if (!fib_info_laddrhash || local == 0)
+		return 0;
+
+	bucket = fib_info_laddrhash_bucket(net, local);
+	hlist_for_each_entry(fi, bucket, fib_lhash) {
+		/* the bucket is shared: filter by netns, table and address */
+		if (net_eq(fi->fib_net, net) &&
+		    fi->fib_tb_id == tb_id &&
+		    fi->fib_prefsrc == local) {
+			fi->fib_flags |= RTNH_F_DEAD;
+			fi->pfsrc_removed = true;
+			marked++;
+		}
+	}
+	return marked;
+}
+
+/* Send a nexthop add/delete event for @nh to the IPv4 FIB notifiers, but
+ * only when the nexthop's state fits the event:
+ *  - FIB_EVENT_NH_ADD is sent unless the nexthop is dead, or link-down on
+ *    a device for which ip_ignore_linkdown() is true;
+ *  - FIB_EVENT_NH_DEL is sent only in exactly those cases.
+ *
+ * Returns the notifier result, or NOTIFY_DONE when nothing was sent.
+ */
+static int call_fib_nh_notifiers(struct fib_nh *nh,
+				 enum fib_event_type event_type)
+{
+	bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev);
+	struct fib_nh_notifier_info info = {
+		.fib_nh = nh,
+	};
+
+	switch (event_type) {
+	case FIB_EVENT_NH_ADD:
+		if (nh->fib_nh_flags & RTNH_F_DEAD)
+			break;
+		if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN)
+			break;
+		return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type,
+					   &info.info);
+	case FIB_EVENT_NH_DEL:
+		if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) ||
+		    (nh->fib_nh_flags & RTNH_F_DEAD))
+			return call_fib4_notifiers(dev_net(nh->fib_nh_dev),
+						   event_type, &info.info);
+		break;
+	default:
+		/* other event types are not forwarded from here */
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Update the PMTU of exceptions when:
+ * - the new MTU of the first hop becomes smaller than the PMTU
+ * - the old MTU was the same as the PMTU, and it limited discovery of
+ *   larger MTUs on the path. With that limit raised, we can now
+ *   discover larger MTUs
+ * A special case is locked exceptions, for which the PMTU is smaller
+ * than the minimal accepted PMTU:
+ * - if the new MTU is greater than the PMTU, don't make any change
+ * - otherwise, unlock and set PMTU
+ *
+ * @nhc is the nexthop whose exception table is walked, @new the new MTU
+ * and @orig the previous MTU. Does nothing when @nhc has no exception
+ * table.
+ */
+void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
+{
+	struct fnhe_hash_bucket *bucket;
+	int i;
+
+	/* NOTE(review): the lockdep condition is hard-coded to 1, so the
+	 * caller is presumably responsible for serializing against updates
+	 * of the exception table - confirm
+	 */
+	bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1);
+	if (!bucket)
+		return;
+
+	for (i = 0; i < FNHE_HASH_SIZE; i++) {
+		struct fib_nh_exception *fnhe;
+
+		for (fnhe = rcu_dereference_protected(bucket[i].chain, 1);
+		     fnhe;
+		     fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) {
+			if (fnhe->fnhe_mtu_locked) {
+				/* locked: only lower (or keep) and unlock */
+				if (new <= fnhe->fnhe_pmtu) {
+					fnhe->fnhe_pmtu = new;
+					fnhe->fnhe_mtu_locked = false;
+				}
+			} else if (new < fnhe->fnhe_pmtu ||
+				   orig == fnhe->fnhe_pmtu) {
+				fnhe->fnhe_pmtu = new;
+			}
+		}
+	}
+}
+
+/* Update PMTU exceptions of all nexthops on @dev after its MTU changed. */
+void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+	struct fib_nh *nh;
+
+	hlist_for_each_entry(nh, fib_info_devhash_bucket(dev), nh_hash) {
+		if (nh->fib_nh_dev == dev)	/* bucket may hold other devs */
+			fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
+	}
+}
+
+/* Event              force Flags           Description
+ * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
+ * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
+ * NETDEV_DOWN        1     LINKDOWN|DEAD   Last address removed
+ * NETDEV_UNREGISTER  1     LINKDOWN|DEAD   Device removed
+ * Returns the number of fib_infos flagged because every one of their
+ * nexthops counted as dead. Only used when fib_nh is built into fib_info.
+ */
+int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
+{
+	struct hlist_head *head = fib_info_devhash_bucket(dev);
+	struct fib_info *prev_fi = NULL;
+	int scope = RT_SCOPE_NOWHERE;	/* nexthops of this scope are skipped */
+	struct fib_nh *nh;
+	int ret = 0;			/* fib_infos flagged below */
+
+	if (force)
+		scope = -1;	/* presumably equals no nexthop scope - TODO confirm */
+
+	hlist_for_each_entry(nh, head, nh_hash) {
+		struct fib_info *fi = nh->nh_parent;
+		int dead;		/* nexthops of fi counted as dead */
+
+		BUG_ON(!fi->fib_nhs);
+		if (nh->fib_nh_dev != dev || fi == prev_fi)
+			continue;	/* other device, or fi was just handled */
+		prev_fi = fi;
+		dead = 0;
+		change_nexthops(fi) {
+			if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD)
+				dead++;
+			else if (nexthop_nh->fib_nh_dev == dev &&
+				 nexthop_nh->fib_nh_scope != scope) {
+				switch (event) {
+				case NETDEV_DOWN:
+				case NETDEV_UNREGISTER:
+					nexthop_nh->fib_nh_flags |= RTNH_F_DEAD;
+					fallthrough;
+				case NETDEV_CHANGE:
+					nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+					break;
+				}
+				call_fib_nh_notifiers(nexthop_nh,
+						      FIB_EVENT_NH_DEL);
+				dead++;
+			}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+			if (event == NETDEV_UNREGISTER &&
+			    nexthop_nh->fib_nh_dev == dev) {
+				dead = fi->fib_nhs;	/* count all nexthops as dead */
+				break;
+			}
+#endif
+		} endfor_nexthops(fi)
+		if (dead == fi->fib_nhs) {
+			switch (event) {
+			case NETDEV_DOWN:
+			case NETDEV_UNREGISTER:
+				fi->fib_flags |= RTNH_F_DEAD;
+				fallthrough;
+			case NETDEV_CHANGE:
+				fi->fib_flags |= RTNH_F_LINKDOWN;
+				break;
+			}
+			ret++;
+		}
+
+		fib_rebalance(fi);
+	}
+
+	return ret;
+}
+
+/* May switch @res to a sibling route. Must run in an RCU protected region. */
+static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
+{
+	struct fib_info *fi = NULL, *last_resort = NULL;
+	struct hlist_head *fa_head = res->fa_head;
+	struct fib_table *tb = res->table;
+	u8 slen = 32 - res->prefixlen;	/* suffix length of the matched prefix */
+	int order = -1, last_idx = -1;
+	struct fib_alias *fa, *fa1 = NULL;	/* fa1: alias owning res->fi */
+	u32 last_prio = res->fi->fib_priority;
+	dscp_t last_dscp = 0;
+
+	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
+		struct fib_info *next_fi = fa->fa_info;
+		struct fib_nh_common *nhc;
+
+		if (fa->fa_slen != slen)
+			continue;
+		if (fa->fa_dscp &&
+		    fa->fa_dscp != inet_dsfield_to_dscp(flp->flowi4_tos))
+			continue;
+		if (fa->tb_id != tb->tb_id)
+			continue;
+		if (next_fi->fib_priority > last_prio &&
+		    fa->fa_dscp == last_dscp) {
+			if (last_dscp)
+				continue;
+			break;
+		}
+		if (next_fi->fib_flags & RTNH_F_DEAD)
+			continue;
+		last_dscp = fa->fa_dscp;
+		last_prio = next_fi->fib_priority;
+
+		if (next_fi->fib_scope != res->scope ||
+		    fa->fa_type != RTN_UNICAST)
+			continue;
+
+		nhc = fib_info_nhc(next_fi, 0);	/* only the first nexthop is checked */
+		if (!nhc->nhc_gw_family || nhc->nhc_scope != RT_SCOPE_LINK)
+			continue;
+
+		fib_alias_accessed(fa);
+
+		if (!fi) {
+			if (next_fi != res->fi)
+				break;	/* first candidate must be the looked-up route */
+			fa1 = fa;
+		} else if (!fib_detect_death(fi, order, &last_resort,
+					     &last_idx, fa1->fa_default)) {
+			fib_result_assign(res, fi);
+			fa1->fa_default = order;
+			goto out;
+		}
+		fi = next_fi;
+		order++;
+	}
+
+	if (order <= 0 || !fi) {
+		if (fa1)
+			fa1->fa_default = -1;
+		goto out;
+	}
+
+	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+			      fa1->fa_default)) {
+		fib_result_assign(res, fi);
+		fa1->fa_default = order;
+		goto out;
+	}
+
+	if (last_idx >= 0)	/* fall back to what fib_detect_death() recorded */
+		fib_result_assign(res, last_resort);
+	fa1->fa_default = last_idx;
+out:
+	return;
+}
+
+/*
+ * Dead device goes up: clear @nh_flags on its nexthops and wake them up.
+ * This only makes sense for multipath routes. Returns the number of
+ * visited fib_infos that have at least one live nexthop afterwards.
+ * Only used when fib_nh is built into fib_info.
+ */
+int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
+{
+	struct fib_info *prev_fi;
+	struct hlist_head *head;
+	struct fib_nh *nh;
+	int ret;
+
+	if (!(dev->flags & IFF_UP))
+		return 0;
+
+	if (nh_flags & RTNH_F_DEAD) {
+		unsigned int flags = dev_get_flags(dev);
+
+		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+			nh_flags |= RTNH_F_LINKDOWN;	/* clear LINKDOWN as well */
+	}
+
+	prev_fi = NULL;
+	head = fib_info_devhash_bucket(dev);
+	ret = 0;
+
+	hlist_for_each_entry(nh, head, nh_hash) {
+		struct fib_info *fi = nh->nh_parent;
+		int alive;		/* usable nexthops of fi */
+
+		BUG_ON(!fi->fib_nhs);
+		if (nh->fib_nh_dev != dev || fi == prev_fi)
+			continue;	/* other device, or fi was just handled */
+
+		prev_fi = fi;
+		alive = 0;
+		change_nexthops(fi) {
+			if (!(nexthop_nh->fib_nh_flags & nh_flags)) {
+				alive++;	/* none of the flags set: already up */
+				continue;
+			}
+			if (!nexthop_nh->fib_nh_dev ||
+			    !(nexthop_nh->fib_nh_dev->flags & IFF_UP))
+				continue;
+			if (nexthop_nh->fib_nh_dev != dev ||
+			    !__in_dev_get_rtnl(dev))
+				continue;
+			alive++;
+			nexthop_nh->fib_nh_flags &= ~nh_flags;
+			call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
+		} endfor_nexthops(fi)
+
+		if (alive > 0) {
+			fi->fib_flags &= ~nh_flags;
+			ret++;
+		}
+
+		fib_rebalance(fi);
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+/* Neighbour-state check for @nh; a missing entry counts as NUD_REACHABLE. */
+static bool fib_good_nh(const struct fib_nh *nh)
+{
+	struct neighbour *neigh = NULL;
+	int nud = NUD_REACHABLE;	/* assumed when no entry is found */
+
+	/* Only link-scope nexthops are checked against the neighbour table. */
+	if (nh->fib_nh_scope != RT_SCOPE_LINK)
+		return !!(nud & NUD_VALID);
+
+	rcu_read_lock();
+
+	if (likely(nh->fib_nh_gw_family == AF_INET))
+		neigh = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
+						  (__force u32)nh->fib_nh_gw4);
+	else if (nh->fib_nh_gw_family == AF_INET6)
+		neigh = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev,
+						       &nh->fib_nh_gw6);
+	if (neigh)
+		nud = READ_ONCE(neigh->nud_state);
+
+	rcu_read_unlock();
+
+	return !!(nud & NUD_VALID);
+}
+
+/* Select the first nexthop of res->fi whose upper bound covers @hash. */
+void fib_select_multipath(struct fib_result *res, int hash)
+{
+	struct fib_info *fi = res->fi;
+	struct net *net = fi->fib_net;
+	bool have_fallback = false;
+
+	if (unlikely(fi->nh)) {
+		nexthop_path_fib_result(res, hash);
+		return;
+	}
+
+	change_nexthops(fi) {
+		if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) {
+			if (!fib_good_nh(nexthop_nh))
+				continue;
+			/* remember the first good nexthop as a fallback */
+			if (!have_fallback) {
+				res->nh_sel = nhsel;
+				res->nhc = &nexthop_nh->nh_common;
+				have_fallback = true;
+			}
+		}
+		if (hash <= atomic_read(&nexthop_nh->fib_nh_upper_bound)) {
+			res->nh_sel = nhsel;
+			res->nhc = &nexthop_nh->nh_common;
+			return;
+		}
+	} endfor_nexthops(fi);
+}
+#endif
+
+/* Finish @res: choose a nexthop or default route, then fill in fl4->saddr. */
+void fib_select_path(struct net *net, struct fib_result *res,
+		     struct flowi4 *fl4, const struct sk_buff *skb)
+{
+	/* With an explicit output interface only the source address is set. */
+	if (!fl4->flowi4_oif) {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		if (fib_info_num_path(res->fi) > 1) {
+			int h = fib_multipath_hash(net, fl4, skb, NULL);
+
+			fib_select_multipath(res, h);
+		} else
+#endif
+		if (!res->prefixlen &&
+		    res->table->tb_num_default > 1 &&
+		    res->type == RTN_UNICAST)
+			fib_select_default(fl4, res);
+	}
+
+	/* no source address given: take the one the result provides */
+	if (!fl4->saddr)
+		fl4->saddr = fib_result_prefsrc(net, res);
+}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
new file mode 100644
index 0000000000..9bdfdab906
--- /dev/null
+++ b/net/ipv4/fib_trie.c
@@ -0,0 +1,3067 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *
+ *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
+ *     & Swedish University of Agricultural Sciences.
+ *
+ *   Jens Laas <jens.laas@data.slu.se> Swedish University of
+ *     Agricultural Sciences.
+ *
+ *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
+ *
+ * This work is based on the LPC-trie which is originally described in:
+ *
+ * An experimental study of compression methods for dynamic tries
+ * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ * https://www.csc.kth.se/~snilsson/software/dyntrie2/
+ *
+ * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
+ *
+ * Code from fib_hash has been reused which includes the following header:
+ *
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 FIB: lookup engine and maintenance routines.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Substantial contributions to this work comes from:
+ *
+ *		David S. Miller, <davem@davemloft.net>
+ *		Stephen Hemminger <shemminger@osdl.org>
+ *		Paul E. McKenney <paulmck@us.ibm.com>
+ *		Patrick McHardy <kaber@trash.net>
+ */
+#include <linux/cache.h>
+#include <linux/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/notifier.h>
+#include <net/net_namespace.h>
+#include <net/inet_dscp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/fib_notifier.h>
+#include <trace/events/fib.h>
+#include "fib_lookup.h"
+
+/* Describe route @dst/@dst_len with alias @fa and hand it to listener @nb. */
+static int call_fib_entry_notifier(struct notifier_block *nb,
+				   enum fib_event_type event_type, u32 dst,
+				   int dst_len, struct fib_alias *fa,
+				   struct netlink_ext_ack *extack)
+{
+	struct fib_entry_notifier_info entry = {
+		.info.extack = extack,
+		.dst = dst, .dst_len = dst_len,
+		.tb_id = fa->tb_id,
+		.type = fa->fa_type,
+		.dscp = fa->fa_dscp,
+		.fi = fa->fa_info,
+	};
+	return call_fib4_notifier(nb, event_type, &entry.info);
+}
+
+/* Describe route @dst/@dst_len with alias @fa and notify the chain of @net. */
+static int call_fib_entry_notifiers(struct net *net,
+				    enum fib_event_type event_type, u32 dst,
+				    int dst_len, struct fib_alias *fa,
+				    struct netlink_ext_ack *extack)
+{
+	struct fib_entry_notifier_info entry = {
+		.info.extack = extack,
+		.dst = dst, .dst_len = dst_len,
+		.tb_id = fa->tb_id,
+		.type = fa->fa_type,
+		.dscp = fa->fa_dscp,
+		.fi = fa->fa_info,
+	};
+	return call_fib4_notifiers(net, event_type, &entry.info);
+}
+
+#define MAX_STAT_DEPTH 32	/* number of slots in trie_stat.nodesizes[] */
+
+#define KEYLENGTH	(8*sizeof(t_key))	/* key width in bits */
+#define KEY_MAX		((t_key)~0)		/* key with every bit set */
+
+typedef unsigned int t_key;
+
+#define IS_TRIE(n)	((n)->pos >= KEYLENGTH)	/* pos is beyond all key bits */
+#define IS_TNODE(n)	((n)->bits)		/* internal node: bits != 0 */
+#define IS_LEAF(n)	(!(n)->bits)		/* leaf: bits == 0 */
+
+struct key_vector {
+	t_key key;
+	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
+	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
+	unsigned char slen;		/* suffix length; put_child() raises it */
+	union {
+		/* This list pointer is valid if (pos | bits) == 0 (LEAF) */
+		struct hlist_head leaf;
+		/* This array is valid if (pos | bits) > 0 (TNODE) */
+		DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode);
+	};
+};
+
+struct tnode {
+	struct rcu_head rcu;		/* call_rcu() head, also free-list link */
+	t_key empty_children;		/* KEYLENGTH bits needed */
+	t_key full_children;		/* KEYLENGTH bits needed */
+	struct key_vector __rcu *parent;
+	struct key_vector kv[1];	/* the node itself; tn_info() maps back */
+#define tn_bits kv[0].bits
+};
+
+#define TNODE_SIZE(n)	offsetof(struct tnode, kv[0].tnode[n])
+#define LEAF_SIZE	TNODE_SIZE(1)	/* presumably sized for the leaf arm */
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+struct trie_use_stats {			/* per-cpu instance, see struct trie */
+	unsigned int gets;
+	unsigned int backtrack;
+	unsigned int semantic_match_passed;
+	unsigned int semantic_match_miss;
+	unsigned int null_node_hit;
+	unsigned int resize_node_skipped; /* inflate()/halve() failed in resize() */
+};
+#endif
+
+struct trie_stat {	/* NOTE(review): filled outside this chunk - confirm */
+	unsigned int totdepth;		/* presumably summed depth - TODO confirm */
+	unsigned int maxdepth;		/* presumably deepest level - TODO confirm */
+	unsigned int tnodes;
+	unsigned int leaves;
+	unsigned int nullpointers;
+	unsigned int prefixes;
+	unsigned int nodesizes[MAX_STAT_DEPTH];
+};
+
+struct trie {
+	struct key_vector kv[1];	/* presumably the root vector, cf. IS_TRIE() */
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	struct trie_use_stats __percpu *stats;	/* per-cpu usage counters */
+#endif
+};
+
+static struct key_vector *resize(struct trie *t, struct key_vector *tn);
+static unsigned int tnode_free_size;	/* bytes queued since the last sync */
+
+/*
+ * synchronize_rcu() is forced once this many bytes of freed nodes are pending
+ * in call_rcu(); especially useful before resizing the root node with
+ * PREEMPT_NONE configs; value found experimentally to avoid visible slowdown.
+ */
+unsigned int sysctl_fib_sync_mem = 512 * 1024;
+unsigned int sysctl_fib_sync_mem_min = 64 * 1024;
+unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024;
+
+static struct kmem_cache *fn_alias_kmem __ro_after_init;   /* fib_alias */
+static struct kmem_cache *trie_leaf_kmem __ro_after_init;  /* leaf nodes */
+
+static inline struct tnode *tn_info(struct key_vector *kv)
+{
+	return container_of(kv, struct tnode, kv[0]);	/* kv is tnode->kv[0] */
+}
+
+/* parent / i-th child of tn; caller must hold RTNL */
+#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
+#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
+
+/* same accessors; caller must hold RCU read lock or RTNL */
+#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
+#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
+
+/* rcu_assign_pointer() wrapper: make @tp the parent of @n; NULL @n is a no-op */
+static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
+{
+	if (n)
+		rcu_assign_pointer(tn_info(n)->parent, tp);
+}
+
+#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
+
+/* Child slot count of @tn: 2^bits for a tnode, 0 for a leaf (bits == 0). */
+static inline unsigned long child_length(const struct key_vector *tn)
+{
+	unsigned long slots = 1ul << tn->bits;
+
+	return slots & ~1ul;	/* 1 << 0 is masked off, so leaves report 0 */
+}
+
+#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
+
+/* XOR @key with the key of @kv and drop the low kv->pos bits. */
+static inline unsigned long get_index(t_key key, struct key_vector *kv)
+{
+	/* a shift by the full width of long is undefined; the result is 0 */
+	if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
+		return 0;
+
+	return (unsigned long)(key ^ kv->key) >> kv->pos;
+}
+
+/* To understand this stuff, an understanding of keys and all their bits is
+ * necessary. Every node in the trie has a key associated with it, but not
+ * all of the bits in that key are significant.
+ *
+ * Consider a node 'n' and its parent 'tp'.
+ *
+ * If n is a leaf, every bit in its key is significant. Its presence is
+ * necessitated by path compression, since during a tree traversal (when
+ * searching for a leaf - unless we are doing an insertion) we will completely
+ * ignore all skipped bits we encounter. Thus we need to verify, at the end of
+ * a potentially successful search, that we have indeed been walking the
+ * correct key path.
+ *
+ * Note that we can never "miss" the correct key in the tree if present by
+ * following the wrong path. Path compression ensures that segments of the key
+ * that are the same for all keys with a given prefix are skipped, but the
+ * skipped part *is* identical for each node in the subtrie below the skipped
+ * bit! trie_insert() in this implementation takes care of that.
+ *
+ * If n is an internal node - a 'tnode' here - the various parts of its key
+ * have many different meanings.
+ *
+ * Example:
+ * _________________________________________________________________
+ * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
+ * -----------------------------------------------------------------
+ *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
+ *
+ * _________________________________________________________________
+ * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
+ * -----------------------------------------------------------------
+ *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
+ *
+ * tp->pos = 22
+ * tp->bits = 3
+ * n->pos = 13
+ * n->bits = 4
+ *
+ * First, let's just ignore the bits that come before the parent tp, that is
+ * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
+ * point we do not use them for anything.
+ *
+ * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
+ * index into the parent's child array. That is, they will be used to find
+ * 'n' among tp's children.
+ *
+ * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
+ * for the node n.
+ *
+ * All the bits we have seen so far are significant to the node n. The rest
+ * of the bits are really not needed or indeed known in n->key.
+ *
+ * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
+ * n's child array, and will of course be different for each child.
+ *
+ * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown
+ * at this point.
+ */
+
+static const int halve_threshold = 25;		/* percent, should_halve() */
+static const int inflate_threshold = 50;	/* percent, should_inflate() */
+static const int halve_threshold_root = 15;	/* used when parent IS_TRIE() */
+static const int inflate_threshold_root = 30;	/* used when parent IS_TRIE() */
+
+/* RCU callback: give the fib_alias embedding @head back to its slab cache. */
+static void __alias_free_mem(struct rcu_head *head)
+{
+	kmem_cache_free(fn_alias_kmem, container_of(head, struct fib_alias, rcu));
+}
+
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
+{
+	call_rcu(&fa->rcu, __alias_free_mem);	/* freed by the RCU callback */
+}
+
+#define TNODE_VMALLOC_MAX /* largest 'bits' whose node size fits size_t */ \
+	ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
+
+static void __node_free_rcu(struct rcu_head *head)
+{
+	struct tnode *node = container_of(head, struct tnode, rcu);
+
+	if (node->tn_bits)
+		kvfree(node);		/* tnode: kzalloc() or vzalloc() memory */
+	else
+		kmem_cache_free(trie_leaf_kmem, node);	/* leaf: slab object */
+}
+
+#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
+
+/* Allocate a zeroed tnode with 2^bits child slots; NULL on failure. */
+static struct tnode *tnode_alloc(int bits)
+{
+	size_t bytes;
+
+	/* refuse sizes that could not be represented in a size_t */
+	if (bits > TNODE_VMALLOC_MAX)
+		return NULL;
+
+	bytes = TNODE_SIZE(1ul << bits);
+
+	/* up to one page comes from kzalloc(), anything larger from vzalloc() */
+	if (bytes > PAGE_SIZE)
+		return vzalloc(bytes);
+	return kzalloc(bytes, GFP_KERNEL);
+}
+
+static inline void empty_child_inc(struct key_vector *n)
+{
+	struct tnode *tn = tn_info(n);
+
+	if (!++tn->empty_children)
+		tn->full_children++;	/* wrap to 0 carries into full_children */
+}
+
+static inline void empty_child_dec(struct key_vector *n)
+{
+	struct tnode *tn = tn_info(n);
+
+	if (!tn->empty_children--)
+		tn->full_children--;	/* borrow taken from full_children */
+}
+
+/* Allocate a leaf for @key holding @fa as its only alias; NULL if no memory. */
+static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
+{
+	struct tnode *node = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
+	struct key_vector *leaf;
+
+	if (!node)
+		return NULL;
+
+	/* a leaf is a key vector with pos == bits == 0 */
+	leaf = node->kv;
+	leaf->slen = fa->fa_slen;
+	leaf->bits = 0;
+	leaf->pos = 0;
+	leaf->key = key;
+
+	/* start the alias list with @fa */
+	INIT_HLIST_HEAD(&leaf->leaf);
+	hlist_add_head(&fa->fa_list, &leaf->leaf);
+
+	return leaf;
+}
+
+/* Allocate an empty tnode that indexes key bits [pos, pos + bits) of @key. */
+static struct key_vector *tnode_new(t_key key, int pos, int bits)
+{
+	unsigned int top = pos + bits;
+	struct tnode *node;
+	struct key_vector *tn;
+
+	BUG_ON(!bits || (top > KEYLENGTH));
+
+	node = tnode_alloc(bits);
+	if (!node)
+		return NULL;
+
+	pr_debug("AT %p s=%zu %zu\n", node, TNODE_SIZE(0),
+		 sizeof(struct key_vector *) << bits);
+
+	/* 2^KEYLENGTH empty slots do not fit a t_key: stored as a carry */
+	if (bits != KEYLENGTH)
+		node->empty_children = 1ul << bits;
+	else
+		node->full_children = 1;
+
+	tn = node->kv;
+	tn->key = (top < KEYLENGTH) ? (key >> top) << top : 0;
+	tn->pos = pos;
+	tn->bits = bits;
+	tn->slen = pos;
+	return tn;
+}
+
+/* Check whether child 'n' of 'tn' is "full": a non-NULL internal node whose
+ * bits end right below tn->pos, so no bits are skipped (dyntree paper p. 6)
+ */
+static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
+{
+	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
+}
+
+/* Add a child at position i overwriting the old value (n may be NULL).
+ * Update the value of full_children, empty_children and tn->slen.
+ */
+static void put_child(struct key_vector *tn, unsigned long i,
+		      struct key_vector *n)
+{
+	struct key_vector *chi = get_child(tn, i);	/* current occupant */
+	int isfull, wasfull;
+
+	BUG_ON(i >= child_length(tn));
+
+	/* update emptyChildren, overflow into fullChildren */
+	if (!n && chi)
+		empty_child_inc(tn);
+	if (n && !chi)
+		empty_child_dec(tn);
+
+	/* update fullChildren */
+	wasfull = tnode_full(tn, chi);
+	isfull = tnode_full(tn, n);
+
+	if (wasfull && !isfull)
+		tn_info(tn)->full_children--;
+	else if (!wasfull && isfull)
+		tn_info(tn)->full_children++;
+
+	if (n && (tn->slen < n->slen))
+		tn->slen = n->slen;	/* slen is only ever raised here */
+
+	rcu_assign_pointer(tn->tnode[i], n);	/* publish after the bookkeeping */
+}
+
+/* Point the parent pointers of the nodes below @tn at their current parent. */
+static void update_children(struct key_vector *tn)
+{
+	unsigned long slot = child_length(tn);
+
+	while (slot) {
+		struct key_vector *child = get_child(tn, --slot);
+
+		if (!child)
+			continue;
+
+		/* A child that already names us as its parent belongs
+		 * to us: descend into it. Any other child is simply
+		 * re-parented to ourselves.
+		 */
+		if (node_parent(child) != tn)
+			node_set_parent(child, tn);
+		else
+			update_children(child);
+	}
+}
+
+static inline void put_child_root(struct key_vector *tp, t_key key,
+				  struct key_vector *n)
+{
+	if (!IS_TRIE(tp))
+		put_child(tp, get_index(key, tp), n);	/* slot chosen by key */
+	else
+		rcu_assign_pointer(tp->tnode[0], n);	/* trie: always slot 0 */
+}
+
+static inline void tnode_free_init(struct key_vector *tn)
+{
+	tn_info(tn)->rcu.next = NULL;	/* start an empty to-be-freed list at tn */
+}
+
+static inline void tnode_free_append(struct key_vector *tn,
+				     struct key_vector *n)
+{
+	tn_info(n)->rcu.next = tn_info(tn)->rcu.next;	/* n inherits tn's tail */
+	tn_info(tn)->rcu.next = &tn_info(n)->rcu;	/* n now follows tn */
+}
+
+static void tnode_free(struct key_vector *tn)
+{
+	struct callback_head *head = &tn_info(tn)->rcu;
+
+	while (head) {
+		head = head->next;	/* read first: node_free() reuses the head */
+		tnode_free_size += TNODE_SIZE(1ul << tn->bits);
+		node_free(tn);
+
+		tn = container_of(head, struct tnode, rcu)->kv;	/* unused if !head */
+	}
+
+	if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) {
+		tnode_free_size = 0;
+		synchronize_rcu();	/* bound the memory pending in call_rcu() */
+	}
+}
+
+static struct key_vector *replace(struct trie *t,
+				  struct key_vector *oldtnode,
+				  struct key_vector *tn)
+{
+	struct key_vector *tp = node_parent(oldtnode);
+	unsigned long i;
+
+	/* setup the parent pointer out of and back into this node */
+	NODE_INIT_PARENT(tn, tp);
+	put_child_root(tp, tn->key, tn);
+
+	/* update all of the child parent pointers */
+	update_children(tn);
+
+	/* all pointers should be clean so we are done */
+	tnode_free(oldtnode);
+
+	/* resize children now that oldtnode is freed */
+	for (i = child_length(tn); i;) {
+		struct key_vector *inode = get_child(tn, --i);
+
+		/* resize full children; tn follows what resize() returns */
+		if (tnode_full(tn, inode))
+			tn = resize(t, inode);
+	}
+
+	return tp;	/* parent of the replaced node */
+}
+
+static struct key_vector *inflate(struct trie *t,
+				  struct key_vector *oldtnode)
+{
+	struct key_vector *tn;
+	unsigned long i;
+	t_key m;
+
+	pr_debug("In inflate\n");
+
+	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
+	if (!tn)
+		goto notnode;
+
+	/* prepare oldtnode to be freed */
+	tnode_free_init(oldtnode);
+
+	/* Assemble all of the pointers in our cluster, in this case that
+	 * represents all of the pointers out of our allocated nodes that
+	 * point to existing tnodes and the links between our allocated
+	 * nodes.
+	 */
+	for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
+		struct key_vector *inode = get_child(oldtnode, --i);
+		struct key_vector *node0, *node1;
+		unsigned long j, k;
+
+		/* An empty child */
+		if (!inode)
+			continue;
+
+		/* A leaf or an internal node with skipped bits */
+		if (!tnode_full(oldtnode, inode)) {
+			put_child(tn, get_index(inode->key, tn), inode);
+			continue;
+		}
+
+		/* drop the node in the old tnode free list */
+		tnode_free_append(oldtnode, inode);
+
+		/* An internal node with two children */
+		if (inode->bits == 1) {
+			put_child(tn, 2 * i + 1, get_child(inode, 1));
+			put_child(tn, 2 * i, get_child(inode, 0));
+			continue;
+		}
+
+		/* We will replace this node 'inode' with two new
+		 * ones, 'node0' and 'node1', each with half of the
+		 * original children. The two new nodes will have
+		 * a position one bit further down the key and this
+		 * means that the "significant" part of their keys
+		 * (see the discussion near the top of this file)
+		 * will differ by one bit, which will be "0" in
+		 * node0's key and "1" in node1's key. Since we are
+		 * moving the key position by one step, the bit that
+		 * we are moving away from - the bit at position
+		 * (tn->pos) - is the one that will differ between
+		 * node0 and node1. So... we synthesize that bit in the
+		 * two new keys.
+		 */
+		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
+		if (!node1)
+			goto nomem;
+		node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
+
+		tnode_free_append(tn, node1);	/* queued so nomem frees it too */
+		if (!node0)
+			goto nomem;
+		tnode_free_append(tn, node0);
+
+		/* populate child pointers in new nodes, two pairs per pass */
+		for (k = child_length(inode), j = k / 2; j;) {
+			put_child(node1, --j, get_child(inode, --k));
+			put_child(node0, j, get_child(inode, j));
+			put_child(node1, --j, get_child(inode, --k));
+			put_child(node0, j, get_child(inode, j));
+		}
+
+		/* link new nodes to parent */
+		NODE_INIT_PARENT(node1, tn);
+		NODE_INIT_PARENT(node0, tn);
+
+		/* link parent to nodes */
+		put_child(tn, 2 * i + 1, node1);
+		put_child(tn, 2 * i, node0);
+	}
+
+	/* setup the parent pointers into and out of this node */
+	return replace(t, oldtnode, tn);
+nomem:
+	/* free tn and every new node queued on its list; oldtnode stays */
+	tnode_free(tn);
+notnode:
+	return NULL;
+}
+
+static struct key_vector *halve(struct trie *t,
+				struct key_vector *oldtnode)
+{
+	struct key_vector *tn;
+	unsigned long i;
+
+	pr_debug("In halve\n");
+
+	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
+	if (!tn)
+		goto notnode;
+
+	/* prepare oldtnode to be freed */
+	tnode_free_init(oldtnode);
+
+	/* Assemble all of the pointers in our cluster, in this case that
+	 * represents all of the pointers out of our allocated nodes that
+	 * point to existing tnodes and the links between our allocated
+	 * nodes.
+	 */
+	for (i = child_length(oldtnode); i;) {
+		struct key_vector *node1 = get_child(oldtnode, --i);
+		struct key_vector *node0 = get_child(oldtnode, --i);
+		struct key_vector *inode;
+
+		/* At least one of the children is empty: move the other up */
+		if (!node1 || !node0) {
+			put_child(tn, i / 2, node1 ? : node0);
+			continue;
+		}
+
+		/* Two nonempty children: join them under a new 1-bit tnode */
+		inode = tnode_new(node0->key, oldtnode->pos, 1);
+		if (!inode)
+			goto nomem;
+		tnode_free_append(tn, inode);	/* queued so nomem frees it too */
+
+		/* initialize pointers out of node */
+		put_child(inode, 1, node1);
+		put_child(inode, 0, node0);
+		NODE_INIT_PARENT(inode, tn);
+
+		/* link parent to node */
+		put_child(tn, i / 2, inode);
+	}
+
+	/* setup the parent pointers into and out of this node */
+	return replace(t, oldtnode, tn);
+nomem:
+	/* free tn and every new node queued on its list; oldtnode stays */
+	tnode_free(tn);
+notnode:
+	return NULL;
+}
+
+/* Unlink @oldtnode, hanging its last remaining child (if any) on its parent. */
+static struct key_vector *collapse(struct trie *t,
+				   struct key_vector *oldtnode)
+{
+	struct key_vector *parent = node_parent(oldtnode);
+	unsigned long slot = child_length(oldtnode);
+	struct key_vector *survivor = NULL;
+
+	/* search from the top slot down for a child that still exists */
+	while (slot && !survivor)
+		survivor = get_child(oldtnode, --slot);
+
+	/* splice the child (possibly NULL) into the parent's slot */
+	put_child_root(parent, oldtnode->key, survivor);
+	node_set_parent(survivor, parent);
+
+	/* the old node is handed to call_rcu() for freeing */
+	node_free(oldtnode);
+	return parent;
+}
+
+static unsigned char update_suffix(struct key_vector *tn)
+{
+	unsigned char slen = tn->pos;	/* smallest value tn->slen can take */
+	unsigned long stride, i;
+	unsigned char slen_max;
+
+	/* only vector 0 can have a suffix length greater than or equal to
+	 * tn->pos + tn->bits, the second highest node will have a suffix
+	 * length at most of tn->pos + tn->bits - 1
+	 */
+	slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);
+
+	/* search through the list of children looking for nodes that might
+	 * have a suffix greater than the one we currently have.  This is
+	 * why we start with a stride of 2 since a stride of 1 would
+	 * represent the nodes with suffix length equal to tn->pos
+	 */
+	for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
+		struct key_vector *n = get_child(tn, i);
+
+		if (!n || (n->slen <= slen))
+			continue;
+
+		/* update stride and slen based on new value */
+		stride <<= (n->slen - slen);
+		slen = n->slen;
+		i &= ~(stride - 1);	/* realign i to the new stride */
+
+		/* stop searching if we have hit the maximum possible value */
+		if (slen >= slen_max)
+			break;
+	}
+
+	tn->slen = slen;
+
+	return slen;	/* the suffix length just stored in tn */
+}
+
+/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
+ * the Helsinki University of Technology and Matti Tikkanen of Nokia
+ * Telecommunications, page 6:
+ * "A node is doubled if the ratio of non-empty children to all
+ * children in the *doubled* node is at least 'high'."
+ *
+ * 'high' in this instance is the variable 'inflate_threshold'. It
+ * is expressed as a percentage, so we multiply it with
+ * child_length() and instead of multiplying by 2 (since the
+ * child array will be doubled by inflate()) and multiplying
+ * the left-hand side by 100 (to handle the percentage thing) we
+ * multiply the left-hand side by 50.
+ *
+ * The left-hand side may look a bit weird: child_length(tn)
+ * - tn->empty_children is of course the number of non-null children
+ * in the current node. tn->full_children is the number of "full"
+ * children, that is non-null tnodes with a skip value of 0.
+ * All of those will be doubled in the resulting inflated tnode, so
+ * we just count them one extra time here.
+ *
+ * A clearer way to write this would be:
+ *
+ * to_be_doubled = tn->full_children;
+ * not_to_be_doubled = child_length(tn) - tn->empty_children -
+ *     tn->full_children;
+ *
+ * new_child_length = child_length(tn) * 2;
+ *
+ * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
+ *      new_child_length;
+ * if (new_fill_factor >= inflate_threshold)
+ *
+ * ...and so on, tho it would mess up the while () loop.
+ *
+ * anyway,
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+ *      inflate_threshold
+ *
+ * avoid a division:
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+ *      inflate_threshold * new_child_length
+ *
+ * expand not_to_be_doubled and to_be_doubled, and shorten:
+ * 100 * (child_length(tn) - tn->empty_children +
+ *    tn->full_children) >= inflate_threshold * new_child_length
+ *
+ * expand new_child_length:
+ * 100 * (child_length(tn) - tn->empty_children +
+ *    tn->full_children) >=
+ *      inflate_threshold * child_length(tn) * 2
+ *
+ * shorten again:
+ * 50 * (tn->full_children + child_length(tn) -
+ *    tn->empty_children) >= inflate_threshold *
+ *    child_length(tn)
+ *
+ */
+static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
+{
+	int percent = IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
+	unsigned long slots = child_length(tn);
+	unsigned long filled;
+
+	/* non-empty children, with the full ones counted twice */
+	filled = slots - tn_info(tn)->empty_children + tn_info(tn)->full_children;
+
+	/* if bits == KEYLENGTH then pos = 0, which is rejected here */
+	if (filled <= 1 || !tn->pos)
+		return false;
+	return (50 * filled) >= slots * percent;
+}
+
+static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
+{
+	int percent = IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
+	unsigned long slots = child_length(tn);
+	unsigned long filled = slots - tn_info(tn)->empty_children;
+
+	/* if bits == KEYLENGTH then filled = 100% on wrap, and the
+	 * comparison below fails
+	 */
+	if (filled <= 1 || tn->bits <= 1)
+		return false;
+	return (100 * filled) < slots * percent;
+}
+
+/* Report whether tn is down to at most one child and can be dropped. */
+static inline bool should_collapse(struct key_vector *tn)
+{
+	unsigned long remaining;
+
+	remaining = child_length(tn) - tn_info(tn)->empty_children;
+
+	/* account for bits == KEYLENGTH case */
+	if (tn->bits == KEYLENGTH && tn_info(tn)->full_children)
+		remaining -= KEY_MAX;
+
+	/* One child or none, time to drop us from the trie */
+	return remaining <= 1;
+}
+
+/* MAX_WORK bounds how many inflate or halve steps one resize() call may
+ * perform on a node.
+ *
+ * resize() rebalances a single tnode: it inflates tn for as long as
+ * should_inflate() holds; if no inflate happened it halves tn for as long
+ * as should_halve() holds, and finally collapses it when should_collapse()
+ * reports at most one remaining child.  The return value is the node the
+ * caller should continue with: the parent of the tnode that now sits in
+ * tn's slot, or whatever collapse() returns.
+ */
+#define MAX_WORK 10
+static struct key_vector *resize(struct trie *t, struct key_vector *tn)
+{
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	struct trie_use_stats __percpu *stats = t->stats;
+#endif
+	struct key_vector *tp = node_parent(tn);
+	unsigned long cindex = get_index(tn->key, tp);
+	int max_work = MAX_WORK;
+
+	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+		 tn, inflate_threshold, halve_threshold);
+
+	/* track the tnode via the pointer from the parent instead of
+	 * doing it ourselves.  This way we can let RCU fully do its
+	 * thing without us interfering
+	 */
+	BUG_ON(tn != get_child(tp, cindex));
+
+	/* Double as long as the resulting node has a number of
+	 * nonempty nodes that are above the threshold.
+	 */
+	while (should_inflate(tp, tn) && max_work) {
+		tp = inflate(t, tn);
+		/* a NULL result is counted as a skipped resize and ends the loop */
+		if (!tp) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			this_cpu_inc(stats->resize_node_skipped);
+#endif
+			break;
+		}
+
+		max_work--;
+		/* re-read our slot in the parent for the next iteration */
+		tn = get_child(tp, cindex);
+	}
+
+	/* update parent in case inflate failed */
+	tp = node_parent(tn);
+
+	/* Return if at least one inflate is run */
+	if (max_work != MAX_WORK)
+		return tp;
+
+	/* Halve as long as the number of empty children in this
+	 * node is above threshold.
+	 */
+	while (should_halve(tp, tn) && max_work) {
+		tp = halve(t, tn);
+		/* a NULL result is counted as a skipped resize and ends the loop */
+		if (!tp) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			this_cpu_inc(stats->resize_node_skipped);
+#endif
+			break;
+		}
+
+		max_work--;
+		/* re-read our slot in the parent for the next iteration */
+		tn = get_child(tp, cindex);
+	}
+
+	/* Only one child remains */
+	if (should_collapse(tn))
+		return collapse(t, tn);
+
+	/* update parent in case halve failed */
+	return node_parent(tn);
+}
+
+/* Starting at tn, refresh suffix lengths with update_suffix() while
+ * walking towards the root.  The walk stops at the first node whose
+ * stored slen is not above its own pos, is not above the most recently
+ * computed slen, or is left unchanged by update_suffix().
+ */
+static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
+{
+	unsigned char stored = tn->slen;
+
+	for (;;) {
+		if (stored <= tn->pos || stored <= slen)
+			return;
+
+		slen = update_suffix(tn);
+		if (slen == stored)
+			return;
+
+		tn = node_parent(tn);
+		stored = tn->slen;
+	}
+}
+
+/* Raise the stored suffix length to slen on tn and on each successive
+ * parent, stopping at the first node whose slen is already >= slen.
+ */
+static void node_push_suffix(struct key_vector *tn, unsigned char slen)
+{
+	for (; tn->slen < slen; tn = node_parent(tn))
+		tn->slen = slen;
+}
+
+/* Descend from the root of t looking for the node that matches key
+ * exactly.  *tp receives the last node visited before the returned one;
+ * the return value is the matching leaf, or NULL when the descent hits
+ * an empty slot or a node whose skipped bits do not match key.
+ *
+ * rcu_read_lock needs to be held by the caller on the read side.
+ */
+static struct key_vector *fib_find_node(struct trie *t,
+					struct key_vector **tp, u32 key)
+{
+	struct key_vector *parent, *node = t->kv;
+	unsigned long idx = 0;
+
+	for (;;) {
+		parent = node;
+		node = get_child_rcu(node, idx);
+		if (!node)
+			break;
+
+		idx = get_cindex(key, node);
+
+		/* One comparison covers two checks at once.  The node key
+		 * is the prefix with zeros in the positions covered by the
+		 * child index, and idx is the difference between key and
+		 * that value:
+		 *   idx >= (1ul << bits): the skipped bits mismatch, fail
+		 *   otherwise:            idx is the child index to follow
+		 *
+		 * This check is safe even if bits == KEYLENGTH due to the
+		 * fact that we can only allocate a node with 32 bits if a
+		 * long is greater than 32 bits.
+		 */
+		if (idx >= (1ul << node->bits)) {
+			node = NULL;
+			break;
+		}
+
+		/* keep going until we reach a perfectly matching leaf */
+		if (!IS_TNODE(node))
+			break;
+	}
+
+	*tp = parent;
+
+	return node;
+}
+
+/* Scan the alias list fah for the section with suffix length slen and
+ * table id tb_id.  Within that section return the first alias whose DSCP
+ * field is not greater than dscp and which either has fib_priority >=
+ * prio or a DSCP field strictly below dscp.
+ * If 'find_first' is set, return the first alias of the section,
+ * regardless of DSCP and priority.
+ * Returns NULL when fah is NULL or no such alias is found.
+ */
+static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
+					dscp_t dscp, u32 prio, u32 tb_id,
+					bool find_first)
+{
+	/* Avoid Sparse warning when using dscp_t in inequalities */
+	u8 want_ds = inet_dscp_to_dsfield(dscp);
+	struct fib_alias *fa;
+
+	if (!fah)
+		return NULL;
+
+	hlist_for_each_entry(fa, fah, fa_list) {
+		u8 have_ds;
+
+		/* skip smaller suffix lengths, give up on larger ones */
+		if (fa->fa_slen != slen) {
+			if (fa->fa_slen < slen)
+				continue;
+			break;
+		}
+
+		/* skip larger table ids, give up on smaller ones */
+		if (fa->tb_id != tb_id) {
+			if (fa->tb_id > tb_id)
+				continue;
+			break;
+		}
+
+		if (find_first)
+			return fa;
+
+		have_ds = inet_dscp_to_dsfield(fa->fa_dscp);
+		if (have_ds > want_ds)
+			continue;
+		if (fa->fa_info->fib_priority >= prio || have_ds < want_ds)
+			return fa;
+	}
+
+	return NULL;
+}
+
+/* Look up the alias described by fri: same table, destination, prefix
+ * length, DSCP, fib_info and route type.  Returns NULL when the table,
+ * the leaf or the alias cannot be found.
+ */
+static struct fib_alias *
+fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
+{
+	const u8 want_slen = KEYLENGTH - fri->dst_len;
+	struct key_vector *leaf, *parent;
+	struct fib_table *tb;
+	struct fib_alias *fa;
+
+	tb = fib_get_table(net, fri->tb_id);
+	if (!tb)
+		return NULL;
+
+	leaf = fib_find_node((struct trie *)tb->tb_data, &parent,
+			     be32_to_cpu(fri->dst));
+	if (!leaf)
+		return NULL;
+
+	hlist_for_each_entry_rcu(fa, &leaf->leaf, fa_list) {
+		if (fa->fa_slen != want_slen || fa->tb_id != fri->tb_id)
+			continue;
+		if (fa->fa_dscp != fri->dscp || fa->fa_info != fri->fi)
+			continue;
+		if (fa->fa_type == fri->type)
+			return fa;
+	}
+
+	return NULL;
+}
+
+/* Copy the offload, trap and offload_failed flags from fri into the
+ * matching fib alias and, depending on the
+ * sysctl_fib_notify_on_flag_change setting, send an RTM_NEWROUTE
+ * notification to RTNLGRP_IPV4_ROUTE.  Nothing happens when no alias
+ * matches fri or when all three flags already have the requested values.
+ * The whole operation runs inside an RCU read-side critical section.
+ */
+void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
+{
+	u8 fib_notify_on_flag_change;
+	struct fib_alias *fa_match;
+	struct sk_buff *skb;
+	int err;
+
+	rcu_read_lock();
+
+	fa_match = fib_find_matching_alias(net, fri);
+	if (!fa_match)
+		goto out;
+
+	/* These are paired with the WRITE_ONCE() happening in this function.
+	 * The reason is that we are only protected by RCU at this point.
+	 */
+	if (READ_ONCE(fa_match->offload) == fri->offload &&
+	    READ_ONCE(fa_match->trap) == fri->trap &&
+	    READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
+		goto out;
+
+	WRITE_ONCE(fa_match->offload, fri->offload);
+	WRITE_ONCE(fa_match->trap, fri->trap);
+
+	fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change);
+
+	/* 2 means send notifications only if offload_failed was changed. */
+	if (fib_notify_on_flag_change == 2 &&
+	    READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
+		goto out;
+
+	WRITE_ONCE(fa_match->offload_failed, fri->offload_failed);
+
+	/* 0 means the flags are updated without any notification */
+	if (!fib_notify_on_flag_change)
+		goto out;
+
+	skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC);
+	if (!skb) {
+		err = -ENOBUFS;
+		goto errout;
+	}
+
+	err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC);
+	goto out;
+
+errout:
+	/* report the failure on the multicast group instead of to a caller */
+	rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err);
+out:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set);
+
+/* Resize tn and then every node resize() hands back, until the trie root
+ * vector is reached.
+ */
+static void trie_rebalance(struct trie *t, struct key_vector *tn)
+{
+	for (; !IS_TRIE(tn); tn = resize(t, tn))
+		;
+}
+
+/* Create a leaf for key holding alias new and link it below tp.  When the
+ * slot in tp is already occupied, a new one-bit tnode is inserted first
+ * so that the existing child and the new leaf become siblings.  The trie
+ * is rebalanced starting at the leaf's parent.
+ *
+ * Returns 0 on success or -ENOMEM when the leaf or the tnode cannot be
+ * allocated.
+ */
+static int fib_insert_node(struct trie *t, struct key_vector *tp,
+			   struct fib_alias *new, t_key key)
+{
+	struct key_vector *n, *l;
+
+	l = leaf_new(key, new);
+	if (!l)
+		goto noleaf;
+
+	/* retrieve child from parent node */
+	n = get_child(tp, get_index(key, tp));
+
+	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
+	 *
+	 *  Add a new tnode here
+	 *  first tnode need some special handling
+	 *  leaves us in position for handling as case 3
+	 */
+	if (n) {
+		struct key_vector *tn;
+
+		/* position the tnode at the highest bit where the keys differ */
+		tn = tnode_new(key, __fls(key ^ n->key), 1);
+		if (!tn)
+			goto notnode;
+
+		/* initialize routes out of node */
+		NODE_INIT_PARENT(tn, tp);
+		/* the existing child takes the slot the new key does not use */
+		put_child(tn, get_index(key, tn) ^ 1, n);
+
+		/* start adding routes into the node */
+		put_child_root(tp, key, tn);
+		node_set_parent(n, tn);
+
+		/* parent now has a NULL spot where the leaf can go */
+		tp = tn;
+	}
+
+	/* Case 3: n is NULL, and will just insert a new leaf */
+	node_push_suffix(tp, new->fa_slen);
+	NODE_INIT_PARENT(l, tp);
+	put_child_root(tp, key, l);
+	trie_rebalance(t, tp);
+
+	return 0;
+notnode:
+	/* the tnode allocation failed, so drop the leaf created above */
+	node_free(l);
+noleaf:
+	return -ENOMEM;
+}
+
+/* Link alias new into leaf l.  Without a leaf the work is handed to
+ * fib_insert_node().  With fa set, new goes directly in front of fa;
+ * otherwise the list is scanned for the position that keeps it ordered by
+ * suffix length and table id.  Returns 0, or the fib_insert_node() result
+ * when no leaf was given.
+ */
+static int fib_insert_alias(struct trie *t, struct key_vector *tp,
+			    struct key_vector *l, struct fib_alias *new,
+			    struct fib_alias *fa, t_key key)
+{
+	struct fib_alias *pos;
+
+	if (!l)
+		return fib_insert_node(t, tp, new, key);
+
+	if (!fa) {
+		/* remember the last entry that has to stay in front of new */
+		hlist_for_each_entry(pos, &l->leaf, fa_list) {
+			if (new->fa_slen < pos->fa_slen)
+				break;
+			if (new->fa_slen == pos->fa_slen &&
+			    new->tb_id > pos->tb_id)
+				break;
+			fa = pos;
+		}
+
+		if (!fa)
+			hlist_add_head_rcu(&new->fa_list, &l->leaf);
+		else
+			hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
+	} else {
+		hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
+	}
+
+	/* if we added to the tail node then we need to update slen */
+	if (l->slen < new->fa_slen) {
+		l->slen = new->fa_slen;
+		node_push_suffix(tp, new->fa_slen);
+	}
+
+	return 0;
+}
+
+/* Validate a key/prefix-length pair: plen must not exceed KEYLENGTH and
+ * key must not have any bits set beyond the first plen bits.  On failure
+ * an error message is stored in extack and false is returned.
+ */
+static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
+{
+	if (plen > KEYLENGTH) {
+		NL_SET_ERR_MSG(extack, "Invalid prefix length");
+		return false;
+	}
+
+	/* a full-length prefix leaves no trailing bits to check */
+	if (plen == KEYLENGTH || !(key << plen))
+		return true;
+
+	NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length");
+	return false;
+}
+
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+			     struct key_vector *l, struct fib_alias *old);
+
+/* Insert the route described by cfg into table tb, honouring the
+ * NLM_F_EXCL, NLM_F_REPLACE, NLM_F_APPEND and NLM_F_CREATE flags in
+ * cfg->fc_nlflags.
+ *
+ * Returns 0 on success.  Errors visible in this function are -EINVAL for
+ * a bad key/prefix length, the error from fib_create_info(), -EEXIST for
+ * a conflicting existing route, -ENOENT when the route does not exist and
+ * NLM_F_CREATE is not set, -ENOBUFS when the alias cannot be allocated,
+ * and any error returned by fib_insert_alias() or the fib entry
+ * notifiers.
+ *
+ * Caller must hold RTNL.
+ */
+int fib_table_insert(struct net *net, struct fib_table *tb,
+		     struct fib_config *cfg, struct netlink_ext_ack *extack)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct fib_alias *fa, *new_fa;
+	struct key_vector *l, *tp;
+	u16 nlflags = NLM_F_EXCL;
+	struct fib_info *fi;
+	u8 plen = cfg->fc_dst_len;
+	u8 slen = KEYLENGTH - plen;
+	dscp_t dscp;
+	u32 key;
+	int err;
+
+	key = ntohl(cfg->fc_dst);
+
+	if (!fib_valid_key_len(key, plen, extack))
+		return -EINVAL;
+
+	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
+
+	fi = fib_create_info(cfg, extack);
+	if (IS_ERR(fi)) {
+		err = PTR_ERR(fi);
+		goto err;
+	}
+
+	dscp = cfg->fc_dscp;
+	l = fib_find_node(t, &tp, key);
+	fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority,
+				tb->tb_id, false) : NULL;
+
+	/* Now fa, if non-NULL, points to the first fib alias
+	 * with the same keys [prefix,dscp,priority], if such key already
+	 * exists or to the node before which we will insert new one.
+	 *
+	 * If fa is NULL, we will need to allocate a new one and
+	 * insert to the tail of the section matching the suffix length
+	 * of the new alias.
+	 */
+
+	if (fa && fa->fa_dscp == dscp &&
+	    fa->fa_info->fib_priority == fi->fib_priority) {
+		struct fib_alias *fa_first, *fa_match;
+
+		err = -EEXIST;
+		if (cfg->fc_nlflags & NLM_F_EXCL)
+			goto out;
+
+		nlflags &= ~NLM_F_EXCL;
+
+		/* We have 2 goals:
+		 * 1. Find exact match for type, scope, fib_info to avoid
+		 * duplicate routes
+		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
+		 */
+		fa_match = NULL;
+		fa_first = fa;
+		hlist_for_each_entry_from(fa, fa_list) {
+			if ((fa->fa_slen != slen) ||
+			    (fa->tb_id != tb->tb_id) ||
+			    (fa->fa_dscp != dscp))
+				break;
+			if (fa->fa_info->fib_priority != fi->fib_priority)
+				break;
+			if (fa->fa_type == cfg->fc_type &&
+			    fa->fa_info == fi) {
+				fa_match = fa;
+				break;
+			}
+		}
+
+		if (cfg->fc_nlflags & NLM_F_REPLACE) {
+			struct fib_info *fi_drop;
+			u8 state;
+
+			nlflags |= NLM_F_REPLACE;
+			fa = fa_first;
+			/* An identical route already exists: succeed only if
+			 * it is the very entry that would be replaced,
+			 * otherwise err is still -EEXIST.
+			 */
+			if (fa_match) {
+				if (fa == fa_match)
+					err = 0;
+				goto out;
+			}
+			err = -ENOBUFS;
+			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+			if (!new_fa)
+				goto out;
+
+			fi_drop = fa->fa_info;
+			new_fa->fa_dscp = fa->fa_dscp;
+			new_fa->fa_info = fi;
+			new_fa->fa_type = cfg->fc_type;
+			state = fa->fa_state;
+			new_fa->fa_state = state & ~FA_S_ACCESSED;
+			new_fa->fa_slen = fa->fa_slen;
+			new_fa->tb_id = tb->tb_id;
+			new_fa->fa_default = -1;
+			new_fa->offload = 0;
+			new_fa->trap = 0;
+			new_fa->offload_failed = 0;
+
+			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+
+			/* notify only when new_fa is the first alias for this
+			 * suffix length and table
+			 */
+			if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
+					   tb->tb_id, true) == new_fa) {
+				enum fib_event_type fib_event;
+
+				fib_event = FIB_EVENT_ENTRY_REPLACE;
+				err = call_fib_entry_notifiers(net, fib_event,
+							       key, plen,
+							       new_fa, extack);
+				/* put the old alias back if a notifier refused */
+				if (err) {
+					hlist_replace_rcu(&new_fa->fa_list,
+							  &fa->fa_list);
+					goto out_free_new_fa;
+				}
+			}
+
+			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
+				  tb->tb_id, &cfg->fc_nlinfo, nlflags);
+
+			alias_free_mem_rcu(fa);
+
+			fib_release_info(fi_drop);
+			if (state & FA_S_ACCESSED)
+				rt_cache_flush(cfg->fc_nlinfo.nl_net);
+
+			goto succeeded;
+		}
+		/* Error if we find a perfect match which
+		 * uses the same scope, type, and nexthop
+		 * information.
+		 */
+		if (fa_match)
+			goto out;
+
+		/* with NLM_F_APPEND fa keeps the position where the scan
+		 * above stopped; otherwise go back to the first match
+		 */
+		if (cfg->fc_nlflags & NLM_F_APPEND)
+			nlflags |= NLM_F_APPEND;
+		else
+			fa = fa_first;
+	}
+	err = -ENOENT;
+	if (!(cfg->fc_nlflags & NLM_F_CREATE))
+		goto out;
+
+	nlflags |= NLM_F_CREATE;
+	err = -ENOBUFS;
+	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+	if (!new_fa)
+		goto out;
+
+	new_fa->fa_info = fi;
+	new_fa->fa_dscp = dscp;
+	new_fa->fa_type = cfg->fc_type;
+	new_fa->fa_state = 0;
+	new_fa->fa_slen = slen;
+	new_fa->tb_id = tb->tb_id;
+	new_fa->fa_default = -1;
+	new_fa->offload = 0;
+	new_fa->trap = 0;
+	new_fa->offload_failed = 0;
+
+	/* Insert new entry to the list. */
+	err = fib_insert_alias(t, tp, l, new_fa, fa, key);
+	if (err)
+		goto out_free_new_fa;
+
+	/* The alias was already inserted, so the node must exist. */
+	l = l ? l : fib_find_node(t, &tp, key);
+	if (WARN_ON_ONCE(!l)) {
+		err = -ENOENT;
+		goto out_free_new_fa;
+	}
+
+	/* notify only when new_fa is the first alias for this suffix
+	 * length and table
+	 */
+	if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
+	    new_fa) {
+		enum fib_event_type fib_event;
+
+		fib_event = FIB_EVENT_ENTRY_REPLACE;
+		err = call_fib_entry_notifiers(net, fib_event, key, plen,
+					       new_fa, extack);
+		if (err)
+			goto out_remove_new_fa;
+	}
+
+	/* a zero-length prefix is counted as a default route */
+	if (!plen)
+		tb->tb_num_default++;
+
+	rt_cache_flush(cfg->fc_nlinfo.nl_net);
+	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
+		  &cfg->fc_nlinfo, nlflags);
+succeeded:
+	return 0;
+
+	/* error unwinding: each label undoes one more step than the next */
+out_remove_new_fa:
+	fib_remove_alias(t, tp, l, new_fa);
+out_free_new_fa:
+	kmem_cache_free(fn_alias_kmem, new_fa);
+out:
+	fib_release_info(fi);
+err:
+	return err;
+}
+
+/* Non-zero when key differs from n->key in any bit at or above the
+ * lowest set bit of n->key.
+ */
+static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
+{
+	const t_key node_key = n->key;
+	const t_key diff = key ^ node_key;
+
+	/* node_key | -node_key keeps the lowest set bit and all bits above */
+	return diff & (node_key | -node_key);
+}
+
+/* Decide whether nexthop nhc may be used for the lookup described by flp.
+ * It is rejected when it is flagged dead, when it is link-down on a
+ * device for which ip_ignore_linkdown() is true (unless fib_flags has
+ * FIB_LOOKUP_IGNORE_LINKSTATE), or when flp names an output interface
+ * different from the nexthop's.
+ */
+bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
+			 const struct flowi4 *flp)
+{
+	if (nhc->nhc_flags & RTNH_F_DEAD)
+		return false;
+
+	if (ip_ignore_linkdown(nhc->nhc_dev) &&
+	    (nhc->nhc_flags & RTNH_F_LINKDOWN)) {
+		if (!(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
+			return false;
+	}
+
+	/* an unset oif accepts any nexthop, otherwise it has to match */
+	return !flp->flowi4_oif || flp->flowi4_oif == nhc->nhc_oif;
+}
+
+/* Longest-prefix-match lookup of flp->daddr in table tb.
+ *
+ * On a match res is filled in and the error value of the matched route
+ * type (fib_props[fa->fa_type].error) is returned; a negative value from
+ * that table, or the blackhole error for a blackhole nexthop object, is
+ * returned without filling res.  -EAGAIN is returned when the trie is
+ * empty or backtracking reaches the root without finding a usable alias.
+ * Unless FIB_LOOKUP_NOREF is set in fib_flags a reference is taken on
+ * the returned fib_info.
+ *
+ * should be called with rcu_read_lock
+ */
+int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
+		     struct fib_result *res, int fib_flags)
+{
+	struct trie *t = (struct trie *) tb->tb_data;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	struct trie_use_stats __percpu *stats = t->stats;
+#endif
+	const t_key key = ntohl(flp->daddr);
+	struct key_vector *n, *pn;
+	struct fib_alias *fa;
+	unsigned long index;
+	t_key cindex;
+
+	/* pn/cindex remember where backtracking has to resume */
+	pn = t->kv;
+	cindex = 0;
+
+	n = get_child_rcu(pn, cindex);
+	if (!n) {
+		trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
+		return -EAGAIN;
+	}
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	this_cpu_inc(stats->gets);
+#endif
+
+	/* Step 1: Travel to the longest prefix match in the trie */
+	for (;;) {
+		index = get_cindex(key, n);
+
+		/* This bit of code is a bit tricky but it combines multiple
+		 * checks into a single check.  The prefix consists of the
+		 * prefix plus zeros for the "bits" in the prefix. The index
+		 * is the difference between the key and this value.  From
+		 * this we can actually derive several pieces of data.
+		 *   if (index >= (1ul << bits))
+		 *     we have a mismatch in skip bits and failed
+		 *   else
+		 *     we know the value is cindex
+		 *
+		 * This check is safe even if bits == KEYLENGTH due to the
+		 * fact that we can only allocate a node with 32 bits if a
+		 * long is greater than 32 bits.
+		 */
+		if (index >= (1ul << n->bits))
+			break;
+
+		/* we have found a leaf. Prefixes have already been compared */
+		if (IS_LEAF(n))
+			goto found;
+
+		/* only record pn and cindex if we are going to be chopping
+		 * bits later.  Otherwise we are just wasting cycles.
+		 */
+		if (n->slen > n->pos) {
+			pn = n;
+			cindex = index;
+		}
+
+		n = get_child_rcu(n, index);
+		if (unlikely(!n))
+			goto backtrace;
+	}
+
+	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
+	for (;;) {
+		/* record the pointer where our next node pointer is stored */
+		struct key_vector __rcu **cptr = n->tnode;
+
+		/* This test verifies that none of the bits that differ
+		 * between the key and the prefix exist in the region of
+		 * the lsb and higher in the prefix.
+		 */
+		if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
+			goto backtrace;
+
+		/* exit out and process leaf */
+		if (unlikely(IS_LEAF(n)))
+			break;
+
+		/* Don't bother recording parent info.  Since we are in
+		 * prefix match mode we will have to come back to wherever
+		 * we started this traversal anyway
+		 */
+
+		/* NOTE: the backtrace label sits inside this loop body, so
+		 * the gotos above and in step 3 enter here and the loop
+		 * condition then loads the next candidate through cptr.
+		 */
+		while ((n = rcu_dereference(*cptr)) == NULL) {
+backtrace:
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			if (!n)
+				this_cpu_inc(stats->null_node_hit);
+#endif
+			/* If we are at cindex 0 there are no more bits for
+			 * us to strip at this level so we must ascend back
+			 * up one level to see if there are any more bits to
+			 * be stripped there.
+			 */
+			while (!cindex) {
+				t_key pkey = pn->key;
+
+				/* If we don't have a parent then there is
+				 * nothing for us to do as we do not have any
+				 * further nodes to parse.
+				 */
+				if (IS_TRIE(pn)) {
+					trace_fib_table_lookup(tb->tb_id, flp,
+							       NULL, -EAGAIN);
+					return -EAGAIN;
+				}
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+				this_cpu_inc(stats->backtrack);
+#endif
+				/* Get Child's index */
+				pn = node_parent_rcu(pn);
+				cindex = get_index(pkey, pn);
+			}
+
+			/* strip the least significant bit from the cindex */
+			cindex &= cindex - 1;
+
+			/* grab pointer for next child node */
+			cptr = &pn->tnode[cindex];
+		}
+	}
+
+found:
+	/* this line carries forward the xor from earlier in the function */
+	index = key ^ n->key;
+
+	/* Step 3: Process the leaf, if that fails fall back to backtracing */
+	hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+		struct fib_nh_common *nhc;
+		int nhsel, err;
+
+		/* skip aliases whose prefix does not cover the key */
+		if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
+			if (index >= (1ul << fa->fa_slen))
+				continue;
+		}
+		if (fa->fa_dscp &&
+		    inet_dscp_to_dsfield(fa->fa_dscp) != flp->flowi4_tos)
+			continue;
+		/* Paired with WRITE_ONCE() in fib_release_info() */
+		if (READ_ONCE(fi->fib_dead))
+			continue;
+		if (fa->fa_info->fib_scope < flp->flowi4_scope)
+			continue;
+		fib_alias_accessed(fa);
+		err = fib_props[fa->fa_type].error;
+		if (unlikely(err < 0)) {
+			/* also entered from the blackhole nexthop check below */
+out_reject:
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			this_cpu_inc(stats->semantic_match_passed);
+#endif
+			trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
+			return err;
+		}
+		if (fi->fib_flags & RTNH_F_DEAD)
+			continue;
+
+		/* routes using a nexthop object select their nhc through it */
+		if (unlikely(fi->nh)) {
+			if (nexthop_is_blackhole(fi->nh)) {
+				err = fib_props[RTN_BLACKHOLE].error;
+				goto out_reject;
+			}
+
+			nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp,
+						     &nhsel);
+			if (nhc)
+				goto set_result;
+			goto miss;
+		}
+
+		for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
+			nhc = fib_info_nhc(fi, nhsel);
+
+			if (!fib_lookup_good_nhc(nhc, fib_flags, flp))
+				continue;
+set_result:
+			if (!(fib_flags & FIB_LOOKUP_NOREF))
+				refcount_inc(&fi->fib_clntref);
+
+			res->prefix = htonl(n->key);
+			res->prefixlen = KEYLENGTH - fa->fa_slen;
+			res->nh_sel = nhsel;
+			res->nhc = nhc;
+			res->type = fa->fa_type;
+			res->scope = fi->fib_scope;
+			res->fi = fi;
+			res->table = tb;
+			res->fa_head = &n->leaf;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			this_cpu_inc(stats->semantic_match_passed);
+#endif
+			trace_fib_table_lookup(tb->tb_id, flp, nhc, err);
+
+			return err;
+		}
+	}
+miss:
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	this_cpu_inc(stats->semantic_match_miss);
+#endif
+	/* nothing usable in this leaf, retry with a shorter prefix */
+	goto backtrace;
+}
+EXPORT_SYMBOL_GPL(fib_table_lookup);
+
+/* Unlink alias old from leaf l, whose parent is tp.  If that empties the
+ * leaf, the leaf is detached from tp and freed and the trie is rebalanced
+ * from tp.  Otherwise, if old was the last alias in the list, the leaf
+ * and its ancestors get their suffix lengths refreshed from the alias
+ * that preceded old.  The alias itself is not freed here.
+ */
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+			     struct key_vector *l, struct fib_alias *old)
+{
+	/* record the location of the previous list_info entry */
+	struct hlist_node **pprev = old->fa_list.pprev;
+	/* interpret pprev as the fa_list.next field of the preceding alias;
+	 * fa is only dereferenced below once that is known to hold
+	 */
+	struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
+
+	/* remove the fib_alias from the list */
+	hlist_del_rcu(&old->fa_list);
+
+	/* if we emptied the list this leaf will be freed and we can sort
+	 * out parent suffix lengths as a part of trie_rebalance
+	 */
+	if (hlist_empty(&l->leaf)) {
+		if (tp->slen == l->slen)
+			node_pull_suffix(tp, tp->pos);
+		put_child_root(tp, l->key, NULL);
+		node_free(l);
+		trie_rebalance(t, tp);
+		return;
+	}
+
+	/* only access fa if it is pointing at the last valid hlist_node */
+	if (*pprev)
+		return;
+
+	/* update the trie with the latest suffix length */
+	l->slen = fa->fa_slen;
+	node_pull_suffix(tp, fa->fa_slen);
+}
+
+/* Tell the fib entry notifiers that fa_to_delete is going away.  Nothing
+ * is sent unless fa_to_delete is the first alias for its suffix length
+ * and table in fah.  When the following alias shares that suffix length
+ * and table, a REPLACE event for the following alias is sent; otherwise a
+ * DEL event for fa_to_delete is sent.
+ */
+static void fib_notify_alias_delete(struct net *net, u32 key,
+				    struct hlist_head *fah,
+				    struct fib_alias *fa_to_delete,
+				    struct netlink_ext_ack *extack)
+{
+	const u32 tb_id = fa_to_delete->tb_id;
+	const u8 slen = fa_to_delete->fa_slen;
+	enum fib_event_type event = FIB_EVENT_ENTRY_DEL;
+	struct fib_alias *subject = fa_to_delete;
+	struct fib_alias *successor;
+
+	/* Do not notify if we do not care about the route. */
+	if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
+		return;
+
+	/* Determine if the route should be replaced by the next route in the
+	 * list.
+	 */
+	successor = hlist_entry_safe(fa_to_delete->fa_list.next,
+				     struct fib_alias, fa_list);
+	if (successor && successor->fa_slen == slen &&
+	    successor->tb_id == tb_id) {
+		event = FIB_EVENT_ENTRY_REPLACE;
+		subject = successor;
+	}
+
+	call_fib_entry_notifiers(net, event, key, KEYLENGTH - slen,
+				 subject, extack);
+}
+
+/* Delete the first route in tb that matches cfg.  Unset selectors in cfg
+ * (type, prefsrc, protocol, or a scope of RT_SCOPE_NOWHERE) act as
+ * wildcards.
+ *
+ * Returns 0 on success, -EINVAL for a bad key/prefix length and -ESRCH
+ * when no matching route exists.
+ *
+ * Caller must hold RTNL.
+ */
+int fib_table_delete(struct net *net, struct fib_table *tb,
+		     struct fib_config *cfg, struct netlink_ext_ack *extack)
+{
+	struct trie *t = (struct trie *) tb->tb_data;
+	struct fib_alias *fa, *fa_to_delete;
+	struct key_vector *l, *tp;
+	u8 plen = cfg->fc_dst_len;
+	u8 slen = KEYLENGTH - plen;
+	dscp_t dscp;
+	u32 key;
+
+	key = ntohl(cfg->fc_dst);
+
+	if (!fib_valid_key_len(key, plen, extack))
+		return -EINVAL;
+
+	l = fib_find_node(t, &tp, key);
+	if (!l)
+		return -ESRCH;
+
+	dscp = cfg->fc_dscp;
+	fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false);
+	if (!fa)
+		return -ESRCH;
+
+	pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen,
+		 inet_dscp_to_dsfield(dscp), t);
+
+	/* scan the aliases sharing slen, table and dscp for a full match */
+	fa_to_delete = NULL;
+	hlist_for_each_entry_from(fa, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+
+		if ((fa->fa_slen != slen) ||
+		    (fa->tb_id != tb->tb_id) ||
+		    (fa->fa_dscp != dscp))
+			break;
+
+		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
+		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
+		     fa->fa_info->fib_scope == cfg->fc_scope) &&
+		    (!cfg->fc_prefsrc ||
+		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
+		    (!cfg->fc_protocol ||
+		     fi->fib_protocol == cfg->fc_protocol) &&
+		    fib_nh_match(net, cfg, fi, extack) == 0 &&
+		    fib_metrics_match(cfg, fi)) {
+			fa_to_delete = fa;
+			break;
+		}
+	}
+
+	if (!fa_to_delete)
+		return -ESRCH;
+
+	/* notifications are sent while the alias is still on the list */
+	fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
+	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
+		  &cfg->fc_nlinfo, 0);
+
+	/* a zero-length prefix is counted as a default route */
+	if (!plen)
+		tb->tb_num_default--;
+
+	fib_remove_alias(t, tp, l, fa_to_delete);
+
+	if (fa_to_delete->fa_state & FA_S_ACCESSED)
+		rt_cache_flush(cfg->fc_nlinfo.nl_net);
+
+	fib_release_info(fa_to_delete->fa_info);
+	alias_free_mem_rcu(fa_to_delete);
+	return 0;
+}
+
+/* Scan for the next leaf starting at the provided key value.
+ *
+ * *tn is the node the scan starts from; on return it holds the node at
+ * which the scan stopped.  Returns the leaf that was found, or NULL once
+ * the scan has climbed back to the trie root without finding one.
+ */
+static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
+{
+	struct key_vector *pn, *n = *tn;
+	unsigned long cindex;
+
+	/* this loop is meant to try and find the key in the trie */
+	do {
+		/* record parent and next child index */
+		pn = n;
+		cindex = (key > pn->key) ? get_index(key, pn) : 0;
+
+		/* the index does not fit in this node, go search forward */
+		if (cindex >> pn->bits)
+			break;
+
+		/* descend into the next child */
+		n = get_child_rcu(pn, cindex++);
+		if (!n)
+			break;
+
+		/* guarantee forward progress on the keys */
+		if (IS_LEAF(n) && (n->key >= key))
+			goto found;
+	} while (IS_TNODE(n));
+
+	/* this loop will search for the next leaf with a greater key */
+	while (!IS_TRIE(pn)) {
+		/* if we exhausted the parent node we will need to climb */
+		if (cindex >= (1ul << pn->bits)) {
+			t_key pkey = pn->key;
+
+			pn = node_parent_rcu(pn);
+			/* resume right after the slot we just came out of */
+			cindex = get_index(pkey, pn) + 1;
+			continue;
+		}
+
+		/* grab the next available node */
+		n = get_child_rcu(pn, cindex++);
+		if (!n)
+			continue;
+
+		/* no need to compare keys since we bumped the index */
+		if (IS_LEAF(n))
+			goto found;
+
+		/* Rescan start scanning in new node */
+		pn = n;
+		cindex = 0;
+	}
+
+	*tn = pn;
+	return NULL; /* Root of trie */
+found:
+	/* if we are at the limit for keys just return NULL for the tnode */
+	*tn = pn;
+	return n;
+}
+
+/* Tear down table tb: every alias, leaf and tnode of its trie is freed,
+ * followed by the per-cpu statistics (with CONFIG_IP_FIB_TRIE_STATS) and
+ * the table itself.  Only the alias memory is released here; no
+ * fib_release_info() call is made for the aliases' fa_info.
+ */
+static void fib_trie_free(struct fib_table *tb)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct key_vector *pn = t->kv;
+	/* starting at 1 makes the first pass visit child 0 of the root */
+	unsigned long cindex = 1;
+	struct hlist_node *tmp;
+	struct fib_alias *fa;
+
+	/* walk trie in reverse order and free everything */
+	for (;;) {
+		struct key_vector *n;
+
+		/* all children of pn were handled: free pn and climb */
+		if (!(cindex--)) {
+			t_key pkey = pn->key;
+
+			if (IS_TRIE(pn))
+				break;
+
+			n = pn;
+			pn = node_parent(pn);
+
+			/* drop emptied tnode */
+			put_child_root(pn, n->key, NULL);
+			node_free(n);
+
+			cindex = get_index(pkey, pn);
+
+			continue;
+		}
+
+		/* grab the next available node */
+		n = get_child(pn, cindex);
+		if (!n)
+			continue;
+
+		if (IS_TNODE(n)) {
+			/* record pn and cindex for leaf walking */
+			pn = n;
+			cindex = 1ul << n->bits;
+
+			continue;
+		}
+
+		/* n is a leaf: release all of its aliases, then the leaf */
+		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+			hlist_del_rcu(&fa->fa_list);
+			alias_free_mem_rcu(fa);
+		}
+
+		put_child_root(pn, n->key, NULL);
+		node_free(n);
+	}
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	free_percpu(t->stats);
+#endif
+	kfree(tb);
+}
+
+/* Build a separate RT_TABLE_LOCAL table out of a trie that is shared with
+ * oldtb, by cloning every alias whose tb_id is that of the new local
+ * table.
+ *
+ * Returns oldtb unchanged when its tb_data already points at its own
+ * __data area, the new table on success, or NULL when the table or an
+ * alias cannot be allocated or inserted (the partially built table is
+ * freed in that case).
+ */
+struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
+{
+	struct trie *ot = (struct trie *)oldtb->tb_data;
+	struct key_vector *l, *tp = ot->kv;
+	struct fib_table *local_tb;
+	struct fib_alias *fa;
+	struct trie *lt;
+	t_key key = 0;
+
+	if (oldtb->tb_data == oldtb->__data)
+		return oldtb;
+
+	local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
+	if (!local_tb)
+		return NULL;
+
+	lt = (struct trie *)local_tb->tb_data;
+
+	/* visit every leaf of the old trie in increasing key order */
+	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+		struct key_vector *local_l = NULL, *local_tp;
+
+		hlist_for_each_entry(fa, &l->leaf, fa_list) {
+			struct fib_alias *new_fa;
+
+			if (local_tb->tb_id != fa->tb_id)
+				continue;
+
+			/* clone fa for new local table */
+			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+			if (!new_fa)
+				goto out;
+
+			/* plain copy: fa_info is shared with the original */
+			memcpy(new_fa, fa, sizeof(*fa));
+
+			/* insert clone into table */
+			if (!local_l)
+				local_l = fib_find_node(lt, &local_tp, l->key);
+
+			if (fib_insert_alias(lt, local_tp, local_l, new_fa,
+					     NULL, l->key)) {
+				kmem_cache_free(fn_alias_kmem, new_fa);
+				goto out;
+			}
+		}
+
+		/* stop loop if key wrapped back to 0 */
+		key = l->key + 1;
+		if (key < l->key)
+			break;
+	}
+
+	return local_tb;
+out:
+	fib_trie_free(local_tb);
+
+	return NULL;
+}
+
+/* Remove from tb's trie every alias whose tb_id differs from tb->tb_id,
+ * free leaves that become empty, and update suffix lengths and resize
+ * tnodes on the way back up.
+ *
+ * Caller must hold RTNL
+ */
+void fib_table_flush_external(struct fib_table *tb)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct key_vector *pn = t->kv;
+	/* starting at 1 makes the first pass visit child 0 of the root */
+	unsigned long cindex = 1;
+	struct hlist_node *tmp;
+	struct fib_alias *fa;
+
+	/* walk trie in reverse order */
+	for (;;) {
+		unsigned char slen = 0;
+		struct key_vector *n;
+
+		/* all children of pn were handled: fix up pn and climb */
+		if (!(cindex--)) {
+			t_key pkey = pn->key;
+
+			/* cannot resize the trie vector */
+			if (IS_TRIE(pn))
+				break;
+
+			/* update the suffix to address pulled leaves */
+			if (pn->slen > pn->pos)
+				update_suffix(pn);
+
+			/* resize completed node */
+			pn = resize(t, pn);
+			cindex = get_index(pkey, pn);
+
+			continue;
+		}
+
+		/* grab the next available node */
+		n = get_child(pn, cindex);
+		if (!n)
+			continue;
+
+		if (IS_TNODE(n)) {
+			/* record pn and cindex for leaf walking */
+			pn = n;
+			cindex = 1ul << n->bits;
+
+			continue;
+		}
+
+		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+			/* if alias was cloned to local then we just
+			 * need to remove the local copy from main
+			 */
+			if (tb->tb_id != fa->tb_id) {
+				hlist_del_rcu(&fa->fa_list);
+				alias_free_mem_rcu(fa);
+				continue;
+			}
+
+			/* record local slen */
+			slen = fa->fa_slen;
+		}
+
+		/* update leaf slen */
+		n->slen = slen;
+
+		if (hlist_empty(&n->leaf)) {
+			put_child_root(pn, n->key, NULL);
+			node_free(n);
+		}
+	}
+}
+
+/* Flush from tb the aliases that belong to this table and are either
+ * flagged RTNH_F_DEAD or of a route type with an error in fib_props; the
+ * latter are only flushed when flush_all is set.  Leaves that become
+ * empty are freed, and suffix lengths and tnode sizes are fixed up on
+ * the way back up.
+ *
+ * Returns the number of aliases removed.
+ *
+ * Caller must hold RTNL.
+ */
+int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct nl_info info = { .nl_net = net };
+	struct key_vector *pn = t->kv;
+	/* starting at 1 makes the first pass visit child 0 of the root */
+	unsigned long cindex = 1;
+	struct hlist_node *tmp;
+	struct fib_alias *fa;
+	int found = 0;
+
+	/* walk trie in reverse order */
+	for (;;) {
+		unsigned char slen = 0;
+		struct key_vector *n;
+
+		/* all children of pn were handled: fix up pn and climb */
+		if (!(cindex--)) {
+			t_key pkey = pn->key;
+
+			/* cannot resize the trie vector */
+			if (IS_TRIE(pn))
+				break;
+
+			/* update the suffix to address pulled leaves */
+			if (pn->slen > pn->pos)
+				update_suffix(pn);
+
+			/* resize completed node */
+			pn = resize(t, pn);
+			cindex = get_index(pkey, pn);
+
+			continue;
+		}
+
+		/* grab the next available node */
+		n = get_child(pn, cindex);
+		if (!n)
+			continue;
+
+		if (IS_TNODE(n)) {
+			/* record pn and cindex for leaf walking */
+			pn = n;
+			cindex = 1ul << n->bits;
+
+			continue;
+		}
+
+		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+			struct fib_info *fi = fa->fa_info;
+
+			/* keep aliases without fib_info, aliases of another
+			 * table, and live aliases of a non-error route type
+			 */
+			if (!fi || tb->tb_id != fa->tb_id ||
+			    (!(fi->fib_flags & RTNH_F_DEAD) &&
+			     !fib_props[fa->fa_type].error)) {
+				slen = fa->fa_slen;
+				continue;
+			}
+
+			/* Do not flush error routes if network namespace is
+			 * not being dismantled
+			 */
+			if (!flush_all && fib_props[fa->fa_type].error) {
+				slen = fa->fa_slen;
+				continue;
+			}
+
+			fib_notify_alias_delete(net, n->key, &n->leaf, fa,
+						NULL);
+			/* RTM_DELROUTE is only sent when pfsrc_removed is set */
+			if (fi->pfsrc_removed)
+				rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa,
+					  KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0);
+			hlist_del_rcu(&fa->fa_list);
+			fib_release_info(fa->fa_info);
+			alias_free_mem_rcu(fa);
+			found++;
+		}
+
+		/* update leaf slen */
+		n->slen = slen;
+
+		if (hlist_empty(&n->leaf)) {
+			put_child_root(pn, n->key, NULL);
+			node_free(n);
+		}
+	}
+
+	pr_debug("trie_flush found=%d\n", found);
+	return found;
+}
+
+/* Walk every leaf of tb and send an RTM_NEWROUTE message flagged
+ * NLM_F_REPLACE for each alias of this table whose fib_info has
+ * nh_updated set.  The trie itself is not modified.
+ *
+ * derived from fib_trie_free
+ */
+static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
+				     struct nl_info *info)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct key_vector *pn = t->kv;
+	/* starting at 1 makes the first pass visit child 0 of the root */
+	unsigned long cindex = 1;
+	struct fib_alias *fa;
+
+	for (;;) {
+		struct key_vector *n;
+
+		/* all children of pn were visited: climb to its parent */
+		if (!(cindex--)) {
+			t_key pkey = pn->key;
+
+			if (IS_TRIE(pn))
+				break;
+
+			pn = node_parent(pn);
+			cindex = get_index(pkey, pn);
+			continue;
+		}
+
+		/* grab the next available node */
+		n = get_child(pn, cindex);
+		if (!n)
+			continue;
+
+		if (IS_TNODE(n)) {
+			/* record pn and cindex for leaf walking */
+			pn = n;
+			cindex = 1ul << n->bits;
+
+			continue;
+		}
+
+		hlist_for_each_entry(fa, &n->leaf, fa_list) {
+			struct fib_info *fi = fa->fa_info;
+
+			/* skip aliases that are not ours or were not updated */
+			if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id)
+				continue;
+
+			rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
+				  KEYLENGTH - fa->fa_slen, tb->tb_id,
+				  info, NLM_F_REPLACE);
+		}
+	}
+}
+
+/* Run __fib_info_notify_update() on every FIB table hashed in @net. */
+void fib_info_notify_update(struct net *net, struct nl_info *info)
+{
+	struct fib_table *table;
+	unsigned int bucket;
+
+	for (bucket = 0; bucket < FIB_TABLE_HASHSZ; bucket++) {
+		struct hlist_head *chain = &net->ipv4.fib_table_hash[bucket];
+
+		hlist_for_each_entry_rcu(table, chain, tb_hlist, lockdep_rtnl_is_held())
+			__fib_info_notify_update(net, table, info);
+	}
+}
+
+/* Replay one FIB_EVENT_ENTRY_REPLACE per run of equal suffix lengths on leaf
+ * @l to @nb, looking only at aliases that carry a fib_info and belong to @tb.
+ * Returns 0, or the first non-zero notifier result.
+ */
+static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
+			   struct notifier_block *nb,
+			   struct netlink_ext_ack *extack)
+{
+	int prev_slen = -1;
+	struct fib_alias *alias;
+
+	hlist_for_each_entry_rcu(alias, &l->leaf, fa_list) {
+		int rc;
+
+		/* local and main table can share the same trie,
+		 * so don't notify twice for the same entry.
+		 */
+		if (!alias->fa_info || tb->tb_id != alias->tb_id)
+			continue;
+
+		if (alias->fa_slen == prev_slen)
+			continue;
+
+		prev_slen = alias->fa_slen;
+		rc = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE,
+					     l->key, KEYLENGTH - alias->fa_slen,
+					     alias, extack);
+		if (rc)
+			return rc;
+	}
+	return 0;
+}
+
+static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb,
+			    struct netlink_ext_ack *extack)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct key_vector *l, *tp = t->kv;
+	t_key key = 0;
+	int err;
+
+	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+		err = fib_leaf_notify(l, tb, nb, extack);
+		if (err)
+			return err;
+
+		key = l->key + 1;
+		/* stop in case of wrap around */
+		if (key < l->key)
+			break;
+	}
+	return 0;
+}
+
+int fib_notify(struct net *net, struct notifier_block *nb,
+	       struct netlink_ext_ack *extack)
+{
+	unsigned int h;
+	int err;
+
+	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+		struct fib_table *tb;
+
+		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+			err = fib_table_notify(tb, nb, extack);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+static void __trie_free_rcu(struct rcu_head *head)	/* call_rcu() callback */
+{
+	struct fib_table *tb = container_of(head, struct fib_table, rcu);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	struct trie *t = (struct trie *)tb->tb_data;
+	/* stats are freed only when the trie is embedded in this very table */
+	if (tb->tb_data == tb->__data)
+		free_percpu(t->stats);
+#endif /* CONFIG_IP_FIB_TRIE_STATS */
+	kfree(tb);
+}
+
+void fib_free_table(struct fib_table *tb)
+{
+	call_rcu(&tb->rcu, __trie_free_rcu);	/* actual freeing is deferred via RCU */
+}
+
+static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
+			     struct sk_buff *skb, struct netlink_callback *cb,
+			     struct fib_dump_filter *filter)
+{
+	unsigned int flags = NLM_F_MULTI;
+	__be32 xkey = htonl(l->key);
+	int i, s_i, i_fa, s_fa, err;
+	struct fib_alias *fa;
+
+	if (filter->filter_set ||
+	    !filter->dump_exceptions || !filter->dump_routes)
+		flags |= NLM_F_DUMP_FILTERED;
+
+	s_i = cb->args[4];	/* resume point: alias index within this leaf */
+	s_fa = cb->args[5];	/* resume point: sub-entry index within an alias */
+	i = 0;
+
+	/* rcu_read_lock is held by caller */
+	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+
+		if (i < s_i)	/* before the resume point */
+			goto next;
+
+		i_fa = 0;
+
+		if (tb->tb_id != fa->tb_id)
+			goto next;
+
+		if (filter->filter_set) {
+			if (filter->rt_type && fa->fa_type != filter->rt_type)
+				goto next;
+
+			if ((filter->protocol &&
+			     fi->fib_protocol != filter->protocol))
+				goto next;
+
+			if (filter->dev &&
+			    !fib_info_nh_uses_dev(fi, filter->dev))
+				goto next;
+		}
+
+		if (filter->dump_routes) {
+			if (!s_fa) {	/* the route itself has not been emitted yet */
+				struct fib_rt_info fri;
+
+				fri.fi = fi;
+				fri.tb_id = tb->tb_id;
+				fri.dst = xkey;
+				fri.dst_len = KEYLENGTH - fa->fa_slen;
+				fri.dscp = fa->fa_dscp;
+				fri.type = fa->fa_type;
+				fri.offload = READ_ONCE(fa->offload);
+				fri.trap = READ_ONCE(fa->trap);
+				fri.offload_failed = READ_ONCE(fa->offload_failed);
+				err = fib_dump_info(skb,
+						    NETLINK_CB(cb->skb).portid,
+						    cb->nlh->nlmsg_seq,
+						    RTM_NEWROUTE, &fri, flags);
+				if (err < 0)
+					goto stop;
+			}
+
+			i_fa++;	/* the route counts as the alias' first sub-entry */
+		}
+
+		if (filter->dump_exceptions) {
+			err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
+						 &i_fa, s_fa, flags);
+			if (err < 0)
+				goto stop;
+		}
+
+next:
+		i++;
+	}
+
+	cb->args[4] = i;
+	return skb->len;
+
+stop:	/* a dump helper failed: store where the next call has to resume */
+	cb->args[4] = i;
+	cb->args[5] = i_fa;
+	return err;
+}
+
+/* rcu_read_lock needs to be held by caller from readside */
+int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
+		   struct netlink_callback *cb, struct fib_dump_filter *filter)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct key_vector *l, *tp = t->kv;
+	/* Dump starting at last key.
+	 * Note: 0.0.0.0/0 (ie default) is first key.
+	 */
+	int count = cb->args[2];	/* leaves completed by earlier calls */
+	t_key key = cb->args[3];	/* key at which the leaf walk resumes */
+
+	/* First time here, count and key are both always 0. Count > 0
+	 * and key == 0 means the dump has wrapped around and we are done.
+	 */
+	if (count && !key)
+		return skb->len;
+
+	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+		int err;
+
+		err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
+		if (err < 0) {	/* keep the current key so this leaf is retried */
+			cb->args[3] = key;
+			cb->args[2] = count;
+			return err;
+		}
+
+		++count;
+		key = l->key + 1;
+		/* leaf finished: clear the per-leaf resume state (args[4] and up) */
+		memset(&cb->args[4], 0,
+		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
+
+		/* stop loop if key wrapped back to 0 */
+		if (key < l->key)
+			break;
+	}
+
+	cb->args[3] = key;
+	cb->args[2] = count;
+
+	return skb->len;
+}
+
+void __init fib_trie_init(void)
+{
+	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+					  sizeof(struct fib_alias),
+					  0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
+	/* results are not checked; both caches are created with SLAB_PANIC */
+	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
+					   LEAF_SIZE,
+					   0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
+}
+
+/* Allocate FIB table @id, or return NULL on allocation failure.  With @alias
+ * the table reuses alias->__data; otherwise it embeds its own empty trie.
+ */
+struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
+{
+	struct fib_table *table;
+	struct trie *trie;
+	size_t bytes = sizeof(*table) + (alias ? 0 : sizeof(struct trie));
+
+	table = kzalloc(bytes, GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	table->tb_id = id;
+	table->tb_num_default = 0;
+	if (alias) {
+		table->tb_data = alias->__data;
+		return table;
+	}
+
+	table->tb_data = table->__data;
+	trie = (struct trie *)table->tb_data;
+	trie->kv[0].pos = KEYLENGTH;
+	trie->kv[0].slen = KEYLENGTH;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	trie->stats = alloc_percpu(struct trie_use_stats);
+	if (!trie->stats) {
+		kfree(table);
+		table = NULL;
+	}
+#endif
+	return table;
+}
+
+#ifdef CONFIG_PROC_FS
+/* Depth first Trie walk iterator */
+struct fib_trie_iter {
+	struct seq_net_private p;
+	struct fib_table *tb;		/* table the current node was found in */
+	struct key_vector *tnode;	/* node whose child slots are being scanned */
+	unsigned int index;		/* next child slot of tnode to examine */
+	unsigned int depth;		/* raised on descent, lowered when popping up */
+};
+
+static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
+{
+	unsigned long cindex = iter->index;
+	struct key_vector *pn = iter->tnode;
+	t_key pkey;
+
+	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
+		 iter->tnode, iter->index, iter->depth);
+
+	while (!IS_TRIE(pn)) {	/* until the walk pops out at the root */
+		while (cindex < child_length(pn)) {
+			struct key_vector *n = get_child_rcu(pn, cindex++);
+
+			if (!n)
+				continue;
+
+			if (IS_LEAF(n)) {
+				iter->tnode = pn;	/* keep scanning pn next time */
+				iter->index = cindex;
+			} else {
+				/* push down one level */
+				iter->tnode = n;
+				iter->index = 0;
+				++iter->depth;
+			}
+
+			return n;
+		}
+
+		/* Current node exhausted, pop back up */
+		pkey = pn->key;
+		pn = node_parent_rcu(pn);
+		cindex = get_index(pkey, pn) + 1;	/* go on after that slot */
+		--iter->depth;
+	}
+
+	/* record root node so further searches know we are done */
+	iter->tnode = pn;
+	iter->index = 0;
+
+	return NULL;
+}
+
+static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
+					     struct trie *t)
+{
+	struct key_vector *n, *pn;
+
+	if (!t)
+		return NULL;
+
+	pn = t->kv;
+	n = rcu_dereference(pn->tnode[0]);	/* child in slot 0 of the root vector */
+	if (!n)
+		return NULL;	/* nothing stored in this trie */
+
+	if (IS_TNODE(n)) {	/* fib_trie_get_next() will scan n's children */
+		iter->tnode = n;
+		iter->index = 0;
+		iter->depth = 1;
+	} else {	/* not a tnode: fib_trie_get_next() then ends at the root */
+		iter->tnode = pn;
+		iter->index = 0;
+		iter->depth = 0;
+	}
+
+	return n;
+}
+
+static void trie_collect_stats(struct trie *t, struct trie_stat *s)
+{
+	struct key_vector *n;
+	struct fib_trie_iter iter;
+
+	memset(s, 0, sizeof(*s));	/* @s is output only; start from zero */
+
+	rcu_read_lock();
+	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
+		if (IS_LEAF(n)) {
+			struct fib_alias *fa;
+
+			s->leaves++;
+			s->totdepth += iter.depth;
+			if (iter.depth > s->maxdepth)
+				s->maxdepth = iter.depth;
+
+			hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
+				++s->prefixes;	/* one per alias on the leaf */
+		} else {
+			s->tnodes++;
+			if (n->bits < MAX_STAT_DEPTH)	/* larger sizes are not histogrammed */
+				s->nodesizes[n->bits]++;
+			s->nullpointers += tn_info(n)->empty_children;
+		}
+	}
+	rcu_read_unlock();
+}
+
+/*
+ *	Print one table's trie statistics for /proc/net/fib_triestat
+ */
+static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
+{
+	unsigned int i, max, pointers, bytes, avdepth;
+
+	if (stat->leaves)	/* mean leaf depth scaled by 100: two decimals */
+		avdepth = stat->totdepth*100 / stat->leaves;
+	else
+		avdepth = 0;
+
+	seq_printf(seq, "\tAver depth:     %u.%02u\n",
+		   avdepth / 100, avdepth % 100);
+	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
+
+	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
+	bytes = LEAF_SIZE * stat->leaves;
+
+	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
+	bytes += sizeof(struct fib_alias) * stat->prefixes;
+
+	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
+	bytes += TNODE_SIZE(0) * stat->tnodes;
+
+	max = MAX_STAT_DEPTH;	/* then skip the unused top of the histogram */
+	while (max > 0 && stat->nodesizes[max-1] == 0)
+		max--;
+
+	pointers = 0;
+	for (i = 1; i < max; i++)
+		if (stat->nodesizes[i] != 0) {
+			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
+			pointers += (1<<i) * stat->nodesizes[i];	/* 2^i slots each */
+		}
+	seq_putc(seq, '\n');
+	seq_printf(seq, "\tPointers: %u\n", pointers);
+
+	bytes += sizeof(struct key_vector *) * pointers;
+	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
+	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
+}
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+static void trie_show_usage(struct seq_file *seq,
+			    const struct trie_use_stats __percpu *stats)
+{
+	struct trie_use_stats total = { 0 };
+	int cpu;
+
+	/* sum the per-cpu counters over every possible CPU */
+	for_each_possible_cpu(cpu) {
+		const struct trie_use_stats *one = per_cpu_ptr(stats, cpu);
+
+		total.gets += one->gets;
+		total.backtrack += one->backtrack;
+		total.semantic_match_passed += one->semantic_match_passed;
+		total.semantic_match_miss += one->semantic_match_miss;
+		total.null_node_hit += one->null_node_hit;
+		total.resize_node_skipped += one->resize_node_skipped;
+	}
+
+	seq_puts(seq, "\nCounters:\n---------\n");
+	seq_printf(seq, "gets = %u\n", total.gets);
+	seq_printf(seq, "backtracks = %u\n", total.backtrack);
+	seq_printf(seq, "semantic match passed = %u\n",
+		   total.semantic_match_passed);
+	seq_printf(seq, "semantic match miss = %u\n", total.semantic_match_miss);
+	seq_printf(seq, "null node hit= %u\n", total.null_node_hit);
+	seq_printf(seq, "skipped node resize = %u\n\n", total.resize_node_skipped);
+}
+#endif /*  CONFIG_IP_FIB_TRIE_STATS */
+
+/* Print the heading that names @tb: "Local:", "Main:" or its numeric id */
+static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
+{
+	if (tb->tb_id == RT_TABLE_LOCAL)
+		seq_puts(seq, "Local:\n");
+	else if (tb->tb_id == RT_TABLE_MAIN)
+		seq_puts(seq, "Main:\n");
+	else
+		seq_printf(seq, "Id %u:\n", tb->tb_id);	/* tb_id is a u32 */
+}
+
+static int fib_triestat_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq->private;
+	unsigned int h;
+
+	seq_printf(seq,
+		   "Basic info: size of leaf:"
+		   " %zd bytes, size of tnode: %zd bytes.\n",
+		   LEAF_SIZE, TNODE_SIZE(0));
+
+	rcu_read_lock();
+	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+		struct fib_table *tb;
+
+		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+			struct trie *t = (struct trie *) tb->tb_data;
+			struct trie_stat stat;
+
+			if (!t)
+				continue;
+
+			fib_table_print(seq, tb);	/* heading, then the numbers */
+
+			trie_collect_stats(t, &stat);
+			trie_show_stats(seq, &stat);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			trie_show_usage(seq, t->stats);
+#endif
+		}
+		cond_resched_rcu();	/* called once per hash bucket */
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct fib_trie_iter *iter = seq->private;
+	struct net *net = seq_file_net(seq);
+	loff_t idx = 0;	/* running node count across all tables */
+	unsigned int h;
+
+	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+		struct fib_table *tb;
+
+		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+			struct key_vector *n;
+
+			for (n = fib_trie_get_first(iter,
+						    (struct trie *) tb->tb_data);
+			     n; n = fib_trie_get_next(iter))
+				if (pos == idx++) {	/* reached the pos-th node */
+					iter->tb = tb;
+					return n;
+				}
+		}
+	}
+
+	return NULL;	/* fewer than pos + 1 nodes exist */
+}
+
+static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	rcu_read_lock();	/* dropped again in fib_trie_seq_stop() */
+	return fib_trie_get_idx(seq, *pos);
+}
+
+static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct fib_trie_iter *iter = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct fib_table *tb = iter->tb;
+	struct hlist_node *tb_node;
+	unsigned int h;
+	struct key_vector *n;
+
+	++*pos;
+	/* next node in same table */
+	n = fib_trie_get_next(iter);
+	if (n)
+		return n;
+
+	/* walk rest of this hash chain */
+	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);	/* bucket index of the current table */
+	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
+		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
+		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
+		if (n)
+			goto found;
+	}
+
+	/* new hash chain */
+	while (++h < FIB_TABLE_HASHSZ) {
+		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
+			if (n)
+				goto found;
+		}
+	}
+	return NULL;	/* no table left with a node in it */
+
+found:
+	iter->tb = tb;	/* remember which table the returned node is from */
+	return n;
+}
+
+static void fib_trie_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	rcu_read_unlock();	/* pairs with rcu_read_lock() in fib_trie_seq_start() */
+}
+
+static void seq_indent(struct seq_file *seq, int n)
+{
+	for (; n > 0; n--)	/* three spaces per level; nothing for n <= 0 */
+		seq_puts(seq, "   ");
+}
+
+static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
+{
+	switch (s) {	/* named scopes return a literal and leave @buf untouched */
+	case RT_SCOPE_UNIVERSE: return "universe";
+	case RT_SCOPE_SITE:	return "site";
+	case RT_SCOPE_LINK:	return "link";
+	case RT_SCOPE_HOST:	return "host";
+	case RT_SCOPE_NOWHERE:	return "nowhere";
+	default:
+		snprintf(buf, len, "scope=%d", s);	/* any other value: numeric form */
+		return buf;
+	}
+}
+
+static const char *const rtn_type_names[__RTN_MAX] = {	/* indexed by RTN_* value */
+	[RTN_UNSPEC] = "UNSPEC",
+	[RTN_UNICAST] = "UNICAST",
+	[RTN_LOCAL] = "LOCAL",
+	[RTN_BROADCAST] = "BROADCAST",
+	[RTN_ANYCAST] = "ANYCAST",
+	[RTN_MULTICAST] = "MULTICAST",
+	[RTN_BLACKHOLE] = "BLACKHOLE",
+	[RTN_UNREACHABLE] = "UNREACHABLE",
+	[RTN_PROHIBIT] = "PROHIBIT",
+	[RTN_THROW] = "THROW",
+	[RTN_NAT] = "NAT",
+	[RTN_XRESOLVE] = "XRESOLVE",
+};	/* slots not listed above stay NULL; rtn_type() checks for that */
+
+static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
+{
+	const char *known = t < __RTN_MAX ? rtn_type_names[t] : NULL;
+
+	if (!known)	/* no table entry: format the number into @buf */
+		snprintf(buf, len, "type %u", t);
+	return known ? known : buf;
+}
+
+/* Pretty print the trie: one line per tnode or leaf, plus one per leaf alias */
+static int fib_trie_seq_show(struct seq_file *seq, void *v)
+{
+	const struct fib_trie_iter *iter = seq->private;
+	struct key_vector *n = v;
+
+	if (IS_TRIE(node_parent_rcu(n)))	/* topmost node: table heading first */
+		fib_table_print(seq, iter->tb);
+
+	if (IS_TNODE(n)) {
+		__be32 prf = htonl(n->key);
+
+		seq_indent(seq, iter->depth-1);
+		seq_printf(seq, "  +-- %pI4/%zu %u %u %u\n",
+			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
+			   tn_info(n)->full_children,
+			   tn_info(n)->empty_children);
+	} else {
+		__be32 val = htonl(n->key);
+		struct fib_alias *fa;
+
+		seq_indent(seq, iter->depth);
+		seq_printf(seq, "  |-- %pI4\n", &val);
+
+		hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+			char buf1[32], buf2[32];	/* scratch for unnamed scope/type */
+
+			seq_indent(seq, iter->depth + 1);
+			seq_printf(seq, "  /%zu %s %s",
+				   KEYLENGTH - fa->fa_slen,
+				   rtn_scope(buf1, sizeof(buf1),
+					     fa->fa_info->fib_scope),
+				   rtn_type(buf2, sizeof(buf2),
+					    fa->fa_type));
+			if (fa->fa_dscp)	/* tos is printed only when non-zero */
+				seq_printf(seq, " tos=%d",
+					   inet_dscp_to_dsfield(fa->fa_dscp));
+			seq_putc(seq, '\n');
+		}
+	}
+
+	return 0;
+}
+
+static const struct seq_operations fib_trie_seq_ops = {	/* for the "fib_trie" proc file */
+	.start  = fib_trie_seq_start,
+	.next   = fib_trie_seq_next,
+	.stop   = fib_trie_seq_stop,
+	.show   = fib_trie_seq_show,
+};
+
+struct fib_route_iter {	/* seq_file private data of the "route" proc file */
+	struct seq_net_private p;
+	struct fib_table *main_tb;	/* RT_TABLE_MAIN, looked up in ->start */
+	struct key_vector *tnode;	/* cursor handed to leaf_walk_rcu() */
+	loff_t	pos;			/* cached position; 0 means no valid cache */
+	t_key	key;			/* key of the leaf returned last */
+};
+
+static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
+					    loff_t pos)
+{
+	struct key_vector *l, **tp = &iter->tnode;
+	t_key key;
+
+	/* use cached location of previously found key */
+	if (iter->pos > 0 && pos >= iter->pos) {
+		key = iter->key;
+	} else {
+		iter->pos = 1;	/* no usable cache: start over from key 0 */
+		key = 0;
+	}
+
+	pos -= iter->pos;	/* number of leaves still to be skipped */
+
+	while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
+		key = l->key + 1;
+		iter->pos++;
+		l = NULL;	/* skipped leaf: must not be returned if we break */
+
+		/* handle unlikely case of a key wrap */
+		if (!key)
+			break;
+	}
+
+	if (l)
+		iter->key = l->key;	/* remember it */
+	else
+		iter->pos = 0;		/* forget it */
+
+	return l;
+}
+
+static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	struct fib_route_iter *iter = seq->private;
+	struct fib_table *tb;
+	struct trie *t;
+
+	rcu_read_lock();	/* released in fib_route_seq_stop() */
+
+	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
+	if (!tb)
+		return NULL;
+
+	iter->main_tb = tb;
+	t = (struct trie *)tb->tb_data;
+	iter->tnode = t->kv;
+
+	if (*pos != 0)	/* continuing a read: look the position up */
+		return fib_route_get_idx(iter, *pos);
+
+	iter->pos = 0;
+	iter->key = KEY_MAX;	/* fib_route_seq_next() adds 1 to get its start key */
+
+	return SEQ_START_TOKEN;	/* makes ->show print the header line */
+}
+
+static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct fib_route_iter *iter = seq->private;
+	struct key_vector *l = NULL;
+	t_key key = iter->key + 1;	/* continue after the last returned leaf */
+
+	++*pos;
+
+	/* only allow key of 0 for start of sequence */
+	if ((v == SEQ_START_TOKEN) || key)
+		l = leaf_walk_rcu(&iter->tnode, key);
+
+	if (l) {
+		iter->key = l->key;
+		iter->pos++;
+	} else {
+		iter->pos = 0;	/* end of walk: drop the cached position */
+	}
+
+	return l;
+}
+
+static void fib_route_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	rcu_read_unlock();	/* pairs with rcu_read_lock() in fib_route_seq_start() */
+}
+
+/* Build the RTF_* flag word shown in the route proc file:
+ * RTF_UP always, RTF_REJECT for RTN_UNREACHABLE/RTN_PROHIBIT, RTF_GATEWAY
+ * when nexthop 0 of @fi has a non-zero nhc_gw.ipv4, and RTF_HOST for an
+ * all-ones @mask.  @fi may be NULL.
+ */
+static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
+{
+	unsigned int flags = RTF_UP;
+
+	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
+		flags |= RTF_REJECT;
+	if (fi && fib_info_nhc(fi, 0)->nhc_gw.ipv4)
+		flags |= RTF_GATEWAY;
+	if (mask == htonl(0xFFFFFFFF))
+		flags |= RTF_HOST;
+	return flags;
+}
+
+/*
+ *	This outputs /proc/net/route.
+ *	The format of the file is not supposed to be changed
+ *	and needs to be same as fib_hash output to avoid breaking
+ *	legacy utilities
+ */
+static int fib_route_seq_show(struct seq_file *seq, void *v)
+{
+	struct fib_route_iter *iter = seq->private;
+	struct fib_table *tb = iter->main_tb;
+	struct fib_alias *fa;
+	struct key_vector *l = v;
+	__be32 prefix;
+
+	if (v == SEQ_START_TOKEN) {	/* first record: column headings only */
+		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
+			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
+			   "\tWindow\tIRTT");
+		return 0;
+	}
+
+	prefix = htonl(l->key);
+
+	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+		__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
+		unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
+
+		if ((fa->fa_type == RTN_BROADCAST) ||
+		    (fa->fa_type == RTN_MULTICAST))
+			continue;	/* these two route types are never listed */
+
+		if (fa->tb_id != tb->tb_id)
+			continue;	/* alias belongs to a different table */
+
+		seq_setwidth(seq, 127);	/* same width as the heading line */
+
+		if (fi) {
+			struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+			__be32 gw = 0;
+
+			if (nhc->nhc_gw_family == AF_INET)	/* else the column stays 0 */
+				gw = nhc->nhc_gw.ipv4;
+
+			seq_printf(seq,
+				   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
+				   "%d\t%08X\t%d\t%u\t%u",
+				   nhc->nhc_dev ? nhc->nhc_dev->name : "*",
+				   prefix, gw, flags, 0, 0,
+				   fi->fib_priority,
+				   mask,
+				   (fi->fib_advmss ?
+				    fi->fib_advmss + 40 : 0),
+				   fi->fib_window,
+				   fi->fib_rtt >> 3);
+		} else {	/* no fib_info: "*" device and zeroed metrics */
+			seq_printf(seq,
+				   "*\t%08X\t%08X\t%04X\t%d\t%u\t"
+				   "%d\t%08X\t%d\t%u\t%u",
+				   prefix, 0, flags, 0, 0, 0,
+				   mask, 0, 0, 0);
+		}
+		seq_pad(seq, '\n');
+	}
+
+	return 0;
+}
+
+static const struct seq_operations fib_route_seq_ops = {	/* for the "route" proc file */
+	.start  = fib_route_seq_start,
+	.next   = fib_route_seq_next,
+	.stop   = fib_route_seq_stop,
+	.show   = fib_route_seq_show,
+};
+
+/* Create the fib_trie, fib_triestat and route proc files; 0 or -ENOMEM. */
+int __net_init fib_proc_init(struct net *net)
+{
+	if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops,
+			sizeof(struct fib_trie_iter)))
+		return -ENOMEM;
+
+	if (!proc_create_net_single("fib_triestat", 0444, net->proc_net,
+			fib_triestat_seq_show, NULL))
+		goto undo_trie;
+
+	if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops,
+			sizeof(struct fib_route_iter)))
+		goto undo_triestat;
+
+	return 0;
+
+undo_triestat:
+	remove_proc_entry("fib_triestat", net->proc_net);
+undo_trie:
+	remove_proc_entry("fib_trie", net->proc_net);
+	return -ENOMEM;
+}
+
+void __net_exit fib_proc_exit(struct net *net)
+{
+	remove_proc_entry("fib_trie", net->proc_net);	/* the three files made by fib_proc_init() */
+	remove_proc_entry("fib_triestat", net->proc_net);
+	remove_proc_entry("route", net->proc_net);
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c
new file mode 100644
index 0000000000..3760a14b6b
--- /dev/null
+++ b/net/ipv4/fou_bpf.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Fou Helpers for TC-BPF hook
+ *
+ * These are called from SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+
+#include <net/dst_metadata.h>
+#include <net/fou.h>
+
+struct bpf_fou_encap {	/* UDP ports exchanged with the two kfuncs below */
+	__be16 sport;	/* source port; 0 on set asks the kernel to auto-assign */
+	__be16 dport;	/* destination port */
+};
+
+enum bpf_fou_encap_type {	/* @type values of bpf_skb_set_fou_encap() */
+	FOU_BPF_ENCAP_FOU,	/* stored as TUNNEL_ENCAP_FOU */
+	FOU_BPF_ENCAP_GUE,	/* stored as TUNNEL_ENCAP_GUE */
+};
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in BTF");
+
+/* bpf_skb_set_fou_encap - Set FOU encap parameters
+ *
+ * This function allows for using GUE or FOU encapsulation together with an
+ * ipip device in collect-metadata mode.
+ *
+ * It is meant to be used in BPF tc-hooks and after a call to the
+ * bpf_skb_set_tunnel_key helper, responsible for setting IP addresses.
+ *
+ * Parameters:
+ * @skb_ctx	Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
+ * @encap	Pointer to a `struct bpf_fou_encap` storing UDP src and
+ * 		dst ports. If sport is set to 0 the kernel will auto-assign a
+ * 		port. This is similar to using `encap-sport auto`.
+ * 		Cannot be NULL
+ * @type	Encapsulation type for the packet. Their definitions are
+ * 		specified in `enum bpf_fou_encap_type`
+ */
+__bpf_kfunc int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
+				      struct bpf_fou_encap *encap, int type)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+	if (unlikely(!encap))
+		return -EINVAL;
+
+	if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX)))
+		return -EINVAL;	/* skb carries no TX tunnel metadata */
+
+	switch (type) {
+	case FOU_BPF_ENCAP_FOU:
+		info->encap.type = TUNNEL_ENCAP_FOU;
+		break;
+	case FOU_BPF_ENCAP_GUE:
+		info->encap.type = TUNNEL_ENCAP_GUE;
+		break;
+	default:	/* any other @type selects no encapsulation; still returns 0 */
+		info->encap.type = TUNNEL_ENCAP_NONE;
+	}
+
+	if (info->key.tun_flags & TUNNEL_CSUM)	/* flag is only ever set here, never cleared */
+		info->encap.flags |= TUNNEL_ENCAP_FLAG_CSUM;
+
+	info->encap.sport = encap->sport;
+	info->encap.dport = encap->dport;
+
+	return 0;
+}
+
+/* bpf_skb_get_fou_encap - Get FOU encap parameters
+ *
+ * This function allows for reading encap metadata from a packet received
+ * on an ipip device in collect-metadata mode.
+ *
+ * Parameters:
+ * @skb_ctx	Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
+ * @encap	Pointer to a struct bpf_fou_encap storing UDP source and
+ * 		destination port. Cannot be NULL
+ */
+__bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
+				      struct bpf_fou_encap *encap)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+	if (unlikely(!info || !encap))	/* same @encap guard as the setter */
+		return -EINVAL;	/* no tunnel metadata, or nowhere to store the ports */
+
+	encap->sport = info->encap.sport;
+	encap->dport = info->encap.dport;
+
+	return 0;
+}
+
+__diag_pop()
+
+BTF_SET8_START(fou_kfunc_set)	/* id set listing the two kfuncs defined in this file */
+BTF_ID_FLAGS(func, bpf_skb_set_fou_encap)
+BTF_ID_FLAGS(func, bpf_skb_get_fou_encap)
+BTF_SET8_END(fou_kfunc_set)
+
+static const struct btf_kfunc_id_set fou_bpf_kfunc_set = {	/* passed to register_btf_kfunc_id_set() */
+	.owner = THIS_MODULE,
+	.set   = &fou_kfunc_set,
+};
+
+int register_fou_bpf(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,	/* this program type only */
+					 &fou_bpf_kfunc_set);	/* result returned unchanged */
+}
diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c
new file mode 100644
index 0000000000..0c41076e31
--- /dev/null
+++ b/net/ipv4/fou_core.c
@@ -0,0 +1,1266 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/genetlink.h>
+#include <net/gro.h>
+#include <net/gue.h>
+#include <net/fou.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <uapi/linux/fou.h>
+#include <uapi/linux/genetlink.h>
+
+#include "fou_nl.h"
+
+struct fou {	/* per-socket state; reached through sk->sk_user_data */
+	struct socket *sock;
+	u8 protocol;	/* fou_udp_recv() returns this value negated */
+	u8 flags;	/* FOU_F_* bits */
+	__be16 port;
+	u8 family;	/* AF_INET: fix up the IPv4 header on receive, else IPv6 */
+	u16 type;
+	struct list_head list;
+	struct rcu_head rcu;
+};
+
+#define FOU_F_REMCSUM_NOPARTIAL BIT(0)
+
+struct fou_cfg {	/* NOTE(review): presumably the requested settings for a struct fou - confirm */
+	u16 type;
+	u8 protocol;
+	u8 flags;
+	struct udp_port_cfg udp_config;
+};
+
+static unsigned int fou_net_id;
+
+struct fou_net {	/* NOTE(review): looks like per-netns state found via fou_net_id - confirm */
+	struct list_head fou_list;
+	struct mutex fou_lock;	/* presumably guards fou_list; verify against its users */
+};
+
+static inline struct fou *fou_from_sock(struct sock *sk)
+{
+	return sk->sk_user_data;	/* may be NULL; the udp_recv handlers test for it */
+}
+
+static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len)
+{
+	/* Remove 'len' bytes from the packet (UDP header and
+	 * FOU header if present).
+	 */
+	if (fou->family == AF_INET)	/* shrink the outer IP length by len first */
+		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+	else
+		ipv6_hdr(skb)->payload_len =
+		    htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
+
+	__skb_pull(skb, len);
+	skb_postpull_rcsum(skb, udp_hdr(skb), len);
+	skb_reset_transport_header(skb);
+	return iptunnel_pull_offloads(skb);	/* callers drop the skb on non-zero */
+}
+
+/* Plain FOU receive: pull the UDP header and return -fou->protocol;
+ * returns 1 without fou state, 0 after freeing an skb the pull rejected.
+ */
+static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct fou *state = fou_from_sock(sk);
+
+	if (!state)
+		return 1;
+
+	if (fou_recv_pull(skb, state, sizeof(struct udphdr))) {
+		kfree_skb(skb);
+		return 0;
+	}
+	return -state->protocol;
+}
+
+static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
+				  void *data, size_t hdrlen, u8 ipproto,
+				  bool nopartial)	/* NOTE(review): ipproto is unused here */
+{
+	__be16 *pd = data;	/* two big-endian 16-bit fields: start, then offset */
+	size_t start = ntohs(pd[0]);
+	size_t offset = ntohs(pd[1]);
+	size_t plen = sizeof(struct udphdr) + hdrlen +
+	    max_t(size_t, offset + sizeof(u16), start);
+
+	if (skb->remcsum_offload)	/* flag already set: header returned as is */
+		return guehdr;
+
+	if (!pskb_may_pull(skb, plen))
+		return NULL;	/* the caller drops the packet */
+	guehdr = (struct guehdr *)&udp_hdr(skb)[1];	/* recompute after the pull */
+
+	skb_remcsum_process(skb, (void *)guehdr + hdrlen,
+			    start, offset, nopartial);
+
+	return guehdr;
+}
+
+static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr)
+{
+	/* No support yet: the message is freed and 0 is returned */
+	kfree_skb(skb);
+	return 0;
+}
+
+static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct fou *fou = fou_from_sock(sk);
+	size_t len, optlen, hdrlen;
+	struct guehdr *guehdr;
+	void *data;
+	u16 doffset = 0;
+	u8 proto_ctype;
+
+	if (!fou)
+		return 1;	/* no fou state on this socket; skb is not freed here */
+
+	len = sizeof(struct udphdr) + sizeof(struct guehdr);
+	if (!pskb_may_pull(skb, len))
+		goto drop;
+
+	guehdr = (struct guehdr *)&udp_hdr(skb)[1];	/* GUE header follows the UDP header */
+
+	switch (guehdr->version) {
+	case 0: /* Full GUE header present */
+		break;
+
+	case 1: {
+		/* Direct encapsulation of IPv4 or IPv6 */
+
+		int prot;
+
+		switch (((struct iphdr *)guehdr)->version) {
+		case 4:
+			prot = IPPROTO_IPIP;
+			break;
+		case 6:
+			prot = IPPROTO_IPV6;
+			break;
+		default:
+			goto drop;
+		}
+
+		if (fou_recv_pull(skb, fou, sizeof(struct udphdr)))
+			goto drop;
+
+		return -prot;	/* negated inner protocol, as on the version 0 path */
+	}
+
+	default: /* Undefined version */
+		goto drop;
+	}
+
+	optlen = guehdr->hlen << 2;	/* hlen is scaled by four to get bytes */
+	len += optlen;
+
+	if (!pskb_may_pull(skb, len))
+		goto drop;
+
+	/* guehdr may change after pull */
+	guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+	if (validate_gue_flags(guehdr, optlen))
+		goto drop;
+
+	hdrlen = sizeof(struct guehdr) + optlen;
+
+	if (fou->family == AF_INET)	/* outer length shrinks by UDP + GUE headers */
+		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+	else
+		ipv6_hdr(skb)->payload_len =
+		    htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
+
+	/* Pull csum through the guehdr now. This can be used if
+	 * there is a remote checksum offload.
+	 */
+	skb_postpull_rcsum(skb, udp_hdr(skb), len);
+
+	data = &guehdr[1];	/* first byte after the fixed GUE header */
+
+	if (guehdr->flags & GUE_FLAG_PRIV) {
+		__be32 flags = *(__be32 *)(data + doffset);	/* private flags word */
+
+		doffset += GUE_LEN_PRIV;
+
+		if (flags & GUE_PFLAG_REMCSUM) {
+			guehdr = gue_remcsum(skb, guehdr, data + doffset,
+					     hdrlen, guehdr->proto_ctype,
+					     !!(fou->flags &
+						FOU_F_REMCSUM_NOPARTIAL));
+			if (!guehdr)
+				goto drop;
+
+			data = &guehdr[1];	/* gue_remcsum() may hand back a new header */
+
+			doffset += GUE_PLEN_REMCSUM;
+		}
+	}
+
+	if (unlikely(guehdr->control))
+		return gue_control_message(skb, guehdr);
+
+	proto_ctype = guehdr->proto_ctype;	/* saved before the header is pulled */
+	__skb_pull(skb, sizeof(struct udphdr) + hdrlen);
+	skb_reset_transport_header(skb);
+
+	if (iptunnel_pull_offloads(skb))
+		goto drop;
+
+	return -proto_ctype;
+
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+/* GRO receive for plain FOU: pass @skb on to the inner protocol's offload.
+ * Returns NULL when the socket has no fou state or no handler is registered.
+ */
+static struct sk_buff *fou_gro_receive(struct sock *sk,
+				       struct list_head *head,
+				       struct sk_buff *skb)
+{
+	const struct net_offload __rcu **offloads;
+	struct fou *fou = fou_from_sock(sk);
+	const struct net_offload *ops;
+
+	if (!fou)
+		return NULL;
+
+	/* We can clear the encap_mark for FOU as we are either adding an L4
+	 * tunnel header to the outer L3 tunnel header, or treating the GRE
+	 * tunnel header as a UDP protocol specific header (VXLAN, GENEVE).
+	 */
+	NAPI_GRO_CB(skb)->encap_mark = 0;
+
+	/* Flag this frame as already having an outer encap header */
+	NAPI_GRO_CB(skb)->is_fou = 1;
+
+	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+	ops = rcu_dereference(offloads[fou->protocol]);
+	if (!ops || !ops->callbacks.gro_receive)
+		return NULL;
+
+	return call_gro_receive(ops->callbacks.gro_receive, head, skb);
+}
+
+/* GRO complete for plain FOU; -ENOENT if the socket's fou state is gone. */
+static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
+{
+	const struct net_offload __rcu **offloads;
+	struct fou *fou = fou_from_sock(sk);
+	const struct net_offload *ops;
+	int err;
+
+	if (!fou)
+		return -ENOENT;
+
+	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+	ops = rcu_dereference(offloads[fou->protocol]);
+	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+		return -ENOSYS;
+
+	err = ops->callbacks.gro_complete(skb, nhoff);
+	skb_set_inner_mac_header(skb, nhoff);
+	return err;
+}
+
+static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
+				      struct guehdr *guehdr, void *data,
+				      size_t hdrlen, struct gro_remcsum *grc,
+				      bool nopartial)
+{
+	__be16 *pd = data;	/* two big-endian 16-bit fields: start, then offset */
+	size_t start = ntohs(pd[0]);
+	size_t offset = ntohs(pd[1]);
+
+	if (skb->remcsum_offload)	/* flag already set: header returned as is */
+		return guehdr;
+
+	if (!NAPI_GRO_CB(skb)->csum_valid)
+		return NULL;	/* not attempted without a valid GRO checksum */
+
+	guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen,
+					 start, offset, grc, nopartial);
+
+	skb->remcsum_offload = 1;	/* makes a later call return early above */
+
+	return guehdr;
+}
+
+/* GRO receive hook for a GUE socket: check the GUE header, clear same_flow on
+ * held packets whose GUE header differs, then call the inner GRO handler.
+ */
+static struct sk_buff *gue_gro_receive(struct sock *sk, struct list_head *head,
+				       struct sk_buff *skb)
+{
+	const struct net_offload __rcu **offloads;
+	const struct net_offload *ops;
+	struct sk_buff *pp = NULL, *p;
+	struct guehdr *guehdr;
+	size_t len, optlen, hdrlen, off;
+	void *data;
+	u16 doffset = 0;
+	int flush = 1;
+	struct fou *fou = fou_from_sock(sk);
+	struct gro_remcsum grc;
+	u8 proto;
+
+	skb_gro_remcsum_init(&grc);
+
+	if (!fou)	/* no FOU state, e.g. socket being released */
+		goto out;
+
+	off = skb_gro_offset(skb);
+	len = off + sizeof(*guehdr);
+
+	guehdr = skb_gro_header(skb, len, off);
+	if (unlikely(!guehdr))
+		goto out;
+
+	switch (guehdr->version) {
+	case 0:
+		break;
+	case 1:
+		switch (((struct iphdr *)guehdr)->version) {
+		case 4:
+			proto = IPPROTO_IPIP;
+			break;
+		case 6:
+			proto = IPPROTO_IPV6;
+			break;
+		default:
+			goto out;
+		}
+		goto next_proto;
+	default:
+		goto out;
+	}
+
+	optlen = guehdr->hlen << 2;
+	len += optlen;
+
+	if (skb_gro_header_hard(skb, len)) {
+		guehdr = skb_gro_header_slow(skb, len, off);
+		if (unlikely(!guehdr))
+			goto out;
+	}
+
+	if (unlikely(guehdr->control) || guehdr->version != 0 ||
+	    validate_gue_flags(guehdr, optlen))
+		goto out;
+
+	hdrlen = sizeof(*guehdr) + optlen;
+
+	/* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr,
+	 * this is needed if there is a remote checksum offload.
+	 */
+	skb_gro_postpull_rcsum(skb, guehdr, hdrlen);
+
+	data = &guehdr[1];
+
+	if (guehdr->flags & GUE_FLAG_PRIV) {
+		__be32 flags = *(__be32 *)(data + doffset);
+
+		doffset += GUE_LEN_PRIV;
+
+		if (flags & GUE_PFLAG_REMCSUM) {
+			guehdr = gue_gro_remcsum(skb, off, guehdr,
+						 data + doffset, hdrlen, &grc,
+						 !!(fou->flags &
+						    FOU_F_REMCSUM_NOPARTIAL));
+
+			if (!guehdr)
+				goto out;
+			data = &guehdr[1];
+			doffset += GUE_PLEN_REMCSUM;
+		}
+	}
+
+	skb_gro_pull(skb, hdrlen);
+
+	list_for_each_entry(p, head, list) {
+		const struct guehdr *guehdr2;
+
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		guehdr2 = (struct guehdr *)(p->data + off);
+
+		/* Compare base GUE header to be equal (covers
+		 * hlen, version, proto_ctype, and flags).
+		 */
+		if (guehdr->word != guehdr2->word) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		/* Compare optional fields are the same. */
+		if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1],
+					   guehdr->hlen << 2)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+	}
+
+	proto = guehdr->proto_ctype;
+
+next_proto:
+	/* GUE either adds an L4 tunnel header to the outer L3 tunnel header or
+	 * treats the GRE tunnel header like a UDP protocol specific header such
+	 * as VXLAN or GENEVE, so the encap_mark can be cleared.
+	 */
+	NAPI_GRO_CB(skb)->encap_mark = 0;
+
+	/* Flag this frame as already having an outer encap header */
+	NAPI_GRO_CB(skb)->is_fou = 1;
+
+	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+	ops = rcu_dereference(offloads[proto]);
+	/* proto comes from the packet: an unsupported value must not WARN */
+	if (!ops || !ops->callbacks.gro_receive)
+		goto out;
+
+	pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
+	flush = 0;
+
+out:
+	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
+
+	return pp;
+}
+
+/* GRO complete hook for GUE: determine the inner protocol and the length of
+ * the GUE header at @nhoff, then let that protocol's gro_complete finish the
+ * packet.  Returns -ENOENT for headers or protocols that cannot be handled.
+ */
+static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
+{
+	struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
+	const struct net_offload __rcu **offloads;
+	const struct net_offload *ops;
+	unsigned int guehlen = 0;
+	int err;
+	u8 proto;
+
+	if (guehdr->version == 0) {
+		proto = guehdr->proto_ctype;
+		guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
+	} else if (guehdr->version == 1) {
+		/* Direct IPv4/IPv6 encapsulation: go by the IP version field */
+		switch (((struct iphdr *)guehdr)->version) {
+		case 4:
+			proto = IPPROTO_IPIP;
+			break;
+		case 6:
+			proto = IPPROTO_IPV6;
+			break;
+		default:
+			return -ENOENT;
+		}
+	} else {
+		return -ENOENT;
+	}
+
+	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+	ops = rcu_dereference(offloads[proto]);
+	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+		return -ENOENT;
+
+	err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
+	skb_set_inner_mac_header(skb, nhoff + guehlen);
+
+	return err;
+}
+
+/* Tell whether the existing port @fou matches the configuration in @cfg:
+ * same family, local and peer port, bound device and local and peer address.
+ * Returns true on a match.
+ */
+static bool fou_cfg_cmp(struct fou *fou, struct fou_cfg *cfg)
+{
+	struct udp_port_cfg *udp_cfg = &cfg->udp_config;
+	struct sock *sk = fou->sock->sk;
+
+	/* Everything except the addresses is family independent */
+	if (fou->family != udp_cfg->family ||
+	    fou->port != udp_cfg->local_udp_port ||
+	    sk->sk_dport != udp_cfg->peer_udp_port ||
+	    sk->sk_bound_dev_if != udp_cfg->bind_ifindex)
+		return false;
+
+	if (fou->family == AF_INET)
+		return sk->sk_rcv_saddr == udp_cfg->local_ip.s_addr &&
+		       sk->sk_daddr == udp_cfg->peer_ip.s_addr;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	/* A non-zero ipv6_addr_cmp() result means the addresses differ */
+	return !ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, &udp_cfg->local_ip6) &&
+	       !ipv6_addr_cmp(&sk->sk_v6_daddr, &udp_cfg->peer_ip6);
+#else
+	/* Without IPv6 support only AF_INET ports can match */
+	return false;
+#endif
+}
+
+/* Add @fou to the netns port list; -EALREADY if @cfg matches a listed port. */
+static int fou_add_to_port_list(struct net *net, struct fou *fou,
+				struct fou_cfg *cfg)
+{
+	struct fou_net *fn = net_generic(net, fou_net_id);
+	int err = -EALREADY;
+	struct fou *cur;
+
+	mutex_lock(&fn->fou_lock);
+	list_for_each_entry(cur, &fn->fou_list, list) {
+		if (fou_cfg_cmp(cur, cfg))
+			goto unlock;
+	}
+	list_add(&fou->list, &fn->fou_list);
+	err = 0;
+unlock:
+	mutex_unlock(&fn->fou_lock);
+	return err;
+}
+
+/* Unlink @fou, release its tunnel socket and free it with kfree_rcu().  The
+ * callers in this file hold the netns fou_lock around the call.
+ */
+static void fou_release(struct fou *fou)
+{
+	list_del(&fou->list);
+	udp_tunnel_sock_release(fou->sock);
+	kfree_rcu(fou, rcu);
+}
+
+/* Create the FOU/GUE receive port described by @cfg in @net and add it to the
+ * netns port list.  On success *@sockp (if non-NULL) receives the socket.
+ */
+static int fou_create(struct net *net, struct fou_cfg *cfg,
+		      struct socket **sockp)
+{
+	struct socket *sock = NULL;
+	struct fou *fou = NULL;
+	struct udp_tunnel_sock_cfg tunnel_cfg;
+	int err;
+
+	/* Open UDP socket */
+	err = udp_sock_create(net, &cfg->udp_config, &sock);
+	if (err < 0)
+		goto error;
+
+	/* Allocate FOU port structure */
+	fou = kzalloc(sizeof(*fou), GFP_KERNEL);
+	if (!fou) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	fou->port = cfg->udp_config.local_udp_port;
+	fou->family = cfg->udp_config.family;
+	fou->flags = cfg->flags;
+	fou->type = cfg->type;
+	fou->sock = sock;
+
+	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
+	tunnel_cfg.encap_type = 1;
+	tunnel_cfg.sk_user_data = fou;
+	tunnel_cfg.encap_destroy = NULL;
+
+	/* Pick the receive and GRO callbacks for the encapsulation type */
+	switch (cfg->type) {
+	case FOU_ENCAP_DIRECT:
+		tunnel_cfg.encap_rcv = fou_udp_recv;
+		tunnel_cfg.gro_receive = fou_gro_receive;
+		tunnel_cfg.gro_complete = fou_gro_complete;
+		fou->protocol = cfg->protocol;
+		break;
+	case FOU_ENCAP_GUE:
+		tunnel_cfg.encap_rcv = gue_udp_recv;
+		tunnel_cfg.gro_receive = gue_gro_receive;
+		tunnel_cfg.gro_complete = gue_gro_complete;
+		break;
+	default:
+		err = -EINVAL;
+		goto error;
+	}
+
+	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
+	sock->sk->sk_allocation = GFP_ATOMIC;
+
+	err = fou_add_to_port_list(net, fou, cfg);
+	if (err)
+		goto error;
+
+	if (sockp)
+		*sockp = sock;
+	return 0;
+
+error:
+	/* Release the socket before freeing fou, in the same order as
+	 * fou_release(): once set up, its sk_user_data refers to fou.
+	 */
+	if (sock)
+		udp_tunnel_sock_release(sock);
+	kfree(fou);
+	return err;
+}
+
+/* Release the first listed port that matches @cfg; -EINVAL if none does. */
+static int fou_destroy(struct net *net, struct fou_cfg *cfg)
+{
+	struct fou_net *fn = net_generic(net, fou_net_id);
+	struct fou *fou;
+	int err = -EINVAL;
+
+	mutex_lock(&fn->fou_lock);
+	list_for_each_entry(fou, &fn->fou_list, list) {
+		if (!fou_cfg_cmp(fou, cfg))
+			continue;
+		fou_release(fou);
+		err = 0;
+		break;	/* fou is unlinked now: stop walking the list */
+	}
+	mutex_unlock(&fn->fou_lock);
+	return err;
+}
+
+static struct genl_family fou_nl_family;
+
+/* Fill @cfg (zeroed first) from the netlink attributes in @info.  The family
+ * defaults to AF_INET.  Returns -EAFNOSUPPORT for an unknown family and
+ * -EINVAL for a peer address without peer port or an ifindex without local
+ * address.
+ */
+static int parse_nl_config(struct genl_info *info,
+			   struct fou_cfg *cfg)
+{
+	struct udp_port_cfg *udp = &cfg->udp_config;
+	bool has_local = false, has_peer = false;
+	struct nlattr **tb = info->attrs;
+
+	memset(cfg, 0, sizeof(*cfg));
+
+	udp->family = AF_INET;
+
+	if (tb[FOU_ATTR_AF]) {
+		u8 family = nla_get_u8(tb[FOU_ATTR_AF]);
+
+		switch (family) {
+		case AF_INET:
+			break;
+		case AF_INET6:
+			udp->ipv6_v6only = 1;
+			break;
+		default:
+			return -EAFNOSUPPORT;
+		}
+
+		udp->family = family;
+	}
+
+	if (tb[FOU_ATTR_PORT])
+		udp->local_udp_port = nla_get_be16(tb[FOU_ATTR_PORT]);
+
+	if (tb[FOU_ATTR_IPPROTO])
+		cfg->protocol = nla_get_u8(tb[FOU_ATTR_IPPROTO]);
+
+	if (tb[FOU_ATTR_TYPE])
+		cfg->type = nla_get_u8(tb[FOU_ATTR_TYPE]);
+
+	if (tb[FOU_ATTR_REMCSUM_NOPARTIAL])
+		cfg->flags |= FOU_F_REMCSUM_NOPARTIAL;
+
+	/* Addresses are only read from the attributes of the chosen family */
+	if (udp->family == AF_INET) {
+		if (tb[FOU_ATTR_LOCAL_V4]) {
+			udp->local_ip.s_addr =
+				nla_get_in_addr(tb[FOU_ATTR_LOCAL_V4]);
+			has_local = true;
+		}
+
+		if (tb[FOU_ATTR_PEER_V4]) {
+			udp->peer_ip.s_addr =
+				nla_get_in_addr(tb[FOU_ATTR_PEER_V4]);
+			has_peer = true;
+		}
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		if (tb[FOU_ATTR_LOCAL_V6]) {
+			udp->local_ip6 =
+				nla_get_in6_addr(tb[FOU_ATTR_LOCAL_V6]);
+			has_local = true;
+		}
+
+		if (tb[FOU_ATTR_PEER_V6]) {
+			udp->peer_ip6 = nla_get_in6_addr(tb[FOU_ATTR_PEER_V6]);
+			has_peer = true;
+		}
+#endif
+	}
+
+	/* A peer address is only accepted together with a peer port */
+	if (has_peer) {
+		if (!tb[FOU_ATTR_PEER_PORT])
+			return -EINVAL;
+
+		udp->peer_udp_port = nla_get_be16(tb[FOU_ATTR_PEER_PORT]);
+	}
+
+	/* Binding to a device is only accepted along with a local address */
+	if (tb[FOU_ATTR_IFINDEX]) {
+		if (!has_local)
+			return -EINVAL;
+
+		udp->bind_ifindex = nla_get_s32(tb[FOU_ATTR_IFINDEX]);
+	}
+
+	return 0;
+}
+
+/* FOU_CMD_ADD handler: parse the request and create the described port. */
+int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct fou_cfg cfg;
+	int err;
+
+	err = parse_nl_config(info, &cfg);
+	if (err)
+		return err;
+
+	return fou_create(genl_info_net(info), &cfg, NULL);
+}
+
+/* FOU_CMD_DEL handler: parse the request and remove the matching port. */
+int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct fou_cfg cfg;
+	int err;
+
+	err = parse_nl_config(info, &cfg);
+	if (err)
+		return err;
+
+	return fou_destroy(genl_info_net(info), &cfg);
+}
+
+/* Put the attributes describing @fou into @msg.  Returns 0 on success and
+ * -1 as soon as one attribute cannot be added.
+ */
+static int fou_fill_info(struct fou *fou, struct sk_buff *msg)
+{
+	struct sock *sk = fou->sock->sk;
+
+	if (nla_put_u8(msg, FOU_ATTR_AF, sk->sk_family) ||
+	    nla_put_be16(msg, FOU_ATTR_PORT, fou->port) ||
+	    nla_put_be16(msg, FOU_ATTR_PEER_PORT, sk->sk_dport) ||
+	    nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) ||
+	    nla_put_u8(msg, FOU_ATTR_TYPE, fou->type) ||
+	    nla_put_s32(msg, FOU_ATTR_IFINDEX, sk->sk_bound_dev_if))
+		return -1;
+
+	if ((fou->flags & FOU_F_REMCSUM_NOPARTIAL) &&
+	    nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL))
+		return -1;
+
+	/* Local and peer address are reported in the socket's own family */
+	if (sk->sk_family == AF_INET) {
+		if (nla_put_in_addr(msg, FOU_ATTR_LOCAL_V4, sk->sk_rcv_saddr) ||
+		    nla_put_in_addr(msg, FOU_ATTR_PEER_V4, sk->sk_daddr))
+			return -1;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		if (nla_put_in6_addr(msg, FOU_ATTR_LOCAL_V6,
+				     &sk->sk_v6_rcv_saddr) ||
+		    nla_put_in6_addr(msg, FOU_ATTR_PEER_V6, &sk->sk_v6_daddr))
+			return -1;
+#endif
+	}
+
+	return 0;
+}
+
+/* Append one genetlink message describing @fou to @skb.  Returns -ENOMEM if
+ * genlmsg_put() fails and -EMSGSIZE if the attributes cannot be added.
+ */
+static int fou_dump_info(struct fou *fou, u32 portid, u32 seq,
+			 u32 flags, struct sk_buff *skb, u8 cmd)
+{
+	void *hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd);
+
+	if (!hdr)
+		return -ENOMEM;
+
+	if (fou_fill_info(fou, skb) < 0) {
+		genlmsg_cancel(skb, hdr);
+		return -EMSGSIZE;
+	}
+
+	genlmsg_end(skb, hdr);
+	return 0;
+}
+
+/* FOU_CMD_GET request for a single port: reply with the listed port matching
+ * the request.  The request must carry a non-zero local port (-EINVAL);
+ * -ESRCH is returned when no listed port matches.
+ */
+int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct fou_net *fn = net_generic(genl_info_net(info), fou_net_id);
+	struct sk_buff *msg;
+	struct fou_cfg cfg;
+	struct fou *fout;
+	int ret;
+
+	ret = parse_nl_config(info, &cfg);
+	if (ret)
+		return ret;
+
+	if (cfg.udp_config.local_udp_port == 0)
+		return -EINVAL;
+
+	/* parse_nl_config() accepts no other family; kept as a safety net */
+	if (cfg.udp_config.family != AF_INET &&
+	    cfg.udp_config.family != AF_INET6)
+		return -EINVAL;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	/* Fill the reply from the first match while the list is locked */
+	ret = -ESRCH;
+	mutex_lock(&fn->fou_lock);
+	list_for_each_entry(fout, &fn->fou_list, list) {
+		if (!fou_cfg_cmp(fout, &cfg))
+			continue;
+		ret = fou_dump_info(fout, info->snd_portid, info->snd_seq, 0,
+				    msg, info->genlhdr->cmd);
+		break;
+	}
+	mutex_unlock(&fn->fou_lock);
+
+	if (ret < 0) {
+		nlmsg_free(msg);
+		return ret;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+/* FOU_CMD_GET dump: one message per port, resuming at index cb->args[0]. */
+int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct fou_net *fn = net_generic(sock_net(skb->sk), fou_net_id);
+	struct fou *fout;
+	int idx = 0;
+
+	mutex_lock(&fn->fou_lock);
+	list_for_each_entry(fout, &fn->fou_list, list) {
+		/* Count an entry only once dumped: a failed one is retried */
+		if (idx >= cb->args[0] &&
+		    fou_dump_info(fout, NETLINK_CB(cb->skb).portid,
+				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				  skb, FOU_CMD_GET))
+			break;
+		idx++;
+	}
+	mutex_unlock(&fn->fou_lock);
+
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+static struct genl_family fou_nl_family __ro_after_init = {
+	.hdrsize	= 0,
+	.name		= FOU_GENL_NAME,
+	.version	= FOU_GENL_VERSION,
+	.maxattr	= FOU_ATTR_MAX,
+	.policy		= fou_nl_policy,	/* one policy for all commands */
+	.netnsok	= true,
+	.module		= THIS_MODULE,
+	.small_ops	= fou_nl_ops,		/* ADD, DEL and GET handlers */
+	.n_small_ops	= ARRAY_SIZE(fou_nl_ops),
+	.resv_start_op	= FOU_CMD_GET + 1,
+};
+
+size_t fou_encap_hlen(struct ip_tunnel_encap *e)
+{
+	return sizeof(struct udphdr);	/* only the UDP header; @e is unused */
+}
+EXPORT_SYMBOL(fou_encap_hlen);
+
+/* Encapsulation overhead of GUE: the UDP header and the base GUE header,
+ * plus room for the private flags field and the remote checksum option when
+ * remote checksum offload is enabled on @e.
+ */
+size_t gue_encap_hlen(struct ip_tunnel_encap *e)
+{
+	size_t len = sizeof(struct udphdr) + sizeof(struct guehdr);
+
+	/* The remote checksum option is a private option and therefore needs
+	 * the private flags field as well.
+	 */
+	if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM)
+		len += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+
+	return len;
+}
+EXPORT_SYMBOL(gue_encap_hlen);
+
+/* Common FOU transmit setup: run tunnel offload handling for @type and pick
+ * the UDP source port (e->sport if set, else a flow based one).
+ */
+int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+		       u8 *protocol, __be16 *sport, int type)
+{
+	int err = iptunnel_handle_offloads(skb, type);
+
+	if (err)
+		return err;
+	*sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+						skb, 0, 0, false);
+	return 0;
+}
+EXPORT_SYMBOL(__fou_build_header);
+
+/* Common GUE transmit setup: push a GUE header in front of the payload with
+ * *@protocol as inner protocol, add the private remote checksum option when
+ * it is requested and the skb is CHECKSUM_PARTIAL, and choose the UDP source
+ * port.  Returns 0 or a negative errno.
+ */
+int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+		       u8 *protocol, __be16 *sport, int type)
+{
+	size_t hdrlen, optlen = 0;
+	bool need_priv = false;
+	struct guehdr *guehdr;
+	__be32 *flags;
+	int err;
+
+	if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&
+	    skb->ip_summed == CHECKSUM_PARTIAL) {
+		optlen += GUE_PLEN_REMCSUM;
+		type |= SKB_GSO_TUNNEL_REMCSUM;
+		need_priv = true;
+	}
+
+	/* Any private option needs the private flags field in front of it */
+	if (need_priv)
+		optlen += GUE_LEN_PRIV;
+
+	err = iptunnel_handle_offloads(skb, type);
+	if (err)
+		return err;
+
+	/* Get source port (based on flow hash) before skb_push */
+	*sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+						skb, 0, 0, false);
+
+	hdrlen = sizeof(struct guehdr) + optlen;
+	guehdr = skb_push(skb, hdrlen);
+
+	guehdr->control = 0;
+	guehdr->version = 0;
+	guehdr->hlen = optlen >> 2;	/* option length in 32-bit words */
+	guehdr->flags = 0;
+	guehdr->proto_ctype = *protocol;
+
+	if (!need_priv)
+		return 0;
+
+	/* The private flags field is the first thing after the base header */
+	flags = (__be32 *)&guehdr[1];
+	guehdr->flags |= GUE_FLAG_PRIV;
+	*flags = 0;
+
+	if (type & SKB_GSO_TUNNEL_REMCSUM) {
+		u16 csum_start = skb_checksum_start_offset(skb);
+		__be16 *pd = (void *)flags + GUE_LEN_PRIV;
+
+		if (csum_start < hdrlen)
+			return -EINVAL;
+
+		csum_start -= hdrlen;
+		pd[0] = htons(csum_start);
+		pd[1] = htons(csum_start + skb->csum_offset);
+
+		if (!skb_is_gso(skb)) {
+			skb->ip_summed = CHECKSUM_NONE;
+			skb->encapsulation = 0;
+		}
+
+		*flags |= GUE_PFLAG_REMCSUM;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(__gue_build_header);
+
+#ifdef CONFIG_NET_FOU_IP_TUNNELS
+
+/* Prepend the outer UDP header and set *@protocol to IPPROTO_UDP. */
+static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
+			  struct flowi4 *fl4, u8 *protocol, __be16 sport)
+{
+	bool nocheck = !(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
+	struct udphdr *uh;
+
+	skb_push(skb, sizeof(struct udphdr));
+	skb_reset_transport_header(skb);
+	uh = udp_hdr(skb);
+
+	uh->source = sport;
+	uh->dest = e->dport;
+	uh->len = htons(skb->len);
+	udp_set_csum(nocheck, skb, fl4->saddr, fl4->daddr, skb->len);
+
+	*protocol = IPPROTO_UDP;
+}
+
+/* build_header hook for FOU tunnels: common setup, then the UDP header. */
+static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+			    u8 *protocol, struct flowi4 *fl4)
+{
+	__be16 sport;
+	int type, err;
+
+	type = (e->flags & TUNNEL_ENCAP_FLAG_CSUM) ? SKB_GSO_UDP_TUNNEL_CSUM :
+						     SKB_GSO_UDP_TUNNEL;
+	err = __fou_build_header(skb, e, protocol, &sport, type);
+	if (err)
+		return err;
+
+	fou_build_udp(skb, e, fl4, protocol, sport);
+	return 0;
+}
+
+/* build_header hook for GUE tunnels: GUE header first, then the UDP header. */
+static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+			    u8 *protocol, struct flowi4 *fl4)
+{
+	__be16 sport;
+	int type, err;
+
+	type = (e->flags & TUNNEL_ENCAP_FLAG_CSUM) ? SKB_GSO_UDP_TUNNEL_CSUM :
+						     SKB_GSO_UDP_TUNNEL;
+	err = __gue_build_header(skb, e, protocol, &sport, type);
+	if (err)
+		return err;
+
+	fou_build_udp(skb, e, fl4, protocol, sport);
+	return 0;
+}
+
+static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info)
+{
+	const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]);
+
+	if (ipprot && ipprot->err_handler) {
+		if (!ipprot->err_handler(skb, info))
+			return 0;
+	}
+
+	return -ENOENT;
+}
+
+static int gue_err(struct sk_buff *skb, u32 info)
+{
+	int transport_offset = skb_transport_offset(skb);
+	struct guehdr *guehdr;
+	size_t len, optlen;
+	int ret;
+
+	len = sizeof(struct udphdr) + sizeof(struct guehdr);
+	if (!pskb_may_pull(skb, transport_offset + len))
+		return -EINVAL;
+
+	guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+	switch (guehdr->version) {
+	case 0: /* Full GUE header present */
+		break;
+	case 1: {
+		/* Direct encapsulation of IPv4 or IPv6 */
+		skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+
+		switch (((struct iphdr *)guehdr)->version) {
+		case 4:
+			ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info);
+			goto out;
+#if IS_ENABLED(CONFIG_IPV6)
+		case 6:
+			ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info);
+			goto out;
+#endif
+		default:
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+	}
+	default: /* Undefined version */
+		return -EOPNOTSUPP;
+	}
+
+	if (guehdr->control)
+		return -ENOENT;
+
+	optlen = guehdr->hlen << 2;
+
+	if (!pskb_may_pull(skb, transport_offset + len + optlen))
+		return -EINVAL;
+
+	guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+	if (validate_gue_flags(guehdr, optlen))
+		return -EINVAL;
+
+	/* Handling exceptions for direct UDP encapsulation in GUE would lead to
+	 * recursion. Besides, this kind of encapsulation can't even be
+	 * configured currently. Discard this.
+	 */
+	if (guehdr->proto_ctype == IPPROTO_UDP ||
+	    guehdr->proto_ctype == IPPROTO_UDPLITE)
+		return -EOPNOTSUPP;
+
+	skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+	ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info);
+
+out:
+	skb_set_transport_header(skb, transport_offset);
+	return ret;
+}
+
+
+static const struct ip_tunnel_encap_ops fou_iptun_ops = {
+	.encap_hlen = fou_encap_hlen,
+	.build_header = fou_build_header,
+	.err_handler = gue_err,		/* same error handler as GUE */
+};
+
+static const struct ip_tunnel_encap_ops gue_iptun_ops = {
+	.encap_hlen = gue_encap_hlen,
+	.build_header = gue_build_header,
+	.err_handler = gue_err,		/* same error handler as FOU */
+};
+
+/* Register FOU and GUE encap ops, undoing the former if the latter fails. */
+static int ip_tunnel_encap_add_fou_ops(void)
+{
+	int ret = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+
+	if (ret < 0) {
+		pr_err("can't add fou ops\n");
+		return ret;
+	}
+
+	ret = ip_tunnel_encap_add_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE);
+	if (ret >= 0)
+		return 0;
+
+	pr_err("can't add gue ops\n");
+	ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+	return ret;
+}
+
+/* Unregister both encap ops registered by ip_tunnel_encap_add_fou_ops(). */
+static void ip_tunnel_encap_del_fou_ops(void)
+{
+	ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+	ip_tunnel_encap_del_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE);
+}
+
+#else
+
+static int ip_tunnel_encap_add_fou_ops(void)
+{
+	return 0;	/* nothing to register without CONFIG_NET_FOU_IP_TUNNELS */
+}
+
+static void ip_tunnel_encap_del_fou_ops(void)
+{
+}
+
+#endif
+
+static __net_init int fou_init_net(struct net *net)
+{
+	struct fou_net *fn = net_generic(net, fou_net_id);
+
+	INIT_LIST_HEAD(&fn->fou_list);	/* ports created in this netns */
+	mutex_init(&fn->fou_lock);	/* held around fou_list accesses */
+	return 0;
+}
+
+static __net_exit void fou_exit_net(struct net *net)
+{
+	struct fou_net *fn = net_generic(net, fou_net_id);
+	struct fou *fou, *next;
+
+	/* Release every port; _safe walk since fou_release() unlinks it */
+	mutex_lock(&fn->fou_lock);
+	list_for_each_entry_safe(fou, next, &fn->fou_list, list)
+		fou_release(fou);
+	mutex_unlock(&fn->fou_lock);
+}
+
+static struct pernet_operations fou_net_ops = {
+	.init = fou_init_net,
+	.exit = fou_exit_net,
+	.id   = &fou_net_id,		/* key passed to net_generic() */
+	.size = sizeof(struct fou_net),	/* size of the per-netns state */
+};
+
+/* Module init; undoes the genetlink and pernet registration on failure. */
+static int __init fou_init(void)
+{
+	int ret;
+
+	ret = register_pernet_device(&fou_net_ops);
+	if (ret)
+		return ret;
+
+	ret = genl_register_family(&fou_nl_family);
+	if (ret < 0)
+		goto err_pernet;
+
+	ret = register_fou_bpf();
+	if (ret < 0)
+		goto err_genl;
+
+	ret = ip_tunnel_encap_add_fou_ops();
+	if (ret == 0)
+		return 0;
+
+err_genl:
+	genl_unregister_family(&fou_nl_family);
+err_pernet:
+	unregister_pernet_device(&fou_net_ops);
+	return ret;
+}
+
+static void __exit fou_fini(void)
+{
+	ip_tunnel_encap_del_fou_ops();	/* reverse order of fou_init() */
+	genl_unregister_family(&fou_nl_family);
+	unregister_pernet_device(&fou_net_ops);
+}
+
+module_init(fou_init);
+module_exit(fou_fini);
+MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Foo over UDP");
diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c
new file mode 100644
index 0000000000..98b90107b5
--- /dev/null
+++ b/net/ipv4/fou_nl.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/fou.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "fou_nl.h"
+
+#include <uapi/linux/fou.h>
+
+/* Global operation policy for fou */
+const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = {
+	[FOU_ATTR_PORT] = { .type = NLA_U16, },
+	[FOU_ATTR_AF] = { .type = NLA_U8, },
+	[FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
+	[FOU_ATTR_TYPE] = { .type = NLA_U8, },
+	[FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
+	[FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, },
+	[FOU_ATTR_LOCAL_V6] = { .len = 16, },
+	[FOU_ATTR_PEER_V4] = { .type = NLA_U32, },
+	[FOU_ATTR_PEER_V6] = { .len = 16, },
+	[FOU_ATTR_PEER_PORT] = { .type = NLA_U16, },
+	[FOU_ATTR_IFINDEX] = { .type = NLA_S32, },
+};
+
+/* Ops table for fou */
+const struct genl_small_ops fou_nl_ops[3] = {
+	{
+		.cmd		= FOU_CMD_ADD,
+		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit		= fou_nl_add_doit,
+		.flags		= GENL_ADMIN_PERM,
+	},
+	{
+		.cmd		= FOU_CMD_DEL,
+		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit		= fou_nl_del_doit,
+		.flags		= GENL_ADMIN_PERM,
+	},
+	{
+		.cmd		= FOU_CMD_GET,
+		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit		= fou_nl_get_doit,
+		.dumpit		= fou_nl_get_dumpit,
+	},
+};
diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h
new file mode 100644
index 0000000000..63a6c4ed80
--- /dev/null
+++ b/net/ipv4/fou_nl.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/fou.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_FOU_GEN_H
+#define _LINUX_FOU_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/fou.h>
+
+/* Global operation policy for fou */
+extern const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1];
+
+/* Ops table for fou */
+extern const struct genl_small_ops fou_nl_ops[3];
+
+int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info);
+int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info);
+int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info);
+int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+
+#endif /* _LINUX_FOU_GEN_H */
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
new file mode 100644
index 0000000000..cbb2b4bb0d
--- /dev/null
+++ b/net/ipv4/gre_demux.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *	GRE over IPv4 demultiplexer driver
+ *
+ *	Authors: Dmitry Kozlov (xeb@mail.ru)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/if.h>
+#include <linux/icmp.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/if_tunnel.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+#include <net/erspan.h>
+
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
+
+/* Install @proto for GRE @version; -EBUSY if a handler is already set. */
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+	if (version >= GREPROTO_MAX)
+		return -EINVAL;
+	return cmpxchg((const struct gre_protocol **)&gre_proto[version],
+		       NULL, proto) ? -EBUSY : 0;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+/* Remove @proto from slot @version (-EBUSY if the slot holds something else)
+ * and wait for an RCU grace period before returning.
+ */
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+	const struct gre_protocol **slot;
+
+	if (version >= GREPROTO_MAX)
+		return -EINVAL;
+
+	slot = (const struct gre_protocol **)&gre_proto[version];
+	if (cmpxchg(slot, proto, NULL) != proto)
+		return -EBUSY;
+	synchronize_rcu();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+/* Fills in tpi and returns header length to be pulled, or -EINVAL on a bad
+ * header.  Caller must use pskb_may_pull() before pulling the GRE header.
+ */
+int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+		     bool *csum_err, __be16 proto, int nhs)
+{
+	const struct gre_base_hdr *greh;
+	__be32 *options;
+	int hdr_len;
+
+	if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr))))
+		return -EINVAL;
+
+	greh = (struct gre_base_hdr *)(skb->data + nhs);
+	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+		return -EINVAL;
+
+	tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+	hdr_len = gre_calc_hlen(tpi->flags);
+
+	if (!pskb_may_pull(skb, nhs + hdr_len))
+		return -EINVAL;
+	/* skb->data may have changed, so compute the header pointer again */
+	greh = (struct gre_base_hdr *)(skb->data + nhs);
+	tpi->proto = greh->protocol;
+
+	options = (__be32 *)(greh + 1);
+	if (greh->flags & GRE_CSUM) {
+		if (!skb_checksum_simple_validate(skb)) {
+			skb_checksum_try_convert(skb, IPPROTO_GRE,
+						 null_compute_pseudo);
+		} else if (csum_err) {
+			*csum_err = true;
+			return -EINVAL;
+		}
+
+		options++;
+	}
+
+	if (greh->flags & GRE_KEY) {
+		tpi->key = *options;
+		options++;
+	} else {
+		tpi->key = 0;
+	}
+	if (unlikely(greh->flags & GRE_SEQ)) {
+		tpi->seq = *options;
+		options++;
+	} else {
+		tpi->seq = 0;
+	}
+	/* WCCP version 1 and 2 protocol decoding.
+	 * - Change protocol to IPv4/IPv6
+	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+	 */
+	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+		u8 _val, *val;
+
+		val = skb_header_pointer(skb, nhs + hdr_len,
+					 sizeof(_val), &_val);
+		if (!val)
+			return -EINVAL;
+		tpi->proto = proto;
+		if ((*val & 0xF0) != 0x40)
+			hdr_len += 4;
+	}
+	tpi->hdr_len = hdr_len;
+
+	/* ERSPAN ver 1 and 2 protocol sets GRE key field
+	 * to 0 and sets the configured key in the
+	 * inner erspan header field
+	 */
+	if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) ||
+	    greh->protocol == htons(ETH_P_ERSPAN2)) {
+		struct erspan_base_hdr *ershdr;
+
+		if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr)))
+			return -EINVAL;
+
+		ershdr = (struct erspan_base_hdr *)(skb->data + nhs + hdr_len);
+		tpi->key = cpu_to_be32(get_session_id(ershdr));
+	}
+
+	return hdr_len;
+}
+EXPORT_SYMBOL(gre_parse_header);
+
+/* IPPROTO_GRE receive handler: dispatch to the handler registered for the
+ * version taken from the low 7 bits of the second header byte, else drop.
+ */
+static int gre_rcv(struct sk_buff *skb)
+{
+	const struct gre_protocol *proto;
+	int ret;
+	u8 ver;
+
+	if (!pskb_may_pull(skb, 12))
+		goto drop;
+
+	ver = skb->data[1] & 0x7f;
+	if (ver >= GREPROTO_MAX)
+		goto drop;
+	rcu_read_lock();
+	proto = rcu_dereference(gre_proto[ver]);
+	if (proto && proto->handler) {
+		ret = proto->handler(skb);
+		rcu_read_unlock();
+		return ret;
+	}
+	rcu_read_unlock();
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+static int gre_err(struct sk_buff *skb, u32 info)
+{
+	const struct gre_protocol *proto;
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+	int err = 0;
+
+	if (ver >= GREPROTO_MAX)
+		return -EINVAL;
+
+	rcu_read_lock();
+	proto = rcu_dereference(gre_proto[ver]);
+	if (proto && proto->err_handler)
+		proto->err_handler(skb, info);
+	else
+		err = -EPROTONOSUPPORT;
+	rcu_read_unlock();
+
+	return err;
+}
+
+static const struct net_protocol net_gre_protocol = {
+	.handler     = gre_rcv,		/* received GRE packets */
+	.err_handler = gre_err,		/* error notifications */
+};
+
+/* Module init: register the demultiplexer as the IPPROTO_GRE handler. */
+static int __init gre_init(void)
+{
+	pr_info("GRE over IPv4 demultiplexor driver\n");
+
+	if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) >= 0)
+		return 0;
+	pr_err("can't add protocol\n");
+	return -EAGAIN;
+}
+
+static void __exit gre_exit(void)
+{
+	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); /* undo gre_init() */
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
new file mode 100644
index 0000000000..311e70bfce
--- /dev/null
+++ b/net/ipv4/gre_offload.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *	IPV4 GSO/GRO offload support
+ *	Linux INET implementation
+ *
+ *	GRE GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+#include <net/gro.h>
+#include <net/gso.h>
+
+static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
+				       netdev_features_t features)
+{
+	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+	bool need_csum, offload_csum, gso_partial, need_ipsec;
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	u16 mac_offset = skb->mac_header;
+	__be16 protocol = skb->protocol;
+	u16 mac_len = skb->mac_len;
+	int gre_offset, outer_hlen;
+	/* Segment the inner packet, then restore the outer headers on each piece. */
+	if (!skb->encapsulation)
+		goto out;
+
+	if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr)))
+		goto out;
+
+	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+		goto out;
+
+	/* setup inner skb. */
+	skb->encapsulation = 0;
+	SKB_GSO_CB(skb)->encap_level = 0;
+	__skb_pull(skb, tnl_hlen);
+	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, skb_inner_network_offset(skb));
+	skb->mac_len = skb_inner_network_offset(skb);
+	skb->protocol = skb->inner_protocol;
+
+	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
+	skb->encap_hdr_csum = need_csum;
+
+	features &= skb->dev->hw_enc_features;
+	if (need_csum)
+		features &= ~NETIF_F_SCTP_CRC;
+
+	need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
+	/* Try to offload checksum if possible */
+	offload_csum = !!(need_csum && !need_ipsec &&
+			  (skb->dev->features & NETIF_F_HW_CSUM));
+
+	/* segment inner packet. */
+	segs = skb_mac_gso_segment(skb, features);
+	if (IS_ERR_OR_NULL(segs)) {
+		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+				     mac_len);
+		goto out;
+	}
+	/* Put the outer headers back in front of every resulting segment. */
+	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
+	outer_hlen = skb_tnl_header_len(skb);
+	gre_offset = outer_hlen - tnl_hlen;
+	skb = segs;
+	do {
+		struct gre_base_hdr *greh;
+		__sum16 *pcsum;
+
+		/* Set up inner headers if we are offloading inner checksum */
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			skb_reset_inner_headers(skb);
+			skb->encapsulation = 1;
+		}
+
+		skb->mac_len = mac_len;
+		skb->protocol = protocol;
+
+		__skb_push(skb, outer_hlen);
+		skb_reset_mac_header(skb);
+		skb_set_network_header(skb, mac_len);
+		skb_set_transport_header(skb, gre_offset);
+		/* Without a GRE checksum there is nothing more to fix up. */
+		if (!need_csum)
+			continue;
+
+		greh = (struct gre_base_hdr *)skb_transport_header(skb);
+		pcsum = (__sum16 *)(greh + 1);
+
+		if (gso_partial && skb_is_gso(skb)) {
+			unsigned int partial_adj;
+
+			/* Adjust checksum to account for the fact that
+			 * the partial checksum is based on actual size
+			 * whereas headers should be based on MSS size.
+			 */
+			partial_adj = skb->len + skb_headroom(skb) -
+				      SKB_GSO_CB(skb)->data_offset -
+				      skb_shinfo(skb)->gso_size;
+			*pcsum = ~csum_fold((__force __wsum)htonl(partial_adj));
+		} else {
+			*pcsum = 0;
+		}
+		/* Clear the 16 bits that follow the checksum field. */
+		*(pcsum + 1) = 0;
+		if (skb->encapsulation || !offload_csum) {
+			*pcsum = gso_make_checksum(skb, 0);
+		} else {
+			skb->ip_summed = CHECKSUM_PARTIAL;
+			skb->csum_start = skb_transport_header(skb) - skb->head;
+			skb->csum_offset = sizeof(*greh);
+		}
+	} while ((skb = skb->next));
+out:
+	return segs;
+}
+
+static struct sk_buff *gre_gro_receive(struct list_head *head,
+				       struct sk_buff *skb)
+{
+	struct sk_buff *pp = NULL;
+	struct sk_buff *p;
+	const struct gre_base_hdr *greh;
+	unsigned int hlen, grehlen;
+	unsigned int off;
+	int flush = 1;
+	struct packet_offload *ptype;
+	__be16 type;
+	/* encap_mark already set on this skb: leave with flush still 1. */
+	if (NAPI_GRO_CB(skb)->encap_mark)
+		goto out;
+
+	NAPI_GRO_CB(skb)->encap_mark = 1;
+
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*greh);
+	greh = skb_gro_header(skb, hlen, off);
+	if (unlikely(!greh))
+		goto out;
+
+	/* Only support version 0 and K (key), C (csum) flags. Note that
+	 * although the support for the S (seq#) flag can be added easily
+	 * for GRO, this is problematic for GSO hence can not be enabled
+	 * here because a GRO pkt may end up in the forwarding path, thus
+	 * requiring GSO support to break it up correctly.
+	 */
+	if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
+		goto out;
+
+	/* We can only support GRE_CSUM if we can track the location of
+	 * the GRE header.  In the case of FOU/GUE we cannot because the
+	 * outer UDP header displaces the GRE header leaving us in a state
+	 * of limbo.
+	 */
+	if ((greh->flags & GRE_CSUM) && NAPI_GRO_CB(skb)->is_fou)
+		goto out;
+
+	type = greh->protocol;
+
+	ptype = gro_find_receive_by_type(type);
+	if (!ptype)
+		goto out;
+	/* The base header is one section; key and checksum add one each. */
+	grehlen = GRE_HEADER_SECTION;
+
+	if (greh->flags & GRE_KEY)
+		grehlen += GRE_HEADER_SECTION;
+
+	if (greh->flags & GRE_CSUM)
+		grehlen += GRE_HEADER_SECTION;
+
+	hlen = off + grehlen;
+	if (skb_gro_header_hard(skb, hlen)) {
+		greh = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!greh))
+			goto out;
+	}
+
+	/* Don't bother verifying checksum if we're going to flush anyway. */
+	if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) {
+		if (skb_gro_checksum_simple_validate(skb))
+			goto out;
+
+		skb_gro_checksum_try_convert(skb, IPPROTO_GRE,
+					     null_compute_pseudo);
+	}
+
+	list_for_each_entry(p, head, list) {
+		const struct gre_base_hdr *greh2;
+
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		/* The following checks are needed to ensure only pkts
+		 * from the same tunnel are considered for aggregation.
+		 * The criteria for "the same tunnel" includes:
+		 * 1) same version (we only support version 0 here)
+		 * 2) same protocol (we only support ETH_P_IP for now)
+		 * 3) same set of flags
+		 * 4) same key if the key field is present.
+		 */
+		greh2 = (struct gre_base_hdr *)(p->data + off);
+
+		if (greh2->flags != greh->flags ||
+		    greh2->protocol != greh->protocol) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+		if (greh->flags & GRE_KEY) {
+			/* compare keys */
+			if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) {
+				NAPI_GRO_CB(p)->same_flow = 0;
+				continue;
+			}
+		}
+	}
+
+	skb_gro_pull(skb, grehlen);
+
+	/* Adjust NAPI_GRO_CB(skb)->csum after skb_gro_pull() */
+	skb_gro_postpull_rcsum(skb, greh, grehlen);
+
+	pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
+	flush = 0;
+
+out:
+	skb_gro_flush_final(skb, pp, flush);
+
+	return pp;
+}
+
+/* Finish GRO for a GRE packet: flag it as GRE-encapsulated, then let the
+ * offload registered for the inner protocol complete from past the header.
+ */
+static int gre_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff);
+	unsigned int grehlen = sizeof(*greh);
+	struct packet_offload *ptype;
+	int inner_off, err;
+
+	skb->encapsulation = 1;
+	skb_shinfo(skb)->gso_type = SKB_GSO_GRE;
+
+	/* Optional key and checksum fields each extend the header. */
+	if (greh->flags & GRE_KEY)
+		grehlen += GRE_HEADER_SECTION;
+	if (greh->flags & GRE_CSUM)
+		grehlen += GRE_HEADER_SECTION;
+	inner_off = nhoff + grehlen;
+
+	ptype = gro_find_complete_by_type(greh->protocol);
+	err = ptype ? ptype->callbacks.gro_complete(skb, inner_off) : -ENOENT;
+
+	skb_set_inner_mac_header(skb, inner_off);
+	return err;
+}
+
+static const struct net_offload gre_offload = {
+	.callbacks = {	/* registered for IPPROTO_GRE by gre_offload_init() */
+		.gso_segment = gre_gso_segment,
+		.gro_receive = gre_gro_receive,
+		.gro_complete = gre_gro_complete,
+	},
+};
+
+static int __init gre_offload_init(void)
+{
+	int ret = inet_add_offload(&gre_offload, IPPROTO_GRE);
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (ret)
+		return ret;
+
+	/* Register for IPv6 too; undo the IPv4 registration on failure. */
+	ret = inet6_add_offload(&gre_offload, IPPROTO_GRE);
+	if (ret)
+		inet_del_offload(&gre_offload, IPPROTO_GRE);
+#endif
+
+	return ret;
+}
+device_initcall(gre_offload_init);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
new file mode 100644
index 0000000000..b8607763d1
--- /dev/null
+++ b/net/ipv4/icmp.c
@@ -0,0 +1,1510 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *	NET3:	Implementation of the ICMP protocol layer.
+ *
+ *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ *
+ *	Some of the function names and the icmp unreach table for this
+ *	module were derived from [icmp.c 1.0.11 06/02/93] by
+ *	Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
+ *	Other than that this module is a complete rewrite.
+ *
+ *	Fixes:
+ *	Clemens Fruhwirth	:	introduce global icmp rate limiting
+ *					with icmp type masking ability instead
+ *					of broken per type icmp timeouts.
+ *		Mike Shaver	:	RFC1122 checks.
+ *		Alan Cox	:	Multicast ping reply as self.
+ *		Alan Cox	:	Fix atomicity lockup in ip_build_xmit
+ *					call.
+ *		Alan Cox	:	Added 216,128 byte paths to the MTU
+ *					code.
+ *		Martin Mares	:	RFC1812 checks.
+ *		Martin Mares	:	Can be configured to follow redirects
+ *					if acting as a router _without_ a
+ *					routing protocol (RFC 1812).
+ *		Martin Mares	:	Echo requests may be configured to
+ *					be ignored (RFC 1812).
+ *		Martin Mares	:	Limitation of ICMP error message
+ *					transmit rate (RFC 1812).
+ *		Martin Mares	:	TOS and Precedence set correctly
+ *					(RFC 1812).
+ *		Martin Mares	:	Now copying as much data from the
+ *					original packet as we can without
+ *					exceeding 576 bytes (RFC 1812).
+ *	Willy Konynenberg	:	Transparent proxying support.
+ *		Keith Owens	:	RFC1191 correction for 4.2BSD based
+ *					path MTU bug.
+ *		Thomas Quinot	:	ICMP Dest Unreach codes up to 15 are
+ *					valid (RFC 1812).
+ *		Andi Kleen	:	Check all packet lengths properly
+ *					and moved all kfree_skb() up to
+ *					icmp_rcv.
+ *		Andi Kleen	:	Move the rate limit bookkeeping
+ *					into the dest entry and use a token
+ *					bucket filter (thanks to ANK). Make
+ *					the rates sysctl configurable.
+ *		Yu Tianli	:	Fixed two ugly bugs in icmp_send
+ *					- IP option length was accounted wrongly
+ *					- ICMP header length was not accounted
+ *					  at all.
+ *              Tristan Greaves :       Added sysctl option to ignore bogus
+ *              			broadcast responses from broken routers.
+ *
+ * To Fix:
+ *
+ *	- Should use skb_pull() instead of all the manual checking.
+ *	  This would also greatly simplify some upper layer error handlers. --AK
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/slab.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <net/ping.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <linux/uaccess.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <net/inet_common.h>
+#include <net/ip_fib.h>
+#include <net/l3mdev.h>
+
+/*
+ *	Build xmit assembly blocks
+ */
+
+struct icmp_bxm {
+	struct sk_buff *skb;	/* packet whose bytes icmp_glue_bits() copies */
+	int offset;		/* offset into skb where that copy starts */
+	int data_len;		/* payload bytes appended after the header */
+
+	struct {
+		struct icmphdr icmph;
+		__be32	       times[3];
+	} data;			/* copied over the ICMP header on push */
+	int head_len;		/* how many bytes of data are copied */
+	struct ip_options_data replyopts;	/* IP options for the reply */
+};
+
+/* errno and fatal flag per dest unreach code (named in each entry below).
+ * RFC 1122: 3.2.2.1 states that NET_UNREACH, HOST_UNREACH and SR_FAILED
+ * MUST be considered 'transient errs'. */
+const struct icmp_err icmp_err_convert[] = {
+	{
+		.errno = ENETUNREACH,	/* ICMP_NET_UNREACH */
+		.fatal = 0,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_HOST_UNREACH */
+		.fatal = 0,
+	},
+	{
+		.errno = ENOPROTOOPT,	/* ICMP_PROT_UNREACH */
+		.fatal = 1,
+	},
+	{
+		.errno = ECONNREFUSED,	/* ICMP_PORT_UNREACH */
+		.fatal = 1,
+	},
+	{
+		.errno = EMSGSIZE,	/* ICMP_FRAG_NEEDED */
+		.fatal = 0,
+	},
+	{
+		.errno = EOPNOTSUPP,	/* ICMP_SR_FAILED */
+		.fatal = 0,
+	},
+	{
+		.errno = ENETUNREACH,	/* ICMP_NET_UNKNOWN */
+		.fatal = 1,
+	},
+	{
+		.errno = EHOSTDOWN,	/* ICMP_HOST_UNKNOWN */
+		.fatal = 1,
+	},
+	{
+		.errno = ENONET,	/* ICMP_HOST_ISOLATED */
+		.fatal = 1,
+	},
+	{
+		.errno = ENETUNREACH,	/* ICMP_NET_ANO	*/
+		.fatal = 1,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_HOST_ANO */
+		.fatal = 1,
+	},
+	{
+		.errno = ENETUNREACH,	/* ICMP_NET_UNR_TOS */
+		.fatal = 0,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_HOST_UNR_TOS */
+		.fatal = 0,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_PKT_FILTERED */
+		.fatal = 1,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_PREC_VIOLATION */
+		.fatal = 1,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_PREC_CUTOFF */
+		.fatal = 1,
+	},
+};
+EXPORT_SYMBOL(icmp_err_convert);
+
+/*
+ *	ICMP control array. This specifies what to do with each ICMP.
+ */
+
+struct icmp_control {
+	enum skb_drop_reason (*handler)(struct sk_buff *skb);
+	short   error;		/* This ICMP is classed as an error message */
+};
+/* Indexed by ICMP type, e.g. icmp_pointers[type].error in __icmp_send(). */
+static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
+/* Per-CPU socket that icmp_xmit_lock() fetches with this_cpu_read(). */
+static DEFINE_PER_CPU(struct sock *, ipv4_icmp_sk);
+
+/* Grab this CPU's ICMP socket for @net, or return NULL if it is already
+ * locked.  Called with BH disabled.
+ */
+static inline struct sock *icmp_xmit_lock(struct net *net)
+{
+	struct sock *icmp_sk = this_cpu_read(ipv4_icmp_sk);
+
+	/* Contention can happen if the output path signals a
+	 * dst_link_failure() for an outgoing ICMP packet.
+	 */
+	if (unlikely(!spin_trylock(&icmp_sk->sk_lock.slock)))
+		return NULL;
+
+	sock_net_set(icmp_sk, net);
+	return icmp_sk;
+}
+
+static inline void icmp_xmit_unlock(struct sock *sk)
+{
+	sock_net_set(sk, &init_net);	/* point the socket back at init_net */
+	spin_unlock(&sk->sk_lock.slock);
+}
+
+int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
+int sysctl_icmp_msgs_burst __read_mostly = 50;
+/* Token bucket state used by icmp_global_allow(). */
+static struct {
+	spinlock_t	lock;	/* held while refilling and consuming */
+	u32		credit;	/* tokens currently available */
+	u32		stamp;	/* jiffies value stored at the last refill */
+} icmp_global = {
+	.lock		= __SPIN_LOCK_UNLOCKED(icmp_global.lock),
+};
+
+/**
+ * icmp_global_allow - Are we allowed to send one more ICMP message ?
+ *
+ * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec.
+ * Returns false if we reached the limit and can not send another packet.
+ * Note: called with BH disabled
+ */
+bool icmp_global_allow(void)
+{
+	u32 credit, delta, incr = 0, now = (u32)jiffies;
+	bool rc = false;
+
+	/* Check if token bucket is empty and cannot be refilled
+	 * without taking the spinlock. The READ_ONCE() are paired
+	 * with the following WRITE_ONCE() in this same function.
+	 */
+	if (!READ_ONCE(icmp_global.credit)) {
+		delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ);
+		if (delta < HZ / 50)
+			return false;
+	}
+	/* Slow path: refill and consume under the lock. */
+	spin_lock(&icmp_global.lock);
+	delta = min_t(u32, now - icmp_global.stamp, HZ);
+	if (delta >= HZ / 50) {	/* refill at most every HZ/50 jiffies */
+		incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ;
+		if (incr)
+			WRITE_ONCE(icmp_global.stamp, now);
+	}
+	credit = min_t(u32, icmp_global.credit + incr,
+		       READ_ONCE(sysctl_icmp_msgs_burst));
+	if (credit) {
+		/* We want to use a credit of one in average, but need to randomize
+		 * it for security reasons.
+		 */
+		credit = max_t(int, credit - get_random_u32_below(3), 0);
+		rc = true;
+	}
+	WRITE_ONCE(icmp_global.credit, credit);
+	spin_unlock(&icmp_global.lock);
+	return rc;
+}
+EXPORT_SYMBOL(icmp_global_allow);
+
+/* Return true when @type/@code is exempt from rate limiting. */
+static bool icmpv4_mask_allow(struct net *net, int type, int code)
+{
+	if (type > NR_ICMP_TYPES)
+		return true;
+
+	/* Don't limit PMTU discovery. */
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		return true;
+
+	/* Only types whose bit is set in the ratemask are limited. */
+	if ((1 << type) & READ_ONCE(net->ipv4.sysctl_icmp_ratemask))
+		return false;
+	return true;
+}
+
+/* Apply the global limit; count a RATELIMITGLOBAL event on refusal. */
+static bool icmpv4_global_allow(struct net *net, int type, int code)
+{
+	bool allowed;
+
+	allowed = icmpv4_mask_allow(net, type, code) || icmp_global_allow();
+	if (!allowed)
+		__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
+
+	return allowed;
+}
+
+/*
+ *	Check the per-destination (inet_peer) rate limit for an outgoing ICMP.
+ */
+static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+			       struct flowi4 *fl4, int type, int code)
+{
+	struct net_device *out_dev = rt->dst.dev;
+	struct inet_peer *peer;
+	bool allowed;
+
+	if (icmpv4_mask_allow(net, type, code))
+		return true;
+
+	/* No rate limit on loopback */
+	if (out_dev && (out_dev->flags & IFF_LOOPBACK))
+		return true;
+
+	/* Everything else is limited per destination peer. */
+	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr,
+			       l3mdev_master_ifindex(out_dev), 1);
+	allowed = inet_peer_xrlim_allow(peer,
+				READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
+	if (peer)
+		inet_putpeer(peer);
+
+	/* Account refusals in the per-host ratelimit counter. */
+	if (!allowed)
+		__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
+	return allowed;
+}
+
+/*
+ *	Maintain the counters used in the SNMP statistics for outgoing ICMP
+ */
+void icmp_out_count(struct net *net, unsigned char type)
+{
+	ICMPMSGOUT_INC_STATS(net, type);	/* counter keyed by @type */
+	ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS);	/* all outgoing messages */
+}
+
+/*
+ *	Copy callback handed to ip_append_data(): copy @len payload bytes of
+ *	the original packet into @to and fold their checksum into skb->csum.
+ */
+static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
+			  struct sk_buff *skb)
+{
+	struct icmp_bxm *bxm = from;
+	struct sk_buff *orig = bxm->skb;
+	__wsum part;
+
+	part = skb_copy_and_csum_bits(orig, bxm->offset + offset, to, len);
+	skb->csum = csum_block_add(skb->csum, part, odd);
+
+	/* ICMP errors get nf_ct_attach()ed to the packet that caused them. */
+	if (icmp_pointers[bxm->data.icmph.type].error)
+		nf_ct_attach(skb, orig);
+	return 0;
+}
+
+static void icmp_push_reply(struct sock *sk,
+			    struct icmp_bxm *icmp_param,
+			    struct flowi4 *fl4,
+			    struct ipcm_cookie *ipc, struct rtable **rt)
+{
+	struct sk_buff *skb;
+	/* Queue the payload; on failure count an error and flush the queue. */
+	if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
+			   icmp_param->data_len+icmp_param->head_len,
+			   icmp_param->head_len,
+			   ipc, rt, MSG_DONTWAIT) < 0) {
+		__ICMP_INC_STATS(sock_net(sk), ICMP_MIB_OUTERRORS);
+		ip_flush_pending_frames(sk);
+	} else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+		struct icmphdr *icmph = icmp_hdr(skb);
+		__wsum csum;
+		struct sk_buff *skb1;
+		/* Header checksum first, then add each queued skb's csum. */
+		csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
+						 (char *)icmph,
+						 icmp_param->head_len);
+		skb_queue_walk(&sk->sk_write_queue, skb1) {
+			csum = csum_add(csum, skb1->csum);
+		}
+		icmph->checksum = csum_fold(csum);
+		skb->ip_summed = CHECKSUM_NONE;
+		ip_push_pending_frames(sk, fl4);
+	}
+}
+
+/*
+ *	Driving logic for building and sending ICMP messages.
+ */
+
+static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
+{
+	struct ipcm_cookie ipc;
+	struct rtable *rt = skb_rtable(skb);
+	struct net *net = dev_net(rt->dst.dev);
+	struct flowi4 fl4;
+	struct sock *sk;
+	struct inet_sock *inet;
+	__be32 daddr, saddr;
+	u32 mark = IP4_REPLY_MARK(net, skb->mark);
+	int type = icmp_param->data.icmph.type;
+	int code = icmp_param->data.icmph.code;
+	/* Echo the IP options of @skb into the reply; give up on failure. */
+	if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
+		return;
+
+	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	local_bh_disable();
+
+	/* global icmp_msgs_per_sec */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_bh_enable;
+
+	sk = icmp_xmit_lock(net);
+	if (!sk)
+		goto out_bh_enable;
+	inet = inet_sk(sk);
+
+	icmp_param->data.icmph.checksum = 0;
+
+	ipcm_init(&ipc);
+	inet->tos = ip_hdr(skb)->tos;
+	ipc.sockc.mark = mark;
+	daddr = ipc.addr = ip_hdr(skb)->saddr;
+	saddr = fib_compute_spec_dst(skb);
+	/* A source route in the echoed options overrides the destination. */
+	if (icmp_param->replyopts.opt.opt.optlen) {
+		ipc.opt = &icmp_param->replyopts.opt;
+		if (ipc.opt->opt.srr)
+			daddr = icmp_param->replyopts.opt.opt.faddr;
+	}
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.daddr = daddr;
+	fl4.saddr = saddr;
+	fl4.flowi4_mark = mark;
+	fl4.flowi4_uid = sock_net_uid(net, NULL);
+	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+	fl4.flowi4_proto = IPPROTO_ICMP;
+	fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
+	security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
+	rt = ip_route_output_key(net, &fl4);
+	if (IS_ERR(rt))
+		goto out_unlock;
+	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
+		icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
+	ip_rt_put(rt);
+out_unlock:
+	icmp_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
+}
+
+/*
+ * Pick the device whose routing table is used for sending an ICMP error:
+ * preferably skb->dev (the source side), which should ensure the error can
+ * reach the source host; else the device of the skb's dst entry; else NULL,
+ * i.e. the main routing table (index 0).
+ */
+static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb)
+{
+	struct dst_entry *dst;
+
+	if (skb->dev)
+		return skb->dev;
+
+	dst = skb_dst(skb);
+	return dst ? dst->dev : NULL;
+}
+
+static struct rtable *icmp_route_lookup(struct net *net,
+					struct flowi4 *fl4,
+					struct sk_buff *skb_in,
+					const struct iphdr *iph,
+					__be32 saddr, u8 tos, u32 mark,
+					int type, int code,
+					struct icmp_bxm *param)
+{
+	struct net_device *route_lookup_dev;
+	struct rtable *rt, *rt2;
+	struct flowi4 fl4_dec;
+	int err;
+	/* Flow towards the sender, or towards faddr when source-routed. */
+	memset(fl4, 0, sizeof(*fl4));
+	fl4->daddr = (param->replyopts.opt.opt.srr ?
+		      param->replyopts.opt.opt.faddr : iph->saddr);
+	fl4->saddr = saddr;
+	fl4->flowi4_mark = mark;
+	fl4->flowi4_uid = sock_net_uid(net, NULL);
+	fl4->flowi4_tos = RT_TOS(tos);
+	fl4->flowi4_proto = IPPROTO_ICMP;
+	fl4->fl4_icmp_type = type;
+	fl4->fl4_icmp_code = code;
+	route_lookup_dev = icmp_get_route_lookup_dev(skb_in);
+	fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev);
+
+	security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4));
+	rt = ip_route_output_key_hash(net, fl4, skb_in);
+	if (IS_ERR(rt))
+		return rt;
+
+	/* No need to clone since we're just using its address. */
+	rt2 = rt;
+
+	rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
+					   flowi4_to_flowi(fl4), NULL, 0);
+	if (!IS_ERR(rt)) {
+		if (rt != rt2)
+			return rt;
+	} else if (PTR_ERR(rt) == -EPERM) {
+		rt = NULL;
+	} else
+		return rt;
+	/* Relookup using the flow decoded in reverse from skb_in. */
+	err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
+	if (err)
+		goto relookup_failed;
+
+	if (inet_addr_type_dev_table(net, route_lookup_dev,
+				     fl4_dec.saddr) == RTN_LOCAL) {
+		rt2 = __ip_route_output_key(net, &fl4_dec);
+		if (IS_ERR(rt2))
+			err = PTR_ERR(rt2);
+	} else {
+		struct flowi4 fl4_2 = {};
+		unsigned long orefdst;
+
+		fl4_2.daddr = fl4_dec.saddr;
+		rt2 = ip_route_output_key(net, &fl4_2);
+		if (IS_ERR(rt2)) {
+			err = PTR_ERR(rt2);
+			goto relookup_failed;
+		}
+		/* Ugh! */
+		orefdst = skb_in->_skb_refdst; /* save old refdst */
+		skb_dst_set(skb_in, NULL);
+		err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
+				     RT_TOS(tos), rt2->dst.dev);
+
+		dst_release(&rt2->dst);
+		rt2 = skb_rtable(skb_in);
+		skb_in->_skb_refdst = orefdst; /* restore old refdst */
+	}
+
+	if (err)
+		goto relookup_failed;
+
+	rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
+					    flowi4_to_flowi(&fl4_dec), NULL,
+					    XFRM_LOOKUP_ICMP);
+	if (!IS_ERR(rt2)) {
+		dst_release(&rt->dst);
+		memcpy(fl4, &fl4_dec, sizeof(*fl4));
+		rt = rt2;
+	} else if (PTR_ERR(rt2) == -EPERM) {
+		if (rt)
+			dst_release(&rt->dst);
+		return rt2;
+	} else {
+		err = PTR_ERR(rt2);
+		goto relookup_failed;
+	}
+	return rt;
+	/* A failed relookup still returns the first route if there is one. */
+relookup_failed:
+	if (rt)
+		return rt;
+	return ERR_PTR(err);
+}
+
+/*
+ *	Send an ICMP message in response to a situation
+ *
+ *	RFC 1122: 3.2.2	MUST send at least the IP header and 8 bytes of header.
+ *		  MAY send more (we do).
+ *			MUST NOT change this header information.
+ *			MUST NOT reply to a multicast/broadcast IP address.
+ *			MUST NOT reply to a multicast/broadcast MAC address.
+ *			MUST reply to only the first fragment.
+ */
+
+void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
+		 const struct ip_options *opt)
+{
+	struct iphdr *iph;
+	int room;
+	struct icmp_bxm icmp_param;
+	struct rtable *rt = skb_rtable(skb_in);
+	struct ipcm_cookie ipc;
+	struct flowi4 fl4;
+	__be32 saddr;
+	u8  tos;
+	u32 mark;
+	struct net *net;
+	struct sock *sk;
+	/* Nothing is sent when the offending packet carries no route. */
+	if (!rt)
+		goto out;
+
+	if (rt->dst.dev)
+		net = dev_net(rt->dst.dev);
+	else if (skb_in->dev)
+		net = dev_net(skb_in->dev);
+	else
+		goto out;
+
+	/*
+	 *	Find the original header. It is expected to be valid, of course.
+	 *	Check this, icmp_send is called from the most obscure devices
+	 *	sometimes.
+	 */
+	iph = ip_hdr(skb_in);
+
+	if ((u8 *)iph < skb_in->head ||
+	    (skb_network_header(skb_in) + sizeof(*iph)) >
+	    skb_tail_pointer(skb_in))
+		goto out;
+
+	/*
+	 *	No replies to physical multicast/broadcast
+	 */
+	if (skb_in->pkt_type != PACKET_HOST)
+		goto out;
+
+	/*
+	 *	Now check at the protocol level
+	 */
+	if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		goto out;
+
+	/*
+	 *	Only reply to fragment 0. We byte re-order the constant
+	 *	mask for efficiency.
+	 */
+	if (iph->frag_off & htons(IP_OFFSET))
+		goto out;
+
+	/*
+	 *	If we send an ICMP error to an ICMP error a mess would result..
+	 */
+	if (icmp_pointers[type].error) {
+		/*
+		 *	We are an error, check if we are replying to an
+		 *	ICMP error
+		 */
+		if (iph->protocol == IPPROTO_ICMP) {
+			u8 _inner_type, *itp;
+
+			itp = skb_header_pointer(skb_in,
+						 skb_network_header(skb_in) +
+						 (iph->ihl << 2) +
+						 offsetof(struct icmphdr,
+							  type) -
+						 skb_in->data,
+						 sizeof(_inner_type),
+						 &_inner_type);
+			if (!itp)
+				goto out;
+
+			/*
+			 *	Assume any unknown ICMP type is an error. This
+			 *	isn't specified by the RFC, but think about it..
+			 */
+			if (*itp > NR_ICMP_TYPES ||
+			    icmp_pointers[*itp].error)
+				goto out;
+		}
+	}
+
+	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	local_bh_disable();
+
+	/* Check global sysctl_icmp_msgs_per_sec ratelimit, unless
+	 * incoming dev is loopback.  If outgoing dev change to not be
+	 * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow)
+	 */
+	if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) &&
+	      !icmpv4_global_allow(net, type, code))
+		goto out_bh_enable;
+
+	sk = icmp_xmit_lock(net);
+	if (!sk)
+		goto out_bh_enable;
+
+	/*
+	 *	Construct source address and options.
+	 */
+
+	saddr = iph->daddr;
+	if (!(rt->rt_flags & RTCF_LOCAL)) {
+		struct net_device *dev = NULL;
+
+		rcu_read_lock();
+		if (rt_is_input_route(rt) &&
+		    READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr))
+			dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
+
+		if (dev)
+			saddr = inet_select_addr(dev, iph->saddr,
+						 RT_SCOPE_LINK);
+		else
+			saddr = 0;
+		rcu_read_unlock();
+	}
+	/* Errors get internet-control precedence; others reuse iph->tos. */
+	tos = icmp_pointers[type].error ? (RT_TOS(iph->tos) |
+					   IPTOS_PREC_INTERNETCONTROL) :
+					   iph->tos;
+	mark = IP4_REPLY_MARK(net, skb_in->mark);
+
+	if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt))
+		goto out_unlock;
+
+
+	/*
+	 *	Prepare data for ICMP header.
+	 */
+
+	icmp_param.data.icmph.type	 = type;
+	icmp_param.data.icmph.code	 = code;
+	icmp_param.data.icmph.un.gateway = info;
+	icmp_param.data.icmph.checksum	 = 0;
+	icmp_param.skb	  = skb_in;
+	icmp_param.offset = skb_network_offset(skb_in);
+	inet_sk(sk)->tos = tos;
+	ipcm_init(&ipc);
+	ipc.addr = iph->saddr;
+	ipc.opt = &icmp_param.replyopts.opt;
+	ipc.sockc.mark = mark;
+
+	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
+			       type, code, &icmp_param);
+	if (IS_ERR(rt))
+		goto out_unlock;
+
+	/* peer icmp_ratelimit */
+	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
+		goto ende;
+
+	/* RFC says return as much as we can without exceeding 576 bytes. */
+
+	room = dst_mtu(&rt->dst);
+	if (room > 576)
+		room = 576;
+	room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
+	room -= sizeof(struct icmphdr);
+	/* Guard against tiny mtu. We need to include at least one
+	 * IP network header for this message to make any sense.
+	 */
+	if (room <= (int)sizeof(struct iphdr))
+		goto ende;
+	/* Quote the original packet from its network header, up to room. */
+	icmp_param.data_len = skb_in->len - icmp_param.offset;
+	if (icmp_param.data_len > room)
+		icmp_param.data_len = room;
+	icmp_param.head_len = sizeof(struct icmphdr);
+
+	/* if we don't have a source address at this point, fall back to the
+	 * dummy address instead of sending out a packet with a source address
+	 * of 0.0.0.0
+	 */
+	if (!fl4.saddr)
+		fl4.saddr = htonl(INADDR_DUMMY);
+
+	icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
+ende:
+	ip_rt_put(rt);
+out_unlock:
+	icmp_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
+out:;
+}
+EXPORT_SYMBOL(__icmp_send);
+
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_conntrack.h>
+void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+{
+	struct sk_buff *cloned_skb = NULL;
+	struct ip_options opts = { 0 };
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	__be32 orig_ip;
+	/* Without a conntrack entry flagged IPS_SRC_NAT, send unmodified. */
+	ct = nf_ct_get(skb_in, &ctinfo);
+	if (!ct || !(ct->status & IPS_SRC_NAT)) {
+		__icmp_send(skb_in, type, code, info, &opts);
+		return;
+	}
+	/* The IP header is edited below, so a shared skb is cloned first. */
+	if (skb_shared(skb_in))
+		skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
+
+	if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
+	    (skb_network_header(skb_in) + sizeof(struct iphdr)) >
+	    skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
+	    skb_network_offset(skb_in) + sizeof(struct iphdr))))
+		goto out;
+	/* Send with saddr swapped for the tuplehash[0] source, then restore. */
+	orig_ip = ip_hdr(skb_in)->saddr;
+	ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip;
+	__icmp_send(skb_in, type, code, info, &opts);
+	ip_hdr(skb_in)->saddr = orig_ip;
+out:
+	consume_skb(cloned_skb);
+}
+EXPORT_SYMBOL(icmp_ndo_send);
+#endif
+
+static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *inner = (const struct iphdr *)skb->data;
+	const struct net_protocol *ipprot;
+	int protocol = inner->protocol;
+
+	/* Check for the full IP header plus 8 bytes of protocol up front
+	 * to avoid additional coding at protocol handlers.
+	 */
+	if (!pskb_may_pull(skb, inner->ihl * 4 + 8)) {
+		__ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
+		return;
+	}
+
+	/* raw_icmp_error() first, then the registered protocol's handler. */
+	raw_icmp_error(skb, protocol, info);
+	ipprot = rcu_dereference(inet_protos[protocol]);
+	if (ipprot && ipprot->err_handler)
+		ipprot->err_handler(skb, info);
+}
+
+static bool icmp_tag_validation(int proto)
+{
+	const struct net_protocol *ipprot;
+	bool ok;
+
+	rcu_read_lock();
+	ipprot = rcu_dereference(inet_protos[proto]);
+	/* Unregistered protocols have a NULL slot: treat them as not strict. */
+	ok = ipprot && ipprot->icmp_strict_tag_validation;
+	rcu_read_unlock();
+	return ok;
+}
+
+/* Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and ICMP_PARAMETERPROB. */
+
+static enum skb_drop_reason icmp_unreach(struct sk_buff *skb)
+{
+	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
+	const struct iphdr *iph;
+	struct icmphdr *icmph;
+	struct net *net;
+	u32 info = 0;
+
+	net = dev_net(skb_dst(skb)->dev);
+
+	/*
+	 *	Incomplete header ?
+	 *	Only checks for the IP header, there should be an
+	 *	additional check for longer headers in upper levels.
+	 */
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out_err;
+	/* skb->data is read as the IP header quoted by the ICMP message. */
+	icmph = icmp_hdr(skb);
+	iph   = (const struct iphdr *)skb->data;
+
+	if (iph->ihl < 5)  { /* Mangled header, drop. */
+		reason = SKB_DROP_REASON_IP_INHDR;
+		goto out_err;
+	}
+
+	switch (icmph->type) {
+	case ICMP_DEST_UNREACH:
+		switch (icmph->code & 15) {
+		case ICMP_NET_UNREACH:
+		case ICMP_HOST_UNREACH:
+		case ICMP_PROT_UNREACH:
+		case ICMP_PORT_UNREACH:
+			break;
+		case ICMP_FRAG_NEEDED:
+			/* for documentation of the ip_no_pmtu_disc
+			 * values please see
+			 * Documentation/networking/ip-sysctl.rst
+			 */
+			switch (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) {
+			default:
+				net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n",
+						    &iph->daddr);
+				break;
+			case 2:
+				goto out;
+			case 3:
+				if (!icmp_tag_validation(iph->protocol))
+					goto out;
+				fallthrough;
+			case 0:
+				info = ntohs(icmph->un.frag.mtu);
+			}
+			break;
+		case ICMP_SR_FAILED:
+			net_dbg_ratelimited("%pI4: Source Route Failed\n",
+					    &iph->daddr);
+			break;
+		default:
+			break;
+		}
+		if (icmph->code > NR_ICMP_UNREACH)
+			goto out;
+		break;
+	case ICMP_PARAMETERPROB:
+		info = ntohl(icmph->un.gateway) >> 24;
+		break;
+	case ICMP_TIME_EXCEEDED:
+		__ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS);
+		if (icmph->code == ICMP_EXC_FRAGTIME)
+			goto out;
+		break;
+	}
+
+	/*
+	 *	Throw it at our lower layers
+	 *
+	 *	RFC 1122: 3.2.2 MUST extract the protocol ID from the passed
+	 *		  header.
+	 *	RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the
+	 *		  transport layer.
+	 *	RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to
+	 *		  transport layer.
+	 */
+
+	/*
+	 *	Check the other end isn't violating RFC 1122. Some routers send
+	 *	bogus responses to broadcast frames. If you see this message
+	 *	first check your netmask matches at both ends, if it does then
+	 *	get the other vendor to fix their kit.
+	 */
+
+	if (!READ_ONCE(net->ipv4.sysctl_icmp_ignore_bogus_error_responses) &&
+	    inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) {
+		net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
+				     &ip_hdr(skb)->saddr,
+				     icmph->type, icmph->code,
+				     &iph->daddr, skb->dev->name);
+		goto out;
+	}
+
+	icmp_socket_deliver(skb, info);
+
+out:
+	return reason;
+out_err:
+	__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+	return reason ?: SKB_DROP_REASON_NOT_SPECIFIED;
+}
+
+
+/*
+ *	Handle ICMP_REDIRECT.
+ *
+ *	The payload must hold at least a basic IP header. The gateway
+ *	word of the ICMP header is passed on, in host byte order, as the
+ *	info argument of icmp_socket_deliver().
+ */
+
+static enum skb_drop_reason icmp_redirect(struct sk_buff *skb)
+{
+	u32 gateway;
+
+	/* Payload too short to contain an IP header at all. */
+	if (skb->len < sizeof(struct iphdr)) {
+		__ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
+		return SKB_DROP_REASON_PKT_TOO_SMALL;
+	}
+
+	/* there ought to be a stat */
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		return SKB_DROP_REASON_NOMEM;
+
+	gateway = ntohl(icmp_hdr(skb)->un.gateway);
+	icmp_socket_deliver(skb, gateway);
+
+	return SKB_NOT_DROPPED_YET;
+}
+
+/*
+ *	Handle ICMP_ECHO ("ping") and ICMP_EXT_ECHO ("PROBE") requests.
+ *
+ *	RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
+ *		  requests.
+ *	RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be
+ *		  included in the reply.
+ *	RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
+ *		  echo requests, MUST have default=NOT.
+ *	RFC 8335: 8 MUST have a config option to enable/disable ICMP
+ *		  Extended Echo Functionality, MUST be disabled by default
+ *	See also WRT handling of options once they are done and working.
+ *
+ *	Always returns SKB_NOT_DROPPED_YET, including when the request is
+ *	ignored because of a sysctl setting.
+ */
+
+static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	struct icmp_bxm icmp_param;
+
+	/* should there be an ICMP stat for ignored echos? */
+	if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all))
+		return SKB_NOT_DROPPED_YET;
+
+	/* Start the reply from a copy of the request header; the whole
+	 * remaining payload is echoed back.
+	 */
+	icmp_param.data.icmph = *icmp_hdr(skb);
+	icmp_param.skb = skb;
+	icmp_param.offset = 0;
+	icmp_param.data_len = skb->len;
+	icmp_param.head_len = sizeof(struct icmphdr);
+
+	if (icmp_param.data.icmph.type != ICMP_ECHO) {
+		/* PROBE: the helper rewrites the header or declines */
+		if (!icmp_build_probe(skb, &icmp_param.data.icmph))
+			return SKB_NOT_DROPPED_YET;
+	} else {
+		icmp_param.data.icmph.type = ICMP_ECHOREPLY;
+	}
+
+	icmp_reply(&icmp_param, skb);
+	return SKB_NOT_DROPPED_YET;
+}
+
+/*	Helper for icmp_echo and icmpv6_echo_reply.
+ *	Searches for net_device that matches PROBE interface identifier
+ *		and builds PROBE reply message in icmphdr.
+ *
+ *	@skb:     request; the extension header is read from offset 0
+ *	@icmphdr: copy of the request header, rewritten in place into the
+ *		  reply (type, code and the state bits in un.echo.sequence)
+ *
+ *	Returns false if PROBE responses are disabled via sysctl or the
+ *	request does not have the L-bit set; in that case no reply should
+ *	be sent. Returns true otherwise, with icmphdr->code set to
+ *	ICMP_EXT_CODE_MAL_QUERY for a malformed query or
+ *	ICMP_EXT_CODE_NO_IF when no matching interface exists.
+ */
+
+bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
+{
+	struct icmp_ext_hdr *ext_hdr, _ext_hdr;
+	struct icmp_ext_echo_iio *iio, _iio;
+	struct net *net = dev_net(skb->dev);
+	struct inet6_dev *in6_dev;
+	struct in_device *in_dev;
+	struct net_device *dev;
+	char buff[IFNAMSIZ];
+	u16 ident_len;
+	u8 status;
+
+	if (!READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe))
+		return false;
+
+	/* We currently only support probing interfaces on the proxy node
+	 * Check to ensure L-bit is set
+	 */
+	if (!(ntohs(icmphdr->un.echo.sequence) & 1))
+		return false;
+	/* Clear status bits in reply message */
+	icmphdr->un.echo.sequence &= htons(0xFF00);
+	if (icmphdr->type == ICMP_EXT_ECHO)
+		icmphdr->type = ICMP_EXT_ECHOREPLY;
+	else
+		icmphdr->type = ICMPV6_EXT_ECHO_REPLY;
+	ext_hdr = skb_header_pointer(skb, 0, sizeof(_ext_hdr), &_ext_hdr);
+	/* Size of iio is class_type dependent.
+	 * Only check header here and assign length based on ctype in the switch statement
+	 */
+	iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr), &_iio);
+	if (!ext_hdr || !iio)
+		goto send_mal_query;
+	/* The object must carry an identifier and must fit in _iio. */
+	if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr) ||
+	    ntohs(iio->extobj_hdr.length) > sizeof(_iio))
+		goto send_mal_query;
+	ident_len = ntohs(iio->extobj_hdr.length) - sizeof(iio->extobj_hdr);
+	/* Re-fetch the object, this time including the identifier. */
+	iio = skb_header_pointer(skb, sizeof(_ext_hdr),
+				 sizeof(iio->extobj_hdr) + ident_len, &_iio);
+	if (!iio)
+		goto send_mal_query;
+
+	status = 0;
+	dev = NULL;
+	switch (iio->extobj_hdr.class_type) {
+	case ICMP_EXT_ECHO_CTYPE_NAME:
+		if (ident_len >= IFNAMSIZ)
+			goto send_mal_query;
+		/* zero-fill so the copied name is always NUL terminated */
+		memset(buff, 0, sizeof(buff));
+		memcpy(buff, &iio->ident.name, ident_len);
+		dev = dev_get_by_name(net, buff);
+		break;
+	case ICMP_EXT_ECHO_CTYPE_INDEX:
+		if (ident_len != sizeof(iio->ident.ifindex))
+			goto send_mal_query;
+		dev = dev_get_by_index(net, ntohl(iio->ident.ifindex));
+		break;
+	case ICMP_EXT_ECHO_CTYPE_ADDR:
+		if (ident_len < sizeof(iio->ident.addr.ctype3_hdr) ||
+		    ident_len != sizeof(iio->ident.addr.ctype3_hdr) +
+				 iio->ident.addr.ctype3_hdr.addrlen)
+			goto send_mal_query;
+		switch (ntohs(iio->ident.addr.ctype3_hdr.afi)) {
+		case ICMP_AFI_IP:
+			if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in_addr))
+				goto send_mal_query;
+			dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr);
+			break;
+#if IS_ENABLED(CONFIG_IPV6)
+		case ICMP_AFI_IP6:
+			if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr))
+				goto send_mal_query;
+			dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev);
+			/* take the reference that dev_put() below releases */
+			dev_hold(dev);
+			break;
+#endif
+		default:
+			goto send_mal_query;
+		}
+		break;
+	default:
+		goto send_mal_query;
+	}
+	if (!dev) {
+		icmphdr->code = ICMP_EXT_CODE_NO_IF;
+		return true;
+	}
+	/* Fill bits in reply message */
+	if (dev->flags & IFF_UP)
+		status |= ICMP_EXT_ECHOREPLY_ACTIVE;
+
+	/* Fetch each per-protocol device pointer once and check it before
+	 * use: an interface may have no IPv4 or no IPv6 state attached.
+	 */
+	in_dev = __in_dev_get_rcu(dev);
+	if (in_dev && rcu_access_pointer(in_dev->ifa_list))
+		status |= ICMP_EXT_ECHOREPLY_IPV4;
+
+	in6_dev = rcu_dereference(dev->ip6_ptr);
+	if (in6_dev && !list_empty(&in6_dev->addr_list))
+		status |= ICMP_EXT_ECHOREPLY_IPV6;
+
+	dev_put(dev);
+	icmphdr->un.echo.sequence |= htons(status);
+	return true;
+send_mal_query:
+	icmphdr->code = ICMP_EXT_CODE_MAL_QUERY;
+	return true;
+}
+EXPORT_SYMBOL_GPL(icmp_build_probe);
+
+/*
+ *	Handle ICMP Timestamp requests.
+ *	RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
+ *		  SHOULD be in the kernel for minimum random latency.
+ *		  MUST be accurate to a few minutes.
+ *		  MUST be updated at least at 15Hz.
+ *
+ *	Returns SKB_DROP_REASON_PKT_TOO_SMALL (and bumps ICMP_MIB_INERRORS)
+ *	when fewer than 4 payload bytes are present, SKB_NOT_DROPPED_YET
+ *	once the reply has been handed to icmp_reply().
+ */
+static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
+{
+	struct icmp_bxm icmp_param;
+
+	/* Too short to carry the 4 bytes copied into times[0] below. */
+	if (skb->len < 4) {
+		__ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
+		return SKB_DROP_REASON_PKT_TOO_SMALL;
+	}
+
+	/*
+	 *	Fill in the current time as ms since midnight UT:
+	 *	the same stamp is used for times[1] and times[2].
+	 */
+	icmp_param.data.times[1] = inet_current_timestamp();
+	icmp_param.data.times[2] = icmp_param.data.times[1];
+
+	/* times[0] is copied verbatim from the request */
+	BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4));
+
+	icmp_param.data.icmph = *icmp_hdr(skb);
+	icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
+	icmp_param.data.icmph.code = 0;
+
+	/* The reply is header plus the three 4-byte stamps, no payload. */
+	icmp_param.skb = skb;
+	icmp_param.offset = 0;
+	icmp_param.data_len = 0;
+	icmp_param.head_len = sizeof(struct icmphdr) + 12;
+
+	icmp_reply(&icmp_param, skb);
+	return SKB_NOT_DROPPED_YET;
+}
+
+/*
+ *	Handler for ICMP types that are accepted but not acted upon.
+ *	Does nothing with @skb and reports SKB_NOT_DROPPED_YET.
+ */
+static enum skb_drop_reason icmp_discard(struct sk_buff *skb)
+{
+	/* pretend it was a success */
+	return SKB_NOT_DROPPED_YET;
+}
+
+/*
+ *	Deal with incoming ICMP packets.
+ *
+ *	Runs the xfrm policy check, validates the checksum, pulls the ICMP
+ *	header and dispatches on the message type: extended echo requests
+ *	go to icmp_echo(), extended echo replies to ping_rcv(), everything
+ *	up to NR_ICMP_TYPES through the icmp_pointers[] table.
+ *
+ *	The skb is released on every path. Returns NET_RX_SUCCESS when the
+ *	handler reported SKB_NOT_DROPPED_YET, NET_RX_DROP otherwise.
+ */
+int icmp_rcv(struct sk_buff *skb)
+{
+	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	struct rtable *rt = skb_rtable(skb);
+	struct net *net = dev_net(rt->dst.dev);
+	struct icmphdr *icmph;
+
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+		struct sec_path *sp = skb_sec_path(skb);
+		int nh;
+
+		/* Only carry on if the last xfrm state in the sec path
+		 * has XFRM_STATE_ICMP set.
+		 */
+		if (!(sp && sp->xvec[sp->len - 1]->props.flags &
+				 XFRM_STATE_ICMP)) {
+			reason = SKB_DROP_REASON_XFRM_POLICY;
+			goto drop;
+		}
+
+		if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
+			goto drop;
+
+		/* Temporarily point the network header just past the ICMP
+		 * header for the reverse policy check, then restore it.
+		 */
+		nh = skb_network_offset(skb);
+		skb_set_network_header(skb, sizeof(*icmph));
+
+		if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN,
+						skb)) {
+			reason = SKB_DROP_REASON_XFRM_POLICY;
+			goto drop;
+		}
+
+		skb_set_network_header(skb, nh);
+	}
+
+	__ICMP_INC_STATS(net, ICMP_MIB_INMSGS);
+
+	if (skb_checksum_simple_validate(skb))
+		goto csum_error;
+
+	/* From here on skb->data points past the ICMP header. */
+	if (!pskb_pull(skb, sizeof(*icmph)))
+		goto error;
+
+	icmph = icmp_hdr(skb);
+
+	ICMPMSGIN_INC_STATS(net, icmph->type);
+
+	/* Check for ICMP Extended Echo (PROBE) messages */
+	if (icmph->type == ICMP_EXT_ECHO) {
+		/* We can't use icmp_pointers[].handler() because it is an array of
+		 * size NR_ICMP_TYPES + 1 (19 elements) and PROBE has type 42.
+		 */
+		reason = icmp_echo(skb);
+		goto reason_check;
+	}
+
+	if (icmph->type == ICMP_EXT_ECHOREPLY) {
+		reason = ping_rcv(skb);
+		goto reason_check;
+	}
+
+	/*
+	 *	18 is the highest 'known' ICMP type. Anything else is a mystery
+	 *
+	 *	RFC 1122: 3.2.2  Unknown ICMP messages types MUST be silently
+	 *		  discarded.
+	 */
+	if (icmph->type > NR_ICMP_TYPES) {
+		reason = SKB_DROP_REASON_UNHANDLED_PROTO;
+		goto error;
+	}
+
+	/*
+	 *	Parse the ICMP message
+	 */
+
+	if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+		/*
+		 *	RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
+		 *	  silently ignored (we let user decide with a sysctl).
+		 *	RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
+		 *	  discarded if to broadcast/multicast.
+		 */
+		if ((icmph->type == ICMP_ECHO ||
+		     icmph->type == ICMP_TIMESTAMP) &&
+		    READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_broadcasts)) {
+			reason = SKB_DROP_REASON_INVALID_PROTO;
+			goto error;
+		}
+		/* Only these four types are accepted on a broadcast or
+		 * multicast route.
+		 */
+		if (icmph->type != ICMP_ECHO &&
+		    icmph->type != ICMP_TIMESTAMP &&
+		    icmph->type != ICMP_ADDRESS &&
+		    icmph->type != ICMP_ADDRESSREPLY) {
+			reason = SKB_DROP_REASON_INVALID_PROTO;
+			goto error;
+		}
+	}
+
+	reason = icmp_pointers[icmph->type].handler(skb);
+reason_check:
+	/* A zero reason (SKB_NOT_DROPPED_YET) means the handler was happy. */
+	if (!reason)  {
+		consume_skb(skb);
+		return NET_RX_SUCCESS;
+	}
+
+drop:
+	kfree_skb_reason(skb, reason);
+	return NET_RX_DROP;
+csum_error:
+	reason = SKB_DROP_REASON_ICMP_CSUM;
+	__ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
+	/* a checksum failure also counts as a general input error */
+error:
+	__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+	goto drop;
+}
+
+/*
+ * Sanity-check the ICMP extension structure that starts @off bytes into
+ * @skb. Returns false when the extension header cannot be read, its
+ * checksum does not verify, or an object header is truncated, shorter
+ * than an object header, or runs past the end of the packet. A header
+ * whose version is not 2 is accepted without further inspection.
+ */
+static bool ip_icmp_error_rfc4884_validate(const struct sk_buff *skb, int off)
+{
+	struct icmp_extobj_hdr *objh, _objh;
+	struct icmp_ext_hdr *exth, _exth;
+	u16 olen;
+
+	exth = skb_header_pointer(skb, off, sizeof(_exth), &_exth);
+	if (!exth)
+		return false;
+
+	/* Nothing more to check for other versions. */
+	if (exth->version != 2)
+		return true;
+
+	/* A zero checksum field means "not computed". */
+	if (exth->checksum) {
+		if (csum_fold(skb_checksum(skb, off, skb->len - off, 0)))
+			return false;
+	}
+
+	/* Walk the objects that follow the extension header. */
+	for (off += sizeof(_exth); off < skb->len; off += olen) {
+		objh = skb_header_pointer(skb, off, sizeof(_objh), &_objh);
+		if (!objh)
+			return false;
+
+		olen = ntohs(objh->length);
+		if (olen < sizeof(_objh))
+			return false;
+
+		/* object must end inside the packet */
+		if (off + olen > skb->len)
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Fill @out with RFC 4884 extension information for the ICMP error in
+ * @skb. @off is the length of the original datagram as reported in the
+ * ICMP header, @thlen the transport header length. @out is left
+ * untouched when @off is below the RFC 4884 minimum or the extension
+ * header would not fit in the packet; otherwise out->len is set and
+ * SO_EE_RFC4884_FLAG_INVALID is raised if validation fails.
+ */
+void ip_icmp_error_rfc4884(const struct sk_buff *skb,
+			   struct sock_ee_data_rfc4884 *out,
+			   int thlen, int off)
+{
+	/* original datagram headers: end of icmph to payload (skb->data) */
+	const int hlen = -skb_transport_offset(skb) - thlen;
+
+	/* per rfc 4884: minimal datagram length of 128 bytes */
+	if (off < 128)
+		return;
+	if (off < hlen)
+		return;
+
+	/* kernel has stripped headers: return payload offset in bytes */
+	off -= hlen;
+	if (off + sizeof(struct icmp_ext_hdr) > skb->len)
+		return;
+
+	out->len = off;
+
+	if (ip_icmp_error_rfc4884_validate(skb, off))
+		return;
+
+	out->flags |= SO_EE_RFC4884_FLAG_INVALID;
+}
+EXPORT_SYMBOL_GPL(ip_icmp_error_rfc4884);
+
+/*
+ * Error handler for ICMP errors that quote an ICMP packet. skb->data
+ * points at the quoted IP header; icmp_hdr(skb) is the header of the
+ * error message itself. Always returns 0.
+ */
+int icmp_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *quoted_ip = (struct iphdr *)skb->data;
+	const int offset = quoted_ip->ihl << 2;
+	const struct icmphdr *quoted_icmp =
+		(struct icmphdr *)(skb->data + offset);
+	const struct icmphdr *err_icmp = icmp_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+
+	/*
+	 * Use ping_err to handle all icmp errors except those
+	 * triggered by ICMP_ECHOREPLY which sent from kernel.
+	 */
+	if (quoted_icmp->type != ICMP_ECHOREPLY) {
+		ping_err(skb, offset, info);
+		return 0;
+	}
+
+	switch (err_icmp->type) {
+	case ICMP_DEST_UNREACH:
+		if (err_icmp->code == ICMP_FRAG_NEEDED)
+			ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
+		break;
+	case ICMP_REDIRECT:
+		ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ *	This table is the definition of how we handle ICMP.
+ *
+ *	It is indexed by ICMP type (0..NR_ICMP_TYPES); icmp_rcv() rejects
+ *	larger types before the lookup and calls .handler for the rest.
+ *	Types without a symbolic name here (1, 2, 6, 7, 9, 10) are routed
+ *	to icmp_discard(). .error is set on the entries below that are
+ *	flagged as error messages; its consumers are outside this part of
+ *	the file.
+ */
+static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
+	[ICMP_ECHOREPLY] = {
+		.handler = ping_rcv,
+	},
+	[1] = {
+		.handler = icmp_discard,
+		.error = 1,
+	},
+	[2] = {
+		.handler = icmp_discard,
+		.error = 1,
+	},
+	[ICMP_DEST_UNREACH] = {
+		.handler = icmp_unreach,
+		.error = 1,
+	},
+	[ICMP_SOURCE_QUENCH] = {
+		.handler = icmp_unreach,
+		.error = 1,
+	},
+	[ICMP_REDIRECT] = {
+		.handler = icmp_redirect,
+		.error = 1,
+	},
+	[6] = {
+		.handler = icmp_discard,
+		.error = 1,
+	},
+	[7] = {
+		.handler = icmp_discard,
+		.error = 1,
+	},
+	[ICMP_ECHO] = {
+		.handler = icmp_echo,
+	},
+	[9] = {
+		.handler = icmp_discard,
+		.error = 1,
+	},
+	[10] = {
+		.handler = icmp_discard,
+		.error = 1,
+	},
+	[ICMP_TIME_EXCEEDED] = {
+		.handler = icmp_unreach,
+		.error = 1,
+	},
+	[ICMP_PARAMETERPROB] = {
+		.handler = icmp_unreach,
+		.error = 1,
+	},
+	[ICMP_TIMESTAMP] = {
+		.handler = icmp_timestamp,
+	},
+	[ICMP_TIMESTAMPREPLY] = {
+		.handler = icmp_discard,
+	},
+	[ICMP_INFO_REQUEST] = {
+		.handler = icmp_discard,
+	},
+	[ICMP_INFO_REPLY] = {
+		.handler = icmp_discard,
+	},
+	[ICMP_ADDRESS] = {
+		.handler = icmp_discard,
+	},
+	[ICMP_ADDRESSREPLY] = {
+		.handler = icmp_discard,
+	},
+};
+
+/*
+ * Per-namespace initialisation: load the default values of the ICMP
+ * sysctl knobs into @net. Always returns 0.
+ */
+static int __net_init icmp_sk_init(struct net *net)
+{
+	/*
+	 *	Configurable global rate limit.
+	 *
+	 *	ratelimit defines tokens/packet consumed for dst->rate_token
+	 *	bucket ratemask defines which icmp types are ratelimited by
+	 *	setting its bit position.
+	 *
+	 *	default:
+	 *	dest unreachable (3), source quench (4),
+	 *	time exceeded (11), parameter problem (12)
+	 */
+	net->ipv4.sysctl_icmp_ratemask = 0x1818;
+	net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
+
+	/* Control parameter - ignore bogus broadcast responses? */
+	net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1;
+
+	/* Control parameters for ECHO replies. */
+	net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1;
+	net->ipv4.sysctl_icmp_echo_enable_probe = 0;
+	net->ipv4.sysctl_icmp_echo_ignore_all = 0;
+
+	net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+
+	return 0;
+}
+
+/* Per-namespace hooks; only .init is provided. */
+static struct pernet_operations __net_initdata icmp_sk_ops = {
+	.init	= icmp_sk_init,
+};
+
+/*
+ * Create one raw ICMP control socket per possible CPU in init_net,
+ * then register the per-namespace operations. Returns the first socket
+ * creation error, or the result of register_pernet_subsys().
+ */
+int __init icmp_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct sock *sk;
+		int err = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+					       IPPROTO_ICMP, &init_net);
+
+		if (err < 0)
+			return err;
+
+		per_cpu(ipv4_icmp_sk, cpu) = sk;
+
+		/* Enough space for 2 64K ICMP packets, including
+		 * sk_buff/skb_shared_info struct overhead.
+		 */
+		sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
+
+		/* Speedup sock_wfree() */
+		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
+	}
+
+	return register_pernet_subsys(&icmp_sk_ops);
+}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
new file mode 100644
index 0000000000..d515881d02
--- /dev/null
+++ b/net/ipv4/igmp.c
@@ -0,0 +1,3110 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *	Linux NET3:	Internet Group Management Protocol  [IGMP]
+ *
+ *	This code implements the IGMP protocol as defined in RFC1112. There has
+ *	been a further revision of this protocol since which is now supported.
+ *
+ *	If you have trouble with this module be careful what gcc you have used,
+ *	the older version didn't come out right using gcc 2.5.8, the newer one
+ *	seems to fall out with gcc 2.6.2.
+ *
+ *	Authors:
+ *		Alan Cox <alan@lxorguk.ukuu.org.uk>
+ *
+ *	Fixes:
+ *
+ *		Alan Cox	:	Added lots of __inline__ to optimise
+ *					the memory usage of all the tiny little
+ *					functions.
+ *		Alan Cox	:	Dumped the header building experiment.
+ *		Alan Cox	:	Minor tweaks ready for multicast routing
+ *					and extended IGMP protocol.
+ *		Alan Cox	:	Removed a load of inline directives. Gcc 2.5.8
+ *					writes utterly bogus code otherwise (sigh)
+ *					fixed IGMP loopback to behave in the manner
+ *					desired by mrouted, fixed the fact it has been
+ *					broken since 1.3.6 and cleaned up a few minor
+ *					points.
+ *
+ *		Chih-Jen Chang	:	Tried to revise IGMP to Version 2
+ *		Tsu-Sheng Tsao		E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
+ *					The enhancements are mainly based on Steve Deering's
+ * 					ipmulti-3.5 source code.
+ *		Chih-Jen Chang	:	Added the igmp_get_mrouter_info and
+ *		Tsu-Sheng Tsao		igmp_set_mrouter_info to keep track of
+ *					the mrouted version on that device.
+ *		Chih-Jen Chang	:	Added the max_resp_time parameter to
+ *		Tsu-Sheng Tsao		igmp_heard_query(). Using this parameter
+ *					to identify the multicast router version
+ *					and do what the IGMP version 2 specified.
+ *		Chih-Jen Chang	:	Added a timer to revert to IGMP V2 router
+ *		Tsu-Sheng Tsao		if the specified time expired.
+ *		Alan Cox	:	Stop IGMP from 0.0.0.0 being accepted.
+ *		Alan Cox	:	Use GFP_ATOMIC in the right places.
+ *		Christian Daudt :	igmp timer wasn't set for local group
+ *					memberships but was being deleted,
+ *					which caused a "del_timer() called
+ *					from %p with timer not initialized\n"
+ *					message (960131).
+ *		Christian Daudt :	removed del_timer from
+ *					igmp_timer_expire function (960205).
+ *             Christian Daudt :       igmp_heard_report now only calls
+ *                                     igmp_timer_expire if tm->running is
+ *                                     true (960216).
+ *		Malcolm Beattie :	ttl comparison wrong in igmp_rcv made
+ *					igmp_heard_query never trigger. Expiry
+ *					miscalculation fixed in igmp_heard_query
+ *					and random() made to return unsigned to
+ *					prevent negative expiry times.
+ *		Alexey Kuznetsov:	Wrong group leaving behaviour, backport
+ *					fix from pending 2.1.x patches.
+ *		Alan Cox:		Forget to enable FDDI support earlier.
+ *		Alexey Kuznetsov:	Fixed leaving groups on device down.
+ *		Alexey Kuznetsov:	Accordance to igmp-v2-06 draft.
+ *		David L Stevens:	IGMPv3 support, with help from
+ *					Vinay Kulkarni
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/times.h>
+#include <linux/pkt_sched.h>
+#include <linux/byteorder/generic.h>
+
+#include <net/net_namespace.h>
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/inet_common.h>
+#include <linux/netfilter_ipv4.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#endif
+
+#ifdef CONFIG_IP_MULTICAST
+/* Parameter names and values are taken from igmp-v2-06 draft */
+
+#define IGMP_QUERY_INTERVAL			(125*HZ)
+#define IGMP_QUERY_RESPONSE_INTERVAL		(10*HZ)
+
+#define IGMP_INITIAL_REPORT_DELAY		(1)
+
+/* IGMP_INITIAL_REPORT_DELAY is not from the IGMP specs!
+ * The IGMP specs require membership to be reported immediately after
+ * joining a group, but we delay the first report by a
+ * small interval. It seems more natural and still does not
+ * contradict the specs, provided this delay is small enough.
+ */
+
+/*
+ * True when IGMPv1 (resp. IGMPv2) is forced through the namespace-wide
+ * or per-device FORCE_IGMP_VERSION setting, or when the device's
+ * mr_v1_seen (resp. mr_v2_seen) stamp is set and still in the future.
+ * The argument is evaluated several times, so it must be free of side
+ * effects.
+ */
+#define IGMP_V1_SEEN(in_dev) \
+	(IPV4_DEVCONF_ALL(dev_net((in_dev)->dev), FORCE_IGMP_VERSION) == 1 || \
+	 IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
+	 ((in_dev)->mr_v1_seen && \
+	  time_before(jiffies, (in_dev)->mr_v1_seen)))
+#define IGMP_V2_SEEN(in_dev) \
+	(IPV4_DEVCONF_ALL(dev_net((in_dev)->dev), FORCE_IGMP_VERSION) == 2 || \
+	 IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
+	 ((in_dev)->mr_v2_seen && \
+	  time_before(jiffies, (in_dev)->mr_v2_seen)))
+
+/*
+ * Return the unsolicited report interval for @in_dev in jiffies. The
+ * IGMPv2 setting is used while a v1 or v2 querier is in effect, the
+ * IGMPv3 setting otherwise. The result is never less than 1.
+ */
+static int unsolicited_report_interval(struct in_device *in_dev)
+{
+	int interval_ms, interval_jiffies;
+
+	interval_ms = (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) ?
+		IN_DEV_CONF_GET(in_dev, IGMPV2_UNSOLICITED_REPORT_INTERVAL) :
+		IN_DEV_CONF_GET(in_dev, IGMPV3_UNSOLICITED_REPORT_INTERVAL);
+
+	interval_jiffies = msecs_to_jiffies(interval_ms);
+
+	/* _timer functions can't handle a delay of 0 jiffies so ensure
+	 *  we always return a positive value.
+	 */
+	return interval_jiffies > 0 ? interval_jiffies : 1;
+}
+
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im,
+			      gfp_t gfp);
+static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im);
+static void igmpv3_clear_delrec(struct in_device *in_dev);
+static int sf_setstate(struct ip_mc_list *pmc);
+static void sf_markstate(struct ip_mc_list *pmc);
+#endif
+static void ip_mc_clear_src(struct ip_mc_list *pmc);
+static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
+			 int sfcount, __be32 *psfsrc, int delta);
+
+/*
+ * Drop one reference on @im. When the count reaches zero, release the
+ * reference held on im->interface and free @im through kfree_rcu().
+ */
+static void ip_ma_put(struct ip_mc_list *im)
+{
+	if (!refcount_dec_and_test(&im->refcnt))
+		return;
+
+	in_dev_put(im->interface);
+	kfree_rcu(im, rcu);
+}
+
+/*
+ * Iterate over the multicast list of @in_dev, loading each link with
+ * rcu_dereference(). Both arguments are evaluated repeatedly.
+ */
+#define for_each_pmc_rcu(in_dev, pmc)				\
+	for ((pmc) = rcu_dereference((in_dev)->mc_list);	\
+	     (pmc) != NULL;					\
+	     (pmc) = rcu_dereference((pmc)->next_rcu))
+
+/*
+ * Same walk, loading each link with rtnl_dereference() instead.
+ */
+#define for_each_pmc_rtnl(in_dev, pmc)				\
+	for ((pmc) = rtnl_dereference((in_dev)->mc_list);	\
+	     (pmc) != NULL;					\
+	     (pmc) = rtnl_dereference((pmc)->next_rcu))
+
+/* Free every entry of the source-filter list starting at @psf. */
+static void ip_sf_list_clear_all(struct ip_sf_list *psf)
+{
+	struct ip_sf_list *cur, *next;
+
+	for (cur = psf; cur; cur = next) {
+		/* grab the link before the entry is freed */
+		next = cur->sf_next;
+		kfree(cur);
+	}
+}
+
+#ifdef CONFIG_IP_MULTICAST
+
+/*
+ *	Timer management
+ */
+
+/*
+ * Cancel the report timer of @im under im->lock and clear its
+ * tm_running, reporter and unsolicit_count fields. If the timer was
+ * still pending, one reference on @im is dropped.
+ */
+static void igmp_stop_timer(struct ip_mc_list *im)
+{
+	int was_pending;
+
+	spin_lock_bh(&im->lock);
+
+	was_pending = del_timer(&im->timer);
+	if (was_pending)
+		refcount_dec(&im->refcnt);
+
+	im->tm_running = 0;
+	im->reporter = 0;
+	im->unsolicit_count = 0;
+
+	spin_unlock_bh(&im->lock);
+}
+
+/*
+ * Arm the report timer of @im at a random delay below @max_delay (plus
+ * 2 jiffies). It must be called with locked im->lock.
+ *
+ * A reference is taken for the timer; if the timer was already pending
+ * that extra reference is given back, and nothing is armed when the
+ * refcount has already reached zero.
+ */
+static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
+{
+	int tv = get_random_u32_below(max_delay);
+
+	im->tm_running = 1;
+
+	if (!refcount_inc_not_zero(&im->refcnt))
+		return;
+
+	if (mod_timer(&im->timer, jiffies + tv + 2))
+		ip_ma_put(im);
+}
+
+/*
+ * Schedule the general-query timer of @in_dev at a random delay below
+ * in_dev->mr_maxdelay (plus 2 jiffies). A timer that is already running
+ * and expires no later than the new deadline is left alone. A device
+ * reference is taken when the timer was not pending before.
+ */
+static void igmp_gq_start_timer(struct in_device *in_dev)
+{
+	int tv = get_random_u32_below(in_dev->mr_maxdelay);
+	unsigned long exp = jiffies + tv + 2;
+
+	if (in_dev->mr_gq_running) {
+		/* the running timer already fires soon enough */
+		if (time_after_eq(exp, in_dev->mr_gq_timer.expires))
+			return;
+	}
+
+	in_dev->mr_gq_running = 1;
+
+	if (mod_timer(&in_dev->mr_gq_timer, exp))
+		return;
+
+	in_dev_hold(in_dev);
+}
+
+/*
+ * Schedule the interface-change timer of @in_dev at a random delay
+ * below @delay (plus 2 jiffies), taking a device reference when the
+ * timer was not pending before.
+ */
+static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
+{
+	int tv = get_random_u32_below(delay);
+
+	if (mod_timer(&in_dev->mr_ifc_timer, jiffies + tv + 2))
+		return;
+
+	in_dev_hold(in_dev);
+}
+
+/*
+ * Make sure the report timer of @im fires within @max_delay jiffies.
+ *
+ * Resets im->unsolicit_count. If a timer is pending and already due in
+ * less than @max_delay, it is put back unchanged. Otherwise a new
+ * random timer is started through igmp_start_timer(). Takes im->lock.
+ */
+static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
+{
+	spin_lock_bh(&im->lock);
+	im->unsolicit_count = 0;
+	if (del_timer(&im->timer)) {
+		if ((long)(im->timer.expires-jiffies) < max_delay) {
+			/* re-arm with the expiry it already had */
+			add_timer(&im->timer);
+			im->tm_running = 1;
+			spin_unlock_bh(&im->lock);
+			return;
+		}
+		/* give back the reference taken when that timer was armed */
+		refcount_dec(&im->refcnt);
+	}
+	igmp_start_timer(im, max_delay);
+	spin_unlock_bh(&im->lock);
+}
+
+
+/*
+ *	Send an IGMP report.
+ */
+
+/* Size of an IGMP header plus a basic IP header plus 4 extra bytes.
+ * NOTE(review): the 4 presumably matches the 4-byte Router Alert option
+ * that igmpv3_newpack() appends to the IP header - confirm at the use
+ * site.
+ */
+#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
+
+
+/*
+ * Decide whether source @psf of group @pmc belongs in a group record of
+ * the given IGMPv3 record @type. Returns non-zero if it does.
+ *
+ * @gdeleted and @sdeleted are flags passed through from add_grec(),
+ * which walks pmc->tomb instead of pmc->sources when @sdeleted is set.
+ * NOTE(review): presumably "group deleted" / "source deleted" - confirm
+ * against the callers of add_grec().
+ */
+static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
+	int gdeleted, int sdeleted)
+{
+	switch (type) {
+	case IGMPV3_MODE_IS_INCLUDE:
+	case IGMPV3_MODE_IS_EXCLUDE:
+		if (gdeleted || sdeleted)
+			return 0;
+		/* with pmc->gsquery set, only sources marked sf_gsresp
+		 * are considered
+		 */
+		if (!(pmc->gsquery && !psf->sf_gsresp)) {
+			if (pmc->sfmode == MCAST_INCLUDE)
+				return 1;
+			/* don't include if this source is excluded
+			 * in all filters
+			 */
+			if (psf->sf_count[MCAST_INCLUDE])
+				return type == IGMPV3_MODE_IS_INCLUDE;
+			return pmc->sfcount[MCAST_EXCLUDE] ==
+				psf->sf_count[MCAST_EXCLUDE];
+		}
+		return 0;
+	case IGMPV3_CHANGE_TO_INCLUDE:
+		if (gdeleted || sdeleted)
+			return 0;
+		return psf->sf_count[MCAST_INCLUDE] != 0;
+	case IGMPV3_CHANGE_TO_EXCLUDE:
+		if (gdeleted || sdeleted)
+			return 0;
+		if (pmc->sfcount[MCAST_EXCLUDE] == 0 ||
+		    psf->sf_count[MCAST_INCLUDE])
+			return 0;
+		/* listed only if every exclude filter names this source */
+		return pmc->sfcount[MCAST_EXCLUDE] ==
+			psf->sf_count[MCAST_EXCLUDE];
+	case IGMPV3_ALLOW_NEW_SOURCES:
+		if (gdeleted || !psf->sf_crcount)
+			return 0;
+		return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted;
+	case IGMPV3_BLOCK_OLD_SOURCES:
+		if (pmc->sfmode == MCAST_INCLUDE)
+			return gdeleted || (psf->sf_crcount && sdeleted);
+		return psf->sf_crcount && !gdeleted && !sdeleted;
+	}
+	/* unknown record type */
+	return 0;
+}
+
+/*
+ * Count the entries of pmc->sources that is_in() accepts for a record
+ * of @type with the given @gdeleted / @sdeleted flags.
+ */
+static int
+igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
+{
+	struct ip_sf_list *psf;
+	int scount = 0;
+
+	for (psf = pmc->sources; psf; psf = psf->sf_next) {
+		if (is_in(pmc, psf, type, gdeleted, sdeleted))
+			scount++;
+	}
+
+	return scount;
+}
+
+/* source address selection per RFC 3376 section 4.2.13
+ *
+ * Returns fl4->saddr if it is one of the local addresses configured on
+ * @dev, INADDR_ANY otherwise (including when @dev has no in_device).
+ */
+static __be32 igmpv3_get_srcaddr(struct net_device *dev,
+				 const struct flowi4 *fl4)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	const struct in_ifaddr *ifa;
+	__be32 src = htonl(INADDR_ANY);
+
+	if (in_dev) {
+		in_dev_for_each_ifa_rcu(ifa, in_dev) {
+			if (ifa->ifa_local == fl4->saddr) {
+				src = fl4->saddr;
+				break;
+			}
+		}
+	}
+
+	return src;
+}
+
+/*
+ * Allocate and pre-fill an skb for an IGMPv3 membership report on @dev.
+ *
+ * The skb is routed to IGMPV3_ALL_MCR and gets an IP header carrying a
+ * 4-byte Router Alert option, followed by an IGMPv3 report header with
+ * a zero record count. The IP total length and the IGMP checksum are
+ * left at 0 here. @mtu bounds the allocation size and the usable
+ * tailroom.
+ *
+ * Returns the new skb, or NULL if allocation or the route lookup fails.
+ */
+static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
+{
+	struct sk_buff *skb;
+	struct rtable *rt;
+	struct iphdr *pip;
+	struct igmpv3_report *pig;
+	struct net *net = dev_net(dev);
+	struct flowi4 fl4;
+	int hlen = LL_RESERVED_SPACE(dev);
+	int tlen = dev->needed_tailroom;
+	unsigned int size;
+
+	size = min(mtu, IP_MAX_MTU);
+	/* Retry with half the size on allocation failure, but give up
+	 * once the size drops below 256 bytes.
+	 */
+	while (1) {
+		skb = alloc_skb(size + hlen + tlen,
+				GFP_ATOMIC | __GFP_NOWARN);
+		if (skb)
+			break;
+		size >>= 1;
+		if (size < 256)
+			return NULL;
+	}
+	skb->priority = TC_PRIO_CONTROL;
+
+	rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
+				   0, 0,
+				   IPPROTO_IGMP, 0, dev->ifindex);
+	if (IS_ERR(rt)) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	skb_dst_set(skb, &rt->dst);
+	skb->dev = dev;
+
+	skb_reserve(skb, hlen);
+	skb_tailroom_reserve(skb, mtu, tlen);
+
+	skb_reset_network_header(skb);
+	pip = ip_hdr(skb);
+	/* basic IP header plus the 4 option bytes written below */
+	skb_put(skb, sizeof(struct iphdr) + 4);
+
+	pip->version  = 4;
+	pip->ihl      = (sizeof(struct iphdr)+4)>>2;
+	pip->tos      = 0xc0;
+	pip->frag_off = htons(IP_DF);
+	pip->ttl      = 1;
+	pip->daddr    = fl4.daddr;
+
+	rcu_read_lock();
+	pip->saddr    = igmpv3_get_srcaddr(dev, &fl4);
+	rcu_read_unlock();
+
+	pip->protocol = IPPROTO_IGMP;
+	pip->tot_len  = 0;	/* filled in later */
+	ip_select_ident(net, skb, NULL);
+	/* Router Alert option: type, length 4, two zero value bytes */
+	((u8 *)&pip[1])[0] = IPOPT_RA;
+	((u8 *)&pip[1])[1] = 4;
+	((u8 *)&pip[1])[2] = 0;
+	((u8 *)&pip[1])[3] = 0;
+
+	skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
+	skb_put(skb, sizeof(*pig));
+	pig = igmpv3_report_hdr(skb);
+	pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+	pig->resv1 = 0;
+	pig->csum = 0;
+	pig->resv2 = 0;
+	pig->ngrec = 0;
+	return skb;
+}
+
+/*
+ * Finalise and transmit a report built by igmpv3_newpack(): compute the
+ * IGMP checksum over everything from the transport header to the tail
+ * and pass the skb to ip_local_out(), whose result is returned.
+ */
+static int igmpv3_sendpack(struct sk_buff *skb)
+{
+	struct igmphdr *pig = igmp_hdr(skb);
+	const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb);
+	struct net *net;
+
+	pig->csum = ip_compute_csum(pig, igmplen);
+
+	net = dev_net(skb_dst(skb)->dev);
+	return ip_local_out(net, skb->sk, skb);
+}
+
+/*
+ * Bytes needed for a group record of @type for @pmc: the record header
+ * plus 4 bytes for each source counted by igmp_scount().
+ */
+static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
+{
+	int nsrcs = igmp_scount(pmc, type, gdel, sdel);
+
+	return sizeof(struct igmpv3_grec) + 4 * nsrcs;
+}
+
+/*
+ * Append an empty group record header of @type for @pmc to @skb,
+ * allocating a fresh report packet first when @skb is NULL, and bump
+ * the report's record count. The new record is returned through @ppgr.
+ *
+ * Returns the skb in use, or NULL if a new packet could not be
+ * allocated (in which case *@ppgr is not written).
+ */
+static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
+	int type, struct igmpv3_grec **ppgr, unsigned int mtu)
+{
+	struct net_device *dev = pmc->interface->dev;
+	struct igmpv3_report *report;
+	struct igmpv3_grec *rec;
+
+	if (!skb)
+		skb = igmpv3_newpack(dev, mtu);
+	if (!skb)
+		return NULL;
+
+	rec = skb_put(skb, sizeof(struct igmpv3_grec));
+	rec->grec_type = type;
+	rec->grec_auxwords = 0;
+	rec->grec_nsrcs = 0;
+	rec->grec_mca = pmc->multiaddr;
+
+	/* one more record in this report */
+	report = igmpv3_report_hdr(skb);
+	report->ngrec = htons(ntohs(report->ngrec) + 1);
+
+	*ppgr = rec;
+	return skb;
+}
+
+/* Room left in @skb as reported by skb_availroom(); 0 for a NULL skb. */
+#define AVAILABLE(skb)	((skb) ? skb_availroom(skb) : 0)
+
+/*
+ * Append a group record of @type for @pmc to the report in @skb.
+ *
+ * Sources are taken from pmc->tomb when @sdeleted is set, otherwise
+ * from pmc->sources, and filtered through is_in(). When the current
+ * packet is full it is sent with igmpv3_sendpack() and a new one is
+ * started; for MODE_IS_EXCLUDE and CHANGE_TO_EXCLUDE records the
+ * sources that do not fit are dropped instead.
+ *
+ * Nothing is added for IGMP_ALL_HOSTS, for link-local groups while
+ * sysctl_igmp_llm_reports is off, or when the device MTU is below
+ * IPV4_MIN_MTU; @skb is returned unchanged in those cases.
+ *
+ * Returns the skb currently being filled, which may be NULL.
+ * igmpv3_send_report() calls this with pmc->lock held.
+ */
+static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
+	int type, int gdeleted, int sdeleted)
+{
+	struct net_device *dev = pmc->interface->dev;
+	struct net *net = dev_net(dev);
+	struct igmpv3_report *pih;
+	struct igmpv3_grec *pgr = NULL;
+	struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
+	int scount, stotal, first, isquery, truncate;
+	unsigned int mtu;
+
+	if (pmc->multiaddr == IGMP_ALL_HOSTS)
+		return skb;
+	if (ipv4_is_local_multicast(pmc->multiaddr) &&
+	    !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+		return skb;
+
+	mtu = READ_ONCE(dev->mtu);
+	if (mtu < IPV4_MIN_MTU)
+		return skb;
+
+	isquery = type == IGMPV3_MODE_IS_INCLUDE ||
+		  type == IGMPV3_MODE_IS_EXCLUDE;
+	truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
+		    type == IGMPV3_CHANGE_TO_EXCLUDE;
+
+	/* scount: sources in the current record; stotal: sources overall */
+	stotal = scount = 0;
+
+	psf_list = sdeleted ? &pmc->tomb : &pmc->sources;
+
+	if (!*psf_list)
+		goto empty_source;
+
+	pih = skb ? igmpv3_report_hdr(skb) : NULL;
+
+	/* EX and TO_EX get a fresh packet, if needed */
+	if (truncate) {
+		if (pih && pih->ngrec &&
+		    AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
+			if (skb)
+				igmpv3_sendpack(skb);
+			skb = igmpv3_newpack(dev, mtu);
+		}
+	}
+	first = 1;
+	psf_prev = NULL;
+	for (psf = *psf_list; psf; psf = psf_next) {
+		__be32 *psrc;
+
+		/* psf may be freed below, so fetch the link up front */
+		psf_next = psf->sf_next;
+
+		if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+			psf_prev = psf;
+			continue;
+		}
+
+		/* Based on RFC3376 5.1. Should not send source-list change
+		 * records when there is a filter mode change.
+		 */
+		if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) ||
+		     (!gdeleted && pmc->crcount)) &&
+		    (type == IGMPV3_ALLOW_NEW_SOURCES ||
+		     type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount)
+			goto decrease_sf_crcount;
+
+		/* clear marks on query responses */
+		if (isquery)
+			psf->sf_gsresp = 0;
+
+		/* need room for one source, plus a record header if this
+		 * is the first source of a record
+		 */
+		if (AVAILABLE(skb) < sizeof(__be32) +
+		    first*sizeof(struct igmpv3_grec)) {
+			if (truncate && !first)
+				break;	 /* truncate these */
+			if (pgr)
+				pgr->grec_nsrcs = htons(scount);
+			if (skb)
+				igmpv3_sendpack(skb);
+			skb = igmpv3_newpack(dev, mtu);
+			first = 1;
+			scount = 0;
+		}
+		if (first) {
+			skb = add_grhead(skb, pmc, type, &pgr, mtu);
+			first = 0;
+		}
+		if (!skb)
+			return NULL;
+		psrc = skb_put(skb, sizeof(__be32));
+		*psrc = psf->sf_inaddr;
+		scount++; stotal++;
+		if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
+		     type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+decrease_sf_crcount:
+			psf->sf_crcount--;
+			/* unlink and free a deleted source once its
+			 * sf_crcount has run out
+			 */
+			if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
+				if (psf_prev)
+					psf_prev->sf_next = psf->sf_next;
+				else
+					*psf_list = psf->sf_next;
+				kfree(psf);
+				continue;
+			}
+		}
+		psf_prev = psf;
+	}
+
+empty_source:
+	if (!stotal) {
+		/* no source was added: ALLOW/BLOCK records are skipped,
+		 * other types may still need a header-only record
+		 */
+		if (type == IGMPV3_ALLOW_NEW_SOURCES ||
+		    type == IGMPV3_BLOCK_OLD_SOURCES)
+			return skb;
+		if (pmc->crcount || isquery) {
+			/* make sure we have room for group header */
+			if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) {
+				igmpv3_sendpack(skb);
+				skb = NULL; /* add_grhead will get a new one */
+			}
+			skb = add_grhead(skb, pmc, type, &pgr, mtu);
+		}
+	}
+	if (pgr)
+		pgr->grec_nsrcs = htons(scount);
+
+	if (isquery)
+		pmc->gsquery = 0;	/* clear query state on report */
+	return skb;
+}
+
+static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
+{
+	struct net *net = dev_net(in_dev->dev);
+	struct sk_buff *skb = NULL;
+	int type;
+
+	/* Build current-state records for @pmc, or for all groups if NULL. */
+	if (pmc) {
+		spin_lock_bh(&pmc->lock);
+		type = pmc->sfcount[MCAST_EXCLUDE] ? IGMPV3_MODE_IS_EXCLUDE :
+						      IGMPV3_MODE_IS_INCLUDE;
+		skb = add_grec(skb, pmc, type, 0, 0);
+		spin_unlock_bh(&pmc->lock);
+	} else {
+		rcu_read_lock();
+		for_each_pmc_rcu(in_dev, pmc) {
+			/* the all-hosts group is never reported */
+			if (pmc->multiaddr == IGMP_ALL_HOSTS)
+				continue;
+			/* link-local groups only when the sysctl allows it */
+			if (ipv4_is_local_multicast(pmc->multiaddr) &&
+			    !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+				continue;
+
+			spin_lock_bh(&pmc->lock);
+			type = pmc->sfcount[MCAST_EXCLUDE] ?
+				IGMPV3_MODE_IS_EXCLUDE : IGMPV3_MODE_IS_INCLUDE;
+			skb = add_grec(skb, pmc, type, 0, 0);
+			spin_unlock_bh(&pmc->lock);
+		}
+		rcu_read_unlock();
+	}
+
+	/* no record was built, so there is nothing to transmit */
+	return skb ? igmpv3_sendpack(skb) : 0;
+}
+
+/*
+ * Drop every source record whose change-report count has reached zero.
+ */
+static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
+{
+	struct ip_sf_list **link = ppsf;
+	struct ip_sf_list *cur;
+
+	/* @link always addresses the pointer that leads to @cur, so
+	 * unlinking is one store whether @cur is the list head or not.
+	 */
+	while ((cur = *link) != NULL) {
+		if (cur->sf_crcount) {
+			link = &cur->sf_next;
+			continue;
+		}
+		*link = cur->sf_next;
+		kfree(cur);
+	}
+}
+
+static void kfree_pmc(struct ip_mc_list *pmc)
+{	/* Dispose of @pmc: clear both of its source lists, then free it. */
+	ip_sf_list_clear_all(pmc->sources);
+	ip_sf_list_clear_all(pmc->tomb);
+	kfree(pmc);
+}
+
+static void igmpv3_send_cr(struct in_device *in_dev)
+{	/* Build and send the pending IGMPv3 change records of @in_dev. */
+	struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
+	struct sk_buff *skb = NULL;	/* report under construction, if any */
+	int type, dtype;
+
+	rcu_read_lock();
+	spin_lock_bh(&in_dev->mc_tomb_lock);
+
+	/* pass 1: deleted MCA's, i.e. the records kept on in_dev->mc_tomb */
+	pmc_prev = NULL;
+	for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) {
+		pmc_next = pmc->next;	/* fetched early: pmc may be freed below */
+		if (pmc->sfmode == MCAST_INCLUDE) {
+			type = IGMPV3_BLOCK_OLD_SOURCES;
+			dtype = IGMPV3_BLOCK_OLD_SOURCES;
+			skb = add_grec(skb, pmc, type, 1, 0);
+			skb = add_grec(skb, pmc, dtype, 1, 1);
+		}
+		if (pmc->crcount) {
+			if (pmc->sfmode == MCAST_EXCLUDE) {
+				type = IGMPV3_CHANGE_TO_INCLUDE;
+				skb = add_grec(skb, pmc, type, 1, 0);
+			}
+			pmc->crcount--;
+			if (pmc->crcount == 0) {	/* group-level count used up */
+				igmpv3_clear_zeros(&pmc->tomb);
+				igmpv3_clear_zeros(&pmc->sources);
+			}
+		}
+		if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) {	/* fully reported */
+			if (pmc_prev)
+				pmc_prev->next = pmc_next;
+			else
+				in_dev->mc_tomb = pmc_next;
+			in_dev_put(pmc->interface);
+			kfree_pmc(pmc);
+		} else
+			pmc_prev = pmc;	/* record stays on the list */
+	}
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+	/* pass 2: change recs for the groups still joined */
+	for_each_pmc_rcu(in_dev, pmc) {
+		spin_lock_bh(&pmc->lock);
+		if (pmc->sfcount[MCAST_EXCLUDE]) {
+			type = IGMPV3_BLOCK_OLD_SOURCES;
+			dtype = IGMPV3_ALLOW_NEW_SOURCES;
+		} else {
+			type = IGMPV3_ALLOW_NEW_SOURCES;
+			dtype = IGMPV3_BLOCK_OLD_SOURCES;
+		}
+		skb = add_grec(skb, pmc, type, 0, 0);
+		skb = add_grec(skb, pmc, dtype, 0, 1);	/* deleted sources */
+
+		/* filter mode changes */
+		if (pmc->crcount) {
+			if (pmc->sfmode == MCAST_EXCLUDE)
+				type = IGMPV3_CHANGE_TO_EXCLUDE;
+			else
+				type = IGMPV3_CHANGE_TO_INCLUDE;
+			skb = add_grec(skb, pmc, type, 0, 0);
+			pmc->crcount--;
+		}
+		spin_unlock_bh(&pmc->lock);
+	}
+	rcu_read_unlock();
+
+	if (!skb)
+		return;	/* no record was produced */
+	(void) igmpv3_sendpack(skb);	/* send result deliberately ignored */
+}
+
+static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
+	int type)
+{	/* Send one IGMP message of @type for @pmc; -1 on route/alloc failure. */
+	struct sk_buff *skb;
+	struct iphdr *iph;
+	struct igmphdr *ih;
+	struct rtable *rt;
+	struct net_device *dev = in_dev->dev;
+	struct net *net = dev_net(dev);
+	__be32	group = pmc ? pmc->multiaddr : 0;
+	struct flowi4 fl4;
+	__be32	dst;
+	int hlen, tlen;
+
+	if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
+		return igmpv3_send_report(in_dev, pmc);	/* v3 has its own builder */
+
+	if (ipv4_is_local_multicast(group) &&
+	    !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+		return 0;	/* link-local reports are switched off by sysctl */
+
+	if (type == IGMP_HOST_LEAVE_MESSAGE)
+		dst = IGMP_ALL_ROUTER;	/* leaves are addressed to all routers */
+	else
+		dst = group;		/* reports are addressed to the group */
+
+	rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
+				   0, 0,
+				   IPPROTO_IGMP, 0, dev->ifindex);
+	if (IS_ERR(rt))
+		return -1;
+
+	hlen = LL_RESERVED_SPACE(dev);
+	tlen = dev->needed_tailroom;
+	skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
+	if (!skb) {
+		ip_rt_put(rt);	/* give back the route obtained above */
+		return -1;
+	}
+	skb->priority = TC_PRIO_CONTROL;
+
+	skb_dst_set(skb, &rt->dst);	/* from here on the skb carries the route */
+
+	skb_reserve(skb, hlen);
+
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	skb_put(skb, sizeof(struct iphdr) + 4);	/* IP header + 4-byte option */
+
+	iph->version  = 4;
+	iph->ihl      = (sizeof(struct iphdr)+4)>>2;	/* length in 32-bit words */
+	iph->tos      = 0xc0;
+	iph->frag_off = htons(IP_DF);
+	iph->ttl      = 1;
+	iph->daddr    = dst;
+	iph->saddr    = fl4.saddr;	/* fl4 was filled in by the route lookup */
+	iph->protocol = IPPROTO_IGMP;
+	ip_select_ident(net, skb, NULL);
+	((u8 *)&iph[1])[0] = IPOPT_RA;	/* option type, right after the header */
+	((u8 *)&iph[1])[1] = 4;		/* option length in bytes */
+	((u8 *)&iph[1])[2] = 0;		/* two-byte option value: zero */
+	((u8 *)&iph[1])[3] = 0;
+
+	ih = skb_put(skb, sizeof(struct igmphdr));
+	ih->type = type;
+	ih->code = 0;
+	ih->csum = 0;	/* zeroed so the checksum below covers a clean field */
+	ih->group = group;
+	ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+
+	return ip_local_out(net, skb->sk, skb);
+}
+
+static void igmp_gq_timer_expire(struct timer_list *t)
+{	/* mr_gq_timer callback: send a report covering every group. */
+	struct in_device *in_dev = from_timer(in_dev, t, mr_gq_timer);
+
+	in_dev->mr_gq_running = 0;
+	igmpv3_send_report(in_dev, NULL);	/* NULL: all groups, not one */
+	in_dev_put(in_dev);	/* presumably pairs with a hold taken when arming */
+}
+
+static void igmp_ifc_timer_expire(struct timer_list *t)
+{	/* mr_ifc_timer callback: send change records, re-arm while count > 0. */
+	struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer);
+	u32 mr_ifc_count;
+
+	igmpv3_send_cr(in_dev);
+restart:
+	mr_ifc_count = READ_ONCE(in_dev->mr_ifc_count);
+
+	if (mr_ifc_count) {
+		if (cmpxchg(&in_dev->mr_ifc_count,
+			    mr_ifc_count,
+			    mr_ifc_count - 1) != mr_ifc_count)
+			goto restart;	/* count changed under us: reload, retry */
+		igmp_ifc_start_timer(in_dev,
+				     unsolicited_report_interval(in_dev));
+	}
+	in_dev_put(in_dev);	/* presumably pairs with a hold taken when arming */
+}
+
+static void igmp_ifc_event(struct in_device *in_dev)
+{	/* Arm the interface-change timer so change records get (re)sent. */
+	struct net *net = dev_net(in_dev->dev);
+	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+		return;	/* nothing is scheduled while in v1/v2 mode */
+	WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv));
+	igmp_ifc_start_timer(in_dev, 1);
+}
+
+
+static void igmp_timer_expire(struct timer_list *t)
+{
+	struct ip_mc_list *im = from_timer(im, t, timer);
+	struct in_device *in_dev = im->interface;
+	int type;
+
+	spin_lock(&im->lock);
+	im->tm_running = 0;
+
+	/* keep the timer going while unsolicited reports remain */
+	if (im->unsolicit_count && --im->unsolicit_count)
+		igmp_start_timer(im, unsolicited_report_interval(in_dev));
+	im->reporter = 1;
+	spin_unlock(&im->lock);
+
+	/* the report format follows the querier versions seen so far */
+	type = IGMP_V1_SEEN(in_dev) ? IGMP_HOST_MEMBERSHIP_REPORT :
+	       IGMP_V2_SEEN(in_dev) ? IGMPV2_HOST_MEMBERSHIP_REPORT :
+				      IGMPV3_HOST_MEMBERSHIP_REPORT;
+	igmp_send_report(in_dev, im, type);
+
+	ip_ma_put(im);
+}
+
+/*
+ * Count how many queried sources an EXCLUDE-mode group actively excludes.
+ */
+static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
+{
+	struct ip_sf_list *psf;
+	int matched = 0;
+	int i;
+
+	for (psf = pmc->sources; psf && matched != nsrcs; psf = psf->sf_next) {
+		/* skip inactive filters */
+		if (psf->sf_count[MCAST_INCLUDE] ||
+		    pmc->sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE])
+			continue;
+
+		for (i = 0; i < nsrcs; i++) {
+			if (srcs[i] != psf->sf_inaddr)
+				continue;
+			matched++;
+			break;
+		}
+	}
+	pmc->gsquery = 0;
+
+	/* 0 when all queried sources are excluded, 1 otherwise */
+	return matched != nsrcs;
+}
+
+static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
+{
+	struct ip_sf_list *psf;
+	int hits = 0;
+	int i;
+
+	/* EXCLUDE-mode groups are matched by a dedicated helper */
+	if (pmc->sfmode == MCAST_EXCLUDE)
+		return igmp_xmarksources(pmc, nsrcs, srcs);
+
+	/* INCLUDE mode: flag each listed source that the query names */
+	for (psf = pmc->sources; psf && hits != nsrcs; psf = psf->sf_next) {
+		for (i = 0; i < nsrcs; i++) {
+			if (srcs[i] != psf->sf_inaddr)
+				continue;
+			psf->sf_gsresp = 1;
+			hits++;
+			break;
+		}
+	}
+
+	/* gsquery stays set only when at least one source was flagged */
+	pmc->gsquery = hits != 0;
+
+	/* 1 if any source was flagged, 0 if nothing matched */
+	return hits != 0;
+}
+
+/* Returns true if the packet was dropped; this handler never drops. */
+static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
+{
+	struct net *net = dev_net(in_dev->dev);
+	struct ip_mc_list *im;
+
+	/* Timers are only set for non-local groups */
+	if (group == IGMP_ALL_HOSTS ||
+	    (ipv4_is_local_multicast(group) &&
+	     !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)))
+		return false;
+
+	/* a report for this group was heard: stop our own pending timer */
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, im) {
+		if (im->multiaddr != group)
+			continue;
+		igmp_stop_timer(im);
+		break;
+	}
+	rcu_read_unlock();
+
+	return false;
+}
+
+/* Handle a received IGMP query; returns true if the packet was dropped */
+static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+	int len)
+{
+	struct igmphdr 		*ih = igmp_hdr(skb);
+	struct igmpv3_query *ih3 = igmpv3_query_hdr(skb);
+	struct ip_mc_list	*im;
+	__be32			group = ih->group;
+	int			max_delay;
+	int			mark = 0;	/* nonzero: query lists sources */
+	struct net		*net = dev_net(in_dev->dev);
+
+
+	if (len == 8) {	/* 8-byte query: told apart as v1/v2 by the code field */
+		if (ih->code == 0) {
+			/* Alas, old v1 router presents here. */
+
+			max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
+			in_dev->mr_v1_seen = jiffies +
+				(in_dev->mr_qrv * in_dev->mr_qi) +
+				in_dev->mr_qri;
+			group = 0;	/* processed as a general query */
+		} else {
+			/* v2 router present */
+			max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
+			in_dev->mr_v2_seen = jiffies +
+				(in_dev->mr_qrv * in_dev->mr_qi) +
+				in_dev->mr_qri;
+		}
+		/* cancel the interface change timer */
+		WRITE_ONCE(in_dev->mr_ifc_count, 0);
+		if (del_timer(&in_dev->mr_ifc_timer))
+			__in_dev_put(in_dev);	/* timer was pending: drop its ref */
+		/* clear deleted report items */
+		igmpv3_clear_delrec(in_dev);
+	} else if (len < 12) {
+		return true;	/* ignore bogus packet; freed by caller */
+	} else if (IGMP_V1_SEEN(in_dev)) {
+		/* This is a v3 query with v1 queriers present */
+		max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
+		group = 0;	/* processed as a general query */
+	} else if (IGMP_V2_SEEN(in_dev)) {
+		/* this is a v3 query with v2 queriers present;
+		 * Interpretation of the max_delay code is problematic here.
+		 * A real v2 host would use ih_code directly, while v3 has a
+		 * different encoding. We use the v3 encoding as more likely
+		 * to be intended in a v3 query.
+		 */
+		max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+		if (!max_delay)
+			max_delay = 1;	/* can't mod w/ 0 */
+	} else { /* v3 */
+		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
+			return true;	/* fixed v3 query header not available */
+
+		ih3 = igmpv3_query_hdr(skb);	/* re-read the header after the pull */
+		if (ih3->nsrcs) {
+			if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
+					   + ntohs(ih3->nsrcs)*sizeof(__be32)))
+				return true;	/* announced source list not available */
+			ih3 = igmpv3_query_hdr(skb);	/* re-read again after the pull */
+		}
+
+		max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+		if (!max_delay)
+			max_delay = 1;	/* can't mod w/ 0 */
+		in_dev->mr_maxdelay = max_delay;
+
+		/* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently
+		 * received value was zero, use the default or statically
+		 * configured value.
+		 */
+		in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+		in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL;
+
+		/* RFC3376, 8.3. Query Response Interval:
+		 * The number of seconds represented by the [Query Response
+		 * Interval] must be less than the [Query Interval].
+		 */
+		if (in_dev->mr_qri >= in_dev->mr_qi)
+			in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ;	/* one second below mr_qi */
+
+		if (!group) { /* general query */
+			if (ih3->nsrcs)
+				return true;	/* no sources allowed */
+			igmp_gq_start_timer(in_dev);
+			return false;
+		}
+		/* mark sources to include, if group & source-specific */
+		mark = ih3->nsrcs != 0;
+	}
+
+	/*
+	 * - Start the timers in all of our membership records
+	 *   that the query applies to for the interface on
+	 *   which the query arrived excl. those that belong
+	 *   to a "local" group (224.0.0.X)
+	 * - For timers already running check if they need to
+	 *   be reset.
+	 * - Use the igmp->igmp_code field as the maximum
+	 *   delay possible
+	 */
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, im) {
+		int changed;
+
+		if (group && group != im->multiaddr)
+			continue;	/* group-specific query for another group */
+		if (im->multiaddr == IGMP_ALL_HOSTS)
+			continue;
+		if (ipv4_is_local_multicast(im->multiaddr) &&
+		    !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+			continue;
+		spin_lock_bh(&im->lock);
+		if (im->tm_running)
+			im->gsquery = im->gsquery && mark;
+		else
+			im->gsquery = mark;
+		changed = !im->gsquery ||
+			igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);	/* only reached when gsquery is set */
+		spin_unlock_bh(&im->lock);
+		if (changed)
+			igmp_mod_timer(im, max_delay);
+	}
+	rcu_read_unlock();
+	return false;
+}
+
+/* IGMP receive handler; called in rcu_read_lock() section */
+int igmp_rcv(struct sk_buff *skb)
+{
+	/* This basically follows the spec line by line -- see RFC1112 */
+	struct igmphdr *ih;
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
+	int len = skb->len;
+	bool dropped = true;	/* stays true on every early jump to drop: */
+
+	if (netif_is_l3_master(dev)) {	/* use the device recorded in the skb cb */
+		dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif);
+		if (!dev)
+			goto drop;
+	}
+
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev)
+		goto drop;
+
+	if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
+		goto drop;
+
+	if (skb_checksum_simple_validate(skb))
+		goto drop;	/* nonzero result: checksum did not validate */
+
+	ih = igmp_hdr(skb);
+	switch (ih->type) {
+	case IGMP_HOST_MEMBERSHIP_QUERY:
+		dropped = igmp_heard_query(in_dev, skb, len);
+		break;
+	case IGMP_HOST_MEMBERSHIP_REPORT:
+	case IGMPV2_HOST_MEMBERSHIP_REPORT:
+		/* Is it our report looped back? */
+		if (rt_is_output_route(skb_rtable(skb)))
+			break;
+		/* don't rely on MC router hearing unicast reports */
+		if (skb->pkt_type == PACKET_MULTICAST ||
+		    skb->pkt_type == PACKET_BROADCAST)
+			dropped = igmp_heard_report(in_dev, ih->group);
+		break;
+	case IGMP_PIM:
+#ifdef CONFIG_IP_PIMSM_V1
+		return pim_rcv_v1(skb);	/* skb is handed over, not freed here */
+#endif	/* without PIMv1 support this case falls through to the break below */
+	case IGMPV3_HOST_MEMBERSHIP_REPORT:
+	case IGMP_DVMRP:
+	case IGMP_TRACE:
+	case IGMP_HOST_LEAVE_MESSAGE:
+	case IGMP_MTRACE:
+	case IGMP_MTRACE_RESP:
+		break;	/* recognised types that need no handling here */
+	default:
+		break;
+	}
+
+drop:
+	if (dropped)
+		kfree_skb(skb);
+	else
+		consume_skb(skb);
+	return 0;
+}
+
+#endif
+
+
+/*
+ *	Install the link-layer multicast filter that corresponds to @addr
+ */
+
+static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
+{
+	struct net_device *dev = in_dev->dev;
+	char hwaddr[MAX_ADDR_LEN];
+
+	/* Do NOT test IFF_MULTICAST here: toggling the flag later would
+	 * leak multicast tokens. Such a check belongs in the
+	 * ndo_set_rx_mode routine, something along the lines of:
+	 *   if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
+	 * --ANK
+	 */
+	if (arp_mc_map(addr, hwaddr, dev, 0))
+		return;		/* mapping to a hardware address failed */
+	dev_mc_add(dev, hwaddr);
+}
+
+/*
+ *	Withdraw the link-layer multicast filter that corresponds to @addr
+ */
+
+static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
+{
+	struct net_device *dev = in_dev->dev;
+	char hwaddr[MAX_ADDR_LEN];
+
+	if (!arp_mc_map(addr, hwaddr, dev, 0))
+		dev_mc_del(dev, hwaddr);
+}
+
+#ifdef CONFIG_IP_MULTICAST
+/*
+ * deleted ip_mc_list manipulation
+ */
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im,
+			      gfp_t gfp)
+{	/* Queue a deleted-group record for @im on in_dev->mc_tomb. */
+	struct ip_mc_list *pmc;
+	struct net *net = dev_net(in_dev->dev);
+
+	/* this is an "ip_mc_list" for convenience; only the fields below
+	 * are actually used. In particular, the refcnt and users are not
+	 * used for management of the delete list. Using the same structure
+	 * for deleted items allows change reports to use common code with
+	 * non-deleted or query-response MCA's.
+	 */
+	pmc = kzalloc(sizeof(*pmc), gfp);
+	if (!pmc)
+		return;	/* allocation failed: no record is queued */
+	spin_lock_init(&pmc->lock);
+	spin_lock_bh(&im->lock);
+	pmc->interface = im->interface;
+	in_dev_hold(in_dev);	/* put again wherever the record is freed */
+	pmc->multiaddr = im->multiaddr;
+	pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+	pmc->sfmode = im->sfmode;
+	if (pmc->sfmode == MCAST_INCLUDE) {
+		struct ip_sf_list *psf;
+
+		pmc->tomb = im->tomb;	/* both source lists move to the record */
+		pmc->sources = im->sources;
+		im->tomb = im->sources = NULL;
+		for (psf = pmc->sources; psf; psf = psf->sf_next)
+			psf->sf_crcount = pmc->crcount;
+	}
+	spin_unlock_bh(&im->lock);
+
+	spin_lock_bh(&in_dev->mc_tomb_lock);
+	pmc->next = in_dev->mc_tomb;	/* insert at the head of the tomb list */
+	in_dev->mc_tomb = pmc;
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
+}
+
+/*
+ * restore ip_mc_list deleted records: take @im's entry off mc_tomb, if any
+ */
+static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
+{
+	struct ip_mc_list *pmc, *pmc_prev;
+	struct ip_sf_list *psf;
+	struct net *net = dev_net(in_dev->dev);
+	__be32 multiaddr = im->multiaddr;
+
+	spin_lock_bh(&in_dev->mc_tomb_lock);
+	pmc_prev = NULL;
+	for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) {
+		if (pmc->multiaddr == multiaddr)
+			break;	/* first record for this group address */
+		pmc_prev = pmc;
+	}
+	if (pmc) {	/* found: unlink it while the list lock is held */
+		if (pmc_prev)
+			pmc_prev->next = pmc->next;
+		else
+			in_dev->mc_tomb = pmc->next;
+	}
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+	spin_lock_bh(&im->lock);
+	if (pmc) {
+		im->interface = pmc->interface;
+		if (im->sfmode == MCAST_INCLUDE) {
+			swap(im->tomb, pmc->tomb);	/* whatever im held is freed with pmc */
+			swap(im->sources, pmc->sources);
+			for (psf = im->sources; psf; psf = psf->sf_next)
+				psf->sf_crcount = in_dev->mr_qrv ?:
+					READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+		} else {
+			im->crcount = in_dev->mr_qrv ?:
+				READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+		}
+		in_dev_put(pmc->interface);
+		kfree_pmc(pmc);
+	}
+	spin_unlock_bh(&im->lock);
+}
+
+/*
+ * flush ip_mc_list deleted records and the tomb lists of live groups
+ */
+static void igmpv3_clear_delrec(struct in_device *in_dev)
+{
+	struct ip_mc_list *pmc, *nextpmc;
+
+	spin_lock_bh(&in_dev->mc_tomb_lock);
+	pmc = in_dev->mc_tomb;	/* detach the whole list under the lock ... */
+	in_dev->mc_tomb = NULL;
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+	for (; pmc; pmc = nextpmc) {	/* ... and free it after unlocking */
+		nextpmc = pmc->next;
+		ip_mc_clear_src(pmc);
+		in_dev_put(pmc->interface);
+		kfree_pmc(pmc);
+	}
+	/* clear dead sources, too */
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, pmc) {
+		struct ip_sf_list *psf;
+
+		spin_lock_bh(&pmc->lock);
+		psf = pmc->tomb;	/* same detach-then-clear pattern per group */
+		pmc->tomb = NULL;
+		spin_unlock_bh(&pmc->lock);
+		ip_sf_list_clear_all(psf);
+	}
+	rcu_read_unlock();
+}
+#endif
+
+static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp)
+{	/* Group @im is going away: remove its filter and signal the leave. */
+	struct in_device *in_dev = im->interface;
+#ifdef CONFIG_IP_MULTICAST
+	struct net *net = dev_net(in_dev->dev);
+	int reporter;
+#endif
+
+	if (im->loaded) {	/* a device filter was installed for this group */
+		im->loaded = 0;
+		ip_mc_filter_del(in_dev, im->multiaddr);
+	}
+
+#ifdef CONFIG_IP_MULTICAST
+	if (im->multiaddr == IGMP_ALL_HOSTS)
+		return;
+	if (ipv4_is_local_multicast(im->multiaddr) &&
+	    !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+		return;
+
+	reporter = im->reporter;	/* sampled before the timer is stopped */
+	igmp_stop_timer(im);
+
+	if (!in_dev->dead) {
+		if (IGMP_V1_SEEN(in_dev))
+			return;	/* nothing is sent in v1 mode */
+		if (IGMP_V2_SEEN(in_dev)) {
+			if (reporter)
+				igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
+			return;
+		}
+		/* IGMPv3 */
+		igmpv3_add_delrec(in_dev, im, gfp);
+
+		igmp_ifc_event(in_dev);
+	}
+#endif
+}
+
+static void igmp_group_dropped(struct ip_mc_list *im)
+{	/* Same as __igmp_group_dropped() with GFP_KERNEL allocations. */
+	__igmp_group_dropped(im, GFP_KERNEL);
+}
+
+static void igmp_group_added(struct ip_mc_list *im)
+{	/* Group @im became active: install its filter, schedule reports. */
+	struct in_device *in_dev = im->interface;
+#ifdef CONFIG_IP_MULTICAST
+	struct net *net = dev_net(in_dev->dev);
+#endif
+
+	if (im->loaded == 0) {	/* device filter not installed yet */
+		im->loaded = 1;
+		ip_mc_filter_add(in_dev, im->multiaddr);
+	}
+
+#ifdef CONFIG_IP_MULTICAST
+	if (im->multiaddr == IGMP_ALL_HOSTS)
+		return;
+	if (ipv4_is_local_multicast(im->multiaddr) &&
+	    !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+		return;
+
+	if (in_dev->dead)
+		return;
+
+	im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
+		spin_lock_bh(&im->lock);
+		igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY);
+		spin_unlock_bh(&im->lock);
+		return;	/* v1/v2 mode: the per-group timer does the reporting */
+	}
+	/* else, v3 */
+
+	/* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should
+	 * not send filter-mode change record as the mode should be from
+	 * IN() to IN(A).
+	 */
+	if (im->sfmode == MCAST_EXCLUDE)
+		im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+
+	igmp_ifc_event(in_dev);
+#endif
+}
+
+
+/*
+ *	Multicast list managers
+ */
+
+static u32 ip_mc_hash(const struct ip_mc_list *im)
+{	/* Hash bucket index for @im, derived from its group address. */
+	return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG);
+}
+
+static void ip_mc_hash_add(struct in_device *in_dev,
+			   struct ip_mc_list *im)
+{	/* Add @im to the hash table, creating the table on first need. */
+	struct ip_mc_list __rcu **mc_hash;
+	u32 hash;
+
+	mc_hash = rtnl_dereference(in_dev->mc_hash);
+	if (mc_hash) {	/* table already exists: insert at the bucket head */
+		hash = ip_mc_hash(im);
+		im->next_hash = mc_hash[hash];
+		rcu_assign_pointer(mc_hash[hash], im);
+		return;
+	}
+
+	/* do not use a hash table for small number of items */
+	if (in_dev->mc_count < 4)
+		return;
+
+	mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG,
+			  GFP_KERNEL);
+	if (!mc_hash)
+		return;	/* no table is created on allocation failure */
+
+	for_each_pmc_rtnl(in_dev, im) {	/* @im is reused to walk the whole list */
+		hash = ip_mc_hash(im);
+		im->next_hash = mc_hash[hash];
+		RCU_INIT_POINTER(mc_hash[hash], im);
+	}
+
+	rcu_assign_pointer(in_dev->mc_hash, mc_hash);	/* publish the filled table */
+}
+
+static void ip_mc_hash_remove(struct in_device *in_dev,
+			      struct ip_mc_list *im)
+{	/* Unlink @im from its hash chain, if a hash table exists at all. */
+	struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash);
+	struct ip_mc_list *aux;
+
+	if (!mc_hash)
+		return;
+	mc_hash += ip_mc_hash(im);	/* now addresses the bucket head */
+	while ((aux = rtnl_dereference(*mc_hash)) != im)
+		mc_hash = &aux->next_hash;	/* no NULL test: @im must be chained */
+	*mc_hash = im->next_hash;
+}
+
+
+/*
+ *	A socket has joined a multicast group on device dev.
+ */
+static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
+				unsigned int mode, gfp_t gfp)
+{
+	struct ip_mc_list *im;
+
+	ASSERT_RTNL();
+
+	for_each_pmc_rtnl(in_dev, im) {
+		if (im->multiaddr == addr) {	/* already joined: one more user */
+			im->users++;
+			ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0);
+			goto out;
+		}
+	}
+
+	im = kzalloc(sizeof(*im), gfp);
+	if (!im)
+		goto out;	/* allocation failure is not reported to the caller */
+
+	im->users = 1;
+	im->interface = in_dev;
+	in_dev_hold(in_dev);
+	im->multiaddr = addr;
+	/* initial filter mode is the one passed in by the caller */
+	im->sfmode = mode;
+	im->sfcount[mode] = 1;
+	refcount_set(&im->refcnt, 1);
+	spin_lock_init(&im->lock);
+#ifdef CONFIG_IP_MULTICAST
+	timer_setup(&im->timer, igmp_timer_expire, 0);
+#endif
+
+	im->next_rcu = in_dev->mc_list;	/* set up the link before publishing */
+	in_dev->mc_count++;
+	rcu_assign_pointer(in_dev->mc_list, im);
+
+	ip_mc_hash_add(in_dev, im);
+
+#ifdef CONFIG_IP_MULTICAST
+	igmpv3_del_delrec(in_dev, im);	/* pick up a pending deleted record */
+#endif
+	igmp_group_added(im);
+	if (!in_dev->dead)
+		ip_rt_multicast_event(in_dev);
+out:
+	return;
+}
+
+void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
+{	/* Join @addr on @in_dev in EXCLUDE mode, allocating with @gfp. */
+	____ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE, gfp);
+}
+EXPORT_SYMBOL(__ip_mc_inc_group);
+
+void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+{	/* Same as __ip_mc_inc_group() with GFP_KERNEL allocations. */
+	__ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
+}
+EXPORT_SYMBOL(ip_mc_inc_group);
+
+static int ip_mc_check_iphdr(struct sk_buff *skb)
+{	/* Validate the IPv4 header and set the transport header; 0 or -EINVAL. */
+	const struct iphdr *iph;
+	unsigned int len;
+	unsigned int offset = skb_network_offset(skb) + sizeof(*iph);
+
+	if (!pskb_may_pull(skb, offset))
+		return -EINVAL;	/* not even a minimal IPv4 header */
+
+	iph = ip_hdr(skb);
+
+	if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph))
+		return -EINVAL;
+
+	offset += ip_hdrlen(skb) - sizeof(*iph);	/* account for IP options */
+
+	if (!pskb_may_pull(skb, offset))
+		return -EINVAL;
+
+	iph = ip_hdr(skb);	/* re-read the header after the pull */
+
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		return -EINVAL;	/* nonzero result: header checksum mismatch */
+
+	len = skb_network_offset(skb) + ntohs(iph->tot_len);
+	if (skb->len < len || len < offset)
+		return -EINVAL;	/* tot_len exceeds the skb or is below the header */
+
+	skb_set_transport_header(skb, offset);
+
+	return 0;
+}
+
+static int ip_mc_check_igmp_reportv3(struct sk_buff *skb)
+{
+	/* the fixed IGMPv3 report header must be fully accessible */
+	unsigned int need = skb_transport_offset(skb) +
+			    sizeof(struct igmpv3_report);
+
+	return ip_mc_may_pull(skb, need) ? 0 : -EINVAL;
+}
+
+static int ip_mc_check_igmp_query(struct sk_buff *skb)
+{
+	unsigned int transport_len = ip_transport_len(skb);
+	unsigned int need;
+
+	/* Exactly igmphdr-sized means IGMPv1/v2; anything else has to
+	 * carry a complete, pullable IGMPv3 query header.
+	 */
+	if (transport_len != sizeof(struct igmphdr)) {
+		if (transport_len < sizeof(struct igmpv3_query))
+			return -EINVAL;
+
+		need = skb_transport_offset(skb) + sizeof(struct igmpv3_query);
+		if (!ip_mc_may_pull(skb, need))
+			return -EINVAL;
+	}
+
+	/* RFC2236+RFC3376 (IGMPv2+IGMPv3): a general query (no group)
+	 * must be addressed to all-systems, 224.0.0.1
+	 */
+	if (igmp_hdr(skb)->group)
+		return 0;
+
+	return ip_hdr(skb)->daddr == htonl(INADDR_ALLHOSTS_GROUP) ? 0 : -EINVAL;
+}
+
+static int ip_mc_check_igmp_msg(struct sk_buff *skb)
+{
+	const u8 type = igmp_hdr(skb)->type;
+
+	/* only queries and v3 reports get additional validation here */
+	if (type == IGMP_HOST_MEMBERSHIP_QUERY)
+		return ip_mc_check_igmp_query(skb);
+	if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
+		return ip_mc_check_igmp_reportv3(skb);
+	if (type == IGMP_HOST_LEAVE_MESSAGE ||
+	    type == IGMP_HOST_MEMBERSHIP_REPORT ||
+	    type == IGMPV2_HOST_MEMBERSHIP_REPORT)
+		return 0;
+	return -ENOMSG;	/* any other message type */
+}
+
+static __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
+{	/* Checksum callback passed to skb_checksum_trimmed(). */
+	return skb_checksum_simple_validate(skb);
+}
+
+static int ip_mc_check_igmp_csum(struct sk_buff *skb)
+{
+	unsigned int need = skb_transport_offset(skb) + sizeof(struct igmphdr);
+	unsigned int transport_len = ip_transport_len(skb);
+	struct sk_buff *trimmed;
+	int err = -EINVAL;
+
+	if (!ip_mc_may_pull(skb, need))
+		goto out;
+	trimmed = skb_checksum_trimmed(skb, transport_len,
+				       ip_mc_validate_checksum);
+	if (!trimmed)
+		goto out;
+	err = 0;
+	if (trimmed != skb)	/* helper handed back a different skb: drop it */
+		kfree_skb(trimmed);
+out:
+	return err;
+}
+
+/**
+ * ip_mc_check_igmp - sanity-check an IPv4 packet as an IGMP message
+ * @skb: the skb to validate
+ *
+ * Validates the IPv4 header, the IGMP checksum and the per-type length
+ * requirements, in that order. The transport header of @skb is set once
+ * the IPv4 header has been accepted.
+ *
+ * Return: 0 for a well-formed IGMP packet, otherwise
+ * -EINVAL: the packet is malformed (bad header, length or checksum);
+ * -ENOMSG: the IPv4 header is fine but the payload is not IGMP, or it
+ *  is an IGMP message type that is not checked here.
+ *
+ * Caller needs to set the skb network header beforehand.
+ */
+int ip_mc_check_igmp(struct sk_buff *skb)
+{
+	int err;
+
+	err = ip_mc_check_iphdr(skb);
+	if (err < 0)
+		return err;
+
+	if (ip_hdr(skb)->protocol != IPPROTO_IGMP)
+		return -ENOMSG;
+
+	err = ip_mc_check_igmp_csum(skb);
+
+	/* per-type checks only make sense once the checksum is good */
+	return err < 0 ? err : ip_mc_check_igmp_msg(skb);
+}
+EXPORT_SYMBOL(ip_mc_check_igmp);
+
+/*
+ *	Resend IGMP JOIN report for every group; used by netdev notifier.
+ */
+static void ip_mc_rejoin_groups(struct in_device *in_dev)
+{
+#ifdef CONFIG_IP_MULTICAST
+	struct ip_mc_list *im;
+	int type;
+	struct net *net = dev_net(in_dev->dev);
+
+	ASSERT_RTNL();
+
+	for_each_pmc_rtnl(in_dev, im) {
+		if (im->multiaddr == IGMP_ALL_HOSTS)
+			continue;
+		if (ipv4_is_local_multicast(im->multiaddr) &&
+		    !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
+			continue;
+
+		/* a failover is happening and switches
+		 * must be notified immediately
+		 */
+		if (IGMP_V1_SEEN(in_dev))
+			type = IGMP_HOST_MEMBERSHIP_REPORT;
+		else if (IGMP_V2_SEEN(in_dev))
+			type = IGMPV2_HOST_MEMBERSHIP_REPORT;
+		else
+			type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+		igmp_send_report(in_dev, im, type);	/* sent directly, no timer */
+	}
+#endif	/* without CONFIG_IP_MULTICAST this function does nothing */
+}
+
+/*
+ *	A socket has left a multicast group on device dev
+ */
+
+void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
+{
+	struct ip_mc_list *i;
+	struct ip_mc_list __rcu **ip;	/* address of the link that leads to i */
+
+	ASSERT_RTNL();
+
+	for (ip = &in_dev->mc_list;
+	     (i = rtnl_dereference(*ip)) != NULL;
+	     ip = &i->next_rcu) {
+		if (i->multiaddr == addr) {
+			if (--i->users == 0) {	/* last user: tear the group down */
+				ip_mc_hash_remove(in_dev, i);
+				*ip = i->next_rcu;
+				in_dev->mc_count--;
+				__igmp_group_dropped(i, gfp);
+				ip_mc_clear_src(i);
+
+				if (!in_dev->dead)
+					ip_rt_multicast_event(in_dev);
+
+				ip_ma_put(i);
+				return;
+			}
+			break;	/* other users remain: only the count changed */
+		}
+	}
+}
+EXPORT_SYMBOL(__ip_mc_dec_group);
+
+/* Device changing type */
+
+void ip_mc_unmap(struct in_device *in_dev)
+{	/* Run the group-dropped handling for every group on @in_dev. */
+	struct ip_mc_list *pmc;
+
+	ASSERT_RTNL();
+
+	for_each_pmc_rtnl(in_dev, pmc)
+		igmp_group_dropped(pmc);
+}
+
+void ip_mc_remap(struct in_device *in_dev)
+{	/* Counterpart of ip_mc_unmap(): re-activate every group on @in_dev. */
+	struct ip_mc_list *pmc;
+
+	ASSERT_RTNL();
+
+	for_each_pmc_rtnl(in_dev, pmc) {
+#ifdef CONFIG_IP_MULTICAST
+		igmpv3_del_delrec(in_dev, pmc);	/* reclaim any deleted record first */
+#endif
+		igmp_group_added(pmc);
+	}
+}
+
+/* Device going down: drop all groups and stop the per-interface timers */
+
+void ip_mc_down(struct in_device *in_dev)
+{
+	struct ip_mc_list *pmc;
+
+	ASSERT_RTNL();
+
+	for_each_pmc_rtnl(in_dev, pmc)
+		igmp_group_dropped(pmc);
+
+#ifdef CONFIG_IP_MULTICAST
+	WRITE_ONCE(in_dev->mr_ifc_count, 0);
+	if (del_timer(&in_dev->mr_ifc_timer))
+		__in_dev_put(in_dev);	/* timer was pending: drop its reference */
+	in_dev->mr_gq_running = 0;
+	if (del_timer(&in_dev->mr_gq_timer))
+		__in_dev_put(in_dev);	/* timer was pending: drop its reference */
+#endif
+
+	ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);	/* mirrors the join in ip_mc_up() */
+}
+
+#ifdef CONFIG_IP_MULTICAST
+static void ip_mc_reset(struct in_device *in_dev)
+{	/* Set the per-interface query parameters back to their defaults. */
+	struct net *net = dev_net(in_dev->dev);
+
+	in_dev->mr_qi = IGMP_QUERY_INTERVAL;
+	in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL;
+	in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+}
+#else
+static void ip_mc_reset(struct in_device *in_dev)
+{	/* stub: nothing to reset without CONFIG_IP_MULTICAST */
+}
+#endif
+
+void ip_mc_init_dev(struct in_device *in_dev)
+{	/* Initialise the multicast timers, parameters and tomb lock of @in_dev. */
+	ASSERT_RTNL();
+
+#ifdef CONFIG_IP_MULTICAST
+	timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0);
+	timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0);
+#endif
+	ip_mc_reset(in_dev);
+
+	spin_lock_init(&in_dev->mc_tomb_lock);
+}
+
+/* Device going up: join all-hosts and re-activate every known group */
+
+void ip_mc_up(struct in_device *in_dev)
+{
+	struct ip_mc_list *pmc;
+
+	ASSERT_RTNL();
+
+	ip_mc_reset(in_dev);
+	ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
+
+	for_each_pmc_rtnl(in_dev, pmc) {
+#ifdef CONFIG_IP_MULTICAST
+		igmpv3_del_delrec(in_dev, pmc);	/* reclaim any deleted record first */
+#endif
+		igmp_group_added(pmc);
+	}
+}
+
+/*
+ *	Device is about to be destroyed: clean up all multicast state.
+ */
+
+void ip_mc_destroy_dev(struct in_device *in_dev)
+{
+	struct ip_mc_list *i;
+
+	ASSERT_RTNL();
+
+	/* Deactivate timers */
+	ip_mc_down(in_dev);
+#ifdef CONFIG_IP_MULTICAST
+	igmpv3_clear_delrec(in_dev);
+#endif
+
+	while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
+		in_dev->mc_list = i->next_rcu;	/* pop the head entry */
+		in_dev->mc_count--;
+		ip_mc_clear_src(i);
+		ip_ma_put(i);	/* drop the list's reference on the entry */
+	}
+}
+
+/*
+ * Resolve the interface an ip_mreqn refers to. RTNL is locked.
+ */
+static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
+{
+	struct net_device *dev;
+	struct rtable *rt;
+
+	/* 1) an explicit interface index wins outright */
+	if (imr->imr_ifindex)
+		return inetdev_by_index(net, imr->imr_ifindex);
+
+	if (imr->imr_address.s_addr) {
+		/* 2) otherwise the device found for the given address */
+		dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
+	} else {
+		/* 3) otherwise the output device of a route to the group */
+		rt = ip_route_output(net, imr->imr_multiaddr.s_addr, 0, 0, 0);
+		if (IS_ERR(rt))
+			return NULL;
+		dev = rt->dst.dev;
+		ip_rt_put(rt);
+	}
+
+	if (!dev)
+		return NULL;
+
+	/* tell the caller which interface was picked */
+	imr->imr_ifindex = dev->ifindex;
+	return __in_dev_get_rtnl(dev);
+}
+
+/*
+ *	Drop one @sfmode reference on source *@psfsrc of group @pmc
+ */
+
+static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
+	__be32 *psfsrc)
+{
+	struct ip_sf_list *psf, *psf_prev;
+	int rv = 0;	/* becomes 1 when the source is moved to the tomb list */
+
+	psf_prev = NULL;
+	for (psf = pmc->sources; psf; psf = psf->sf_next) {
+		if (psf->sf_inaddr == *psfsrc)
+			break;
+		psf_prev = psf;
+	}
+	if (!psf || psf->sf_count[sfmode] == 0) {
+		/* source filter not found, or count wrong =>  bug */
+		return -ESRCH;
+	}
+	psf->sf_count[sfmode]--;
+	if (psf->sf_count[sfmode] == 0) {
+		ip_rt_multicast_event(pmc->interface);
+	}
+	if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+		struct in_device *in_dev = pmc->interface;
+		struct net *net = dev_net(in_dev->dev);
+#endif
+
+		/* no more filters for this source */
+		if (psf_prev)
+			psf_prev->sf_next = psf->sf_next;
+		else
+			pmc->sources = psf->sf_next;
+#ifdef CONFIG_IP_MULTICAST
+		if (psf->sf_oldin &&
+		    !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
+			psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+			psf->sf_next = pmc->tomb;	/* kept on the tomb list, not freed */
+			pmc->tomb = psf;
+			rv = 1;
+		} else
+#endif
+			kfree(psf);	/* the only path when CONFIG_IP_MULTICAST is off */
+	}
+	return rv;
+}
+
+#ifndef CONFIG_IP_MULTICAST
+#define igmp_ifc_event(x)	do { } while (0)
+#endif
+
+static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
+			 int sfcount, __be32 *psfsrc, int delta)
+{	/* drop @sfcount source refs from group *@pmca on @in_dev; when !@delta also drop one @sfmode group ref */
+	struct ip_mc_list *pmc;
+	int	changerec = 0;
+	int	i, err;
+
+	if (!in_dev)
+		return -ENODEV;
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, pmc) {
+		if (*pmca == pmc->multiaddr)
+			break;
+	}
+	if (!pmc) {
+		/* MCA not found?? bug */
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+	spin_lock_bh(&pmc->lock);	/* pmc->lock is taken before the RCU read section is left */
+	rcu_read_unlock();
+#ifdef CONFIG_IP_MULTICAST
+	sf_markstate(pmc);	/* record sf_oldin for every source before the counts change */
+#endif
+	if (!delta) {
+		err = -EINVAL;
+		if (!pmc->sfcount[sfmode])
+			goto out_unlock;
+		pmc->sfcount[sfmode]--;
+	}
+	err = 0;
+	for (i = 0; i < sfcount; i++) {
+		int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+
+		changerec |= rv > 0;
+		if (!err && rv < 0)	/* all sources are processed; only the first failure is reported */
+			err = rv;
+	}
+	if (pmc->sfmode == MCAST_EXCLUDE &&
+	    pmc->sfcount[MCAST_EXCLUDE] == 0 &&
+	    pmc->sfcount[MCAST_INCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+		struct ip_sf_list *psf;
+		struct net *net = dev_net(in_dev->dev);
+#endif
+
+		/* filter mode change */
+		pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+		pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+		WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount);
+		for (psf = pmc->sources; psf; psf = psf->sf_next)
+			psf->sf_crcount = 0;
+		igmp_ifc_event(pmc->interface);
+	} else if (sf_setstate(pmc) || changerec) {
+		igmp_ifc_event(pmc->interface);
+#endif
+	}
+out_unlock:
+	spin_unlock_bh(&pmc->lock);
+	return err;
+}
+
+/*
+ * Take one @sfmode reference on source *@psfsrc in pmc->sources, appending a zeroed entry if absent; 0 or -ENOBUFS
+ */
+static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
+	__be32 *psfsrc)
+{
+	struct ip_sf_list *psf, *psf_prev;
+
+	psf_prev = NULL;
+	for (psf = pmc->sources; psf; psf = psf->sf_next) {
+		if (psf->sf_inaddr == *psfsrc)
+			break;
+		psf_prev = psf;
+	}
+	if (!psf) {	/* not listed yet: allocate and link at the tail (psf_prev is the last entry, if any) */
+		psf = kzalloc(sizeof(*psf), GFP_ATOMIC);
+		if (!psf)
+			return -ENOBUFS;
+		psf->sf_inaddr = *psfsrc;
+		if (psf_prev) {
+			psf_prev->sf_next = psf;
+		} else
+			pmc->sources = psf;
+	}
+	psf->sf_count[sfmode]++;
+	if (psf->sf_count[sfmode] == 1) {	/* first reference of this mode on this source */
+		ip_rt_multicast_event(pmc->interface);
+	}
+	return 0;
+}
+
+#ifdef CONFIG_IP_MULTICAST
+static void sf_markstate(struct ip_mc_list *pmc)
+{	/* store in each source's sf_oldin whether it currently counts as "in" (same predicate as sf_setstate) */
+	struct ip_sf_list *psf;
+	int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+
+	for (psf = pmc->sources; psf; psf = psf->sf_next)
+		if (pmc->sfcount[MCAST_EXCLUDE]) {
+			psf->sf_oldin = mca_xcount ==
+				psf->sf_count[MCAST_EXCLUDE] &&
+				!psf->sf_count[MCAST_INCLUDE];
+		} else
+			psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
+}
+
+static int sf_setstate(struct ip_mc_list *pmc)
+{	/* compare each source's new "in" state with sf_oldin; returns how many sources changed state */
+	struct ip_sf_list *psf, *dpsf;
+	int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+	int qrv = pmc->interface->mr_qrv;
+	int new_in, rv;
+
+	rv = 0;
+	for (psf = pmc->sources; psf; psf = psf->sf_next) {
+		if (pmc->sfcount[MCAST_EXCLUDE]) {
+			new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
+				!psf->sf_count[MCAST_INCLUDE];
+		} else
+			new_in = psf->sf_count[MCAST_INCLUDE] != 0;
+		if (new_in) {
+			if (!psf->sf_oldin) {	/* newly "in": discard any tomb entry for the same address */
+				struct ip_sf_list *prev = NULL;
+
+				for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) {
+					if (dpsf->sf_inaddr == psf->sf_inaddr)
+						break;
+					prev = dpsf;
+				}
+				if (dpsf) {
+					if (prev)
+						prev->sf_next = dpsf->sf_next;
+					else
+						pmc->tomb = dpsf->sf_next;
+					kfree(dpsf);
+				}
+				psf->sf_crcount = qrv;
+				rv++;
+			}
+		} else if (psf->sf_oldin) {
+
+			psf->sf_crcount = 0;
+			/*
+			 * add or update "delete" records if an active filter
+			 * is now inactive
+			 */
+			for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next)
+				if (dpsf->sf_inaddr == psf->sf_inaddr)
+					break;
+			if (!dpsf) {
+				dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+				if (!dpsf)
+					continue;	/* allocation failed: this source is skipped and not counted */
+				*dpsf = *psf;
+				/* pmc->lock held by callers */
+				dpsf->sf_next = pmc->tomb;
+				pmc->tomb = dpsf;
+			}
+			dpsf->sf_crcount = qrv;
+			rv++;
+		}
+	}
+	return rv;
+}
+#endif
+
+/*
+ * Add @sfcount sources to group *@pmca on @in_dev (plus one @sfmode group ref when !@delta); undone on failure
+ */
+static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
+			 int sfcount, __be32 *psfsrc, int delta)
+{
+	struct ip_mc_list *pmc;
+	int	isexclude;
+	int	i, err;
+
+	if (!in_dev)
+		return -ENODEV;
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, pmc) {
+		if (*pmca == pmc->multiaddr)
+			break;
+	}
+	if (!pmc) {
+		/* MCA not found?? bug */
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+	spin_lock_bh(&pmc->lock);	/* pmc->lock is taken before the RCU read section is left */
+	rcu_read_unlock();
+
+#ifdef CONFIG_IP_MULTICAST
+	sf_markstate(pmc);	/* record sf_oldin for every source before the counts change */
+#endif
+	isexclude = pmc->sfmode == MCAST_EXCLUDE;	/* mode before this call, for change detection below */
+	if (!delta)
+		pmc->sfcount[sfmode]++;
+	err = 0;
+	for (i = 0; i < sfcount; i++) {
+		err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);
+		if (err)
+			break;
+	}
+	if (err) {	/* roll back: the group count and the first i sources that were added */
+		int j;
+
+		if (!delta)
+			pmc->sfcount[sfmode]--;
+		for (j = 0; j < i; j++)
+			(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
+	} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
+#ifdef CONFIG_IP_MULTICAST
+		struct ip_sf_list *psf;
+		struct net *net = dev_net(pmc->interface->dev);
+		in_dev = pmc->interface;
+#endif
+
+		/* filter mode change */
+		if (pmc->sfcount[MCAST_EXCLUDE])
+			pmc->sfmode = MCAST_EXCLUDE;
+		else if (pmc->sfcount[MCAST_INCLUDE])
+			pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+		/* else no filters; keep old mode for reports */
+
+		pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+		WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount);
+		for (psf = pmc->sources; psf; psf = psf->sf_next)
+			psf->sf_crcount = 0;
+		igmp_ifc_event(in_dev);
+	} else if (sf_setstate(pmc)) {
+		igmp_ifc_event(in_dev);
+#endif
+	}
+	spin_unlock_bh(&pmc->lock);
+	return err;
+}
+
+static void ip_mc_clear_src(struct ip_mc_list *pmc)
+{	/* detach both source lists and reset the filter to (EXCLUDE, count 1); the lists are freed outside the lock */
+	struct ip_sf_list *tomb, *sources;
+
+	spin_lock_bh(&pmc->lock);
+	tomb = pmc->tomb;
+	pmc->tomb = NULL;
+	sources = pmc->sources;
+	pmc->sources = NULL;
+	pmc->sfmode = MCAST_EXCLUDE;
+	pmc->sfcount[MCAST_INCLUDE] = 0;
+	pmc->sfcount[MCAST_EXCLUDE] = 1;
+	spin_unlock_bh(&pmc->lock);
+
+	ip_sf_list_clear_all(tomb);
+	ip_sf_list_clear_all(sources);
+}
+
+/* Join a multicast group: record a per-socket membership with filter @mode and reference the group on the device
+ */
+static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
+			      unsigned int mode)
+{
+	__be32 addr = imr->imr_multiaddr.s_addr;
+	struct ip_mc_socklist *iml, *i;
+	struct in_device *in_dev;
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	int ifindex;
+	int count = 0;
+	int err;
+
+	ASSERT_RTNL();
+
+	if (!ipv4_is_multicast(addr))
+		return -EINVAL;
+
+	in_dev = ip_mc_find_dev(net, imr);
+
+	if (!in_dev) {
+		err = -ENODEV;
+		goto done;
+	}
+
+	err = -EADDRINUSE;
+	ifindex = imr->imr_ifindex;	/* ip_mc_find_dev() may have filled this in */
+	for_each_pmc_rtnl(inet, i) {
+		if (i->multi.imr_multiaddr.s_addr == addr &&
+		    i->multi.imr_ifindex == ifindex)
+			goto done;	/* socket already has this (group, ifindex): -EADDRINUSE */
+		count++;
+	}
+	err = -ENOBUFS;
+	if (count >= READ_ONCE(net->ipv4.sysctl_igmp_max_memberships))
+		goto done;
+	iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
+	if (!iml)
+		goto done;
+
+	memcpy(&iml->multi, imr, sizeof(*imr));
+	iml->next_rcu = inet->mc_list;
+	iml->sflist = NULL;
+	iml->sfmode = mode;
+	rcu_assign_pointer(inet->mc_list, iml);	/* publish only after the entry is fully initialised */
+	____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL);
+	err = 0;
+done:
+	return err;
+}
+
+/* Join ASM (Any-Source Multicast) group: __ip_mc_join_group() with mode fixed to MCAST_EXCLUDE
+ */
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+{
+	return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE);
+}
+EXPORT_SYMBOL(ip_mc_join_group);
+
+/* Join SSM (Source-Specific Multicast) group: __ip_mc_join_group() with the caller-supplied filter @mode
+ */
+int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr,
+			 unsigned int mode)
+{
+	return __ip_mc_join_group(sk, imr, mode);
+}
+
+static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
+			   struct in_device *in_dev)
+{	/* remove this membership's source filter from @in_dev and release the socket's filter list, if any */
+	struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
+	int err;
+
+	if (!psf) {
+		/* any-source empty exclude case */
+		return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
+			iml->sfmode, 0, NULL, 0);
+	}
+	err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
+			iml->sfmode, psf->sl_count, psf->sl_addr, 0);
+	RCU_INIT_POINTER(iml->sflist, NULL);
+	/* decrease mem now to avoid the memleak warning */
+	atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc);
+	kfree_rcu(psf, rcu);
+	return err;
+}
+
+int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+{	/* drop the first membership of @sk matching @imr; 0, -ENODEV, or -EADDRNOTAVAIL if none matches */
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_mc_socklist *iml;
+	struct ip_mc_socklist __rcu **imlp;
+	struct in_device *in_dev;
+	struct net *net = sock_net(sk);
+	__be32 group = imr->imr_multiaddr.s_addr;
+	u32 ifindex;
+	int ret = -EADDRNOTAVAIL;
+
+	ASSERT_RTNL();
+
+	in_dev = ip_mc_find_dev(net, imr);
+	if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) {
+		ret = -ENODEV;
+		goto out;
+	}
+	ifindex = imr->imr_ifindex;
+	for (imlp = &inet->mc_list;
+	     (iml = rtnl_dereference(*imlp)) != NULL;
+	     imlp = &iml->next_rcu) {
+		if (iml->multi.imr_multiaddr.s_addr != group)
+			continue;
+		if (ifindex) {
+			if (iml->multi.imr_ifindex != ifindex)
+				continue;
+		} else if (imr->imr_address.s_addr && imr->imr_address.s_addr !=
+				iml->multi.imr_address.s_addr)
+			continue;
+
+		(void) ip_mc_leave_src(sk, iml, in_dev);
+
+		*imlp = iml->next_rcu;	/* unlink from the socket's list */
+
+		if (in_dev)
+			ip_mc_dec_group(in_dev, group);
+
+		/* decrease mem now to avoid the memleak warning */
+		atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+		kfree_rcu(iml, rcu);
+		return 0;
+	}
+out:
+	return ret;
+}
+EXPORT_SYMBOL(ip_mc_leave_group);
+
+int ip_mc_source(int add, int omode, struct sock *sk, struct
+	ip_mreq_source *mreqs, int ifindex)
+{	/* add (@add != 0) or remove one source in an existing membership's filter, in filter mode @omode */
+	int err;
+	struct ip_mreqn imr;
+	__be32 addr = mreqs->imr_multiaddr;
+	struct ip_mc_socklist *pmc;
+	struct in_device *in_dev = NULL;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_sf_socklist *psl;
+	struct net *net = sock_net(sk);
+	int leavegroup = 0;
+	int i, j, rv;
+
+	if (!ipv4_is_multicast(addr))
+		return -EINVAL;
+
+	ASSERT_RTNL();
+
+	imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
+	imr.imr_address.s_addr = mreqs->imr_interface;
+	imr.imr_ifindex = ifindex;
+	in_dev = ip_mc_find_dev(net, &imr);
+
+	if (!in_dev) {
+		err = -ENODEV;
+		goto done;
+	}
+	err = -EADDRNOTAVAIL;	/* default for the "goto done" paths below that do not set err */
+
+	for_each_pmc_rtnl(inet, pmc) {
+		if ((pmc->multi.imr_multiaddr.s_addr ==
+		     imr.imr_multiaddr.s_addr) &&
+		    (pmc->multi.imr_ifindex == imr.imr_ifindex))
+			break;
+	}
+	if (!pmc) {		/* must have a prior join */
+		err = -EINVAL;
+		goto done;
+	}
+	/* if a source filter was set, must be the same mode as before */
+	if (pmc->sflist) {
+		if (pmc->sfmode != omode) {
+			err = -EINVAL;
+			goto done;
+		}
+	} else if (pmc->sfmode != omode) {
+		/* allow mode switches for empty-set filters */
+		ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
+		ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
+			NULL, 0);
+		pmc->sfmode = omode;
+	}
+
+	psl = rtnl_dereference(pmc->sflist);
+	if (!add) {
+		if (!psl)
+			goto done;	/* err = -EADDRNOTAVAIL */
+		rv = !0;
+		for (i = 0; i < psl->sl_count; i++) {
+			rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+				sizeof(__be32));
+			if (rv == 0)
+				break;
+		}
+		if (rv)		/* source not found */
+			goto done;	/* err = -EADDRNOTAVAIL */
+
+		/* special case - (INCLUDE, empty) == LEAVE_GROUP */
+		if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+			leavegroup = 1;
+			goto done;
+		}
+
+		/* update the interface filter */
+		ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+			&mreqs->imr_sourceaddr, 1);
+
+		for (j = i+1; j < psl->sl_count; j++)	/* close the gap left at index i */
+			psl->sl_addr[j-1] = psl->sl_addr[j];
+		psl->sl_count--;
+		err = 0;
+		goto done;
+	}
+	/* else, add a new source to the filter */
+
+	if (psl && psl->sl_count >= READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) {
+		err = -ENOBUFS;
+		goto done;
+	}
+	if (!psl || psl->sl_count == psl->sl_max) {	/* no list yet, or list full: grow by IP_SFBLOCK slots */
+		struct ip_sf_socklist *newpsl;
+		int count = IP_SFBLOCK;
+
+		if (psl)
+			count += psl->sl_max;
+		newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count),
+				      GFP_KERNEL);
+		if (!newpsl) {
+			err = -ENOBUFS;
+			goto done;
+		}
+		newpsl->sl_max = count;
+		newpsl->sl_count = count - IP_SFBLOCK;
+		if (psl) {
+			for (i = 0; i < psl->sl_count; i++)
+				newpsl->sl_addr[i] = psl->sl_addr[i];
+			/* decrease mem now to avoid the memleak warning */
+			atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+				   &sk->sk_omem_alloc);
+		}
+		rcu_assign_pointer(pmc->sflist, newpsl);
+		if (psl)
+			kfree_rcu(psl, rcu);
+		psl = newpsl;
+	}
+	rv = 1;	/* > 0 for insert logic below if sl_count is 0 */
+	for (i = 0; i < psl->sl_count; i++) {
+		rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+			sizeof(__be32));
+		if (rv == 0)
+			break;
+	}
+	if (rv == 0)		/* address already there is an error */
+		goto done;
+	for (j = psl->sl_count-1; j >= i; j--)	/* i == sl_count here, so the new source lands at the end */
+		psl->sl_addr[j+1] = psl->sl_addr[j];
+	psl->sl_addr[i] = mreqs->imr_sourceaddr;
+	psl->sl_count++;
+	err = 0;
+	/* update the interface list */
+	ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+		&mreqs->imr_sourceaddr, 1);
+done:
+	if (leavegroup)
+		err = ip_mc_leave_group(sk, &imr);
+	return err;
+}
+
+int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
+{	/* replace the whole source filter of an existing membership with the mode and list given in @msf */
+	int err = 0;
+	struct ip_mreqn	imr;
+	__be32 addr = msf->imsf_multiaddr;
+	struct ip_mc_socklist *pmc;
+	struct in_device *in_dev;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_sf_socklist *newpsl, *psl;
+	struct net *net = sock_net(sk);
+	int leavegroup = 0;
+
+	if (!ipv4_is_multicast(addr))
+		return -EINVAL;
+	if (msf->imsf_fmode != MCAST_INCLUDE &&
+	    msf->imsf_fmode != MCAST_EXCLUDE)
+		return -EINVAL;
+
+	ASSERT_RTNL();
+
+	imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+	imr.imr_address.s_addr = msf->imsf_interface;
+	imr.imr_ifindex = ifindex;
+	in_dev = ip_mc_find_dev(net, &imr);
+
+	if (!in_dev) {
+		err = -ENODEV;
+		goto done;
+	}
+
+	/* special case - (INCLUDE, empty) == LEAVE_GROUP */
+	if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
+		leavegroup = 1;
+		goto done;
+	}
+
+	for_each_pmc_rtnl(inet, pmc) {
+		if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+		    pmc->multi.imr_ifindex == imr.imr_ifindex)
+			break;
+	}
+	if (!pmc) {		/* must have a prior join */
+		err = -EINVAL;
+		goto done;
+	}
+	if (msf->imsf_numsrc) {	/* install the new filter on the interface first ... */
+		newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr,
+						      msf->imsf_numsrc),
+				      GFP_KERNEL);
+		if (!newpsl) {
+			err = -ENOBUFS;
+			goto done;
+		}
+		newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
+		memcpy(newpsl->sl_addr, msf->imsf_slist_flex,
+		       flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc));
+		err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+			msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
+		if (err) {
+			sock_kfree_s(sk, newpsl,
+				     struct_size(newpsl, sl_addr,
+						 newpsl->sl_max));
+			goto done;
+		}
+	} else {
+		newpsl = NULL;
+		(void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+				     msf->imsf_fmode, 0, NULL, 0);
+	}
+	psl = rtnl_dereference(pmc->sflist);	/* ... then remove the previous filter from the interface */
+	if (psl) {
+		(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+			psl->sl_count, psl->sl_addr, 0);
+		/* decrease mem now to avoid the memleak warning */
+		atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+			   &sk->sk_omem_alloc);
+	} else {
+		(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+			0, NULL, 0);
+	}
+	rcu_assign_pointer(pmc->sflist, newpsl);
+	if (psl)
+		kfree_rcu(psl, rcu);
+	pmc->sfmode = msf->imsf_fmode;
+	err = 0;
+done:
+	if (leavegroup)
+		err = ip_mc_leave_group(sk, &imr);
+	return err;
+}
+int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
+		 sockptr_t optval, sockptr_t optlen)
+{	/* copy a membership's filter mode and up to msf->imsf_numsrc sources out through @optval/@optlen */
+	int err, len, count, copycount, msf_size;
+	struct ip_mreqn	imr;
+	__be32 addr = msf->imsf_multiaddr;
+	struct ip_mc_socklist *pmc;
+	struct in_device *in_dev;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_sf_socklist *psl;
+	struct net *net = sock_net(sk);
+
+	ASSERT_RTNL();
+
+	if (!ipv4_is_multicast(addr))
+		return -EINVAL;
+
+	imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+	imr.imr_address.s_addr = msf->imsf_interface;
+	imr.imr_ifindex = 0;
+	in_dev = ip_mc_find_dev(net, &imr);
+
+	if (!in_dev) {
+		err = -ENODEV;
+		goto done;
+	}
+	err = -EADDRNOTAVAIL;
+
+	for_each_pmc_rtnl(inet, pmc) {
+		if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+		    pmc->multi.imr_ifindex == imr.imr_ifindex)
+			break;
+	}
+	if (!pmc)		/* must have a prior join */
+		goto done;
+	msf->imsf_fmode = pmc->sfmode;
+	psl = rtnl_dereference(pmc->sflist);
+	if (!psl) {
+		count = 0;
+	} else {
+		count = psl->sl_count;
+	}
+	copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;	/* never copy more than the caller asked for */
+	len = flex_array_size(psl, sl_addr, copycount);
+	msf->imsf_numsrc = count;	/* report the full source count even when fewer are copied */
+	msf_size = IP_MSFILTER_SIZE(copycount);
+	if (copy_to_sockptr(optlen, &msf_size, sizeof(int)) ||
+	    copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0))) {
+		return -EFAULT;
+	}
+	if (len &&
+	    copy_to_sockptr_offset(optval,
+				   offsetof(struct ip_msfilter, imsf_slist_flex),
+				   psl->sl_addr, len))
+		return -EFAULT;
+	return 0;
+done:
+	return err;
+}
+
+int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
+		 sockptr_t optval, size_t ss_offset)
+{	/* fill @gsf with mode and source count; write up to gf_numsrc sources to @optval starting at @ss_offset */
+	int i, count, copycount;
+	struct sockaddr_in *psin;
+	__be32 addr;
+	struct ip_mc_socklist *pmc;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_sf_socklist *psl;
+
+	ASSERT_RTNL();
+
+	psin = (struct sockaddr_in *)&gsf->gf_group;
+	if (psin->sin_family != AF_INET)
+		return -EINVAL;
+	addr = psin->sin_addr.s_addr;
+	if (!ipv4_is_multicast(addr))
+		return -EINVAL;
+
+	for_each_pmc_rtnl(inet, pmc) {
+		if (pmc->multi.imr_multiaddr.s_addr == addr &&
+		    pmc->multi.imr_ifindex == gsf->gf_interface)
+			break;
+	}
+	if (!pmc)		/* must have a prior join */
+		return -EADDRNOTAVAIL;
+	gsf->gf_fmode = pmc->sfmode;
+	psl = rtnl_dereference(pmc->sflist);
+	count = psl ? psl->sl_count : 0;
+	copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
+	gsf->gf_numsrc = count;	/* report the full source count even when fewer are copied */
+	for (i = 0; i < copycount; i++) {
+		struct sockaddr_storage ss;
+
+		psin = (struct sockaddr_in *)&ss;
+		memset(&ss, 0, sizeof(ss));	/* whole sockaddr_storage is zeroed before it is copied out */
+		psin->sin_family = AF_INET;
+		psin->sin_addr.s_addr = psl->sl_addr[i];
+		if (copy_to_sockptr_offset(optval, ss_offset,
+					   &ss, sizeof(ss)))
+			return -EFAULT;
+		ss_offset += sizeof(ss);
+	}
+	return 0;
+}
+
+/*
+ * check if a multicast source filter allows delivery for a given <src,dst,intf>; returns non-zero to allow
+ */
+int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
+		   int dif, int sdif)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	struct ip_mc_socklist *pmc;
+	struct ip_sf_socklist *psl;
+	int i;
+	int ret;
+
+	ret = 1;	/* non-multicast destinations are always allowed */
+	if (!ipv4_is_multicast(loc_addr))
+		goto out;
+
+	rcu_read_lock();
+	for_each_pmc_rcu(inet, pmc) {
+		if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
+		    (pmc->multi.imr_ifindex == dif ||
+		     (sdif && pmc->multi.imr_ifindex == sdif)))
+			break;
+	}
+	ret = inet_test_bit(MC_ALL, sk);	/* result when the socket has no matching membership */
+	if (!pmc)
+		goto unlock;
+	psl = rcu_dereference(pmc->sflist);
+	ret = (pmc->sfmode == MCAST_EXCLUDE);	/* result when the membership has no source list */
+	if (!psl)
+		goto unlock;
+
+	for (i = 0; i < psl->sl_count; i++) {
+		if (psl->sl_addr[i] == rmt_addr)
+			break;
+	}
+	ret = 0;	/* i < sl_count below means @rmt_addr is in the list */
+	if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
+		goto unlock;
+	if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
+		goto unlock;
+	ret = 1;
+unlock:
+	rcu_read_unlock();
+out:
+	return ret;
+}
+
+/*
+ *	A socket is closing: release every membership still on its mc_list (takes RTNL itself).
+ */
+
+void ip_mc_drop_socket(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_mc_socklist *iml;
+	struct net *net = sock_net(sk);
+
+	if (!inet->mc_list)	/* nothing joined: return without taking RTNL */
+		return;
+
+	rtnl_lock();
+	while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
+		struct in_device *in_dev;
+
+		inet->mc_list = iml->next_rcu;
+		in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
+		(void) ip_mc_leave_src(sk, iml, in_dev);
+		if (in_dev)
+			ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
+		/* decrease mem now to avoid the memleak warning */
+		atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+		kfree_rcu(iml, rcu);
+	}
+	rtnl_unlock();
+}
+
+/* called with rcu_read_lock(); returns 1 if @in_dev has joined @mc_addr and its filter admits @src_addr, else 0 */
+int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto)
+{
+	struct ip_mc_list *im;
+	struct ip_mc_list __rcu **mc_hash;
+	struct ip_sf_list *psf;
+	int rv = 0;
+
+	mc_hash = rcu_dereference(in_dev->mc_hash);
+	if (mc_hash) {	/* use the hash table when the device has one, else walk the list */
+		u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG);
+
+		for (im = rcu_dereference(mc_hash[hash]);
+		     im != NULL;
+		     im = rcu_dereference(im->next_hash)) {
+			if (im->multiaddr == mc_addr)
+				break;
+		}
+	} else {
+		for_each_pmc_rcu(in_dev, im) {
+			if (im->multiaddr == mc_addr)
+				break;
+		}
+	}
+	if (im && proto == IPPROTO_IGMP) {	/* IGMP packets for a joined group skip the source filter */
+		rv = 1;
+	} else if (im) {
+		if (src_addr) {
+			spin_lock_bh(&im->lock);
+			for (psf = im->sources; psf; psf = psf->sf_next) {
+				if (psf->sf_inaddr == src_addr)
+					break;
+			}
+			if (psf)
+				rv = psf->sf_count[MCAST_INCLUDE] ||
+					psf->sf_count[MCAST_EXCLUDE] !=
+					im->sfcount[MCAST_EXCLUDE];
+			else
+				rv = im->sfcount[MCAST_EXCLUDE] != 0;	/* unlisted source: allowed only if the group has an EXCLUDE count */
+			spin_unlock_bh(&im->lock);
+		} else
+			rv = 1; /* unspecified source; tentatively allow */
+	}
+	return rv;
+}
+
+#if defined(CONFIG_PROC_FS)
+struct igmp_mc_iter_state {	/* seq_file private data; size is passed to proc_create_net() for "igmp" */
+	struct seq_net_private p;
+	struct net_device *dev;		/* device the iterator is currently on */
+	struct in_device *in_dev;	/* in_device of @dev whose mc_list is being walked, or NULL */
+};
+
+#define	igmp_mc_seq_private(seq)	((struct igmp_mc_iter_state *)(seq)->private)
+
+static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
+{	/* return the head group of the first device that has a non-empty mc_list, recording it in the iterator state */
+	struct net *net = seq_file_net(seq);
+	struct ip_mc_list *im = NULL;
+	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+	state->in_dev = NULL;
+	for_each_netdev_rcu(net, state->dev) {
+		struct in_device *in_dev;
+
+		in_dev = __in_dev_get_rcu(state->dev);
+		if (!in_dev)
+			continue;
+		im = rcu_dereference(in_dev->mc_list);
+		if (im) {
+			state->in_dev = in_dev;
+			break;
+		}
+	}
+	return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
+{	/* next group on the current device, else the head group of a following device; NULL when devices run out */
+	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+	im = rcu_dereference(im->next_rcu);
+	while (!im) {
+		state->dev = next_net_device_rcu(state->dev);
+		if (!state->dev) {
+			state->in_dev = NULL;
+			break;
+		}
+		state->in_dev = __in_dev_get_rcu(state->dev);
+		if (!state->in_dev)
+			continue;
+		im = rcu_dereference(state->in_dev->mc_list);
+	}
+	return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct ip_mc_list *cur = igmp_mc_get_first(seq);
+
+	for (; cur && pos; pos--)	/* step @pos entries past the first, or stop when the walk ends */
+		cur = igmp_mc_get_next(seq, cur);
+	return cur;
+}
+
+static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu)
+{	/* enters the RCU read section that igmp_mc_seq_stop() leaves */
+	rcu_read_lock();
+	return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;	/* position 0 is the header line */
+}
+
+static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_mc_list *nxt;
+
+	/* the header token is followed by the first real entry */
+	nxt = (v == SEQ_START_TOKEN) ? igmp_mc_get_first(seq)
+				     : igmp_mc_get_next(seq, v);
+	(*pos)++;
+	return nxt;
+}
+
+static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
+	__releases(rcu)
+{	/* clear the iterator's device pointers, then leave the RCU read section entered in igmp_mc_seq_start() */
+	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+	state->in_dev = NULL;
+	state->dev = NULL;
+	rcu_read_unlock();
+}
+
+static int igmp_mc_seq_show(struct seq_file *seq, void *v)
+{	/* emit the column header for the start token, otherwise one line for group @v */
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq,
+			 "Idx\tDevice    : Count Querier\tGroup    Users Timer\tReporter\n");
+	else {
+		struct ip_mc_list *im = v;
+		struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+		char   *querier;
+		long delta;
+
+#ifdef CONFIG_IP_MULTICAST
+		querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
+			  IGMP_V2_SEEN(state->in_dev) ? "V2" :
+			  "V3";
+#else
+		querier = "NONE";
+#endif
+
+		if (rcu_access_pointer(state->in_dev->mc_list) == im) {	/* device summary line, printed once before its head group */
+			seq_printf(seq, "%d\t%-10s: %5d %7s\n",
+				   state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
+		}
+
+		delta = im->timer.expires - jiffies;
+		seq_printf(seq,
+			   "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
+			   im->multiaddr, im->users,
+			   im->tm_running,
+			   im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
+			   im->reporter);
+	}
+	return 0;
+}
+
+static const struct seq_operations igmp_mc_seq_ops = {	/* iterator for the "igmp" proc entry (see igmp_net_init) */
+	.start	=	igmp_mc_seq_start,
+	.next	=	igmp_mc_seq_next,
+	.stop	=	igmp_mc_seq_stop,
+	.show	=	igmp_mc_seq_show,
+};
+
+struct igmp_mcf_iter_state {	/* seq_file private data; size is passed to proc_create_net() for "mcfilter" */
+	struct seq_net_private p;
+	struct net_device *dev;		/* device the iterator is currently on */
+	struct in_device *idev;		/* in_device of @dev, or NULL */
+	struct ip_mc_list *im;		/* group whose ->lock the iterator currently holds, or NULL */
+};
+
+#define igmp_mcf_seq_private(seq)	((struct igmp_mcf_iter_state *)(seq)->private)
+
+static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
+{	/* find the first source entry; on success returns with that group's im->lock still held */
+	struct net *net = seq_file_net(seq);
+	struct ip_sf_list *psf = NULL;
+	struct ip_mc_list *im = NULL;
+	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+	state->idev = NULL;
+	state->im = NULL;
+	for_each_netdev_rcu(net, state->dev) {
+		struct in_device *idev;
+		idev = __in_dev_get_rcu(state->dev);
+		if (unlikely(!idev))
+			continue;
+		im = rcu_dereference(idev->mc_list);	/* NOTE: only the head group of each device is examined here */
+		if (likely(im)) {
+			spin_lock_bh(&im->lock);
+			psf = im->sources;
+			if (likely(psf)) {
+				state->im = im;	/* lock stays held; released by get_next or seq_stop */
+				state->idev = idev;
+				break;
+			}
+			spin_unlock_bh(&im->lock);
+		}
+	}
+	return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf)
+{	/* advance to the next source, moving on to following groups and devices as each list is exhausted */
+	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+	psf = psf->sf_next;
+	while (!psf) {
+		spin_unlock_bh(&state->im->lock);	/* done with this group: release its lock before moving on */
+		state->im = state->im->next;
+		while (!state->im) {
+			state->dev = next_net_device_rcu(state->dev);
+			if (!state->dev) {
+				state->idev = NULL;
+				goto out;	/* no devices left; state->im is NULL so seq_stop will not unlock */
+			}
+			state->idev = __in_dev_get_rcu(state->dev);
+			if (!state->idev)
+				continue;
+			state->im = rcu_dereference(state->idev->mc_list);
+		}
+		if (!state->im)
+			break;
+		spin_lock_bh(&state->im->lock);
+		psf = state->im->sources;
+	}
+out:
+	return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct ip_sf_list *cur = igmp_mcf_get_first(seq);
+
+	for (; cur && pos; pos--)	/* step @pos entries past the first, or stop when the walk ends */
+		cur = igmp_mcf_get_next(seq, cur);
+	return cur;
+}
+
+static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu)
+{	/* enters the RCU read section that igmp_mcf_seq_stop() leaves */
+	rcu_read_lock();
+	return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;	/* position 0 is the header line */
+}
+
+static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_sf_list *nxt;
+
+	/* the header token is followed by the first real entry */
+	nxt = (v == SEQ_START_TOKEN) ? igmp_mcf_get_first(seq)
+				     : igmp_mcf_get_next(seq, v);
+	(*pos)++;
+	return nxt;
+}
+
+static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
+	__releases(rcu)
+{	/* release the group lock the iterator may still hold, reset the state, leave the RCU read section */
+	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+	if (likely(state->im)) {
+		spin_unlock_bh(&state->im->lock);
+		state->im = NULL;
+	}
+	state->idev = NULL;
+	state->dev = NULL;
+	rcu_read_unlock();
+}
+
+static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
+{	/* emit the column header for the start token, otherwise one line for source entry @v */
+	struct ip_sf_list *psf = v;
+	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "Idx Device        MCA        SRC    INC    EXC\n");
+	} else {
+		seq_printf(seq,
+			   "%3d %6.6s 0x%08x "
+			   "0x%08x %6lu %6lu\n",
+			   state->dev->ifindex, state->dev->name,
+			   ntohl(state->im->multiaddr),
+			   ntohl(psf->sf_inaddr),
+			   psf->sf_count[MCAST_INCLUDE],
+			   psf->sf_count[MCAST_EXCLUDE]);
+	}
+	return 0;
+}
+
+static const struct seq_operations igmp_mcf_seq_ops = {	/* iterator for the "mcfilter" proc entry (see igmp_net_init) */
+	.start	=	igmp_mcf_seq_start,
+	.next	=	igmp_mcf_seq_next,
+	.stop	=	igmp_mcf_seq_stop,
+	.show	=	igmp_mcf_seq_show,
+};
+
+static int __net_init igmp_net_init(struct net *net)
+{
+	/* Per-netns setup: "igmp" and "mcfilter" proc entries plus the autojoin control socket; 0 or -errno. */
+	struct proc_dir_entry *pde;
+	int err = -ENOMEM;	/* reported when a proc entry cannot be created */
+
+	pde = proc_create_net("igmp", 0444, net->proc_net, &igmp_mc_seq_ops,
+			sizeof(struct igmp_mc_iter_state));
+	if (!pde)
+		goto out_igmp;
+	pde = proc_create_net("mcfilter", 0444, net->proc_net,
+			&igmp_mcf_seq_ops, sizeof(struct igmp_mcf_iter_state));
+	if (!pde)
+		goto out_mcfilter;
+	err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET,
+				   SOCK_DGRAM, 0, net);
+	if (err < 0) {
+		pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n",
+		       err);
+		goto out_sock;
+	}
+	return 0;
+
+out_sock:
+	remove_proc_entry("mcfilter", net->proc_net);
+out_mcfilter:
+	remove_proc_entry("igmp", net->proc_net);
+out_igmp:
+	return err;	/* the socket-creation error as returned, or -ENOMEM for proc failures */
+}
+
+static void __net_exit igmp_net_exit(struct net *net)
+{	/* per-netns teardown of what igmp_net_init() created: both proc entries and the autojoin socket */
+	remove_proc_entry("mcfilter", net->proc_net);
+	remove_proc_entry("igmp", net->proc_net);
+	inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk);
+}
+
+static struct pernet_operations igmp_net_ops = {	/* registered by igmp_mc_init(); only built with CONFIG_PROC_FS */
+	.init = igmp_net_init,
+	.exit = igmp_net_exit,
+};
+#endif
+
+static int igmp_netdev_event(struct notifier_block *this,
+			     unsigned long event, void *ptr)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+	struct in_device *idev;
+
+	/* Only a resend request is acted on; every other event is ignored. */
+	if (event != NETDEV_RESEND_IGMP)
+		return NOTIFY_DONE;
+
+	/* Nothing to rejoin when no in_device is found for the device. */
+	idev = __in_dev_get_rtnl(netdev);
+	if (idev)
+		ip_mc_rejoin_groups(idev);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block igmp_notifier = {	/* registered as a netdevice notifier by igmp_mc_init() */
+	.notifier_call = igmp_netdev_event,
+};
+
+int __init igmp_mc_init(void)
+{
+#if defined(CONFIG_PROC_FS)
+	int rc = register_pernet_subsys(&igmp_net_ops);
+
+	if (rc)
+		return rc;
+
+	/* pernet ops are registered; now hook the netdevice notifier */
+	rc = register_netdevice_notifier(&igmp_notifier);
+	if (rc) {
+		/* undo the pernet registration before reporting the failure */
+		unregister_pernet_subsys(&igmp_net_ops);
+	}
+
+	return rc;
+#else
+	return register_netdevice_notifier(&igmp_notifier);
+#endif
+}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 0000000000..762817d6c8
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,1501 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Support for INET connection oriented protocols.
+ *
+ * Authors:	See the TCP sources
+ */
+
+#include <linux/module.h>
+#include <linux/jhash.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+#include <net/tcp.h>
+#include <net/sock_reuseport.h>
+#include <net/addrconf.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* Do the receive addresses of sk1 (the IPv6 side) and sk2 overlap?
+ * A NULL sk2_rcv_saddr6 makes sk2 count as a v4-mapped address.
+ * match_sk*_wildcard == true:  that side's IPV6_ADDR_ANY also matches any
+ *				IPv6 address and, unless that side is
+ *				IPv6-only, any v4-mapped address
+ * match_sk*_wildcard == false: ANY only equals ANY, 0.0.0.0 only 0.0.0.0
+ */
+static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
+				 const struct in6_addr *sk2_rcv_saddr6,
+				 __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+				 bool sk1_ipv6only, bool sk2_ipv6only,
+				 bool match_sk1_wildcard,
+				 bool match_sk2_wildcard)
+{
+	int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
+	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
+
+	/* if both are mapped, treat as IPv4 */
+	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
+		if (!sk2_ipv6only) {
+			if (sk1_rcv_saddr == sk2_rcv_saddr)
+				return true;
+			return (match_sk1_wildcard && !sk1_rcv_saddr) ||
+				(match_sk2_wildcard && !sk2_rcv_saddr);
+		}
+		return false;
+	}
+
+	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
+		return true;
+
+	if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard &&
+	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
+		return true;
+
+	if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard &&
+	    !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
+		return true;
+
+	if (sk2_rcv_saddr6 &&
+	    ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
+		return true;
+
+	return false;
+}
+#endif
+
+/* Compare two IPv4 receive addresses; an IPv6-only sk2 never matches.
+ * match_sk*_wildcard == true:  0.0.0.0 on that side equals any IPv4 address
+ * match_sk*_wildcard == false: 0.0.0.0 on that side only equals 0.0.0.0
+ */
+static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+				 bool sk2_ipv6only, bool match_sk1_wildcard,
+				 bool match_sk2_wildcard)
+{
+	if (sk2_ipv6only)
+		return false;
+	if (sk1_rcv_saddr == sk2_rcv_saddr)
+		return true;
+	if (match_sk1_wildcard && !sk1_rcv_saddr)
+		return true;
+	return match_sk2_wildcard && !sk2_rcv_saddr;
+}
+
+/* Compare the bound addresses of @sk and @sk2; @match_wildcard applies to both. */
+bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+			  bool match_wildcard)
+{
+	bool sk2_v6only = ipv6_only_sock(sk2);
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
+					    inet6_rcv_saddr(sk2),
+					    sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
+					    ipv6_only_sock(sk), sk2_v6only,
+					    match_wildcard, match_wildcard);
+#endif
+	return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
+				    sk2_v6only, match_wildcard,
+				    match_wildcard);
+}
+EXPORT_SYMBOL(inet_rcv_saddr_equal);
+
+bool inet_rcv_saddr_any(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)	/* IPv6 socket: test the v6 address */
+		return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+#endif
+	return !sk->sk_rcv_saddr;	/* true when the IPv4 address is zero */
+}
+
+/* Snapshot the netns local port range into *low and *high under its seqlock. */
+void inet_get_local_port_range(const struct net *net, int *low, int *high)
+{
+	unsigned int start;
+
+	do {	/* retry until no writer raced with the two reads */
+		start = read_seqbegin(&net->ipv4.ip_local_ports.lock);
+		*low = net->ipv4.ip_local_ports.range[0];
+		*high = net->ipv4.ip_local_ports.range[1];
+	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, start));
+}
+EXPORT_SYMBOL(inet_get_local_port_range);
+
+/* Port range for @sk: the netns range, narrowed by each per-socket bound
+ * that falls inside the range computed so far.
+ */
+void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	int sk_lo = inet->local_port_range.lo;
+	int sk_hi = inet->local_port_range.hi;
+	int lo, hi;
+
+	inet_get_local_port_range(sock_net(sk), &lo, &hi);
+
+	if (unlikely(sk_lo >= lo && sk_lo <= hi))
+		lo = sk_lo;
+	if (unlikely(sk_hi >= lo && sk_hi <= hi))
+		hi = sk_hi;
+	*low = lo;
+	*high = hi;
+}
+EXPORT_SYMBOL(inet_sk_get_local_port_range);
+
+/* True unless @sk's address is a wildcard or (AF_INET6 only) v4-mapped. */
+static bool inet_use_bhash2_on_bind(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6) {
+		int type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
+
+		return !(type == IPV6_ADDR_ANY || type == IPV6_ADDR_MAPPED);
+	}
+#endif
+	return sk->sk_rcv_saddr != htonl(INADDR_ANY);
+}
+
+static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
+			       kuid_t sk_uid, bool relax,
+			       bool reuseport_cb_ok, bool reuseport_ok)
+{
+	int bound_dev_if2;
+
+	if (sk == sk2)	/* a socket never conflicts with itself */
+		return false;
+
+	bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
+	/* Only checked further when either side has no bound device or both match. */
+	if (!sk->sk_bound_dev_if || !bound_dev_if2 ||
+	    sk->sk_bound_dev_if == bound_dev_if2) {
+		if (sk->sk_reuse && sk2->sk_reuse &&
+		    sk2->sk_state != TCP_LISTEN) {
+			if (!relax || (!reuseport_ok && sk->sk_reuseport &&
+				       sk2->sk_reuseport && reuseport_cb_ok &&
+				       (sk2->sk_state == TCP_TIME_WAIT ||
+					uid_eq(sk_uid, sock_i_uid(sk2)))))
+				return true;
+		} else if (!reuseport_ok || !sk->sk_reuseport ||
+			   !sk2->sk_reuseport || !reuseport_cb_ok ||
+			   (sk2->sk_state != TCP_TIME_WAIT &&
+			    !uid_eq(sk_uid, sock_i_uid(sk2)))) {
+			return true;
+		}
+	}
+	return false;
+}
+
+/* Per-owner bhash2 check: family filter first, then the generic bind rules. */
+static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2,
+				   kuid_t sk_uid, bool relax,
+				   bool reuseport_cb_ok, bool reuseport_ok)
+{
+	/* An AF_INET socket never conflicts with an IPv6-only one. */
+	return !(sk->sk_family == AF_INET && ipv6_only_sock(sk2)) &&
+	       inet_bind_conflict(sk, sk2, sk_uid, relax,
+				  reuseport_cb_ok, reuseport_ok);
+}
+
+/* Does @sk conflict with any socket bound through @tb2?  Both the owners
+ * list and the timewait (deathrow) list are scanned.
+ */
+static bool inet_bhash2_conflict(const struct sock *sk,
+				 const struct inet_bind2_bucket *tb2,
+				 kuid_t sk_uid, bool relax,
+				 bool reuseport_cb_ok, bool reuseport_ok)
+{
+	struct inet_timewait_sock *tw;
+	struct sock *other;
+
+	sk_for_each_bound_bhash2(other, &tb2->owners)
+		if (__inet_bhash2_conflict(sk, other, sk_uid, relax,
+					   reuseport_cb_ok, reuseport_ok))
+			return true;
+
+	/* Timewait sockets are checked through their struct sock view. */
+	twsk_for_each_bound_bhash2(tw, &tb2->deathrow)
+		if (__inet_bhash2_conflict(sk, (struct sock *)tw, sk_uid,
+					   relax, reuseport_cb_ok,
+					   reuseport_ok))
+			return true;
+
+	return false;
+}
+
+/* Bind-conflict check for @sk; call only with the tb and tb2 hashbuckets' locks held */
+static int inet_csk_bind_conflict(const struct sock *sk,
+				  const struct inet_bind_bucket *tb,
+				  const struct inet_bind2_bucket *tb2, /* may be null */
+				  bool relax, bool reuseport_ok)
+{
+	bool reuseport_cb_ok;
+	struct sock_reuseport *reuseport_cb;
+	kuid_t uid = sock_i_uid((struct sock *)sk);
+
+	rcu_read_lock();
+	reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
+	/* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
+	reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
+	rcu_read_unlock();
+
+	/*
+	 * Unlike other sk lookup places we do not check
+	 * for sk_net here, since _all_ the socks listed
+	 * in tb->owners and tb2->owners list belong
+	 * to the same net - the one this bucket belongs to.
+	 */
+
+	if (!inet_use_bhash2_on_bind(sk)) {	/* wildcard or v4-mapped bind: scan tb */
+		struct sock *sk2;
+
+		sk_for_each_bound(sk2, &tb->owners)
+			if (inet_bind_conflict(sk, sk2, uid, relax,
+					       reuseport_cb_ok, reuseport_ok) &&
+			    inet_rcv_saddr_equal(sk, sk2, true))
+				return true;
+
+		return false;
+	}
+
+	/* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if
+	 * ipv4) should have been checked already. We need to do these two
+	 * checks separately because their spinlocks have to be acquired/released
+	 * independently of each other, to prevent possible deadlocks
+	 */
+	return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok,
+					   reuseport_ok);
+}
+
+/* Check @sk's bind on @port against the bhash2 bucket of the matching
+ * wildcard address (IPV6_ADDR_ANY if ipv6, INADDR_ANY if ipv4).
+ *
+ * Caller must hold bhash hashbucket lock with local bh disabled, to protect
+ * against concurrent binds on the port for addr any
+ */
+static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev,
+					  bool relax, bool reuseport_ok)
+{
+	kuid_t uid = sock_i_uid((struct sock *)sk);
+	const struct net *net = sock_net(sk);
+	struct inet_bind_hashbucket *any_head;
+	struct sock_reuseport *reuse;
+	struct inet_bind2_bucket *tb2;
+	bool reuseport_cb_ok;
+	bool conflict;
+
+	rcu_read_lock();
+	reuse = rcu_dereference(sk->sk_reuseport_cb);
+	/* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
+	reuseport_cb_ok = !reuse || READ_ONCE(reuse->num_closed_socks);
+	rcu_read_unlock();
+
+	any_head = inet_bhash2_addr_any_hashbucket(sk, net, port);
+
+	spin_lock(&any_head->lock);
+
+	inet_bind_bucket_for_each(tb2, &any_head->chain)
+		if (inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk))
+			break;
+
+	/* No matching bucket (tb2 == NULL after the walk) is treated as
+	 * "no conflict"; the lock is dropped on a single exit path.
+	 */
+	conflict = tb2 && inet_bhash2_conflict(sk, tb2, uid, relax,
+					       reuseport_cb_ok, reuseport_ok);
+	spin_unlock(&any_head->lock);
+	return conflict;
+}
+
+/*
+ * Find an open port number for the socket.  On success returns the bhash
+ * bucket head with its lock and *head2_ret's lock held; otherwise NULL.
+ */
+static struct inet_bind_hashbucket *
+inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret,
+			struct inet_bind2_bucket **tb2_ret,
+			struct inet_bind_hashbucket **head2_ret, int *port_ret)
+{
+	struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
+	int i, low, high, attempt_half, port, l3mdev;
+	struct inet_bind_hashbucket *head, *head2;
+	struct net *net = sock_net(sk);
+	struct inet_bind2_bucket *tb2;
+	struct inet_bind_bucket *tb;
+	u32 remaining, offset;
+	bool relax = false;
+
+	l3mdev = inet_sk_bound_l3mdev(sk);
+ports_exhausted:
+	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+other_half_scan:
+	inet_sk_get_local_port_range(sk, &low, &high);
+	high++; /* [32768, 60999] -> [32768, 61000[ */
+	if (high - low < 4)
+		attempt_half = 0;
+	if (attempt_half) {
+		int half = low + (((high - low) >> 2) << 1);
+
+		if (attempt_half == 1)
+			high = half;
+		else
+			low = half;
+	}
+	remaining = high - low;
+	if (likely(remaining > 1))
+		remaining &= ~1U;
+
+	offset = get_random_u32_below(remaining);
+	/* __inet_hash_connect() favors ports having @low parity
+	 * We do the opposite to not pollute connect() users.
+	 */
+	offset |= 1U;
+
+other_parity_scan:
+	port = low + offset;
+	for (i = 0; i < remaining; i += 2, port += 2) {
+		if (unlikely(port >= high))
+			port -= remaining;
+		if (inet_is_local_reserved_port(net, port))
+			continue;
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
+		if (inet_use_bhash2_on_bind(sk)) {
+			if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, relax, false))
+				goto next_port;
+		}
+
+		head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+		spin_lock(&head2->lock);
+		tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
+		inet_bind_bucket_for_each(tb, &head->chain)
+			if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
+				if (!inet_csk_bind_conflict(sk, tb, tb2,
+							    relax, false))
+					goto success;
+				spin_unlock(&head2->lock);
+				goto next_port;
+			}
+		tb = NULL;	/* no bhash bucket matched this port */
+		goto success;
+next_port:
+		spin_unlock_bh(&head->lock);
+		cond_resched();
+	}
+
+	offset--;	/* offset was odd; the second decrement makes it odd again */
+	if (!(offset & 1))
+		goto other_parity_scan;
+
+	if (attempt_half == 1) {
+		/* OK we now try the upper half of the range */
+		attempt_half = 2;
+		goto other_half_scan;
+	}
+
+	if (READ_ONCE(net->ipv4.sysctl_ip_autobind_reuse) && !relax) {
+		/* We still have a chance to connect to different destinations */
+		relax = true;
+		goto ports_exhausted;
+	}
+	return NULL;
+success:
+	*port_ret = port;
+	*tb_ret = tb;
+	*tb2_ret = tb2;
+	*head2_ret = head2;
+	return head;
+}
+
+/* Fast-path test: may @sk join @tb based on the cached reuseport state
+ * (tb->fastreuseport, fastuid, fast_*rcv_saddr)?  Returns non-zero when
+ * it may.
+ */
+static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
+				     struct sock *sk)
+{
+	kuid_t uid = sock_i_uid(sk);
+
+	if (tb->fastreuseport <= 0 || !sk->sk_reuseport ||
+	    rcu_access_pointer(sk->sk_reuseport_cb) ||
+	    !uid_eq(tb->fastuid, uid))
+		return 0;
+	/* We only need to check the rcv_saddr if this tb was once marked
+	 * without fastreuseport and then was reset, as we can only know that
+	 * the fast_*rcv_saddr doesn't have any conflicts with the socks on the
+	 * owners list.
+	 */
+	if (tb->fastreuseport == FASTREUSEPORT_ANY)
+		return 1;
+#if IS_ENABLED(CONFIG_IPV6)
+	if (tb->fast_sk_family == AF_INET6)
+		return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
+					    inet6_rcv_saddr(sk),
+					    tb->fast_rcv_saddr,
+					    sk->sk_rcv_saddr,
+					    tb->fast_ipv6_only,
+					    ipv6_only_sock(sk), true, false);
+#endif
+	return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
+				    ipv6_only_sock(sk), true, false);
+}
+
+/* Record @sk as the reference socket for @tb's reuseport fast path. */
+static void inet_csk_set_fastreuseport(struct inet_bind_bucket *tb,
+				       struct sock *sk, kuid_t uid, int mode)
+{
+	tb->fastreuseport = mode;
+	tb->fastuid = uid;
+	tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+	tb->fast_ipv6_only = ipv6_only_sock(sk);
+	tb->fast_sk_family = sk->sk_family;
+#if IS_ENABLED(CONFIG_IPV6)
+	tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+}
+
+/* Refresh @tb's fastreuse/fastreuseport hints for @sk: an empty bucket is
+ * seeded from @sk, a populated one is narrowed or re-seeded in strict mode.
+ */
+void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
+			       struct sock *sk)
+{
+	kuid_t uid = sock_i_uid(sk);
+	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+
+	if (hlist_empty(&tb->owners)) {
+		tb->fastreuse = reuse;
+		if (sk->sk_reuseport)
+			inet_csk_set_fastreuseport(tb, sk, uid,
+						   FASTREUSEPORT_ANY);
+		else
+			tb->fastreuseport = 0;
+		return;
+	}
+
+	if (!reuse)
+		tb->fastreuse = 0;
+	if (!sk->sk_reuseport) {
+		tb->fastreuseport = 0;
+		return;
+	}
+	/* We didn't match or we don't have fastreuseport set on the tb, but
+	 * we have sk_reuseport set on this socket and we know that there are
+	 * no bind conflicts with this socket in this tb, so reset our tb's
+	 * reuseport settings so that any subsequent sockets that match our
+	 * current socket will be put on the fast path.
+	 * If we reset we need to set FASTREUSEPORT_STRICT so we do extra
+	 * checking for all subsequent sk_reuseport socks.
+	 */
+	if (!sk_reuseport_match(tb, sk))
+		inet_csk_set_fastreuseport(tb, sk, uid, FASTREUSEPORT_STRICT);
+}
+
+/* Obtain a reference to a local port for the given sock; a zero snum
+ * means select any available local port, preferring odd ports (even
+ * ones are left for connect()).  Returns 0 or -EADDRINUSE.
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+	struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
+	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+	bool found_port = false, check_bind_conflict = true;
+	bool bhash_created = false, bhash2_created = false;
+	int ret = -EADDRINUSE, port = snum, l3mdev;
+	struct inet_bind_hashbucket *head, *head2;
+	struct inet_bind2_bucket *tb2 = NULL;
+	struct inet_bind_bucket *tb = NULL;
+	bool head2_lock_acquired = false;
+	struct net *net = sock_net(sk);
+
+	l3mdev = inet_sk_bound_l3mdev(sk);
+
+	if (!port) {
+		head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port);
+		if (!head)
+			return ret;
+
+		head2_lock_acquired = true;
+
+		if (tb && tb2)	/* both buckets already exist */
+			goto success;
+		found_port = true;
+	} else {
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
+		inet_bind_bucket_for_each(tb, &head->chain)
+			if (inet_bind_bucket_match(tb, net, port, l3mdev))
+				break;
+	}
+
+	if (!tb) {
+		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net,
+					     head, port, l3mdev);
+		if (!tb)
+			goto fail_unlock;
+		bhash_created = true;
+	}
+
+	if (!found_port) {
+		if (!hlist_empty(&tb->owners)) {
+			if (sk->sk_reuse == SK_FORCE_REUSE ||
+			    (tb->fastreuse > 0 && reuse) ||
+			    sk_reuseport_match(tb, sk))
+				check_bind_conflict = false;
+		}
+
+		if (check_bind_conflict && inet_use_bhash2_on_bind(sk)) {
+			if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, true, true))
+				goto fail_unlock;
+		}
+
+		head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+		spin_lock(&head2->lock);
+		head2_lock_acquired = true;
+		tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
+	}
+
+	if (!tb2) {
+		tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep,
+					       net, head2, port, l3mdev, sk);
+		if (!tb2)
+			goto fail_unlock;
+		bhash2_created = true;
+	}
+
+	if (!found_port && check_bind_conflict) {
+		if (inet_csk_bind_conflict(sk, tb, tb2, true, true))
+			goto fail_unlock;
+	}
+
+success:
+	inet_csk_update_fastreuse(tb, sk);
+
+	if (!inet_csk(sk)->icsk_bind_hash)
+		inet_bind_hash(sk, tb, tb2, port);
+	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
+	WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2);
+	ret = 0;
+
+fail_unlock:
+	if (ret) {	/* failure only: destroy buckets created by this call */
+		if (bhash_created)
+			inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
+		if (bhash2_created)
+			inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep,
+						  tb2);
+	}
+	if (head2_lock_acquired)
+		spin_unlock(&head2->lock);
+	spin_unlock_bh(&head->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(inet_csk_get_port);
+
+/*
+ * Sleep until the accept queue is non-empty. Called with the socket locked;
+ * returns 0, -EINVAL (not listening), -EAGAIN (timed out) or a signal errno.
+ */
+static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	DEFINE_WAIT(wait);
+	int err;
+
+	/*
+	 * True wake-one mechanism for incoming connections: only
+	 * one process gets woken up, not the 'whole herd'.
+	 * Since we do not 'race & poll' for established sockets
+	 * anymore, the common case will execute the loop only once.
+	 *
+	 * Subtle issue: "add_wait_queue_exclusive()" will be added
+	 * after any current non-exclusive waiters, and we know that
+	 * it will always _stay_ after any new non-exclusive waiters
+	 * because all non-exclusive waiters are added at the
+	 * beginning of the wait-queue. As such, it's ok to "drop"
+	 * our exclusiveness temporarily when we get woken up without
+	 * having to remove and re-insert us on the wait queue.
+	 */
+	for (;;) {
+		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+					  TASK_INTERRUPTIBLE);
+		release_sock(sk);
+		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
+			timeo = schedule_timeout(timeo);
+		sched_annotate_sleep();
+		lock_sock(sk);
+		err = 0;
+		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
+			break;
+		err = -EINVAL;
+		if (sk->sk_state != TCP_LISTEN)
+			break;
+		err = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			break;
+		err = -EAGAIN;
+		if (!timeo)
+			break;
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	return err;
+}
+
+/*
+ * Accept the next outstanding connection; on failure return NULL, set *err.
+ */
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+	struct request_sock *req;
+	struct sock *newsk;
+	int error;
+
+	lock_sock(sk);
+
+	/* We need to make sure that this socket is listening,
+	 * and that it has something pending.
+	 */
+	error = -EINVAL;
+	if (sk->sk_state != TCP_LISTEN)
+		goto out_err;
+
+	/* Find already established connection */
+	if (reqsk_queue_empty(queue)) {
+		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+		/* If this is a non blocking socket don't sleep */
+		error = -EAGAIN;
+		if (!timeo)
+			goto out_err;
+
+		error = inet_csk_wait_for_connect(sk, timeo);
+		if (error)
+			goto out_err;
+	}
+	req = reqsk_queue_remove(queue, sk);
+	newsk = req->sk;
+
+	if (sk->sk_protocol == IPPROTO_TCP &&
+	    tcp_rsk(req)->tfo_listener) {
+		spin_lock_bh(&queue->fastopenq.lock);
+		if (tcp_rsk(req)->tfo_listener) {
+			/* We are still waiting for the final ACK from 3WHS
+			 * so can't free req now. Instead, we set req->sk to
+			 * NULL to signify that the child socket is taken
+			 * so reqsk_fastopen_remove() will free the req
+			 * when 3WHS finishes (or is aborted).
+			 */
+			req->sk = NULL;
+			req = NULL;
+		}
+		spin_unlock_bh(&queue->fastopenq.lock);
+	}
+
+out:	/* also reached from out_err, with newsk and req both NULL */
+	release_sock(sk);
+	if (newsk && mem_cgroup_sockets_enabled) {
+		int amt = 0;
+
+		/* atomically get the memory usage, set and charge the
+		 * newsk->sk_memcg.
+		 */
+		lock_sock(newsk);
+
+		mem_cgroup_sk_alloc(newsk);
+		if (newsk->sk_memcg) {
+			/* The socket has not been accepted yet, no need
+			 * to look at newsk->sk_wmem_queued.
+			 */
+			amt = sk_mem_pages(newsk->sk_forward_alloc +
+					   atomic_read(&newsk->sk_rmem_alloc));
+		}
+
+		if (amt)
+			mem_cgroup_charge_skmem(newsk->sk_memcg, amt,
+						GFP_KERNEL | __GFP_NOFAIL);
+
+		release_sock(newsk);
+	}
+	if (req)
+		reqsk_put(req);
+
+	if (newsk)
+		inet_init_csk_locks(newsk);
+
+	return newsk;
+out_err:
+	newsk = NULL;
+	req = NULL;
+	*err = error;
+	goto out;
+}
+EXPORT_SYMBOL(inet_csk_accept);
+
+/* Set up the retransmit, delayed-ack and keepalive timers of @sk and clear
+ * the pending flags.  Three separate timers are used; a single timer over
+ * a list of expire jiffies may be a future optimization.
+ */
+void inet_csk_init_xmit_timers(struct sock *sk,
+			       void (*retransmit_handler)(struct timer_list *t),
+			       void (*delack_handler)(struct timer_list *t),
+			       void (*keepalive_handler)(struct timer_list *t))
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
+	timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
+	timer_setup(&sk->sk_timer, keepalive_handler, 0);
+	icsk->icsk_ack.pending = 0;
+	icsk->icsk_pending = 0;
+}
+EXPORT_SYMBOL(inet_csk_init_xmit_timers);
+
+/* Clear the pending flags, then stop the three timers set up at init time. */
+void inet_csk_clear_xmit_timers(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
+	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+	sk_stop_timer(sk, &icsk->icsk_delack_timer);
+	sk_stop_timer(sk, &sk->sk_timer);
+}
+EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
+
+void inet_csk_delete_keepalive_timer(struct sock *sk)
+{
+	sk_stop_timer(sk, &sk->sk_timer);	/* sk_timer is the keepalive timer */
+}
+EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
+
+void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);	/* len is in jiffies */
+}
+EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
+
+struct dst_entry *inet_csk_route_req(const struct sock *sk,
+				     struct flowi4 *fl4,
+				     const struct request_sock *req)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct net *net = read_pnet(&ireq->ireq_net);
+	struct ip_options_rcu *opt;
+	struct rtable *rt;
+
+	rcu_read_lock();
+	opt = rcu_dereference(ireq->ireq_opt);
+	/* If the saved IP options carry srr, opt.faddr replaces ir_rmt_addr. */
+	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
+			   ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
+			   sk->sk_protocol, inet_sk_flowi_flags(sk),
+			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+			   ireq->ir_loc_addr, ireq->ir_rmt_port,
+			   htons(ireq->ir_num), sk->sk_uid);
+	security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
+	rt = ip_route_output_flow(net, fl4, sk);
+	if (IS_ERR(rt))
+		goto no_route;
+	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+		goto route_err;	/* strict route but the route uses a gateway */
+	rcu_read_unlock();
+	return &rt->dst;
+
+route_err:
+	ip_rt_put(rt);
+no_route:
+	rcu_read_unlock();
+	__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+	return NULL;	/* both failure paths count as OUTNOROUTES */
+}
+EXPORT_SYMBOL_GPL(inet_csk_route_req);
+
+struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
+					    struct sock *newsk,
+					    const struct request_sock *req)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct net *net = read_pnet(&ireq->ireq_net);
+	struct inet_sock *newinet = inet_sk(newsk);
+	struct ip_options_rcu *opt;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+
+	/* NOTE(review): no rcu_read_lock() here; presumably callers hold it - confirm */
+	opt = rcu_dereference(ireq->ireq_opt);
+	fl4 = &newinet->cork.fl.u.ip4;	/* the flow is stored in the child socket */
+	/* If the saved IP options carry srr, opt.faddr replaces ir_rmt_addr. */
+	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
+			   ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
+			   sk->sk_protocol, inet_sk_flowi_flags(sk),
+			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+			   ireq->ir_loc_addr, ireq->ir_rmt_port,
+			   htons(ireq->ir_num), sk->sk_uid);
+	security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
+	rt = ip_route_output_flow(net, fl4, sk);
+	if (IS_ERR(rt))
+		goto no_route;
+	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+		goto route_err;	/* strict route but the route uses a gateway */
+	return &rt->dst;
+
+route_err:
+	ip_rt_put(rt);
+no_route:
+	__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
+
+/* Decide when to expire the request and when to resend SYN-ACK */
+static void syn_ack_recalc(struct request_sock *req,
+			   const int max_syn_ack_retries,
+			   const u8 rskq_defer_accept,
+			   int *expire, int *resend)
+{
+	const bool exhausted = req->num_timeout >= max_syn_ack_retries;
+	const bool acked = inet_rsk(req)->acked;
+
+	if (!rskq_defer_accept) {
+		*expire = exhausted;
+		*resend = 1;
+		return;
+	}
+	/* Do not resend while waiting for data after ACK; resume at the end of
+	 * the deferring period to give data or ACK a last chance.
+	 */
+	*expire = exhausted && (!acked || req->num_timeout >= rskq_defer_accept);
+	*resend = !acked || req->num_timeout >= rskq_defer_accept - 1;
+}
+
+int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
+{
+	int err = req->rsk_ops->rtx_syn_ack(parent, req);	/* protocol hook */
+
+	if (!err)
+		req->num_retrans++;	/* count successful retransmits only */
+	return err;
+}
+EXPORT_SYMBOL(inet_rtx_syn_ack);
+
+static struct request_sock *inet_reqsk_clone(struct request_sock *req,
+					     struct sock *sk)
+{
+	struct sock *req_sk, *nreq_sk;
+	struct request_sock *nreq;
+
+	nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+	if (!nreq) {
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
+
+		/* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
+		sock_put(sk);
+		return NULL;
+	}
+
+	req_sk = req_to_sk(req);
+	nreq_sk = req_to_sk(nreq);
+	/* Copy the object around the sk_dontcopy_begin..sk_dontcopy_end window. */
+	memcpy(nreq_sk, req_sk,
+	       offsetof(struct sock, sk_dontcopy_begin));
+	memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
+	       req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));
+	/* Set up by hand (presumably the fields the two copies skipped). */
+	sk_node_init(&nreq_sk->sk_node);
+	nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+	nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
+#endif
+	nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
+
+	nreq->rsk_listener = sk;	/* the clone belongs to the new listener */
+
+	/* We need not acquire fastopenq->lock
+	 * because the child socket is locked in inet_csk_listen_stop().
+	 */
+	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
+		rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
+
+	return nreq;
+}
+
+static void reqsk_queue_migrated(struct request_sock_queue *queue,
+				 const struct request_sock *req)
+{
+	if (req->num_timeout == 0)	/* never timed out yet: counts as young */
+		atomic_inc(&queue->young);
+	atomic_inc(&queue->qlen);	/* every migrated req adds to the length */
+}
+
+static void reqsk_migrate_reset(struct request_sock *req)
+{
+	req->saved_syn = NULL;	/* NOTE(review): presumably shared with the clone - confirm */
+#if IS_ENABLED(CONFIG_IPV6)
+	inet_rsk(req)->ipv6_opt = NULL;
+	inet_rsk(req)->pktopts = NULL;
+#else
+	inet_rsk(req)->ireq_opt = NULL;	/* only cleared when IPv6 is not built */
+#endif
+}
+
+/* Unhash req and cancel its timer; return true if req was found in the ehash table */
+static bool reqsk_queue_unlink(struct request_sock *req)
+{
+	struct sock *sk = req_to_sk(req);
+	bool found = false;
+
+	if (sk_hashed(sk)) {
+		struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+		spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
+
+		spin_lock(lock);
+		found = __sk_nulls_del_node_init_rcu(sk);
+		spin_unlock(lock);
+	}
+	if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
+		reqsk_put(req);	/* presumably the pending timer's reference */
+	return found;
+}
+
+/* Unlink @req; if it was still hashed, uncount it from @sk's queue and put it. */
+bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
+{
+	if (!reqsk_queue_unlink(req))
+		return false;
+
+	reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+	reqsk_put(req);
+	return true;
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
+
+void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
+{
+	inet_csk_reqsk_queue_drop(sk, req);	/* return value not needed here */
+	reqsk_put(req);				/* put regardless of the drop result */
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
+
+static void reqsk_timer_handler(struct timer_list *t)
+{
+	struct request_sock *req = from_timer(req, t, rsk_timer);
+	struct request_sock *nreq = NULL, *oreq = req;
+	struct sock *sk_listener = req->rsk_listener;
+	struct inet_connection_sock *icsk;
+	struct request_sock_queue *queue;
+	struct net *net;
+	int max_syn_ack_retries, qlen, expire = 0, resend = 0;
+
+	if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {
+		struct sock *nsk;
+
+		nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL);
+		if (!nsk)
+			goto drop;
+
+		nreq = inet_reqsk_clone(req, nsk);
+		if (!nreq)
+			goto drop;
+
+		/* The new timer for the cloned req can decrease the 2
+		 * by calling inet_csk_reqsk_queue_drop_and_put(), so
+		 * hold another count to prevent use-after-free and
+		 * call reqsk_put() just before return.
+		 */
+		refcount_set(&nreq->rsk_refcnt, 2 + 1);
+		timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
+		reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req);
+
+		req = nreq;
+		sk_listener = nsk;
+	}
+	/* After a migration, req and sk_listener name the clone and nsk. */
+	icsk = inet_csk(sk_listener);
+	net = sock_net(sk_listener);
+	max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
+		READ_ONCE(net->ipv4.sysctl_tcp_synack_retries);
+	/* Normally all the openreqs are young and become mature
+	 * (i.e. converted to established socket) for first timeout.
+	 * If synack was not acknowledged for 1 second, it means
+	 * one of the following things: synack was lost, ack was lost,
+	 * rtt is high or nobody planned to ack (i.e. synflood).
+	 * When server is a bit loaded, queue is populated with old
+	 * open requests, reducing effective size of queue.
+	 * When server is well loaded, queue size reduces to zero
+	 * after several minutes of work. It is not synflood,
+	 * it is normal operation. The solution is pruning
+	 * too old entries overriding normal timeout, when
+	 * situation becomes dangerous.
+	 *
+	 * Essentially, we reserve half of room for young
+	 * embrions; and abort old ones without pity, if old
+	 * ones are about to clog our table.
+	 */
+	queue = &icsk->icsk_accept_queue;
+	qlen = reqsk_queue_len(queue);
+	if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) {
+		int young = reqsk_queue_len_young(queue) << 1;
+
+		while (max_syn_ack_retries > 2) {
+			if (qlen < young)
+				break;
+			max_syn_ack_retries--;
+			young <<= 1;
+		}
+	}
+	syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept),
+		       &expire, &resend);
+	req->rsk_ops->syn_ack_timeout(req);
+	if (!expire &&
+	    (!resend ||
+	     !inet_rtx_syn_ack(sk_listener, req) ||
+	     inet_rsk(req)->acked)) {
+		if (req->num_timeout++ == 0)
+			atomic_dec(&queue->young);
+		mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX));
+
+		if (!nreq)
+			return;	/* not migrated: timer re-armed, nothing else to do */
+
+		if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) {
+			/* delete timer */
+			inet_csk_reqsk_queue_drop(sk_listener, nreq);
+			goto no_ownership;
+		}
+
+		__NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS);
+		reqsk_migrate_reset(oreq);
+		reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq);
+		reqsk_put(oreq);
+
+		reqsk_put(nreq);
+		return;
+	}
+
+	/* Even if we can clone the req, we may need not retransmit any more
+	 * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another
+	 * CPU may win the "own_req" race so that inet_ehash_insert() fails.
+	 */
+	if (nreq) {
+		__NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE);
+no_ownership:
+		reqsk_migrate_reset(nreq);
+		reqsk_queue_removed(queue, nreq);
+		__reqsk_free(nreq);
+	}
+
+drop:	/* the original request is dropped on every path that reaches here */
+	inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq);
+}
+
+/* Arm the request's timer, insert the request into the ehash table and
+ * only then publish its reference count.
+ *
+ * @req:     request socket being hashed
+ * @timeout: delay, in jiffies, added to the current jiffies value to
+ *           schedule the first reqsk_timer_handler() run
+ */
+static void reqsk_queue_hash_req(struct request_sock *req,
+				 unsigned long timeout)
+{
+	timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
+	mod_timer(&req->rsk_timer, jiffies + timeout);
+
+	inet_ehash_insert(req_to_sk(req), NULL, NULL);
+	/* before letting lookups find us, make sure all req fields
+	 * are committed to memory and refcnt initialized.
+	 */
+	smp_wmb();
+	/* NOTE(review): 2 + 1 presumably accounts for the hash table, the
+	 * timer and the caller - confirm against the reqsk_put() sites.
+	 */
+	refcount_set(&req->rsk_refcnt, 2 + 1);
+}
+
+/* Hash @req with its timer armed @timeout jiffies from now (see
+ * reqsk_queue_hash_req()), then account the new request on listener @sk
+ * through inet_csk_reqsk_queue_added().
+ */
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+				   unsigned long timeout)
+{
+	reqsk_queue_hash_req(req, timeout);
+	inet_csk_reqsk_queue_added(sk);
+}
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+
+/* Invoke the upper-layer-protocol clone hook for @newsk, if the socket
+ * has ULP ops attached; otherwise do nothing.
+ */
+static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk,
+			   const gfp_t priority)
+{
+	struct inet_connection_sock *newicsk = inet_csk(newsk);
+
+	if (newicsk->icsk_ulp_ops)
+		newicsk->icsk_ulp_ops->clone(req, newsk, priority);
+}
+
+/**
+ *	inet_csk_clone_lock - clone an inet socket, and lock its clone
+ *	@sk: the socket to clone
+ *	@req: request_sock supplying ports, mark and cookie for the clone
+ *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ *	Returns the clone in TCP_SYN_RECV state, or NULL when
+ *	sk_clone_lock() fails.
+ *
+ *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *inet_csk_clone_lock(const struct sock *sk,
+				 const struct request_sock *req,
+				 const gfp_t priority)
+{
+	struct sock *newsk = sk_clone_lock(sk, priority);
+	struct inet_connection_sock *newicsk;
+	struct inet_request_sock *ireq;
+	struct inet_sock *newinet;
+
+	if (!newsk)
+		return NULL;
+
+	newicsk = inet_csk(newsk);
+	newinet = inet_sk(newsk);
+	ireq = inet_rsk(req);
+
+	inet_sk_set_state(newsk, TCP_SYN_RECV);
+	newicsk->icsk_bind_hash = NULL;
+	newicsk->icsk_bind2_hash = NULL;
+
+	/* Addressing comes from the request, not from the listener. */
+	newinet->inet_dport = ireq->ir_rmt_port;
+	newinet->inet_num = ireq->ir_num;
+	newinet->inet_sport = htons(ireq->ir_num);
+
+	/* listeners have SOCK_RCU_FREE, not the children */
+	sock_reset_flag(newsk, SOCK_RCU_FREE);
+
+	newinet->mc_list = NULL;
+
+	newsk->sk_mark = ireq->ir_mark;
+	atomic64_set(&newsk->sk_cookie, atomic64_read(&ireq->ir_cookie));
+
+	newicsk->icsk_retransmits = 0;
+	newicsk->icsk_backoff = 0;
+	newicsk->icsk_probes_out = 0;
+	newicsk->icsk_probes_tstamp = 0;
+
+	/* Deinitialize accept_queue to trap illegal accesses. */
+	memset(&newicsk->icsk_accept_queue, 0,
+	       sizeof(newicsk->icsk_accept_queue));
+
+	inet_clone_ulp(req, newsk, priority);
+
+	security_inet_csk_clone(newsk, req);
+
+	return newsk;
+}
+EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
+
+/*
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all.  Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ *
+ * Warns (without aborting) if @sk is not closed, not dead, still hashed,
+ * or has a port number but no bind bucket.  Then runs the protocol
+ * ->destroy() hook, sk_stream_kill_queues() and xfrm_sk_free_policy(),
+ * decrements the protocol's per-cpu orphan counter and drops one
+ * reference on @sk.
+ */
+void inet_csk_destroy_sock(struct sock *sk)
+{
+	WARN_ON(sk->sk_state != TCP_CLOSE);
+	WARN_ON(!sock_flag(sk, SOCK_DEAD));
+
+	/* It cannot be in hash table! */
+	WARN_ON(!sk_unhashed(sk));
+
+	/* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
+	WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
+
+	sk->sk_prot->destroy(sk);
+
+	sk_stream_kill_queues(sk);
+
+	xfrm_sk_free_policy(sk);
+
+	this_cpu_dec(*sk->sk_prot->orphan_count);
+
+	sock_put(sk);
+}
+EXPORT_SYMBOL(inet_csk_destroy_sock);
+
+/* This function allows forcing the closure of a socket after the call to
+ * tcp/dccp_create_openreq_child().
+ *
+ * Releases the socket spinlock (see the __releases annotation), drops one
+ * reference, calls inet_csk_prepare_for_destroy_sock() and clears
+ * inet_num.
+ */
+void inet_csk_prepare_forced_close(struct sock *sk)
+	__releases(&sk->sk_lock.slock)
+{
+	/* sk_clone_lock locked the socket and set refcnt to 2 */
+	bh_unlock_sock(sk);
+	sock_put(sk);
+	inet_csk_prepare_for_destroy_sock(sk);
+	inet_sk(sk)->inet_num = 0;
+}
+EXPORT_SYMBOL(inet_csk_prepare_forced_close);
+
+/* A socket with ULP ops attached may only listen if those ops provide a
+ * ->clone() hook.  Returns 0 when listening is allowed, -EINVAL otherwise.
+ */
+static int inet_ulp_can_listen(const struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	bool clone_missing;
+
+	clone_missing = icsk->icsk_ulp_ops && !icsk->icsk_ulp_ops->clone;
+
+	return clone_missing ? -EINVAL : 0;
+}
+
+/* Move @sk into the listening state.
+ *
+ * Allocates the accept queue, resets the backlog and delayed-ack state,
+ * then validates the port via ->get_port() and hashes the socket.
+ * Returns 0 on success; on any failure after the state was switched to
+ * TCP_LISTEN the socket is put back to TCP_CLOSE and the error returned.
+ */
+int inet_csk_listen_start(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	int err = inet_ulp_can_listen(sk);
+
+	if (unlikely(err))
+		return err;
+
+	reqsk_queue_alloc(&icsk->icsk_accept_queue);
+
+	sk->sk_ack_backlog = 0;
+	inet_csk_delack_init(sk);
+
+	/* There is race window here: we announce ourselves listening,
+	 * but this transition is still not validated by get_port().
+	 * It is OK, because this socket enters to hash table only
+	 * after validation is complete.
+	 */
+	inet_sk_state_store(sk, TCP_LISTEN);
+
+	err = sk->sk_prot->get_port(sk, inet->inet_num);
+	if (err)
+		goto close;
+
+	inet->inet_sport = htons(inet->inet_num);
+
+	sk_dst_reset(sk);
+	err = sk->sk_prot->hash(sk);
+	if (unlikely(err))
+		goto close;
+
+	return 0;
+
+close:
+	inet_sk_set_state(sk, TCP_CLOSE);
+	return err;
+}
+EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+
+/* Dispose of @child, a socket created for @req on listener @sk:
+ * disconnect it, orphan it, count it as an orphan and destroy it.
+ * For a TCP Fast Open request, the child's fastopen_rsk back-pointer is
+ * cleared before destruction.
+ */
+static void inet_child_forget(struct sock *sk, struct request_sock *req,
+			      struct sock *child)
+{
+	sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+	sock_orphan(child);
+
+	/* Balanced by the this_cpu_dec() in inet_csk_destroy_sock(). */
+	this_cpu_inc(*sk->sk_prot->orphan_count);
+
+	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
+		BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
+		BUG_ON(sk != req->rsk_listener);
+
+		/* Paranoid, to prevent race condition if
+		 * an inbound pkt destined for child is
+		 * blocked by sock lock in tcp_v4_rcv().
+		 * Also to satisfy an assertion in
+		 * tcp_v4_destroy_sock().
+		 */
+		RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL);
+	}
+	inet_csk_destroy_sock(child);
+}
+
+/* Append @req (carrying @child) to the tail of @sk's accept queue.
+ *
+ * Done under the queue's rskq_lock.  If @sk is no longer in TCP_LISTEN
+ * the child is disposed of via inet_child_forget() instead.
+ * Returns @child when it was queued, NULL when it was forgotten.
+ */
+struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
+				      struct request_sock *req,
+				      struct sock *child)
+{
+	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+	struct sock *queued = child;
+
+	spin_lock(&queue->rskq_lock);
+	if (likely(sk->sk_state == TCP_LISTEN)) {
+		req->sk = child;
+		req->dl_next = NULL;
+		if (!queue->rskq_accept_head)
+			WRITE_ONCE(queue->rskq_accept_head, req);
+		else
+			queue->rskq_accept_tail->dl_next = req;
+		queue->rskq_accept_tail = req;
+		sk_acceptq_added(sk);
+	} else {
+		inet_child_forget(sk, req, child);
+		queued = NULL;
+	}
+	spin_unlock(&queue->rskq_lock);
+
+	return queued;
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
+
+/* Finish handing @child over to listener @sk for request @req.
+ *
+ * When @own_req is true, @req is dropped from its original listener and
+ * then either queued on @sk directly (same listener) or migrated to @sk
+ * through a cloned request (different listener).
+ *
+ * Returns @child when it was placed on an accept queue.  On every other
+ * path @child is unlocked, one reference on it is dropped, and NULL is
+ * returned.
+ */
+struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
+					 struct request_sock *req, bool own_req)
+{
+	if (own_req) {
+		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+
+		if (sk != req->rsk_listener) {
+			/* another listening sk has been selected,
+			 * migrate the req to it.
+			 */
+			struct request_sock *nreq;
+
+			/* hold a refcnt for the nreq->rsk_listener
+			 * which is assigned in inet_reqsk_clone()
+			 */
+			sock_hold(sk);
+			nreq = inet_reqsk_clone(req, sk);
+			if (!nreq) {
+				inet_child_forget(sk, req, child);
+				goto child_put;
+			}
+
+			refcount_set(&nreq->rsk_refcnt, 1);
+			if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
+				__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS);
+				reqsk_migrate_reset(req);
+				reqsk_put(req);
+				return child;
+			}
+
+			/* Queueing failed: the clone is reset and freed. */
+			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
+			reqsk_migrate_reset(nreq);
+			__reqsk_free(nreq);
+		} else if (inet_csk_reqsk_queue_add(sk, req, child)) {
+			return child;
+		}
+	}
+	/* Too bad, another child took ownership of the request, undo. */
+child_put:
+	bh_unlock_sock(child);
+	sock_put(child);
+	return NULL;
+}
+EXPORT_SYMBOL(inet_csk_complete_hashdance);
+
+/*
+ *	This routine closes sockets which have been at least partially
+ *	opened, but not yet accepted.
+ *
+ *	Each request is pulled off @sk's accept queue.  Its child is either
+ *	migrated to another listener picked by reuseport_migrate_sock() or
+ *	disposed of with inet_child_forget().  Requests parked on the
+ *	fastopen rskq_rst_head list are released afterwards.
+ */
+void inet_csk_listen_stop(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+	struct request_sock *next, *req;
+
+	/* Following specs, it would be better either to send FIN
+	 * (and enter FIN-WAIT-1, it is normal close)
+	 * or to send active reset (abort).
+	 * Certainly, it is pretty dangerous while synflood, but it is
+	 * bad justification for our negligence 8)
+	 * To be honest, we are not able to make either
+	 * of the variants now.			--ANK
+	 */
+	while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
+		struct sock *child = req->sk, *nsk;
+		struct request_sock *nreq;
+
+		local_bh_disable();
+		bh_lock_sock(child);
+		WARN_ON(sock_owned_by_user(child));
+		/* Paired with the sock_put(child) at the end of this iteration. */
+		sock_hold(child);
+
+		nsk = reuseport_migrate_sock(sk, child, NULL);
+		if (nsk) {
+			nreq = inet_reqsk_clone(req, nsk);
+			if (nreq) {
+				refcount_set(&nreq->rsk_refcnt, 1);
+
+				if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
+					__NET_INC_STATS(sock_net(nsk),
+							LINUX_MIB_TCPMIGRATEREQSUCCESS);
+					reqsk_migrate_reset(req);
+				} else {
+					__NET_INC_STATS(sock_net(nsk),
+							LINUX_MIB_TCPMIGRATEREQFAILURE);
+					reqsk_migrate_reset(nreq);
+					__reqsk_free(nreq);
+				}
+
+				/* inet_csk_reqsk_queue_add() has already
+				 * called inet_child_forget() on failure case.
+				 */
+				goto skip_child_forget;
+			}
+		}
+
+		/* No migration target, or the clone failed: drop the child. */
+		inet_child_forget(sk, req, child);
+skip_child_forget:
+		reqsk_put(req);
+		bh_unlock_sock(child);
+		local_bh_enable();
+		sock_put(child);
+
+		cond_resched();
+	}
+	if (queue->fastopenq.rskq_rst_head) {
+		/* Free all the reqs queued in rskq_rst_head. */
+		/* The list is detached under the lock and walked outside it. */
+		spin_lock_bh(&queue->fastopenq.lock);
+		req = queue->fastopenq.rskq_rst_head;
+		queue->fastopenq.rskq_rst_head = NULL;
+		spin_unlock_bh(&queue->fastopenq.lock);
+		while (req != NULL) {
+			next = req->dl_next;
+			reqsk_put(req);
+			req = next;
+		}
+	}
+	WARN_ON_ONCE(sk->sk_ack_backlog);
+}
+EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
+
+/* Fill @uaddr, viewed as a struct sockaddr_in, with the destination
+ * address and destination port stored in @sk.
+ */
+void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	struct sockaddr_in *out = (struct sockaddr_in *)uaddr;
+
+	out->sin_family = AF_INET;
+	out->sin_addr.s_addr = inet->inet_daddr;
+	out->sin_port = inet->inet_dport;
+}
+EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
+
+/* Look up a fresh output route for @sk and install its capabilities.
+ *
+ * @sk: socket whose addresses, ports, protocol and bound device key the
+ *      lookup
+ * @fl: flow storage; the IPv4 member is filled by the lookup
+ *
+ * When the socket carries a source-route option, the option's first-hop
+ * address is used as the lookup destination instead of inet_daddr.
+ *
+ * Returns the route's dst_entry, or NULL when the lookup fails.
+ */
+static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ip_options_rcu *inet_opt;
+	__be32 daddr = inet->inet_daddr;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+
+	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+	fl4 = &fl->u.ip4;
+	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
+				   inet->inet_saddr, inet->inet_dport,
+				   inet->inet_sport, sk->sk_protocol,
+				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+	if (IS_ERR(rt))
+		rt = NULL;
+	if (rt)
+		sk_setup_caps(sk, &rt->dst);
+	rcu_read_unlock();
+
+	/* Return NULL explicitly instead of forming &rt->dst from a NULL
+	 * rt, which only worked while dst stayed the first member.
+	 */
+	return rt ? &rt->dst : NULL;
+}
+
+/* Apply a new path MTU to @sk's route.
+ *
+ * Uses the cached route when it is still valid, otherwise rebuilds one.
+ * After ->update_pmtu() the cached route is checked again and rebuilt if
+ * it is no longer valid.  Returns the resulting dst_entry, or NULL when
+ * no route could be obtained.
+ */
+struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct dst_entry *dst;
+
+	dst = __sk_dst_check(sk, 0);
+	if (!dst)
+		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+	if (!dst)
+		return NULL;
+
+	dst->ops->update_pmtu(dst, sk, NULL, mtu, true);
+
+	dst = __sk_dst_check(sk, 0);
+	if (dst)
+		return dst;
+
+	return inet_csk_rebuild_route(sk, &inet->cork.fl);
+}
+EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
new file mode 100644
index 0000000000..e13a844334
--- /dev/null
+++ b/net/ipv4/inet_diag.c
@@ -0,0 +1,1485 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * inet_diag.c	Module for monitoring INET transport protocols sockets.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/time.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/bpf_sk_storage.h>
+#include <net/netlink.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+/* Table of diag handlers, indexed by protocol number. */
+static const struct inet_diag_handler **inet_diag_table;
+
+/* Snapshot of the socket attributes that the filter bytecode in
+ * inet_diag_bc_run() is matched against.
+ */
+struct inet_diag_entry {
+	const __be32 *saddr;	/* source address words */
+	const __be32 *daddr;	/* destination address words */
+	u16 sport;		/* source port, host byte order */
+	u16 dport;		/* destination port, host byte order */
+	u16 family;		/* address family of the socket */
+	u16 userlocks;		/* tested against SOCK_BINDPORT_LOCK */
+	u32 ifindex;		/* bound device index */
+	u32 mark;		/* socket mark */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+	u64 cgroup_id;		/* cgroup id, 0 when not available */
+#endif
+};
+
+/* Taken by inet_diag_lock_handler(), released by inet_diag_unlock_handler(). */
+static DEFINE_MUTEX(inet_diag_table_mutex);
+
+/* Look up the diag handler for @proto and return with
+ * inet_diag_table_mutex held.
+ *
+ * The mutex is taken on EVERY return path, including the error ones, so
+ * the caller must always pair this with inet_diag_unlock_handler().
+ *
+ * Returns the handler, or ERR_PTR(-ENOENT) when @proto is out of range
+ * or no handler is registered for it.
+ */
+static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
+{
+	if (proto < 0 || proto >= IPPROTO_MAX) {
+		/* Locked even on failure: the caller unlocks unconditionally. */
+		mutex_lock(&inet_diag_table_mutex);
+		return ERR_PTR(-ENOENT);
+	}
+
+	/* Unlocked peek; the slot is re-checked below under the mutex. */
+	if (!inet_diag_table[proto])
+		sock_load_diag_module(AF_INET, proto);
+
+	mutex_lock(&inet_diag_table_mutex);
+	if (!inet_diag_table[proto])
+		return ERR_PTR(-ENOENT);
+
+	return inet_diag_table[proto];
+}
+
+/* Release inet_diag_table_mutex taken by inet_diag_lock_handler().
+ * @handler is not used; it may be an ERR_PTR value.
+ */
+static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
+{
+	mutex_unlock(&inet_diag_table_mutex);
+}
+
+/* Fill the family and socket-id part of @r (ports, bound interface,
+ * cookie, source and destination addresses) from @sk.
+ *
+ * For non-IPv6 sockets the address arrays are zeroed first and only
+ * their first word is set.
+ */
+void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
+{
+	struct inet_diag_sockid *id = &r->id;
+
+	r->idiag_family = sk->sk_family;
+
+	id->idiag_sport = htons(sk->sk_num);
+	id->idiag_dport = sk->sk_dport;
+	id->idiag_if = sk->sk_bound_dev_if;
+	sock_diag_save_cookie(sk, id->idiag_cookie);
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6) {
+		*(struct in6_addr *)id->idiag_src = sk->sk_v6_rcv_saddr;
+		*(struct in6_addr *)id->idiag_dst = sk->sk_v6_daddr;
+		return;
+	}
+#endif
+
+	memset(&id->idiag_src, 0, sizeof(id->idiag_src));
+	memset(&id->idiag_dst, 0, sizeof(id->idiag_dst));
+
+	id->idiag_src[0] = sk->sk_rcv_saddr;
+	id->idiag_dst[0] = sk->sk_daddr;
+}
+EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
+
+/* Size to allocate for a single-socket reply: the fixed attributes, the
+ * handler's auxiliary size (if it reports one), and 64 spare bytes.
+ */
+static size_t inet_sk_attr_size(struct sock *sk,
+				const struct inet_diag_req_v2 *req,
+				bool net_admin)
+{
+	const struct inet_diag_handler *handler;
+	size_t size;
+
+	size = nla_total_size(sizeof(struct tcp_info));
+	size += nla_total_size(sizeof(struct inet_diag_msg));
+	size += inet_diag_msg_attrs_size();
+	size += nla_total_size(sizeof(struct inet_diag_meminfo));
+	size += nla_total_size(SK_MEMINFO_VARS * sizeof(u32));
+	size += nla_total_size(TCP_CA_NAME_MAX);
+	size += nla_total_size(sizeof(struct tcpvegas_info));
+	size += 64;
+
+	handler = inet_diag_table[req->sdiag_protocol];
+	if (handler && handler->idiag_get_aux_size)
+		size += handler->idiag_get_aux_size(sk, net_admin);
+
+	return size;
+}
+
+/* Append the per-socket netlink attributes to @skb and fill the uid and
+ * inode fields of @r.
+ *
+ * @sk:        socket being reported
+ * @skb:       reply buffer the attributes are appended to
+ * @r:         message header; idiag_family is read, idiag_uid and
+ *             idiag_inode are written
+ * @ext:       bitmask of requested extensions, bit (INET_DIAG_x - 1)
+ * @user_ns:   namespace used to translate the socket owner's uid
+ * @net_admin: when true, the socket mark is reported as well
+ *
+ * Returns 0 on success and 1 (not a negative errno) when an attribute
+ * could not be added.
+ */
+int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
+			     struct inet_diag_msg *r, int ext,
+			     struct user_namespace *user_ns,
+			     bool net_admin)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	struct inet_diag_sockopt inet_sockopt;
+
+	if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+		goto errout;
+
+	/* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
+	 * hence this needs to be included regardless of socket family.
+	 */
+	if (ext & (1 << (INET_DIAG_TOS - 1)))
+		if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+			goto errout;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (r->idiag_family == AF_INET6) {
+		if (ext & (1 << (INET_DIAG_TCLASS - 1)))
+			if (nla_put_u8(skb, INET_DIAG_TCLASS,
+				       inet6_sk(sk)->tclass) < 0)
+				goto errout;
+
+		/* v6only is only reported for listening or closed sockets. */
+		if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
+		    nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
+			goto errout;
+	}
+#endif
+
+	if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, READ_ONCE(sk->sk_mark)))
+		goto errout;
+
+	/* The class id is emitted when either CLASS_ID or TCLASS is requested. */
+	if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) ||
+	    ext & (1 << (INET_DIAG_TCLASS - 1))) {
+		u32 classid = 0;
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+		classid = sock_cgroup_classid(&sk->sk_cgrp_data);
+#endif
+		/* Fallback to socket priority if class id isn't set.
+		 * Classful qdiscs use it as direct reference to class.
+		 * For cgroup2 classid is always zero.
+		 */
+		if (!classid)
+			classid = sk->sk_priority;
+
+		if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
+			goto errout;
+	}
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+	if (nla_put_u64_64bit(skb, INET_DIAG_CGROUP_ID,
+			      cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)),
+			      INET_DIAG_PAD))
+		goto errout;
+#endif
+
+	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+	r->idiag_inode = sock_i_ino(sk);
+
+	/* Zero the whole struct first so no uninitialised bits are copied out. */
+	memset(&inet_sockopt, 0, sizeof(inet_sockopt));
+	inet_sockopt.recverr	= inet_test_bit(RECVERR, sk);
+	inet_sockopt.is_icsk	= inet_test_bit(IS_ICSK, sk);
+	inet_sockopt.freebind	= inet_test_bit(FREEBIND, sk);
+	inet_sockopt.hdrincl	= inet_test_bit(HDRINCL, sk);
+	inet_sockopt.mc_loop	= inet_test_bit(MC_LOOP, sk);
+	inet_sockopt.transparent = inet_test_bit(TRANSPARENT, sk);
+	inet_sockopt.mc_all	= inet_test_bit(MC_ALL, sk);
+	inet_sockopt.nodefrag	= inet_test_bit(NODEFRAG, sk);
+	inet_sockopt.bind_address_no_port = inet_test_bit(BIND_ADDRESS_NO_PORT, sk);
+	inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk);
+	inet_sockopt.defer_connect = inet_test_bit(DEFER_CONNECT, sk);
+	if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt),
+		    &inet_sockopt))
+		goto errout;
+
+	return 0;
+errout:
+	return 1;
+}
+EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill);
+
+/* Index the request's netlink attributes by type into @req_nlas.
+ *
+ * Attribute types at or above __INET_DIAG_REQ_MAX are ignored.  Returns
+ * -EINVAL when an INET_DIAG_REQ_PROTOCOL attribute does not carry exactly
+ * a u32 payload, 0 otherwise.
+ */
+static int inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen,
+				 struct nlattr **req_nlas)
+{
+	struct nlattr *attr;
+	int rem;
+
+	nlmsg_for_each_attr(attr, nlh, hdrlen, rem) {
+		const int type = nla_type(attr);
+
+		if (type == INET_DIAG_REQ_PROTOCOL &&
+		    nla_len(attr) != sizeof(u32))
+			return -EINVAL;
+
+		if (type >= __INET_DIAG_REQ_MAX)
+			continue;
+
+		req_nlas[type] = attr;
+	}
+
+	return 0;
+}
+
+/* Protocol selected by the request: the INET_DIAG_REQ_PROTOCOL attribute
+ * when one was supplied, else the sdiag_protocol field of the header.
+ */
+static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req,
+				  const struct inet_diag_dump_data *data)
+{
+	const struct nlattr *proto_attr = data->req_nlas[INET_DIAG_REQ_PROTOCOL];
+
+	return proto_attr ? nla_get_u32(proto_attr) : req->sdiag_protocol;
+}
+
+/* Cap applied to cb->min_dump_alloc when inet_sk_diag_fill() asks for a larger dump skb. */
+#define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
+/* Build one diag netlink message describing full socket @sk in @skb.
+ *
+ * @sk:          socket to report; must be a full socket (BUG otherwise)
+ * @icsk:        connection-sock view of @sk, or NULL; when NULL the timer,
+ *               congestion-control and BPF-storage parts are skipped
+ * @skb:         buffer the message is appended to
+ * @cb:          dump callback; supplies portid, sequence number, message
+ *               type and the inet_diag_dump_data in cb->data
+ * @req:         request header; idiag_ext selects optional attributes
+ * @nlmsg_flags: flags stored in the netlink header
+ * @net_admin:   passed through to the attribute and aux fillers
+ *
+ * Returns 0 on success.  Returns -EMSGSIZE when the message does not fit;
+ * in that case the partially built message is cancelled.
+ */
+int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
+		      struct sk_buff *skb, struct netlink_callback *cb,
+		      const struct inet_diag_req_v2 *req,
+		      u16 nlmsg_flags, bool net_admin)
+{
+	const struct tcp_congestion_ops *ca_ops;
+	const struct inet_diag_handler *handler;
+	struct inet_diag_dump_data *cb_data;
+	int ext = req->idiag_ext;
+	struct inet_diag_msg *r;
+	struct nlmsghdr  *nlh;
+	struct nlattr *attr;
+	void *info = NULL;
+
+	cb_data = cb->data;
+	handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)];
+	BUG_ON(!handler);
+
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	r = nlmsg_data(nlh);
+	BUG_ON(!sk_fullsock(sk));
+
+	inet_diag_msg_common_fill(r, sk);
+	r->idiag_state = sk->sk_state;
+	/* Timer fields default to "no timer"; overwritten below if one is pending. */
+	r->idiag_timer = 0;
+	r->idiag_retrans = 0;
+	r->idiag_expires = 0;
+
+	if (inet_diag_msg_attrs_fill(sk, skb, r, ext,
+				     sk_user_ns(NETLINK_CB(cb->skb).sk),
+				     net_admin))
+		goto errout;
+
+	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
+		struct inet_diag_meminfo minfo = {
+			.idiag_rmem = sk_rmem_alloc_get(sk),
+			.idiag_wmem = READ_ONCE(sk->sk_wmem_queued),
+			.idiag_fmem = sk_forward_alloc_get(sk),
+			.idiag_tmem = sk_wmem_alloc_get(sk),
+		};
+
+		if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
+			goto errout;
+	}
+
+	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
+		if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
+			goto errout;
+
+	/*
+	 * RAW sockets might have user-defined protocols assigned,
+	 * so report the one supplied on socket creation.
+	 */
+	if (sk->sk_type == SOCK_RAW) {
+		if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))
+			goto errout;
+	}
+
+	/* Without a connection sock only the handler's basic info is filled. */
+	if (!icsk) {
+		handler->idiag_get_info(sk, r, NULL);
+		goto out;
+	}
+
+	/* idiag_timer: 1 for retransmit/REO/loss-probe, 4 for PROBE0,
+	 * 2 when sk_timer is pending.
+	 */
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+		r->idiag_timer = 1;
+		r->idiag_retrans = icsk->icsk_retransmits;
+		r->idiag_expires =
+			jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies);
+	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+		r->idiag_timer = 4;
+		r->idiag_retrans = icsk->icsk_probes_out;
+		r->idiag_expires =
+			jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies);
+	} else if (timer_pending(&sk->sk_timer)) {
+		r->idiag_timer = 2;
+		r->idiag_retrans = icsk->icsk_probes_out;
+		r->idiag_expires =
+			jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies);
+	}
+
+	/* Reserve room for the handler's info blob; idiag_get_info() below
+	 * writes into it through @info.
+	 */
+	if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
+		attr = nla_reserve_64bit(skb, INET_DIAG_INFO,
+					 handler->idiag_info_size,
+					 INET_DIAG_PAD);
+		if (!attr)
+			goto errout;
+
+		info = nla_data(attr);
+	}
+
+	if (ext & (1 << (INET_DIAG_CONG - 1))) {
+		int err = 0;
+
+		rcu_read_lock();
+		ca_ops = READ_ONCE(icsk->icsk_ca_ops);
+		if (ca_ops)
+			err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name);
+		rcu_read_unlock();
+		if (err < 0)
+			goto errout;
+	}
+
+	handler->idiag_get_info(sk, r, info);
+
+	if (ext & (1 << (INET_DIAG_INFO - 1)) && handler->idiag_get_aux)
+		if (handler->idiag_get_aux(sk, net_admin, skb) < 0)
+			goto errout;
+
+	if (sk->sk_state < TCP_TIME_WAIT) {
+		/* Note: these locals shadow the function-scope info and attr. */
+		union tcp_cc_info info;
+		size_t sz = 0;
+		int attr;
+
+		rcu_read_lock();
+		ca_ops = READ_ONCE(icsk->icsk_ca_ops);
+		if (ca_ops && ca_ops->get_info)
+			sz = ca_ops->get_info(sk, ext, &attr, &info);
+		rcu_read_unlock();
+		if (sz && nla_put(skb, attr, sz, &info) < 0)
+			goto errout;
+	}
+
+	/* Keep it at the end for potential retry with a larger skb,
+	 * or else do best-effort fitting, which is only done for the
+	 * first_nlmsg.
+	 */
+	if (cb_data->bpf_stg_diag) {
+		bool first_nlmsg = ((unsigned char *)nlh == skb->data);
+		unsigned int prev_min_dump_alloc;
+		unsigned int total_nla_size = 0;
+		unsigned int msg_len;
+		int err;
+
+		msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh;
+		err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb,
+					      INET_DIAG_SK_BPF_STORAGES,
+					      &total_nla_size);
+
+		if (!err)
+			goto out;
+
+		/* Record the size this message would need, capped at
+		 * MAX_DUMP_ALLOC_SIZE, for a later attempt.
+		 */
+		total_nla_size += msg_len;
+		prev_min_dump_alloc = cb->min_dump_alloc;
+		if (total_nla_size > prev_min_dump_alloc)
+			cb->min_dump_alloc = min_t(u32, total_nla_size,
+						   MAX_DUMP_ALLOC_SIZE);
+
+		if (!first_nlmsg)
+			goto errout;
+
+		if (cb->min_dump_alloc > prev_min_dump_alloc)
+			/* Retry with pskb_expand_head() with
+			 * __GFP_DIRECT_RECLAIM
+			 */
+			goto errout;
+
+		WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc);
+
+		/* Send what we have for this sk
+		 * and move on to the next sk in the following
+		 * dump()
+		 */
+	}
+
+out:
+	nlmsg_end(skb, nlh);
+	return 0;
+
+errout:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
+
+/* Build a diag message for a timewait socket.
+ *
+ * The reported state is tw_substate, the timer type is 3 and the expiry
+ * is derived from tw_timer.  Queue sizes, uid and inode are reported as
+ * 0.  The mark attribute is only added when @net_admin is true.
+ *
+ * Returns 0 on success or -EMSGSIZE when @skb has no room.
+ */
+static int inet_twsk_diag_fill(struct sock *sk,
+			       struct sk_buff *skb,
+			       struct netlink_callback *cb,
+			       u16 nlmsg_flags, bool net_admin)
+{
+	struct inet_timewait_sock *tw = inet_twsk(sk);
+	struct inet_diag_msg *r;
+	struct nlmsghdr *nlh;
+	long tmo;
+
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type,
+			sizeof(*r), nlmsg_flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	r = nlmsg_data(nlh);
+	BUG_ON(tw->tw_state != TCP_TIME_WAIT);
+
+	inet_diag_msg_common_fill(r, sk);
+	r->idiag_retrans      = 0;
+
+	r->idiag_state	      = tw->tw_substate;
+	r->idiag_timer	      = 3;
+	tmo = tw->tw_timer.expires - jiffies;
+	r->idiag_expires      = jiffies_delta_to_msecs(tmo);
+	r->idiag_rqueue	      = 0;
+	r->idiag_wqueue	      = 0;
+	r->idiag_uid	      = 0;
+	r->idiag_inode	      = 0;
+
+	if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+				     tw->tw_mark)) {
+		nlmsg_cancel(skb, nlh);
+		return -EMSGSIZE;
+	}
+
+	nlmsg_end(skb, nlh);
+	return 0;
+}
+
+/* Build a diag message for a request socket.
+ *
+ * The state is reported as TCP_SYN_RECV with timer type 1; the retransmit
+ * count and expiry come from the request and its rsk_timer.  Queue sizes,
+ * uid and inode are reported as 0.  The mark attribute is only added when
+ * @net_admin is true.
+ *
+ * Returns 0 on success or -EMSGSIZE when @skb has no room.
+ */
+static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
+			      struct netlink_callback *cb,
+			      u16 nlmsg_flags, bool net_admin)
+{
+	struct request_sock *reqsk = inet_reqsk(sk);
+	struct inet_diag_msg *r;
+	struct nlmsghdr *nlh;
+	long tmo;
+
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	r = nlmsg_data(nlh);
+	inet_diag_msg_common_fill(r, sk);
+	r->idiag_state = TCP_SYN_RECV;
+	r->idiag_timer = 1;
+	r->idiag_retrans = reqsk->num_retrans;
+
+	/* NOTE(review): presumably guarantees that the cookie saved through
+	 * the struct sock view in inet_diag_msg_common_fill() reads
+	 * ir_cookie - confirm.
+	 */
+	BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+		     offsetof(struct sock, sk_cookie));
+
+	tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
+	r->idiag_expires = jiffies_delta_to_msecs(tmo);
+	r->idiag_rqueue	= 0;
+	r->idiag_wqueue	= 0;
+	r->idiag_uid	= 0;
+	r->idiag_inode	= 0;
+
+	if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+				     inet_rsk(reqsk)->ir_mark)) {
+		nlmsg_cancel(skb, nlh);
+		return -EMSGSIZE;
+	}
+
+	nlmsg_end(skb, nlh);
+	return 0;
+}
+
+/* Pick the fill routine matching the kind of socket @sk is: timewait,
+ * request, or full socket.
+ */
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+			struct netlink_callback *cb,
+			const struct inet_diag_req_v2 *r,
+			u16 nlmsg_flags, bool net_admin)
+{
+	switch (sk->sk_state) {
+	case TCP_TIME_WAIT:
+		return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags,
+					   net_admin);
+	case TCP_NEW_SYN_RECV:
+		return inet_req_diag_fill(sk, skb, cb, nlmsg_flags,
+					  net_admin);
+	default:
+		return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r,
+					 nlmsg_flags, net_admin);
+	}
+}
+
+/* Find the socket identified by the address/port tuple in @req.
+ *
+ * For AF_INET6 requests whose source and destination are both v4-mapped,
+ * the lookup is done in the IPv4 tables using the last address word.
+ *
+ * Returns the socket on success.  Returns ERR_PTR(-EINVAL) for an
+ * unsupported family, and ERR_PTR(-ENOENT) when nothing matches or the
+ * cookie in @req does not match the socket's (the socket is then
+ * released with sock_gen_put() before returning).
+ */
+struct sock *inet_diag_find_one_icsk(struct net *net,
+				     struct inet_hashinfo *hashinfo,
+				     const struct inet_diag_req_v2 *req)
+{
+	struct sock *sk;
+
+	rcu_read_lock();
+	/* The request's dst/src are passed as the lookup's first/second
+	 * address-port pair.
+	 */
+	if (req->sdiag_family == AF_INET)
+		sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],
+				 req->id.idiag_dport, req->id.idiag_src[0],
+				 req->id.idiag_sport, req->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (req->sdiag_family == AF_INET6) {
+		if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+		    ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+			sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3],
+					 req->id.idiag_dport, req->id.idiag_src[3],
+					 req->id.idiag_sport, req->id.idiag_if);
+		else
+			sk = inet6_lookup(net, hashinfo, NULL, 0,
+					  (struct in6_addr *)req->id.idiag_dst,
+					  req->id.idiag_dport,
+					  (struct in6_addr *)req->id.idiag_src,
+					  req->id.idiag_sport,
+					  req->id.idiag_if);
+	}
+#endif
+	else {
+		rcu_read_unlock();
+		return ERR_PTR(-EINVAL);
+	}
+	rcu_read_unlock();
+	if (!sk)
+		return ERR_PTR(-ENOENT);
+
+	if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
+		sock_gen_put(sk);
+		return ERR_PTR(-ENOENT);
+	}
+
+	return sk;
+}
+EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk);
+
+/* Answer a single-socket query: look the socket up, build one reply
+ * message and unicast it back to the requester.
+ *
+ * Returns the lookup error if the socket is not found, -ENOMEM when the
+ * reply cannot be allocated, the fill error, or the result of
+ * nlmsg_unicast().  The socket found by the lookup is always released
+ * with sock_gen_put() before returning.
+ */
+int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
+			    struct netlink_callback *cb,
+			    const struct inet_diag_req_v2 *req)
+{
+	struct sk_buff *in_skb = cb->skb;
+	bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
+	struct net *net = sock_net(in_skb->sk);
+	struct sk_buff *rep;
+	struct sock *sk;
+	int err;
+
+	sk = inet_diag_find_one_icsk(net, hashinfo, req);
+	if (IS_ERR(sk))
+		return PTR_ERR(sk);
+
+	rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL);
+	if (!rep) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = sk_diag_fill(sk, rep, cb, req, 0, net_admin);
+	if (err < 0) {
+		/* The reply was sized by inet_sk_attr_size(), so running out
+		 * of room here is unexpected.
+		 */
+		WARN_ON(err == -EMSGSIZE);
+		nlmsg_free(rep);
+		goto out;
+	}
+	err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
+out:
+	if (sk)
+		sock_gen_put(sk);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
+
+/* Execute a command that targets exactly one socket.
+ *
+ * @cmd selects the operation: SOCK_DIAG_BY_FAMILY calls the handler's
+ * ->dump_one(); SOCK_DESTROY calls ->destroy() when the handler provides
+ * it.  Anything else yields -EOPNOTSUPP.
+ *
+ * Returns the attribute parsing error, the handler lookup error, or the
+ * result of the selected handler operation.
+ */
+static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
+			       const struct nlmsghdr *nlh,
+			       int hdrlen,
+			       const struct inet_diag_req_v2 *req)
+{
+	const struct inet_diag_handler *handler;
+	struct inet_diag_dump_data dump_data;
+	int err, protocol;
+
+	memset(&dump_data, 0, sizeof(dump_data));
+	err = inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas);
+	if (err)
+		return err;
+
+	protocol = inet_diag_get_protocol(req, &dump_data);
+
+	handler = inet_diag_lock_handler(protocol);
+	if (IS_ERR(handler)) {
+		err = PTR_ERR(handler);
+	} else if (cmd == SOCK_DIAG_BY_FAMILY) {
+		/* On-stack callback standing in for a real dump context. */
+		struct netlink_callback cb = {
+			.nlh = nlh,
+			.skb = in_skb,
+			.data = &dump_data,
+		};
+		err = handler->dump_one(&cb, req);
+	} else if (cmd == SOCK_DESTROY && handler->destroy) {
+		err = handler->destroy(in_skb, req);
+	} else {
+		err = -EOPNOTSUPP;
+	}
+	/* Unconditional: inet_diag_lock_handler() locks even when it fails. */
+	inet_diag_unlock_handler(handler);
+
+	return err;
+}
+
+/* Compare the leading @bits bits of the big-endian word arrays @a1 and
+ * @a2.  Whole 32-bit words are compared with memcmp(); a trailing partial
+ * word is compared under a mask of its high-order bits.
+ * Returns 1 when the prefixes are equal, 0 otherwise.
+ */
+static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
+{
+	int words = bits >> 5;
+	int tail_bits = bits & 0x1f;
+	__be32 mask;
+
+	if (words && memcmp(a1, a2, words << 2))
+		return 0;
+
+	if (!tail_bits)
+		return 1;
+
+	mask = htonl((0xffffffff) << (32 - tail_bits));
+
+	return !((a1[words] ^ a2[words]) & mask);
+}
+
+/* Run the filter bytecode carried in attribute @_bc against @entry.
+ *
+ * Each op is evaluated to a yes/no outcome.  The cursor then advances by
+ * op->yes bytes on "yes" and by op->no bytes on "no".  An op code that is
+ * not handled by the switch counts as "yes".
+ *
+ * Returns 1 when the run consumes exactly the whole program (remaining
+ * length reaches 0), and 0 when it overshoots.
+ */
+static int inet_diag_bc_run(const struct nlattr *_bc,
+			    const struct inet_diag_entry *entry)
+{
+	const void *bc = nla_data(_bc);
+	int len = nla_len(_bc);
+
+	while (len > 0) {
+		int yes = 1;
+		const struct inet_diag_bc_op *op = bc;
+
+		switch (op->code) {
+		case INET_DIAG_BC_NOP:
+			break;
+		case INET_DIAG_BC_JMP:
+			/* Unconditional: always take the op->no offset. */
+			yes = 0;
+			break;
+		/* Port comparisons take their operand from the following
+		 * op slot's 'no' field.
+		 */
+		case INET_DIAG_BC_S_EQ:
+			yes = entry->sport == op[1].no;
+			break;
+		case INET_DIAG_BC_S_GE:
+			yes = entry->sport >= op[1].no;
+			break;
+		case INET_DIAG_BC_S_LE:
+			yes = entry->sport <= op[1].no;
+			break;
+		case INET_DIAG_BC_D_EQ:
+			yes = entry->dport == op[1].no;
+			break;
+		case INET_DIAG_BC_D_GE:
+			yes = entry->dport >= op[1].no;
+			break;
+		case INET_DIAG_BC_D_LE:
+			yes = entry->dport <= op[1].no;
+			break;
+		case INET_DIAG_BC_AUTO:
+			yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
+			break;
+		case INET_DIAG_BC_S_COND:
+		case INET_DIAG_BC_D_COND: {
+			const struct inet_diag_hostcond *cond;
+			const __be32 *addr;
+
+			cond = (const struct inet_diag_hostcond *)(op + 1);
+			/* A port of -1 matches any port. */
+			if (cond->port != -1 &&
+			    cond->port != (op->code == INET_DIAG_BC_S_COND ?
+					     entry->sport : entry->dport)) {
+				yes = 0;
+				break;
+			}
+
+			if (op->code == INET_DIAG_BC_S_COND)
+				addr = entry->saddr;
+			else
+				addr = entry->daddr;
+
+			if (cond->family != AF_UNSPEC &&
+			    cond->family != entry->family) {
+				/* An AF_INET condition can still match an
+				 * AF_INET6 entry holding a v4-mapped address
+				 * (::ffff:a.b.c.d); compare its last word.
+				 */
+				if (entry->family == AF_INET6 &&
+				    cond->family == AF_INET) {
+					if (addr[0] == 0 && addr[1] == 0 &&
+					    addr[2] == htonl(0xffff) &&
+					    bitstring_match(addr + 3,
+							    cond->addr,
+							    cond->prefix_len))
+						break;
+				}
+				yes = 0;
+				break;
+			}
+
+			/* A zero-length prefix matches every address. */
+			if (cond->prefix_len == 0)
+				break;
+			if (bitstring_match(addr, cond->addr,
+					    cond->prefix_len))
+				break;
+			yes = 0;
+			break;
+		}
+		case INET_DIAG_BC_DEV_COND: {
+			u32 ifindex;
+
+			ifindex = *((const u32 *)(op + 1));
+			if (ifindex != entry->ifindex)
+				yes = 0;
+			break;
+		}
+		case INET_DIAG_BC_MARK_COND: {
+			struct inet_diag_markcond *cond;
+
+			cond = (struct inet_diag_markcond *)(op + 1);
+			if ((entry->mark & cond->mask) != cond->mark)
+				yes = 0;
+			break;
+		}
+#ifdef CONFIG_SOCK_CGROUP_DATA
+		case INET_DIAG_BC_CGROUP_COND: {
+			u64 cgroup_id;
+
+			/* The operand follows the op and may be unaligned. */
+			cgroup_id = get_unaligned((const u64 *)(op + 1));
+			if (cgroup_id != entry->cgroup_id)
+				yes = 0;
+			break;
+		}
+#endif
+		}
+
+		if (yes) {
+			len -= op->yes;
+			bc += op->yes;
+		} else {
+			len -= op->no;
+			bc += op->no;
+		}
+	}
+	return len == 0;
+}
+
+/* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV)
+ *
+ * Points entry->saddr and entry->daddr at the socket's own address
+ * storage (IPv6 words for AF_INET6 sockets, the IPv4 fields otherwise).
+ * No copy is made, so the pointers are only valid while @sk is.
+ */
+static void entry_fill_addrs(struct inet_diag_entry *entry,
+			     const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6) {
+		entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
+		entry->daddr = sk->sk_v6_daddr.s6_addr32;
+	} else
+#endif
+	{
+		entry->saddr = &sk->sk_rcv_saddr;
+		entry->daddr = &sk->sk_daddr;
+	}
+}
+
+int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_diag_entry entry;
+
+	if (!bc)	/* no bytecode filter: every socket matches */
+		return 1;
+	/* Snapshot what the bytecode can test into @entry, then run the filter. */
+	entry.family = sk->sk_family;
+	entry_fill_addrs(&entry, sk);
+	entry.sport = inet->inet_num;
+	entry.dport = ntohs(inet->inet_dport);	/* converted to host order */
+	entry.ifindex = sk->sk_bound_dev_if;
+	entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
+	if (sk_fullsock(sk))
+		entry.mark = READ_ONCE(sk->sk_mark);
+	else if (sk->sk_state == TCP_NEW_SYN_RECV)	/* mark kept in the request sock */
+		entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
+	else if (sk->sk_state == TCP_TIME_WAIT)	/* mark kept in the timewait sock */
+		entry.mark = inet_twsk(sk)->tw_mark;
+	else
+		entry.mark = 0;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+	entry.cgroup_id = sk_fullsock(sk) ?
+		cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0;
+#endif
+
+	return inet_diag_bc_run(bc, &entry);	/* result of the bytecode run */
+}
+EXPORT_SYMBOL_GPL(inet_diag_bc_sk);
+
+/* Return 1 iff following "yes" jumps from @bc leaves exactly @cc bytes. */
+static int valid_cc(const void *bc, int len, int cc)
+{
+	const struct inet_diag_bc_op *op;
+
+	for (op = bc; len >= 0; op = bc) {
+		if (len <= cc)
+			return len == cc;
+		/* A jump must cover at least one op and be a multiple of 4. */
+		if (op->yes < 4 || (op->yes & 3))
+			return 0;
+		len -= op->yes;
+		bc += op->yes;
+	}
+	return 0;
+}
+
+/* Device condition: the op is followed by a u32 ifindex.
+ * Grows *min_len by that payload; false when @len cannot hold it.
+ */
+static bool valid_devcond(const struct inet_diag_bc_op *op, int len,
+			  int *min_len)
+{
+	int required = *min_len + sizeof(u32);
+
+	*min_len = required;
+	return required <= len;
+}
+/* Validate the inet_diag_hostcond (and trailing address) that follows @op. */
+static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
+			   int *min_len)
+{
+	struct inet_diag_hostcond *cond;
+	int addr_len;
+
+	/* Check hostcond space. */
+	*min_len += sizeof(struct inet_diag_hostcond);
+	if (len < *min_len)
+		return false;
+	cond = (struct inet_diag_hostcond *)(op + 1);	/* payload starts right after the op */
+
+	/* Check address family and address length. */
+	switch (cond->family) {
+	case AF_UNSPEC:
+		addr_len = 0;	/* no address bytes required */
+		break;
+	case AF_INET:
+		addr_len = sizeof(struct in_addr);
+		break;
+	case AF_INET6:
+		addr_len = sizeof(struct in6_addr);
+		break;
+	default:
+		return false;	/* any other family is rejected */
+	}
+	*min_len += addr_len;
+	if (len < *min_len)
+		return false;
+
+	/* Check prefix length (in bits) vs address length (in bytes). */
+	if (cond->prefix_len > 8 * addr_len)
+		return false;
+
+	return true;
+}
+
+/* Port comparisons carry the port in a follow-on inet_diag_bc_op slot. */
+static bool valid_port_comparison(const struct inet_diag_bc_op *op,
+				  int len, int *min_len)
+{
+	int required = *min_len + sizeof(struct inet_diag_bc_op);
+
+	/* Account for the extra slot, then check that it fits in @len. */
+	*min_len = required;
+	return required <= len;
+}
+
+/* Mark condition: the op must be followed by a struct inet_diag_markcond. */
+static bool valid_markcond(const struct inet_diag_bc_op *op, int len, int *min_len)
+{
+	*min_len += sizeof(struct inet_diag_markcond);
+	return *min_len <= len;
+}
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+/* Cgroup condition: the op must be followed by a u64 worth of payload. */
+static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, int *min_len)
+{
+	*min_len += sizeof(u64);
+	return *min_len <= len;
+}
+#endif
+
+static int inet_diag_bc_audit(const struct nlattr *attr,
+			      const struct sk_buff *skb)
+{
+	bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
+	const void *bytecode, *bc;
+	int bytecode_len, len;
+
+	if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op))
+		return -EINVAL;	/* must hold at least one op */
+	/* Walk the ops along their "yes" offsets, checking payload and jumps. */
+	bytecode = bc = nla_data(attr);
+	len = bytecode_len = nla_len(attr);
+
+	while (len > 0) {
+		int min_len = sizeof(struct inet_diag_bc_op);	/* grown by valid_*() */
+		const struct inet_diag_bc_op *op = bc;
+
+		switch (op->code) {
+		case INET_DIAG_BC_S_COND:
+		case INET_DIAG_BC_D_COND:
+			if (!valid_hostcond(bc, len, &min_len))
+				return -EINVAL;
+			break;
+		case INET_DIAG_BC_DEV_COND:
+			if (!valid_devcond(bc, len, &min_len))
+				return -EINVAL;
+			break;
+		case INET_DIAG_BC_S_EQ:
+		case INET_DIAG_BC_S_GE:
+		case INET_DIAG_BC_S_LE:
+		case INET_DIAG_BC_D_EQ:
+		case INET_DIAG_BC_D_GE:
+		case INET_DIAG_BC_D_LE:
+			if (!valid_port_comparison(bc, len, &min_len))
+				return -EINVAL;
+			break;
+		case INET_DIAG_BC_MARK_COND:
+			if (!net_admin)	/* mark filters need CAP_NET_ADMIN */
+				return -EPERM;
+			if (!valid_markcond(bc, len, &min_len))
+				return -EINVAL;
+			break;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+		case INET_DIAG_BC_CGROUP_COND:
+			if (!valid_cgroupcond(bc, len, &min_len))
+				return -EINVAL;
+			break;
+#endif
+		case INET_DIAG_BC_AUTO:
+		case INET_DIAG_BC_JMP:
+		case INET_DIAG_BC_NOP:
+			break;	/* no payload beyond the op itself */
+		default:
+			return -EINVAL;	/* unknown opcode */
+		}
+
+		if (op->code != INET_DIAG_BC_NOP) {	/* NOP's "no" offset is not checked */
+			if (op->no < min_len || op->no > len + 4 || op->no & 3)
+				return -EINVAL;
+			if (op->no < len &&	/* "no" target must be an op boundary */
+			    !valid_cc(bytecode, bytecode_len, len - op->no))
+				return -EINVAL;
+		}
+
+		if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
+			return -EINVAL;
+		bc  += op->yes;
+		len -= op->yes;
+	}
+	return len == 0 ? 0 : -EINVAL;	/* the walk must stop exactly at the end */
+}
+
+/* Build-time check: inet_timewait_sock fields sit at sock/inet_sock offsets. */
+static void twsk_build_assert(void)
+{
+	BUILD_BUG_ON(offsetof(struct sock, sk_family) !=
+		     offsetof(struct inet_timewait_sock, tw_family));
+
+	BUILD_BUG_ON(offsetof(struct inet_sock, inet_num) !=
+		     offsetof(struct inet_timewait_sock, tw_num));
+
+	BUILD_BUG_ON(offsetof(struct inet_sock, inet_dport) !=
+		     offsetof(struct inet_timewait_sock, tw_dport));
+
+	BUILD_BUG_ON(offsetof(struct inet_sock, inet_rcv_saddr) !=
+		     offsetof(struct inet_timewait_sock, tw_rcv_saddr));
+
+	BUILD_BUG_ON(offsetof(struct inet_sock, inet_daddr) !=
+		     offsetof(struct inet_timewait_sock, tw_daddr));
+#if IS_ENABLED(CONFIG_IPV6)
+	BUILD_BUG_ON(offsetof(struct sock, sk_v6_rcv_saddr) !=
+		     offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr));
+
+	BUILD_BUG_ON(offsetof(struct sock, sk_v6_daddr) !=
+		     offsetof(struct inet_timewait_sock, tw_v6_daddr));
+#endif
+}
+
+void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
+			 struct netlink_callback *cb,
+			 const struct inet_diag_req_v2 *r)
+{
+	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+	struct inet_diag_dump_data *cb_data = cb->data;
+	struct net *net = sock_net(skb->sk);
+	u32 idiag_states = r->idiag_states;
+	int i, num, s_i, s_num;
+	struct nlattr *bc;
+	struct sock *sk;
+
+	bc = cb_data->inet_diag_nla_bc;
+	if (idiag_states & TCPF_SYN_RECV)
+		idiag_states |= TCPF_NEW_SYN_RECV;
+	s_i = cb->args[1];	/* resume: bucket index */
+	s_num = num = cb->args[2];	/* resume: position inside that bucket */
+	/* cb->args[0]: 0 = still walking the listening hash, 1 = walking ehash. */
+	if (cb->args[0] == 0) {
+		if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
+			goto skip_listen_ht;
+
+		for (i = s_i; i <= hashinfo->lhash2_mask; i++) {
+			struct inet_listen_hashbucket *ilb;
+			struct hlist_nulls_node *node;
+
+			num = 0;
+			ilb = &hashinfo->lhash2[i];
+
+			spin_lock(&ilb->lock);
+			sk_nulls_for_each(sk, node, &ilb->nulls_head) {
+				struct inet_sock *inet = inet_sk(sk);
+
+				if (!net_eq(sock_net(sk), net))
+					continue;
+
+				if (num < s_num) {	/* before the resume position */
+					num++;
+					continue;
+				}
+
+				if (r->sdiag_family != AF_UNSPEC &&
+				    sk->sk_family != r->sdiag_family)
+					goto next_listen;
+
+				if (r->id.idiag_sport != inet->inet_sport &&
+				    r->id.idiag_sport)
+					goto next_listen;
+
+				if (!inet_diag_bc_sk(bc, sk))
+					goto next_listen;
+
+				if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
+						      cb, r, NLM_F_MULTI,
+						      net_admin) < 0) {
+					spin_unlock(&ilb->lock);
+					goto done;	/* i/num are saved as the resume point */
+				}
+
+next_listen:
+				++num;
+			}
+			spin_unlock(&ilb->lock);
+
+			s_num = 0;	/* later buckets start from their first entry */
+		}
+skip_listen_ht:
+		cb->args[0] = 1;
+		s_i = num = s_num = 0;
+	}
+
+	if (!(idiag_states & ~TCPF_LISTEN))	/* only LISTEN was requested */
+		goto out;
+
+#define SKARR_SZ 16
+	for (i = s_i; i <= hashinfo->ehash_mask; i++) {
+		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
+		struct hlist_nulls_node *node;
+		struct sock *sk_arr[SKARR_SZ];	/* socks collected under the lock */
+		int num_arr[SKARR_SZ];	/* their positions in the chain */
+		int idx, accum, res;
+
+		if (hlist_nulls_empty(&head->chain))
+			continue;
+
+		if (i > s_i)
+			s_num = 0;
+
+next_chunk:
+		num = 0;
+		accum = 0;
+		spin_lock_bh(lock);
+		sk_nulls_for_each(sk, node, &head->chain) {
+			int state;
+
+			if (!net_eq(sock_net(sk), net))
+				continue;
+			if (num < s_num)
+				goto next_normal;
+			state = (sk->sk_state == TCP_TIME_WAIT) ?
+				inet_twsk(sk)->tw_substate : sk->sk_state;
+			if (!(idiag_states & (1 << state)))
+				goto next_normal;
+			if (r->sdiag_family != AF_UNSPEC &&
+			    sk->sk_family != r->sdiag_family)
+				goto next_normal;
+			if (r->id.idiag_sport != htons(sk->sk_num) &&
+			    r->id.idiag_sport)
+				goto next_normal;
+			if (r->id.idiag_dport != sk->sk_dport &&
+			    r->id.idiag_dport)
+				goto next_normal;
+			twsk_build_assert();
+
+			if (!inet_diag_bc_sk(bc, sk))
+				goto next_normal;
+
+			if (!refcount_inc_not_zero(&sk->sk_refcnt))	/* ref held until sock_gen_put() */
+				goto next_normal;
+
+			num_arr[accum] = num;
+			sk_arr[accum] = sk;
+			if (++accum == SKARR_SZ)	/* batch full */
+				break;
+next_normal:
+			++num;
+		}
+		spin_unlock_bh(lock);
+		res = 0;
+		for (idx = 0; idx < accum; idx++) {
+			if (res >= 0) {
+				res = sk_diag_fill(sk_arr[idx], skb, cb, r,
+						   NLM_F_MULTI, net_admin);
+				if (res < 0)
+					num = num_arr[idx];	/* resume at the sock whose fill failed */
+			}
+			sock_gen_put(sk_arr[idx]);	/* released even after a failure */
+		}
+		if (res < 0)
+			break;
+		cond_resched();
+		if (accum == SKARR_SZ) {	/* same bucket, next batch */
+			s_num = num + 1;
+			goto next_chunk;
+		}
+	}
+
+done:
+	cb->args[1] = i;	/* save the resume point for the next call */
+	cb->args[2] = num;
+out:
+	;
+}
+EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
+
+static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			    const struct inet_diag_req_v2 *r)
+{
+	struct inet_diag_dump_data *cb_data = cb->data;
+	const struct inet_diag_handler *handler;
+	u32 prev_min_dump_alloc;
+	int protocol, err = 0;
+
+	protocol = inet_diag_get_protocol(r, cb_data);
+	/* Run the protocol handler's dump; retry after the skb has been enlarged. */
+again:
+	prev_min_dump_alloc = cb->min_dump_alloc;	/* to detect a growth request */
+	handler = inet_diag_lock_handler(protocol);
+	if (!IS_ERR(handler))
+		handler->dump(skb, cb, r);
+	else
+		err = PTR_ERR(handler);
+	inet_diag_unlock_handler(handler);	/* called on the error path too */
+
+	/* The skb is not large enough to fit one sk info and
+	 * inet_sk_diag_fill() has requested for a larger skb.
+	 */
+	if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) {
+		err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL);
+		if (!err)
+			goto again;
+	}
+
+	return err ? : skb->len;	/* error code if set, else skb->len */
+}
+
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh));	/* payload used as the v2 request */
+}
+
+/* Dump ->start: parse request attributes into freshly allocated cb->data. */
+static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen)
+{
+	const struct nlmsghdr *nlh = cb->nlh;
+	struct inet_diag_dump_data *cb_data;
+	struct sk_buff *skb = cb->skb;
+	struct nlattr *nla;
+	int err;
+
+	cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL);
+	if (!cb_data)
+		return -ENOMEM;
+
+	err = inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas);
+	if (err)
+		goto err_free;
+	nla = cb_data->inet_diag_nla_bc;
+	if (nla) {
+		err = inet_diag_bc_audit(nla, skb);
+		if (err)
+			goto err_free;
+	}
+
+	nla = cb_data->inet_diag_nla_bpf_stgs;
+	if (nla) {
+		struct bpf_sk_storage_diag *bpf_stg_diag;
+
+		bpf_stg_diag = bpf_sk_storage_diag_alloc(nla);
+		if (IS_ERR(bpf_stg_diag)) {
+			err = PTR_ERR(bpf_stg_diag);
+			goto err_free;
+		}
+		cb_data->bpf_stg_diag = bpf_stg_diag;
+	}
+
+	cb->data = cb_data;
+	return 0;
+err_free:
+	kfree(cb_data);
+	return err;
+}
+
+static int inet_diag_dump_start(struct netlink_callback *cb)
+{
+	return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2));	/* v2 header size */
+}
+
+static int inet_diag_dump_start_compat(struct netlink_callback *cb)
+{
+	return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req));	/* old header size */
+}
+
+/* Dump ->done: release the per-dump state hanging off cb->data. */
+static int inet_diag_dump_done(struct netlink_callback *cb)
+{
+	struct inet_diag_dump_data *dump_data = cb->data;
+
+	bpf_sk_storage_diag_free(dump_data->bpf_stg_diag);
+	kfree(dump_data);
+	return 0;
+}
+
+/* Map a netlink diag message type to an IP protocol; 0 when unrecognised. */
+static int inet_diag_type2proto(int type)
+{
+	if (type == TCPDIAG_GETSOCK)
+		return IPPROTO_TCP;
+
+	if (type == DCCPDIAG_GETSOCK)
+		return IPPROTO_DCCP;
+
+	return 0;
+}
+
+/* Compat dump: rebuild the old struct inet_diag_req as a v2 request and dump. */
+static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct inet_diag_req *rc = nlmsg_data(cb->nlh);
+	struct inet_diag_req_v2 req;
+
+	req.sdiag_family = AF_UNSPEC; /* compatibility */
+	req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
+	req.idiag_ext = rc->idiag_ext;
+	req.pad = 0; /* req lives on the stack: leave no field uninitialised */
+	req.idiag_states = rc->idiag_states;
+	req.id = rc->id;
+	return __inet_diag_dump(skb, cb, &req);
+}
+
+/* Compat single-socket query: rebuild the old request in v2 layout and run it. */
+static int inet_diag_get_exact_compat(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
+{
+	struct inet_diag_req *rc = nlmsg_data(nlh);
+	struct inet_diag_req_v2 req;
+
+	req.sdiag_family = rc->idiag_family;
+	req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
+	req.idiag_ext = rc->idiag_ext;
+	req.pad = 0; /* req lives on the stack: leave no field uninitialised */
+	req.idiag_states = rc->idiag_states;
+	req.id = rc->id;
+	return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh,
+				   sizeof(struct inet_diag_req), &req);
+}
+
+static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	int hdrlen = sizeof(struct inet_diag_req);
+	struct net *net = sock_net(skb->sk);
+
+	if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
+	    nlmsg_len(nlh) < hdrlen)	/* payload shorter than struct inet_diag_req */
+		return -EINVAL;
+	/* Old-format request: start a dump, or answer a single-socket query. */
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.start = inet_diag_dump_start_compat,
+			.done = inet_diag_dump_done,
+			.dump = inet_diag_dump_compat,
+		};
+		return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
+	}
+
+	return inet_diag_get_exact_compat(skb, nlh);
+}
+
+static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
+{
+	int hdrlen = sizeof(struct inet_diag_req_v2);
+	struct net *net = sock_net(skb->sk);
+
+	if (nlmsg_len(h) < hdrlen)	/* payload shorter than struct inet_diag_req_v2 */
+		return -EINVAL;
+	/* SOCK_DIAG_BY_FAMILY + NLM_F_DUMP starts a dump; the rest is an exact command. */
+	if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
+	    h->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.start = inet_diag_dump_start,
+			.done = inet_diag_dump_done,
+			.dump = inet_diag_dump,
+		};
+		return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+	}
+
+	return inet_diag_cmd_exact(h->nlmsg_type, skb, h, hdrlen,
+				   nlmsg_data(h));
+}
+
+/* Describe @sk in a SOCK_DIAG_BY_FAMILY message appended to @skb. */
+static int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
+{
+	const struct inet_diag_handler *handler;
+	struct inet_diag_msg *r;
+	struct nlmsghdr *nlh;
+	struct nlattr *attr;
+	void *info = NULL;
+	int err;
+
+	nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0);
+	if (!nlh)
+		return -ENOMEM;
+
+	r = nlmsg_data(nlh);
+	memset(r, 0, sizeof(*r));
+	inet_diag_msg_common_fill(r, sk);
+	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM)
+		r->id.idiag_sport = inet_sk(sk)->inet_sport;
+	r->idiag_state = sk->sk_state;
+
+	err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol);
+	if (err)
+		goto cancel;
+
+	handler = inet_diag_lock_handler(sk->sk_protocol);
+	if (IS_ERR(handler)) {
+		inet_diag_unlock_handler(handler);
+		err = PTR_ERR(handler);
+		goto cancel;
+	}
+
+	/* INET_DIAG_INFO is reserved only when the handler reports a size. */
+	if (handler->idiag_info_size) {
+		attr = nla_reserve_64bit(skb, INET_DIAG_INFO,
+					 handler->idiag_info_size, INET_DIAG_PAD);
+		if (attr)
+			info = nla_data(attr);
+	}
+	handler->idiag_get_info(sk, r, info);
+	inet_diag_unlock_handler(handler);
+	nlmsg_end(skb, nlh);
+	return 0;
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return err;
+}
+
+static const struct sock_diag_handler inet_diag_handler = {
+	.family = AF_INET,
+	.dump = inet_diag_handler_cmd,
+	.get_info = inet_diag_handler_get_info,
+	.destroy = inet_diag_handler_cmd,	/* same entry point as .dump */
+};
+
+static const struct sock_diag_handler inet6_diag_handler = {
+	.family = AF_INET6,	/* otherwise identical to inet_diag_handler */
+	.dump = inet_diag_handler_cmd,
+	.get_info = inet_diag_handler_get_info,
+	.destroy = inet_diag_handler_cmd,	/* same entry point as .dump */
+};
+
+/* Install @h in the slot for h->idiag_type.
+ * Returns 0, -EINVAL (type out of range) or -EEXIST (slot already taken).
+ */
+int inet_diag_register(const struct inet_diag_handler *h)
+{
+	const __u16 proto = h->idiag_type;
+	int ret = 0;
+
+	if (proto >= IPPROTO_MAX)
+		return -EINVAL;
+	mutex_lock(&inet_diag_table_mutex);
+	if (inet_diag_table[proto])
+		ret = -EEXIST;
+	else
+		inet_diag_table[proto] = h;
+	mutex_unlock(&inet_diag_table_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(inet_diag_register);
+
+/* Clear the table slot for h->idiag_type; out-of-range types are ignored. */
+void inet_diag_unregister(const struct inet_diag_handler *h)
+{
+	const __u16 proto = h->idiag_type;
+
+	if (proto < IPPROTO_MAX) {
+		mutex_lock(&inet_diag_table_mutex);
+		inet_diag_table[proto] = NULL;
+		mutex_unlock(&inet_diag_table_mutex);
+	}
+}
+EXPORT_SYMBOL_GPL(inet_diag_unregister);
+
+static int __init inet_diag_init(void)
+{
+	const int inet_diag_table_size = (IPPROTO_MAX *
+					  sizeof(struct inet_diag_handler *));
+	int err = -ENOMEM;
+
+	inet_diag_table = kzalloc(inet_diag_table_size, GFP_KERNEL);	/* IPPROTO_MAX slots */
+	if (!inet_diag_table)
+		goto out;
+	/* Register the AF_INET and AF_INET6 sock_diag handlers, then the compat hook. */
+	err = sock_diag_register(&inet_diag_handler);
+	if (err)
+		goto out_free_nl;
+
+	err = sock_diag_register(&inet6_diag_handler);
+	if (err)
+		goto out_free_inet;
+
+	sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
+out:
+	return err;	/* 0 here on success: err holds the last register result */
+
+out_free_inet:	/* unwind in reverse order of setup */
+	sock_diag_unregister(&inet_diag_handler);
+out_free_nl:
+	kfree(inet_diag_table);
+	goto out;
+}
+
+static void __exit inet_diag_exit(void)	/* undoes inet_diag_init() */
+{
+	sock_diag_unregister(&inet6_diag_handler);
+	sock_diag_unregister(&inet_diag_handler);
+	sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
+	kfree(inet_diag_table);	/* freed once nothing is registered any more */
+}
+
+module_init(inet_diag_init);
+module_exit(inet_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
new file mode 100644
index 0000000000..7072fc0783
--- /dev/null
+++ b/net/ipv4/inet_fragment.c
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * inet fragments management
+ *
+ * 		Authors:	Pavel Emelyanov <xemul@openvz.org>
+ *				Started as consolidation of ipv4/ip_fragment.c,
+ *				ipv6/reassembly.c and ipv6 nf conntrack reassembly
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/rhashtable.h>
+
+#include <net/sock.h>
+#include <net/inet_frag.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+/* Use skb->cb to track consecutive/adjacent fragments coming at
+ * the end of the queue. Nodes in the rb-tree queue will
+ * contain "runs" of one or more adjacent fragments.
+ *
+ * Invariants:
+ * - next_frag is NULL at the tail of a "run";
+ * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
+ */
+struct ipfrag_skb_cb {
+	union {
+		struct inet_skb_parm	h4;
+		struct inet6_skb_parm	h6;
+	};
+	struct sk_buff		*next_frag;	/* next skb of the same run; NULL at its tail */
+	int			frag_run_len;	/* on a run head: summed length of the run */
+};
+
+#define FRAG_CB(skb)		((struct ipfrag_skb_cb *)((skb)->cb))
+
+static void fragcb_clear(struct sk_buff *skb)	/* make skb a stand-alone run */
+{
+	FRAG_CB(skb)->frag_run_len = skb->len;	/* run of one: its own length */
+	FRAG_CB(skb)->next_frag = NULL;	/* nothing chained behind it */
+	RB_CLEAR_NODE(&skb->rbnode);	/* not linked into any tree */
+}
+
+/* Append skb to the last "run": chained via next_frag, no rb-tree insertion. */
+static void fragrun_append_to_last(struct inet_frag_queue *q,
+				   struct sk_buff *skb)
+{
+	fragcb_clear(skb);
+
+	FRAG_CB(q->last_run_head)->frag_run_len += skb->len;	/* run head keeps the total */
+	FRAG_CB(q->fragments_tail)->next_frag = skb;	/* chain behind the old tail */
+	q->fragments_tail = skb;
+}
+
+/* Create a new "run" with the skb; it becomes both last run head and tail. */
+static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
+{
+	BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
+	fragcb_clear(skb);
+
+	if (q->last_run_head)	/* link as right child of the previous last run */
+		rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
+			     &q->last_run_head->rbnode.rb_right);
+	else	/* no run yet: link at the root slot */
+		rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
+	rb_insert_color(&skb->rbnode, &q->rb_fragments);
+
+	q->fragments_tail = skb;
+	q->last_run_head = skb;
+}
+
+/* Indexed by the OR of all fragments' IPFRAG_ECN_* bits (RFC 3168 5.3).
+ * Value : 0xff if frame should be dropped.
+ *         0 or INET_ECN_CE value, to be ORed into the final iph->tos field
+ */
+const u8 ip_frag_ecn_table[16] = {
+	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,
+
+	/* invalid combinations (NOT_ECT mixed with anything else) : drop frame */
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+};
+EXPORT_SYMBOL(ip_frag_ecn_table);
+
+int inet_frags_init(struct inet_frags *f)
+{
+	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
+					    NULL);	/* objects of f->qsize bytes */
+	if (!f->frags_cachep)
+		return -ENOMEM;
+	/* Initial reference; inet_frags_fini() drops it and waits for the rest. */
+	refcount_set(&f->refcnt, 1);
+	init_completion(&f->completion);
+	return 0;
+}
+EXPORT_SYMBOL(inet_frags_init);
+
+void inet_frags_fini(struct inet_frags *f)
+{
+	if (refcount_dec_and_test(&f->refcnt))	/* drop the initial reference */
+		complete(&f->completion);
+	/* Block until whoever drops the last reference signals completion. */
+	wait_for_completion(&f->completion);
+
+	kmem_cache_destroy(f->frags_cachep);
+	f->frags_cachep = NULL;
+}
+EXPORT_SYMBOL(inet_frags_fini);
+
+/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
+static void inet_frags_free_cb(void *ptr, void *arg)
+{
+	struct inet_frag_queue *fq = ptr;
+	int count;	/* number of references this callback gives up */
+
+	count = del_timer_sync(&fq->timer) ? 1 : 0;	/* a pending timer held one (cf. inet_frag_kill()) */
+
+	spin_lock_bh(&fq->lock);
+	fq->flags |= INET_FRAG_DROP;
+	if (!(fq->flags & INET_FRAG_COMPLETE)) {
+		fq->flags |= INET_FRAG_COMPLETE;
+		count++;	/* queue was never killed: the hash reference is still held */
+	} else if (fq->flags & INET_FRAG_HASH_DEAD) {
+		count++;	/* inet_frag_kill() skipped the hash removal and its refcount_dec() */
+	}
+	spin_unlock_bh(&fq->lock);
+
+	if (refcount_sub_and_test(count, &fq->refcnt))
+		inet_frag_destroy(fq);
+}
+
+static LLIST_HEAD(fqdir_free_list);
+
+static void fqdir_free_fn(struct work_struct *work)
+{
+	struct llist_node *kill_list;
+	struct fqdir *fqdir, *tmp;
+	struct inet_frags *f;
+
+	/* Atomically snapshot the list of fqdirs to free */
+	kill_list = llist_del_all(&fqdir_free_list);
+
+	/* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
+	 * have completed, since they need to dereference fqdir.
+	 * Would it not be nice to have kfree_rcu_barrier() ? :)
+	 */
+	rcu_barrier();
+
+	llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) {
+		f = fqdir->f;
+		if (refcount_dec_and_test(&f->refcnt))	/* pairs with refcount_inc() in fqdir_init() */
+			complete(&f->completion);	/* lets inet_frags_fini() proceed */
+
+		kfree(fqdir);
+	}
+}
+
+static DECLARE_WORK(fqdir_free_work, fqdir_free_fn);
+
+static void fqdir_work_fn(struct work_struct *work)
+{
+	struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
+
+	rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
+	/* Hand the fqdir to fqdir_free_fn(); queue the work only if llist_add() says so. */
+	if (llist_add(&fqdir->free_list, &fqdir_free_list))
+		queue_work(system_wq, &fqdir_free_work);
+}
+
+int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
+{
+	struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
+	int res;
+
+	if (!fqdir)
+		return -ENOMEM;
+	fqdir->f = f;
+	fqdir->net = net;
+	res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
+	if (res < 0) {
+		kfree(fqdir);	/* *fqdirp is left untouched on failure */
+		return res;
+	}
+	refcount_inc(&f->refcnt);	/* dropped again in fqdir_free_fn() */
+	*fqdirp = fqdir;
+	return 0;
+}
+EXPORT_SYMBOL(fqdir_init);
+
+static struct workqueue_struct *inet_frag_wq;
+
+static int __init inet_frag_wq_init(void)
+{
+	inet_frag_wq = create_workqueue("inet_frag_wq");	/* used by fqdir_exit() */
+	if (!inet_frag_wq)
+		panic("Could not create inet frag workq");	/* failure is treated as fatal */
+	return 0;
+}
+
+pure_initcall(inet_frag_wq_init);
+
+void fqdir_exit(struct fqdir *fqdir)
+{
+	INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
+	queue_work(inet_frag_wq, &fqdir->destroy_work);	/* teardown continues in fqdir_work_fn() */
+}
+EXPORT_SYMBOL(fqdir_exit);
+
+void inet_frag_kill(struct inet_frag_queue *fq)
+{
+	if (del_timer(&fq->timer))	/* timer was pending: drop the reference it held */
+		refcount_dec(&fq->refcnt);
+
+	if (!(fq->flags & INET_FRAG_COMPLETE)) {	/* only the first kill does the hash work */
+		struct fqdir *fqdir = fq->fqdir;
+
+		fq->flags |= INET_FRAG_COMPLETE;
+		rcu_read_lock();
+		/* The RCU read lock provides a memory barrier
+		 * guaranteeing that if fqdir->dead is false then
+		 * the hash table destruction will not start until
+		 * after we unlock.  Paired with fqdir_pre_exit().
+		 */
+		if (!READ_ONCE(fqdir->dead)) {
+			rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
+					       fqdir->f->rhash_params);
+			refcount_dec(&fq->refcnt);	/* the hash table's reference */
+		} else {
+			fq->flags |= INET_FRAG_HASH_DEAD;	/* inet_frags_free_cb() accounts for it */
+		}
+		rcu_read_unlock();
+	}
+}
+EXPORT_SYMBOL(inet_frag_kill);
+
+/* RCU callback: run the optional destructor, then free the queue object. */
+static void inet_frag_destroy_rcu(struct rcu_head *head)
+{
+	struct inet_frag_queue *fq = container_of(head, struct inet_frag_queue, rcu);
+	struct inet_frags *frags = fq->fqdir->f;
+
+	if (frags->destructor)
+		frags->destructor(fq);
+	kmem_cache_free(frags->frags_cachep, fq);
+}
+
+/* Free every fragment in @root with @reason; returns the summed truesize. */
+unsigned int inet_frag_rbtree_purge(struct rb_root *root, enum skb_drop_reason reason)
+{
+	struct rb_node *node, *following;
+	unsigned int freed = 0;
+
+	for (node = rb_first(root); node; node = following) {
+		struct sk_buff *frag = rb_entry(node, struct sk_buff, rbnode);
+		struct sk_buff *next_frag;
+
+		/* Fetch the successor before the node leaves the tree. */
+		following = rb_next(node);
+		rb_erase(&frag->rbnode, root);
+		for (; frag; frag = next_frag) {
+			next_frag = FRAG_CB(frag)->next_frag;
+			freed += frag->truesize;
+			kfree_skb_reason(frag, reason);
+		}
+	}
+	return freed;
+}
+EXPORT_SYMBOL(inet_frag_rbtree_purge);
+
+void inet_frag_destroy(struct inet_frag_queue *q)
+{
+	unsigned int sum, sum_truesize = 0;
+	enum skb_drop_reason reason;
+	struct inet_frags *f;
+	struct fqdir *fqdir;
+
+	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));	/* queue must have been killed first */
+	reason = (q->flags & INET_FRAG_DROP) ?
+			SKB_DROP_REASON_FRAG_REASM_TIMEOUT :
+			SKB_CONSUMED;
+	WARN_ON(del_timer(&q->timer) != 0);	/* no timer may still be pending here */
+
+	/* Release all fragment data. */
+	fqdir = q->fqdir;
+	f = fqdir->f;
+	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason);
+	sum = sum_truesize + f->qsize;	/* skbs plus the queue object itself */
+
+	call_rcu(&q->rcu, inet_frag_destroy_rcu);	/* the actual free is deferred */
+
+	sub_frag_mem_limit(fqdir, sum);
+}
+EXPORT_SYMBOL(inet_frag_destroy);
+
+static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
+					       struct inet_frags *f,
+					       void *arg)
+{
+	struct inet_frag_queue *q;
+
+	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);	/* zeroed queue object */
+	if (!q)
+		return NULL;
+
+	q->fqdir = fqdir;
+	f->constructor(q, arg);
+	add_frag_mem_limit(fqdir, f->qsize);	/* undone in inet_frag_destroy() */
+
+	timer_setup(&q->timer, f->frag_expire, 0);
+	spin_lock_init(&q->lock);
+	refcount_set(&q->refcnt, 3);	/* NOTE(review): presumably timer + hash table + caller */
+
+	return q;
+}
+
+static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
+						void *arg,
+						struct inet_frag_queue **prev)
+{
+	struct inet_frags *f = fqdir->f;
+	struct inet_frag_queue *q;
+
+	q = inet_frag_alloc(fqdir, f, arg);
+	if (!q) {
+		*prev = ERR_PTR(-ENOMEM);	/* caller tests *prev with IS_ERR_OR_NULL() */
+		return NULL;
+	}
+	mod_timer(&q->timer, jiffies + fqdir->timeout);	/* arm the expiry timer */
+
+	*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
+						 &q->node, f->rhash_params);
+	if (*prev) {	/* non-NULL: q was not inserted, so throw it away */
+		q->flags |= INET_FRAG_COMPLETE;	/* makes inet_frag_kill() skip the hash removal */
+		inet_frag_kill(q);
+		inet_frag_destroy(q);
+		return NULL;
+	}
+	return q;
+}
+
+/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
+struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
+{
+	/* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
+	long high_thresh = READ_ONCE(fqdir->high_thresh);
+	struct inet_frag_queue *fq = NULL, *prev;
+
+	if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
+		return NULL;	/* threshold is zero or memory use is above it */
+
+	rcu_read_lock();
+
+	prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
+	if (!prev)	/* not found: try to create and insert a new queue */
+		fq = inet_frag_create(fqdir, key, &prev);
+	if (!IS_ERR_OR_NULL(prev)) {	/* an existing queue: use it if a ref can be taken */
+		fq = prev;
+		if (!refcount_inc_not_zero(&fq->refcnt))
+			fq = NULL;
+	}
+	rcu_read_unlock();
+	return fq;
+}
+EXPORT_SYMBOL(inet_frag_find);
+
+int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
+			   int offset, int end)
+{
+	struct sk_buff *last = q->fragments_tail;
+
+	/* RFC5722, Section 4, amended by Errata ID : 3089
+	 *                          When reassembling an IPv6 datagram, if
+	 *   one or more its constituent fragments is determined to be an
+	 *   overlapping fragment, the entire datagram (and any constituent
+	 *   fragments) MUST be silently discarded.
+	 *
+	 * Duplicates, however, should be ignored (i.e. skb dropped, but the
+	 * queue/fragments kept for later reassembly).
+	 */
+	if (!last)
+		fragrun_create(q, skb);  /* First fragment. */
+	else if (last->ip_defrag_offset + last->len < end) {
+		/* This is the common case: skb goes to the end. */
+		/* Detect and discard overlaps. */
+		if (offset < last->ip_defrag_offset + last->len)
+			return IPFRAG_OVERLAP;
+		if (offset == last->ip_defrag_offset + last->len)
+			fragrun_append_to_last(q, skb);	/* contiguous with the tail: extend its run */
+		else
+			fragrun_create(q, skb);	/* gap after the tail: start a new run */
+	} else {
+		/* Binary search. Note that skb can become the first fragment,
+		 * but not the last (covered above).
+		 */
+		struct rb_node **rbn, *parent;
+
+		rbn = &q->rb_fragments.rb_node;
+		do {
+			struct sk_buff *curr;
+			int curr_run_end;
+
+			parent = *rbn;
+			curr = rb_to_skb(parent);
+			curr_run_end = curr->ip_defrag_offset +
+					FRAG_CB(curr)->frag_run_len;
+			if (end <= curr->ip_defrag_offset)	/* entirely before this run */
+				rbn = &parent->rb_left;
+			else if (offset >= curr_run_end)	/* entirely after this run */
+				rbn = &parent->rb_right;
+			else if (offset >= curr->ip_defrag_offset &&
+				 end <= curr_run_end)	/* fully inside this run */
+				return IPFRAG_DUP;
+			else
+				return IPFRAG_OVERLAP;
+		} while (*rbn);
+		/* Here we have parent properly set, and rbn pointing to
+		 * one of its NULL left/right children. Insert skb.
+		 */
+		fragcb_clear(skb);
+		rb_link_node(&skb->rbnode, parent, rbn);
+		rb_insert_color(&skb->rbnode, &q->rb_fragments);
+	}
+
+	skb->ip_defrag_offset = offset;	/* reached only when skb was queued */
+
+	return IPFRAG_OK;
+}
+EXPORT_SYMBOL(inet_frag_queue_insert);
+
+void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
+			      struct sk_buff *parent)	/* returns a link slot, or NULL */
+{
+	struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
+	struct sk_buff **nextp;
+	int delta;
+
+	if (head != skb) {	/* make @skb take over the role of the first fragment */
+		fp = skb_clone(skb, GFP_ATOMIC);
+		if (!fp)
+			return NULL;
+		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
+		if (RB_EMPTY_NODE(&skb->rbnode))	/* skb sits inside a run, behind @parent */
+			FRAG_CB(parent)->next_frag = fp;
+		else
+			rb_replace_node(&skb->rbnode, &fp->rbnode,
+					&q->rb_fragments);
+		if (q->fragments_tail == skb)
+			q->fragments_tail = fp;
+		skb_morph(skb, head);	/* fp now stands in for skb's old position */
+		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
+		rb_replace_node(&head->rbnode, &skb->rbnode,
+				&q->rb_fragments);
+		consume_skb(head);
+		head = skb;
+	}
+	WARN_ON(head->ip_defrag_offset != 0);
+	/* delta ends up as the truesize change caused by skb_unclone(). */
+	delta = -head->truesize;
+
+	/* Head of list must not be cloned. */
+	if (skb_unclone(head, GFP_ATOMIC))
+		return NULL;
+
+	delta += head->truesize;
+	if (delta)
+		add_frag_mem_limit(q->fqdir, delta);
+
+	/* If the first fragment is fragmented itself, we split
+	 * it to two chunks: the first with data and paged part
+	 * and the second, holding only fragments.
+	 */
+	if (skb_has_frag_list(head)) {
+		struct sk_buff *clone;
+		int i, plen = 0;
+
+		clone = alloc_skb(0, GFP_ATOMIC);
+		if (!clone)
+			return NULL;
+		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+		skb_frag_list_init(head);
+		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);	/* paged bytes kept by head */
+		clone->data_len = head->data_len - plen;
+		clone->len = clone->data_len;
+		head->truesize += clone->truesize;
+		clone->csum = 0;
+		clone->ip_summed = head->ip_summed;
+		add_frag_mem_limit(q->fqdir, clone->truesize);
+		skb_shinfo(head)->frag_list = clone;
+		nextp = &clone->next;	/* further fragments get chained after clone */
+	} else {
+		nextp = &skb_shinfo(head)->frag_list;	/* further fragments start the frag_list */
+	}
+
+	return nextp;
+}
+EXPORT_SYMBOL(inet_frag_reasm_prepare);
+
+void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
+			    void *reasm_data, bool try_coalesce)
+{
+	struct sk_buff **nextp = reasm_data;	/* slot returned by inet_frag_reasm_prepare() */
+	struct rb_node *rbn;
+	struct sk_buff *fp;
+	int sum_truesize;
+
+	skb_push(head, head->data - skb_network_header(head));	/* data starts at the network header again */
+
+	/* Traverse the tree in order, to build frag_list. */
+	fp = FRAG_CB(head)->next_frag;
+	rbn = rb_next(&head->rbnode);
+	rb_erase(&head->rbnode, &q->rb_fragments);
+
+	sum_truesize = head->truesize;	/* total handed back to the memory accounting below */
+	while (rbn || fp) {
+		/* fp points to the next sk_buff in the current run;
+		 * rbn points to the next run.
+		 */
+		/* Go through the current run. */
+		while (fp) {
+			struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
+			bool stolen;
+			int delta;
+
+			sum_truesize += fp->truesize;
+			if (head->ip_summed != fp->ip_summed)
+				head->ip_summed = CHECKSUM_NONE;
+			else if (head->ip_summed == CHECKSUM_COMPLETE)
+				head->csum = csum_add(head->csum, fp->csum);
+
+			if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
+							     &delta)) {
+				kfree_skb_partial(fp, stolen);	/* fp was coalesced into head */
+			} else {
+				fp->prev = NULL;
+				memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+				fp->sk = NULL;
+
+				head->data_len += fp->len;
+				head->len += fp->len;
+				head->truesize += fp->truesize;
+
+				*nextp = fp;	/* chain fp behind what was linked so far */
+				nextp = &fp->next;
+			}
+
+			fp = next_frag;
+		}
+		/* Move to the next run. */
+		if (rbn) {
+			struct rb_node *rbnext = rb_next(rbn);
+
+			fp = rb_to_skb(rbn);
+			rb_erase(rbn, &q->rb_fragments);
+			rbn = rbnext;
+		}
+	}
+	sub_frag_mem_limit(q->fqdir, sum_truesize);
+
+	*nextp = NULL;	/* terminate the chain */
+	skb_mark_not_on_list(head);
+	head->prev = NULL;
+	head->tstamp = q->stamp;
+	head->mono_delivery_time = q->mono_delivery_time;
+}
+EXPORT_SYMBOL(inet_frag_reasm_finish);
+
+struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
+{
+	struct sk_buff *head, *skb;
+
+	head = skb_rb_first(&q->rb_fragments);
+	if (!head)
+		return NULL;	/* queue holds no fragments */
+	skb = FRAG_CB(head)->next_frag;
+	if (skb)	/* the run continues: its next skb takes head's tree node */
+		rb_replace_node(&head->rbnode, &skb->rbnode,
+				&q->rb_fragments);
+	else
+		rb_erase(&head->rbnode, &q->rb_fragments);
+	memset(&head->rbnode, 0, sizeof(head->rbnode));
+	barrier();
+
+	if (head == q->fragments_tail)
+		q->fragments_tail = NULL;
+
+	sub_frag_mem_limit(q->fqdir, head->truesize);	/* head no longer counts against the queue */
+
+	return head;
+}
+EXPORT_SYMBOL(inet_frag_pull_head);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
new file mode 100644
index 0000000000..a532f749e4
--- /dev/null
+++ b/net/ipv4/inet_hashtables.c
@@ -0,0 +1,1277 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Generic INET transport hashtables
+ *
+ * Authors:	Lotsa people, from code originally in tcp
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/vmalloc.h>
+#include <linux/memblock.h>
+
+#include <net/addrconf.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/inet6_hashtables.h>
+#endif
+#include <net/secure_seq.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/sock_reuseport.h>
+
+u32 inet_ehashfn(const struct net *net, const __be32 laddr,
+		 const __u16 lport, const __be32 faddr,
+		 const __be16 fport)
+{
+	static u32 inet_ehash_secret __read_mostly; /* filled in once, just below */
+
+	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
+
+	return __inet_ehashfn(laddr, lport, faddr, fport,
+			      inet_ehash_secret + net_hash_mix(net)); /* seed mixed with netns */
+}
+EXPORT_SYMBOL_GPL(inet_ehashfn);
+
+/* Compute the ehash value of @sk. Handles inet_sock, but also timewait and
+ * request sockets; AF_INET6 sockets with a v4-mapped daddr use the IPv4 hash.
+ */
+static u32 sk_ehashfn(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6 &&
+	    !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+		return inet6_ehashfn(sock_net(sk),
+				     &sk->sk_v6_rcv_saddr, sk->sk_num,
+				     &sk->sk_v6_daddr, sk->sk_dport);
+#endif
+	return inet_ehashfn(sock_net(sk),
+			    sk->sk_rcv_saddr, sk->sk_num,
+			    sk->sk_daddr, sk->sk_dport);
+}
+
+/*
+ * Allocate (GFP_ATOMIC) a bind bucket for @snum and link it on @head->chain.
+ * Returns NULL on failure. The bindhash lock for @head must be held here.
+ */
+struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
+						 struct net *net,
+						 struct inet_bind_hashbucket *head,
+						 const unsigned short snum,
+						 int l3mdev)
+{
+	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
+
+	if (tb) {
+		write_pnet(&tb->ib_net, net);
+		tb->l3mdev    = l3mdev;
+		tb->port      = snum;
+		tb->fastreuse = 0;
+		tb->fastreuseport = 0;
+		INIT_HLIST_HEAD(&tb->owners);
+		hlist_add_head(&tb->node, &head->chain);
+	}
+	return tb;
+}
+
+/* Unlink and free @tb once it has no owners left; otherwise do nothing.
+ * Caller must hold hashbucket lock for this tb with local BH disabled
+ */
+void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+{
+	if (hlist_empty(&tb->owners)) {
+		__hlist_del(&tb->node);
+		kmem_cache_free(cachep, tb);
+	}
+}
+
+bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
+			    unsigned short port, int l3mdev)
+{
+	return net_eq(ib_net(tb), net) && tb->port == port &&
+		tb->l3mdev == l3mdev; /* netns, port and l3mdev must all match */
+}
+
+static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb,
+				   struct net *net,
+				   struct inet_bind_hashbucket *head,
+				   unsigned short port, int l3mdev,
+				   const struct sock *sk)
+{
+	write_pnet(&tb->ib_net, net);
+	tb->l3mdev    = l3mdev;
+	tb->port      = port;
+#if IS_ENABLED(CONFIG_IPV6)
+	tb->family    = sk->sk_family; /* family is only recorded with IPv6 enabled */
+	if (sk->sk_family == AF_INET6)
+		tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+	else
+#endif
+		tb->rcv_saddr = sk->sk_rcv_saddr; /* the only branch without IPv6 */
+	INIT_HLIST_HEAD(&tb->owners);
+	INIT_HLIST_HEAD(&tb->deathrow);
+	hlist_add_head(&tb->node, &head->chain); /* link at the head of @head's chain */
+}
+
+struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
+						   struct net *net,
+						   struct inet_bind_hashbucket *head,
+						   unsigned short port,
+						   int l3mdev,
+						   const struct sock *sk)
+{
+	struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
+
+	if (tb) /* initialise and link only a successful allocation */
+		inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk);
+
+	return tb; /* NULL if the allocation failed */
+}
+
+/* Caller must hold hashbucket lock for this tb with local BH disabled */
+void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
+{
+	if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) {
+		__hlist_del(&tb->node); /* both lists empty: unlink and free */
+		kmem_cache_free(cachep, tb);
+	}
+}
+
+static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
+					 const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family != tb2->family) { /* families differ: compare via v4-mapped form */
+		if (sk->sk_family == AF_INET) /* IPv4 sk against the bucket's v6 address */
+			return ipv6_addr_v4mapped(&tb2->v6_rcv_saddr) &&
+				tb2->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr;
+
+		return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) &&
+			sk->sk_v6_rcv_saddr.s6_addr32[3] == tb2->rcv_saddr;
+	}
+
+	if (sk->sk_family == AF_INET6)
+		return ipv6_addr_equal(&tb2->v6_rcv_saddr,
+				       &sk->sk_v6_rcv_saddr);
+#endif
+	return tb2->rcv_saddr == sk->sk_rcv_saddr;
+}
+
+void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+		    struct inet_bind2_bucket *tb2, unsigned short port)
+{
+	inet_sk(sk)->inet_num = port; /* record the local port on the socket */
+	sk_add_bind_node(sk, &tb->owners); /* sk joins tb's owners ... */
+	inet_csk(sk)->icsk_bind_hash = tb;
+	sk_add_bind2_node(sk, &tb2->owners); /* ... and tb2's owners */
+	inet_csk(sk)->icsk_bind2_hash = tb2;
+}
+
+/* Drop @sk's references on its bhash and bhash2 buckets and clear its local
+ * port; a bucket left without users is freed. Does not disable BH itself.
+ */
+static void __inet_put_port(struct sock *sk)
+{
+	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+	struct inet_bind_hashbucket *head, *head2;
+	struct net *net = sock_net(sk);
+	struct inet_bind_bucket *tb;
+	int bhash;
+
+	bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
+	head = &hashinfo->bhash[bhash];
+	head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);
+
+	spin_lock(&head->lock);
+	tb = inet_csk(sk)->icsk_bind_hash;
+	__sk_del_bind_node(sk);
+	inet_csk(sk)->icsk_bind_hash = NULL;
+	inet_sk(sk)->inet_num = 0; /* head/head2 were computed from the old port above */
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+
+	spin_lock(&head2->lock); /* nested inside head->lock */
+	if (inet_csk(sk)->icsk_bind2_hash) {
+		struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;
+
+		__sk_del_bind2_node(sk);
+		inet_csk(sk)->icsk_bind2_hash = NULL;
+		inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
+	}
+	spin_unlock(&head2->lock);
+
+	spin_unlock(&head->lock);
+}
+
+void inet_put_port(struct sock *sk)
+{
+	local_bh_disable(); /* __inet_put_port() uses plain spin_lock() */
+	__inet_put_port(sk);
+	local_bh_enable();
+}
+EXPORT_SYMBOL(inet_put_port);
+
+int __inet_inherit_port(const struct sock *sk, struct sock *child)
+{
+	struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk);
+	unsigned short port = inet_sk(child)->inet_num;
+	struct inet_bind_hashbucket *head, *head2;
+	bool created_inet_bind_bucket = false;
+	struct net *net = sock_net(sk);
+	bool update_fastreuse = false;
+	struct inet_bind2_bucket *tb2;
+	struct inet_bind_bucket *tb;
+	int bhash, l3mdev;
+
+	bhash = inet_bhashfn(net, port, table->bhash_size);
+	head = &table->bhash[bhash];
+	head2 = inet_bhashfn_portaddr(table, child, net, port);
+
+	spin_lock(&head->lock);
+	spin_lock(&head2->lock);
+	tb = inet_csk(sk)->icsk_bind_hash; /* start from @sk's own buckets */
+	tb2 = inet_csk(sk)->icsk_bind2_hash;
+	if (unlikely(!tb || !tb2)) {
+		spin_unlock(&head2->lock);
+		spin_unlock(&head->lock);
+		return -ENOENT; /* @sk has no bind bucket to inherit */
+	}
+	if (tb->port != port) { /* child's port differs from @sk's bucket */
+		l3mdev = inet_sk_bound_l3mdev(sk);
+
+		/* NOTE: using tproxy and redirecting skbs to a proxy
+		 * on a different listener port breaks the assumption
+		 * that the listener socket's icsk_bind_hash is the same
+		 * as that of the child socket. We have to look up or
+		 * create a new bind bucket for the child here. */
+		inet_bind_bucket_for_each(tb, &head->chain) {
+			if (inet_bind_bucket_match(tb, net, port, l3mdev))
+				break;
+		}
+		if (!tb) {
+			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
+						     net, head, port, l3mdev);
+			if (!tb) {
+				spin_unlock(&head2->lock);
+				spin_unlock(&head->lock);
+				return -ENOMEM;
+			}
+			created_inet_bind_bucket = true; /* error path must undo this */
+		}
+		update_fastreuse = true;
+
+		goto bhash2_find;
+	} else if (!inet_bind2_bucket_addr_match(tb2, child)) { /* same port, other address */
+		l3mdev = inet_sk_bound_l3mdev(sk);
+
+bhash2_find:
+		tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
+		if (!tb2) {
+			tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
+						       net, head2, port,
+						       l3mdev, child);
+			if (!tb2)
+				goto error;
+		}
+	}
+	if (update_fastreuse)
+		inet_csk_update_fastreuse(tb, child);
+	inet_bind_hash(child, tb, tb2, port);
+	spin_unlock(&head2->lock);
+	spin_unlock(&head->lock);
+
+	return 0;
+
+error:
+	if (created_inet_bind_bucket) /* only drop a bucket created by this call */
+		inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
+	spin_unlock(&head2->lock);
+	spin_unlock(&head->lock);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(__inet_inherit_port);
+
+static struct inet_listen_hashbucket *
+inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
+{
+	u32 hash; /* port+address hash of sk's local binding */
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		hash = ipv6_portaddr_hash(sock_net(sk),
+					  &sk->sk_v6_rcv_saddr,
+					  inet_sk(sk)->inet_num);
+	else
+#endif
+		hash = ipv4_portaddr_hash(sock_net(sk),
+					  inet_sk(sk)->inet_rcv_saddr,
+					  inet_sk(sk)->inet_num);
+	return inet_lhash2_bucket(h, hash);
+}
+
+static inline int compute_score(struct sock *sk, struct net *net,
+				const unsigned short hnum, const __be32 daddr,
+				const int dif, const int sdif)
+{
+	int score = -1; /* -1: @sk is not a candidate */
+
+	if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
+			!ipv6_only_sock(sk)) {
+		if (sk->sk_rcv_saddr != daddr)
+			return -1;
+
+		if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+			return -1;
+		score =  sk->sk_bound_dev_if ? 2 : 1; /* device-bound sockets rank higher */
+
+		if (sk->sk_family == PF_INET)
+			score++; /* bonus for a PF_INET socket */
+		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
+			score++; /* bonus when sk_incoming_cpu is this CPU */
+	}
+	return score;
+}
+
+/**
+ * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
+ * @net: network namespace.
+ * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
+ * @skb: context for a potential SK_REUSEPORT program.
+ * @doff: header offset.
+ * @saddr: source address.
+ * @sport: source port.
+ * @daddr: destination address.
+ * @hnum: destination port in host byte order.
+ * @ehashfn: hash function for the fallback hash, called with destination first.
+ *
+ * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
+ *         the selected sock or an error.
+ */
+struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk,
+				   struct sk_buff *skb, int doff,
+				   __be32 saddr, __be16 sport,
+				   __be32 daddr, unsigned short hnum,
+				   inet_ehashfn_t *ehashfn)
+{
+	struct sock *reuse_sk = NULL; /* stays NULL when sk_reuseport is clear */
+	u32 phash;
+
+	if (sk->sk_reuseport) {
+		phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn,
+					net, daddr, hnum, saddr, sport);
+		reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
+	}
+	return reuse_sk;
+}
+EXPORT_SYMBOL_GPL(inet_lookup_reuseport);
+
+/*
+ * There are some nice properties to exploit here. The BSD API
+ * does not allow a listening sock to specify the remote port nor the
+ * remote address for the connection. So always assume those are both
+ * wildcarded during the search since they can never be otherwise.
+ */
+
+/* called with rcu_read_lock() : No refcount taken on the socket */
+static struct sock *inet_lhash2_lookup(struct net *net,
+				struct inet_listen_hashbucket *ilb2,
+				struct sk_buff *skb, int doff,
+				const __be32 saddr, __be16 sport,
+				const __be32 daddr, const unsigned short hnum,
+				const int dif, const int sdif)
+{
+	struct sock *sk, *result = NULL;
+	struct hlist_nulls_node *node;
+	int score, hiscore = 0; /* a socket needs score > 0 to be chosen */
+
+	sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
+		score = compute_score(sk, net, hnum, daddr, dif, sdif);
+		if (score > hiscore) {
+			result = inet_lookup_reuseport(net, sk, skb, doff,
+						       saddr, sport, daddr, hnum, inet_ehashfn);
+			if (result)
+				return result; /* reuseport selection ends the walk */
+
+			result = sk; /* best candidate so far */
+			hiscore = score;
+		}
+	}
+
+	return result;
+}
+
+struct sock *inet_lookup_run_sk_lookup(struct net *net,
+				       int protocol,
+				       struct sk_buff *skb, int doff,
+				       __be32 saddr, __be16 sport,
+				       __be32 daddr, u16 hnum, const int dif,
+				       inet_ehashfn_t *ehashfn)
+{
+	struct sock *sk, *reuse_sk;
+	bool no_reuseport;
+
+	no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport,
+					    daddr, hnum, dif, &sk);
+	if (no_reuseport || IS_ERR_OR_NULL(sk))
+		return sk; /* may be NULL or an ERR_PTR; returned unchanged */
+
+	reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum,
+					 ehashfn);
+	if (reuse_sk) /* otherwise keep the socket chosen by the BPF lookup */
+		sk = reuse_sk;
+	return sk;
+}
+
+struct sock *__inet_lookup_listener(struct net *net,
+				    struct inet_hashinfo *hashinfo,
+				    struct sk_buff *skb, int doff,
+				    const __be32 saddr, __be16 sport,
+				    const __be32 daddr, const unsigned short hnum,
+				    const int dif, const int sdif)
+{
+	struct inet_listen_hashbucket *ilb2;
+	struct sock *result = NULL;
+	unsigned int hash2;
+
+	/* Lookup redirect from BPF, only when @hashinfo is the netns TCP one */
+	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
+	    hashinfo == net->ipv4.tcp_death_row.hashinfo) {
+		result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
+						   saddr, sport, daddr, hnum, dif,
+						   inet_ehashfn);
+		if (result)
+			goto done;
+	}
+
+	hash2 = ipv4_portaddr_hash(net, daddr, hnum); /* first try the exact address */
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+
+	result = inet_lhash2_lookup(net, ilb2, skb, doff,
+				    saddr, sport, daddr, hnum,
+				    dif, sdif);
+	if (result)
+		goto done;
+
+	/* No match on @daddr: lookup lhash2 with INADDR_ANY */
+	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+
+	result = inet_lhash2_lookup(net, ilb2, skb, doff,
+				    saddr, sport, htonl(INADDR_ANY), hnum,
+				    dif, sdif);
+done:
+	if (IS_ERR(result))
+		return NULL; /* callers only ever see a socket or NULL */
+	return result;
+}
+EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+
+/* All sockets share common refcount, but have different destructors */
+void sock_gen_put(struct sock *sk)
+{
+	if (!refcount_dec_and_test(&sk->sk_refcnt))
+		return; /* not the last reference */
+
+	if (sk->sk_state == TCP_TIME_WAIT) /* sk_state selects the destructor */
+		inet_twsk_free(inet_twsk(sk));
+	else if (sk->sk_state == TCP_NEW_SYN_RECV)
+		reqsk_free(inet_reqsk(sk));
+	else
+		sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sock_gen_put);
+
+void sock_edemux(struct sk_buff *skb)
+{
+	sock_gen_put(skb->sk); /* drop one reference on the skb's socket */
+}
+EXPORT_SYMBOL(sock_edemux);
+
+struct sock *__inet_lookup_established(struct net *net,
+				  struct inet_hashinfo *hashinfo,
+				  const __be32 saddr, const __be16 sport,
+				  const __be32 daddr, const u16 hnum,
+				  const int dif, const int sdif)
+{
+	INET_ADDR_COOKIE(acookie, saddr, daddr);
+	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+	struct sock *sk;
+	const struct hlist_nulls_node *node;
+	/* Optimize here for direct hit, only listening connections can
+	 * have wildcards anyways.
+	 */
+	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+	unsigned int slot = hash & hashinfo->ehash_mask;
+	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+
+begin:
+	sk_nulls_for_each_rcu(sk, node, &head->chain) {
+		if (sk->sk_hash != hash)
+			continue;
+		if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
+			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+				goto out; /* refcount already zero: report no socket */
+			if (unlikely(!inet_match(net, sk, acookie,
+						 ports, dif, sdif))) {
+				sock_gen_put(sk); /* no longer matches after taking the ref */
+				goto begin;
+			}
+			goto found; /* returned with the reference taken above */
+		}
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot)
+		goto begin;
+out:
+	sk = NULL;
+found:
+	return sk;
+}
+EXPORT_SYMBOL_GPL(__inet_lookup_established);
+
+/* called with local bh disabled */
+static int __inet_check_established(struct inet_timewait_death_row *death_row,
+				    struct sock *sk, __u16 lport,
+				    struct inet_timewait_sock **twp)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	struct inet_sock *inet = inet_sk(sk);
+	__be32 daddr = inet->inet_rcv_saddr; /* note: daddr is sk's local address */
+	__be32 saddr = inet->inet_daddr; /* ... and saddr is the peer address */
+	int dif = sk->sk_bound_dev_if;
+	struct net *net = sock_net(sk);
+	int sdif = l3mdev_master_ifindex_by_index(net, dif);
+	INET_ADDR_COOKIE(acookie, saddr, daddr);
+	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
+	unsigned int hash = inet_ehashfn(net, daddr, lport,
+					 saddr, inet->inet_dport);
+	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
+	struct sock *sk2;
+	const struct hlist_nulls_node *node;
+	struct inet_timewait_sock *tw = NULL;
+
+	spin_lock(lock);
+
+	sk_nulls_for_each(sk2, node, &head->chain) {
+		if (sk2->sk_hash != hash)
+			continue;
+
+		if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
+			if (sk2->sk_state == TCP_TIME_WAIT) {
+				tw = inet_twsk(sk2);
+				if (twsk_unique(sk, sk2, twp))
+					break; /* tw is unhashed below in favour of sk */
+			}
+			goto not_unique;
+		}
+	}
+
+	/* Must record num and sport now. Otherwise we will see
+	 * in hash table socket with a funny identity.
+	 */
+	inet->inet_num = lport;
+	inet->inet_sport = htons(lport);
+	sk->sk_hash = hash;
+	WARN_ON(!sk_unhashed(sk));
+	__sk_nulls_add_node_rcu(sk, &head->chain);
+	if (tw) {
+		sk_nulls_del_node_init_rcu((struct sock *)tw);
+		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
+	}
+	spin_unlock(lock);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+	if (twp) {
+		*twp = tw; /* hand the timewait socket (or NULL) to the caller */
+	} else if (tw) {
+		/* Silly. Should hash-dance instead... */
+		inet_twsk_deschedule_put(tw);
+	}
+	return 0;
+
+not_unique:
+	spin_unlock(lock);
+	return -EADDRNOTAVAIL;
+}
+
+static u64 inet_sk_port_offset(const struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+
+	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, /* local address */
+					  inet->inet_daddr, /* peer address */
+					  inet->inet_dport); /* peer port */
+}
+
+/* Searches for an existing socket in the ehash bucket list.
+ * Returns true if found, false otherwise.
+ */
+static bool inet_ehash_lookup_by_sk(struct sock *sk,
+				    struct hlist_nulls_head *list)
+{
+	const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
+	const int sdif = sk->sk_bound_dev_if; /* same value as dif below */
+	const int dif = sk->sk_bound_dev_if;
+	const struct hlist_nulls_node *node;
+	struct net *net = sock_net(sk);
+	struct sock *esk;
+
+	INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);
+
+	sk_nulls_for_each_rcu(esk, node, list) {
+		if (esk->sk_hash != sk->sk_hash)
+			continue;
+		if (sk->sk_family == AF_INET) {
+			if (unlikely(inet_match(net, esk, acookie,
+						ports, dif, sdif))) {
+				return true;
+			}
+		}
+#if IS_ENABLED(CONFIG_IPV6)
+		else if (sk->sk_family == AF_INET6) {
+			if (unlikely(inet6_match(net, esk,
+						 &sk->sk_v6_daddr,
+						 &sk->sk_v6_rcv_saddr,
+						 ports, dif, sdif))) {
+				return true;
+			}
+		}
+#endif
+	}
+	return false;
+}
+
+/* Insert @sk into ehash, removing @osk first when one is given (@osk can
+ * be a SYN_RECV or TIMEWAIT); @sk is then added only if that removal worked.
+ * With no @osk, if a matching socket already exists, @sk is not inserted
+ * and *@found_dup_sk is set to true. Returns true if @sk was inserted.
+ */
+bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
+{
+	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+	struct inet_ehash_bucket *head;
+	struct hlist_nulls_head *list;
+	spinlock_t *lock;
+	bool ret = true;
+
+	WARN_ON_ONCE(!sk_unhashed(sk));
+
+	sk->sk_hash = sk_ehashfn(sk);
+	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
+	list = &head->chain;
+	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+	spin_lock(lock);
+	if (osk) {
+		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
+		ret = sk_nulls_del_node_init_rcu(osk);
+	} else if (found_dup_sk) { /* duplicate check only when the caller asks */
+		*found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
+		if (*found_dup_sk)
+			ret = false;
+	}
+
+	if (ret)
+		__sk_nulls_add_node_rcu(sk, list);
+
+	spin_unlock(lock);
+
+	return ret;
+}
+
+bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
+{
+	bool ok = inet_ehash_insert(sk, osk, found_dup_sk);
+
+	if (ok) {
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	} else { /* not inserted: mark @sk closed and dead, then destroy it */
+		this_cpu_inc(*sk->sk_prot->orphan_count);
+		inet_sk_set_state(sk, TCP_CLOSE);
+		sock_set_flag(sk, SOCK_DEAD);
+		inet_csk_destroy_sock(sk);
+	}
+	return ok;
+}
+EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
+
+static int inet_reuseport_add_sock(struct sock *sk,
+				   struct inet_listen_hashbucket *ilb)
+{
+	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
+	const struct hlist_nulls_node *node;
+	struct sock *sk2;
+	kuid_t uid = sock_i_uid(sk);
+
+	sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
+		if (sk2 != sk && /* find a compatible reuseport socket to join */
+		    sk2->sk_family == sk->sk_family &&
+		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
+		    inet_csk(sk2)->icsk_bind_hash == tb &&
+		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+		    inet_rcv_saddr_equal(sk, sk2, false))
+			return reuseport_add_sock(sk, sk2,
+						  inet_rcv_saddr_any(sk));
+	}
+
+	return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); /* none found: allocate for sk */
+}
+
+int __inet_hash(struct sock *sk, struct sock *osk)
+{
+	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+	struct inet_listen_hashbucket *ilb2;
+	int err = 0;
+
+	if (sk->sk_state != TCP_LISTEN) { /* non-listeners go into ehash */
+		local_bh_disable();
+		inet_ehash_nolisten(sk, osk, NULL);
+		local_bh_enable();
+		return 0; /* inet_ehash_nolisten()'s result is not propagated */
+	}
+	WARN_ON(!sk_unhashed(sk));
+	ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
+
+	spin_lock(&ilb2->lock);
+	if (sk->sk_reuseport) {
+		err = inet_reuseport_add_sock(sk, ilb2);
+		if (err)
+			goto unlock; /* sk is left unhashed on error */
+	}
+	sock_set_flag(sk, SOCK_RCU_FREE);
+	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
+		sk->sk_family == AF_INET6)
+		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
+	else
+		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+unlock:
+	spin_unlock(&ilb2->lock);
+
+	return err;
+}
+EXPORT_SYMBOL(__inet_hash);
+
+int inet_hash(struct sock *sk)
+{
+	int err = 0; /* returned as-is for a TCP_CLOSE socket */
+
+	if (sk->sk_state != TCP_CLOSE)
+		err = __inet_hash(sk, NULL);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(inet_hash);
+
+void inet_unhash(struct sock *sk)
+{
+	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+
+	if (sk_unhashed(sk)) /* unlocked fast path; rechecked under the lock */
+		return;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		struct inet_listen_hashbucket *ilb2;
+
+		ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
+		/* Don't disable bottom halves while acquiring the lock to
+		 * avoid circular locking dependency on PREEMPT_RT.
+		 */
+		spin_lock(&ilb2->lock);
+		if (sk_unhashed(sk)) {
+			spin_unlock(&ilb2->lock);
+			return;
+		}
+
+		if (rcu_access_pointer(sk->sk_reuseport_cb))
+			reuseport_stop_listen_sock(sk);
+
+		__sk_nulls_del_node_init_rcu(sk);
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+		spin_unlock(&ilb2->lock);
+	} else { /* everything else is unhashed from ehash, with BH disabled */
+		spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+		spin_lock_bh(lock);
+		if (sk_unhashed(sk)) {
+			spin_unlock_bh(lock);
+			return;
+		}
+		__sk_nulls_del_node_init_rcu(sk);
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+		spin_unlock_bh(lock);
+	}
+}
+EXPORT_SYMBOL_GPL(inet_unhash);
+
+static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
+				    const struct net *net, unsigned short port,
+				    int l3mdev, const struct sock *sk)
+{
+	if (!net_eq(ib2_net(tb), net) || tb->port != port ||
+	    tb->l3mdev != l3mdev)
+		return false; /* netns, port or l3mdev differs */
+
+	return inet_bind2_bucket_addr_match(tb, sk); /* finally compare addresses */
+}
+
+bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
+				      unsigned short port, int l3mdev, const struct sock *sk)
+{
+	if (!net_eq(ib2_net(tb), net) || tb->port != port ||
+	    tb->l3mdev != l3mdev)
+		return false;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family != tb->family) {
+		if (sk->sk_family == AF_INET) /* IPv4 sk: accept :: or ::ffff:0.0.0.0 */
+			return ipv6_addr_any(&tb->v6_rcv_saddr) ||
+				ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr);
+
+		return false; /* other family mismatches never match */
+	}
+
+	if (sk->sk_family == AF_INET6)
+		return ipv6_addr_any(&tb->v6_rcv_saddr);
+#endif
+	return tb->rcv_saddr == 0; /* IPv4 wildcard address */
+}
+
+/* The socket's bhash2 hashbucket spinlock must be held when this is called */
+struct inet_bind2_bucket *
+inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net,
+		       unsigned short port, int l3mdev, const struct sock *sk)
+{
+	struct inet_bind2_bucket *bhash2 = NULL;
+
+	inet_bind_bucket_for_each(bhash2, &head->chain)
+		if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
+			break; /* first match wins */
+
+	return bhash2; /* callers test this for NULL (no matching bucket) */
+}
+
+struct inet_bind_hashbucket *
+inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
+{
+	struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
+	u32 hash; /* hash of (wildcard address, port) for sk's family */
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		hash = ipv6_portaddr_hash(net, &in6addr_any, port);
+	else
+#endif
+		hash = ipv4_portaddr_hash(net, 0, port);
+
+	return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
+}
+
+static void inet_update_saddr(struct sock *sk, void *saddr, int family)
+{
+	if (family == AF_INET) { /* @saddr is read as a __be32 */
+		inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
+		sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	else { /* any other family: @saddr is read as a struct in6_addr */
+		sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
+	}
+#endif
+}
+
+static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
+{
+	struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
+	struct inet_bind_hashbucket *head, *head2;
+	struct inet_bind2_bucket *tb2, *new_tb2;
+	int l3mdev = inet_sk_bound_l3mdev(sk);
+	int port = inet_sk(sk)->inet_num;
+	struct net *net = sock_net(sk);
+	int bhash;
+
+	if (!inet_csk(sk)->icsk_bind2_hash) {
+		/* Not bind()ed before. */
+		if (reset)
+			inet_reset_saddr(sk);
+		else
+			inet_update_saddr(sk, saddr, family);
+
+		return 0;
+	}
+
+	/* Allocate a bind2 bucket ahead of time to avoid permanently putting
+	 * the bhash2 table in an inconsistent state if a new tb2 bucket
+	 * allocation fails.
+	 */
+	new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
+	if (!new_tb2) {
+		if (reset) {
+			/* The (INADDR_ANY, port) bucket might have already
+			 * been freed, then we cannot fixup icsk_bind2_hash,
+			 * so we give up and unlink sk from bhash/bhash2 not
+			 * to leave inconsistency in bhash2.
+			 */
+			inet_put_port(sk);
+			inet_reset_saddr(sk);
+		}
+
+		return -ENOMEM;
+	}
+
+	bhash = inet_bhashfn(net, port, hinfo->bhash_size);
+	head = &hinfo->bhash[bhash];
+	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); /* bucket for the old address */
+
+	/* If we change saddr locklessly, another thread
+	 * iterating over bhash might see corrupted address.
+	 */
+	spin_lock_bh(&head->lock);
+
+	spin_lock(&head2->lock);
+	__sk_del_bind2_node(sk);
+	inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
+	spin_unlock(&head2->lock);
+
+	if (reset)
+		inet_reset_saddr(sk);
+	else
+		inet_update_saddr(sk, saddr, family);
+
+	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); /* recomputed after the change */
+
+	spin_lock(&head2->lock);
+	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
+	if (!tb2) {
+		tb2 = new_tb2; /* no existing bucket: use the preallocated one */
+		inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk);
+	}
+	sk_add_bind2_node(sk, &tb2->owners);
+	inet_csk(sk)->icsk_bind2_hash = tb2;
+	spin_unlock(&head2->lock);
+
+	spin_unlock_bh(&head->lock);
+
+	if (tb2 != new_tb2) /* preallocated bucket turned out to be unneeded */
+		kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);
+
+	return 0;
+}
+
+int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
+{
+	return __inet_bhash2_update_saddr(sk, saddr, family, false); /* reset == false */
+}
+EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr);
+
+void inet_bhash2_reset_saddr(struct sock *sk)
+{
+	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) /* skipped when the flag is set */
+		__inet_bhash2_update_saddr(sk, NULL, 0, true); /* return value is ignored */
+}
+EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr);
+
+/* RFC 6056 3.3.4.  Algorithm 4: Double-Hash Port Selection Algorithm
+ * Note that we use 32bit integers (vs RFC 'short integers')
+ * because 2^16 is not a multiple of num_ephemeral and this
+ * property might be used by clever attacker.
+ *
+ * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though
+ * attacks were since demonstrated, thus we use 65536 by default instead
+ * to really give more isolation and privacy, at the expense of 256kB
+ * of kernel memory.
+ */
+#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER)
+static u32 *table_perturb;
+
+int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+		struct sock *sk, u64 port_offset,
+		int (*check_established)(struct inet_timewait_death_row *,
+			struct sock *, __u16, struct inet_timewait_sock **))
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	struct inet_bind_hashbucket *head, *head2;
+	struct inet_timewait_sock *tw = NULL;
+	int port = inet_sk(sk)->inet_num;
+	struct net *net = sock_net(sk);
+	struct inet_bind2_bucket *tb2;
+	struct inet_bind_bucket *tb;
+	bool tb_created = false;
+	u32 remaining, offset;
+	int ret, i, low, high;
+	int l3mdev;
+	u32 index;
+
+	if (port) { /* sk already has a local port: only run the check */
+		local_bh_disable();
+		ret = check_established(death_row, sk, port, NULL);
+		local_bh_enable();
+		return ret;
+	}
+
+	l3mdev = inet_sk_bound_l3mdev(sk);
+
+	inet_sk_get_local_port_range(sk, &low, &high);
+	high++; /* [32768, 60999] -> [32768, 61000[ */
+	remaining = high - low;
+	if (likely(remaining > 1))
+		remaining &= ~1U; /* round down to an even count */
+
+	get_random_sleepable_once(table_perturb,
+				  INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
+	index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
+
+	offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
+	offset %= remaining;
+
+	/* In first pass we try ports of @low parity.
+	 * inet_csk_get_port() does the opposite choice.
+	 */
+	offset &= ~1U;
+other_parity_scan:
+	port = low + offset;
+	for (i = 0; i < remaining; i += 2, port += 2) {
+		if (unlikely(port >= high))
+			port -= remaining; /* wrap around within the range */
+		if (inet_is_local_reserved_port(net, port))
+			continue;
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
+
+		/* Does not bother with rcv_saddr checks, because
+		 * the established check is already unique enough.
+		 */
+		inet_bind_bucket_for_each(tb, &head->chain) {
+			if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
+				if (tb->fastreuse >= 0 ||
+				    tb->fastreuseport >= 0)
+					goto next_port; /* not a bucket marked -1 as done below */
+				WARN_ON(hlist_empty(&tb->owners));
+				if (!check_established(death_row, sk,
+						       port, &tw))
+					goto ok;
+				goto next_port;
+			}
+		}
+
+		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+					     net, head, port, l3mdev);
+		if (!tb) {
+			spin_unlock_bh(&head->lock);
+			return -ENOMEM;
+		}
+		tb_created = true; /* lets the error path drop this bucket */
+		tb->fastreuse = -1;
+		tb->fastreuseport = -1;
+		goto ok;
+next_port:
+		spin_unlock_bh(&head->lock);
+		cond_resched();
+	}
+
+	offset++;
+	if ((offset & 1) && remaining > 1)
+		goto other_parity_scan; /* second pass over the other parity */
+
+	return -EADDRNOTAVAIL; /* no usable port found */
+
+ok:
+	/* Find the corresponding tb2 bucket since we need to
+	 * add the socket to the bhash2 table as well
+	 */
+	head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+	spin_lock(&head2->lock);
+
+	tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
+	if (!tb2) {
+		tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
+					       head2, port, l3mdev, sk);
+		if (!tb2)
+			goto error;
+	}
+
+	/* Here we want to add a little bit of randomness to the next source
+	 * port that will be chosen. We use a max() with a random here so that
+	 * on low contention the randomness is maximal and on high contention
+	 * it may be inexistent.
+	 */
+	i = max_t(int, i, get_random_u32_below(8) * 2);
+	WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
+
+	/* Head lock still held and bh's disabled */
+	inet_bind_hash(sk, tb, tb2, port);
+
+	if (sk_unhashed(sk)) {
+		inet_sk(sk)->inet_sport = htons(port);
+		inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
+	}
+	if (tw)
+		inet_twsk_bind_unhash(tw, hinfo);
+
+	spin_unlock(&head2->lock);
+	spin_unlock(&head->lock); /* BH is re-enabled by local_bh_enable() below */
+
+	if (tw)
+		inet_twsk_deschedule_put(tw);
+	local_bh_enable();
+	return 0;
+
+error:
+	spin_unlock(&head2->lock);
+	if (tb_created)
+		inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
+	spin_unlock_bh(&head->lock);
+	return -ENOMEM;
+}
+
+/* Bind a port for a connect operation and hash it. Wraps __inet_hash_connect()
+ * with __inet_check_established; port_offset stays 0 unless inet_num is zero.
+ */
+int inet_hash_connect(struct inet_timewait_death_row *death_row,
+		      struct sock *sk)
+{
+	u64 port_offset = 0;
+
+	if (!inet_sk(sk)->inet_num)
+		port_offset = inet_sk_port_offset(sk);
+	return __inet_hash_connect(death_row, sk, port_offset,
+				   __inet_check_established);
+}
+EXPORT_SYMBOL_GPL(inet_hash_connect);
+
+static void init_hashinfo_lhash2(struct inet_hashinfo *h)
+{
+	int i;
+
+	for (i = 0; i <= h->lhash2_mask; i++) { /* visit every lhash2 bucket */
+		spin_lock_init(&h->lhash2[i].lock);
+		INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
+				      i + LISTENING_NULLS_BASE); /* nulls value is unique per bucket */
+	}
+}
+
+void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+				unsigned long numentries, int scale,
+				unsigned long low_limit,
+				unsigned long high_limit)
+{
+	h->lhash2 = alloc_large_system_hash(name,
+					    sizeof(*h->lhash2),
+					    numentries,
+					    scale,
+					    0,
+					    NULL,
+					    &h->lhash2_mask,
+					    low_limit,
+					    high_limit);
+	init_hashinfo_lhash2(h); /* set up the lock and list head of each bucket */
+
+	/* used for source ports of outgoing connections; both limits pin its size */
+	table_perturb = alloc_large_system_hash("Table-perturb",
+						sizeof(*table_perturb),
+						INET_TABLE_PERTURB_SIZE,
+						0, 0, NULL, NULL,
+						INET_TABLE_PERTURB_SIZE,
+						INET_TABLE_PERTURB_SIZE);
+}
+
+int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
+{
+	h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
+	if (!h->lhash2)
+		return -ENOMEM;
+
+	h->lhash2_mask = INET_LHTABLE_SIZE - 1;
+	/* INET_LHTABLE_SIZE must be a power of 2, i.e. size & (size - 1) == 0 */
+	BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);
+
+	init_hashinfo_lhash2(h);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod);
+
+int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
+{
+	unsigned int locksz = sizeof(spinlock_t);
+	unsigned int i, nblocks = 1;
+
+	if (locksz != 0) { /* NOTE(review): presumably 0 where spinlocks compile away - confirm */
+		/* allocate 2 cache lines or at least one spinlock per cpu */
+		nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
+		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
+
+		/* no more locks than number of hash buckets */
+		nblocks = min(nblocks, hashinfo->ehash_mask + 1);
+
+		hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
+		if (!hashinfo->ehash_locks)
+			return -ENOMEM;
+
+		for (i = 0; i < nblocks; i++)
+			spin_lock_init(&hashinfo->ehash_locks[i]);
+	}
+	hashinfo->ehash_locks_mask = nblocks - 1; /* 0 when the allocation above was skipped */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
+
+struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
+						 unsigned int ehash_entries)
+{
+	struct inet_hashinfo *new_hashinfo;
+	int i;
+
+	new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); /* start from a copy of @hashinfo */
+	if (!new_hashinfo)
+		goto err;
+
+	new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket),
+					   GFP_KERNEL_ACCOUNT);
+	if (!new_hashinfo->ehash)
+		goto free_hashinfo;
+
+	new_hashinfo->ehash_mask = ehash_entries - 1; /* assumes a power-of-two count - TODO confirm with callers */
+
+	if (inet_ehash_locks_alloc(new_hashinfo))
+		goto free_ehash;
+
+	for (i = 0; i < ehash_entries; i++)
+		INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); /* nulls value is the bucket index */
+
+	new_hashinfo->pernet = true; /* the flag inet_pernet_hashinfo_free() tests before freeing */
+
+	return new_hashinfo;
+
+free_ehash:
+	vfree(new_hashinfo->ehash);
+free_hashinfo:
+	kfree(new_hashinfo);
+err:
+	return NULL; /* every failure is reported as NULL */
+}
+EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc);
+
+void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
+{
+	if (!hashinfo->pernet) /* only tables flagged pernet are released here */
+		return;
+
+	inet_ehash_locks_free(hashinfo);
+	vfree(hashinfo->ehash);
+	kfree(hashinfo);
+}
+EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
new file mode 100644
index 0000000000..dd37a5bf68
--- /dev/null
+++ b/net/ipv4/inet_timewait_sock.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Generic TIME_WAIT sockets functions
+ *
+ *		From code originally in TCP
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+
+
+/**
+ *	inet_twsk_bind_unhash - unhash a timewait socket from bind hash
+ *	@tw: timewait socket
+ *	@hashinfo: hashinfo pointer
+ *
+ *	unhash a timewait socket from bind hash, if hashed.
+ *	bind hash lock must be held by caller.
+ *	Returns nothing; when @tw was hashed, one reference is dropped via __sock_put().
+ */
+void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+			  struct inet_hashinfo *hashinfo)
+{
+	struct inet_bind2_bucket *tb2 = tw->tw_tb2;
+	struct inet_bind_bucket *tb = tw->tw_tb;
+
+	if (!tb) /* tw_tb already cleared: not hashed, nothing to do */
+		return;
+
+	__hlist_del(&tw->tw_bind_node);
+	tw->tw_tb = NULL;
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+
+	__hlist_del(&tw->tw_bind2_node);
+	tw->tw_tb2 = NULL;
+	inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
+
+	__sock_put((struct sock *)tw);
+}
+
+/* Unhash @tw from ehash and both bind tables. Must be called with locally disabled BHs. */
+static void inet_twsk_kill(struct inet_timewait_sock *tw)
+{
+	struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
+	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+	struct inet_bind_hashbucket *bhead, *bhead2;
+
+	spin_lock(lock);
+	sk_nulls_del_node_init_rcu((struct sock *)tw);
+	spin_unlock(lock);
+
+	/* Disassociate with bind buckets; bhash lock is taken before bhash2 lock. */
+	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
+			hashinfo->bhash_size)];
+	bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
+				       twsk_net(tw), tw->tw_num);
+
+	spin_lock(&bhead->lock);
+	spin_lock(&bhead2->lock);
+	inet_twsk_bind_unhash(tw, hashinfo);
+	spin_unlock(&bhead2->lock);
+	spin_unlock(&bhead->lock);
+
+	refcount_dec(&tw->tw_dr->tw_refcount); /* undoes the refcount_inc() in __inet_twsk_schedule() */
+	inet_twsk_put(tw); /* NOTE(review): presumably the timer's reference - confirm */
+}
+
+void inet_twsk_free(struct inet_timewait_sock *tw)
+{
+	struct module *owner = tw->tw_prot->owner; /* saved now: tw is freed below */
+	twsk_destructor((struct sock *)tw);
+	kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
+	module_put(owner); /* balances __module_get() in inet_twsk_alloc() */
+}
+
+void inet_twsk_put(struct inet_timewait_sock *tw)
+{
+	if (refcount_dec_and_test(&tw->tw_refcnt)) /* that was the last reference */
+		inet_twsk_free(tw);
+}
+EXPORT_SYMBOL_GPL(inet_twsk_put);
+
+static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
+				   struct hlist_nulls_head *list)
+{
+	hlist_nulls_add_head_rcu(&tw->tw_node, list); /* links tw_node at the head of @list */
+}
+
+static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
+				    struct hlist_head *list)
+{
+	hlist_add_head(&tw->tw_bind_node, list); /* links tw_bind_node at the head of @list */
+}
+
+static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw,
+				     struct hlist_head *list)
+{
+	hlist_add_head(&tw->tw_bind2_node, list); /* links tw_bind2_node at the head of @list */
+}
+
+/*
+ * Enter the time wait state. This is called with locally disabled BH.
+ * Essentially we whip up a timewait bucket, copy the relevant info into it
+ * from the SK, and mess with hash chains and list linkage.
+ */
+void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+			   struct inet_hashinfo *hashinfo)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
+	spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+	struct inet_bind_hashbucket *bhead, *bhead2;
+
+	/* Step 1: Put TW into bind hash. Original socket stays there too.
+	 * Note that any socket with inet->inet_num != 0 MUST be bound in
+	 * the binding cache, even if it is closed.
+	 */
+	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
+			hashinfo->bhash_size)];
+	bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
+
+	spin_lock(&bhead->lock);
+	spin_lock(&bhead2->lock);
+
+	tw->tw_tb = icsk->icsk_bind_hash;
+	WARN_ON(!icsk->icsk_bind_hash);
+	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+
+	tw->tw_tb2 = icsk->icsk_bind2_hash;
+	WARN_ON(!icsk->icsk_bind2_hash);
+	inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow);
+
+	spin_unlock(&bhead2->lock);
+	spin_unlock(&bhead->lock);
+
+	spin_lock(lock);
+
+	inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 2: Hash TW into the ehash chain */
+
+	/* Step 3: Remove SK from hash chain */
+	if (__sk_nulls_del_node_init_rcu(sk))
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+	spin_unlock(lock);
+
+	/* tw_refcnt is set to 3 because we have :
+	 * - one reference for bhash chain.
+	 * - one reference for ehash chain.
+	 * - one reference for timer.
+	 * We can use refcount_set() because prior spin_lock()/spin_unlock()
+	 * committed into memory all tw fields.
+	 * Also note that after this point, we lost our implicit reference
+	 * so we are not allowed to use tw anymore.
+	 */
+	refcount_set(&tw->tw_refcnt, 3);
+}
+EXPORT_SYMBOL_GPL(inet_twsk_hashdance);
+
+static void tw_timer_handler(struct timer_list *t)
+{
+	struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer); /* tw that embeds @t as tw_timer */
+
+	inet_twsk_kill(tw);
+}
+
+struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
+					   struct inet_timewait_death_row *dr,
+					   const int state)
+{
+	struct inet_timewait_sock *tw;
+
+	if (refcount_read(&dr->tw_refcount) - 1 >=
+	    READ_ONCE(dr->sysctl_max_tw_buckets)) /* limit reached; NOTE(review): the -1 suggests a base count of 1 - confirm */
+		return NULL;
+
+	tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+			      GFP_ATOMIC);
+	if (tw) {
+		const struct inet_sock *inet = inet_sk(sk);
+
+		tw->tw_dr	    = dr;
+		/* Give us an identity. */
+		tw->tw_daddr	    = inet->inet_daddr;
+		tw->tw_rcv_saddr    = inet->inet_rcv_saddr;
+		tw->tw_bound_dev_if = sk->sk_bound_dev_if;
+		tw->tw_tos	    = inet->tos;
+		tw->tw_num	    = inet->inet_num;
+		tw->tw_state	    = TCP_TIME_WAIT;
+		tw->tw_substate	    = state;
+		tw->tw_sport	    = inet->inet_sport;
+		tw->tw_dport	    = inet->inet_dport;
+		tw->tw_family	    = sk->sk_family;
+		tw->tw_reuse	    = sk->sk_reuse;
+		tw->tw_reuseport    = sk->sk_reuseport;
+		tw->tw_hash	    = sk->sk_hash;
+		tw->tw_ipv6only	    = 0;
+		tw->tw_transparent  = inet_test_bit(TRANSPARENT, sk);
+		tw->tw_prot	    = sk->sk_prot_creator;
+		atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
+		twsk_net_set(tw, sock_net(sk));
+		timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED);
+		/*
+		 * Because we use RCU lookups, we should not set tw_refcnt
+		 * to a non-zero value before everything is set up for this
+		 * timewait socket.
+		 */
+		refcount_set(&tw->tw_refcnt, 0);
+
+		__module_get(tw->tw_prot->owner); /* released by module_put() in inet_twsk_free() */
+	}
+
+	return tw; /* NULL on limit or allocation failure */
+}
+EXPORT_SYMBOL_GPL(inet_twsk_alloc);
+
+/* These are always called from BH context.  See callers in
+ * tcp_input.c to verify this.
+ */
+
+/* This is for handling early-kills of TIME_WAIT sockets.
+ * Warning: consumes the caller's reference.
+ * Caller should not access tw anymore.
+ */
+void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
+{
+	if (del_timer_sync(&tw->tw_timer)) /* timer was still pending, so the handler will not run the kill */
+		inet_twsk_kill(tw);
+	inet_twsk_put(tw);
+}
+EXPORT_SYMBOL(inet_twsk_deschedule_put);
+
+void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
+{
+	/* timeout := RTO * 3.5
+	 *
+	 * 3.5 = 1+2+0.5 to wait for two retransmits.
+	 *
+	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
+	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
+	 * FINs (or previous segments) are lost (probability of such event
+	 * is p^(N+1), where p is probability to lose single packet and
+	 * time to detect the loss is about RTO*(2^N - 1) with exponential
+	 * backoff). Normal timewait length is calculated so, that we
+	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
+	 * [ BTW Linux. following BSD, violates this requirement waiting
+	 *   only for 60sec, we should wait at least for 240 secs.
+	 *   Well, 240 consumes too much of resources 8)
+	 * ]
+	 * This interval is not reduced to catch old duplicate and
+	 * responses to our wandering segments living for two MSLs.
+	 * However, if we use PAWS to detect
+	 * old duplicates, we can reduce the interval to bounds required
+	 * by RTO, rather than MSL. So, if peer understands PAWS, we
+	 * kill tw bucket after 3.5*RTO (it is important that this number
+	 * is greater than TS tick!) and detect old duplicates with help
+	 * of PAWS.
+	 */
+
+	if (!rearm) {
+		bool kill = timeo <= 4*HZ; /* short timeouts are counted as TIMEWAITKILLED */
+
+		__NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED :
+						     LINUX_MIB_TIMEWAITED);
+		BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); /* first arming: timer must not be pending yet */
+		refcount_inc(&tw->tw_dr->tw_refcount); /* dropped again in inet_twsk_kill() */
+	} else {
+		mod_timer_pending(&tw->tw_timer, jiffies + timeo); /* only re-times a timer that is still pending */
+	}
+}
+EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
+
+void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
+{
+	struct inet_timewait_sock *tw;
+	struct sock *sk;
+	struct hlist_nulls_node *node;
+	unsigned int slot;
+
+	for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { /* walk every ehash bucket */
+		struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+restart_rcu:
+		cond_resched();
+		rcu_read_lock();
+restart:
+		sk_nulls_for_each_rcu(sk, node, &head->chain) {
+			if (sk->sk_state != TCP_TIME_WAIT) {
+				/* A kernel listener socket might not hold refcnt for net,
+				 * so reqsk_timer_handler() could be fired after net is
+				 * freed.  Userspace listener and reqsk never exist here.
+				 */
+				if (unlikely(sk->sk_state == TCP_NEW_SYN_RECV &&
+					     hashinfo->pernet)) {
+					struct request_sock *req = inet_reqsk(sk);
+
+					inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
+				}
+
+				continue;
+			}
+
+			tw = inet_twsk(sk);
+			if ((tw->tw_family != family) ||
+				refcount_read(&twsk_net(tw)->ns.count))
+				continue; /* other family, or its netns count is still non-zero */
+
+			if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt)))
+				continue; /* tw_refcnt is 0: no reference can be taken */
+
+			if (unlikely((tw->tw_family != family) || /* same test again, now holding a reference */
+				     refcount_read(&twsk_net(tw)->ns.count))) {
+				inet_twsk_put(tw);
+				goto restart;
+			}
+
+			rcu_read_unlock();
+			local_bh_disable();
+			inet_twsk_deschedule_put(tw); /* consumes the reference taken above */
+			local_bh_enable();
+			goto restart_rcu;
+		}
+		/* If the nulls value we got at the end of this lookup is
+		 * not the expected one, we must restart lookup.
+		 * We probably met an item that was moved to another chain.
+		 */
+		if (get_nulls_value(node) != slot)
+			goto restart;
+		rcu_read_unlock();
+	}
+}
+EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
new file mode 100644
index 0000000000..e9fed83e9b
--- /dev/null
+++ b/net/ipv4/inetpeer.c
@@ -0,0 +1,308 @@
+/*
+ *		INETPEER - A storage for permanent information about peers
+ *
+ *  This source is covered by the GNU GPL, the same as all kernel sources.
+ *
+ *  Authors:	Andrey V. Savochkin <saw@msu.ru>
+ */
+
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/timer.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <linux/workqueue.h>
+#include <net/ip.h>
+#include <net/inetpeer.h>
+#include <net/secure_seq.h>
+
+/*
+ *  Theory of operations.
+ *  We keep one entry for each peer IP address.  The nodes contain long-living
+ *  information about the peer which doesn't depend on routes.
+ *
+ *  Nodes are removed only when the reference counter goes to 0.
+ *  Once that has happened, the node may be removed when a sufficient amount
+ *  of time has passed since its last use.  The less-recently-used entry can
+ *  also be removed if the pool is overloaded, i.e. if the total number of
+ *  entries is greater than or equal to the threshold.
+ *
+ *  Node pool is organised as an RB tree.
+ *  Such an implementation has been chosen not just for fun.  It's a way to
+ *  prevent easy and efficient DoS attacks by creating hash collisions.  A huge
+ *  amount of long living nodes in a single hash slot would significantly delay
+ *  lookups performed with disabled BHs.
+ *
+ *  Serialisation issues.
+ *  1.  Nodes may appear in the tree only with the pool lock held.
+ *  2.  Nodes may disappear from the tree only with the pool lock held
+ *      AND reference count being 0.
+ *  3.  Global variable peer_total is modified under the pool lock.
+ *  4.  struct inet_peer fields modification:
+ *		rb_node: pool lock
+ *		refcnt: atomically against modifications on other CPU;
+ *		   usually under some other lock to prevent node disappearing
+ *		daddr: unchangeable
+ */
+
+static struct kmem_cache *peer_cachep __ro_after_init;
+
+void inet_peer_base_init(struct inet_peer_base *bp)
+{
+	bp->rb_root = RB_ROOT; /* start with an empty tree */
+	seqlock_init(&bp->lock);
+	bp->total = 0; /* node count; incremented when inet_getpeer() inserts */
+}
+EXPORT_SYMBOL_GPL(inet_peer_base_init);
+
+#define PEER_MAX_GC 32 /* cap on nodes lookup() stacks for one inet_peer_gc() pass */
+
+/* Exported for sysctl_net_ipv4.  */
+int inet_peer_threshold __read_mostly;	/* start to throw entries more
+					 * aggressively at this stage */
+int inet_peer_minttl __read_mostly = 120 * HZ;	/* TTL under high load: 120 sec */
+int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */
+
+/* Called from ip_output.c:ip_init  */
+void __init inet_initpeers(void)
+{
+	u64 nr_entries;
+
+	 /* number of cache-aligned inet_peer entries fitting in 1% of physical memory */
+	nr_entries = div64_ul((u64)totalram_pages() << PAGE_SHIFT,
+			      100 * L1_CACHE_ALIGN(sizeof(struct inet_peer)));
+
+	inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128); /* bounded to [4096, 65664] */
+
+	peer_cachep = kmem_cache_create("inet_peer_cache",
+			sizeof(struct inet_peer),
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+			NULL);
+}
+
+/* Find @daddr in the tree; called with rcu_read_lock() or base->lock held */
+static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
+				struct inet_peer_base *base,
+				unsigned int seq,
+				struct inet_peer *gc_stack[],
+				unsigned int *gc_cnt,
+				struct rb_node **parent_p,
+				struct rb_node ***pp_p)
+{
+	struct rb_node **pp, *parent, *next;
+	struct inet_peer *p;
+
+	pp = &base->rb_root.rb_node;
+	parent = NULL;
+	while (1) {
+		int cmp;
+
+		next = rcu_dereference_raw(*pp);
+		if (!next)
+			break;
+		parent = next;
+		p = rb_entry(parent, struct inet_peer, rb_node);
+		cmp = inetpeer_addr_cmp(daddr, &p->daddr);
+		if (cmp == 0) {
+			if (!refcount_inc_not_zero(&p->refcnt))
+				break; /* matching node has refcnt 0: treated as a miss */
+			return p; /* hit: a reference was taken for the caller */
+		}
+		if (gc_stack) { /* inet_getpeer() passes a stack only on its locked pass */
+			if (*gc_cnt < PEER_MAX_GC)
+				gc_stack[(*gc_cnt)++] = p;
+		} else if (unlikely(read_seqretry(&base->lock, seq))) {
+			break; /* seqlock changed since @seq was read: stop the walk */
+		}
+		if (cmp == -1)
+			pp = &next->rb_left;
+		else
+			pp = &next->rb_right;
+	}
+	*parent_p = parent; /* miss: hand back the link position for rb_link_node() */
+	*pp_p = pp;
+	return NULL;
+}
+
+static void inetpeer_free_rcu(struct rcu_head *head) /* passed to call_rcu() in this file */
+{
+	kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
+}
+
+/* Garbage collect the items stacked during a lookup; inet_getpeer() calls this under base->lock */
+static void inet_peer_gc(struct inet_peer_base *base,
+			 struct inet_peer *gc_stack[],
+			 unsigned int gc_cnt)
+{
+	int peer_threshold, peer_maxttl, peer_minttl;
+	struct inet_peer *p;
+	__u32 delta, ttl;
+	int i;
+
+	peer_threshold = READ_ONCE(inet_peer_threshold);
+	peer_maxttl = READ_ONCE(inet_peer_maxttl);
+	peer_minttl = READ_ONCE(inet_peer_minttl);
+
+	if (base->total >= peer_threshold)
+		ttl = 0; /* be aggressive */
+	else
+		ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ *
+			base->total / peer_threshold * HZ; /* maxttl when empty, nearing minttl as total grows */
+	for (i = 0; i < gc_cnt; i++) { /* pass 1: keep only entries that are old and hold a single reference */
+		p = gc_stack[i];
+
+		/* The READ_ONCE() pairs with the WRITE_ONCE()
+		 * in inet_putpeer()
+		 */
+		delta = (__u32)jiffies - READ_ONCE(p->dtime);
+
+		if (delta < ttl || !refcount_dec_if_one(&p->refcnt))
+			gc_stack[i] = NULL; /* too recent, or refcnt was not exactly 1: spare it */
+	}
+	for (i = 0; i < gc_cnt; i++) { /* pass 2: unlink and free the remaining candidates */
+		p = gc_stack[i];
+		if (p) {
+			rb_erase(&p->rb_node, &base->rb_root);
+			base->total--;
+			call_rcu(&p->rcu, inetpeer_free_rcu);
+		}
+	}
+}
+
+struct inet_peer *inet_getpeer(struct inet_peer_base *base,
+			       const struct inetpeer_addr *daddr,
+			       int create)
+{
+	struct inet_peer *p, *gc_stack[PEER_MAX_GC];
+	struct rb_node **pp, *parent;
+	unsigned int gc_cnt, seq;
+	int invalidated;
+
+	/* Attempt a lockless lookup first.
+	 * Because of a concurrent writer, we might not find an existing entry.
+	 */
+	rcu_read_lock();
+	seq = read_seqbegin(&base->lock);
+	p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); /* NULL gc_stack: gc_cnt is not touched */
+	invalidated = read_seqretry(&base->lock, seq);
+	rcu_read_unlock();
+
+	if (p)
+		return p;
+
+	/* If no writer did a change during our lookup, we can return early. */
+	if (!create && !invalidated)
+		return NULL;
+
+	/* retry an exact lookup, taking the lock before.
+	 * At least, nodes should be hot in our cache.
+	 */
+	parent = NULL;
+	write_seqlock_bh(&base->lock);
+
+	gc_cnt = 0;
+	p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp);
+	if (!p && create) {
+		p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
+		if (p) {
+			p->daddr = *daddr;
+			p->dtime = (__u32)jiffies;
+			refcount_set(&p->refcnt, 2); /* NOTE(review): presumably one for the tree, one for the caller - confirm */
+			atomic_set(&p->rid, 0);
+			p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+			p->rate_tokens = 0;
+			p->n_redirects = 0;
+			/* 60*HZ is arbitrary, but chosen high enough so that the first
+			 * calculation of tokens is at its maximum.
+			 */
+			p->rate_last = jiffies - 60*HZ;
+
+			rb_link_node(&p->rb_node, parent, pp);
+			rb_insert_color(&p->rb_node, &base->rb_root);
+			base->total++;
+		}
+	}
+	if (gc_cnt) /* the locked lookup stacked some nodes: try to reclaim them */
+		inet_peer_gc(base, gc_stack, gc_cnt);
+	write_sequnlock_bh(&base->lock);
+
+	return p; /* NULL if not found and not created (or allocation failed) */
+}
+EXPORT_SYMBOL_GPL(inet_getpeer);
+
+void inet_putpeer(struct inet_peer *p)
+{
+	/* The WRITE_ONCE() pairs with itself (we run lockless)
+	 * and the READ_ONCE() in inet_peer_gc()
+	 */
+	WRITE_ONCE(p->dtime, (__u32)jiffies);
+
+	if (refcount_dec_and_test(&p->refcnt)) /* last reference dropped: free via RCU callback */
+		call_rcu(&p->rcu, inetpeer_free_rcu);
+}
+EXPORT_SYMBOL_GPL(inet_putpeer);
+
+/*
+ *	Check transmit rate limitation for given message.
+ *	The rate information is held in the inet_peer entries now.
+ *	This function is generic and could be used for other purposes
+ *	too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ *	Note that the same inet_peer fields are modified by functions in
+ *	route.c too, but these work for packet destinations while xrlim_allow
+ *	works for icmp destinations. This means the rate limiting information
+ *	for one "ip object" is shared - and these ICMPs are twice limited:
+ *	by source and by destination.
+ *
+ *	RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ *			  SHOULD allow setting of rate limits
+ *
+ * 	Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
+{
+	unsigned long now, token;
+	bool rc = false;
+
+	if (!peer) /* no peer entry: always allowed */
+		return true;
+
+	token = peer->rate_tokens;
+	now = jiffies;
+	token += now - peer->rate_last; /* one token per jiffy elapsed since the last call */
+	peer->rate_last = now;
+	if (token > XRLIM_BURST_FACTOR * timeout)
+		token = XRLIM_BURST_FACTOR * timeout; /* bucket holds at most XRLIM_BURST_FACTOR messages' worth */
+	if (token >= timeout) {
+		token -= timeout; /* a message costs @timeout tokens */
+		rc = true;
+	}
+	peer->rate_tokens = token;
+	return rc;
+}
+EXPORT_SYMBOL(inet_peer_xrlim_allow);
+
+void inetpeer_invalidate_tree(struct inet_peer_base *base)
+{
+	struct rb_node *p = rb_first(&base->rb_root);
+
+	while (p) {
+		struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node);
+
+		p = rb_next(p); /* step to the successor before this node is erased */
+		rb_erase(&peer->rb_node, &base->rb_root);
+		inet_putpeer(peer); /* NOTE(review): presumably drops the tree's own reference - confirm */
+		cond_resched();
+	}
+
+	base->total = 0;
+}
+EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
new file mode 100644
index 0000000000..66fac1216d
--- /dev/null
+++ b/net/ipv4/ip_forward.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The IP forwarding functionality.
+ *
+ * Authors:	see ip.c
+ *
+ * Fixes:
+ *		Many		:	Split from ip.c , see ip_input.c for
+ *					history.
+ *		Dave Gregorich	:	NULL ip_rt_put fix for multicast
+ *					routing.
+ *		Jos Vos		:	Add call_out_firewall before sending,
+ *					use output device for accounting.
+ *		Jos Vos		:	Call forward firewall after routing
+ *					(always use output device).
+ *		Mike McLagan	:	Routing by source
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/checksum.h>
+#include <linux/route.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+	if (skb->len <= mtu) /* fits as is */
+		return false;
+
+	if (unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)) /* DF bit clear */
+		return false;
+
+	/* original fragment exceeds mtu and DF is set */
+	if (unlikely(IPCB(skb)->frag_max_size > mtu))
+		return true;
+
+	if (skb->ignore_df)
+		return false;
+
+	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) /* NOTE(review): presumably each GSO segment fits - confirm */
+		return false;
+
+	return true; /* too long, DF set, and no exemption applied */
+}
+
+
+static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct ip_options *opt	= &(IPCB(skb)->opt);
+
+	__IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
+
+#ifdef CONFIG_NET_SWITCHDEV
+	if (skb->offload_l3_fwd_mark) { /* NOTE(review): presumably already forwarded by hardware - confirm */
+		consume_skb(skb);
+		return 0;
+	}
+#endif
+
+	if (unlikely(opt->optlen)) /* the IP header carries options */
+		ip_forward_options(skb);
+
+	skb_clear_tstamp(skb);
+	return dst_output(net, sk, skb);
+}
+
+int ip_forward(struct sk_buff *skb)
+{
+	u32 mtu;
+	struct iphdr *iph;	/* Our header */
+	struct rtable *rt;	/* Route we use */
+	struct ip_options *opt	= &(IPCB(skb)->opt);
+	struct net *net;
+	SKB_DR(reason);
+
+	/* pkt_type other than PACKET_HOST should never happen here */
+	if (skb->pkt_type != PACKET_HOST)
+		goto drop;
+
+	if (unlikely(skb->sk)) /* skb is attached to a socket: dropped, not forwarded */
+		goto drop;
+
+	if (skb_warn_if_lro(skb))
+		goto drop;
+
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
+		SKB_DR_SET(reason, XFRM_POLICY);
+		goto drop;
+	}
+
+	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
+		return NET_RX_SUCCESS;
+
+	skb_forward_csum(skb);
+	net = dev_net(skb->dev);
+
+	/*
+	 *	According to the RFC, we must first decrease the TTL field. If
+	 *	that reaches zero, we must reply with an ICMP control message
+	 *	saying that the packet's lifetime expired (sent at too_many_hops).
+	 */
+	if (ip_hdr(skb)->ttl <= 1)
+		goto too_many_hops;
+
+	if (!xfrm4_route_forward(skb)) {
+		SKB_DR_SET(reason, XFRM_POLICY);
+		goto drop;
+	}
+
+	rt = skb_rtable(skb);
+
+	if (opt->is_strictroute && rt->rt_uses_gateway)
+		goto sr_failed;
+
+	IPCB(skb)->flags |= IPSKB_FORWARDED;
+	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
+	if (ip_exceeds_mtu(skb, mtu)) {
+		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+			  htonl(mtu));
+		SKB_DR_SET(reason, PKT_TOO_BIG);
+		goto drop;
+	}
+
+	/* We are about to mangle packet. Copy it! */
+	if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
+		goto drop;
+	iph = ip_hdr(skb); /* fetched only after skb_cow() */
+
+	/* Decrease ttl after skb cow done */
+	ip_decrease_ttl(iph);
+
+	/*
+	 *	We now generate an ICMP HOST REDIRECT giving the route
+	 *	we calculated.
+	 */
+	if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
+	    !skb_sec_path(skb))
+		ip_rt_send_redirect(skb);
+
+	if (READ_ONCE(net->ipv4.sysctl_ip_fwd_update_priority))
+		skb->priority = rt_tos2priority(iph->tos);
+
+	return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
+		       net, NULL, skb, skb->dev, rt->dst.dev,
+		       ip_forward_finish);
+
+sr_failed:
+	/*
+	 *	Strict routing permits no gatewaying
+	 */
+	 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
+	 goto drop;
+
+too_many_hops:
+	/* Tell the sender its packet died... */
+	__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+	icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+	SKB_DR_SET(reason, IP_INHDR);
+drop:
+	kfree_skb_reason(skb, reason); /* every failure path above ends here */
+	return NET_RX_DROP;
+}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
new file mode 100644
index 0000000000..a4941f53b5
--- /dev/null
+++ b/net/ipv4/ip_fragment.c
@@ -0,0 +1,761 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The IP fragmentation functionality.
+ *
+ * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
+ *		Alan Cox <alan@lxorguk.ukuu.org.uk>
+ *
+ * Fixes:
+ *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
+ *		David S. Miller :	Begin massive cleanup...
+ *		Andi Kleen	:	Add sysctls.
+ *		xxxx		:	Overlapfrag bug.
+ *		Ultima          :       ip_expire() kernel panic.
+ *		Bill Hawes	:	Frag accounting and evictor fixes.
+ *		John McDonald	:	0 length frag bug.
+ *		Alexey Kuznetsov:	SMP races, threading, cleanup.
+ *		Patrick McHardy :	LRU queue of frag heads for evictor.
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/jiffies.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/checksum.h>
+#include <net/inetpeer.h>
+#include <net/inet_frag.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/inet.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/inet_ecn.h>
+#include <net/l3mdev.h>
+
+/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
+ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
+ * as well. Or notify me, at least. --ANK
+ */
+static const char ip_frag_cache_name[] = "ip4-frags";
+
+/* Describe an entry in the "incomplete datagrams" queue. */
+struct ipq {
+	struct inet_frag_queue q; /* generic queue; container_of() maps it back to ipq */
+
+	u8		ecn; /* RFC3168 support: OR of ip4_frag_ecn() of queued frags */
+	u16		max_df_size; /* largest frag with DF set seen */
+	int             iif; /* ifindex of the device of the last queued frag */
+	unsigned int    rid; /* peer->rid value sampled by ip_frag_too_far() */
+	struct inet_peer *peer; /* set by ip4_frag_init(); NULL if max_dist == 0 */
+};
+
+static u8 ip4_frag_ecn(u8 tos)
+{
+	return 1 << (tos & INET_ECN_MASK); /* one-hot flag for the masked ECN value */
+}
+
+static struct inet_frags ip4_frags;
+
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+			 struct sk_buff *prev_tail, struct net_device *dev);
+
+
+/* Constructor: copy key @a; look up a peer only if max_dist is set. */
+static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
+{
+	const struct frag_v4_compare_key *key = a;
+	struct ipq *qp = container_of(q, struct ipq, q);
+
+	q->key.v4 = *key;
+	qp->ecn = 0;
+	qp->peer = NULL;
+	if (q->fqdir->max_dist)
+		qp->peer = inet_getpeer_v4(q->fqdir->net->ipv4.peers,
+					   key->saddr, key->vif, 1);
+}
+
+/* Destructor hook: hand the queue's peer, if any, to inet_putpeer(). */
+static void ip4_frag_free(struct inet_frag_queue *q)
+{
+	struct ipq *ipq = container_of(q, struct ipq, q);
+	struct inet_peer *peer = ipq->peer;
+
+	if (peer)
+		inet_putpeer(peer);
+}
+
+/* Destruction primitives. */
+
+static void ipq_put(struct ipq *ipq)
+{
+	inet_frag_put(&ipq->q);	/* thin wrapper: forwards the embedded generic queue */
+}
+
+/* Kill an ipq entry via inet_frag_kill(). It is not destroyed immediately,
+ * because the caller (and possibly others) still hold references to it.
+ */
+static void ipq_kill(struct ipq *ipq)
+{
+	inet_frag_kill(&ipq->q);
+}
+
+static bool frag_expire_skip_icmp(u32 user)
+{	/* true for the AF_PACKET user and both conntrack (plain, bridge) user ranges */
+	return user == IP_DEFRAG_AF_PACKET ||
+	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
+					 __IP_DEFRAG_CONNTRACK_IN_END) ||
+	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
+					 __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
+}
+
+/* Timer callback: a fragment queue timed out.  Kill it, bump the REASM
+ * failure/timeout counters and, if possible, send an ICMP Time Exceeded reply.
+ */
+static void ip_expire(struct timer_list *t)
+{
+	struct inet_frag_queue *frag = from_timer(frag, t, timer);
+	const struct iphdr *iph;
+	struct sk_buff *head = NULL;
+	struct net *net;
+	struct ipq *qp;
+	int err;
+
+	qp = container_of(frag, struct ipq, q);
+	net = qp->q.fqdir->net;
+
+	rcu_read_lock();
+
+	/* Paired with WRITE_ONCE() in fqdir_pre_exit(). */
+	if (READ_ONCE(qp->q.fqdir->dead))
+		goto out_rcu_unlock;
+
+	spin_lock(&qp->q.lock);
+
+	if (qp->q.flags & INET_FRAG_COMPLETE)
+		goto out;
+
+	qp->q.flags |= INET_FRAG_DROP;
+	ipq_kill(qp);
+	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
+	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
+
+	if (!(qp->q.flags & INET_FRAG_FIRST_IN))
+		goto out;
+
+	/* sk_buff::dev and sk_buff::rbnode share storage (a union). So we
+	 * pull the head out of the tree in order to be able to
+	 * deal with head->dev.
+	 */
+	head = inet_frag_pull_head(&qp->q);
+	if (!head)
+		goto out;
+	head->dev = dev_get_by_index_rcu(net, qp->iif);
+	if (!head->dev)
+		goto out;
+
+
+	/* skb has no dst, perform route lookup again */
+	iph = ip_hdr(head);
+	err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+					   iph->tos, head->dev);
+	if (err)
+		goto out;
+
+	/* Only an end host needs to send an ICMP
+	 * "Fragment Reassembly Timeout" message, per RFC792.
+	 */
+	if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
+	    (skb_rtable(head)->rt_type != RTN_LOCAL))
+		goto out;
+
+	spin_unlock(&qp->q.lock);	/* the queue lock is dropped before icmp_send() */
+	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+	goto out_rcu_unlock;
+
+out:
+	spin_unlock(&qp->q.lock);
+out_rcu_unlock:	/* reached on every path; head may still be NULL here */
+	rcu_read_unlock();
+	kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT);
+	ipq_put(qp);
+}
+
+/* Find the entry in the "incomplete datagrams" queue that matches this
+ * IP datagram; a new one is created if nothing is found.
+ * Returns NULL when inet_frag_find() yields no queue.
+ */
+static struct ipq *ip_find(struct net *net, struct iphdr *iph,
+			   u32 user, int vif)
+{
+	struct frag_v4_compare_key lookup = {
+		.saddr = iph->saddr,
+		.daddr = iph->daddr,
+		.user = user,
+		.vif = vif,
+		.id = iph->id,
+		.protocol = iph->protocol,
+	};
+	struct inet_frag_queue *fq = inet_frag_find(net->ipv4.fqdir, &lookup);
+
+	if (!fq)
+		return NULL;
+
+	return container_of(fq, struct ipq, q);
+}
+
+/* Is the fragment too far ahead to be part of ipq?
+ * Distance is the growth of peer->rid since this queue was last checked.
+ */
+static int ip_frag_too_far(struct ipq *qp)
+{
+	const unsigned int limit = qp->q.fqdir->max_dist;
+	struct inet_peer *peer = qp->peer;
+	unsigned int prev, cur;
+
+	if (!limit || !peer)
+		return 0;
+
+	prev = qp->rid;
+	cur = atomic_inc_return(&peer->rid);
+	qp->rid = cur;
+
+	/* A queue without a tail fragment is never reported as too far. */
+	if (!qp->q.fragments_tail || cur - prev <= limit)
+		return 0;
+
+	__IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS);
+	return 1;
+}
+
+/* Restart @qp from scratch; -ETIMEDOUT when mod_timer() returns 0. */
+static int ip_frag_reinit(struct ipq *qp)
+{
+	struct inet_frag_queue *fq = &qp->q;
+
+	if (!mod_timer(&fq->timer, jiffies + fq->fqdir->timeout)) {
+		refcount_inc(&fq->refcnt);
+		return -ETIMEDOUT;
+	}
+
+	sub_frag_mem_limit(fq->fqdir,
+			   inet_frag_rbtree_purge(&fq->rb_fragments,
+						  SKB_DROP_REASON_FRAG_TOO_FAR));
+
+	fq->flags = 0;
+	fq->len = 0;
+	fq->meat = 0;
+	fq->rb_fragments = RB_ROOT;
+	fq->fragments_tail = NULL;
+	fq->last_run_head = NULL;
+	qp->iif = 0;
+	qp->ecn = 0;
+	return 0;
+}
+
+/* Add @skb to queue @qp. Returns 0 once reassembled, else a negative errno. */
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+{
+	struct net *net = qp->q.fqdir->net;
+	int ihl, end, flags, offset;
+	struct sk_buff *prev_tail;
+	struct net_device *dev;
+	unsigned int fragsize;
+	int err = -ENOENT;
+	SKB_DR(reason);
+	u8 ecn;
+
+	/* If reassembly is already done, @skb must be a duplicate frag. */
+	if (qp->q.flags & INET_FRAG_COMPLETE) {
+		SKB_DR_SET(reason, DUP_FRAG);
+		goto err;
+	}
+
+	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+	    unlikely(ip_frag_too_far(qp)) &&
+	    unlikely(err = ip_frag_reinit(qp))) {
+		ipq_kill(qp);
+		goto err;
+	}
+
+	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
+	offset = ntohs(ip_hdr(skb)->frag_off);
+	flags = offset & ~IP_OFFSET;
+	offset &= IP_OFFSET;
+	offset <<= 3;		/* offset is in 8-byte chunks */
+	ihl = ip_hdrlen(skb);
+
+	/* Determine the position of this fragment. */
+	end = offset + skb->len - skb_network_offset(skb) - ihl;
+	err = -EINVAL;
+
+	/* Is this the final fragment? */
+	if ((flags & IP_MF) == 0) {
+		/* If we already have some bits beyond end
+		 * or have different end, the segment is corrupted.
+		 */
+		if (end < qp->q.len ||
+		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
+			goto discard_qp;
+		qp->q.flags |= INET_FRAG_LAST_IN;
+		qp->q.len = end;
+	} else {
+		if (end&7) {
+			end &= ~7;
+			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+				skb->ip_summed = CHECKSUM_NONE;
+		}
+		if (end > qp->q.len) {
+			/* Some bits beyond end -> corruption. */
+			if (qp->q.flags & INET_FRAG_LAST_IN)
+				goto discard_qp;
+			qp->q.len = end;
+		}
+	}
+	if (end == offset)	/* nothing left to queue from this fragment */
+		goto discard_qp;
+
+	err = -ENOMEM;
+	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
+		goto discard_qp;
+
+	err = pskb_trim_rcsum(skb, end - offset);
+	if (err)
+		goto discard_qp;
+
+	/* Note : skb->rbnode and skb->dev share the same location. */
+	dev = skb->dev;
+	/* Make sure the compiler won't play silly aliasing games. */
+	barrier();
+
+	prev_tail = qp->q.fragments_tail;
+	err = inet_frag_queue_insert(&qp->q, skb, offset, end);
+	if (err)
+		goto insert_error;
+
+	if (dev)
+		qp->iif = dev->ifindex;
+
+	qp->q.stamp = skb->tstamp;
+	qp->q.mono_delivery_time = skb->mono_delivery_time;
+	qp->q.meat += skb->len;
+	qp->ecn |= ecn;
+	add_frag_mem_limit(qp->q.fqdir, skb->truesize);
+	if (offset == 0)
+		qp->q.flags |= INET_FRAG_FIRST_IN;
+
+	fragsize = skb->len + ihl;
+
+	if (fragsize > qp->q.max_size)
+		qp->q.max_size = fragsize;
+
+	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
+	    fragsize > qp->max_df_size)
+		qp->max_df_size = fragsize;
+
+	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+	    qp->q.meat == qp->q.len) {
+		unsigned long orefdst = skb->_skb_refdst;
+
+		skb->_skb_refdst = 0UL;	/* hidden during reassembly, restored below */
+		err = ip_frag_reasm(qp, skb, prev_tail, dev);
+		skb->_skb_refdst = orefdst;
+		if (err)
+			inet_frag_kill(&qp->q);
+		return err;
+	}
+
+	skb_dst_drop(skb);
+	return -EINPROGRESS;	/* queued; more fragments are needed */
+
+insert_error:
+	if (err == IPFRAG_DUP) {
+		SKB_DR_SET(reason, DUP_FRAG);
+		err = -EINVAL;
+		goto err;
+	}
+	err = -EINVAL;
+	__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
+discard_qp:
+	inet_frag_kill(&qp->q);
+	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
+err:
+	kfree_skb_reason(skb, reason);
+	return err;
+}
+
+static bool ip_frag_coalesce_ok(const struct ipq *qp)
+{	/* result is passed to inet_frag_reasm_finish() by ip_frag_reasm() */
+	return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER;
+}
+
+/* Build a new IP datagram in @skb from all fragments of @qp (0 or -errno). */
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+			 struct sk_buff *prev_tail, struct net_device *dev)
+{
+	struct net *net = qp->q.fqdir->net;
+	struct iphdr *iph;
+	void *reasm_data;
+	int len, err;
+	u8 ecn;
+
+	ipq_kill(qp);
+
+	ecn = ip_frag_ecn_table[qp->ecn];
+	if (unlikely(ecn == 0xff)) {	/* 0xff marks a rejected ECN combination */
+		err = -EINVAL;
+		goto out_fail;
+	}
+
+	/* Make the one we just received the head. */
+	reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail);
+	if (!reasm_data)
+		goto out_nomem;
+
+	len = ip_hdrlen(skb) + qp->q.len;
+	err = -E2BIG;
+	if (len > 65535)	/* must fit the 16-bit tot_len written below */
+		goto out_oversize;
+
+	inet_frag_reasm_finish(&qp->q, skb, reasm_data,
+			       ip_frag_coalesce_ok(qp));
+
+	skb->dev = dev;
+	IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
+
+	iph = ip_hdr(skb);
+	iph->tot_len = htons(len);
+	iph->tos |= ecn;
+
+	/* When we set IP_DF on a refragmented skb we must also force a
+	 * call to ip_fragment to avoid forwarding a DF-skb of size s while
+	 * original sender only sent fragments of size f (where f < s).
+	 *
+	 * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
+	 * frag seen to avoid sending tiny DF-fragments in case skb was built
+	 * from one very small df-fragment and one large non-df frag.
+	 */
+	if (qp->max_df_size == qp->q.max_size) {
+		IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
+		iph->frag_off = htons(IP_DF);
+	} else {
+		iph->frag_off = 0;
+	}
+
+	ip_send_check(iph);
+
+	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
+	qp->q.rb_fragments = RB_ROOT;
+	qp->q.fragments_tail = NULL;
+	qp->q.last_run_head = NULL;
+	return 0;
+
+out_nomem:
+	net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
+	err = -ENOMEM;
+	goto out_fail;
+out_oversize:
+	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
+out_fail:
+	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
+	return err;
+}
+
+/* Process an incoming IP datagram fragment.  Returns what ip_frag_queue()
+ * returns, or -ENOMEM when no queue could be found or created.
+ */
+int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
+{
+	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
+	int vif = l3mdev_master_ifindex_rcu(dev);
+	struct ipq *qp;
+	int ret;
+
+	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
+	skb_orphan(skb);
+
+	/* Lookup (or create) queue header */
+	qp = ip_find(net, ip_hdr(skb), user, vif);
+	if (!qp) {
+		__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+
+	spin_lock(&qp->q.lock);
+	ret = ip_frag_queue(qp, skb);
+	spin_unlock(&qp->q.lock);
+
+	ipq_put(qp);
+	return ret;
+}
+EXPORT_SYMBOL(ip_defrag);
+
+/* If @skb is a sane IPv4 fragment, feed it to ip_defrag().  Returns @skb
+ * (possibly reassembled), or NULL when it was freed or is still queued.
+ */
+struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
+{
+	struct iphdr hdr;
+	u32 tot_len;
+	int netoff;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return skb;
+
+	netoff = skb_network_offset(skb);
+	if (skb_copy_bits(skb, netoff, &hdr, sizeof(hdr)) < 0)
+		return skb;
+	if (hdr.version != 4 || hdr.ihl < 5)
+		return skb;
+
+	tot_len = ntohs(hdr.tot_len);
+	if (tot_len < (hdr.ihl * 4) || skb->len < netoff + tot_len)
+		return skb;
+	if (!ip_is_fragment(&hdr))
+		return skb;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	if (!pskb_may_pull(skb, netoff + hdr.ihl * 4) ||
+	    pskb_trim_rcsum(skb, netoff + tot_len)) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+	if (ip_defrag(net, skb, user))
+		return NULL;
+	skb_clear_hash(skb);
+	return skb;
+}
+EXPORT_SYMBOL(ip_check_defrag);
+
+#ifdef CONFIG_SYSCTL
+static int dist_min;	/* zero: lower bound (extra1) for ipfrag_max_dist */
+
+static struct ctl_table ip4_frags_ns_ctl_table[] = {
+	{	/* [0]: .data/.extra1 are filled in by ip4_frags_ns_ctl_register() */
+		.procname	= "ipfrag_high_thresh",
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{	/* [1]: .data/.extra2 are filled in by ip4_frags_ns_ctl_register() */
+		.procname	= "ipfrag_low_thresh",
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{	/* [2]: .data is pointed at fqdir->timeout at registration */
+		.procname	= "ipfrag_time",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{	/* [3]: .data is pointed at fqdir->max_dist at registration */
+		.procname	= "ipfrag_max_dist",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &dist_min,
+	},
+	{ }
+};
+
+/* ipfrag_secret_interval is deprecated; the knob only stores into a dummy. */
+static int ip4_frags_secret_interval_unused;
+static struct ctl_table ip4_frags_ctl_table[] = {
+	{
+		.procname	= "ipfrag_secret_interval",
+		.data		= &ip4_frags_secret_interval_unused,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }
+};
+
+/* Register the per-netns fragment sysctls under net/ipv4; namespaces other
+ * than init_net work on a private copy of the table.  Returns 0 or -ENOMEM.
+ */
+static int __net_init ip4_frags_ns_ctl_register(struct net *net)
+{
+	const bool is_init = net_eq(net, &init_net);
+	struct ctl_table *table = ip4_frags_ns_ctl_table;
+	struct ctl_table_header *hdr;
+
+	if (!is_init) {
+		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
+		if (!table)
+			return -ENOMEM;
+	}
+
+	table[0].data	= &net->ipv4.fqdir->high_thresh;
+	table[0].extra1	= &net->ipv4.fqdir->low_thresh;
+	table[1].data	= &net->ipv4.fqdir->low_thresh;
+	table[1].extra2	= &net->ipv4.fqdir->high_thresh;
+	table[2].data	= &net->ipv4.fqdir->timeout;
+	table[3].data	= &net->ipv4.fqdir->max_dist;
+
+	hdr = register_net_sysctl_sz(net, "net/ipv4", table,
+				     ARRAY_SIZE(ip4_frags_ns_ctl_table));
+	if (hdr) {
+		net->ipv4.frags_hdr = hdr;
+		return 0;
+	}
+
+	if (!is_init)
+		kfree(table);
+	return -ENOMEM;
+}
+
+static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
+{
+	struct ctl_table_header *hdr = net->ipv4.frags_hdr;
+	struct ctl_table *table = hdr->ctl_table_arg;
+
+	unregister_net_sysctl_table(hdr);	/* table pointer was fetched first */
+	kfree(table);
+}
+
+static void __init ip4_frags_ctl_register(void)
+{	/* init_net only: exposes the deprecated ipfrag_secret_interval knob */
+	register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
+}
+#else
+static int ip4_frags_ns_ctl_register(struct net *net)
+{	/* !CONFIG_SYSCTL stub: nothing to register, always succeeds */
+	return 0;
+}
+
+static void ip4_frags_ns_ctl_unregister(struct net *net)
+{	/* !CONFIG_SYSCTL stub */
+}
+
+static void __init ip4_frags_ctl_register(void)
+{	/* !CONFIG_SYSCTL stub */
+}
+#endif
+
+static int __net_init ipv4_frags_init_net(struct net *net)
+{
+	int res;
+
+	res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
+	if (res < 0)
+		return res;
+	/* Fragment cache limits.
+	 *
+	 * The fragment memory accounting code (tries to) account for
+	 * the real memory usage, by measuring both the size of the frag
+	 * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
+	 * and the SKB's truesize.
+	 *
+	 * A 64K fragment consumes 129736 bytes (44*2944)+200
+	 * (1500 truesize == 2944, sizeof(struct ipq) == 200)
+	 *
+	 * We will commit 4MB at one time. Should we cross that limit
+	 * we will prune down to 3MB, making room for approx 8 big 64K
+	 * fragments 8x128k.
+	 */
+	net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024;
+	net->ipv4.fqdir->low_thresh  = 3 * 1024 * 1024;
+	/*
+	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
+	 * RFC791 is wrong in proposing to prolong the timer by TTL on each
+	 * fragment arrival.
+	 */
+	net->ipv4.fqdir->timeout = IP_FRAG_TIME;
+
+	net->ipv4.fqdir->max_dist = 64;
+
+	res = ip4_frags_ns_ctl_register(net);
+	if (res < 0)
+		fqdir_exit(net->ipv4.fqdir);	/* undo fqdir_init() on failure */
+	return res;
+}
+
+static void __net_exit ipv4_frags_pre_exit_net(struct net *net)
+{	/* .pre_exit hook of ip4_frags_ops */
+	fqdir_pre_exit(net->ipv4.fqdir);
+}
+
+static void __net_exit ipv4_frags_exit_net(struct net *net)
+{	/* .exit hook: sysctls are removed before the fqdir they point into */
+	ip4_frags_ns_ctl_unregister(net);
+	fqdir_exit(net->ipv4.fqdir);
+}
+
+static struct pernet_operations ip4_frags_ops = {	/* registered by ipfrag_init() */
+	.init		= ipv4_frags_init_net,
+	.pre_exit	= ipv4_frags_pre_exit_net,
+	.exit		= ipv4_frags_exit_net,
+};
+
+
+static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
+{
+	return jhash2(data,	/* @len is unused: the key size is fixed */
+		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
+}
+
+/* Hash a queue object exactly like ip4_key_hashfn() hashes its key. */
+static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct inet_frag_queue *queue = data;
+
+	return ip4_key_hashfn(&queue->key.v4, len, seed);
+}
+
+/* rhashtable compare hook: returns 0 when the queue's key equals arg->key. */
+static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
+{
+	const struct inet_frag_queue *queue = ptr;
+
+	return memcmp(&queue->key, arg->key, sizeof(struct frag_v4_compare_key)) != 0;
+}
+
+static const struct rhashtable_params ip4_rhash_params = {	/* copied into ip4_frags */
+	.head_offset		= offsetof(struct inet_frag_queue, node),
+	.key_offset		= offsetof(struct inet_frag_queue, key),
+	.key_len		= sizeof(struct frag_v4_compare_key),
+	.hashfn			= ip4_key_hashfn,
+	.obj_hashfn		= ip4_obj_hashfn,
+	.obj_cmpfn		= ip4_obj_cmpfn,
+	.automatic_shrinking	= true,
+};
+
+void __init ipfrag_init(void)
+{
+	ip4_frags.constructor = ip4_frag_init;
+	ip4_frags.destructor = ip4_frag_free;
+	ip4_frags.qsize = sizeof(struct ipq);	/* whole ipq, not just the embedded queue */
+	ip4_frags.frag_expire = ip_expire;
+	ip4_frags.frags_cache_name = ip_frag_cache_name;
+	ip4_frags.rhash_params = ip4_rhash_params;
+	if (inet_frags_init(&ip4_frags))
+		panic("IP: failed to allocate ip4_frags cache\n");
+	ip4_frags_ctl_register();
+	register_pernet_subsys(&ip4_frags_ops);	/* NOTE(review): result is not checked */
+}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
new file mode 100644
index 0000000000..5169c3c72c
--- /dev/null
+++ b/net/ipv4/ip_gre.c
@@ -0,0 +1,1802 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *	Linux NET3:	GRE over IP protocol decoder.
+ *
+ *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/gre.h>
+#include <net/dst_metadata.h>
+#include <net/erspan.h>
+
+/*
+   Problems & solutions
+   --------------------
+
+   1. The most important issue is detecting local dead loops.
+   They would cause complete host lockup in transmit, which
+   would be "resolved" by stack overflow or, if queueing is enabled,
+   with infinite looping in net_bh.
+
+   We cannot track such dead loops during route installation,
+   it is an infeasible task. The most general solution would be
+   to keep skb->encapsulation counter (sort of local ttl),
+   and silently drop packet when it expires. It is a good
+   solution, but it supposes maintaining new variable in ALL
+   skb, even if no tunneling is used.
+
+   Current solution: xmit_recursion breaks dead loops. This is a percpu
+   counter, since when we enter the first ndo_xmit(), cpu migration is
+   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
+
+   2. Networking dead loops would not kill routers, but would really
+   kill network. IP hop limit plays role of "t->recursion" in this case,
+   if we copy it from packet being encapsulated to upper header.
+   It is very good solution, but it introduces two problems:
+
+   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
+     do not work over tunnels.
+   - traceroute does not work. I planned to relay ICMP from tunnel,
+     so that this problem would be solved and traceroute output
+     would be even more informative. This idea appeared to be wrong:
+     only Linux complies to rfc1812 now (yes, guys, Linux is the only
+     true router now :-)), all routers (at least, in neighbourhood of mine)
+     return only 8 bytes of payload. It is the end.
+
+   Hence, if we want that OSPF worked or traceroute said something reasonable,
+   we should search for another solution.
+
+   One of them is to parse packet trying to detect inner encapsulation
+   made by our node. It is difficult or even impossible, especially,
+   taking into account fragmentation. To be short, ttl is not a solution at all.
+
+   Current solution: The solution was UNEXPECTEDLY SIMPLE.
+   We force DF flag on tunnels with preconfigured hop limit,
+   that is ALL. :-) Well, it does not remove the problem completely,
+   but exponential growth of network traffic is changed to linear
+   (branches, that exceed pmtu are pruned) and tunnel mtu
+   rapidly degrades to value <68, where looping stops.
+   Yes, it is not good if there exists a router in the loop,
+   which does not force DF, even when encapsulating packets have DF set.
+   But it is not our problem! Nobody could accuse us, we made
+   all that we could make. Even if it is your gated who injected
+   fatal route to network, even if it were you who configured
+   fatal static route: you are innocent. :-)
+
+   Alexey Kuznetsov.
+ */
+
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+static struct rtnl_link_ops ipgre_link_ops __read_mostly;
+static const struct header_ops ipgre_header_ops;
+
+static int ipgre_tunnel_init(struct net_device *dev);
+static void erspan_build_header(struct sk_buff *skb,
+				u32 id, u32 index,
+				bool truncate, bool is_ipv4);
+
+static unsigned int ipgre_net_id __read_mostly;
+static unsigned int gre_tap_net_id __read_mostly;
+static unsigned int erspan_net_id __read_mostly;
+
+static int ipgre_err(struct sk_buff *skb, u32 info,
+		     const struct tnl_ptk_info *tpi)
+{
+
+	/* All the routers (except for Linux) return only
+	 * 8 bytes of packet payload. It means that precise relaying of
+	 * ICMP in the real Internet is absolutely infeasible.
+	 *
+	 * Moreover, Cisco put the GRE key into the third word
+	 * of the GRE header. That makes it impossible to maintain even
+	 * soft state for keyed GRE tunnels with checksums enabled,
+	 * although rfc1812 itself was written by a Cisco employee.
+	 *
+	 * So this handler only looks up the tunnel and, for the error
+	 * types accepted below, updates its err_count/err_time.
+	 * It returns -ENOENT when no tunnel matches and 0 otherwise.
+	 */
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn;
+	const struct iphdr *iph;
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	unsigned int data_len = 0;
+	struct ip_tunnel *t;
+
+	if (tpi->proto == htons(ETH_P_TEB))
+		itn = net_generic(net, gre_tap_net_id);
+	else if (tpi->proto == htons(ETH_P_ERSPAN) ||
+		 tpi->proto == htons(ETH_P_ERSPAN2))
+		itn = net_generic(net, erspan_net_id);
+	else
+		itn = net_generic(net, ipgre_net_id);
+
+	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
+	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+			     iph->daddr, iph->saddr, tpi->key);
+
+	if (!t)
+		return -ENOENT;
+
+	switch (type) {
+	default:
+	case ICMP_PARAMETERPROB:
+		return 0;
+
+	case ICMP_DEST_UNREACH:
+		switch (code) {
+		case ICMP_SR_FAILED:
+		case ICMP_PORT_UNREACH:
+			/* Impossible event. */
+			return 0;
+		default:
+			/* All others are translated to HOST_UNREACH.
+			 * rfc2003 contains "deep thoughts" about NET_UNREACH,
+			 * I believe they are just ether pollution. --ANK
+			 */
+			break;
+		}
+		break;
+
+	case ICMP_TIME_EXCEEDED:
+		if (code != ICMP_EXC_TTL)
+			return 0;
+		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
+		break;
+
+	case ICMP_REDIRECT:
+		break;
+	}
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (tpi->proto == htons(ETH_P_IPV6) &&
+	    !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
+					type, data_len))
+		return 0;
+#endif
+
+	if (t->parms.iph.daddr == 0 ||
+	    ipv4_is_multicast(t->parms.iph.daddr))
+		return 0;
+
+	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+		return 0;
+
+	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+		t->err_count++;
+	else
+		t->err_count = 1;	/* previous error is too old: restart the count */
+	t->err_time = jiffies;
+
+	return 0;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+	/* All the routers (except for Linux) return only
+	 * 8 bytes of packet payload. It means that precise relaying of
+	 * ICMP in the real Internet is absolutely infeasible.
+	 *
+	 * Moreover, Cisco put the GRE key into the third word
+	 * of the GRE header. That makes it impossible to maintain even
+	 * soft state for keyed GRE tunnels with checksums enabled.
+	 *
+	 * This handler therefore deals with PMTU (FRAG_NEEDED) and
+	 * redirect errors itself and leaves every other error to
+	 * ipgre_err(). Nothing is done when gre_parse_header() fails.
+	 */
+
+	const struct iphdr *iph = (struct iphdr *)skb->data;
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	struct tnl_ptk_info tpi;
+
+	if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
+			     iph->ihl * 4) < 0)
+		return;
+
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+				 skb->dev->ifindex, IPPROTO_GRE);
+		return;
+	}
+	if (type == ICMP_REDIRECT) {
+		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
+			      IPPROTO_GRE);
+		return;
+	}
+
+	ipgre_err(skb, info, &tpi);	/* its return value is deliberately unused */
+}
+
+static bool is_erspan_type1(int gre_hdr_len)
+{
+	/* Both ERSPAN type I (version 0) and type II (version 1) use
+	 * protocol 0x88BE; they are told apart by the GRE header length,
+	 * which is 4 bytes for type I and 8 bytes for type II.
+	 */
+	return gre_hdr_len == 4;
+}
+
+/* Receive an ERSPAN-over-GRE packet: look up the tunnel, strip the GRE and
+ * ERSPAN headers and pass the inner frame to ip_tunnel_rcv().
+ * Returns PACKET_RCVD when @skb was consumed here, PACKET_REJECT when it is
+ * left to the caller.
+ */
+static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+		      int gre_hdr_len)
+{
+	struct net *net = dev_net(skb->dev);
+	struct metadata_dst *tun_dst = NULL;
+	struct erspan_base_hdr *ershdr;
+	struct ip_tunnel_net *itn;
+	struct ip_tunnel *tunnel;
+	const struct iphdr *iph;
+	int ver, len;
+
+	itn = net_generic(net, erspan_net_id);
+	iph = ip_hdr(skb);
+	if (is_erspan_type1(gre_hdr_len)) {
+		ver = 0;
+		tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
+					  tpi->flags | TUNNEL_NO_KEY,
+					  iph->saddr, iph->daddr, 0);
+	} else {
+		/* ->ver may only be read once the ERSPAN base header is in
+		 * the linear area; the pull can move skb->head, so reload iph.
+		 */
+		if (unlikely(!pskb_may_pull(skb,
+					    gre_hdr_len + sizeof(*ershdr))))
+			return PACKET_REJECT;
+
+		iph = ip_hdr(skb);
+		ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
+		ver = ershdr->ver;
+		tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
+					  tpi->flags | TUNNEL_KEY,
+					  iph->saddr, iph->daddr, tpi->key);
+	}
+	if (!tunnel)
+		return PACKET_REJECT;
+
+	len = gre_hdr_len;
+	if (!is_erspan_type1(gre_hdr_len))
+		len += erspan_hdr_len(ver);
+
+	if (unlikely(!pskb_may_pull(skb, len)))
+		return PACKET_REJECT;
+
+	if (__iptunnel_pull_header(skb, len, htons(ETH_P_TEB),
+				   false, false) < 0)
+		goto drop;
+
+	if (tunnel->collect_md) {
+		struct erspan_metadata *pkt_md, *md;
+		struct ip_tunnel_info *info;
+		unsigned char *gh;
+
+		tpi->flags |= TUNNEL_KEY;
+		tun_dst = ip_tun_rx_dst(skb, tpi->flags,
+					key32_to_tunnel_id(tpi->key),
+					sizeof(*md));
+		if (!tun_dst)
+			return PACKET_REJECT;
+
+		/* skb can be uncloned in __iptunnel_pull_header, so
+		 * pkt_md must be derived from the current network header.
+		 */
+		gh = skb_network_header(skb) + skb_network_header_len(skb);
+		pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
+						    sizeof(*ershdr));
+		md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
+		md->version = ver;
+		memcpy(&md->u.md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
+						      ERSPAN_V2_MDSIZE);
+
+		info = &tun_dst->u.tun_info;
+		info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+		info->options_len = sizeof(*md);
+	}
+
+	skb_reset_mac_header(skb);
+	ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+	return PACKET_RCVD;
+
+drop:
+	kfree_skb(skb);
+	return PACKET_RCVD;
+}
+
+/* Deliver @skb to the tunnel of @itn that matches its outer IP header.
+ * Returns PACKET_NEXT if no tunnel matches, PACKET_REJECT if
+ * ip_tun_rx_dst() fails and PACKET_RCVD once @skb was delivered or freed.
+ */
+static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
+{
+	const struct iphdr *outer = ip_hdr(skb);
+	struct metadata_dst *tun_dst = NULL;
+	struct ip_tunnel *tunnel;
+
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+				  outer->saddr, outer->daddr, tpi->key);
+	if (!tunnel)
+		return PACKET_NEXT;
+
+	if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
+				   raw_proto, false) < 0)
+		goto drop;
+
+	/* Special case for ipgre_header_parse(), which expects the
+	 * mac_header to point to the outer IP header.
+	 */
+	if (tunnel->dev->header_ops == &ipgre_header_ops)
+		skb_pop_mac_header(skb);
+	else
+		skb_reset_mac_header(skb);
+
+	/* collect_md tunnels and tunnels whose parms.iph.daddr is 0
+	 * get an rx metadata dst built from the flags and the key.
+	 */
+	if (tunnel->collect_md || tunnel->parms.iph.daddr == 0) {
+		__be16 flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
+		__be64 tun_id = key32_to_tunnel_id(tpi->key);
+
+		tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
+		if (!tun_dst)
+			return PACKET_REJECT;
+	}
+
+	ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+	return PACKET_RCVD;
+
+drop:
+	kfree_skb(skb);
+	return PACKET_RCVD;
+}
+
+/* Dispatch a non-ERSPAN GRE packet: ETH_P_TEB goes to the gretap tunnels
+ * first and falls back to the ipgre tunnels (raw_proto = true) when none
+ * matches; everything else goes to the ipgre tunnels only.
+ */
+static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+		     int hdr_len)
+{
+	const bool is_teb = tpi->proto == htons(ETH_P_TEB);
+	struct net *net = dev_net(skb->dev);
+	unsigned int id = is_teb ? gre_tap_net_id : ipgre_net_id;
+	int res;
+
+	res = __ipgre_rcv(skb, tpi, net_generic(net, id), hdr_len, false);
+	if (res != PACKET_NEXT || !is_teb)
+		return res;
+
+	/* ipgre tunnels in collect metadata mode should receive
+	 * also ETH_P_TEB traffic.
+	 */
+	return __ipgre_rcv(skb, tpi, net_generic(net, ipgre_net_id),
+			   hdr_len, true);
+}
+
+static int gre_rcv(struct sk_buff *skb)
+{
+	struct tnl_ptk_info tpi;
+	bool csum_err = false;
+	int hdr_len;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
+		/* Looped back packet, drop it! */
+		if (rt_is_output_route(skb_rtable(skb)))
+			goto drop;
+	}
+#endif
+
+	hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
+	if (hdr_len < 0)
+		goto drop;
+
+	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
+		     tpi.proto == htons(ETH_P_ERSPAN2))) {
+		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
+			return 0;
+		goto out;
+	}
+
+	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
+		return 0;
+
+out:	/* no receiver took the packet: report port unreachable, then free it */
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
+	kfree_skb(skb);
+	return 0;	/* every path returns 0 */
+}
+
+static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
+		       const struct iphdr *tnl_params,
+		       __be16 proto)
+{	/* Build the GRE header from the tunnel's o_flags/o_key and pass the skb to ip_tunnel_xmit(). */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	__be16 flags = tunnel->parms.o_flags;
+
+	/* Push GRE header. */
+	gre_build_header(skb, tunnel->tun_hlen,
+			 flags, proto, tunnel->parms.o_key,
+			 (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);	/* seqno advances only when SEQ is set */
+
+	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
+}
+
+static int gre_handle_offloads(struct sk_buff *skb, bool csum)
+{	/* Thin wrapper: selects the GRE GSO type (with or without checksum) for iptunnel_handle_offloads(). */
+	return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
+}
+
+static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
+			__be16 proto)
+{	/* Transmit using the tunnel metadata attached to the skb; on any failure the skb is freed and tx_dropped bumped. */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct ip_tunnel_info *tun_info;
+	const struct ip_tunnel_key *key;
+	int tunnel_hlen;
+	__be16 flags;
+
+	tun_info = skb_tunnel_info(skb);
+	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+		     ip_tunnel_info_af(tun_info) != AF_INET))	/* need TX-mode IPv4 metadata */
+		goto err_free_skb;
+
+	key = &tun_info->key;
+	tunnel_hlen = gre_calc_hlen(key->tun_flags);	/* GRE header size follows the per-skb flags */
+
+	if (skb_cow_head(skb, dev->needed_headroom))
+		goto err_free_skb;
+
+	/* Push Tunnel header. */
+	if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
+		goto err_free_skb;
+
+	flags = tun_info->key.tun_flags &
+		(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);	/* other tun_flags bits are not passed on */
+	gre_build_header(skb, tunnel_hlen, flags, proto,
+			 tunnel_id_to_key32(tun_info->key.tun_id),
+			 (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
+
+	ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
+
+	return;
+
+err_free_skb:
+	kfree_skb(skb);
+	DEV_STATS_INC(dev, tx_dropped);
+}
+
+static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
+{	/* ERSPAN transmit driven by per-skb tunnel metadata; failures free the skb and bump tx_dropped. */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct ip_tunnel_info *tun_info;
+	const struct ip_tunnel_key *key;
+	struct erspan_metadata *md;
+	bool truncate = false;
+	__be16 proto;
+	int tunnel_hlen;
+	int version;
+	int nhoff;
+
+	tun_info = skb_tunnel_info(skb);
+	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+		     ip_tunnel_info_af(tun_info) != AF_INET))
+		goto err_free_skb;
+
+	key = &tun_info->key;
+	if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))	/* ERSPAN options are mandatory here */
+		goto err_free_skb;
+	if (tun_info->options_len < sizeof(*md))	/* options too short to hold erspan_metadata */
+		goto err_free_skb;
+	md = ip_tunnel_info_opts(tun_info);
+
+	/* ERSPAN has fixed 8 byte GRE header */
+	version = md->version;
+	tunnel_hlen = 8 + erspan_hdr_len(version);
+
+	if (skb_cow_head(skb, dev->needed_headroom))
+		goto err_free_skb;
+
+	if (gre_handle_offloads(skb, false))
+		goto err_free_skb;
+
+	if (skb->len > dev->mtu + dev->hard_header_len) {	/* oversized frame: trim it and mark as truncated */
+		if (pskb_trim(skb, dev->mtu + dev->hard_header_len))
+			goto err_free_skb;
+		truncate = true;
+	}
+
+	nhoff = skb_network_offset(skb);
+	if (skb->protocol == htons(ETH_P_IP) &&
+	    (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))	/* IPv4 header claims more data than the skb holds */
+		truncate = true;
+
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		int thoff;
+
+		if (skb_transport_header_was_set(skb))
+			thoff = skb_transport_offset(skb);
+		else
+			thoff = nhoff + sizeof(struct ipv6hdr);	/* fall back to just past the fixed IPv6 header */
+		if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
+			truncate = true;
+	}
+
+	if (version == 1) {
+		erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
+				    ntohl(md->u.index), truncate, true);
+		proto = htons(ETH_P_ERSPAN);
+	} else if (version == 2) {
+		erspan_build_header_v2(skb,
+				       ntohl(tunnel_id_to_key32(key->tun_id)),
+				       md->u.md2.dir,
+				       get_hwid(&md->u.md2),
+				       truncate, true);
+		proto = htons(ETH_P_ERSPAN2);
+	} else {
+		goto err_free_skb;	/* only metadata versions 1 and 2 are handled */
+	}
+
+	gre_build_header(skb, 8, TUNNEL_SEQ,
+			 proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno)));	/* SEQ is always set, key is 0 */
+
+	ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
+
+	return;
+
+err_free_skb:
+	kfree_skb(skb);
+	DEV_STATS_INC(dev, tx_dropped);
+}
+
+static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{	/* Route-lookup the skb's tunnel key and store the chosen source address back into it; 0 or -errno. */
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);	/* NOTE(review): not NULL-checked here - presumably guaranteed by the caller */
+	const struct ip_tunnel_key *key;
+	struct rtable *rt;
+	struct flowi4 fl4;
+
+	if (ip_tunnel_info_af(info) != AF_INET)
+		return -EINVAL;
+
+	key = &info->key;
+	ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
+			    tunnel_id_to_key32(key->tun_id),
+			    key->tos & ~INET_ECN_MASK, dev_net(dev), 0,
+			    skb->mark, skb_get_hash(skb), key->flow_flags);
+	rt = ip_route_output_key(dev_net(dev), &fl4);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+
+	ip_rt_put(rt);	/* the route itself is not kept, only fl4.saddr is used */
+	info->key.u.ipv4.src = fl4.saddr;
+	return 0;
+}
+
+static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
+			      struct net_device *dev)
+{	/* ndo_start_xmit for "gre" devices; always returns NETDEV_TX_OK, dropped skbs are freed and counted. */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	const struct iphdr *tnl_params;
+
+	if (!pskb_inet_may_pull(skb))
+		goto free_skb;
+
+	if (tunnel->collect_md) {	/* metadata mode: header is built from per-skb tunnel info */
+		gre_fb_xmit(skb, dev, skb->protocol);
+		return NETDEV_TX_OK;
+	}
+
+	if (dev->header_ops) {	/* skb->data is expected to start with the outer IP + GRE header from ipgre_header() */
+		int pull_len = tunnel->hlen + sizeof(struct iphdr);
+
+		if (skb_cow_head(skb, 0))
+			goto free_skb;
+
+		if (!pskb_may_pull(skb, pull_len))	/* checked from skb->data, matching skb_pull() below */
+			goto free_skb;
+
+		tnl_params = (const struct iphdr *)skb->data;	/* taken after the pull check, which may reallocate skb->head */
+
+		/* ip_tunnel_xmit() needs skb->data pointing to gre header. */
+		skb_pull(skb, pull_len);
+		skb_reset_mac_header(skb);
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL &&
+		    skb_checksum_start(skb) < skb->data)	/* checksum start now lies inside the pulled header */
+			goto free_skb;
+	} else {
+		if (skb_cow_head(skb, dev->needed_headroom))
+			goto free_skb;
+
+		tnl_params = &tunnel->parms.iph;	/* no header_ops: use the tunnel's configured outer header */
+	}
+
+	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
+		goto free_skb;
+
+	__gre_xmit(skb, dev, tnl_params, skb->protocol);
+	return NETDEV_TX_OK;
+
+free_skb:
+	kfree_skb(skb);
+	DEV_STATS_INC(dev, tx_dropped);
+	return NETDEV_TX_OK;
+}
+
+static netdev_tx_t erspan_xmit(struct sk_buff *skb,
+			       struct net_device *dev)
+{	/* ndo_start_xmit for "erspan" devices; always returns NETDEV_TX_OK, dropped skbs are freed and counted. */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	bool truncate = false;
+	__be16 proto;
+
+	if (!pskb_inet_may_pull(skb))
+		goto free_skb;
+
+	if (tunnel->collect_md) {	/* metadata mode is handled entirely by erspan_fb_xmit() */
+		erspan_fb_xmit(skb, dev);
+		return NETDEV_TX_OK;
+	}
+
+	if (gre_handle_offloads(skb, false))
+		goto free_skb;
+
+	if (skb_cow_head(skb, dev->needed_headroom))
+		goto free_skb;
+
+	if (skb->len > dev->mtu + dev->hard_header_len) {	/* oversized frame: trim it and flag truncation */
+		if (pskb_trim(skb, dev->mtu + dev->hard_header_len))
+			goto free_skb;
+		truncate = true;
+	}
+
+	/* Push ERSPAN header */
+	if (tunnel->erspan_ver == 0) {	/* version 0: no ERSPAN header is built */
+		proto = htons(ETH_P_ERSPAN);
+		tunnel->parms.o_flags &= ~TUNNEL_SEQ;	/* note: modifies the tunnel's stored o_flags */
+	} else if (tunnel->erspan_ver == 1) {
+		erspan_build_header(skb, ntohl(tunnel->parms.o_key),
+				    tunnel->index,
+				    truncate, true);
+		proto = htons(ETH_P_ERSPAN);
+	} else if (tunnel->erspan_ver == 2) {
+		erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
+				       tunnel->dir, tunnel->hwid,
+				       truncate, true);
+		proto = htons(ETH_P_ERSPAN2);
+	} else {
+		goto free_skb;
+	}
+
+	tunnel->parms.o_flags &= ~TUNNEL_KEY;	/* o_key was passed to the ERSPAN builders above; no GRE key field is emitted */
+	__gre_xmit(skb, dev, &tunnel->parms.iph, proto);
+	return NETDEV_TX_OK;
+
+free_skb:
+	kfree_skb(skb);
+	DEV_STATS_INC(dev, tx_dropped);
+	return NETDEV_TX_OK;
+}
+
+static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
+				struct net_device *dev)
+{	/* ndo_start_xmit for "gretap" devices: frames go out with GRE protocol ETH_P_TEB; always NETDEV_TX_OK. */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	if (!pskb_inet_may_pull(skb))
+		goto free_skb;
+
+	if (tunnel->collect_md) {	/* metadata mode: header is built from per-skb tunnel info */
+		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
+		return NETDEV_TX_OK;
+	}
+
+	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
+		goto free_skb;
+
+	if (skb_cow_head(skb, dev->needed_headroom))
+		goto free_skb;
+
+	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
+	return NETDEV_TX_OK;
+
+free_skb:
+	kfree_skb(skb);
+	DEV_STATS_INC(dev, tx_dropped);
+	return NETDEV_TX_OK;
+}
+
+static void ipgre_link_update(struct net_device *dev, bool set_mtu)
+{	/* Re-derive header sizes and GSO features after parms.o_flags changed; @set_mtu also adjusts dev->mtu. */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	__be16 flags;
+	int len;
+
+	len = tunnel->tun_hlen;	/* old GRE header length */
+	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
+	len = tunnel->tun_hlen - len;	/* len is now the (signed) change in header length */
+	tunnel->hlen = tunnel->hlen + len;
+
+	if (dev->header_ops)
+		dev->hard_header_len += len;
+	else
+		dev->needed_headroom += len;
+
+	if (set_mtu)
+		dev->mtu = max_t(int, dev->mtu - len, 68);	/* never go below 68 */
+
+	flags = tunnel->parms.o_flags;
+
+	if (flags & TUNNEL_SEQ ||
+	    (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)) {	/* same conditions as in __gre_tunnel_init() */
+		dev->features &= ~NETIF_F_GSO_SOFTWARE;
+		dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
+	} else {
+		dev->features |= NETIF_F_GSO_SOFTWARE;
+		dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+	}
+}
+
+static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p,
+			    int cmd)
+{	/* ndo_tunnel_ctl: validate @p, run ip_tunnel_ctl(), and translate flags between GRE and TUNNEL_* form. */
+	int err;
+
+	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
+		if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE ||
+		    p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) ||
+		    ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING)))	/* version/routing GRE bits are rejected */
+			return -EINVAL;
+	}
+
+	p->i_flags = gre_flags_to_tnl_flags(p->i_flags);	/* GRE header bits -> TUNNEL_* flags */
+	p->o_flags = gre_flags_to_tnl_flags(p->o_flags);
+
+	err = ip_tunnel_ctl(dev, p, cmd);
+	if (err)
+		return err;	/* note: p->i_flags/o_flags stay in TUNNEL_* form on this path */
+
+	if (cmd == SIOCCHGTUNNEL) {
+		struct ip_tunnel *t = netdev_priv(dev);
+
+		t->parms.i_flags = p->i_flags;
+		t->parms.o_flags = p->o_flags;
+
+		if (strcmp(dev->rtnl_link_ops->kind, "erspan"))	/* non-zero strcmp: every kind except "erspan" */
+			ipgre_link_update(dev, true);
+	}
+
+	p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);	/* convert back to GRE header bits in @p */
+	p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
+	return 0;
+}
+
+/* Nice toy. Unfortunately, useless in real life :-)
+   It allows constructing a virtual multiprotocol broadcast "LAN"
+   over the Internet, provided multicast routing is tuned.
+
+
+   I have no idea whether this bicycle was invented before me,
+   so I had to set ARPHRD_IPGRE to a random value.
+   I have the impression that Cisco could make something similar,
+   but this feature is apparently missing in IOS<=11.2(8).
+
+   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
+   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
+
+   ping -t 255 224.66.66.66
+
+   If nobody answers, mbone does not work.
+
+   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
+   ip addr add 10.66.66.<somewhat>/24 dev Universe
+   ifconfig Universe up
+   ifconfig Universe add fe80::<Your_real_addr>/10
+   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
+   ftp 10.66.66.66
+   ...
+   ftp fec0:6666:6666::193.233.7.65
+   ...
+ */
+static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
+			unsigned short type,
+			const void *daddr, const void *saddr, unsigned int len)
+{	/* header_ops.create: push outer IPv4 + GRE base header; returns +length if a destination is set, else -length. */
+	struct ip_tunnel *t = netdev_priv(dev);
+	struct iphdr *iph;
+	struct gre_base_hdr *greh;
+
+	iph = skb_push(skb, t->hlen + sizeof(*iph));
+	greh = (struct gre_base_hdr *)(iph+1);	/* GRE header sits directly after the IP header */
+	greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
+	greh->protocol = htons(type);
+
+	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));	/* start from the tunnel's configured outer header */
+
+	/* Set the source hardware address. */
+	if (saddr)
+		memcpy(&iph->saddr, saddr, 4);	/* "hardware" addresses here are 4-byte IPv4 addresses */
+	if (daddr)
+		memcpy(&iph->daddr, daddr, 4);
+	if (iph->daddr)
+		return t->hlen + sizeof(*iph);
+
+	return -(t->hlen + sizeof(*iph));	/* no destination address known yet */
+}
+
+static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{	/* header_ops.parse: copy the 4-byte source address of the IP header at mac_header into @haddr; returns 4. */
+	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
+	memcpy(haddr, &iph->saddr, 4);
+	return 4;
+}
+
+static const struct header_ops ipgre_header_ops = {	/* assigned to dev->header_ops by ipgre_tunnel_init() */
+	.create	= ipgre_header,
+	.parse	= ipgre_header_parse,
+};
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+static int ipgre_open(struct net_device *dev)
+{	/* ndo_open: for a multicast remote, join that group on the routed output device; 0 or -EADDRNOTAVAIL. */
+	struct ip_tunnel *t = netdev_priv(dev);
+
+	if (ipv4_is_multicast(t->parms.iph.daddr)) {
+		struct flowi4 fl4;
+		struct rtable *rt;
+
+		rt = ip_route_output_gre(t->net, &fl4,
+					 t->parms.iph.daddr,
+					 t->parms.iph.saddr,
+					 t->parms.o_key,
+					 RT_TOS(t->parms.iph.tos),
+					 t->parms.link);
+		if (IS_ERR(rt))
+			return -EADDRNOTAVAIL;
+		dev = rt->dst.dev;	/* from here on @dev is the route's output device, not the tunnel */
+		ip_rt_put(rt);
+		if (!__in_dev_get_rtnl(dev))
+			return -EADDRNOTAVAIL;
+		t->mlink = dev->ifindex;	/* remembered so ipgre_close() can leave the group */
+		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+	}
+	return 0;
+}
+
+static int ipgre_close(struct net_device *dev)
+{	/* ndo_stop: leave the multicast group joined in ipgre_open(), if any; always returns 0. */
+	struct ip_tunnel *t = netdev_priv(dev);
+
+	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
+		struct in_device *in_dev;
+		in_dev = inetdev_by_index(t->net, t->mlink);	/* device recorded by ipgre_open() */
+		if (in_dev)
+			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
+	}
+	return 0;
+}
+#endif
+
+static const struct net_device_ops ipgre_netdev_ops = {	/* installed by ipgre_tunnel_setup() */
+	.ndo_init		= ipgre_tunnel_init,
+	.ndo_uninit		= ip_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+	.ndo_open		= ipgre_open,
+	.ndo_stop		= ipgre_close,
+#endif /* CONFIG_NET_IPGRE_BROADCAST */
+	.ndo_start_xmit		= ipgre_xmit,
+	.ndo_siocdevprivate	= ip_tunnel_siocdevprivate,
+	.ndo_change_mtu		= ip_tunnel_change_mtu,
+	.ndo_get_stats64	= dev_get_tstats64,
+	.ndo_get_iflink		= ip_tunnel_get_iflink,
+	.ndo_tunnel_ctl		= ipgre_tunnel_ctl,
+};
+
+#define GRE_FEATURES (NETIF_F_SG |		\
+		      NETIF_F_FRAGLIST |	\
+		      NETIF_F_HIGHDMA |		\
+		      NETIF_F_HW_CSUM)	/* OR-ed into dev->features and dev->hw_features by the init helpers */
+
+static void ipgre_tunnel_setup(struct net_device *dev)
+{	/* rtnl_link_ops.setup for "gre": install the netdev ops, set the ARP type, then run ip_tunnel_setup(). */
+	dev->netdev_ops		= &ipgre_netdev_ops;
+	dev->type		= ARPHRD_IPGRE;
+	ip_tunnel_setup(dev, ipgre_net_id);
+}
+
+static void __gre_tunnel_init(struct net_device *dev)
+{	/* Init shared by ipgre_tunnel_init() and gre_tap_init(): header lengths, headroom and feature flags. */
+	struct ip_tunnel *tunnel;
+	__be16 flags;
+
+	tunnel = netdev_priv(dev);
+	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
+	tunnel->parms.iph.protocol = IPPROTO_GRE;
+
+	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;	/* GRE header plus any extra encapsulation */
+	dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);
+
+	dev->features		|= GRE_FEATURES | NETIF_F_LLTX;
+	dev->hw_features	|= GRE_FEATURES;
+
+	flags = tunnel->parms.o_flags;
+
+	/* TCP offload with GRE SEQ is not supported, nor can we support 2
+	 * levels of outer headers requiring an update.
+	 */
+	if (flags & TUNNEL_SEQ)
+		return;
+	if (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)
+		return;
+
+	dev->features |= NETIF_F_GSO_SOFTWARE;	/* reached only when neither restriction above applies */
+	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+}
+
+static int ipgre_tunnel_init(struct net_device *dev)
+{	/* ndo_init for "gre": set addresses, flags and header_ops from the tunnel endpoints, then ip_tunnel_init(). */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct iphdr *iph = &tunnel->parms.iph;
+
+	__gre_tunnel_init(dev);
+
+	__dev_addr_set(dev, &iph->saddr, 4);	/* device address = local IPv4 address */
+	memcpy(dev->broadcast, &iph->daddr, 4);	/* broadcast address = remote IPv4 address */
+
+	dev->flags		= IFF_NOARP;
+	netif_keep_dst(dev);
+	dev->addr_len		= 4;
+
+	if (iph->daddr && !tunnel->collect_md) {
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+		if (ipv4_is_multicast(iph->daddr)) {
+			if (!iph->saddr)	/* a multicast remote requires a local address */
+				return -EINVAL;
+			dev->flags = IFF_BROADCAST;
+			dev->header_ops = &ipgre_header_ops;
+			dev->hard_header_len = tunnel->hlen + sizeof(*iph);
+			dev->needed_headroom = 0;	/* length moves from headroom to hard_header_len */
+		}
+#endif /* CONFIG_NET_IPGRE_BROADCAST */
+	} else if (!tunnel->collect_md) {	/* no remote configured and not metadata mode */
+		dev->header_ops = &ipgre_header_ops;
+		dev->hard_header_len = tunnel->hlen + sizeof(*iph);
+		dev->needed_headroom = 0;
+	}
+
+	return ip_tunnel_init(dev);
+}
+
+static const struct gre_protocol ipgre_protocol = {	/* receive and error callbacks for GRE packets */
+	.handler     = gre_rcv,
+	.err_handler = gre_err,
+};
+
+static int __net_init ipgre_init_net(struct net *net)
+{	/* pernet init for "gre": NULL is passed where the gretap/erspan variants pass a device name. */
+	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
+}
+
+static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
+{	/* pernet batch exit for "gre": delegates to ip_tunnel_delete_nets() for every netns in @list_net. */
+	ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
+}
+
+static struct pernet_operations ipgre_net_ops = {	/* per-network-namespace hooks for "gre" tunnels */
+	.init = ipgre_init_net,
+	.exit_batch = ipgre_exit_batch_net,
+	.id   = &ipgre_net_id,
+	.size = sizeof(struct ip_tunnel_net),
+};
+
+static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
+				 struct netlink_ext_ack *extack)
+{	/* rtnl_link_ops.validate for "gre": returns 0 if the attributes are acceptable, -EINVAL otherwise. */
+	__be16 flags;
+
+	if (!data)
+		return 0;
+
+	flags = 0;
+	if (data[IFLA_GRE_IFLAGS])
+		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+	if (data[IFLA_GRE_OFLAGS])
+		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+	if (flags & (GRE_VERSION|GRE_ROUTING))	/* version/routing bits are rejected in either direction */
+		return -EINVAL;
+
+	if (data[IFLA_GRE_COLLECT_METADATA] &&
+	    data[IFLA_GRE_ENCAP_TYPE] &&
+	    nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)	/* metadata mode cannot be combined with an encap type */
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
+			      struct netlink_ext_ack *extack)
+{	/* validate for "gretap": checks the MAC address and remote, then applies ipgre_tunnel_validate(). */
+	__be32 daddr;
+
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+			return -EINVAL;
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+			return -EADDRNOTAVAIL;
+	}
+
+	if (!data)
+		goto out;
+
+	if (data[IFLA_GRE_REMOTE]) {
+		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
+		if (!daddr)	/* an explicitly supplied remote must be non-zero */
+			return -EINVAL;
+	}
+
+out:
+	return ipgre_tunnel_validate(tb, data, extack);
+}
+
+static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
+			   struct netlink_ext_ack *extack)
+{	/* validate for "erspan": gretap checks plus ERSPAN-specific flag and session-ID constraints. */
+	__be16 flags = 0;
+	int ret;
+
+	if (!data)
+		return 0;
+
+	ret = ipgre_tap_validate(tb, data, extack);
+	if (ret)
+		return ret;
+
+	if (data[IFLA_GRE_ERSPAN_VER] &&
+	    nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0)	/* version 0 skips the flag and key checks below */
+		return 0;
+
+	/* ERSPAN type II/III should only have GRE sequence and key flag */
+	if (data[IFLA_GRE_OFLAGS])
+		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+	if (data[IFLA_GRE_IFLAGS])
+		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+	if (!data[IFLA_GRE_COLLECT_METADATA] &&
+	    flags != (GRE_SEQ | GRE_KEY))	/* combined flags must be exactly SEQ|KEY outside metadata mode */
+		return -EINVAL;
+
+	/* ERSPAN Session ID only has 10-bit. Since we reuse
+	 * 32-bit key field as ID, check its range.
+	 */
+	if (data[IFLA_GRE_IKEY] &&
+	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
+		return -EINVAL;
+
+	if (data[IFLA_GRE_OKEY] &&
+	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ipgre_netlink_parms(struct net_device *dev,
+				struct nlattr *data[],
+				struct nlattr *tb[],
+				struct ip_tunnel_parm *parms,
+				__u32 *fwmark)
+{	/* Fill @parms and *@fwmark from netlink attributes; also writes collect_md/ignore_df on the tunnel itself. */
+	struct ip_tunnel *t = netdev_priv(dev);
+
+	memset(parms, 0, sizeof(*parms));
+
+	parms->iph.protocol = IPPROTO_GRE;
+
+	if (!data)
+		return 0;	/* no attributes: zeroed parms with only the protocol set */
+
+	if (data[IFLA_GRE_LINK])
+		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
+
+	if (data[IFLA_GRE_IFLAGS])
+		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
+
+	if (data[IFLA_GRE_OFLAGS])
+		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
+
+	if (data[IFLA_GRE_IKEY])
+		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
+
+	if (data[IFLA_GRE_OKEY])
+		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
+
+	if (data[IFLA_GRE_LOCAL])
+		parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
+
+	if (data[IFLA_GRE_REMOTE])
+		parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
+
+	if (data[IFLA_GRE_TTL])
+		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
+
+	if (data[IFLA_GRE_TOS])
+		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
+
+	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {	/* PMTU discovery defaults to on */
+		if (t->ignore_df)	/* DF cannot be set while the tunnel already ignores DF */
+			return -EINVAL;
+		parms->iph.frag_off = htons(IP_DF);
+	}
+
+	if (data[IFLA_GRE_COLLECT_METADATA]) {
+		t->collect_md = true;
+		if (dev->type == ARPHRD_IPGRE)	/* only devices still typed ARPHRD_IPGRE are retyped */
+			dev->type = ARPHRD_NONE;
+	}
+
+	if (data[IFLA_GRE_IGNORE_DF]) {
+		if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
+		  && (parms->iph.frag_off & htons(IP_DF)))	/* ignore_df and DF are mutually exclusive */
+			return -EINVAL;
+		t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
+	}
+
+	if (data[IFLA_GRE_FWMARK])
+		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);	/* *fwmark keeps the caller's value if absent */
+
+	return 0;
+}
+
+static int erspan_netlink_parms(struct net_device *dev,
+				struct nlattr *data[],
+				struct nlattr *tb[],
+				struct ip_tunnel_parm *parms,
+				__u32 *fwmark)
+{	/* ipgre_netlink_parms() plus ERSPAN version/index/dir/hwid, which are stored directly on the tunnel. */
+	struct ip_tunnel *t = netdev_priv(dev);
+	int err;
+
+	err = ipgre_netlink_parms(dev, data, tb, parms, fwmark);
+	if (err)
+		return err;
+	if (!data)
+		return 0;
+
+	if (data[IFLA_GRE_ERSPAN_VER]) {
+		t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);	/* stored before the range check below */
+
+		if (t->erspan_ver > 2)
+			return -EINVAL;
+	}
+
+	if (t->erspan_ver == 1) {
+		if (data[IFLA_GRE_ERSPAN_INDEX]) {
+			t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+			if (t->index & ~INDEX_MASK)	/* index must fit within INDEX_MASK */
+				return -EINVAL;
+		}
+	} else if (t->erspan_ver == 2) {
+		if (data[IFLA_GRE_ERSPAN_DIR]) {
+			t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+			if (t->dir & ~(DIR_MASK >> DIR_OFFSET))	/* dir must fit the unshifted DIR field */
+				return -EINVAL;
+		}
+		if (data[IFLA_GRE_ERSPAN_HWID]) {
+			t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+			if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))	/* hwid must fit the unshifted HWID field */
+				return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* This function returns true when ENCAP attributes are present in the nl msg */
+static bool ipgre_netlink_encap_parms(struct nlattr *data[],
+				      struct ip_tunnel_encap *ipencap)
+{	/* @ipencap is always zeroed first, then filled from whichever ENCAP attributes exist. */
+	bool ret = false;
+
+	memset(ipencap, 0, sizeof(*ipencap));
+
+	if (!data)
+		return ret;
+
+	if (data[IFLA_GRE_ENCAP_TYPE]) {
+		ret = true;
+		ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
+	}
+
+	if (data[IFLA_GRE_ENCAP_FLAGS]) {
+		ret = true;
+		ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
+	}
+
+	if (data[IFLA_GRE_ENCAP_SPORT]) {
+		ret = true;
+		ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);	/* ports are read as big-endian values */
+	}
+
+	if (data[IFLA_GRE_ENCAP_DPORT]) {
+		ret = true;
+		ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
+	}
+
+	return ret;
+}
+
+static int gre_tap_init(struct net_device *dev)
+{	/* ndo_init for "gretap": common GRE init, allow live MAC changes, then ip_tunnel_init(). */
+	__gre_tunnel_init(dev);
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	netif_keep_dst(dev);
+
+	return ip_tunnel_init(dev);
+}
+
+static const struct net_device_ops gre_tap_netdev_ops = {	/* installed by ipgre_tap_setup() */
+	.ndo_init		= gre_tap_init,
+	.ndo_uninit		= ip_tunnel_uninit,
+	.ndo_start_xmit		= gre_tap_xmit,
+	.ndo_set_mac_address 	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= ip_tunnel_change_mtu,
+	.ndo_get_stats64	= dev_get_tstats64,
+	.ndo_get_iflink		= ip_tunnel_get_iflink,
+	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
+};
+
+static int erspan_tunnel_init(struct net_device *dev)
+{	/* ndo_init for "erspan": header lengths depend on erspan_ver; then features and ip_tunnel_init(). */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	if (tunnel->erspan_ver == 0)
+		tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */
+	else
+		tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */
+
+	tunnel->parms.iph.protocol = IPPROTO_GRE;
+	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
+		       erspan_hdr_len(tunnel->erspan_ver);	/* GRE + encap + ERSPAN header */
+
+	dev->features		|= GRE_FEATURES;	/* unlike __gre_tunnel_init(), no LLTX or GSO bits are added */
+	dev->hw_features	|= GRE_FEATURES;
+	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE;
+	netif_keep_dst(dev);
+
+	return ip_tunnel_init(dev);
+}
+
+static const struct net_device_ops erspan_netdev_ops = {	/* installed by erspan_setup() */
+	.ndo_init		= erspan_tunnel_init,
+	.ndo_uninit		= ip_tunnel_uninit,
+	.ndo_start_xmit		= erspan_xmit,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= ip_tunnel_change_mtu,
+	.ndo_get_stats64	= dev_get_tstats64,
+	.ndo_get_iflink		= ip_tunnel_get_iflink,
+	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
+};
+
+static void ipgre_tap_setup(struct net_device *dev)
+{	/* rtnl_link_ops.setup for "gretap": Ethernet defaults first, then tunnel-specific overrides. */
+	ether_setup(dev);
+	dev->max_mtu = 0;	/* overrides the value left by ether_setup() */
+	dev->netdev_ops	= &gre_tap_netdev_ops;
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->priv_flags	|= IFF_LIVE_ADDR_CHANGE;
+	ip_tunnel_setup(dev, gre_tap_net_id);
+}
+
+static int
+ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[])
+{	/* Apply ENCAP attributes to the tunnel if any were supplied; returns 0 or a negative error. */
+	struct ip_tunnel_encap ipencap;
+
+	if (ipgre_netlink_encap_parms(data, &ipencap)) {	/* true only when an ENCAP attribute is present */
+		struct ip_tunnel *t = netdev_priv(dev);
+		int err = ip_tunnel_encap_setup(t, &ipencap);
+
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
+static int ipgre_newlink(struct net *src_net, struct net_device *dev,
+			 struct nlattr *tb[], struct nlattr *data[],
+			 struct netlink_ext_ack *extack)
+{	/* rtnl_link_ops.newlink for "gre"/"gretap": encap setup, attribute parsing, then ip_tunnel_newlink(). */
+	struct ip_tunnel_parm p;
+	__u32 fwmark = 0;
+	int err;
+
+	err = ipgre_newlink_encap_setup(dev, data);
+	if (err)
+		return err;
+
+	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);	/* fully initialises p via memset */
+	if (err < 0)
+		return err;
+	return ip_tunnel_newlink(dev, tb, &p, fwmark);
+}
+
+static int erspan_newlink(struct net *src_net, struct net_device *dev,
+			  struct nlattr *tb[], struct nlattr *data[],
+			  struct netlink_ext_ack *extack)
+{	/* rtnl_link_ops.newlink for "erspan": as ipgre_newlink() but parses with erspan_netlink_parms(). */
+	struct ip_tunnel_parm p;
+	__u32 fwmark = 0;
+	int err;
+
+	err = ipgre_newlink_encap_setup(dev, data);
+	if (err)
+		return err;
+
+	err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
+	if (err)
+		return err;
+	return ip_tunnel_newlink(dev, tb, &p, fwmark);
+}
+
+static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
+			    struct nlattr *data[],
+			    struct netlink_ext_ack *extack)
+{	/* rtnl_link_ops.changelink for "gre"/"gretap": re-parse attributes, apply them, refresh derived state. */
+	struct ip_tunnel *t = netdev_priv(dev);
+	__u32 fwmark = t->fwmark;	/* keep the current mark unless IFLA_GRE_FWMARK overrides it */
+	struct ip_tunnel_parm p;
+	int err;
+
+	err = ipgre_newlink_encap_setup(dev, data);
+	if (err)
+		return err;
+
+	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
+	if (err < 0)
+		return err;
+
+	err = ip_tunnel_changelink(dev, tb, &p, fwmark);
+	if (err < 0)
+		return err;
+
+	t->parms.i_flags = p.i_flags;
+	t->parms.o_flags = p.o_flags;
+
+	ipgre_link_update(dev, !tb[IFLA_MTU]);	/* adjust the MTU only if the request did not set one */
+
+	return 0;
+}
+
+static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
+			     struct nlattr *data[],
+			     struct netlink_ext_ack *extack)
+{	/* rtnl_link_ops.changelink for "erspan": like ipgre_changelink() but without ipgre_link_update(). */
+	struct ip_tunnel *t = netdev_priv(dev);
+	__u32 fwmark = t->fwmark;	/* keep the current mark unless IFLA_GRE_FWMARK overrides it */
+	struct ip_tunnel_parm p;
+	int err;
+
+	err = ipgre_newlink_encap_setup(dev, data);
+	if (err)
+		return err;
+
+	err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
+	if (err < 0)
+		return err;
+
+	err = ip_tunnel_changelink(dev, tb, &p, fwmark);
+	if (err < 0)
+		return err;
+
+	t->parms.i_flags = p.i_flags;
+	t->parms.o_flags = p.o_flags;
+
+	return 0;
+}
+
+static size_t ipgre_get_size(const struct net_device *dev)
+{	/* rtnl_link_ops.get_size: sum of attribute sizes, one term per attribute listed below; @dev is unused. */
+	return
+		/* IFLA_GRE_LINK */
+		nla_total_size(4) +
+		/* IFLA_GRE_IFLAGS */
+		nla_total_size(2) +
+		/* IFLA_GRE_OFLAGS */
+		nla_total_size(2) +
+		/* IFLA_GRE_IKEY */
+		nla_total_size(4) +
+		/* IFLA_GRE_OKEY */
+		nla_total_size(4) +
+		/* IFLA_GRE_LOCAL */
+		nla_total_size(4) +
+		/* IFLA_GRE_REMOTE */
+		nla_total_size(4) +
+		/* IFLA_GRE_TTL */
+		nla_total_size(1) +
+		/* IFLA_GRE_TOS */
+		nla_total_size(1) +
+		/* IFLA_GRE_PMTUDISC */
+		nla_total_size(1) +
+		/* IFLA_GRE_ENCAP_TYPE */
+		nla_total_size(2) +
+		/* IFLA_GRE_ENCAP_FLAGS */
+		nla_total_size(2) +
+		/* IFLA_GRE_ENCAP_SPORT */
+		nla_total_size(2) +
+		/* IFLA_GRE_ENCAP_DPORT */
+		nla_total_size(2) +
+		/* IFLA_GRE_COLLECT_METADATA */
+		nla_total_size(0) +
+		/* IFLA_GRE_IGNORE_DF */
+		nla_total_size(1) +
+		/* IFLA_GRE_FWMARK */
+		nla_total_size(4) +
+		/* IFLA_GRE_ERSPAN_INDEX */
+		nla_total_size(4) +
+		/* IFLA_GRE_ERSPAN_VER */
+		nla_total_size(1) +
+		/* IFLA_GRE_ERSPAN_DIR */
+		nla_total_size(1) +
+		/* IFLA_GRE_ERSPAN_HWID */
+		nla_total_size(2) +
+		0;
+}
+
+static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{	/* rtnl_link_ops.fill_info: dump the tunnel configuration as IFLA_GRE_* attributes; 0 or -EMSGSIZE. */
+	struct ip_tunnel *t = netdev_priv(dev);
+	struct ip_tunnel_parm *p = &t->parms;
+	__be16 o_flags = p->o_flags;
+
+	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
+	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
+			 gre_tnl_flags_to_gre_flags(p->i_flags)) ||	/* flags are reported in GRE header form */
+	    nla_put_be16(skb, IFLA_GRE_OFLAGS,
+			 gre_tnl_flags_to_gre_flags(o_flags)) ||
+	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
+	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
+	    nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
+	    nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
+	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
+	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
+	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
+		       !!(p->iph.frag_off & htons(IP_DF))) ||	/* reported as 1 when DF is set */
+	    nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
+		goto nla_put_failure;
+
+	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
+			t->encap.type) ||
+	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
+			 t->encap.sport) ||
+	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
+			 t->encap.dport) ||
+	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
+			t->encap.flags))
+		goto nla_put_failure;
+
+	if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
+		goto nla_put_failure;
+
+	if (t->collect_md) {	/* flag attribute: emitted only when metadata mode is on */
+		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
+			goto nla_put_failure;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{	/* fill_info for "erspan": emit the ERSPAN attributes, then the common ones via ipgre_fill_info(). */
+	struct ip_tunnel *t = netdev_priv(dev);
+
+	if (t->erspan_ver <= 2) {
+		if (t->erspan_ver != 0 && !t->collect_md)
+			t->parms.o_flags |= TUNNEL_KEY;	/* note: writes tunnel state; erspan_xmit() clears this bit */
+
+		if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
+			goto nla_put_failure;
+
+		if (t->erspan_ver == 1) {
+			if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
+				goto nla_put_failure;
+		} else if (t->erspan_ver == 2) {
+			if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
+				goto nla_put_failure;
+			if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
+				goto nla_put_failure;
+		}
+	}
+
+	return ipgre_fill_info(skb, dev);
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static void erspan_setup(struct net_device *dev)
+{	/* rtnl_link_ops.setup for "erspan": Ethernet defaults, tunnel overrides, default ERSPAN version 1. */
+	struct ip_tunnel *t = netdev_priv(dev);
+
+	ether_setup(dev);
+	dev->max_mtu = 0;	/* overrides the value left by ether_setup() */
+	dev->netdev_ops = &erspan_netdev_ops;
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	ip_tunnel_setup(dev, erspan_net_id);
+	t->erspan_ver = 1;	/* used unless IFLA_GRE_ERSPAN_VER is supplied */
+}
+
+static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {	/* shared by the gre, gretap and erspan link ops */
+	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
+	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
+	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
+	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
+	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
+	[IFLA_GRE_LOCAL]	= { .len = sizeof_field(struct iphdr, saddr) },	/* sized like an IPv4 address field */
+	[IFLA_GRE_REMOTE]	= { .len = sizeof_field(struct iphdr, daddr) },
+	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
+	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
+	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
+	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
+	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
+	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
+	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
+	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
+	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
+	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
+	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
+	[IFLA_GRE_ERSPAN_VER]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_DIR]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_HWID]	= { .type = NLA_U16 },
+};
+
+static struct rtnl_link_ops ipgre_link_ops __read_mostly = {	/* rtnetlink ops for link kind "gre" */
+	.kind		= "gre",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ipgre_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= ipgre_tunnel_setup,
+	.validate	= ipgre_tunnel_validate,
+	.newlink	= ipgre_newlink,
+	.changelink	= ipgre_changelink,
+	.dellink	= ip_tunnel_dellink,
+	.get_size	= ipgre_get_size,
+	.fill_info	= ipgre_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
+};
+
+static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {	/* "gretap": differs from "gre" only in setup/validate */
+	.kind		= "gretap",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ipgre_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= ipgre_tap_setup,
+	.validate	= ipgre_tap_validate,
+	.newlink	= ipgre_newlink,
+	.changelink	= ipgre_changelink,
+	.dellink	= ip_tunnel_dellink,
+	.get_size	= ipgre_get_size,
+	.fill_info	= ipgre_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
+};
+
+static struct rtnl_link_ops erspan_link_ops __read_mostly = {	/* rtnetlink ops for link kind "erspan" */
+	.kind		= "erspan",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ipgre_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= erspan_setup,
+	.validate	= erspan_validate,
+	.newlink	= erspan_newlink,
+	.changelink	= erspan_changelink,
+	.dellink	= ip_tunnel_dellink,
+	.get_size	= ipgre_get_size,	/* size helper is shared with gre/gretap */
+	.fill_info	= erspan_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
+};
+/* Create a flow-based (collect_md) gretap device in @net; returns it or an ERR_PTR(). */
+struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
+					u8 name_assign_type)
+{
+	struct nlattr *tb[IFLA_MAX + 1];
+	struct net_device *dev;
+	LIST_HEAD(list_kill);
+	struct ip_tunnel *t;
+	int err;
+
+	memset(&tb, 0, sizeof(tb));	/* every attribute slot stays NULL */
+
+	dev = rtnl_create_link(net, name, name_assign_type,
+			       &ipgre_tap_ops, tb, NULL);
+	if (IS_ERR(dev))
+		return dev;
+
+	/* Configure flow based GRE device. */
+	t = netdev_priv(dev);
+	t->collect_md = true;
+
+	err = ipgre_newlink(net, dev, tb, NULL, NULL);
+	if (err < 0) {
+		free_netdev(dev);	/* NOTE(review): presumably never registered here - confirm */
+		return ERR_PTR(err);
+	}
+
+	/* openvswitch users expect packet sizes to be unrestricted,
+	 * so set the largest MTU we can.
+	 */
+	err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
+	if (err)
+		goto out;
+
+	err = rtnl_configure_link(dev, NULL, 0, NULL);
+	if (err < 0)
+		goto out;
+
+	return dev;
+out:
+	ip_tunnel_dellink(dev, &list_kill);	/* queue dev on list_kill ... */
+	unregister_netdevice_many(&list_kill);	/* ... then unregister the list */
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
+/* pernet .init hook for gretap: ip_tunnel_init_net() with device name "gretap0". */
+static int __net_init ipgre_tap_init_net(struct net *net)
+{
+	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
+}
+/* pernet .exit_batch hook for gretap: pass @list_net to ip_tunnel_delete_nets(). */
+static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
+{
+	ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
+}
+/* Per-netns ops for gretap; .id/.size describe one struct ip_tunnel_net per netns. */
+static struct pernet_operations ipgre_tap_net_ops = {
+	.init = ipgre_tap_init_net,
+	.exit_batch = ipgre_tap_exit_batch_net,
+	.id   = &gre_tap_net_id,
+	.size = sizeof(struct ip_tunnel_net),
+};
+/* pernet .init hook for erspan: ip_tunnel_init_net() with device name "erspan0". */
+static int __net_init erspan_init_net(struct net *net)
+{
+	return ip_tunnel_init_net(net, erspan_net_id,
+				  &erspan_link_ops, "erspan0");
+}
+/* pernet .exit_batch hook for erspan: pass @net_list to ip_tunnel_delete_nets(). */
+static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
+{
+	ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
+}
+/* Per-netns ops for erspan; .id/.size describe one struct ip_tunnel_net per netns. */
+static struct pernet_operations erspan_net_ops = {
+	.init = erspan_init_net,
+	.exit_batch = erspan_exit_batch_net,
+	.id   = &erspan_net_id,
+	.size = sizeof(struct ip_tunnel_net),
+};
+/* Module init: register pernet ops, GRE protocol handler, link ops; unwind on failure. */
+static int __init ipgre_init(void)
+{
+	int err;
+
+	pr_info("GRE over IPv4 tunneling driver\n");
+
+	err = register_pernet_device(&ipgre_net_ops);
+	if (err < 0)
+		return err;
+
+	err = register_pernet_device(&ipgre_tap_net_ops);
+	if (err < 0)
+		goto pnet_tap_failed;
+
+	err = register_pernet_device(&erspan_net_ops);
+	if (err < 0)
+		goto pnet_erspan_failed;
+
+	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
+	if (err < 0) {
+		pr_info("%s: can't add protocol\n", __func__);
+		goto add_proto_failed;
+	}
+
+	err = rtnl_link_register(&ipgre_link_ops);
+	if (err < 0)
+		goto rtnl_link_failed;
+
+	err = rtnl_link_register(&ipgre_tap_ops);
+	if (err < 0)
+		goto tap_ops_failed;
+
+	err = rtnl_link_register(&erspan_link_ops);
+	if (err < 0)
+		goto erspan_link_failed;
+
+	return 0;
+
+erspan_link_failed:	/* each label undoes the steps that succeeded, newest first */
+	rtnl_link_unregister(&ipgre_tap_ops);
+tap_ops_failed:
+	rtnl_link_unregister(&ipgre_link_ops);
+rtnl_link_failed:
+	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+add_proto_failed:
+	unregister_pernet_device(&erspan_net_ops);
+pnet_erspan_failed:
+	unregister_pernet_device(&ipgre_tap_net_ops);
+pnet_tap_failed:
+	unregister_pernet_device(&ipgre_net_ops);
+	return err;
+}
+/* Module exit: undo ipgre_init() in exact reverse order, like its error-unwind ladder. */
+static void __exit ipgre_fini(void)
+{
+	rtnl_link_unregister(&erspan_link_ops);
+	rtnl_link_unregister(&ipgre_tap_ops);
+	rtnl_link_unregister(&ipgre_link_ops);
+	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+	unregister_pernet_device(&erspan_net_ops);
+	unregister_pernet_device(&ipgre_tap_net_ops);
+	unregister_pernet_device(&ipgre_net_ops);
+}
+
+module_init(ipgre_init);
+module_exit(ipgre_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("gre");
+MODULE_ALIAS_RTNL_LINK("gretap");
+MODULE_ALIAS_RTNL_LINK("erspan");
+MODULE_ALIAS_NETDEV("gre0");
+MODULE_ALIAS_NETDEV("gretap0");
+MODULE_ALIAS_NETDEV("erspan0");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
new file mode 100644
index 0000000000..5e9c815665
--- /dev/null
+++ b/net/ipv4/ip_input.c
@@ -0,0 +1,675 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The Internet Protocol (IP) module.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Donald Becker, <becker@super.org>
+ *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ *		Richard Underwood
+ *		Stefan Becker, <stefanb@yello.ping.de>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *
+ * Fixes:
+ *		Alan Cox	:	Commented a couple of minor bits of surplus code
+ *		Alan Cox	:	Undefining IP_FORWARD doesn't include the code
+ *					(just stops a compiler warning).
+ *		Alan Cox	:	Frames with >=MAX_ROUTE record routes, strict routes or loose routes
+ *					are junked rather than corrupting things.
+ *		Alan Cox	:	Frames to bad broadcast subnets are dumped
+ *					We used to process them non broadcast and
+ *					boy could that cause havoc.
+ *		Alan Cox	:	ip_forward sets the free flag on the
+ *					new frame it queues. Still crap because
+ *					it copies the frame but at least it
+ *					doesn't eat memory too.
+ *		Alan Cox	:	Generic queue code and memory fixes.
+ *		Fred Van Kempen :	IP fragment support (borrowed from NET2E)
+ *		Gerhard Koerting:	Forward fragmented frames correctly.
+ *		Gerhard Koerting: 	Fixes to my fix of the above 8-).
+ *		Gerhard Koerting:	IP interface addressing fix.
+ *		Linus Torvalds	:	More robustness checks
+ *		Alan Cox	:	Even more checks: Still not as robust as it ought to be
+ *		Alan Cox	:	Save IP header pointer for later
+ *		Alan Cox	:	ip option setting
+ *		Alan Cox	:	Use ip_tos/ip_ttl settings
+ *		Alan Cox	:	Fragmentation bogosity removed
+ *					(Thanks to Mark.Bush@prg.ox.ac.uk)
+ *		Dmitry Gorodchanin :	Send of a raw packet crash fix.
+ *		Alan Cox	:	Silly ip bug when an overlength
+ *					fragment turns up. Now frees the
+ *					queue.
+ *		Linus Torvalds/ :	Memory leakage on fragmentation
+ *		Alan Cox	:	handling.
+ *		Gerhard Koerting:	Forwarding uses IP priority hints
+ *		Teemu Rantanen	:	Fragment problems.
+ *		Alan Cox	:	General cleanup, comments and reformat
+ *		Alan Cox	:	SNMP statistics
+ *		Alan Cox	:	BSD address rule semantics. Also see
+ *					UDP as there is a nasty checksum issue
+ *					if you do things the wrong way.
+ *		Alan Cox	:	Always defrag, moved IP_FORWARD to the config.in file
+ *		Alan Cox	: 	IP options adjust sk->priority.
+ *		Pedro Roque	:	Fix mtu/length error in ip_forward.
+ *		Alan Cox	:	Avoid ip_chk_addr when possible.
+ *	Richard Underwood	:	IP multicasting.
+ *		Alan Cox	:	Cleaned up multicast handlers.
+ *		Alan Cox	:	RAW sockets demultiplex in the BSD style.
+ *		Gunther Mayer	:	Fix the SNMP reporting typo
+ *		Alan Cox	:	Always in group 224.0.0.1
+ *	Pauline Middelink	:	Fast ip_checksum update when forwarding
+ *					Masquerading support.
+ *		Alan Cox	:	Multicast loopback error for 224.0.0.1
+ *		Alan Cox	:	IP_MULTICAST_LOOP option.
+ *		Alan Cox	:	Use notifiers.
+ *		Bjorn Ekwall	:	Removed ip_csum (from slhc.c too)
+ *		Bjorn Ekwall	:	Moved ip_fast_csum to ip.h (inline!)
+ *		Stefan Becker   :       Send out ICMP HOST REDIRECT
+ *	Arnt Gulbrandsen	:	ip_build_xmit
+ *		Alan Cox	:	Per socket routing cache
+ *		Alan Cox	:	Fixed routing cache, added header cache.
+ *		Alan Cox	:	Loopback didn't work right in original ip_build_xmit - fixed it.
+ *		Alan Cox	:	Only send ICMP_REDIRECT if src/dest are the same net.
+ *		Alan Cox	:	Incoming IP option handling.
+ *		Alan Cox	:	Set saddr on raw output frames as per BSD.
+ *		Alan Cox	:	Stopped broadcast source route explosions.
+ *		Alan Cox	:	Can disable source routing
+ *		Takeshi Sone    :	Masquerading didn't work.
+ *	Dave Bonn,Alan Cox	:	Faster IP forwarding whenever possible.
+ *		Alan Cox	:	Memory leaks, tramples, misc debugging.
+ *		Alan Cox	:	Fixed multicast (by popular demand 8))
+ *		Alan Cox	:	Fixed forwarding (by even more popular demand 8))
+ *		Alan Cox	:	Fixed SNMP statistics [I think]
+ *	Gerhard Koerting	:	IP fragmentation forwarding fix
+ *		Alan Cox	:	Device lock against page fault.
+ *		Alan Cox	:	IP_HDRINCL facility.
+ *	Werner Almesberger	:	Zero fragment bug
+ *		Alan Cox	:	RAW IP frame length bug
+ *		Alan Cox	:	Outgoing firewall on build_xmit
+ *		A.N.Kuznetsov	:	IP_OPTIONS support throughout the kernel
+ *		Alan Cox	:	Multicast routing hooks
+ *		Jos Vos		:	Do accounting *before* call_in_firewall
+ *	Willy Konynenberg	:	Transparent proxying support
+ *
+ * To Fix:
+ *		IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
+ *		and could be made very efficient with the addition of some virtual memory hacks to permit
+ *		the allocation of a buffer that can then be 'grown' by twiddling page tables.
+ *		Output fragmentation wants updating along with the buffer management to use a single
+ *		interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
+ *		output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
+ *		fragmentation anyway.
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/indirect_call_wrapper.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+#include <net/inet_ecn.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/xfrm.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+#include <net/dst_metadata.h>
+
+/* Process the Router Alert IP option (RFC 2113): pass @skb (cloned as needed) to
+ * each matching socket on net->ipv4.ra_chain.  Returns false if none matched.
+ */
+bool ip_call_ra_chain(struct sk_buff *skb)
+{
+	struct ip_ra_chain *ra;
+	u8 protocol = ip_hdr(skb)->protocol;
+	struct sock *last = NULL;
+	struct net_device *dev = skb->dev;
+	struct net *net = dev_net(dev);
+
+	for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) {
+		struct sock *sk = ra->sk;
+
+		/* If socket is bound to an interface, only report
+		 * the packet if it came from that interface.
+		 */
+		if (sk && inet_sk(sk)->inet_num == protocol &&
+		    (!sk->sk_bound_dev_if ||
+		     sk->sk_bound_dev_if == dev->ifindex)) {
+			if (ip_is_fragment(ip_hdr(skb))) {
+				if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
+					return true;
+			}
+			if (last) {
+				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+				if (skb2)
+					raw_rcv(last, skb2);
+			}
+			last = sk;
+		}
+	}
+
+	if (last) {
+		raw_rcv(last, skb);
+		return true;
+	}
+	return false;
+}
+/* ip_protocol_deliver_rcu(): hand @skb to raw sockets and to the inet_protos[] handler. */
+INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *));
+INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *));
+void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
+{
+	const struct net_protocol *ipprot;
+	int raw, ret;
+
+resubmit:
+	raw = raw_local_deliver(skb, protocol);
+
+	ipprot = rcu_dereference(inet_protos[protocol]);
+	if (ipprot) {
+		if (!ipprot->no_policy) {
+			if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+				kfree_skb_reason(skb,
+						 SKB_DROP_REASON_XFRM_POLICY);
+				return;
+			}
+			nf_reset_ct(skb);
+		}
+		ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
+				      skb);
+		if (ret < 0) {
+			protocol = -ret;	/* negative return: deliver again as protocol -ret */
+			goto resubmit;
+		}
+		__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+	} else {
+		if (!raw) {
+			if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+				__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+				icmp_send(skb, ICMP_DEST_UNREACH,
+					  ICMP_PROT_UNREACH, 0);
+			}
+			kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO);
+		} else {
+			__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+			consume_skb(skb);
+		}
+	}
+}
+/* NF_INET_LOCAL_IN okfn: pull the IP header, then dispatch on iph->protocol under RCU. */
+static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	skb_clear_delivery_time(skb);
+	__skb_pull(skb, skb_network_header_len(skb));
+
+	rcu_read_lock();
+	ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/* Deliver IP packets to the higher protocol layers: reassemble fragments first,
+ * then run the NF_INET_LOCAL_IN hook with ip_local_deliver_finish() as okfn.
+ */
+int ip_local_deliver(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+
+	/*
+	 *	Reassemble IP fragments.  A non-zero ip_defrag() result ends
+	 *	processing of this skb here.
+	 */
+	if (ip_is_fragment(ip_hdr(skb)) &&
+	    ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
+		return 0;
+
+	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
+		       net, NULL, skb, skb->dev, NULL,
+		       ip_local_deliver_finish);
+}
+EXPORT_SYMBOL(ip_local_deliver);
+
+/* Parse the options of a received IPv4 header into IPCB(skb)->opt and apply the
+ * source-route policy.  Returns true if @skb must be dropped, false otherwise.
+ */
+static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	const struct iphdr *iph;
+	struct ip_options *opt;
+
+	/* It looks like overkill, because not all
+	 * IP options require packet mangling.
+	 * But it is the easiest for now, especially taking
+	 * into account that the combination of IP options
+	 * and a running sniffer is an extremely rare condition.
+	 *					      --ANK (980813)
+	 */
+	if (skb_cow(skb, skb_headroom(skb))) {
+		__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
+		return true;
+	}
+
+	iph = ip_hdr(skb);
+	opt = &(IPCB(skb)->opt);
+	opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
+
+	if (ip_options_compile(net, opt, skb)) {
+		__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+		return true;
+	}
+
+	if (unlikely(opt->srr)) {
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+		/* Source routing is not accepted on this device: drop. */
+		if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev)) {
+			if (IN_DEV_LOG_MARTIANS(in_dev))
+				net_info_ratelimited("source route option %pI4 -> %pI4\n",
+						     &iph->saddr, &iph->daddr);
+			return true;
+		}
+
+		if (ip_options_rcv_srr(skb, dev))
+			return true;
+	}
+
+	return false;
+}
+/* True if @hint exists, @skb has no dst yet, and both carry the same daddr and tos. */
+static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
+			    const struct sk_buff *hint)
+{
+	return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr &&
+	       ip_hdr(hint)->tos == iph->tos;
+}
+/* ip_rcv_finish_core(): route @skb, parse IP options, update counters; frees @skb on drop. */
+int tcp_v4_early_demux(struct sk_buff *skb);
+int udp_v4_early_demux(struct sk_buff *skb);
+static int ip_rcv_finish_core(struct net *net, struct sock *sk,
+			      struct sk_buff *skb, struct net_device *dev,
+			      const struct sk_buff *hint)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	int err, drop_reason;
+	struct rtable *rt;
+
+	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+
+	if (ip_can_use_hint(skb, iph, hint)) {
+		err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
+					dev, hint);
+		if (unlikely(err))
+			goto drop_error;
+	}
+
+	if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
+	    !skb_dst(skb) &&
+	    !skb->sk &&
+	    !ip_is_fragment(iph)) {
+		switch (iph->protocol) {
+		case IPPROTO_TCP:
+			if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) {
+				tcp_v4_early_demux(skb);
+
+				/* must reload iph, skb->head might have changed */
+				iph = ip_hdr(skb);
+			}
+			break;
+		case IPPROTO_UDP:
+			if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
+				err = udp_v4_early_demux(skb);
+				if (unlikely(err))
+					goto drop_error;
+
+				/* must reload iph, skb->head might have changed */
+				iph = ip_hdr(skb);
+			}
+			break;
+		}
+	}
+
+	/*
+	 *	Initialise the virtual path cache for the packet. It describes
+	 *	how the packet travels inside Linux networking.
+	 */
+	if (!skb_valid_dst(skb)) {
+		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					   iph->tos, dev);
+		if (unlikely(err))
+			goto drop_error;
+	} else {
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+		if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY))
+			IPCB(skb)->flags |= IPSKB_NOPOLICY;
+	}
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (unlikely(skb_dst(skb)->tclassid)) {
+		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
+		u32 idx = skb_dst(skb)->tclassid;
+		st[idx&0xFF].o_packets++;
+		st[idx&0xFF].o_bytes += skb->len;
+		st[(idx>>16)&0xFF].i_packets++;
+		st[(idx>>16)&0xFF].i_bytes += skb->len;
+	}
+#endif
+
+	if (iph->ihl > 5 && ip_rcv_options(skb, dev))	/* ihl > 5: header carries options */
+		goto drop;
+
+	rt = skb_rtable(skb);
+	if (rt->rt_type == RTN_MULTICAST) {
+		__IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
+	} else if (rt->rt_type == RTN_BROADCAST) {
+		__IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
+	} else if (skb->pkt_type == PACKET_BROADCAST ||
+		   skb->pkt_type == PACKET_MULTICAST) {
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+		/* RFC 1122 3.3.6:
+		 *
+		 *   When a host sends a datagram to a link-layer broadcast
+		 *   address, the IP destination address MUST be a legal IP
+		 *   broadcast or IP multicast address.
+		 *
+		 *   A host SHOULD silently discard a datagram that is received
+		 *   via a link-layer broadcast (see Section 2.4) but does not
+		 *   specify an IP multicast or broadcast destination address.
+		 *
+		 * This doesn't explicitly say L2 *broadcast*, but broadcast is
+		 * in a way a form of multicast and the most common use case for
+		 * this is 802.11 protecting against cross-station spoofing (the
+		 * so-called "hole-196" attack) so do it for both.
+		 */
+		if (in_dev &&
+		    IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) {
+			drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST;
+			goto drop;
+		}
+	}
+
+	return NET_RX_SUCCESS;
+
+drop:
+	kfree_skb_reason(skb, drop_reason);
+	return NET_RX_DROP;
+
+drop_error:
+	if (err == -EXDEV) {	/* -EXDEV is accounted as a reverse-path-filter drop */
+		drop_reason = SKB_DROP_REASON_IP_RPFILTER;
+		__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
+	}
+	goto drop;
+}
+
+/* NF_INET_PRE_ROUTING okfn for ip_rcv(): route the packet, then pass it to dst_input(). */
+static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct net_device *ingress = skb->dev;
+
+	/* When the ingress device is enslaved to an L3 master device, its
+	 * handler sees the skb first; NULL leaves nothing to process here.
+	 */
+	skb = l3mdev_ip_rcv(skb);
+	if (!skb)
+		return NET_RX_SUCCESS;
+
+	if (ip_rcv_finish_core(net, sk, skb, ingress, NULL) == NET_RX_DROP)
+		return NET_RX_DROP;
+
+	return dst_input(skb);
+}
+
+/* Main IP receive routine: validate the IPv4 header of @skb, trim padding and reset
+ * the control block.  Returns the skb to continue with, or NULL if it was dropped.
+ */
+static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
+{
+	const struct iphdr *iph;
+	int drop_reason;
+	u32 len;
+
+	/* When the interface is in promisc. mode, drop all the crap
+	 * that it receives, do not try to analyse it.
+	 */
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		dev_core_stats_rx_otherhost_dropped_inc(skb->dev);
+		drop_reason = SKB_DROP_REASON_OTHERHOST;
+		goto drop;
+	}
+
+	__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb) {
+		__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
+		goto out;	/* skb is NULL here, so skip the free at drop: */
+	}
+
+	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto inhdr_error;
+
+	iph = ip_hdr(skb);
+
+	/*
+	 *	RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
+	 *
+	 *	Is the datagram acceptable?
+	 *
+	 *	1.	Length at least the size of an ip header
+	 *	2.	Version of 4
+	 *	3.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
+	 *	4.	Doesn't have a bogus length
+	 */
+
+	if (iph->ihl < 5 || iph->version != 4)
+		goto inhdr_error;
+
+	BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
+	BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
+	BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
+	__IP_ADD_STATS(net,
+		       IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
+		       max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		goto inhdr_error;
+
+	iph = ip_hdr(skb);
+
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		goto csum_error;
+
+	len = iph_totlen(skb, iph);
+	if (skb->len < len) {
+		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
+		__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
+		goto drop;
+	} else if (len < (iph->ihl*4))
+		goto inhdr_error;
+
+	/* Our transport medium may have padded the buffer out. Now we know it
+	 * is IP we can trim to the true length of the frame.
+	 * Note this now means skb->len holds ntohs(iph->tot_len).
+	 */
+	if (pskb_trim_rcsum(skb, len)) {
+		__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	iph = ip_hdr(skb);
+	skb->transport_header = skb->network_header + iph->ihl*4;
+
+	/* Remove any debris in the socket control block */
+	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+	IPCB(skb)->iif = skb->skb_iif;
+
+	/* Must drop socket now because of tproxy. */
+	if (!skb_sk_is_prefetched(skb))
+		skb_orphan(skb);
+
+	return skb;
+
+csum_error:	/* falls through into inhdr_error, so INHDRERRORS is bumped as well */
+	drop_reason = SKB_DROP_REASON_IP_CSUM;
+	__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
+inhdr_error:
+	if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED)
+		drop_reason = SKB_DROP_REASON_IP_INHDR;
+	__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+drop:
+	kfree_skb_reason(skb, drop_reason);
+out:
+	return NULL;
+}
+
+/* IP receive entry point for a single packet: sanity-check it in ip_rcv_core(),
+ * then run the NF_INET_PRE_ROUTING hook with ip_rcv_finish() as okfn.
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
+	   struct net_device *orig_dev)
+{
+	struct net *net = dev_net(dev);
+	struct sk_buff *checked = ip_rcv_core(skb, net);
+
+	if (!checked)
+		return NET_RX_DROP;
+
+	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+		       net, NULL, checked, dev, NULL,
+		       ip_rcv_finish);
+}
+/* Unlink every skb on @head and pass it to dst_input(); return values are ignored. */
+static void ip_sublist_rcv_finish(struct list_head *head)
+{
+	struct sk_buff *skb, *next;
+
+	list_for_each_entry_safe(skb, next, head, list) {
+		skb_list_del_init(skb);
+		dst_input(skb);
+	}
+}
+/* Pick @skb as the routing hint for the following packets, or NULL to disable hinting. */
+static struct sk_buff *ip_extract_route_hint(const struct net *net,
+					     struct sk_buff *skb, int rt_type)
+{
+	bool no_hint = fib4_has_custom_rules(net) ||
+		       rt_type == RTN_BROADCAST ||
+		       (IPCB(skb)->flags & IPSKB_MULTIPATH);
+
+	return no_hint ? NULL : skb;
+}
+/* Route each skb on @head, then feed dst_input() in runs of skbs sharing one dst. */
+static void ip_list_rcv_finish(struct net *net, struct sock *sk,
+			       struct list_head *head)
+{
+	struct sk_buff *skb, *next, *hint = NULL;
+	struct dst_entry *curr_dst = NULL;
+	struct list_head sublist;
+
+	INIT_LIST_HEAD(&sublist);
+	list_for_each_entry_safe(skb, next, head, list) {
+		struct net_device *dev = skb->dev;
+		struct dst_entry *dst;
+
+		skb_list_del_init(skb);
+		/* if ingress device is enslaved to an L3 master device pass the
+		 * skb to its handler for processing
+		 */
+		skb = l3mdev_ip_rcv(skb);
+		if (!skb)
+			continue;
+		if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP)
+			continue;
+
+		dst = skb_dst(skb);
+		if (curr_dst != dst) {
+			hint = ip_extract_route_hint(net, skb,
+					       ((struct rtable *)dst)->rt_type);
+
+			/* dispatch old sublist */
+			if (!list_empty(&sublist))
+				ip_sublist_rcv_finish(&sublist);
+			/* start new sublist */
+			INIT_LIST_HEAD(&sublist);
+			curr_dst = dst;
+		}
+		list_add_tail(&skb->list, &sublist);
+	}
+	/* dispatch final sublist */
+	ip_sublist_rcv_finish(&sublist);
+}
+/* Run NF_INET_PRE_ROUTING over the skbs on @head, then ip_list_rcv_finish() on @head. */
+static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
+			   struct net *net)
+{
+	NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
+		     head, dev, NULL, ip_rcv_finish);
+	ip_list_rcv_finish(net, NULL, head);
+}
+
+/* Receive a list of IP packets, batching consecutive skbs that share dev and netns */
+void ip_list_rcv(struct list_head *head, struct packet_type *pt,
+		 struct net_device *orig_dev)
+{
+	struct net_device *curr_dev = NULL;
+	struct net *curr_net = NULL;
+	struct sk_buff *skb, *next;
+	struct list_head sublist;
+
+	INIT_LIST_HEAD(&sublist);
+	list_for_each_entry_safe(skb, next, head, list) {
+		struct net_device *dev = skb->dev;
+		struct net *net = dev_net(dev);
+
+		skb_list_del_init(skb);
+		skb = ip_rcv_core(skb, net);
+		if (skb == NULL)
+			continue;
+
+		if (curr_dev != dev || curr_net != net) {
+			/* dispatch old sublist */
+			if (!list_empty(&sublist))
+				ip_sublist_rcv(&sublist, curr_dev, curr_net);
+			/* start new sublist */
+			INIT_LIST_HEAD(&sublist);
+			curr_dev = dev;
+			curr_net = net;
+		}
+		list_add_tail(&skb->list, &sublist);
+	}
+	/* dispatch final sublist */
+	if (!list_empty(&sublist))
+		ip_sublist_rcv(&sublist, curr_dev, curr_net);
+}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
new file mode 100644
index 0000000000..a9e22a0988
--- /dev/null
+++ b/net/ipv4/ip_options.c
@@ -0,0 +1,641 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The options processing module for ip.c
+ *
+ * Authors:	A.N.Kuznetsov
+ *
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/cipso_ipv4.h>
+#include <net/ip_fib.h>
+
+/*
+ * Write options to the IP header, record the destination address in the
+ * source route option, and fill in the address of the outgoing interface
+ * (we should already know it, so this function may only be called after
+ * the routing decision) and the timestamp, if we originate this datagram.
+ *
+ * daddr is the real destination address; the next hop is recorded in the
+ * IP header.
+ * saddr is the address of the outgoing interface.
+ */
+
+void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
+		      __be32 daddr, struct rtable *rt)
+{
+	unsigned char *iph = skb_network_header(skb);
+
+	memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
+	memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen);
+	opt = &(IPCB(skb)->opt);	/* from here on use the copy kept in the skb cb */
+
+	if (opt->srr)
+		memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4);
+
+	if (opt->rr_needaddr)
+		ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
+	if (opt->ts_needaddr)
+		ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
+	if (opt->ts_needtime) {
+		__be32 midtime;
+
+		midtime = inet_current_timestamp();
+		memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
+	}
+}
+
+/*
+ * Given received options described by (sopt, skb), build in dopt a
+ * compiled option set appropriate for answering,
+ * i.e. invert the SRR option, copy the others,
+ * and reserve room in the RR/TS options.
+ *
+ * NOTE: dopt cannot point to skb.  Returns 0 on success or -EINVAL.
+ */
+
+int __ip_options_echo(struct net *net, struct ip_options *dopt,
+		      struct sk_buff *skb, const struct ip_options *sopt)
+{
+	unsigned char *sptr, *dptr;
+	int soffset, doffset;
+	int	optlen;
+
+	memset(dopt, 0, sizeof(struct ip_options));
+
+	if (sopt->optlen == 0)
+		return 0;
+
+	sptr = skb_network_header(skb);
+	dptr = dopt->__data;
+
+	if (sopt->rr) {
+		optlen  = sptr[sopt->rr+1];
+		soffset = sptr[sopt->rr+2];
+		dopt->rr = dopt->optlen + sizeof(struct iphdr);
+		memcpy(dptr, sptr+sopt->rr, optlen);
+		if (sopt->rr_needaddr && soffset <= optlen) {
+			if (soffset + 3 > optlen)
+				return -EINVAL;
+			dptr[2] = soffset + 4;
+			dopt->rr_needaddr = 1;
+		}
+		dptr += optlen;
+		dopt->optlen += optlen;
+	}
+	if (sopt->ts) {
+		optlen = sptr[sopt->ts+1];
+		soffset = sptr[sopt->ts+2];
+		dopt->ts = dopt->optlen + sizeof(struct iphdr);
+		memcpy(dptr, sptr+sopt->ts, optlen);
+		if (soffset <= optlen) {
+			if (sopt->ts_needaddr) {
+				if (soffset + 3 > optlen)
+					return -EINVAL;
+				dopt->ts_needaddr = 1;
+				soffset += 4;
+			}
+			if (sopt->ts_needtime) {
+				if (soffset + 3 > optlen)
+					return -EINVAL;
+				if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+					dopt->ts_needtime = 1;
+					soffset += 4;
+				} else {
+					dopt->ts_needtime = 0;
+
+					if (soffset + 7 <= optlen) {
+						__be32 addr;
+
+						memcpy(&addr, dptr+soffset-1, 4);
+						if (inet_addr_type(net, addr) != RTN_UNICAST) {
+							dopt->ts_needtime = 1;
+							soffset += 8;
+						}
+					}
+				}
+			}
+			dptr[2] = soffset;
+		}
+		dptr += optlen;
+		dopt->optlen += optlen;
+	}
+	if (sopt->srr) {
+		unsigned char *start = sptr+sopt->srr;
+		__be32 faddr;
+
+		optlen  = start[1];
+		soffset = start[2];
+		doffset = 0;
+		if (soffset > optlen)
+			soffset = optlen + 1;
+		soffset -= 4;
+		if (soffset > 3) {
+			memcpy(&faddr, &start[soffset-1], 4);
+			for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4)
+				memcpy(&dptr[doffset-1], &start[soffset-1], 4);
+			/*
+			 * RFC1812 requires to fix illegal source routes.
+			 */
+			if (memcmp(&ip_hdr(skb)->saddr,
+				   &start[soffset + 3], 4) == 0)
+				doffset -= 4;
+		}
+		if (doffset > 3) {
+			dopt->faddr = faddr;
+			dptr[0] = start[0];
+			dptr[1] = doffset+3;
+			dptr[2] = 4;
+			dptr += doffset+3;
+			dopt->srr = dopt->optlen + sizeof(struct iphdr);
+			dopt->optlen += doffset+3;
+			dopt->is_strictroute = sopt->is_strictroute;
+		}
+	}
+	if (sopt->cipso) {
+		optlen  = sptr[sopt->cipso+1];
+		dopt->cipso = dopt->optlen+sizeof(struct iphdr);
+		memcpy(dptr, sptr+sopt->cipso, optlen);
+		dptr += optlen;
+		dopt->optlen += optlen;
+	}
+	while (dopt->optlen & 3) {	/* pad with IPOPT_END to a multiple of 4 bytes */
+		*dptr++ = IPOPT_END;
+		dopt->optlen++;
+	}
+	return 0;
+}
+
+/*
+ *	Options "fragmenting": overwrite the options that are not marked as
+ *	copied, i.e. not allowed in fragments, with NOOPs.
+ *	Simple and stupid 8), but the most efficient way.
+ */
+
+void ip_options_fragment(struct sk_buff *skb)
+{
+	unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
+	struct ip_options *opt = &(IPCB(skb)->opt);
+	int remaining = opt->optlen;
+	int optlen;
+
+	for (; remaining > 0; remaining -= optlen, optptr += optlen) {
+		if (*optptr == IPOPT_END)
+			return;
+		if (*optptr == IPOPT_NOOP) {
+			/* one-byte option: no length field to read */
+			optlen = 1;
+			continue;
+		}
+
+		optlen = optptr[1];
+		if (optlen < 2 || optlen > remaining)
+			return;
+		if (!IPOPT_COPIED(*optptr))
+			memset(optptr, IPOPT_NOOP, optlen);
+	}
+
+	/* Only reached after walking the whole option area. */
+	opt->ts = 0;
+	opt->rr = 0;
+	opt->rr_needaddr = 0;
+	opt->ts_needaddr = 0;
+	opt->ts_needtime = 0;
+}
+
+/* Helper for ip_options_compile(): fill in *spec_dst on first use only. */
+static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
+{
+	/* Anything other than INADDR_ANY means it was already computed. */
+	if (*spec_dst != htonl(INADDR_ANY))
+		return;
+	*spec_dst = fib_compute_spec_dst(skb);
+}
+
+/*
+ * Verify the IP options and record their header offsets in *opt.
+ * With an skb they are read from its IP header, otherwise from opt->__data.
+ * Returns 0, or -EINVAL with the offending offset stored in *info (if set).
+ */
+
+int __ip_options_compile(struct net *net,
+			 struct ip_options *opt, struct sk_buff *skb,
+			 __be32 *info)
+{
+	__be32 spec_dst = htonl(INADDR_ANY);
+	unsigned char *pp_ptr = NULL;
+	struct rtable *rt = NULL;
+	unsigned char *optptr;
+	unsigned char *iph;
+	int optlen, l;
+
+	if (skb) {
+		rt = skb_rtable(skb);
+		optptr = (unsigned char *)&(ip_hdr(skb)[1]);
+	} else
+		optptr = opt->__data;
+	iph = optptr - sizeof(struct iphdr);	/* base for all recorded offsets */
+
+	for (l = opt->optlen; l > 0; ) {	/* opt is dereferenced unconditionally */
+		switch (*optptr) {
+		case IPOPT_END:
+			for (optptr++, l--; l > 0; optptr++, l--) {	/* force END up to the end */
+				if (*optptr != IPOPT_END) {
+					*optptr = IPOPT_END;
+					opt->is_changed = 1;
+				}
+			}
+			goto eol;
+		case IPOPT_NOOP:
+			l--;
+			optptr++;
+			continue;
+		}
+		if (unlikely(l < 2)) {	/* no room left for a length byte */
+			pp_ptr = optptr;
+			goto error;
+		}
+		optlen = optptr[1];
+		if (optlen < 2 || optlen > l) {
+			pp_ptr = optptr;
+			goto error;
+		}
+		switch (*optptr) {
+		case IPOPT_SSRR:
+		case IPOPT_LSRR:
+			if (optlen < 3) {
+				pp_ptr = optptr + 1;
+				goto error;
+			}
+			if (optptr[2] < 4) {
+				pp_ptr = optptr + 2;
+				goto error;
+			}
+			/* NB: cf RFC-1812 5.2.4.1 */
+			if (opt->srr) {	/* a second source-route option is rejected */
+				pp_ptr = optptr;
+				goto error;
+			}
+			if (!skb) {	/* options supplied without a packet */
+				if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) {
+					pp_ptr = optptr + 1;
+					goto error;
+				}
+				memcpy(&opt->faddr, &optptr[3], 4);	/* first hop moves to faddr */
+				if (optlen > 7)
+					memmove(&optptr[3], &optptr[7], optlen-7);	/* close the gap */
+			}
+			opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
+			opt->srr = optptr - iph;
+			break;
+		case IPOPT_RR:
+			if (opt->rr) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			if (optlen < 3) {
+				pp_ptr = optptr + 1;
+				goto error;
+			}
+			if (optptr[2] < 4) {
+				pp_ptr = optptr + 2;
+				goto error;
+			}
+			if (optptr[2] <= optlen) {	/* pointer still inside the option */
+				if (optptr[2]+3 > optlen) {
+					pp_ptr = optptr + 2;
+					goto error;
+				}
+				if (rt) {
+					spec_dst_fill(&spec_dst, skb);
+					memcpy(&optptr[optptr[2]-1], &spec_dst, 4);	/* slot the pointer designates */
+					opt->is_changed = 1;
+				}
+				optptr[2] += 4;	/* advance the pointer past that slot */
+				opt->rr_needaddr = 1;
+			}
+			opt->rr = optptr - iph;
+			break;
+		case IPOPT_TIMESTAMP:
+			if (opt->ts) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			if (optlen < 4) {
+				pp_ptr = optptr + 1;
+				goto error;
+			}
+			if (optptr[2] < 5) {
+				pp_ptr = optptr + 2;
+				goto error;
+			}
+			if (optptr[2] <= optlen) {
+				unsigned char *timeptr = NULL;	/* where to write the timestamp, if anywhere */
+				if (optptr[2]+3 > optlen) {
+					pp_ptr = optptr + 2;
+					goto error;
+				}
+				switch (optptr[3]&0xF) {	/* low nibble selects the timestamp flavour */
+				case IPOPT_TS_TSONLY:
+					if (skb)
+						timeptr = &optptr[optptr[2]-1];
+					opt->ts_needtime = 1;
+					optptr[2] += 4;
+					break;
+				case IPOPT_TS_TSANDADDR:
+					if (optptr[2]+7 > optlen) {
+						pp_ptr = optptr + 2;
+						goto error;
+					}
+					if (rt)  {
+						spec_dst_fill(&spec_dst, skb);
+						memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
+						timeptr = &optptr[optptr[2]+3];	/* time follows the 4-byte address */
+					}
+					opt->ts_needaddr = 1;
+					opt->ts_needtime = 1;
+					optptr[2] += 8;
+					break;
+				case IPOPT_TS_PRESPEC:
+					if (optptr[2]+7 > optlen) {
+						pp_ptr = optptr + 2;
+						goto error;
+					}
+					{
+						__be32 addr;
+						memcpy(&addr, &optptr[optptr[2]-1], 4);
+						if (inet_addr_type(net, addr) == RTN_UNICAST)	/* entry left untouched */
+							break;
+						if (skb)
+							timeptr = &optptr[optptr[2]+3];
+					}
+					opt->ts_needtime = 1;
+					optptr[2] += 8;
+					break;
+				default:
+					if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
+						pp_ptr = optptr + 3;
+						goto error;
+					}
+					break;
+				}
+				if (timeptr) {
+					__be32 midtime;
+
+					midtime = inet_current_timestamp();
+					memcpy(timeptr, &midtime, 4);
+					opt->is_changed = 1;
+				}
+			} else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {	/* pointer beyond the option */
+				unsigned int overflow = optptr[3]>>4;	/* upper nibble, incremented below */
+				if (overflow == 15) {
+					pp_ptr = optptr + 3;
+					goto error;
+				}
+				if (skb) {
+					optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
+					opt->is_changed = 1;
+				}
+			}
+			opt->ts = optptr - iph;
+			break;
+		case IPOPT_RA:
+			if (optlen < 4) {
+				pp_ptr = optptr + 1;
+				goto error;
+			}
+			if (optptr[2] == 0 && optptr[3] == 0)	/* only an all-zero value is recorded */
+				opt->router_alert = optptr - iph;
+			break;
+		case IPOPT_CIPSO:
+			if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			opt->cipso = optptr - iph;
+			if (cipso_v4_validate(skb, &optptr)) {	/* optptr is passed by address and reused as pp_ptr */
+				pp_ptr = optptr;
+				goto error;
+			}
+			break;
+		case IPOPT_SEC:
+		case IPOPT_SID:
+		default:
+			if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			break;
+		}
+		l -= optlen;
+		optptr += optlen;
+	}
+
+eol:
+	if (!pp_ptr)	/* pp_ptr is set only on paths that jump straight to error */
+		return 0;
+
+error:
+	if (info)
+		*info = htonl((pp_ptr-iph)<<24);	/* offending offset, shifted into the top byte */
+	return -EINVAL;
+}
+EXPORT_SYMBOL(__ip_options_compile);
+
+int ip_options_compile(struct net *net,
+		       struct ip_options *opt, struct sk_buff *skb)
+{
+	__be32 bad_off;
+	const int rc = __ip_options_compile(net, opt, skb, &bad_off);
+
+	/* A failure is reported via ICMP only when an skb was supplied. */
+	if (skb && rc)
+		icmp_send(skb, ICMP_PARAMETERPROB, 0, bad_off);
+	return rc;
+}
+EXPORT_SYMBOL(ip_options_compile);
+
+/*
+ *	Undo all the changes done by ip_options_compile().
+ */
+
+void ip_options_undo(struct ip_options *opt)
+{
+	if (opt->srr) {
+		unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr);	/* offsets are header-relative */
+
+		memmove(optptr + 7, optptr + 3, optptr[1] - 7);	/* reopen the first address slot */
+		memcpy(optptr + 3, &opt->faddr, 4);	/* and put the saved first hop back */
+	}
+	if (opt->rr_needaddr) {
+		unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr);
+
+		optptr[2] -= 4;	/* step the pointer back one address */
+		memset(&optptr[optptr[2] - 1], 0, 4);	/* and blank the slot it now designates */
+	}
+	if (opt->ts) {
+		unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr);
+
+		if (opt->ts_needtime) {
+			optptr[2] -= 4;
+			memset(&optptr[optptr[2] - 1], 0, 4);
+			if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC)	/* skip back over the address as well */
+				optptr[2] -= 4;
+		}
+		if (opt->ts_needaddr) {
+			optptr[2] -= 4;
+			memset(&optptr[optptr[2] - 1], 0, 4);
+		}
+	}
+}
+
+int ip_options_get(struct net *net, struct ip_options_rcu **optp,
+		   sockptr_t data, int optlen)
+{
+	struct ip_options_rcu *fresh;
+	int err = -ENOMEM;
+
+	fresh = kzalloc(sizeof(*fresh) + ((optlen + 3) & ~3), GFP_KERNEL);
+	if (!fresh)
+		return err;
+	err = -EFAULT;
+	if (optlen && copy_from_sockptr(fresh->opt.__data, data, optlen))
+		goto discard;
+	for (; optlen & 3; optlen++)	/* pad to a 4-byte multiple */
+		fresh->opt.__data[optlen] = IPOPT_END;
+	fresh->opt.optlen = optlen;
+	err = -EINVAL;
+	if (optlen && ip_options_compile(net, &fresh->opt, NULL))
+		goto discard;
+	kfree(*optp);	/* the new block replaces whatever *optp held */
+	*optp = fresh;
+	return 0;
+discard:
+	kfree(fresh);
+	return err;
+}
+
+void ip_forward_options(struct sk_buff *skb)
+{	/* Update the RR, SRR and TS options in skb's IP header; re-checksum if anything changed. */
+	struct   ip_options *opt	= &(IPCB(skb)->opt);
+	unsigned char *optptr;
+	struct rtable *rt = skb_rtable(skb);
+	unsigned char *raw = skb_network_header(skb);
+
+	if (opt->rr_needaddr) {
+		optptr = (unsigned char *)raw + opt->rr;
+		ip_rt_get_source(&optptr[optptr[2]-5], skb, rt);	/* 4-byte slot just before the pointer */
+		opt->is_changed = 1;
+	}
+	if (opt->srr_is_hit) {
+		int srrptr, srrspace;
+
+		optptr = raw + opt->srr;
+
+		for ( srrptr = optptr[2], srrspace = optptr[1];
+		     srrptr <= srrspace;
+		     srrptr += 4
+		     ) {
+			if (srrptr + 3 > srrspace)
+				break;
+			if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0)	/* entry equal to the chosen next hop */
+				break;
+		}
+		if (srrptr + 3 <= srrspace) {
+			opt->is_changed = 1;
+			ip_hdr(skb)->daddr = opt->nexthop;
+			ip_rt_get_source(&optptr[srrptr-1], skb, rt);	/* overwrites the matched entry */
+			optptr[2] = srrptr+4;
+		} else {
+			net_crit_ratelimited("%s(): Argh! Destination lost!\n",
+					     __func__);
+		}
+		if (opt->ts_needaddr) {	/* NOTE(review): handled only inside the srr_is_hit branch */
+			optptr = raw + opt->ts;
+			ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
+			opt->is_changed = 1;
+		}
+	}
+	if (opt->is_changed) {
+		opt->is_changed = 0;
+		ip_send_check(ip_hdr(skb));
+	}
+}
+
+int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev)
+{	/* Pick and route the next hop of a received source-route option; 0 or -EINVAL. */
+	struct ip_options *opt = &(IPCB(skb)->opt);
+	int srrspace, srrptr;
+	__be32 nexthop;
+	struct iphdr *iph = ip_hdr(skb);
+	unsigned char *optptr = skb_network_header(skb) + opt->srr;
+	struct rtable *rt = skb_rtable(skb);
+	struct rtable *rt2;
+	unsigned long orefdst;
+	int err;
+
+	if (!rt)
+		return 0;
+
+	if (skb->pkt_type != PACKET_HOST)
+		return -EINVAL;
+	if (rt->rt_type == RTN_UNICAST) {
+		if (!opt->is_strictroute)
+			return 0;
+		icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));	/* NOTE(review): 16 looks like the daddr offset - confirm */
+		return -EINVAL;
+	}
+	if (rt->rt_type != RTN_LOCAL)
+		return -EINVAL;
+
+	for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
+		if (srrptr + 3 > srrspace) {	/* truncated address entry */
+			icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
+			return -EINVAL;
+		}
+		memcpy(&nexthop, &optptr[srrptr-1], 4);
+
+		orefdst = skb->_skb_refdst;	/* keep the current dst aside while routing the candidate */
+		skb_dst_set(skb, NULL);
+		err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, dev);
+		rt2 = skb_rtable(skb);
+		if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
+			skb_dst_drop(skb);
+			skb->_skb_refdst = orefdst;	/* put the original dst back */
+			return -EINVAL;
+		}
+		refdst_drop(orefdst);	/* new route accepted, old reference released */
+		if (rt2->rt_type != RTN_LOCAL)
+			break;
+		/* Superfast 8) loopback forward */
+		iph->daddr = nexthop;
+		opt->is_changed = 1;
+	}
+	if (srrptr <= srrspace) {	/* loop left through break: a non-local next hop was found */
+		opt->srr_is_hit = 1;
+		opt->nexthop = nexthop;
+		opt->is_changed = 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
new file mode 100644
index 0000000000..4ab877cf6d
--- /dev/null
+++ b/net/ipv4/ip_output.c
@@ -0,0 +1,1663 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The Internet Protocol (IP) output module.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Donald Becker, <becker@super.org>
+ *		Alan Cox, <Alan.Cox@linux.org>
+ *		Richard Underwood
+ *		Stefan Becker, <stefanb@yello.ping.de>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ *	See ip_input.c for original log
+ *
+ *	Fixes:
+ *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
+ *		Mike Kilburn	:	htons() missing in ip_build_xmit.
+ *		Bradford Johnson:	Fix faulty handling of some frames when
+ *					no route is found.
+ *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
+ *					(in case if packet not accepted by
+ *					output firewall rules)
+ *		Mike McLagan	:	Routing by source
+ *		Alexey Kuznetsov:	use new route cache
+ *		Andi Kleen:		Fix broken PMTU recovery and remove
+ *					some redundant tests.
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
+ *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
+ *		Andi Kleen	:	Split fast and slow ip_build_xmit path
+ *					for decreased register pressure on x86
+ *					and more readability.
+ *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
+ *					silently drop skb instead of failing with -EPERM.
+ *		Detlev Wengorz	:	Copy protocol for fragments.
+ *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
+ *					datagrams.
+ *		Hirokazu Takahashi:	sendfile() on UDP works now.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/checksum.h>
+#include <net/gso.h>
+#include <net/inetpeer.h>
+#include <net/inet_ecn.h>
+#include <net/lwtunnel.h>
+#include <linux/bpf-cgroup.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netlink.h>
+#include <linux/tcp.h>
+
+static int	/* forward declaration: ip_finish_output_gso() calls this before its definition */
+ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+	    unsigned int mtu,
+	    int (*output)(struct net *, struct sock *, struct sk_buff *));
+
+/* Recompute, in place, the header checksum of an outgoing IP datagram. */
+void ip_send_check(struct iphdr *iph)
+{
+	iph->check = 0;	/* cleared before the sum below is taken */
+	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+}
+EXPORT_SYMBOL(ip_send_check);
+
+int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+{	/* Finalise length and checksum, then run the NF_INET_LOCAL_OUT hook. */
+	struct iphdr *iph = ip_hdr(skb);
+
+	iph_set_totlen(iph, skb->len);
+	ip_send_check(iph);
+
+	/* if egress device is enslaved to an L3 master device pass the
+	 * skb to its handler for processing
+	 */
+	skb = l3mdev_ip_out(sk, skb);
+	if (unlikely(!skb))	/* no skb handed back: nothing left to send here */
+		return 0;
+
+	skb->protocol = htons(ETH_P_IP);
+
+	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
+		       net, sk, skb, NULL, skb_dst(skb)->dev,
+		       dst_output);
+}
+
+int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	const int hook_rc = __ip_local_out(net, sk, skb);
+
+	/* Any result other than 1 is final and handed back unchanged. */
+	if (unlikely(hook_rc != 1))
+		return hook_rc;
+
+	return dst_output(net, sk, skb);
+}
+EXPORT_SYMBOL_GPL(ip_local_out);
+
+static inline int ip_select_ttl(const struct inet_sock *inet,
+				const struct dst_entry *dst)
+{
+	const int sk_ttl = READ_ONCE(inet->uc_ttl);
+
+	/* Negative uc_ttl: take the hop limit from the route instead,
+	 * otherwise the socket's own value wins. */
+	return sk_ttl < 0 ? ip4_dst_hoplimit(dst) : sk_ttl;
+}
+
+/*
+ *		Add an ip header (plus the options in opt, if any) to a skbuff
+ *		and send it out through ip_local_out().
+ */
+int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
+			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
+			  u8 tos)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	struct rtable *rt = skb_rtable(skb);
+	struct net *net = sock_net(sk);
+	struct iphdr *iph;
+
+	/* Build the IP header. */
+	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	iph->version  = 4;
+	iph->ihl      = 5;	/* grown below when options are present */
+	iph->tos      = tos;
+	iph->ttl      = ip_select_ttl(inet, &rt->dst);
+	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);	/* with SRR the first hop goes on the wire */
+	iph->saddr    = saddr;
+	iph->protocol = sk->sk_protocol;
+	/* Do not bother generating IPID for small packets (eg SYNACK) */
+	if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) {
+		iph->frag_off = htons(IP_DF);
+		iph->id = 0;
+	} else {
+		iph->frag_off = 0;
+		/* TCP packets here are SYNACK with fat IPv4/TCP options.
+		 * Avoid using the hashed IP ident generator.
+		 */
+		if (sk->sk_protocol == IPPROTO_TCP)
+			iph->id = (__force __be16)get_random_u16();
+		else
+			__ip_select_ident(net, iph, 1);
+	}
+
+	if (opt && opt->opt.optlen) {
+		iph->ihl += opt->opt.optlen>>2;	/* ihl grows by optlen / 4 */
+		ip_options_build(skb, &opt->opt, daddr, rt);
+	}
+
+	skb->priority = READ_ONCE(sk->sk_priority);
+	if (!skb->mark)	/* a mark already on the skb is kept */
+		skb->mark = READ_ONCE(sk->sk_mark);
+
+	/* Send it out. */
+	return ip_local_out(net, skb->sk, skb);
+}
+EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
+
+static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
+{	/* Account the packet, ensure link-layer headroom and pass it to the neighbour layer. */
+	struct dst_entry *dst = skb_dst(skb);
+	struct rtable *rt = (struct rtable *)dst;
+	struct net_device *dev = dst->dev;
+	unsigned int hh_len = LL_RESERVED_SPACE(dev);
+	struct neighbour *neigh;
+	bool is_v6gw = false;
+
+	if (rt->rt_type == RTN_MULTICAST) {
+		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
+	} else if (rt->rt_type == RTN_BROADCAST)
+		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
+
+	/* OUTOCTETS should be counted after fragment */
+	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
+
+	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+		skb = skb_expand_head(skb, hh_len);
+		if (!skb)	/* NOTE(review): original skb is presumably freed by skb_expand_head() - confirm */
+			return -ENOMEM;
+	}
+
+	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
+		int res = lwtunnel_xmit(skb);
+
+		if (res != LWTUNNEL_XMIT_CONTINUE)	/* any other result is returned as-is */
+			return res;
+	}
+
+	rcu_read_lock();
+	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
+	if (!IS_ERR(neigh)) {
+		int res;
+
+		sock_confirm_neigh(skb, neigh);
+		/* if crossing protocols, can not use the cached header */
+		res = neigh_output(neigh, skb, is_v6gw);
+		rcu_read_unlock();
+		return res;
+	}
+	rcu_read_unlock();
+
+	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
+			    __func__);
+	kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
+	return PTR_ERR(neigh);	/* neigh holds an ERR_PTR on this path */
+}
+
+static int ip_finish_output_gso(struct net *net, struct sock *sk,
+				struct sk_buff *skb, unsigned int mtu)
+{	/* Send a GSO skb; segment and fragment it here when its segments exceed mtu. */
+	struct sk_buff *segs, *nskb;
+	netdev_features_t features;
+	int ret = 0;
+
+	/* common case: seglen is <= mtu
+	 */
+	if (skb_gso_validate_network_len(skb, mtu))
+		return ip_finish_output2(net, sk, skb);
+
+	/* Slowpath -  GSO segment length exceeds the egress MTU.
+	 *
+	 * This can happen in several cases:
+	 *  - Forwarding of a TCP GRO skb, when DF flag is not set.
+	 *  - Forwarding of an skb that arrived on a virtualization interface
+	 *    (virtio-net/vhost/tap) with TSO/GSO size set by other network
+	 *    stack.
+	 *  - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an
+	 *    interface with a smaller MTU.
+	 *  - Arriving GRO skb (or GSO skb in a virtualized environment) that is
+	 *    bridged to a NETIF_F_TSO tunnel stacked over an interface with an
+	 *    insufficient MTU.
+	 */
+	features = netif_skb_features(skb);
+	BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
+	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);	/* GSO feature bits masked out */
+	if (IS_ERR_OR_NULL(segs)) {
+		kfree_skb(skb);
+		return -ENOMEM;	/* every segmentation failure is reported as -ENOMEM */
+	}
+
+	consume_skb(skb);	/* the segment list replaces the original skb */
+
+	skb_list_walk_safe(segs, segs, nskb) {
+		int err;
+
+		skb_mark_not_on_list(segs);
+		err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
+
+		if (err && ret == 0)	/* only the first error is kept */
+			ret = err;
+	}
+
+	return ret;
+}
+
+static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{	/* Choose between the GSO path, fragmentation and direct output. */
+	unsigned int mtu;
+
+#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
+	/* Policy lookup after SNAT yielded a new policy */
+	if (skb_dst(skb)->xfrm) {
+		IPCB(skb)->flags |= IPSKB_REROUTED;	/* ip_output()/ip_mc_output() test this flag */
+		return dst_output(net, sk, skb);
+	}
+#endif
+	mtu = ip_skb_dst_mtu(sk, skb);
+	if (skb_is_gso(skb))
+		return ip_finish_output_gso(net, sk, skb, mtu);
+
+	if (skb->len > mtu || IPCB(skb)->frag_max_size)	/* a recorded frag_max_size also forces this path */
+		return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
+
+	return ip_finish_output2(net, sk, skb);
+}
+
+static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	const int verdict = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+	int err;
+
+	if (verdict != NET_XMIT_SUCCESS && verdict != NET_XMIT_CN) {
+		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
+		return verdict;
+	}
+	err = __ip_finish_output(net, sk, skb);
+	/* A clean send still reports the congestion verdict, if there was one. */
+	if (!err && verdict == NET_XMIT_CN)
+		return verdict;
+	return err;
+}
+
+static int ip_mc_finish_output(struct net *net, struct sock *sk,
+			       struct sk_buff *skb)
+{	/* Run the cgroup egress program, then loop the skb back via dev_loopback_xmit(). */
+	struct rtable *new_rt;
+	bool do_cn = false;
+	int ret, err;
+
+	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+	switch (ret) {
+	case NET_XMIT_CN:
+		do_cn = true;
+		fallthrough;
+	case NET_XMIT_SUCCESS:
+		break;
+	default:
+		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
+		return ret;
+	}
+
+	/* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting
+	 * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten,
+	 * see ipv4_pktinfo_prepare().
+	 */
+	new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb));
+	if (new_rt) {	/* on clone failure the skb keeps its current dst */
+		new_rt->rt_iif = 0;
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &new_rt->dst);
+	}
+
+	err = dev_loopback_xmit(net, sk, skb);
+	return (do_cn && err) ? ret : err;	/* after a CN verdict, a nonzero err is replaced by ret */
+}
+
+int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{	/* Output path for multicast/broadcast routes: loop a clone back, then send the skb. */
+	struct rtable *rt = skb_rtable(skb);
+	struct net_device *dev = rt->dst.dev;
+
+	/*
+	 *	If the indicated interface is up and running, send the packet.
+	 */
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_IP);
+
+	/*
+	 *	Multicasts are looped back for other local users
+	 */
+
+	if (rt->rt_flags&RTCF_MULTICAST) {
+		if (sk_mc_loop(sk)
+#ifdef CONFIG_IP_MROUTE
+		/* Small optimization: do not loopback not local frames,
+		   which returned after forwarding; they will be  dropped
+		   by ip_mr_input in any case.
+		   Note, that local frames are looped back to be delivered
+		   to local recipients.
+
+		   This check is duplicated in ip_mr_input at the moment.
+		 */
+		    &&
+		    ((rt->rt_flags & RTCF_LOCAL) ||
+		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
+#endif
+		   ) {
+			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+			if (newskb)	/* a failed clone just skips the loopback copy */
+				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+					net, sk, newskb, NULL, newskb->dev,
+					ip_mc_finish_output);
+		}
+
+		/* Multicasts with ttl 0 must not go beyond the host */
+
+		if (ip_hdr(skb)->ttl == 0) {
+			kfree_skb(skb);
+			return 0;
+		}
+	}
+
+	if (rt->rt_flags&RTCF_BROADCAST) {	/* broadcasts get a looped-back clone as well */
+		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+		if (newskb)
+			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+				net, sk, newskb, NULL, newskb->dev,
+				ip_mc_finish_output);
+	}
+
+	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+			    net, sk, skb, NULL, skb->dev,
+			    ip_finish_output,
+			    !(IPCB(skb)->flags & IPSKB_REROUTED));
+}
+
+int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{	/* Bind the skb to the route's device and pass it through POST_ROUTING to ip_finish_output(). */
+	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;	/* indev: device recorded before the overwrite below */
+
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_IP);
+
+	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+			    net, sk, skb, indev, dev,
+			    ip_finish_output,
+			    !(IPCB(skb)->flags & IPSKB_REROUTED));	/* presumably skips the hook for re-routed skbs - confirm */
+}
+EXPORT_SYMBOL(ip_output);
+
+/*
+ * Copy saddr and daddr from the flow into the IP header using two plain
+ * assignments. The BUILD_BUG_ON below only enforces that fl4->daddr is laid
+ * out directly after fl4->saddr.
+ * NOTE(review): the former "64bit load/stores" remark suggests that layout
+ * once backed a single 8-byte copy - confirm against history.
+ */
+static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
+{
+	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
+		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
+
+	iph->saddr = fl4->saddr;
+	iph->daddr = fl4->daddr;
+}
+
+/* Note: skb->sk can be different from sk, in case of tunnels */
+int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
+		    __u8 tos)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	struct ip_options_rcu *inet_opt;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	struct iphdr *iph;
+	int res;
+
+	/* Skip all of this if the packet is already routed,
+	 * f.e. by something like SCTP.
+	 */
+	rcu_read_lock();	/* held until after ip_local_out() or the no_route exit */
+	inet_opt = rcu_dereference(inet->inet_opt);
+	fl4 = &fl->u.ip4;
+	rt = skb_rtable(skb);
+	if (rt)
+		goto packet_routed;
+
+	/* Make sure we can route this packet. */
+	rt = (struct rtable *)__sk_dst_check(sk, 0);
+	if (!rt) {
+		__be32 daddr;
+
+		/* Use correct destination address if we have options. */
+		daddr = inet->inet_daddr;
+		if (inet_opt && inet_opt->opt.srr)
+			daddr = inet_opt->opt.faddr;
+
+		/* If this fails, retransmit mechanism of transport layer will
+		 * keep trying until route appears or the connection times
+		 * itself out.
+		 */
+		rt = ip_route_output_ports(net, fl4, sk,
+					   daddr, inet->inet_saddr,
+					   inet->inet_dport,
+					   inet->inet_sport,
+					   sk->sk_protocol,
+					   RT_CONN_FLAGS_TOS(sk, tos),
+					   sk->sk_bound_dev_if);
+		if (IS_ERR(rt))
+			goto no_route;
+		sk_setup_caps(sk, &rt->dst);
+	}
+	skb_dst_set_noref(skb, &rt->dst);
+
+packet_routed:
+	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)	/* strict route via a gateway is refused */
+		goto no_route;
+
+	/* OK, we know where to send it, allocate and build IP header. */
+	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));	/* version 4, ihl 5 and tos in one 16-bit store */
+	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
+		iph->frag_off = htons(IP_DF);
+	else
+		iph->frag_off = 0;
+	iph->ttl      = ip_select_ttl(inet, &rt->dst);
+	iph->protocol = sk->sk_protocol;
+	ip_copy_addrs(iph, fl4);
+
+	/* Transport layer set skb->h.foo itself. */
+
+	if (inet_opt && inet_opt->opt.optlen) {
+		iph->ihl += inet_opt->opt.optlen >> 2;	/* ihl grows by optlen / 4 */
+		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
+	}
+
+	ip_select_ident_segs(net, skb, sk,
+			     skb_shinfo(skb)->gso_segs ?: 1);	/* at least one segment is assumed */
+
+	/* TODO : should we use skb->sk here instead of sk ? */
+	skb->priority = READ_ONCE(sk->sk_priority);
+	skb->mark = READ_ONCE(sk->sk_mark);
+
+	res = ip_local_out(net, sk, skb);
+	rcu_read_unlock();
+	return res;
+
+no_route:
+	rcu_read_unlock();
+	IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+	kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES);
+	return -EHOSTUNREACH;
+}
+EXPORT_SYMBOL(__ip_queue_xmit);
+
+int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+{	/* Convenience wrapper: __ip_queue_xmit() with the tos stored in the socket. */
+	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
+}
+EXPORT_SYMBOL(ip_queue_xmit);
+
+static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
+{	/* Copy per-packet metadata from 'from' to 'to'; used for every generated fragment. */
+	to->pkt_type = from->pkt_type;
+	to->priority = from->priority;
+	to->protocol = from->protocol;
+	to->skb_iif = from->skb_iif;
+	skb_dst_drop(to);	/* release any dst 'to' already holds before copying */
+	skb_dst_copy(to, from);
+	to->dev = from->dev;
+	to->mark = from->mark;
+
+	skb_copy_hash(to, from);
+
+#ifdef CONFIG_NET_SCHED
+	to->tc_index = from->tc_index;
+#endif
+	nf_copy(to, from);
+	skb_ext_copy(to, from);
+#if IS_ENABLED(CONFIG_IP_VS)
+	to->ipvs_property = from->ipvs_property;
+#endif
+	skb_copy_secmark(to, from);
+}
+
+static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+		       unsigned int mtu,
+		       int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+	const bool df_set = (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0;
+	bool refuse = false;
+
+	/* With DF set, fragmenting is refused unless the skb carries ignore_df
+	 * and any recorded frag_max_size still fits within mtu.
+	 */
+	if (df_set)
+		refuse = !skb->ignore_df || (IPCB(skb)->frag_max_size &&
+					     IPCB(skb)->frag_max_size > mtu);
+	if (likely(!refuse))
+		return ip_do_fragment(net, sk, skb, output);
+
+	IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+	kfree_skb(skb);
+	return -EMSGSIZE;
+}
+
+void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
+		      unsigned int hlen, struct ip_fraglist_iter *iter)
+{	/* Turn the head skb into the first fragment and prime iter with its frag_list. */
+	unsigned int first_len = skb_pagelen(skb);
+
+	iter->frag = skb_shinfo(skb)->frag_list;	/* saved before the head's list is reinitialised */
+	skb_frag_list_init(skb);
+
+	iter->offset = 0;
+	iter->iph = iph;
+	iter->hlen = hlen;
+
+	skb->data_len = first_len - skb_headlen(skb);
+	skb->len = first_len;	/* the head now accounts only for its own data */
+	iph->tot_len = htons(first_len);
+	iph->frag_off = htons(IP_MF);	/* offset 0 with the more-fragments flag */
+	ip_send_check(iph);
+}
+EXPORT_SYMBOL(ip_fraglist_init);
+
+void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
+{	/* Give iter->frag its own copy of the IP header, continuing from the previous skb. */
+	unsigned int hlen = iter->hlen;
+	struct iphdr *iph = iter->iph;
+	struct sk_buff *frag;
+
+	frag = iter->frag;
+	frag->ip_summed = CHECKSUM_NONE;
+	skb_reset_transport_header(frag);
+	__skb_push(frag, hlen);	/* room for the header copied just below */
+	skb_reset_network_header(frag);
+	memcpy(skb_network_header(frag), iph, hlen);
+	iter->iph = ip_hdr(frag);	/* later updates go to the fragment's own header */
+	iph = iter->iph;
+	iph->tot_len = htons(frag->len);
+	ip_copy_metadata(frag, skb);
+	iter->offset += skb->len - hlen;	/* payload bytes carried by the previous skb */
+	iph->frag_off = htons(iter->offset >> 3);	/* the field is stored in 8-byte units */
+	if (frag->next)	/* not the last one on the list */
+		iph->frag_off |= htons(IP_MF);
+	/* Ready, complete checksum */
+	ip_send_check(iph);
+}
+EXPORT_SYMBOL(ip_fraglist_prepare);
+
+void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
+		  unsigned int ll_rs, unsigned int mtu, bool DF,
+		  struct ip_frag_state *state)
+{
+	const __be16 frag_field = ip_hdr(skb)->frag_off;
+
+	/* Position of this skb within its datagram, taken from the header. */
+	state->not_last_frag = frag_field & htons(IP_MF);
+	state->offset = (ntohs(frag_field) & IP_OFFSET) << 3;
+	/* Payload still to be emitted and where it begins in the skb. */
+	state->ptr = hlen;
+	state->left = skb->len - hlen;
+	state->mtu = mtu;
+	state->ll_rs = ll_rs;
+	state->hlen = hlen;
+	state->DF = DF;
+}
+EXPORT_SYMBOL(ip_frag_init);
+
+static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
+			 bool first_frag)
+{
+	/* Copy the flags to each fragment. */
+	IPCB(to)->flags = IPCB(from)->flags;
+
+	/* ANK: dirty, but effective trick. Upgrade options only if
+	 * the segment to be fragmented was THE FIRST (otherwise,
+	 * options are already fixed) and make it ONCE
+	 * on the initial skb, so that all the following fragments
+	 * will inherit fixed options.
+	 */
+	if (first_frag)
+		ip_options_fragment(from);	/* applied to 'from', the skb being split, not to 'to' */
+}
+
+struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
+{	/* Build the next fragment of skb as a new skb; returns it or ERR_PTR(-ENOMEM). */
+	unsigned int len = state->left;
+	struct sk_buff *skb2;
+	struct iphdr *iph;
+
+	/* IF: it doesn't fit, use 'mtu' - the data space left */
+	if (len > state->mtu)
+		len = state->mtu;
+	/* IF: we are not sending up to and including the packet end
+	   then align the next start on an eight byte boundary */
+	if (len < state->left)	{
+		len &= ~7;
+	}
+
+	/* Allocate buffer */
+	skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
+	if (!skb2)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 *	Set up data on packet
+	 */
+
+	ip_copy_metadata(skb2, skb);
+	skb_reserve(skb2, state->ll_rs);
+	skb_put(skb2, len + state->hlen);
+	skb_reset_network_header(skb2);
+	skb2->transport_header = skb2->network_header + state->hlen;
+
+	/*
+	 *	Charge the memory for the fragment to any owner
+	 *	it might possess
+	 */
+
+	if (skb->sk)
+		skb_set_owner_w(skb2, skb->sk);
+
+	/*
+	 *	Copy the packet header into the new buffer.
+	 */
+
+	skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);
+
+	/*
+	 *	Copy a block of the IP datagram.
+	 */
+	if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
+		BUG();	/* a failed copy here is treated as a fatal bug */
+	state->left -= len;
+
+	/*
+	 *	Fill in the new header fields.
+	 */
+	iph = ip_hdr(skb2);
+	iph->frag_off = htons((state->offset >> 3));	/* the field is stored in 8-byte units */
+	if (state->DF)
+		iph->frag_off |= htons(IP_DF);
+
+	/*
+	 *	Added AC : If we are fragmenting a fragment that's not the
+	 *		   last fragment then keep MF on each bit
+	 */
+	if (state->left > 0 || state->not_last_frag)
+		iph->frag_off |= htons(IP_MF);
+	state->ptr += len;
+	state->offset += len;
+
+	iph->tot_len = htons(len + state->hlen);
+
+	ip_send_check(iph);
+
+	return skb2;
+}
+EXPORT_SYMBOL(ip_frag_next);
+
+/*
+ *	This IP datagram is too large to be sent in one piece.  Break it up into
+ *	smaller pieces (each of size equal to IP header plus
+ *	a block of the data of the original IP data part) that will yet fit in a
+ *	single device frame, and queue such a frame for sending.
+ */
+
+int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+		   int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+	struct iphdr *iph;
+	struct sk_buff *skb2;
+	bool mono_delivery_time = skb->mono_delivery_time;
+	struct rtable *rt = skb_rtable(skb);
+	unsigned int mtu, hlen, ll_rs;
+	struct ip_fraglist_iter iter;
+	ktime_t tstamp = skb->tstamp;	/* saved here and applied to every fragment below */
+	struct ip_frag_state state;
+	int err = 0;
+
+	/* for offloaded checksums cleanup checksum before fragmentation */
+	if (skb->ip_summed == CHECKSUM_PARTIAL &&
+	    (err = skb_checksum_help(skb)))
+		goto fail;
+
+	/*
+	 *	Point into the IP datagram header.
+	 */
+
+	iph = ip_hdr(skb);
+
+	mtu = ip_skb_dst_mtu(sk, skb);
+	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
+		mtu = IPCB(skb)->frag_max_size;	/* a smaller recorded fragment size wins */
+
+	/*
+	 *	Setup starting values.
+	 */
+
+	hlen = iph->ihl * 4;
+	mtu = mtu - hlen;	/* Size of data space */
+	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
+	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
+
+	/* When frag_list is given, use it. First, check its validity:
+	 * some transformers could create wrong frag_list or break existing
+	 * one, it is not prohibited. In this case fall back to copying.
+	 *
+	 * LATER: this step can be merged to real generation of fragments,
+	 * we can switch to copy when see the first bad fragment.
+	 */
+	if (skb_has_frag_list(skb)) {
+		struct sk_buff *frag, *frag2;
+		unsigned int first_len = skb_pagelen(skb);
+
+		if (first_len - hlen > mtu ||
+		    ((first_len - hlen) & 7) ||
+		    ip_is_fragment(iph) ||
+		    skb_cloned(skb) ||
+		    skb_headroom(skb) < ll_rs)
+			goto slow_path;
+
+		skb_walk_frags(skb, frag) {
+			/* Correct geometry. */
+			if (frag->len > mtu ||
+			    ((frag->len & 7) && frag->next) ||
+			    skb_headroom(frag) < hlen + ll_rs)
+				goto slow_path_clean;
+
+			/* Partially cloned skb? */
+			if (skb_shared(frag))
+				goto slow_path_clean;
+
+			BUG_ON(frag->sk);
+			if (skb->sk) {
+				frag->sk = skb->sk;
+				frag->destructor = sock_wfree;
+			}
+			skb->truesize -= frag->truesize;	/* reverted at slow_path_clean if the walk aborts */
+		}
+
+		/* Everything is OK. Generate! */
+		ip_fraglist_init(skb, iph, hlen, &iter);
+
+		for (;;) {
+			/* Prepare header of the next frame,
+			 * before previous one went down. */
+			if (iter.frag) {
+				bool first_frag = (iter.offset == 0);
+
+				IPCB(iter.frag)->flags = IPCB(skb)->flags;
+				ip_fraglist_prepare(skb, &iter);
+				if (first_frag && IPCB(skb)->opt.optlen) {
+					/* ipcb->opt is not populated for frags
+					 * coming from __ip_make_skb(),
+					 * ip_options_fragment() needs optlen
+					 */
+					IPCB(iter.frag)->opt.optlen =
+						IPCB(skb)->opt.optlen;
+					ip_options_fragment(iter.frag);
+					ip_send_check(iter.iph);	/* header was modified after its checksum was set */
+				}
+			}
+
+			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
+			err = output(net, sk, skb);
+
+			if (!err)
+				IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
+			if (err || !iter.frag)
+				break;
+
+			skb = ip_fraglist_next(&iter);
+		}
+
+		if (err == 0) {
+			IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
+			return 0;
+		}
+
+		kfree_skb_list(iter.frag);	/* drop the fragments that were never sent */
+
+		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
+		return err;
+
+slow_path_clean:
+		skb_walk_frags(skb, frag2) {	/* undo the ownership changes made up to 'frag' */
+			if (frag2 == frag)
+				break;
+			frag2->sk = NULL;
+			frag2->destructor = NULL;
+			skb->truesize += frag2->truesize;
+		}
+	}
+
+slow_path:
+	/*
+	 *	Fragment the datagram.
+	 */
+
+	ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU,
+		     &state);
+
+	/*
+	 *	Keep copying data until we run out.
+	 */
+
+	while (state.left > 0) {
+		bool first_frag = (state.offset == 0);
+
+		skb2 = ip_frag_next(skb, &state);
+		if (IS_ERR(skb2)) {
+			err = PTR_ERR(skb2);
+			goto fail;
+		}
+		ip_frag_ipcb(skb, skb2, first_frag);
+
+		/*
+		 *	Put this fragment into the sending queue.
+		 */
+		skb_set_delivery_time(skb2, tstamp, mono_delivery_time);
+		err = output(net, sk, skb2);
+		if (err)
+			goto fail;
+
+		IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
+	}
+	consume_skb(skb);	/* every byte was copied into fragments; release the original */
+	IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
+	return err;
+
+fail:
+	kfree_skb(skb);
+	IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
+	return err;
+}
+EXPORT_SYMBOL(ip_do_fragment);
+
+/* getfrag callback: copy @len bytes from @from's msg_iter into @to, summing
+ * into skb->csum unless ip_summed is CHECKSUM_PARTIAL. 0 or -EFAULT. */
+int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd,
+		       struct sk_buff *skb)
+{
+	struct msghdr *msg = from;
+	__wsum partial = 0;
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		return copy_from_iter_full(to, len, &msg->msg_iter) ? 0 : -EFAULT;
+
+	if (!csum_and_copy_from_iter_full(to, len, &partial, &msg->msg_iter))
+		return -EFAULT;
+	skb->csum = csum_block_add(skb->csum, partial, odd);
+	return 0;
+}
+EXPORT_SYMBOL(ip_generic_getfrag);
+
+/* Append @length bytes, fetched via @getfrag from @from, to the skbs on @queue.
+ * Data goes into the tail skb while it has room; otherwise a new skb is
+ * allocated, sized so that each queued skb can become one IP fragment.
+ * @transhdrlen is reserved in front of the data of a newly allocated skb and
+ * is zeroed after the first allocation. Returns 0 or a negative errno; errors
+ * past the cork->length update unwind through the error label below.
+ */
+static int __ip_append_data(struct sock *sk,
+			    struct flowi4 *fl4,
+			    struct sk_buff_head *queue,
+			    struct inet_cork *cork,
+			    struct page_frag *pfrag,
+			    int getfrag(void *from, char *to, int offset,
+					int len, int odd, struct sk_buff *skb),
+			    void *from, int length, int transhdrlen,
+			    unsigned int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ubuf_info *uarg = NULL;
+	struct sk_buff *skb;
+	struct ip_options *opt = cork->opt;
+	int hh_len;
+	int exthdrlen;
+	int mtu;
+	int copy;
+	int err;
+	int offset = 0;
+	bool zc = false;
+	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
+	int csummode = CHECKSUM_NONE;
+	struct rtable *rt = (struct rtable *)cork->dst;
+	unsigned int wmem_alloc_delta = 0;
+	bool paged, extra_uref = false;
+	u32 tskey = 0;
+
+	skb = skb_peek_tail(queue);
+
+	exthdrlen = !skb ? rt->dst.header_len : 0;	/* empty queue only */
+	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+	paged = !!cork->gso_size;
+
+	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
+	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
+		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
+
+	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+	/* maxfraglen rounds the per-skb payload down to a multiple of 8. */
+	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+	maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu;
+
+	if (cork->length + length > maxnonfragsize - fragheaderlen) {
+		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+			       mtu - (opt ? opt->optlen : 0));
+		return -EMSGSIZE;
+	}
+
+	/*
+	 * transhdrlen > 0 means that this is the first fragment and we wish
+	 * it won't be fragmented in the future.
+	 */
+	if (transhdrlen &&
+	    length + fragheaderlen <= mtu &&
+	    rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
+	    (!(flags & MSG_MORE) || cork->gso_size) &&
+	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
+		csummode = CHECKSUM_PARTIAL;
+
+	if ((flags & MSG_ZEROCOPY) && length) {
+		struct msghdr *msg = from;
+
+		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
+			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
+				return -EINVAL;
+
+			/* Leave uarg NULL if can't zerocopy, callers should
+			 * be able to handle it.
+			 */
+			if ((rt->dst.dev->features & NETIF_F_SG) &&
+			    csummode == CHECKSUM_PARTIAL) {
+				paged = true;
+				zc = true;
+				uarg = msg->msg_ubuf;
+			}
+		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
+			if (!uarg)
+				return -ENOBUFS;
+			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
+			if (rt->dst.dev->features & NETIF_F_SG &&
+			    csummode == CHECKSUM_PARTIAL) {
+				paged = true;
+				zc = true;
+			} else {
+				uarg_to_msgzc(uarg)->zerocopy = 0;
+				skb_zcopy_set(skb, uarg, &extra_uref);
+			}
+		}
+	} else if ((flags & MSG_SPLICE_PAGES) && length) {
+		if (inet_test_bit(HDRINCL, sk))
+			return -EPERM;
+		if (rt->dst.dev->features & NETIF_F_SG &&
+		    getfrag == ip_generic_getfrag)
+			/* We need an empty buffer to attach stuff to */
+			paged = true;
+		else
+			flags &= ~MSG_SPLICE_PAGES;
+	}
+
+	cork->length += length;
+
+	/* So, what's going on in the loop below?
+	 *
+	 * We use calculated fragment length to generate chained skb,
+	 * each of segments is IP fragment ready for sending to network after
+	 * adding appropriate IP header.
+	 */
+
+	if (!skb)
+		goto alloc_new_skb;
+
+	while (length > 0) {
+		/* Check if the remaining data fits into current packet. */
+		copy = mtu - skb->len;
+		if (copy < length)
+			copy = maxfraglen - skb->len;
+		if (copy <= 0) {
+			char *data;
+			unsigned int datalen;
+			unsigned int fraglen;
+			unsigned int fraggap;
+			unsigned int alloclen, alloc_extra;
+			unsigned int pagedlen;
+			struct sk_buff *skb_prev;
+alloc_new_skb:
+			skb_prev = skb;
+			if (skb_prev)
+				fraggap = skb_prev->len - maxfraglen;	/* excess moved to the new skb */
+			else
+				fraggap = 0;
+
+			/* If remaining data exceeds the mtu,
+			 * we know we need more fragment(s).
+			 */
+			datalen = length + fraggap;
+			if (datalen > mtu - fragheaderlen)
+				datalen = maxfraglen - fragheaderlen;
+			fraglen = datalen + fragheaderlen;
+			pagedlen = 0;
+
+			alloc_extra = hh_len + 15;
+			alloc_extra += exthdrlen;
+
+			/* The last fragment gets additional space at tail.
+			 * Note, with MSG_MORE we overallocate on fragments,
+			 * because we have no idea what fragment will be
+			 * the last.
+			 */
+			if (datalen == length + fraggap)
+				alloc_extra += rt->dst.trailer_len;
+
+			if ((flags & MSG_MORE) &&
+			    !(rt->dst.dev->features&NETIF_F_SG))
+				alloclen = mtu;
+			else if (!paged &&
+				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
+				  !(rt->dst.dev->features & NETIF_F_SG)))
+				alloclen = fraglen;
+			else {
+				alloclen = fragheaderlen + transhdrlen;
+				pagedlen = datalen - transhdrlen;
+			}
+
+			alloclen += alloc_extra;
+
+			if (transhdrlen) {
+				skb = sock_alloc_send_skb(sk, alloclen,
+						(flags & MSG_DONTWAIT), &err);
+			} else {
+				skb = NULL;
+				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
+				    2 * sk->sk_sndbuf)
+					skb = alloc_skb(alloclen,
+							sk->sk_allocation);
+				if (unlikely(!skb))
+					err = -ENOBUFS;
+			}
+			if (!skb)
+				goto error;
+
+			/* Fill in the control structures */
+			skb->ip_summed = csummode;
+			skb->csum = 0;
+			skb_reserve(skb, hh_len);
+
+			/* Find where to start putting bytes. */
+			data = skb_put(skb, fraglen + exthdrlen - pagedlen);
+			skb_set_network_header(skb, exthdrlen);
+			skb->transport_header = (skb->network_header +
+						 fragheaderlen);
+			data += fragheaderlen + exthdrlen;
+
+			if (fraggap) {
+				skb->csum = skb_copy_and_csum_bits(
+					skb_prev, maxfraglen,
+					data + transhdrlen, fraggap);
+				skb_prev->csum = csum_sub(skb_prev->csum,
+							  skb->csum);
+				data += fraggap;
+				pskb_trim_unique(skb_prev, maxfraglen);
+			}
+
+			copy = datalen - transhdrlen - fraggap - pagedlen;
+			/* [!] NOTE: copy will be negative if pagedlen>0
+			 * because then the equation reduces to -fraggap.
+			 */
+			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+				err = -EFAULT;
+				kfree_skb(skb);
+				goto error;
+			} else if (flags & MSG_SPLICE_PAGES) {
+				copy = 0;
+			}
+
+			offset += copy;
+			length -= copy + transhdrlen;
+			transhdrlen = 0;
+			exthdrlen = 0;
+			csummode = CHECKSUM_NONE;
+
+			/* only the initial fragment is time stamped */
+			skb_shinfo(skb)->tx_flags = cork->tx_flags;
+			cork->tx_flags = 0;
+			skb_shinfo(skb)->tskey = tskey;
+			tskey = 0;
+			skb_zcopy_set(skb, uarg, &extra_uref);
+
+			if ((flags & MSG_CONFIRM) && !skb_prev)
+				skb_set_dst_pending_confirm(skb, 1);
+
+			/* Put the packet on the pending queue. */
+			if (!skb->destructor) {
+				skb->destructor = sock_wfree;
+				skb->sk = sk;
+				wmem_alloc_delta += skb->truesize;
+			}
+			__skb_queue_tail(queue, skb);
+			continue;
+		}
+
+		if (copy > length)
+			copy = length;
+
+		if (!(rt->dst.dev->features&NETIF_F_SG) &&
+		    skb_tailroom(skb) >= copy) {
+			unsigned int off;
+
+			off = skb->len;
+			if (getfrag(from, skb_put(skb, copy),
+					offset, copy, off, skb) < 0) {
+				__skb_trim(skb, off);
+				err = -EFAULT;
+				goto error;
+			}
+		} else if (flags & MSG_SPLICE_PAGES) {
+			struct msghdr *msg = from;
+
+			err = -EIO;
+			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
+				goto error;
+
+			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
+						   sk->sk_allocation);
+			if (err < 0)
+				goto error;
+			copy = err;
+			wmem_alloc_delta += copy;
+		} else if (!zc) {
+			int i = skb_shinfo(skb)->nr_frags;
+
+			err = -ENOMEM;
+			if (!sk_page_frag_refill(sk, pfrag))
+				goto error;
+
+			skb_zcopy_downgrade_managed(skb);
+			if (!skb_can_coalesce(skb, i, pfrag->page,
+					      pfrag->offset)) {
+				err = -EMSGSIZE;
+				if (i == MAX_SKB_FRAGS)
+					goto error;
+
+				__skb_fill_page_desc(skb, i, pfrag->page,
+						     pfrag->offset, 0);
+				skb_shinfo(skb)->nr_frags = ++i;
+				get_page(pfrag->page);
+			}
+			copy = min_t(int, copy, pfrag->size - pfrag->offset);
+			if (getfrag(from,
+				    page_address(pfrag->page) + pfrag->offset,
+				    offset, copy, skb->len, skb) < 0)
+				goto error_efault;
+
+			pfrag->offset += copy;
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+			skb_len_add(skb, copy);
+			wmem_alloc_delta += copy;
+		} else {
+			err = skb_zerocopy_iter_dgram(skb, from, copy);
+			if (err < 0)
+				goto error;
+		}
+		offset += copy;
+		length -= copy;
+	}
+
+	if (wmem_alloc_delta)
+		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
+	return 0;
+
+error_efault:
+	err = -EFAULT;
+error:
+	net_zcopy_put_abort(uarg, extra_uref);
+	cork->length -= length;	/* undo the accounting for bytes not appended */
+	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
+	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
+	return err;
+}
+
+/* Fill @cork from @ipc and take over the route in *@rtp (set to NULL on success).
+ * Returns 0, -EFAULT (no route), -ENETUNREACH (invalid MTU) or -ENOBUFS. */
+static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
+			 struct ipcm_cookie *ipc, struct rtable **rtp)
+{
+	struct ip_options_rcu *opt;
+	struct rtable *rt;
+
+	rt = *rtp;
+	if (unlikely(!rt))
+		return -EFAULT;
+
+	/* Check the MTU before cork->opt is allocated: ip_make_skb() returns
+	 * without releasing the cork when we fail, which would leak the copy.
+	 */
+	cork->fragsize = ip_sk_use_pmtu(sk) ?
+			 dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
+	if (!inetdev_valid_mtu(cork->fragsize))
+		return -ENETUNREACH;
+
+	opt = ipc->opt;
+	if (opt) {
+		if (!cork->opt) {
+			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
+					    sk->sk_allocation);
+			if (unlikely(!cork->opt))
+				return -ENOBUFS;
+		}
+		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
+		cork->flags |= IPCORK_OPT;
+		cork->addr = ipc->addr;
+	}
+
+	cork->gso_size = ipc->gso_size;
+	cork->dst = &rt->dst;
+	/* We stole this route, caller should not release it. */
+	*rtp = NULL;
+
+	cork->length = 0;
+	cork->ttl = ipc->ttl;
+	cork->tos = ipc->tos;
+	cork->mark = ipc->sockc.mark;
+	cork->priority = ipc->priority;
+	cork->transmit_time = ipc->sockc.transmit_time;
+	cork->tx_flags = 0;
+	sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags);
+
+	return 0;
+}
+
+/*
+ *	ip_append_data() can make one large IP datagram from many pieces of
+ *	data.  Each piece will be held on the socket until
+ *	ip_push_pending_frames() is called. Each piece can be a page or
+ *	non-page data.
+ *
+ *	Not only UDP, other transport protocols - e.g. raw sockets - can use
+ *	this interface potentially.
+ *
+ *	LATER: length must be adjusted by pad at tail, when it is required.
+ */
+int ip_append_data(struct sock *sk, struct flowi4 *fl4,
+		   int getfrag(void *from, char *to, int offset, int len,
+			       int odd, struct sk_buff *skb),
+		   void *from, int length, int transhdrlen,
+		   struct ipcm_cookie *ipc, struct rtable **rtp,
+		   unsigned int flags)
+{
+	struct inet_cork *cork = &inet_sk(sk)->cork.base;
+	struct sk_buff_head *pending = &sk->sk_write_queue;
+	int err;
+
+	if (flags & MSG_PROBE)
+		return 0;
+
+	if (!skb_queue_empty(pending)) {
+		transhdrlen = 0;	/* datagram already started */
+	} else {
+		err = ip_setup_cork(sk, cork, ipc, rtp);
+		if (err)
+			return err;
+	}
+
+	return __ip_append_data(sk, fl4, pending, cork, sk_page_frag(sk),
+				getfrag, from, length, transhdrlen, flags);
+}
+
+static void ip_cork_release(struct inet_cork *cork)
+{
+	kfree(cork->opt);		/* options copy, may be NULL */
+	dst_release(cork->dst);		/* route reference held by the cork */
+	cork->opt = NULL;
+	cork->dst = NULL;
+	cork->flags &= ~IPCORK_OPT;
+}
+
+/*
+ *	Combine all pending IP fragments on the queue into one IP datagram
+ *	and build its IP header; the caller is responsible for sending it.
+ */
+struct sk_buff *__ip_make_skb(struct sock *sk,
+			      struct flowi4 *fl4,
+			      struct sk_buff_head *queue,
+			      struct inet_cork *cork)
+{
+	struct sk_buff *skb, *tmp_skb;
+	struct sk_buff **tail_skb;
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	struct ip_options *opt = NULL;
+	struct rtable *rt = (struct rtable *)cork->dst;
+	struct iphdr *iph;
+	__be16 df = 0;
+	__u8 ttl;
+
+	skb = __skb_dequeue(queue);
+	if (!skb)
+		goto out;
+	tail_skb = &(skb_shinfo(skb)->frag_list);
+
+	/* move skb->data to ip header from ext header */
+	if (skb->data < skb_network_header(skb))
+		__skb_pull(skb, skb_network_offset(skb));
+	/* Chain the remaining queued skbs onto the first skb's frag_list and
+	 * account their length and truesize to the first skb. */
+	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
+		__skb_pull(tmp_skb, skb_network_header_len(skb));
+		*tail_skb = tmp_skb;
+		tail_skb = &(tmp_skb->next);
+		skb->len += tmp_skb->len;
+		skb->data_len += tmp_skb->len;
+		skb->truesize += tmp_skb->truesize;
+		tmp_skb->destructor = NULL;
+		tmp_skb->sk = NULL;
+	}
+
+	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
+	 * to fragment the frame generated here. No matter how transforms
+	 * change the size of the packet, it will come out.
+	 */
+	skb->ignore_df = ip_sk_ignore_df(sk);
+
+	/* DF bit is set when we want to see DF on outgoing frames.
+	 * If ignore_df is set too, we still allow to fragment this frame
+	 * locally. */
+	if (inet->pmtudisc == IP_PMTUDISC_DO ||
+	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
+	    (skb->len <= dst_mtu(&rt->dst) &&
+	     ip_dont_fragment(sk, &rt->dst)))
+		df = htons(IP_DF);
+
+	if (cork->flags & IPCORK_OPT)
+		opt = cork->opt;
+
+	if (cork->ttl != 0)
+		ttl = cork->ttl;
+	else if (rt->rt_type == RTN_MULTICAST)
+		ttl = inet->mc_ttl;
+	else
+		ttl = ip_select_ttl(inet, &rt->dst);
+
+	iph = ip_hdr(skb);
+	iph->version = 4;
+	iph->ihl = 5;
+	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
+	iph->frag_off = df;
+	iph->ttl = ttl;
+	iph->protocol = sk->sk_protocol;
+	ip_copy_addrs(iph, fl4);
+	ip_select_ident(net, skb, sk);
+
+	if (opt) {
+		iph->ihl += opt->optlen >> 2;	/* ihl counts 32-bit words */
+		ip_options_build(skb, opt, cork->addr, rt);
+	}
+
+	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
+	skb->mark = cork->mark;
+	skb->tstamp = cork->transmit_time;
+	/* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
+	 * on dst refcount. */
+	cork->dst = NULL;
+	skb_dst_set(skb, &rt->dst);
+
+	if (iph->protocol == IPPROTO_ICMP) {
+		u8 icmp_type;
+
+		/* For such sockets, transhdrlen is zero when ip_append_data()
+		 * is called, so the icmphdr is not in the skb linear region and
+		 * icmp_type cannot be read via icmp_hdr(skb)->type.
+		 */
+		if (sk->sk_type == SOCK_RAW &&
+		    !inet_test_bit(HDRINCL, sk))
+			icmp_type = fl4->fl4_icmp_type;
+		else
+			icmp_type = icmp_hdr(skb)->type;
+		icmp_out_count(net, icmp_type);
+	}
+
+	ip_cork_release(cork);
+out:
+	return skb;	/* NULL when the queue was empty */
+}
+
+/* Send @skb via ip_local_out(); positive return codes are mapped through
+ * net_xmit_errno() and any remaining error is counted as an out-discard. */
+int ip_send_skb(struct net *net, struct sk_buff *skb)
+{
+	int rc = ip_local_out(net, skb->sk, skb);
+
+	if (!rc)
+		return 0;
+	if (rc > 0)
+		rc = net_xmit_errno(rc);
+	if (rc)
+		IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
+	return rc;
+}
+
+/* Fetch the pending skb from ip_finish_skb() and hand it to ip_send_skb();
+ * returns 0 when nothing was pending. */
+int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
+{
+	struct sk_buff *pending = ip_finish_skb(sk, fl4);
+
+	if (!pending)
+		return 0;
+	/* Netfilter gets the whole, not yet fragmented skb. */
+	return ip_send_skb(sock_net(sk), pending);
+}
+
+/* Throw away all pending data on @queue and release the @cork state. */
+static void __ip_flush_pending_frames(struct sock *sk,
+				      struct sk_buff_head *queue,
+				      struct inet_cork *cork)
+{
+	struct sk_buff *victim = __skb_dequeue_tail(queue);
+
+	for (; victim; victim = __skb_dequeue_tail(queue))
+		kfree_skb(victim);
+	ip_cork_release(cork);
+}
+
+/* Flush the socket's own write queue together with its cork. */
+void ip_flush_pending_frames(struct sock *sk)
+{
+	struct inet_cork *cork = &inet_sk(sk)->cork.base;
+
+	__ip_flush_pending_frames(sk, &sk->sk_write_queue, cork);
+}
+
+/* Build a complete datagram in one call: reset and set up @cork, append the
+ * data to a private queue and turn that queue into a single skb. Returns the
+ * skb, NULL for MSG_PROBE, or an ERR_PTR() on failure. */
+struct sk_buff *ip_make_skb(struct sock *sk,
+			    struct flowi4 *fl4,
+			    int getfrag(void *from, char *to, int offset,
+					int len, int odd, struct sk_buff *skb),
+			    void *from, int length, int transhdrlen,
+			    struct ipcm_cookie *ipc, struct rtable **rtp,
+			    struct inet_cork *cork, unsigned int flags)
+{
+	struct sk_buff_head frames;
+	int rc;
+
+	if (flags & MSG_PROBE)
+		return NULL;
+
+	/* Start from a clean cork: no flags, no address, no options. */
+	cork->opt = NULL;
+	cork->addr = 0;
+	cork->flags = 0;
+	rc = ip_setup_cork(sk, cork, ipc, rtp);
+	if (rc)
+		return ERR_PTR(rc);
+
+	__skb_queue_head_init(&frames);
+	rc = __ip_append_data(sk, fl4, &frames, cork, &current->task_frag,
+			      getfrag, from, length, transhdrlen, flags);
+	if (!rc)
+		return __ip_make_skb(sk, fl4, &frames, cork);
+	__ip_flush_pending_frames(sk, &frames, cork);
+	return ERR_PTR(rc);
+}
+
+/*
+ *	Fetch data from kernel space and fill in checksum if needed.
+ */
+static int ip_reply_glue_bits(void *dptr, char *to, int offset,
+			      int len, int odd, struct sk_buff *skb)
+{
+	__wsum chunk = csum_partial_copy_nocheck(dptr + offset, to, len);
+
+	/* Fold this chunk's sum into the running skb checksum; always 0. */
+	skb->csum = csum_block_add(skb->csum, chunk, odd);
+	return 0;
+}
+
+/*
+ *	Generic function to send a packet as reply to another packet.
+ *	Used to send some TCP resets/acks so far.
+ */
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
+			   const struct ip_options *sopt,
+			   __be32 daddr, __be32 saddr,
+			   const struct ip_reply_arg *arg,
+			   unsigned int len, u64 transmit_time, u32 txhash)
+{
+	struct ip_options_data replyopts;
+	struct ipcm_cookie ipc;
+	struct flowi4 fl4;
+	struct rtable *rt = skb_rtable(skb);
+	struct net *net = sock_net(sk);
+	struct sk_buff *nskb;
+	int err;
+	int oif;
+
+	if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
+		return;
+
+	ipcm_init(&ipc);
+	ipc.addr = daddr;
+	ipc.sockc.transmit_time = transmit_time;
+
+	if (replyopts.opt.opt.optlen) {
+		ipc.opt = &replyopts.opt;
+
+		if (replyopts.opt.opt.srr)
+			daddr = replyopts.opt.opt.faddr;
+	}
+	/* Use the bound device, else skb_iif when that index is an l3 master. */
+	oif = arg->bound_dev_if;
+	if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
+		oif = skb->skb_iif;
+
+	flowi4_init_output(&fl4, oif,
+			   IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
+			   RT_TOS(arg->tos),
+			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
+			   ip_reply_arg_flowi_flags(arg),
+			   daddr, saddr,
+			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
+			   arg->uid);
+	security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
+	rt = ip_route_output_flow(net, &fl4, sk);
+	if (IS_ERR(rt))
+		return;
+	/* Load the per-reply settings into @sk before queueing the data. */
+	inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK;
+
+	sk->sk_protocol = ip_hdr(skb)->protocol;
+	sk->sk_bound_dev_if = arg->bound_dev_if;
+	sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
+	ipc.sockc.mark = fl4.flowi4_mark;
+	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
+			     len, 0, &ipc, &rt, MSG_DONTWAIT);
+	if (unlikely(err)) {
+		ip_flush_pending_frames(sk);
+		goto out;
+	}
+	/* Store the checksum in the first queued skb if asked, then push. */
+	nskb = skb_peek(&sk->sk_write_queue);
+	if (nskb) {
+		if (arg->csumoffset >= 0)
+			*((__sum16 *)skb_transport_header(nskb) +
+			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
+								arg->csum));
+		nskb->ip_summed = CHECKSUM_NONE;
+		nskb->mono_delivery_time = !!transmit_time;
+		if (txhash)
+			skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4);
+		ip_push_pending_frames(sk, &fl4);
+	}
+out:
+	ip_rt_put(rt);
+}
+
+/* Init-time hook: runs the routing, inet-peer and (if configured) IGMP setup. */
+void __init ip_init(void)
+{
+	ip_rt_init();
+	inet_initpeers();
+#ifdef CONFIG_IP_MULTICAST
+	igmp_mc_init();
+#endif
+}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
new file mode 100644
index 0000000000..cce9cb25f3
--- /dev/null
+++ b/net/ipv4/ip_sockglue.c
@@ -0,0 +1,1796 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The IP to API glue.
+ *
+ * Authors:	see ip.c
+ *
+ * Fixes:
+ *		Many		:	Split from ip.c , see ip.c for history.
+ *		Martin Mares	:	TOS setting fixed.
+ *		Alan Cox	:	Fixed a couple of oopses in Martin's
+ *					TOS tweaks.
+ *		Mike McLagan	:	Routing by source
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp_states.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/netfilter.h>
+#include <linux/route.h>
+#include <linux/mroute.h>
+#include <net/inet_ecn.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <net/compat.h>
+#include <net/checksum.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/transp_v6.h>
+#endif
+#include <net/ip_fib.h>
+
+#include <linux/errqueue.h>
+#include <linux/uaccess.h>
+
+#include <linux/bpfilter.h>
+
+/*
+ *	SOL_IP control messages.
+ */
+
+/* Emit IP_PKTINFO: the pktinfo kept in skb->cb with ipi_addr = header daddr. */
+static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct in_pktinfo pktinfo = *PKTINFO_SKB_CB(skb);
+
+	pktinfo.ipi_addr.s_addr = ip_hdr(skb)->daddr;
+	put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(pktinfo), &pktinfo);
+}
+
+static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
+{
+	int hops = ip_hdr(skb)->ttl;	/* IP_TTL is delivered as an int */
+	put_cmsg(msg, SOL_IP, IP_TTL, sizeof(hops), &hops);
+}
+
+static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
+{
+	put_cmsg(msg, SOL_IP, IP_TOS, sizeof(u8), &ip_hdr(skb)->tos); /* 1 byte */
+}
+
+/* Emit IP_RECVOPTS carrying the optlen bytes that follow the IP header. */
+static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
+{
+	int optlen = IPCB(skb)->opt.optlen;
+
+	if (optlen)
+		put_cmsg(msg, SOL_IP, IP_RECVOPTS, optlen, ip_hdr(skb) + 1);
+}
+
+
+/* Emit IP_RETOPTS: echo the skb's options into a scratch buffer, run
+ * ip_options_undo() on them and pass them up; MSG_CTRUNC if the echo fails. */
+static void ip_cmsg_recv_retopts(struct net *net, struct msghdr *msg,
+				 struct sk_buff *skb)
+{
+	unsigned char scratch[sizeof(struct ip_options) + 40];
+	struct ip_options *echo = (struct ip_options *)scratch;
+
+	if (!IPCB(skb)->opt.optlen)
+		return;
+	if (!ip_options_echo(net, echo, skb)) {
+		ip_options_undo(echo);
+		put_cmsg(msg, SOL_IP, IP_RETOPTS, echo->optlen, echo->__data);
+		return;
+	}
+	msg->msg_flags |= MSG_CTRUNC;
+}
+
+/* Emit IP_RECVFRAGSIZE with the skb's recorded frag_max_size, if non-zero. */
+static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb)
+{
+	int max_frag = IPCB(skb)->frag_max_size;
+
+	if (!max_frag)
+		return;
+
+	put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(max_frag), &max_frag);
+}
+
+/* Emit IP_CHECKSUM (CHECKSUM_COMPLETE only), less @offset bytes past @tlen. */
+static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
+				  int tlen, int offset)
+{
+	__wsum sum = skb->csum;
+
+	if (skb->ip_summed != CHECKSUM_COMPLETE)
+		return;
+	if (offset) {
+		int skip_from = skb_transport_offset(skb) + tlen;
+
+		sum = csum_sub(sum, skb_checksum(skb, skip_from, offset, 0));
+	}
+	put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(sum), &sum);
+}
+
+/* Emit SCM_SECURITY with the security context of the skb's peer secid;
+ * silently does nothing if either security lookup fails.
+ */
+static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
+{
+	u32 secid, ctxlen;
+	char *ctx;
+
+	if (security_socket_getpeersec_dgram(NULL, skb, &secid))
+		return;
+	if (security_secid_to_secctx(secid, &ctx, &ctxlen))
+		return;
+
+	put_cmsg(msg, SOL_IP, SCM_SECURITY, ctxlen, ctx);
+	/* Hand the context back once it has been copied out. */
+	security_release_secctx(ctx, ctxlen);
+}
+
+/* Emit IP_ORIGDSTADDR: header daddr plus the second 16-bit transport field. */
+static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct sockaddr_in orig_dst;
+	__be16 buf[2], *ports;
+
+	/* Relies on every current transport protocol keeping its port
+	 * numbers in the first four bytes of the transport header.
+	 */
+	ports = skb_header_pointer(skb, skb_transport_offset(skb),
+				   sizeof(buf), &buf);
+	if (!ports)
+		return;
+
+	memset(orig_dst.sin_zero, 0, sizeof(orig_dst.sin_zero));
+	orig_dst.sin_family = AF_INET;
+	orig_dst.sin_port = ports[1];
+	orig_dst.sin_addr.s_addr = ip_hdr(skb)->daddr;
+
+	put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(orig_dst), &orig_dst);
+}
+
+/* Emit into @msg the SOL_IP control messages that inet_cmsg_flags() reports
+ * as requested on @sk, using data from @skb. @tlen and @offset are passed
+ * through to the IP_CHECKSUM handler only.
+ */
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
+			 struct sk_buff *skb, int tlen, int offset)
+{
+	unsigned long todo = inet_cmsg_flags(inet_sk(sk));
+
+	/* Handlers are ordered by supposed usage frequency. Each one clears
+	 * its bit so the walk stops as soon as nothing is left to emit.
+	 */
+	if (!todo)
+		return;
+
+	if (todo & IP_CMSG_PKTINFO) {
+		ip_cmsg_recv_pktinfo(msg, skb);
+		todo &= ~IP_CMSG_PKTINFO;
+		if (!todo)
+			return;
+	}
+
+	if (todo & IP_CMSG_TTL) {
+		ip_cmsg_recv_ttl(msg, skb);
+		todo &= ~IP_CMSG_TTL;
+		if (!todo)
+			return;
+	}
+
+	if (todo & IP_CMSG_TOS) {
+		ip_cmsg_recv_tos(msg, skb);
+		todo &= ~IP_CMSG_TOS;
+		if (!todo)
+			return;
+	}
+
+	if (todo & IP_CMSG_RECVOPTS) {
+		ip_cmsg_recv_opts(msg, skb);
+		todo &= ~IP_CMSG_RECVOPTS;
+		if (!todo)
+			return;
+	}
+
+	if (todo & IP_CMSG_RETOPTS) {
+		ip_cmsg_recv_retopts(sock_net(sk), msg, skb);
+		todo &= ~IP_CMSG_RETOPTS;
+		if (!todo)
+			return;
+	}
+
+	if (todo & IP_CMSG_PASSSEC) {
+		ip_cmsg_recv_security(msg, skb);
+		todo &= ~IP_CMSG_PASSSEC;
+		if (!todo)
+			return;
+	}
+
+	if (todo & IP_CMSG_ORIGDSTADDR) {
+		ip_cmsg_recv_dstaddr(msg, skb);
+		todo &= ~IP_CMSG_ORIGDSTADDR;
+		if (!todo)
+			return;
+	}
+
+	/* The last two need no early exit. */
+	if (todo & IP_CMSG_CHECKSUM)
+		ip_cmsg_recv_checksum(msg, skb, tlen, offset);
+
+	if (todo & IP_CMSG_RECVFRAGSIZE)
+		ip_cmsg_recv_fragsize(msg, skb);
+}
+EXPORT_SYMBOL(ip_cmsg_recv_offset);
+
+int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
+		 bool allow_ipv6)
+{
+	int err, val;
+	struct cmsghdr *cmsg;
+	struct net *net = sock_net(sk);
+
+	for_each_cmsghdr(cmsg, msg) {	/* fold each control message into @ipc */
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+#if IS_ENABLED(CONFIG_IPV6)
+		if (allow_ipv6 &&
+		    cmsg->cmsg_level == SOL_IPV6 &&
+		    cmsg->cmsg_type == IPV6_PKTINFO) {
+			struct in6_pktinfo *src_info;
+
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info)))
+				return -EINVAL;
+			src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+			if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
+				return -EINVAL;
+			if (src_info->ipi6_ifindex)
+				ipc->oif = src_info->ipi6_ifindex;
+			ipc->addr = src_info->ipi6_addr.s6_addr32[3];	/* last 32 bits of the v4-mapped address */
+			continue;
+		}
+#endif
+		if (cmsg->cmsg_level == SOL_SOCKET) {
+			err = __sock_cmsg_send(sk, cmsg, &ipc->sockc);
+			if (err)
+				return err;
+			continue;
+		}
+		/* Anything that is neither SOL_SOCKET nor SOL_IP is skipped. */
+		if (cmsg->cmsg_level != SOL_IP)
+			continue;
+		switch (cmsg->cmsg_type) {
+		case IP_RETOPTS:
+			err = cmsg->cmsg_len - sizeof(struct cmsghdr);
+			/* err holds the payload length here, capped at 40 below.
+			 * Our caller is responsible for freeing ipc->opt */
+			err = ip_options_get(net, &ipc->opt,
+					     KERNEL_SOCKPTR(CMSG_DATA(cmsg)),
+					     err < 40 ? err : 40);
+			if (err)
+				return err;
+			break;
+		case IP_PKTINFO:
+		{
+			struct in_pktinfo *info;
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
+				return -EINVAL;
+			info = (struct in_pktinfo *)CMSG_DATA(cmsg);
+			if (info->ipi_ifindex)
+				ipc->oif = info->ipi_ifindex;
+			ipc->addr = info->ipi_spec_dst.s_addr;
+			break;
+		}
+		case IP_TTL:	/* int payload, accepted range 1..255 */
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+				return -EINVAL;
+			val = *(int *)CMSG_DATA(cmsg);
+			if (val < 1 || val > 255)
+				return -EINVAL;
+			ipc->ttl = val;
+			break;
+		case IP_TOS:	/* int or u8 payload, accepted range 0..255 */
+			if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)))
+				val = *(int *)CMSG_DATA(cmsg);
+			else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8)))
+				val = *(u8 *)CMSG_DATA(cmsg);
+			else
+				return -EINVAL;
+			if (val < 0 || val > 255)
+				return -EINVAL;
+			ipc->tos = val;
+			ipc->priority = rt_tos2priority(ipc->tos);
+			break;
+		case IP_PROTOCOL:	/* int payload, accepted range 1..255 */
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+				return -EINVAL;
+			val = *(int *)CMSG_DATA(cmsg);
+			if (val < 1 || val > 255)
+				return -EINVAL;
+			ipc->protocol = val;
+			break;
+		default:	/* unknown SOL_IP control message type */
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static void ip_ra_destroy_rcu(struct rcu_head *head)
+{
+	struct ip_ra_chain *entry = container_of(head, struct ip_ra_chain, rcu);
+
+	sock_put(entry->saved_sk);	/* socket stashed by ip_ra_control() */
+	kfree(entry);
+}
+
+int ip_ra_control(struct sock *sk, unsigned char on,
+		  void (*destructor)(struct sock *))
+{
+	struct ip_ra_chain *ra, *new_ra;
+	struct ip_ra_chain __rcu **rap;
+	struct net *net = sock_net(sk);
+
+	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
+		return -EINVAL;
+
+	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+	if (on && !new_ra)
+		return -ENOMEM;
+	/* Under ra_mutex: @on adds @sk to the netns ra_chain, !@on removes it. */
+	mutex_lock(&net->ipv4.ra_mutex);
+	for (rap = &net->ipv4.ra_chain;
+	     (ra = rcu_dereference_protected(*rap,
+			lockdep_is_held(&net->ipv4.ra_mutex))) != NULL;
+	     rap = &ra->next) {
+		if (ra->sk == sk) {
+			if (on) {	/* @sk is already on the chain */
+				mutex_unlock(&net->ipv4.ra_mutex);
+				kfree(new_ra);
+				return -EADDRINUSE;
+			}
+			/* don't let ip_call_ra_chain() use sk again */
+			ra->sk = NULL;
+			RCU_INIT_POINTER(*rap, ra->next);
+			mutex_unlock(&net->ipv4.ra_mutex);
+
+			if (ra->destructor)
+				ra->destructor(sk);
+			/*
+			 * Delay sock_put(sk) and kfree(ra) until after one rcu
+			 * grace period. This guarantees ip_call_ra_chain() does
+			 * not need to mess with socket refcounts.
+			 */
+			ra->saved_sk = sk;
+			call_rcu(&ra->rcu, ip_ra_destroy_rcu);
+			return 0;
+		}
+	}
+	if (!new_ra) {	/* removal requested but @sk is not on the chain */
+		mutex_unlock(&net->ipv4.ra_mutex);
+		return -ENOBUFS;
+	}
+	new_ra->sk = sk;
+	new_ra->destructor = destructor;
+	/* ra is NULL here (end of chain): link the new entry in at the tail. */
+	RCU_INIT_POINTER(new_ra->next, ra);
+	rcu_assign_pointer(*rap, new_ra);
+	sock_hold(sk);
+	mutex_unlock(&net->ipv4.ra_mutex);
+
+	return 0;
+}
+
+static void ipv4_icmp_error_rfc4884(const struct sk_buff *skb,
+				    struct sock_ee_data_rfc4884 *out)
+{
+	const struct icmphdr *icmph = icmp_hdr(skb);
+
+	/* Other ICMP types are left alone. */
+	if (icmph->type == ICMP_DEST_UNREACH ||
+	    icmph->type == ICMP_TIME_EXCEEDED ||
+	    icmph->type == ICMP_PARAMETERPROB)
+		ip_icmp_error_rfc4884(skb, out, sizeof(*icmph), icmph->un.reserved[1] * 4);
+}
+
+void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+		   __be16 port, u32 info, u8 *payload)
+{
+	struct sock_exterr_skb *serr;
+
+	skb = skb_clone(skb, GFP_ATOMIC);	/* from here on, work on the clone */
+	if (!skb)
+		return;
+	/* Fill the clone's extended error record from its ICMP header. */
+	serr = SKB_EXT_ERR(skb);
+	serr->ee.ee_errno = err;
+	serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
+	serr->ee.ee_type = icmp_hdr(skb)->type;
+	serr->ee.ee_code = icmp_hdr(skb)->code;
+	serr->ee.ee_pad = 0;
+	serr->ee.ee_info = info;
+	serr->ee.ee_data = 0;
+	serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) -
+				   skb_network_header(skb);
+	serr->port = port;
+	/* Advance data to @payload; if that or the queueing fails, free the clone. */
+	if (skb_pull(skb, payload - skb->data)) {
+		if (inet_test_bit(RECVERR_RFC4884, sk))
+			ipv4_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884);
+
+		skb_reset_transport_header(skb);
+		if (sock_queue_err_skb(sk, skb) == 0)
+			return;
+	}
+	kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_icmp_error);
+
+void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info)
+{	/* Queue a locally generated error (SO_EE_ORIGIN_LOCAL) on @sk's error queue. */
+	struct sock_exterr_skb *serr;
+	struct iphdr *iph;
+	struct sk_buff *skb;
+
+	if (!inet_test_bit(RECVERR, sk))	/* nothing is queued unless RECVERR is set */
+		return;
+
+	skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	skb_put(skb, sizeof(struct iphdr));
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	iph->daddr = daddr;	/* the only iphdr field written here */
+
+	serr = SKB_EXT_ERR(skb);
+	serr->ee.ee_errno = err;
+	serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+	serr->ee.ee_type = 0;
+	serr->ee.ee_code = 0;
+	serr->ee.ee_pad = 0;
+	serr->ee.ee_info = info;
+	serr->ee.ee_data = 0;
+	serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
+	serr->port = port;
+
+	__skb_pull(skb, skb_tail_pointer(skb) - skb->data);	/* pull everything up to the tail */
+	skb_reset_transport_header(skb);
+
+	if (sock_queue_err_skb(sk, skb))
+		kfree_skb(skb);	/* queueing failed: free the skb here */
+}
+
+/* Return true when serr->addr_offset may be used: always for ICMP and LOCAL
+ * origins (even with zero payload and zero port), otherwise only if port is set.
+ */
+static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr)
+{
+	return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+	       serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port;
+}
+
+/* IPv4 supports cmsg on all ICMP errors and some timestamps; returns true
+ * when cmsg data may be generated for @skb, false for LOCAL-origin errors.
+ * Timestamp code paths do not initialize the fields expected by cmsg:
+ * the PKTINFO fields in skb->cb[]. Fill those in here.
+ */
+static bool ipv4_datagram_support_cmsg(const struct sock *sk,
+				       struct sk_buff *skb,
+				       int ee_origin)
+{
+	struct in_pktinfo *info;
+
+	if (ee_origin == SO_EE_ORIGIN_ICMP)
+		return true;
+
+	if (ee_origin == SO_EE_ORIGIN_LOCAL)
+		return false;
+
+	/* Support IP_PKTINFO on tstamp packets if requested, to correlate
+	 * timestamp with egress dev. Not possible for packets without iif
+	 * or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
+	 */
+	info = PKTINFO_SKB_CB(skb);
+	if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) ||
+	    !info->ipi_ifindex)
+		return false;
+
+	info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;	/* fill the PKTINFO field left unset */
+	return true;
+}
+
+/*
+ *	Handle MSG_ERRQUEUE: returns bytes copied, -EAGAIN if the queue is empty, or a copy error
+ */
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
+{
+	struct sock_exterr_skb *serr;
+	struct sk_buff *skb;
+	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+	struct {
+		struct sock_extended_err ee;
+		struct sockaddr_in	 offender;
+	} errhdr;	/* payload of the IP_RECVERR control message */
+	int err;
+	int copied;
+
+	err = -EAGAIN;
+	skb = sock_dequeue_err_skb(sk);
+	if (!skb)
+		goto out;
+
+	copied = skb->len;
+	if (copied > len) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;	/* clamp to the caller's buffer length */
+	}
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
+	if (unlikely(err)) {
+		kfree_skb(skb);
+		return err;
+	}
+	sock_recv_timestamp(msg, sk, skb);
+
+	serr = SKB_EXT_ERR(skb);
+
+	if (sin && ipv4_datagram_support_addr(serr)) {	/* fill msg_name when usable */
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
+						   serr->addr_offset);
+		sin->sin_port = serr->port;
+		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+		*addr_len = sizeof(*sin);
+	}
+
+	memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
+	sin = &errhdr.offender;	/* @sin is reused for the offender address in the cmsg */
+	memset(sin, 0, sizeof(*sin));
+
+	if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) {
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+		if (inet_cmsg_flags(inet_sk(sk)))
+			ip_cmsg_recv(msg, skb);
+	}
+
+	put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr);
+
+	/* Now we could try to dump offended packet options */
+
+	msg->msg_flags |= MSG_ERRQUEUE;
+	err = copied;	/* success: report the number of payload bytes copied */
+
+	consume_skb(skb);
+out:
+	return err;
+}
+
+void __ip_sock_set_tos(struct sock *sk, int val)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (sk->sk_type == SOCK_STREAM)	/* stream sockets keep their current ECN bits */
+		val = (val & ~INET_ECN_MASK) | (inet->tos & INET_ECN_MASK);
+	if (inet->tos == val)
+		return;	/* unchanged: leave priority and cached dst alone */
+	inet->tos = val;
+	WRITE_ONCE(sk->sk_priority, rt_tos2priority(val));
+	sk_dst_reset(sk);
+}
+
+void ip_sock_set_tos(struct sock *sk, int val)
+{	/* Wrapper that calls __ip_sock_set_tos() with the socket locked. */
+	lock_sock(sk);
+	__ip_sock_set_tos(sk, val);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(ip_sock_set_tos);
+
+void ip_sock_set_freebind(struct sock *sk)
+{
+	inet_set_bit(FREEBIND, sk);	/* set the FREEBIND flag bit on @sk; no socket lock taken */
+}
+EXPORT_SYMBOL(ip_sock_set_freebind);
+
+void ip_sock_set_recverr(struct sock *sk)
+{
+	inet_set_bit(RECVERR, sk);	/* set the RECVERR flag bit on @sk; no socket lock taken */
+}
+EXPORT_SYMBOL(ip_sock_set_recverr);
+
+int ip_sock_set_mtu_discover(struct sock *sk, int val)
+{	/* Store @val in inet->pmtudisc under the socket lock; returns 0 or -EINVAL. */
+	if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)	/* range check before locking */
+		return -EINVAL;
+	lock_sock(sk);
+	inet_sk(sk)->pmtudisc = val;
+	release_sock(sk);
+	return 0;
+}
+EXPORT_SYMBOL(ip_sock_set_mtu_discover);
+
+void ip_sock_set_pktinfo(struct sock *sk)
+{
+	inet_set_bit(PKTINFO, sk);	/* set the PKTINFO flag bit on @sk; no socket lock taken */
+}
+EXPORT_SYMBOL(ip_sock_set_pktinfo);
+
+/*
+ *	Socket option code for IP. This is the end of the line after any
+ *	TCP,UDP etc options on an IP socket.
+ */
+static bool setsockopt_needs_rtnl(int optname)
+{	/* True for the options that do_ip_setsockopt() handles under rtnl_lock(). */
+	switch (optname) {
+	case IP_ADD_MEMBERSHIP:
+	case IP_ADD_SOURCE_MEMBERSHIP:
+	case IP_BLOCK_SOURCE:
+	case IP_DROP_MEMBERSHIP:
+	case IP_DROP_SOURCE_MEMBERSHIP:
+	case IP_MSFILTER:
+	case IP_UNBLOCK_SOURCE:
+	case MCAST_BLOCK_SOURCE:
+	case MCAST_MSFILTER:
+	case MCAST_JOIN_GROUP:
+	case MCAST_JOIN_SOURCE_GROUP:
+	case MCAST_LEAVE_GROUP:
+	case MCAST_LEAVE_SOURCE_GROUP:
+	case MCAST_UNBLOCK_SOURCE:
+		return true;	/* multicast membership and source-filter options */
+	}
+	return false;	/* every other option */
+}
+
+static int set_mcast_msfilter(struct sock *sk, int ifindex,
+			      int numsrc, int fmode,
+			      struct sockaddr_storage *group,
+			      struct sockaddr_storage *list)
+{	/* Build an ip_msfilter from @group and @list[0..numsrc) and pass it to ip_mc_msfilter(). */
+	struct ip_msfilter *msf;
+	struct sockaddr_in *psin;
+	int err, i;
+
+	msf = kmalloc(IP_MSFILTER_SIZE(numsrc), GFP_KERNEL);
+	if (!msf)
+		return -ENOBUFS;
+
+	psin = (struct sockaddr_in *)group;
+	if (psin->sin_family != AF_INET)
+		goto Eaddrnotavail;
+	msf->imsf_multiaddr = psin->sin_addr.s_addr;
+	msf->imsf_interface = 0;	/* @ifindex is handed to ip_mc_msfilter() instead */
+	msf->imsf_fmode = fmode;
+	msf->imsf_numsrc = numsrc;
+	for (i = 0; i < numsrc; ++i) {
+		psin = (struct sockaddr_in *)&list[i];
+
+		if (psin->sin_family != AF_INET)
+			goto Eaddrnotavail;
+		msf->imsf_slist_flex[i] = psin->sin_addr.s_addr;
+	}
+	err = ip_mc_msfilter(sk, msf, ifindex);
+	kfree(msf);	/* @msf is only a temporary; freed on every path */
+	return err;
+
+Eaddrnotavail:	/* group or one of the sources is not AF_INET */
+	kfree(msf);
+	return -EADDRNOTAVAIL;
+}
+
+static int copy_group_source_from_sockptr(struct group_source_req *greqs,
+		sockptr_t optval, int optlen)
+{	/* Copy a group_source_req from @optval, converting from the compat layout if needed. */
+	if (in_compat_syscall()) {
+		struct compat_group_source_req gr32;
+
+		if (optlen != sizeof(gr32))	/* exact size match required */
+			return -EINVAL;
+		if (copy_from_sockptr(&gr32, optval, sizeof(gr32)))
+			return -EFAULT;
+		greqs->gsr_interface = gr32.gsr_interface;
+		greqs->gsr_group = gr32.gsr_group;
+		greqs->gsr_source = gr32.gsr_source;
+	} else {
+		if (optlen != sizeof(*greqs))	/* exact size match required */
+			return -EINVAL;
+		if (copy_from_sockptr(greqs, optval, sizeof(*greqs)))
+			return -EFAULT;
+	}
+
+	return 0;	/* @greqs is fully populated */
+}
+
+static int do_mcast_group_source(struct sock *sk, int optname,
+		sockptr_t optval, int optlen)
+{	/* Handle MCAST_{BLOCK,UNBLOCK}_SOURCE and MCAST_{JOIN,LEAVE}_SOURCE_GROUP. */
+	struct group_source_req greqs;
+	struct ip_mreq_source mreqs;
+	struct sockaddr_in *psin;
+	int omode, add, err;
+
+	err = copy_group_source_from_sockptr(&greqs, optval, optlen);
+	if (err)
+		return err;
+
+	if (greqs.gsr_group.ss_family != AF_INET ||
+	    greqs.gsr_source.ss_family != AF_INET)
+		return -EADDRNOTAVAIL;
+
+	psin = (struct sockaddr_in *)&greqs.gsr_group;
+	mreqs.imr_multiaddr = psin->sin_addr.s_addr;
+	psin = (struct sockaddr_in *)&greqs.gsr_source;
+	mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
+	mreqs.imr_interface = 0; /* use index for mc_source */
+
+	if (optname == MCAST_BLOCK_SOURCE) {
+		omode = MCAST_EXCLUDE;
+		add = 1;
+	} else if (optname == MCAST_UNBLOCK_SOURCE) {
+		omode = MCAST_EXCLUDE;
+		add = 0;
+	} else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+		struct ip_mreqn mreq;
+
+		psin = (struct sockaddr_in *)&greqs.gsr_group;
+		mreq.imr_multiaddr = psin->sin_addr;
+		mreq.imr_address.s_addr = 0;
+		mreq.imr_ifindex = greqs.gsr_interface;
+		err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
+		if (err && err != -EADDRINUSE)	/* -EADDRINUSE is not treated as failure here */
+			return err;
+		greqs.gsr_interface = mreq.imr_ifindex;	/* carry the join's ifindex forward */
+		omode = MCAST_INCLUDE;
+		add = 1;
+	} else /* MCAST_LEAVE_SOURCE_GROUP */ {
+		omode = MCAST_INCLUDE;
+		add = 0;
+	}
+	return ip_mc_source(add, omode, sk, &mreqs, greqs.gsr_interface);
+}
+
+static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen)
+{	/* MCAST_MSFILTER set, native layout: validate the user's group_filter and apply it. */
+	struct group_filter *gsf = NULL;
+	int err;
+
+	if (optlen < GROUP_FILTER_SIZE(0))
+		return -EINVAL;
+	if (optlen > READ_ONCE(sysctl_optmem_max))
+		return -ENOBUFS;
+
+	gsf = memdup_sockptr(optval, optlen);
+	if (IS_ERR(gsf))
+		return PTR_ERR(gsf);
+
+	/* numsrc >= (4G-140)/128 overflow in 32 bits */
+	err = -ENOBUFS;
+	if (gsf->gf_numsrc >= 0x1ffffff ||
+	    gsf->gf_numsrc > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf))
+		goto out_free_gsf;
+
+	err = -EINVAL;
+	if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen)	/* claimed sources exceed what was copied */
+		goto out_free_gsf;
+
+	err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc,
+				 gsf->gf_fmode, &gsf->gf_group,
+				 gsf->gf_slist_flex);
+out_free_gsf:
+	kfree(gsf);
+	return err;
+}
+
+static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
+		int optlen)
+{	/* MCAST_MSFILTER set, compat layout: validate the compat_group_filter and apply it. */
+	const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
+	struct compat_group_filter *gf32;
+	unsigned int n;
+	void *p;
+	int err;
+
+	if (optlen < size0)
+		return -EINVAL;
+	if (optlen > READ_ONCE(sysctl_optmem_max) - 4)	/* 4 extra bytes are allocated below */
+		return -ENOBUFS;
+
+	p = kmalloc(optlen + 4, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */
+
+	err = -EFAULT;
+	if (copy_from_sockptr(gf32, optval, optlen))
+		goto out_free_gsf;
+
+	/* numsrc >= (4G-140)/128 overflow in 32 bits */
+	n = gf32->gf_numsrc;
+	err = -ENOBUFS;
+	if (n >= 0x1ffffff)
+		goto out_free_gsf;
+
+	err = -EINVAL;
+	if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen)
+		goto out_free_gsf;
+
+	/* enforce the per-netns sysctl_igmp_max_msf limit on the source count */
+	err = -ENOBUFS;
+	if (n > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf))
+		goto out_free_gsf;
+	err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode,
+				 &gf32->gf_group, gf32->gf_slist_flex);
+out_free_gsf:
+	kfree(p);	/* free the allocation base, not the offset @gf32 pointer */
+	return err;
+}
+
+static int ip_mcast_join_leave(struct sock *sk, int optname,
+		sockptr_t optval, int optlen)
+{	/* MCAST_JOIN_GROUP / MCAST_LEAVE_GROUP with the native group_req layout. */
+	struct ip_mreqn mreq = { };
+	struct sockaddr_in *psin;
+	struct group_req greq;
+
+	if (optlen < sizeof(struct group_req))
+		return -EINVAL;
+	if (copy_from_sockptr(&greq, optval, sizeof(greq)))
+		return -EFAULT;
+
+	psin = (struct sockaddr_in *)&greq.gr_group;
+	if (psin->sin_family != AF_INET)
+		return -EINVAL;
+	mreq.imr_multiaddr = psin->sin_addr;
+	mreq.imr_ifindex = greq.gr_interface;	/* imr_address stays zero from the initializer */
+	if (optname == MCAST_JOIN_GROUP)
+		return ip_mc_join_group(sk, &mreq);
+	return ip_mc_leave_group(sk, &mreq);	/* any other optname is treated as a leave */
+}
+
+static int compat_ip_mcast_join_leave(struct sock *sk, int optname,
+		sockptr_t optval, int optlen)
+{	/* MCAST_JOIN_GROUP / MCAST_LEAVE_GROUP with the compat_group_req layout. */
+	struct compat_group_req greq;
+	struct ip_mreqn mreq = { };
+	struct sockaddr_in *psin;
+
+	if (optlen < sizeof(struct compat_group_req))
+		return -EINVAL;
+	if (copy_from_sockptr(&greq, optval, sizeof(greq)))
+		return -EFAULT;
+
+	psin = (struct sockaddr_in *)&greq.gr_group;
+	if (psin->sin_family != AF_INET)
+		return -EINVAL;
+	mreq.imr_multiaddr = psin->sin_addr;
+	mreq.imr_ifindex = greq.gr_interface;	/* imr_address stays zero from the initializer */
+
+	if (optname == MCAST_JOIN_GROUP)
+		return ip_mc_join_group(sk, &mreq);
+	return ip_mc_leave_group(sk, &mreq);	/* any other optname is treated as a leave */
+}
+
+DEFINE_STATIC_KEY_FALSE(ip4_min_ttl);
+
+int do_ip_setsockopt(struct sock *sk, int level, int optname,
+		     sockptr_t optval, unsigned int optlen)
+{	/* Set an IPv4 socket option on @sk; returns 0 on success, an error code otherwise. */
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	int val = 0, err;
+	bool needs_rtnl = setsockopt_needs_rtnl(optname);
+
+	switch (optname) {	/* options whose argument is read into @val up front */
+	case IP_PKTINFO:
+	case IP_RECVTTL:
+	case IP_RECVOPTS:
+	case IP_RECVTOS:
+	case IP_RETOPTS:
+	case IP_TOS:
+	case IP_TTL:
+	case IP_HDRINCL:
+	case IP_MTU_DISCOVER:
+	case IP_RECVERR:
+	case IP_ROUTER_ALERT:
+	case IP_FREEBIND:
+	case IP_PASSSEC:
+	case IP_TRANSPARENT:
+	case IP_MINTTL:
+	case IP_NODEFRAG:
+	case IP_BIND_ADDRESS_NO_PORT:
+	case IP_UNICAST_IF:
+	case IP_MULTICAST_TTL:
+	case IP_MULTICAST_ALL:
+	case IP_MULTICAST_LOOP:
+	case IP_RECVORIGDSTADDR:
+	case IP_CHECKSUM:
+	case IP_RECVFRAGSIZE:
+	case IP_RECVERR_RFC4884:
+	case IP_LOCAL_PORT_RANGE:
+		if (optlen >= sizeof(int)) {
+			if (copy_from_sockptr(&val, optval, sizeof(val)))
+				return -EFAULT;
+		} else if (optlen >= sizeof(char)) {	/* shorter argument: read a single byte */
+			unsigned char ucval;
+
+			if (copy_from_sockptr(&ucval, optval, sizeof(ucval)))
+				return -EFAULT;
+			val = (int) ucval;
+		}
+	}
+
+	/* If optlen==0, it is equivalent to val == 0 */
+
+	if (optname == IP_ROUTER_ALERT)
+		return ip_ra_control(sk, val ? 1 : 0, NULL);
+	if (ip_mroute_opt(optname))
+		return ip_mroute_setsockopt(sk, optname, optval, optlen);
+
+	/* Handle options that can be set without locking the socket. */
+	switch (optname) {
+	case IP_PKTINFO:
+		inet_assign_bit(PKTINFO, sk, val);
+		return 0;
+	case IP_RECVTTL:
+		inet_assign_bit(TTL, sk, val);
+		return 0;
+	case IP_RECVTOS:
+		inet_assign_bit(TOS, sk, val);
+		return 0;
+	case IP_RECVOPTS:
+		inet_assign_bit(RECVOPTS, sk, val);
+		return 0;
+	case IP_RETOPTS:
+		inet_assign_bit(RETOPTS, sk, val);
+		return 0;
+	case IP_PASSSEC:
+		inet_assign_bit(PASSSEC, sk, val);
+		return 0;
+	case IP_RECVORIGDSTADDR:
+		inet_assign_bit(ORIGDSTADDR, sk, val);
+		return 0;
+	case IP_RECVFRAGSIZE:
+		if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
+			return -EINVAL;
+		inet_assign_bit(RECVFRAGSIZE, sk, val);
+		return 0;
+	case IP_RECVERR:
+		inet_assign_bit(RECVERR, sk, val);
+		if (!val)	/* turning RECVERR off also purges the error queue */
+			skb_errqueue_purge(&sk->sk_error_queue);
+		return 0;
+	case IP_RECVERR_RFC4884:
+		if (val < 0 || val > 1)
+			return -EINVAL;
+		inet_assign_bit(RECVERR_RFC4884, sk, val);
+		return 0;
+	case IP_FREEBIND:
+		if (optlen < 1)
+			return -EINVAL;
+		inet_assign_bit(FREEBIND, sk, val);
+		return 0;
+	case IP_HDRINCL:
+		if (sk->sk_type != SOCK_RAW)
+			return -ENOPROTOOPT;
+		inet_assign_bit(HDRINCL, sk, val);
+		return 0;
+	case IP_MULTICAST_LOOP:
+		if (optlen < 1)
+			return -EINVAL;
+		inet_assign_bit(MC_LOOP, sk, val);
+		return 0;
+	case IP_MULTICAST_ALL:
+		if (optlen < 1)
+			return -EINVAL;
+		if (val != 0 && val != 1)
+			return -EINVAL;
+		inet_assign_bit(MC_ALL, sk, val);
+		return 0;
+	case IP_TRANSPARENT:
+		if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+			return -EPERM;	/* enabling needs CAP_NET_RAW or CAP_NET_ADMIN */
+		if (optlen < 1)
+			return -EINVAL;
+		inet_assign_bit(TRANSPARENT, sk, val);
+		return 0;
+	case IP_NODEFRAG:
+		if (sk->sk_type != SOCK_RAW)
+			return -ENOPROTOOPT;
+		inet_assign_bit(NODEFRAG, sk, val);
+		return 0;
+	case IP_BIND_ADDRESS_NO_PORT:
+		inet_assign_bit(BIND_ADDRESS_NO_PORT, sk, val);
+		return 0;
+	case IP_TTL:
+		if (optlen < 1)
+			return -EINVAL;
+		if (val != -1 && (val < 1 || val > 255))	/* -1 or 1..255 accepted */
+			return -EINVAL;
+		WRITE_ONCE(inet->uc_ttl, val);
+		return 0;
+	case IP_MINTTL:
+		if (optlen < 1)
+			return -EINVAL;
+		if (val < 0 || val > 255)
+			return -EINVAL;
+
+		if (val)	/* the static key is only ever enabled here, never disabled */
+			static_branch_enable(&ip4_min_ttl);
+
+		WRITE_ONCE(inet->min_ttl, val);
+		return 0;
+	}
+
+	err = 0;
+	if (needs_rtnl)	/* rtnl is taken before the socket lock */
+		rtnl_lock();
+	sockopt_lock_sock(sk);
+
+	switch (optname) {	/* remaining options, handled with the socket locked */
+	case IP_OPTIONS:
+	{
+		struct ip_options_rcu *old, *opt = NULL;
+
+		if (optlen > 40)
+			goto e_inval;
+		err = ip_options_get(sock_net(sk), &opt, optval, optlen);
+		if (err)
+			break;
+		old = rcu_dereference_protected(inet->inet_opt,
+						lockdep_sock_is_held(sk));
+		if (inet_test_bit(IS_ICSK, sk)) {
+			struct inet_connection_sock *icsk = inet_csk(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+			if (sk->sk_family == PF_INET ||
+			    (!((1 << sk->sk_state) &
+			       (TCPF_LISTEN | TCPF_CLOSE)) &&
+			     inet->inet_daddr != LOOPBACK4_IPV6)) {
+#endif
+				if (old)
+					icsk->icsk_ext_hdr_len -= old->opt.optlen;
+				if (opt)
+					icsk->icsk_ext_hdr_len += opt->opt.optlen;
+				icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
+#if IS_ENABLED(CONFIG_IPV6)
+			}
+#endif
+		}
+		rcu_assign_pointer(inet->inet_opt, opt);
+		if (old)
+			kfree_rcu(old, rcu);	/* old options are freed via kfree_rcu() */
+		break;
+	}
+	case IP_CHECKSUM:
+		if (val) {
+			if (!(inet_test_bit(CHECKSUM, sk))) {
+				inet_inc_convert_csum(sk);
+				inet_set_bit(CHECKSUM, sk);
+			}
+		} else {
+			if (inet_test_bit(CHECKSUM, sk)) {
+				inet_dec_convert_csum(sk);
+				inet_clear_bit(CHECKSUM, sk);
+			}
+		}
+		break;
+	case IP_TOS:	/* This sets both TOS and Precedence */
+		__ip_sock_set_tos(sk, val);
+		break;
+	case IP_MTU_DISCOVER:
+		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
+			goto e_inval;
+		inet->pmtudisc = val;
+		break;
+	case IP_MULTICAST_TTL:
+		if (sk->sk_type == SOCK_STREAM)
+			goto e_inval;
+		if (optlen < 1)
+			goto e_inval;
+		if (val == -1)	/* -1 is stored as a TTL of 1 */
+			val = 1;
+		if (val < 0 || val > 255)
+			goto e_inval;
+		inet->mc_ttl = val;
+		break;
+	case IP_UNICAST_IF:
+	{
+		struct net_device *dev = NULL;
+		int ifindex;
+		int midx;
+
+		if (optlen != sizeof(int))
+			goto e_inval;
+
+		ifindex = (__force int)ntohl((__force __be32)val);	/* argument is big-endian */
+		if (ifindex == 0) {
+			inet->uc_index = 0;
+			err = 0;
+			break;
+		}
+
+		dev = dev_get_by_index(sock_net(sk), ifindex);
+		err = -EADDRNOTAVAIL;
+		if (!dev)
+			break;
+
+		midx = l3mdev_master_ifindex(dev);
+		dev_put(dev);
+
+		err = -EINVAL;
+		if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if)
+			break;
+
+		inet->uc_index = ifindex;
+		err = 0;
+		break;
+	}
+	case IP_MULTICAST_IF:
+	{
+		struct ip_mreqn mreq;
+		struct net_device *dev = NULL;
+		int midx;
+
+		if (sk->sk_type == SOCK_STREAM)
+			goto e_inval;
+		/*
+		 *	Check the arguments are allowable
+		 */
+
+		if (optlen < sizeof(struct in_addr))
+			goto e_inval;
+
+		err = -EFAULT;
+		if (optlen >= sizeof(struct ip_mreqn)) {
+			if (copy_from_sockptr(&mreq, optval, sizeof(mreq)))
+				break;
+		} else {
+			memset(&mreq, 0, sizeof(mreq));
+			if (optlen >= sizeof(struct ip_mreq)) {
+				if (copy_from_sockptr(&mreq, optval,
+						      sizeof(struct ip_mreq)))
+					break;
+			} else if (optlen >= sizeof(struct in_addr)) {
+				if (copy_from_sockptr(&mreq.imr_address, optval,
+						      sizeof(struct in_addr)))
+					break;
+			}
+		}
+
+		if (!mreq.imr_ifindex) {
+			if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) {
+				inet->mc_index = 0;
+				inet->mc_addr  = 0;
+				err = 0;
+				break;
+			}
+			dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr);
+			if (dev)
+				mreq.imr_ifindex = dev->ifindex;
+		} else
+			dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex);
+
+
+		err = -EADDRNOTAVAIL;
+		if (!dev)
+			break;
+
+		midx = l3mdev_master_ifindex(dev);
+
+		dev_put(dev);
+
+		err = -EINVAL;
+		if (sk->sk_bound_dev_if &&
+		    mreq.imr_ifindex != sk->sk_bound_dev_if &&
+		    midx != sk->sk_bound_dev_if)
+			break;
+
+		inet->mc_index = mreq.imr_ifindex;
+		inet->mc_addr  = mreq.imr_address.s_addr;
+		err = 0;
+		break;
+	}
+
+	case IP_ADD_MEMBERSHIP:
+	case IP_DROP_MEMBERSHIP:
+	{
+		struct ip_mreqn mreq;
+
+		err = -EPROTO;
+		if (inet_test_bit(IS_ICSK, sk))
+			break;
+
+		if (optlen < sizeof(struct ip_mreq))
+			goto e_inval;
+		err = -EFAULT;
+		if (optlen >= sizeof(struct ip_mreqn)) {
+			if (copy_from_sockptr(&mreq, optval, sizeof(mreq)))
+				break;
+		} else {
+			memset(&mreq, 0, sizeof(mreq));
+			if (copy_from_sockptr(&mreq, optval,
+					      sizeof(struct ip_mreq)))
+				break;
+		}
+
+		if (optname == IP_ADD_MEMBERSHIP)
+			err = ip_mc_join_group(sk, &mreq);
+		else
+			err = ip_mc_leave_group(sk, &mreq);
+		break;
+	}
+	case IP_MSFILTER:
+	{
+		struct ip_msfilter *msf;
+
+		if (optlen < IP_MSFILTER_SIZE(0))
+			goto e_inval;
+		if (optlen > READ_ONCE(sysctl_optmem_max)) {
+			err = -ENOBUFS;
+			break;
+		}
+		msf = memdup_sockptr(optval, optlen);
+		if (IS_ERR(msf)) {
+			err = PTR_ERR(msf);
+			break;
+		}
+		/* numsrc >= (1G-4) overflow in 32 bits */
+		if (msf->imsf_numsrc >= 0x3ffffffcU ||
+		    msf->imsf_numsrc > READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) {
+			kfree(msf);
+			err = -ENOBUFS;
+			break;
+		}
+		if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
+			kfree(msf);
+			err = -EINVAL;
+			break;
+		}
+		err = ip_mc_msfilter(sk, msf, 0);
+		kfree(msf);
+		break;
+	}
+	case IP_BLOCK_SOURCE:
+	case IP_UNBLOCK_SOURCE:
+	case IP_ADD_SOURCE_MEMBERSHIP:
+	case IP_DROP_SOURCE_MEMBERSHIP:
+	{
+		struct ip_mreq_source mreqs;
+		int omode, add;
+
+		if (optlen != sizeof(struct ip_mreq_source))
+			goto e_inval;
+		if (copy_from_sockptr(&mreqs, optval, sizeof(mreqs))) {
+			err = -EFAULT;
+			break;
+		}
+		if (optname == IP_BLOCK_SOURCE) {
+			omode = MCAST_EXCLUDE;
+			add = 1;
+		} else if (optname == IP_UNBLOCK_SOURCE) {
+			omode = MCAST_EXCLUDE;
+			add = 0;
+		} else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
+			struct ip_mreqn mreq;
+
+			mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
+			mreq.imr_address.s_addr = mreqs.imr_interface;
+			mreq.imr_ifindex = 0;
+			err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
+			if (err && err != -EADDRINUSE)	/* -EADDRINUSE is not treated as failure */
+				break;
+			omode = MCAST_INCLUDE;
+			add = 1;
+		} else /* IP_DROP_SOURCE_MEMBERSHIP */ {
+			omode = MCAST_INCLUDE;
+			add = 0;
+		}
+		err = ip_mc_source(add, omode, sk, &mreqs, 0);
+		break;
+	}
+	case MCAST_JOIN_GROUP:
+	case MCAST_LEAVE_GROUP:
+		if (in_compat_syscall())
+			err = compat_ip_mcast_join_leave(sk, optname, optval,
+							 optlen);
+		else
+			err = ip_mcast_join_leave(sk, optname, optval, optlen);
+		break;
+	case MCAST_JOIN_SOURCE_GROUP:
+	case MCAST_LEAVE_SOURCE_GROUP:
+	case MCAST_BLOCK_SOURCE:
+	case MCAST_UNBLOCK_SOURCE:
+		err = do_mcast_group_source(sk, optname, optval, optlen);
+		break;
+	case MCAST_MSFILTER:
+		if (in_compat_syscall())
+			err = compat_ip_set_mcast_msfilter(sk, optval, optlen);
+		else
+			err = ip_set_mcast_msfilter(sk, optval, optlen);
+		break;
+	case IP_IPSEC_POLICY:
+	case IP_XFRM_POLICY:
+		err = -EPERM;
+		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+			break;
+		err = xfrm_user_policy(sk, optname, optval, optlen);
+		break;
+
+	case IP_LOCAL_PORT_RANGE:
+	{
+		const __u16 lo = val;	/* low 16 bits of the argument */
+		const __u16 hi = val >> 16;	/* high 16 bits of the argument */
+
+		if (optlen != sizeof(__u32))
+			goto e_inval;
+		if (lo != 0 && hi != 0 && lo > hi)
+			goto e_inval;
+
+		inet->local_port_range.lo = lo;
+		inet->local_port_range.hi = hi;
+		break;
+	}
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+	sockopt_release_sock(sk);
+	if (needs_rtnl)
+		rtnl_unlock();
+	return err;
+
+e_inval:	/* shared -EINVAL exit: drops the same locks as the normal path */
+	sockopt_release_sock(sk);
+	if (needs_rtnl)
+		rtnl_unlock();
+	return -EINVAL;
+}
+
+/**
+ * ipv4_pktinfo_prepare - transfer some info from rtable to skb
+ * @sk: socket (its PKTINFO bit / IPv6 rxinfo decide whether info is kept)
+ * @skb: buffer whose cb[] is filled and whose dst is dropped on return
+ *
+ * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
+ * destination in skb->cb[] before dst drop.
+ * This way, receiver doesn't make cache line misses to read rtable.
+ */
+void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
+{
+	struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
+	bool prepare = inet_test_bit(PKTINFO, sk) ||
+		       ipv6_sk_rxinfo(sk);
+
+	if (prepare && skb_rtable(skb)) {
+		/* skb->cb is overloaded: prior to this point it is IP{6}CB
+		 * which has interface index (iif) as the first member of the
+		 * underlying inet{6}_skb_parm struct. This code then overlays
+		 * PKTINFO_SKB_CB and in_pktinfo also has iif as the first
+		 * element so the iif is picked up from the prior IPCB. If iif
+		 * is the loopback interface, then return the sending interface
+		 * (e.g., process binds socket to eth0 for Tx which is
+		 * redirected to loopback in the rtable/dst).
+		 */
+		struct rtable *rt = skb_rtable(skb);
+		bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags);
+
+		if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
+			pktinfo->ipi_ifindex = inet_iif(skb);
+		else if (l3slave && rt && rt->rt_iif)
+			pktinfo->ipi_ifindex = rt->rt_iif;
+
+		pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
+	} else {	/* info not wanted, or no rtable attached: zero both fields */
+		pktinfo->ipi_ifindex = 0;
+		pktinfo->ipi_spec_dst.s_addr = 0;
+	}
+	skb_dst_drop(skb);	/* the dst is dropped on both branches */
+}
+
+int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+		unsigned int optlen)
+{	/* SOL_IP setsockopt entry: do_ip_setsockopt(), then bpfilter/netfilter fallbacks. */
+	int err;
+
+	if (level != SOL_IP)
+		return -ENOPROTOOPT;
+
+	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#if IS_ENABLED(CONFIG_BPFILTER_UMH)
+	if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
+	    optname < BPFILTER_IPT_SET_MAX)	/* bpfilter's result replaces @err for this range */
+		err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
+#endif
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible ENOPROTOOPTs except default case */
+	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
+			optname != IP_IPSEC_POLICY &&
+			optname != IP_XFRM_POLICY &&
+			!ip_mroute_opt(optname))
+		err = nf_setsockopt(sk, PF_INET, optname, optval, optlen);
+#endif
+	return err;
+}
+EXPORT_SYMBOL(ip_setsockopt);
+
+/*
+ *	Get the options. Note for future reference. The GET of IP options gets
+ *	the _received_ ones. The set sets the _sent_ ones.
+ */
+
+static bool getsockopt_needs_rtnl(int optname)
+{	/* True for the two source-filter queries that do_ip_getsockopt() runs under rtnl_lock(). */
+	switch (optname) {
+	case IP_MSFILTER:
+	case MCAST_MSFILTER:
+		return true;
+	}
+	return false;	/* every other option */
+}
+
+static int ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval,
+				 sockptr_t optlen, int len)
+{	/* MCAST_MSFILTER read, native layout: header copied in, filter fetched by ip_mc_gsfget(). */
+	const int size0 = offsetof(struct group_filter, gf_slist_flex);
+	struct group_filter gsf;
+	int num, gsf_size;
+	int err;
+
+	if (len < size0)
+		return -EINVAL;
+	if (copy_from_sockptr(&gsf, optval, size0))	/* only the fixed header is read */
+		return -EFAULT;
+
+	num = gsf.gf_numsrc;	/* source count requested by the caller */
+	err = ip_mc_gsfget(sk, &gsf, optval,
+			   offsetof(struct group_filter, gf_slist_flex));
+	if (err)
+		return err;
+	if (gsf.gf_numsrc < num)	/* @num becomes min(requested, gf_numsrc after the call) */
+		num = gsf.gf_numsrc;
+	gsf_size = GROUP_FILTER_SIZE(num);
+	if (copy_to_sockptr(optlen, &gsf_size, sizeof(int)) ||
+	    copy_to_sockptr(optval, &gsf, size0))
+		return -EFAULT;
+	return 0;
+}
+
+static int compat_ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval,
+					sockptr_t optlen, int len)
+{	/* MCAST_MSFILTER read, compat layout: convert the header, then call ip_mc_gsfget(). */
+	const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
+	struct compat_group_filter gf32;
+	struct group_filter gf;
+	int num;
+	int err;
+
+	if (len < size0)
+		return -EINVAL;
+	if (copy_from_sockptr(&gf32, optval, size0))	/* only the fixed header is read */
+		return -EFAULT;
+
+	gf.gf_interface = gf32.gf_interface;
+	gf.gf_fmode = gf32.gf_fmode;
+	num = gf.gf_numsrc = gf32.gf_numsrc;
+	gf.gf_group = gf32.gf_group;
+
+	err = ip_mc_gsfget(sk, &gf, optval,
+			   offsetof(struct compat_group_filter, gf_slist_flex));
+	if (err)
+		return err;
+	if (gf.gf_numsrc < num)	/* @num becomes min(requested, gf_numsrc after the call) */
+		num = gf.gf_numsrc;
+	len = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); /* minus native/compat size gap */
+	if (copy_to_sockptr(optlen, &len, sizeof(int)) ||
+	    copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode),
+				   &gf.gf_fmode, sizeof(gf.gf_fmode)) ||
+	    copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc),
+				   &gf.gf_numsrc, sizeof(gf.gf_numsrc)))
+		return -EFAULT;
+	return 0;
+}
+
+int do_ip_getsockopt(struct sock *sk, int level, int optname,
+		     sockptr_t optval, sockptr_t optlen)
+{	/* Read an IPv4 socket option into @optval/@optlen; 0 on success, negative errno on error. */
+	struct inet_sock *inet = inet_sk(sk);
+	bool needs_rtnl = getsockopt_needs_rtnl(optname);
+	int val, err = 0;
+	int len;
+
+	if (level != SOL_IP)
+		return -EOPNOTSUPP;
+
+	if (ip_mroute_opt(optname))
+		return ip_mroute_getsockopt(sk, optname, optval, optlen);
+
+	if (copy_from_sockptr(&len, optlen, sizeof(int)))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+
+	/* Handle options that can be read without locking the socket. */
+	switch (optname) {
+	case IP_PKTINFO:
+		val = inet_test_bit(PKTINFO, sk);
+		goto copyval;
+	case IP_RECVTTL:
+		val = inet_test_bit(TTL, sk);
+		goto copyval;
+	case IP_RECVTOS:
+		val = inet_test_bit(TOS, sk);
+		goto copyval;
+	case IP_RECVOPTS:
+		val = inet_test_bit(RECVOPTS, sk);
+		goto copyval;
+	case IP_RETOPTS:
+		val = inet_test_bit(RETOPTS, sk);
+		goto copyval;
+	case IP_PASSSEC:
+		val = inet_test_bit(PASSSEC, sk);
+		goto copyval;
+	case IP_RECVORIGDSTADDR:
+		val = inet_test_bit(ORIGDSTADDR, sk);
+		goto copyval;
+	case IP_CHECKSUM:
+		val = inet_test_bit(CHECKSUM, sk);
+		goto copyval;
+	case IP_RECVFRAGSIZE:
+		val = inet_test_bit(RECVFRAGSIZE, sk);
+		goto copyval;
+	case IP_RECVERR:
+		val = inet_test_bit(RECVERR, sk);
+		goto copyval;
+	case IP_RECVERR_RFC4884:
+		val = inet_test_bit(RECVERR_RFC4884, sk);
+		goto copyval;
+	case IP_FREEBIND:
+		val = inet_test_bit(FREEBIND, sk);
+		goto copyval;
+	case IP_HDRINCL:
+		val = inet_test_bit(HDRINCL, sk);
+		goto copyval;
+	case IP_MULTICAST_LOOP:
+		val = inet_test_bit(MC_LOOP, sk);
+		goto copyval;
+	case IP_MULTICAST_ALL:
+		val = inet_test_bit(MC_ALL, sk);
+		goto copyval;
+	case IP_TRANSPARENT:
+		val = inet_test_bit(TRANSPARENT, sk);
+		goto copyval;
+	case IP_NODEFRAG:
+		val = inet_test_bit(NODEFRAG, sk);
+		goto copyval;
+	case IP_BIND_ADDRESS_NO_PORT:
+		val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk);
+		goto copyval;
+	case IP_TTL:
+		val = READ_ONCE(inet->uc_ttl);
+		if (val < 0)	/* negative stored TTL: report the netns default instead */
+			val = READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_default_ttl);
+		goto copyval;
+	case IP_MINTTL:
+		val = READ_ONCE(inet->min_ttl);
+		goto copyval;
+	}
+
+	if (needs_rtnl)	/* rtnl is taken before the socket lock */
+		rtnl_lock();
+	sockopt_lock_sock(sk);
+
+	switch (optname) {
+	case IP_OPTIONS:
+	{
+		unsigned char optbuf[sizeof(struct ip_options)+40];
+		struct ip_options *opt = (struct ip_options *)optbuf;
+		struct ip_options_rcu *inet_opt;
+
+		inet_opt = rcu_dereference_protected(inet->inet_opt,
+						     lockdep_sock_is_held(sk));
+		opt->optlen = 0;
+		if (inet_opt)
+			memcpy(optbuf, &inet_opt->opt,
+			       sizeof(struct ip_options) +
+			       inet_opt->opt.optlen);
+		sockopt_release_sock(sk);	/* work on the local copy from here on */
+
+		if (opt->optlen == 0) {
+			len = 0;	/* no options set; a failed length copy must be -EFAULT */
+			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : 0;
+		}
+
+		ip_options_undo(opt);
+
+		len = min_t(unsigned int, len, opt->optlen);
+		if (copy_to_sockptr(optlen, &len, sizeof(int)))
+			return -EFAULT;
+		if (copy_to_sockptr(optval, opt->__data, len))
+			return -EFAULT;
+		return 0;
+	}
+	case IP_TOS:
+		val = inet->tos;
+		break;
+	case IP_MTU_DISCOVER:
+		val = inet->pmtudisc;
+		break;
+	case IP_MTU:
+	{
+		struct dst_entry *dst;
+		val = 0;
+		dst = sk_dst_get(sk);
+		if (dst) {
+			val = dst_mtu(dst);
+			dst_release(dst);
+		}
+		if (!val) {	/* no dst (or zero MTU): reported as -ENOTCONN */
+			sockopt_release_sock(sk);
+			return -ENOTCONN;
+		}
+		break;
+	}
+	case IP_MULTICAST_TTL:
+		val = inet->mc_ttl;
+		break;
+	case IP_UNICAST_IF:
+		val = (__force int)htonl((__u32) inet->uc_index);
+		break;
+	case IP_MULTICAST_IF:
+	{
+		struct in_addr addr;
+		len = min_t(unsigned int, len, sizeof(struct in_addr));
+		addr.s_addr = inet->mc_addr;
+		sockopt_release_sock(sk);
+
+		if (copy_to_sockptr(optlen, &len, sizeof(int)))
+			return -EFAULT;
+		if (copy_to_sockptr(optval, &addr, len))
+			return -EFAULT;
+		return 0;
+	}
+	case IP_MSFILTER:
+	{
+		struct ip_msfilter msf;
+
+		if (len < IP_MSFILTER_SIZE(0)) {
+			err = -EINVAL;
+			goto out;
+		}
+		if (copy_from_sockptr(&msf, optval, IP_MSFILTER_SIZE(0))) {
+			err = -EFAULT;
+			goto out;
+		}
+		err = ip_mc_msfget(sk, &msf, optval, optlen);
+		goto out;
+	}
+	case MCAST_MSFILTER:
+		if (in_compat_syscall())
+			err = compat_ip_get_mcast_msfilter(sk, optval, optlen,
+							   len);
+		else
+			err = ip_get_mcast_msfilter(sk, optval, optlen, len);
+		goto out;
+	case IP_PKTOPTIONS:
+	{
+		struct msghdr msg;
+
+		sockopt_release_sock(sk);
+
+		if (sk->sk_type != SOCK_STREAM)
+			return -ENOPROTOOPT;
+
+		if (optval.is_kernel) {
+			msg.msg_control_is_user = false;
+			msg.msg_control = optval.kernel;
+		} else {
+			msg.msg_control_is_user = true;
+			msg.msg_control_user = optval.user;
+		}
+		msg.msg_controllen = len;
+		msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0;
+
+		if (inet_test_bit(PKTINFO, sk)) {
+			struct in_pktinfo info;
+
+			info.ipi_addr.s_addr = inet->inet_rcv_saddr;
+			info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
+			info.ipi_ifindex = inet->mc_index;
+			put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+		}
+		if (inet_test_bit(TTL, sk)) {
+			int hlim = inet->mc_ttl;
+			put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+		}
+		if (inet_test_bit(TOS, sk)) {
+			int tos = inet->rcv_tos;
+			put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
+		}
+		len -= msg.msg_controllen;	/* a failed length copy below must be -EFAULT */
+		return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : 0;
+	}
+	case IP_LOCAL_PORT_RANGE:
+		val = inet->local_port_range.hi << 16 | inet->local_port_range.lo;
+		break;
+	case IP_PROTOCOL:
+		val = inet_sk(sk)->inet_num;
+		break;
+	default:
+		sockopt_release_sock(sk);
+		return -ENOPROTOOPT;
+	}
+	sockopt_release_sock(sk);
+copyval:	/* copy @val out as one byte or as an int, depending on @len and the value */
+	if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
+		unsigned char ucval = (unsigned char)val;
+		len = 1;
+		if (copy_to_sockptr(optlen, &len, sizeof(int)))
+			return -EFAULT;
+		if (copy_to_sockptr(optval, &ucval, 1))
+			return -EFAULT;
+	} else {
+		len = min_t(unsigned int, sizeof(int), len);
+		if (copy_to_sockptr(optlen, &len, sizeof(int)))
+			return -EFAULT;
+		if (copy_to_sockptr(optval, &val, len))
+			return -EFAULT;
+	}
+	return 0;
+
+out:	/* exit used by the source-filter options, which may hold rtnl */
+	sockopt_release_sock(sk);
+	if (needs_rtnl)
+		rtnl_unlock();
+	return err;
+}
+
+int ip_getsockopt(struct sock *sk, int level,
+		  int optname, char __user *optval, int __user *optlen)
+{	/* User-pointer getsockopt entry: do_ip_getsockopt(), then bpfilter/netfilter fallbacks. */
+	int err;
+
+	err = do_ip_getsockopt(sk, level, optname,
+			       USER_SOCKPTR(optval), USER_SOCKPTR(optlen));
+
+#if IS_ENABLED(CONFIG_BPFILTER_UMH)
+	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
+	    optname < BPFILTER_IPT_GET_MAX)	/* bpfilter's result replaces @err for this range */
+		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
+#endif
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible ENOPROTOOPTs except default case */
+	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
+			!ip_mroute_opt(optname)) {
+		int len;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		err = nf_getsockopt(sk, PF_INET, optname, optval, &len);
+		if (err >= 0)
+			err = put_user(len, optlen);	/* write @len back after nf_getsockopt() */
+		return err;
+	}
+#endif
+	return err;
+}
+EXPORT_SYMBOL(ip_getsockopt);
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
new file mode 100644
index 0000000000..beeae624c4
--- /dev/null
+++ b/net/ipv4/ip_tunnel.c
@@ -0,0 +1,1301 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/rculist.h>
+#include <linux/err.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/udp.h>
+#include <net/dst_metadata.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+/* Map a (key, remote address) pair to an index into the tunnel hash table. */
+static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
+{
+	return hash_32((__force u32)(key ^ remote), IP_TNL_HASH_BITS);
+}
+
+static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
+				__be16 flags, __be32 key)
+{
+	bool pkt_has_key = !!(flags & TUNNEL_KEY);
+
+	/* A keyless tunnel only accepts keyless packets. */
+	if (!(p->i_flags & TUNNEL_KEY))
+		return !pkt_has_key;
+
+	/* A keyed tunnel needs a key in the packet, and it must be its own. */
+	return pkt_has_key && key == p->i_key;
+}
+
+/* Find the tunnel that should receive a packet, given src, dst and key.
+ *
+ * Addressed tunnels need an exact key match: a packet carrying a key only
+ * matches a tunnel with that same key, a keyless packet only a keyless one.
+ * Passes go from most to least specific: (local, remote), (any, remote),
+ * (local, any) or multicast local, then wildcard tunnels.  A tunnel whose
+ * link equals @link is returned at once; otherwise a candidate from the
+ * most specific matching pass is kept.  Without any match, an up collect_md
+ * tunnel, else the up fallback tunnel (no source, no destination, no key,
+ * no options), else NULL is returned.  READ_ONCE(): ip_tunnel_update() may
+ * rewrite parms.link while this lookup runs.
+ */
+struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
+				   int link, __be16 flags,
+				   __be32 remote, __be32 local, __be32 key)
+{
+	struct ip_tunnel *t, *cand = NULL;
+	struct hlist_head *head;
+	struct net_device *ndev;
+	unsigned int hash;
+
+	hash = ip_tunnel_hash(key, remote);
+	head = &itn->tunnels[hash];
+
+	hlist_for_each_entry_rcu(t, head, hash_node) {
+		if (local != t->parms.iph.saddr ||
+		    remote != t->parms.iph.daddr ||
+		    !(t->dev->flags & IFF_UP))
+			continue;
+
+		if (!ip_tunnel_key_match(&t->parms, flags, key))
+			continue;
+
+		if (READ_ONCE(t->parms.link) == link)
+			return t;
+		else
+			cand = t;
+	}
+
+	hlist_for_each_entry_rcu(t, head, hash_node) {
+		if (remote != t->parms.iph.daddr ||
+		    t->parms.iph.saddr != 0 ||
+		    !(t->dev->flags & IFF_UP))
+			continue;
+
+		if (!ip_tunnel_key_match(&t->parms, flags, key))
+			continue;
+
+		if (READ_ONCE(t->parms.link) == link)
+			return t;
+		else if (!cand)
+			cand = t;
+	}
+
+	hash = ip_tunnel_hash(key, 0);
+	head = &itn->tunnels[hash];
+
+	hlist_for_each_entry_rcu(t, head, hash_node) {
+		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
+		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
+			continue;
+
+		if (!(t->dev->flags & IFF_UP))
+			continue;
+
+		if (!ip_tunnel_key_match(&t->parms, flags, key))
+			continue;
+
+		if (READ_ONCE(t->parms.link) == link)
+			return t;
+		else if (!cand)
+			cand = t;
+	}
+
+	hlist_for_each_entry_rcu(t, head, hash_node) {
+		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
+		    t->parms.iph.saddr != 0 ||
+		    t->parms.iph.daddr != 0 ||
+		    !(t->dev->flags & IFF_UP))
+			continue;
+
+		if (READ_ONCE(t->parms.link) == link)
+			return t;
+		else if (!cand)
+			cand = t;
+	}
+
+	if (cand)
+		return cand;
+
+	t = rcu_dereference(itn->collect_md_tun);
+	if (t && t->dev->flags & IFF_UP)
+		return t;
+
+	ndev = READ_ONCE(itn->fb_tunnel_dev);
+	if (ndev && ndev->flags & IFF_UP)
+		return netdev_priv(ndev);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
+
+/* Pick the hash chain that holds (or would hold) a tunnel with these
+ * parameters; the chain is selected by input key and remote address.
+ */
+static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
+				    struct ip_tunnel_parm *parms)
+{
+	__be32 hash_remote = parms->iph.daddr;
+	__be32 hash_key = parms->i_key;
+
+	/* Unset and multicast destinations are hashed as "no remote". */
+	if (!hash_remote || ipv4_is_multicast(hash_remote))
+		hash_remote = 0;
+
+	if ((parms->i_flags & VTI_ISVTI) && !(parms->i_flags & TUNNEL_KEY))
+		hash_key = 0;
+
+	return &itn->tunnels[ip_tunnel_hash(hash_key, hash_remote)];
+}
+
+/* Hash @t into @itn; a collect_md tunnel is also published on its own. */
+static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
+{
+	if (t->collect_md)
+		rcu_assign_pointer(itn->collect_md_tun, t);
+	hlist_add_head_rcu(&t->hash_node, ip_bucket(itn, &t->parms));
+}
+
+/* Reverse of ip_tunnel_add(): unpublish collect_md, then unhash @t. */
+static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
+{
+	if (t->collect_md)
+		rcu_assign_pointer(itn->collect_md_tun, NULL);
+	hlist_del_init_rcu(&t->hash_node);
+}
+
+/* Find the tunnel in @itn matching @parms (addresses, link, key) and @type.
+ *
+ * The callers in this file run on configuration paths that hold RTNL rather
+ * than rcu_read_lock(); lockdep_rtnl_is_held() tells the RCU list checker so.
+ * Returns the tunnel, or NULL when none matches.
+ */
+static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
+					struct ip_tunnel_parm *parms, int type)
+{
+	struct hlist_head *head = ip_bucket(itn, parms);
+	struct ip_tunnel *t;
+
+	hlist_for_each_entry_rcu(t, head, hash_node, lockdep_rtnl_is_held()) {
+		if (parms->iph.saddr == t->parms.iph.saddr &&
+		    parms->iph.daddr == t->parms.iph.daddr &&
+		    parms->link == t->parms.link &&
+		    type == t->dev->type &&
+		    ip_tunnel_key_match(&t->parms, parms->i_flags, parms->i_key))
+			return t;
+	}
+	return NULL;
+}
+
+/* Allocate and register a tunnel netdevice in @net.
+ *
+ * The device is named parms->name when that is set, otherwise "<kind>%d".
+ * Must be called under RTNL.  Returns the device, or an ERR_PTR():
+ * -E2BIG for an unusable name, -ENOMEM, or the register_netdevice() error.
+ */
+static struct net_device *__ip_tunnel_create(struct net *net,
+					     const struct rtnl_link_ops *ops,
+					     struct ip_tunnel_parm *parms)
+{
+	struct ip_tunnel *tunnel;
+	struct net_device *dev;
+	char name[IFNAMSIZ];
+	int err;
+
+	if (parms->name[0]) {
+		if (!dev_valid_name(parms->name))
+			return ERR_PTR(-E2BIG);
+		strscpy(name, parms->name, IFNAMSIZ);
+	} else {
+		/* The template needs room for "%d" and the terminating NUL. */
+		if (strlen(ops->kind) > (IFNAMSIZ - 3))
+			return ERR_PTR(-E2BIG);
+		snprintf(name, sizeof(name), "%s%%d", ops->kind);
+	}
+
+	ASSERT_RTNL();
+	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	dev_net_set(dev, net);
+	dev->rtnl_link_ops = ops;
+
+	tunnel = netdev_priv(dev);
+	tunnel->parms = *parms;
+	tunnel->net = net;
+
+	err = register_netdevice(dev);
+	if (err) {
+		free_netdev(dev);
+		return ERR_PTR(err);
+	}
+
+	return dev;
+}
+
+static int ip_tunnel_bind_dev(struct net_device *dev)
+{
+	struct net_device *tdev = NULL;
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	const struct iphdr *iph;
+	int hlen = LL_MAX_HEADER;
+	int mtu = ETH_DATA_LEN;
+	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+	iph = &tunnel->parms.iph;
+	/* Sets dev->needed_headroom and returns the mtu the caller should use. */
+	/* Guess output device to choose reasonable mtu and needed_headroom */
+	if (iph->daddr) {
+		struct flowi4 fl4;
+		struct rtable *rt;
+
+		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
+				    iph->saddr, tunnel->parms.o_key,
+				    RT_TOS(iph->tos), dev_net(dev),
+				    tunnel->parms.link, tunnel->fwmark, 0, 0);
+		rt = ip_route_output_key(tunnel->net, &fl4);
+
+		if (!IS_ERR(rt)) {
+			tdev = rt->dst.dev;
+			ip_rt_put(rt);
+		}
+		if (dev->type != ARPHRD_ETHER)
+			dev->flags |= IFF_POINTOPOINT;
+
+		dst_cache_reset(&tunnel->dst_cache);
+	}
+	/* No device from a route: fall back to the configured link, if any. */
+	if (!tdev && tunnel->parms.link)
+		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
+
+	if (tdev) {
+		hlen = tdev->hard_header_len + tdev->needed_headroom;
+		mtu = min(tdev->mtu, IP_MAX_MTU);
+	}
+	/* Reserve headroom for the lower device plus our headers; shrink mtu. */
+	dev->needed_headroom = t_hlen + hlen;
+	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
+	/* Never suggest less than IPV4_MIN_MTU. */
+	if (mtu < IPV4_MIN_MTU)
+		mtu = IPV4_MIN_MTU;
+
+	return mtu;
+}
+
+static struct ip_tunnel *ip_tunnel_create(struct net *net,
+					  struct ip_tunnel_net *itn,
+					  struct ip_tunnel_parm *parms)
+{
+	struct ip_tunnel *nt;
+	struct net_device *dev;
+	int t_hlen;
+	int mtu;
+	int err;
+
+	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
+	if (IS_ERR(dev))
+		return ERR_CAST(dev);
+	/* Start from the mtu that ip_tunnel_bind_dev() derives for the device. */
+	mtu = ip_tunnel_bind_dev(dev);
+	err = dev_set_mtu(dev, mtu);
+	if (err)
+		goto err_dev_set_mtu;
+	/* Advertise an mtu range that leaves room for the tunnel headers. */
+	nt = netdev_priv(dev);
+	t_hlen = nt->hlen + sizeof(struct iphdr);
+	dev->min_mtu = ETH_MIN_MTU;
+	dev->max_mtu = IP_MAX_MTU - t_hlen;
+	if (dev->type == ARPHRD_ETHER)
+		dev->max_mtu -= dev->hard_header_len;
+	/* Hash the tunnel only once it is fully set up. */
+	ip_tunnel_add(itn, nt);
+	return nt;
+
+err_dev_set_mtu:
+	unregister_netdevice(dev);
+	return ERR_PTR(err);
+}
+
+/* Copy the UDP ports of @skb's IP payload into @info->encap (UDP only). */
+void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+
+	if (iph->protocol == IPPROTO_UDP) {
+		const struct udphdr *uh = (const void *)iph + iph->ihl * 4;
+
+		info->encap.sport = uh->source;
+		info->encap.dport = uh->dest;
+	}
+}
+EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
+
+int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
+		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
+		  bool log_ecn_error)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	int err;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+	if (ipv4_is_multicast(iph->daddr)) {
+		DEV_STATS_INC(tunnel->dev, multicast);
+		skb->pkt_type = PACKET_BROADCAST;
+	}
+#endif
+	/* Checksum presence must be the same in the packet and the config. */
+	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
+	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
+		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
+		DEV_STATS_INC(tunnel->dev, rx_errors);
+		goto drop;
+	}
+	/* With sequencing on, drop unsequenced packets and stale sequence numbers. */
+	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
+		if (!(tpi->flags&TUNNEL_SEQ) ||
+		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
+			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
+			DEV_STATS_INC(tunnel->dev, rx_errors);
+			goto drop;
+		}
+		tunnel->i_seqno = ntohl(tpi->seq) + 1;
+	}
+	/* Move the network header past the inner Ethernet header, if any. */
+	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
+
+	err = IP_ECN_decapsulate(iph, skb);
+	if (unlikely(err)) {
+		if (log_ecn_error)
+			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+					&iph->saddr, iph->tos);
+		if (err > 1) {
+			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
+			DEV_STATS_INC(tunnel->dev, rx_errors);
+			goto drop;
+		}
+	}
+
+	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
+
+	if (tunnel->dev->type == ARPHRD_ETHER) {
+		skb->protocol = eth_type_trans(skb, tunnel->dev);
+		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+	} else {
+		skb->dev = tunnel->dev;
+	}
+
+	if (tun_dst)
+		skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
+	gro_cells_receive(&tunnel->gro_cells, skb);
+	return 0;
+	/* Dropping also returns 0: @skb and @tun_dst are released here. */
+drop:
+	if (tun_dst)
+		dst_release((struct dst_entry *)tun_dst);
+	kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
+
+/* Claim encap slot @num for @ops; -1 if taken, -ERANGE if out of range. */
+int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
+			    unsigned int num)
+{
+	if (num >= MAX_IPTUN_ENCAP_OPS)
+		return -ERANGE;
+
+	return cmpxchg((const struct ip_tunnel_encap_ops **)&iptun_encaps[num],
+		       NULL, ops) ? -1 : 0;
+}
+EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
+
+/* Clear encap slot @num if it still holds @ops, then synchronize_net(). */
+int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
+			    unsigned int num)
+{
+	const struct ip_tunnel_encap_ops **slot;
+	int ret = -1;
+
+	if (num >= MAX_IPTUN_ENCAP_OPS)
+		return -ERANGE;
+
+	slot = (const struct ip_tunnel_encap_ops **)&iptun_encaps[num];
+	if (cmpxchg(slot, ops, NULL) == ops)
+		ret = 0;
+	synchronize_net();
+	return ret;
+}
+EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
+
+int ip_tunnel_encap_setup(struct ip_tunnel *t,
+			  struct ip_tunnel_encap *ipencap)
+{
+	int hlen;
+
+	memset(&t->encap, 0, sizeof(t->encap));
+	/* A negative length is an error code; t->encap then stays zeroed. */
+	hlen = ip_encap_hlen(ipencap);
+	if (hlen < 0)
+		return hlen;
+
+	t->encap.type = ipencap->type;
+	t->encap.sport = ipencap->sport;
+	t->encap.dport = ipencap->dport;
+	t->encap.flags = ipencap->flags;
+	/* Total header length = encapsulation header + tunnel header. */
+	t->encap_hlen = hlen;
+	t->hlen = t->encap_hlen + t->tun_hlen;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
+
+static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
+			    struct rtable *rt, __be16 df,
+			    const struct iphdr *inner_iph,
+			    int tunnel_hlen, __be32 dst, bool md)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	int pkt_size;
+	int mtu;
+
+	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
+	pkt_size = skb->len - tunnel_hlen;
+	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
+	/* With DF set the limit is the route mtu minus our own overhead. */
+	if (df) {
+		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
+		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
+	} else {
+		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+	}
+
+	if (skb_valid_dst(skb))
+		skb_dst_update_pmtu_no_confirm(skb, mtu);
+	/* Send an ICMP error and return -E2BIG when the packet cannot fit. */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		if (!skb_is_gso(skb) &&
+		    (inner_iph->frag_off & htons(IP_DF)) &&
+		    mtu < pkt_size) {
+			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+			return -E2BIG;
+		}
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct rt6_info *rt6;
+		__be32 daddr;
+
+		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
+					   NULL;
+		daddr = md ? dst : tunnel->parms.iph.daddr;
+
+		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
+			   mtu >= IPV6_MIN_MTU) {
+			if ((daddr && !ipv4_is_multicast(daddr)) ||
+			    rt6->rt6i_dst.plen == 128) {
+				rt6->rt6i_flags |= RTF_MODIFIED;
+				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
+			}
+		}
+
+		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
+					mtu < pkt_size) {
+			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+			return -E2BIG;
+		}
+	}
+#endif
+	return 0;
+}
+
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+		       u8 proto, int tunnel_hlen)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	u32 headroom = sizeof(struct iphdr);
+	struct ip_tunnel_info *tun_info;
+	const struct ip_tunnel_key *key;
+	const struct iphdr *inner_iph;
+	struct rtable *rt = NULL;
+	struct flowi4 fl4;
+	__be16 df = 0;
+	u8 tos, ttl;
+	bool use_cache;
+
+	tun_info = skb_tunnel_info(skb);
+	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+		     ip_tunnel_info_af(tun_info) != AF_INET))
+		goto tx_error;
+	key = &tun_info->key;
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+	tos = key->tos;
+	if (tos == 1) {
+		if (skb->protocol == htons(ETH_P_IP))
+			tos = inner_iph->tos;
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+	}
+	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
+			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
+			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
+			    key->flow_flags);
+
+	if (!tunnel_hlen)
+		tunnel_hlen = ip_encap_hlen(&tun_info->encap);
+
+	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
+		goto tx_error;
+	/* Try the route cached in the tunnel info before doing a lookup. */
+	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+	if (use_cache)
+		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
+	if (!rt) {
+		rt = ip_route_output_key(tunnel->net, &fl4);
+		if (IS_ERR(rt)) {
+			DEV_STATS_INC(dev, tx_carrier_errors);
+			goto tx_error;
+		}
+		if (use_cache)
+			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
+					  fl4.saddr);
+	}
+	if (rt->dst.dev == dev) {
+		ip_rt_put(rt);
+		DEV_STATS_INC(dev, collisions);
+		goto tx_error;
+	}
+
+	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
+		df = htons(IP_DF);
+	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
+			    key->u.ipv4.dst, true)) {
+		ip_rt_put(rt);
+		goto tx_error;
+	}
+
+	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+	ttl = key->ttl;
+	if (ttl == 0) {
+		if (skb->protocol == htons(ETH_P_IP))
+			ttl = inner_iph->ttl;
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+		else
+			ttl = ip4_dst_hoplimit(&rt->dst);
+	}
+	/* Raise dev->needed_headroom when this route needs more than recorded. */
+	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+	if (headroom > READ_ONCE(dev->needed_headroom))
+		WRITE_ONCE(dev->needed_headroom, headroom);
+
+	if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
+		ip_rt_put(rt);
+		goto tx_dropped;
+	}
+	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
+		      df, !net_eq(tunnel->net, dev_net(dev)));
+	return;
+tx_error:
+	DEV_STATS_INC(dev, tx_errors);
+	goto kfree;
+tx_dropped:
+	DEV_STATS_INC(dev, tx_dropped);
+kfree:
+	kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
+
+void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+		    const struct iphdr *tnl_params, u8 protocol)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct ip_tunnel_info *tun_info = NULL;
+	const struct iphdr *inner_iph;
+	unsigned int max_headroom;	/* The extra header space needed */
+	struct rtable *rt = NULL;		/* Route to the other host */
+	__be16 payload_protocol;
+	bool use_cache = false;
+	struct flowi4 fl4;
+	bool md = false;
+	bool connected;
+	u8 tos, ttl;
+	__be32 dst;
+	__be16 df;
+
+	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+	connected = (tunnel->parms.iph.daddr != 0);
+	payload_protocol = skb_protocol(skb, true);
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+	dst = tnl_params->daddr;
+	if (dst == 0) {
+		/* NBMA tunnel */
+
+		if (!skb_dst(skb)) {
+			DEV_STATS_INC(dev, tx_fifo_errors);
+			goto tx_error;
+		}
+
+		tun_info = skb_tunnel_info(skb);
+		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
+		    ip_tunnel_info_af(tun_info) == AF_INET &&
+		    tun_info->key.u.ipv4.dst) {
+			dst = tun_info->key.u.ipv4.dst;
+			md = true;
+			connected = true;
+		} else if (payload_protocol == htons(ETH_P_IP)) {
+			rt = skb_rtable(skb);
+			dst = rt_nexthop(rt, inner_iph->daddr);
+		}
+#if IS_ENABLED(CONFIG_IPV6)
+		else if (payload_protocol == htons(ETH_P_IPV6)) {
+			const struct in6_addr *addr6;
+			struct neighbour *neigh;
+			bool do_tx_error_icmp;
+			int addr_type;
+
+			neigh = dst_neigh_lookup(skb_dst(skb),
+						 &ipv6_hdr(skb)->daddr);
+			if (!neigh)
+				goto tx_error;
+
+			addr6 = (const struct in6_addr *)&neigh->primary_key;
+			addr_type = ipv6_addr_type(addr6);
+
+			if (addr_type == IPV6_ADDR_ANY) {
+				addr6 = &ipv6_hdr(skb)->daddr;
+				addr_type = ipv6_addr_type(addr6);
+			}
+
+			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+				do_tx_error_icmp = true;
+			else {
+				do_tx_error_icmp = false;
+				dst = addr6->s6_addr32[3];
+			}
+			neigh_release(neigh);
+			if (do_tx_error_icmp)
+				goto tx_error_icmp;
+		}
+#endif
+		else
+			goto tx_error;
+
+		if (!md)
+			connected = false;
+	}
+	/* The low bit of the configured TOS means: take it from the inner header. */
+	tos = tnl_params->tos;
+	if (tos & 0x1) {
+		tos &= ~0x1;
+		if (payload_protocol == htons(ETH_P_IP)) {
+			tos = inner_iph->tos;
+			connected = false;
+		} else if (payload_protocol == htons(ETH_P_IPV6)) {
+			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+			connected = false;
+		}
+	}
+
+	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
+			    tunnel->parms.o_key, RT_TOS(tos),
+			    dev_net(dev), tunnel->parms.link,
+			    tunnel->fwmark, skb_get_hash(skb), 0);
+
+	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
+		goto tx_error;
+	/* Cached route: from the tunnel info for md, else from the tunnel. */
+	if (connected && md) {
+		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+		if (use_cache)
+			rt = dst_cache_get_ip4(&tun_info->dst_cache,
+					       &fl4.saddr);
+	} else {
+		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
+						&fl4.saddr) : NULL;
+	}
+
+	if (!rt) {
+		rt = ip_route_output_key(tunnel->net, &fl4);
+
+		if (IS_ERR(rt)) {
+			DEV_STATS_INC(dev, tx_carrier_errors);
+			goto tx_error;
+		}
+		if (use_cache)
+			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
+					  fl4.saddr);
+		else if (!md && connected)
+			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
+					  fl4.saddr);
+	}
+	/* Refuse a route that leads back to this very device. */
+	if (rt->dst.dev == dev) {
+		ip_rt_put(rt);
+		DEV_STATS_INC(dev, collisions);
+		goto tx_error;
+	}
+
+	df = tnl_params->frag_off;
+	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
+		df |= (inner_iph->frag_off & htons(IP_DF));
+
+	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
+		ip_rt_put(rt);
+		goto tx_error;
+	}
+	/* Signal link failure while recently counted errors are outstanding. */
+	if (tunnel->err_count > 0) {
+		if (time_before(jiffies,
+				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+			tunnel->err_count--;
+
+			dst_link_failure(skb);
+		} else
+			tunnel->err_count = 0;
+	}
+
+	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+	ttl = tnl_params->ttl;
+	if (ttl == 0) {
+		if (payload_protocol == htons(ETH_P_IP))
+			ttl = inner_iph->ttl;
+#if IS_ENABLED(CONFIG_IPV6)
+		else if (payload_protocol == htons(ETH_P_IPV6))
+			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+#endif
+		else
+			ttl = ip4_dst_hoplimit(&rt->dst);
+	}
+	/* Raise dev->needed_headroom when this route needs more than recorded. */
+	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
+	if (max_headroom > READ_ONCE(dev->needed_headroom))
+		WRITE_ONCE(dev->needed_headroom, max_headroom);
+
+	if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
+		ip_rt_put(rt);
+		DEV_STATS_INC(dev, tx_dropped);
+		kfree_skb(skb);
+		return;
+	}
+
+	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
+		      df, !net_eq(tunnel->net, dev_net(dev)));
+	return;
+
+#if IS_ENABLED(CONFIG_IPV6)
+tx_error_icmp:
+	dst_link_failure(skb);
+#endif
+tx_error:
+	DEV_STATS_INC(dev, tx_errors);
+	kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
+
+/* Re-hash @t with the addresses/keys from @p, then apply the other fields. */
+static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel *t,
+			     struct net_device *dev, struct ip_tunnel_parm *p,
+			     bool set_mtu, __u32 fwmark)
+{
+	ip_tunnel_del(itn, t);
+	t->parms.iph.saddr = p->iph.saddr;
+	t->parms.iph.daddr = p->iph.daddr;
+	t->parms.i_key = p->i_key;
+	t->parms.o_key = p->o_key;
+	if (dev->type != ARPHRD_ETHER) {
+		__dev_addr_set(dev, &p->iph.saddr, 4);
+		memcpy(dev->broadcast, &p->iph.daddr, 4);
+	}
+	ip_tunnel_add(itn, t);
+
+	t->parms.iph.ttl = p->iph.ttl;
+	t->parms.iph.tos = p->iph.tos;
+	t->parms.iph.frag_off = p->iph.frag_off;
+
+	if (t->parms.link != p->link || t->fwmark != fwmark) {
+		int mtu;
+
+		/* The link and the mtu have lockless readers: annotate the stores. */
+		WRITE_ONCE(t->parms.link, p->link);
+		t->fwmark = fwmark;
+		mtu = ip_tunnel_bind_dev(dev);
+		if (set_mtu)
+			WRITE_ONCE(dev->mtu, mtu);
+	}
+	dst_cache_reset(&t->dst_cache);
+	netdev_state_change(dev);
+}
+
+int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+{
+	int err = 0;
+	struct ip_tunnel *t = netdev_priv(dev);
+	struct net *net = t->net;
+	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
+
+	switch (cmd) {
+	case SIOCGETTUNNEL:
+		if (dev == itn->fb_tunnel_dev) {
+			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+			if (!t)
+				t = netdev_priv(dev);
+		}
+		memcpy(p, &t->parms, sizeof(*p));
+		break;
+
+	case SIOCADDTUNNEL:
+	case SIOCCHGTUNNEL:
+		err = -EPERM;
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			goto done;
+		if (p->iph.ttl)
+			p->iph.frag_off |= htons(IP_DF);
+		if (!(p->i_flags & VTI_ISVTI)) {
+			if (!(p->i_flags & TUNNEL_KEY))
+				p->i_key = 0;
+			if (!(p->o_flags & TUNNEL_KEY))
+				p->o_key = 0;
+		}
+		/* Is there already a tunnel with exactly these parameters? */
+		t = ip_tunnel_find(itn, p, itn->type);
+
+		if (cmd == SIOCADDTUNNEL) {
+			if (!t) {
+				t = ip_tunnel_create(net, itn, p);
+				err = PTR_ERR_OR_ZERO(t);
+				break;
+			}
+
+			err = -EEXIST;
+			break;
+		}
+		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+			if (t) {
+				if (t->dev != dev) {
+					err = -EEXIST;
+					break;
+				}
+			} else {
+				unsigned int nflags = 0;
+
+				if (ipv4_is_multicast(p->iph.daddr))
+					nflags = IFF_BROADCAST;
+				else if (p->iph.daddr)
+					nflags = IFF_POINTOPOINT;
+
+				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+					err = -EINVAL;
+					break;
+				}
+
+				t = netdev_priv(dev);
+			}
+		}
+		/* t is now the tunnel to change, or NULL if there is none. */
+		if (t) {
+			err = 0;
+			ip_tunnel_update(itn, t, dev, p, true, 0);
+		} else {
+			err = -ENOENT;
+		}
+		break;
+
+	case SIOCDELTUNNEL:
+		err = -EPERM;
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			goto done;
+
+		if (dev == itn->fb_tunnel_dev) {
+			err = -ENOENT;
+			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+			if (!t)
+				goto done;
+			err = -EPERM;
+			if (t == netdev_priv(itn->fb_tunnel_dev))
+				goto done;
+			dev = t->dev;
+		}
+		unregister_netdevice(dev);
+		err = 0;
+		break;
+
+	default:
+		err = -EINVAL;
+	}
+
+done:
+	return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
+
+int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+			     void __user *data, int cmd)
+{
+	struct ip_tunnel_parm p;
+	int err;
+
+	if (copy_from_user(&p, data, sizeof(p)))	/* fetch the user's parms */
+		return -EFAULT;
+	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
+	if (!err && copy_to_user(data, &p, sizeof(p)))	/* hand them back on success */
+		return -EFAULT;
+	return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
+
+/* Set dev->mtu; an oversized @new_mtu is rejected if @strict, else clamped. */
+int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+	int max_mtu = IP_MAX_MTU - t_hlen;
+
+	if (dev->type == ARPHRD_ETHER)
+		max_mtu -= dev->hard_header_len;
+
+	if (new_mtu < ETH_MIN_MTU)
+		return -EINVAL;
+	if (new_mtu > max_mtu) {
+		if (strict)
+			return -EINVAL;
+		new_mtu = max_mtu;
+	}
+
+	/* tnl_update_pmtu() reads dev->mtu on the transmit path: annotate. */
+	WRITE_ONCE(dev->mtu, new_mtu);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
+
+int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+	return __ip_tunnel_change_mtu(dev, new_mtu, true); /* strict: no clamping */
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
+
+static void ip_tunnel_dev_free(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	gro_cells_destroy(&tunnel->gro_cells);	/* set up in ip_tunnel_init() */
+	dst_cache_destroy(&tunnel->dst_cache);	/* set up in ip_tunnel_init() */
+	free_percpu(dev->tstats);		/* allocated in ip_tunnel_init() */
+}
+
+void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct ip_tunnel_net *itn;
+
+	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
+	/* The fallback device is left alone; any other tunnel is unhashed and queued. */
+	if (itn->fb_tunnel_dev != dev) {
+		ip_tunnel_del(itn, netdev_priv(dev));
+		unregister_netdevice_queue(dev, head);
+	}
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
+
+struct net *ip_tunnel_get_link_net(const struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	return tunnel->net;	/* the netns used for the outer route lookups */
+}
+EXPORT_SYMBOL(ip_tunnel_get_link_net);
+
+int ip_tunnel_get_iflink(const struct net_device *dev)
+{
+	const struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	return READ_ONCE(tunnel->parms.link); /* ip_tunnel_update() may rewrite it */
+}
+EXPORT_SYMBOL(ip_tunnel_get_iflink);
+
+int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
+				  struct rtnl_link_ops *ops, char *devname)
+{
+	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
+	struct ip_tunnel_parm parms;
+	unsigned int i;
+
+	itn->rtnl_link_ops = ops;
+	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&itn->tunnels[i]);
+	/* No link ops or no fallback tunnels: no fb device, type from init_net. */
+	if (!ops || !net_has_fallback_tunnels(net)) {
+		struct ip_tunnel_net *it_init_net;
+
+		it_init_net = net_generic(&init_net, ip_tnl_net_id);
+		itn->type = it_init_net->type;
+		itn->fb_tunnel_dev = NULL;
+		return 0;
+	}
+
+	memset(&parms, 0, sizeof(parms));
+	if (devname)
+		strscpy(parms.name, devname, IFNAMSIZ);
+
+	rtnl_lock();
+	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+	/* FB netdevice is special: we have one, and only one per netns.
+	 * Allowing to move it to another netns is clearly unsafe.
+	 */
+	if (!IS_ERR(itn->fb_tunnel_dev)) {
+		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
+		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
+		itn->type = itn->fb_tunnel_dev->type;
+	}
+	rtnl_unlock();
+
+	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
+
+static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
+			      struct list_head *head,
+			      struct rtnl_link_ops *ops)
+{
+	struct net_device *dev, *aux;
+	int h;
+
+	for_each_netdev_safe(net, dev, aux)
+		if (dev->rtnl_link_ops == ops)
+			unregister_netdevice_queue(dev, head);
+	/* Then queue hashed tunnels whose device sits in another netns. */
+	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
+		struct ip_tunnel *t;
+		struct hlist_node *n;
+		struct hlist_head *thead = &itn->tunnels[h];
+
+		hlist_for_each_entry_safe(t, n, thead, hash_node)
+			/* If dev is in the same netns, it has already
+			 * been added to the list by the previous loop.
+			 */
+			if (!net_eq(dev_net(t->dev), net))
+				unregister_netdevice_queue(t->dev, head);
+	}
+}
+
+void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
+			   struct rtnl_link_ops *ops)
+{
+	struct ip_tunnel_net *itn;
+	struct net *net;
+	LIST_HEAD(list);
+
+	rtnl_lock();
+	list_for_each_entry(net, net_list, exit_list) {
+		itn = net_generic(net, id);
+		ip_tunnel_destroy(net, itn, &list, ops);
+	}
+	unregister_netdevice_many(&list);	/* one batch for every netns */
+	rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
+
+int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
+		      struct ip_tunnel_parm *p, __u32 fwmark)
+{
+	struct ip_tunnel *nt;
+	struct net *net = dev_net(dev);
+	struct ip_tunnel_net *itn;
+	int mtu;
+	int err;
+
+	nt = netdev_priv(dev);
+	itn = net_generic(net, nt->ip_tnl_net_id);
+	/* One collect_md tunnel per itn; other tunnels need unique parameters. */
+	if (nt->collect_md) {
+		if (rtnl_dereference(itn->collect_md_tun))
+			return -EEXIST;
+	} else {
+		if (ip_tunnel_find(itn, p, dev->type))
+			return -EEXIST;
+	}
+
+	nt->net = net;
+	nt->parms = *p;
+	nt->fwmark = fwmark;
+	err = register_netdevice(dev);
+	if (err)
+		goto err_register_netdevice;
+
+	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+		eth_hw_addr_random(dev);
+	/* With IFLA_MTU given, keep dev->mtu but clamp it to the valid range. */
+	mtu = ip_tunnel_bind_dev(dev);
+	if (tb[IFLA_MTU]) {
+		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
+
+		if (dev->type == ARPHRD_ETHER)
+			max -= dev->hard_header_len;
+
+		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
+	}
+
+	err = dev_set_mtu(dev, mtu);
+	if (err)
+		goto err_dev_set_mtu;
+
+	ip_tunnel_add(itn, nt);
+	return 0;
+
+err_dev_set_mtu:
+	unregister_netdevice(dev);
+err_register_netdevice:
+	return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
+
+int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
+			 struct ip_tunnel_parm *p, __u32 fwmark)
+{
+	struct ip_tunnel *t;
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct net *net = tunnel->net;
+	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
+
+	if (dev == itn->fb_tunnel_dev)
+		return -EINVAL;
+	/* Refuse parameters that already belong to a different device. */
+	t = ip_tunnel_find(itn, p, dev->type);
+
+	if (t) {
+		if (t->dev != dev)
+			return -EEXIST;
+	} else {
+		t = tunnel;
+
+		if (dev->type != ARPHRD_ETHER) {
+			unsigned int nflags = 0;
+
+			if (ipv4_is_multicast(p->iph.daddr))
+				nflags = IFF_BROADCAST;
+			else if (p->iph.daddr)
+				nflags = IFF_POINTOPOINT;
+
+			if ((dev->flags ^ nflags) &
+			    (IFF_POINTOPOINT | IFF_BROADCAST))
+				return -EINVAL;
+		}
+	}
+	/* The mtu is recomputed only when the request carries no IFLA_MTU. */
+	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
+
+/* Allocate per-device tunnel state; returns 0 or a negative errno. */
+int ip_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct iphdr *iph = &tunnel->parms.iph;
+	int err;
+
+	dev->needs_free_netdev = true;
+	dev->priv_destructor = ip_tunnel_dev_free;
+	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+	if (err) {
+		free_percpu(dev->tstats);
+		return err;
+	}
+
+	err = gro_cells_init(&tunnel->gro_cells, dev);
+	if (err) {
+		dst_cache_destroy(&tunnel->dst_cache);
+		free_percpu(dev->tstats);
+		return err;
+	}
+
+	tunnel->dev = dev;
+	tunnel->net = dev_net(dev);
+	strscpy(tunnel->parms.name, dev->name, sizeof(tunnel->parms.name));
+	iph->version		= 4;
+	iph->ihl		= 5;
+	if (tunnel->collect_md)
+		netif_keep_dst(dev);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init);
+
+void ip_tunnel_uninit(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct net *net = tunnel->net;
+	struct ip_tunnel_net *itn;
+
+	itn = net_generic(net, tunnel->ip_tnl_net_id);
+	ip_tunnel_del(itn, netdev_priv(dev));
+	if (itn->fb_tunnel_dev == dev)
+		WRITE_ONCE(itn->fb_tunnel_dev, NULL); /* read in ip_tunnel_lookup() */
+
+	dst_cache_reset(&tunnel->dst_cache);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
+
+/* Do only the minimal setup here; the rest is done in the tunnel_init call. */
+void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	tunnel->ip_tnl_net_id = net_id;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_setup);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
new file mode 100644
index 0000000000..586b1b3e35
--- /dev/null
+++ b/net/ipv4/ip_tunnel_core.c
@@ -0,0 +1,1148 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/static_key.h>
+
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/dst_metadata.h>
+#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
+
+/* Table of ip_tunnel_encap_ops pointers (__rcu annotated), MAX_IPTUN_ENCAP_OPS slots. */
+const struct ip_tunnel_encap_ops __rcu *iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+EXPORT_SYMBOL(iptun_encaps);
+
+/* Table of ip6_tnl_encap_ops pointers (__rcu annotated), MAX_IPTUN_ENCAP_OPS slots. */
+const struct ip6_tnl_encap_ops __rcu *ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+EXPORT_SYMBOL(ip6tun_encaps);
+
+/* Prepend an outer IPv4 header to @skb and pass the packet to ip_local_out(). */
+void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
+		   __be32 src, __be32 dst, __u8 proto,
+		   __u8 tos, __u8 ttl, __be16 df, bool xnet)
+{
+	struct net_device *orig_dev = skb->dev;
+	int payload_len = skb->len - skb_inner_network_offset(skb);
+	struct net *net = dev_net(rt->dst.dev);
+	struct iphdr *outer;
+	int ret;
+
+	skb_scrub_packet(skb, xnet);
+	skb_clear_hash_if_not_l4(skb);
+	skb_dst_set(skb, &rt->dst);
+	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+	/* Make room for the outer header and mark it as the network header. */
+	skb_push(skb, sizeof(*outer));
+	skb_reset_network_header(skb);
+
+	outer = ip_hdr(skb);
+	outer->version = 4;
+	outer->ihl = sizeof(*outer) >> 2;
+	outer->tos = tos;
+	outer->frag_off = ip_mtu_locked(&rt->dst) ? 0 : df;
+	outer->ttl = ttl;
+	outer->protocol = proto;
+	outer->saddr = src;
+	outer->daddr = dst;
+	__ip_select_ident(net, outer, skb_shinfo(skb)->gso_segs ?: 1);
+
+	ret = ip_local_out(net, sk, skb);
+	if (!orig_dev)
+		return;
+
+	/* Report a zero length to the stats helper when net_xmit_eval() flags the result. */
+	if (unlikely(net_xmit_eval(ret)))
+		payload_len = 0;
+	iptunnel_xmit_stats(orig_dev, payload_len);
+}
+EXPORT_SYMBOL_GPL(iptunnel_xmit);
+
+/* Strip @hdr_len bytes of tunnel header and set skb->protocol for the inner packet. */
+int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
+			   __be16 inner_proto, bool raw_proto, bool xnet)
+{
+	__be16 proto = inner_proto;
+
+	if (unlikely(!pskb_may_pull(skb, hdr_len)))
+		return -ENOMEM;
+
+	skb_pull_rcsum(skb, hdr_len);
+
+	if (!raw_proto && inner_proto == htons(ETH_P_TEB)) {
+		const struct ethhdr *eth;
+
+		if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
+			return -ENOMEM;
+
+		eth = (const struct ethhdr *)skb->data;
+		proto = htons(ETH_P_802_2);
+		if (likely(eth_proto_is_802_3(eth->h_proto)))
+			proto = eth->h_proto;
+	}
+	skb->protocol = proto;
+
+	skb_clear_hash_if_not_l4(skb);
+	__vlan_hwaccel_clear_tag(skb);
+	skb_set_queue_mapping(skb, 0);
+	skb_scrub_packet(skb, xnet);
+
+	return iptunnel_pull_offloads(skb);
+}
+EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
+
+/* Allocate TX metadata answering @md; the source of @md becomes the destination. */
+struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags)
+{
+	struct ip_tunnel_info *rx, *tx;
+	struct metadata_dst *reply;
+
+	if (!md || md->type != METADATA_IP_TUNNEL)
+		return NULL;
+
+	rx = &md->u.tun_info;
+	if (rx->mode & IP_TUNNEL_INFO_TX)
+		return NULL;
+
+	reply = metadata_dst_alloc(rx->options_len, METADATA_IP_TUNNEL, flags);
+	if (!reply)
+		return NULL;
+
+	tx = &reply->u.tun_info;
+	tx->mode = rx->mode | IP_TUNNEL_INFO_TX;
+	tx->key.tun_id = rx->key.tun_id;
+	tx->key.tun_flags = rx->key.tun_flags;
+	if (rx->mode & IP_TUNNEL_INFO_IPV6)
+		tx->key.u.ipv6.dst = rx->key.u.ipv6.src;
+	else
+		tx->key.u.ipv4.dst = rx->key.u.ipv4.src;
+	ip_tunnel_info_opts_set(tx, ip_tunnel_info_opts(rx), rx->options_len, 0);
+
+	return reply;
+}
+EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
+
+int iptunnel_handle_offloads(struct sk_buff *skb,
+			     int gso_type_mask)
+{
+	int err;
+
+	if (likely(!skb->encapsulation)) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+	}
+
+	if (skb_is_gso(skb)) {
+		err = skb_header_unclone(skb, GFP_ATOMIC);
+		if (unlikely(err))
+			return err;
+		skb_shinfo(skb)->gso_type |= gso_type_mask;
+		return 0;
+	}
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		skb->ip_summed = CHECKSUM_NONE;
+		/* We clear encapsulation here to prevent badly-written
+		 * drivers potentially deciding to offload an inner checksum
+		 * if we set CHECKSUM_PARTIAL on the outer header.
+		 * This should go away when the drivers are all fixed.
+		 */
+		skb->encapsulation = 0;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
+
+/**
+ * iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD
+ * @skb:	Original packet with L2 header
+ * @mtu:	MTU value for ICMP error
+ *
+ * Return: length on success, negative error code if message couldn't be built.
+ */
+static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
+{
+	const struct iphdr *iph = ip_hdr(skb);	/* stale once skb is pulled/cowed */
+	const __be32 saddr = iph->saddr, daddr = iph->daddr;
+	const u8 ttl = iph->ttl;
+	struct icmphdr *icmph;
+	struct iphdr *niph;
+	struct ethhdr eh;
+	int len, err;
+
+	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
+		return -EINVAL;
+
+	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
+	pskb_pull(skb, ETH_HLEN);
+	skb_reset_network_header(skb);
+
+	err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
+	if (err)
+		return err;
+
+	len = skb->len + sizeof(*icmph);
+	err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
+	if (err)
+		return err;
+
+	icmph = skb_push(skb, sizeof(*icmph));
+	*icmph = (struct icmphdr) {
+		.type			= ICMP_DEST_UNREACH,
+		.code			= ICMP_FRAG_NEEDED,
+		.checksum		= 0,
+		.un.frag.__unused	= 0,
+		.un.frag.mtu		= htons(mtu),
+	};
+	icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0));
+	skb_reset_transport_header(skb);
+
+	niph = skb_push(skb, sizeof(*niph));
+	*niph = (struct iphdr) {
+		.ihl			= sizeof(*niph) / 4u,
+		.version 		= 4,
+		.tos 			= 0,
+		.tot_len		= htons(len + sizeof(*niph)),
+		.id			= 0,
+		.frag_off		= htons(IP_DF),
+		.ttl			= ttl,
+		.protocol		= IPPROTO_ICMP,
+		.saddr			= daddr,
+		.daddr			= saddr,
+	};
+	ip_send_check(niph);
+	skb_reset_network_header(skb);
+	skb->ip_summed = CHECKSUM_NONE;
+	eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
+	skb_reset_mac_header(skb);
+
+	return skb->len;
+}
+
+/**
+ * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed
+ * @skb:	Buffer being sent by encapsulation, L2 headers expected
+ * @mtu:	Network MTU for path
+ *
+ * Return: 0 for no ICMP reply, length if built, negative value on error.
+ */
+static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const __be32 src = iph->saddr, dst = iph->daddr;
+
+	if (mtu < 576 || iph->frag_off != htons(IP_DF))
+		return 0;
+
+	if (ipv4_is_lbcast(dst) || ipv4_is_multicast(dst) ||
+	    ipv4_is_zeronet(src) || ipv4_is_loopback(src) ||
+	    ipv4_is_lbcast(src) || ipv4_is_multicast(src))
+		return 0;
+
+	/* Never answer an ICMP error with another ICMP error. */
+	if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmp_hdr(skb)->type))
+		return 0;
+	return iptunnel_pmtud_build_icmp(skb, mtu);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD
+ * @skb:	Original packet with L2 header
+ * @mtu:	MTU value for ICMPv6 error
+ *
+ * Return: length on success, negative error code if message couldn't be built.
+ */
+static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb); /* stale once skb is pulled/cowed */
+	const struct in6_addr saddr = ip6h->saddr, daddr = ip6h->daddr;
+	const u8 hop_limit = ip6h->hop_limit;
+	struct icmp6hdr *icmp6h;
+	struct ipv6hdr *nip6h;
+	struct ethhdr eh;
+	int len, err;
+	__wsum csum;
+
+	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
+		return -EINVAL;
+
+	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
+	pskb_pull(skb, ETH_HLEN);
+	skb_reset_network_header(skb);
+
+	err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h));
+	if (err)
+		return err;
+
+	len = skb->len + sizeof(*icmp6h);
+	err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN);
+	if (err)
+		return err;
+
+	icmp6h = skb_push(skb, sizeof(*icmp6h));
+	*icmp6h = (struct icmp6hdr) {
+		.icmp6_type		= ICMPV6_PKT_TOOBIG,
+		.icmp6_code		= 0,
+		.icmp6_cksum		= 0,
+		.icmp6_mtu		= htonl(mtu),
+	};
+	skb_reset_transport_header(skb);
+	nip6h = skb_push(skb, sizeof(*nip6h));
+	*nip6h = (struct ipv6hdr) {
+		.priority		= 0,
+		.version		= 6,
+		.flow_lbl		= { 0 },
+		.payload_len		= htons(len),
+		.nexthdr		= IPPROTO_ICMPV6,
+		.hop_limit		= hop_limit,
+		.saddr			= daddr,
+		.daddr			= saddr,
+	};
+	skb_reset_network_header(skb);
+
+	/* Only the headers are known to be linear: checksum through the skb API. */
+	csum = skb_checksum(skb, skb_transport_offset(skb), len, 0);
+	icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len,
+					      IPPROTO_ICMPV6, csum);
+	skb->ip_summed = CHECKSUM_NONE;
+	eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
+	skb_reset_mac_header(skb);
+
+	return skb->len;
+}
+
+/**
+ * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed
+ * @skb:	Buffer being sent by encapsulation, L2 headers expected
+ * @mtu:	Network MTU for path
+ *
+ * Return: 0 for no ICMPv6 reply, length if built, negative value on error.
+ */
+static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	u8 nexthdr = ip6h->nexthdr;
+	__be16 frag_off;
+	int off;
+
+	if (mtu < IPV6_MIN_MTU)
+		return 0;
+
+	/* No reply to an unspecified, multicast or loopback source address. */
+	switch (ipv6_addr_type(&ip6h->saddr)) {
+	case IPV6_ADDR_ANY:
+	case IPV6_ADDR_MULTICAST:
+	case IPV6_ADDR_LOOPBACK:
+		return 0;
+	}
+
+	off = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off);
+	if (off < 0 || (frag_off & htons(~0x7)))
+		return 0;
+
+	if (nexthdr == IPPROTO_ICMPV6) {
+		const struct icmp6hdr *icmp6h;
+
+		if (!pskb_may_pull(skb, skb_network_header(skb) + off + 1 - skb->data))
+			return 0;
+
+		icmp6h = (const struct icmp6hdr *)(skb_network_header(skb) + off);
+		if (icmpv6_is_err(icmp6h->icmp6_type) || icmp6h->icmp6_type == NDISC_REDIRECT)
+			return 0;
+	}
+
+	return iptunnel_pmtud_build_icmpv6(skb, mtu);
+}
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+
+/**
+ * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed
+ * @skb:	Buffer being sent by encapsulation, L2 headers expected
+ * @encap_dst:	Destination for tunnel encapsulation (outer IP)
+ * @headroom:	Encapsulation header size, bytes
+ * @reply:	Build matching ICMP or ICMPv6 message as a result
+ *
+ * L2 tunnel implementations that can carry IP and can be directly bridged
+ * (currently UDP tunnels) can't always rely on IP forwarding paths to handle
+ * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built
+ * based on payload and sent back by the encapsulation itself.
+ *
+ * For routable interfaces, we just need to update the PMTU for the destination.
+ *
+ * Return: 0 if ICMP error not needed, length if built, negative value on error
+ */
+int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
+			  int headroom, bool reply)
+{
+	u32 mtu = dst_mtu(encap_dst) - headroom;
+	bool fits;
+
+	fits = skb_is_gso(skb) ? skb_gso_validate_network_len(skb, mtu) :
+				 skb->len - skb_network_offset(skb) <= mtu;
+	if (fits)
+		return 0;
+
+	skb_dst_update_pmtu_no_confirm(skb, mtu);
+	if (!reply || skb->pkt_type == PACKET_HOST)
+		return 0;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		return iptunnel_pmtud_check_icmp(skb, mtu);
+#if IS_ENABLED(CONFIG_IPV6)
+	if (skb->protocol == htons(ETH_P_IPV6))
+		return iptunnel_pmtud_check_icmpv6(skb, mtu);
+#endif
+	return 0;
+}
+EXPORT_SYMBOL(skb_tunnel_check_pmtu);
+
+static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
+	[LWTUNNEL_IP_UNSPEC]	= { .strict_start_type = LWTUNNEL_IP_OPTS },
+	[LWTUNNEL_IP_ID]	= { .type = NLA_U64 },	/* read with nla_get_be64() */
+	[LWTUNNEL_IP_DST]	= { .type = NLA_U32 },	/* read with nla_get_in_addr() */
+	[LWTUNNEL_IP_SRC]	= { .type = NLA_U32 },	/* read with nla_get_in_addr() */
+	[LWTUNNEL_IP_TTL]	= { .type = NLA_U8 },
+	[LWTUNNEL_IP_TOS]	= { .type = NLA_U8 },
+	[LWTUNNEL_IP_FLAGS]	= { .type = NLA_U16 },	/* read with nla_get_be16() */
+	[LWTUNNEL_IP_OPTS]	= { .type = NLA_NESTED },	/* checked against ip_opts_policy */
+};
+
+static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = {
+	[LWTUNNEL_IP_OPTS_GENEVE]	= { .type = NLA_NESTED }, /* ip_tun_parse_opts_geneve() */
+	[LWTUNNEL_IP_OPTS_VXLAN]	= { .type = NLA_NESTED }, /* ip_tun_parse_opts_vxlan() */
+	[LWTUNNEL_IP_OPTS_ERSPAN]	= { .type = NLA_NESTED }, /* ip_tun_parse_opts_erspan() */
+};
+
+/* DATA is capped at 127 so that data_len / 4 fits geneve_opt's 5-bit length field. */
+static const struct nla_policy geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
+	[LWTUNNEL_IP_OPT_GENEVE_CLASS]	= { .type = NLA_U16 },
+	[LWTUNNEL_IP_OPT_GENEVE_TYPE]	= { .type = NLA_U8 },
+	[LWTUNNEL_IP_OPT_GENEVE_DATA]	= { .type = NLA_BINARY, .len = 127 },
+};
+
+/* Attributes accepted inside a LWTUNNEL_IP_OPTS_VXLAN nest. */
+static const struct nla_policy vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = {
+	[LWTUNNEL_IP_OPT_VXLAN_GBP]	= { .type = NLA_U32 },
+};
+
+/* Attributes accepted inside a LWTUNNEL_IP_OPTS_ERSPAN nest. */
+static const struct nla_policy erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = {
+	[LWTUNNEL_IP_OPT_ERSPAN_VER]	= { .type = NLA_U8 },
+	[LWTUNNEL_IP_OPT_ERSPAN_INDEX]	= { .type = NLA_U32 },
+	[LWTUNNEL_IP_OPT_ERSPAN_DIR]	= { .type = NLA_U8 },
+	[LWTUNNEL_IP_OPT_ERSPAN_HWID]	= { .type = NLA_U8 },
+};
+
+/* Validate one GENEVE option; if @info is set, also store it at offset @opts_len. */
+static int ip_tun_parse_opts_geneve(struct nlattr *attr,
+				    struct ip_tunnel_info *info, int opts_len,
+				    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1];
+	const struct nlattr *cls, *type, *data;
+	struct geneve_opt *opt;
+	int data_len, err;
+
+	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr,
+			       geneve_opt_policy, extack);
+	if (err)
+		return err;
+
+	cls = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS];
+	type = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE];
+	data = tb[LWTUNNEL_IP_OPT_GENEVE_DATA];
+	if (!cls || !type || !data)
+		return -EINVAL;
+
+	data_len = nla_len(data);
+	if (data_len % 4)
+		return -EINVAL;
+	if (!info)
+		return sizeof(*opt) + data_len;
+
+	opt = ip_tunnel_info_opts(info) + opts_len;
+	opt->opt_class = nla_get_be16(cls);
+	opt->type = nla_get_u8(type);
+	opt->length = data_len / 4;
+	memcpy(opt->opt_data, nla_data(data), data_len);
+	info->key.tun_flags |= TUNNEL_GENEVE_OPT;
+
+	return sizeof(*opt) + data_len;
+}
+
+/* Validate the VXLAN GBP option; if @info is set, store the masked value at @opts_len. */
+static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
+				   struct ip_tunnel_info *info, int opts_len,
+				   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1];
+	struct vxlan_metadata *md;
+	int err;
+
+	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr,
+			       vxlan_opt_policy, extack);
+	if (err)
+		return err;
+
+	if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP])
+		return -EINVAL;
+
+	if (!info)
+		return sizeof(*md);
+
+	md = ip_tunnel_info_opts(info) + opts_len;
+	md->gbp = nla_get_u32(tb[LWTUNNEL_IP_OPT_VXLAN_GBP]);
+	md->gbp &= VXLAN_GBP_MASK;
+	info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+
+	return sizeof(*md);
+}
+
+/* Validate the ERSPAN option (v1: index, v2: dir + hwid); store it when @info is set. */
+static int ip_tun_parse_opts_erspan(struct nlattr *attr,
+				    struct ip_tunnel_info *info, int opts_len,
+				    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1];
+	struct erspan_metadata *md;
+	int err;
+	u8 ver;
+
+	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr,
+			       erspan_opt_policy, extack);
+	if (err)
+		return err;
+
+	if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER])
+		return -EINVAL;
+
+	ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]);
+	switch (ver) {
+	case 1:
+		if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX])
+			return -EINVAL;
+		break;
+	case 2:
+		if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] ||
+		    !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID])
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (!info)
+		return sizeof(*md);
+
+	md = ip_tunnel_info_opts(info) + opts_len;
+	md->version = ver;
+	if (ver == 1) {
+		md->u.index = nla_get_be32(tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]);
+	} else {
+		md->u.md2.dir = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_DIR]);
+		set_hwid(&md->u.md2, nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]));
+	}
+	info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+
+	return sizeof(*md);
+}
+
+/* Walk the nested option attributes; returns total option bytes or a negative errno. */
+static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
+			     struct netlink_ext_ack *extack)
+{
+	int err, rem, len, total = 0;
+	struct nlattr *nla;
+	__be16 seen = 0;
+
+	if (!attr)
+		return 0;
+
+	err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX,
+			   ip_opts_policy, extack);
+	if (err)
+		return err;
+
+	nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
+		switch (nla_type(nla)) {
+		case LWTUNNEL_IP_OPTS_GENEVE:
+			/* Only GENEVE options may follow one another. */
+			if (seen && seen != TUNNEL_GENEVE_OPT)
+				return -EINVAL;
+			len = ip_tun_parse_opts_geneve(nla, info, total, extack);
+			if (len < 0)
+				return len;
+			total += len;
+			if (total > IP_TUNNEL_OPTS_MAX)
+				return -EINVAL;
+			seen = TUNNEL_GENEVE_OPT;
+			break;
+		case LWTUNNEL_IP_OPTS_VXLAN:
+			/* VXLAN (like ERSPAN below) must be the first option seen. */
+			if (seen)
+				return -EINVAL;
+			len = ip_tun_parse_opts_vxlan(nla, info, total, extack);
+			if (len < 0)
+				return len;
+			total += len;
+			seen = TUNNEL_VXLAN_OPT;
+			break;
+		case LWTUNNEL_IP_OPTS_ERSPAN:
+			if (seen)
+				return -EINVAL;
+			len = ip_tun_parse_opts_erspan(nla, info, total, extack);
+			if (len < 0)
+				return len;
+			total += len;
+			seen = TUNNEL_ERSPAN_OPT;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	return total;
+}
+
+/* Sizing pass: run ip_tun_parse_opts() without a destination (info == NULL). */
+static int ip_tun_get_optlen(struct nlattr *attr, struct netlink_ext_ack *extack)
+{
+	return ip_tun_parse_opts(attr, NULL, extack);
+}
+
+static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
+			   struct netlink_ext_ack *extack)
+{
+	return ip_tun_parse_opts(attr, info, extack);	/* storing pass */
+}
+
+/* lwtunnel build_state hook: turn LWTUNNEL_IP_* attributes into TX tunnel metadata. */
+static int ip_tun_build_state(struct net *net, struct nlattr *attr,
+			      unsigned int family, const void *cfg,
+			      struct lwtunnel_state **ts,
+			      struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
+	struct lwtunnel_state *lwt;
+	struct ip_tunnel_info *info;
+	struct ip_tunnel_key *key;
+	int err, opt_len;
+
+	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr,
+					  ip_tun_policy, extack);
+	if (err < 0)
+		return err;
+
+	/* First pass only sizes the options so that the state can be allocated. */
+	opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack);
+	if (opt_len < 0)
+		return opt_len;
+
+	lwt = lwtunnel_state_alloc(sizeof(*info) + opt_len);
+	if (!lwt)
+		return -ENOMEM;
+
+	lwt->type = LWTUNNEL_ENCAP_IP;
+	info = lwt_tun_info(lwt);
+	key = &info->key;
+
+	err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], info, extack);
+	if (err < 0)
+		goto err_free;
+
+#ifdef CONFIG_DST_CACHE
+	err = dst_cache_init(&info->dst_cache, GFP_KERNEL);
+	if (err)
+		goto err_free;
+#endif
+
+	/* Copy whichever optional attributes were supplied. */
+	if (tb[LWTUNNEL_IP_ID])
+		key->tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
+
+	if (tb[LWTUNNEL_IP_DST])
+		key->u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]);
+	if (tb[LWTUNNEL_IP_SRC])
+		key->u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]);
+
+	if (tb[LWTUNNEL_IP_TTL])
+		key->ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
+	if (tb[LWTUNNEL_IP_TOS])
+		key->tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
+
+	if (tb[LWTUNNEL_IP_FLAGS])
+		key->tun_flags |= nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) &
+				  ~TUNNEL_OPTIONS_PRESENT;
+
+	info->mode = IP_TUNNEL_INFO_TX;
+	info->options_len = opt_len;
+	*ts = lwt;
+
+	return 0;
+
+err_free:
+	lwtstate_free(lwt);
+	return err;
+}
+
+/* lwtunnel destroy_state hook for the IPv4 tunnel encap. */
+static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate)
+{
+#ifdef CONFIG_DST_CACHE
+	/* Counterpart of the dst_cache_init() call in ip_tun_build_state(). */
+	dst_cache_destroy(&lwt_tun_info(lwtstate)->dst_cache);
+#endif
+}
+
+/* Dump every stored GENEVE option as CLASS/TYPE/DATA inside one nest. */
+static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb,
+					 struct ip_tunnel_info *tun_info)
+{
+	struct nlattr *nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE);
+	const struct geneve_opt *opt;
+	int off, data_len;
+
+	if (!nest)
+		return -ENOMEM;
+
+	for (off = 0; tun_info->options_len > off; off += sizeof(*opt) + data_len) {
+		opt = ip_tunnel_info_opts(tun_info) + off;
+		data_len = opt->length * 4;
+		if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, opt->opt_class) ||
+		    nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) ||
+		    nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, data_len, opt->opt_data))
+			goto cancel;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, nest);
+	return -ENOMEM;
+}
+
+/* Dump the stored VXLAN GBP value inside a LWTUNNEL_IP_OPTS_VXLAN nest. */
+static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb,
+					struct ip_tunnel_info *tun_info)
+{
+	const struct vxlan_metadata *md = ip_tunnel_info_opts(tun_info);
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN);
+	if (!nest)
+		return -ENOMEM;
+
+	if (!nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) {
+		nla_nest_end(skb, nest);
+		return 0;
+	}
+
+	nla_nest_cancel(skb, nest);
+	return -ENOMEM;
+}
+
+/* Dump the stored ERSPAN metadata; the attributes emitted depend on md->version. */
+static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb,
+					 struct ip_tunnel_info *tun_info)
+{
+	struct erspan_metadata *md = ip_tunnel_info_opts(tun_info);
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN);
+	if (!nest)
+		return -ENOMEM;
+
+	if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version))
+		goto cancel;
+
+	if (md->version == 1) {
+		if (nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index))
+			goto cancel;
+	} else if (md->version == 2) {
+		if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) ||
+		    nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID, get_hwid(&md->u.md2)))
+			goto cancel;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, nest);
+	return -ENOMEM;
+}
+
+/* Emit the @type nest holding the tunnel options, if any option flag is set. */
+static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
+				  struct ip_tunnel_info *tun_info)
+{
+	const __be16 flags = tun_info->key.tun_flags;
+	struct nlattr *nest;
+	int err = 0;
+
+	if (!(flags & TUNNEL_OPTIONS_PRESENT))
+		return 0;
+
+	nest = nla_nest_start_noflag(skb, type);
+	if (!nest)
+		return -ENOMEM;
+
+	if (flags & TUNNEL_GENEVE_OPT)
+		err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
+	else if (flags & TUNNEL_VXLAN_OPT)
+		err = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
+	else if (flags & TUNNEL_ERSPAN_OPT)
+		err = ip_tun_fill_encap_opts_erspan(skb, tun_info);
+	if (!err) {
+		nla_nest_end(skb, nest);
+		return 0;
+	}
+	nla_nest_cancel(skb, nest);
+	return err;
+}
+
+/* lwtunnel fill_encap hook: dump the IPv4 tunnel key and options into @skb. */
+static int ip_tun_fill_encap_info(struct sk_buff *skb,
+				  struct lwtunnel_state *lwtstate)
+{
+	struct ip_tunnel_info *info = lwt_tun_info(lwtstate);
+	const struct ip_tunnel_key *key = &info->key;
+	bool failed;
+
+	failed = nla_put_be64(skb, LWTUNNEL_IP_ID, key->tun_id, LWTUNNEL_IP_PAD) ||
+		 nla_put_in_addr(skb, LWTUNNEL_IP_DST, key->u.ipv4.dst) ||
+		 nla_put_in_addr(skb, LWTUNNEL_IP_SRC, key->u.ipv4.src) ||
+		 nla_put_u8(skb, LWTUNNEL_IP_TOS, key->tos) ||
+		 nla_put_u8(skb, LWTUNNEL_IP_TTL, key->ttl) ||
+		 nla_put_be16(skb, LWTUNNEL_IP_FLAGS, key->tun_flags) ||
+		 ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, info);
+	return failed ? -ENOMEM : 0;
+}
+
+/* Netlink space needed for the option attributes of @info (0 without options). */
+static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
+{
+	const __be16 flags = info->key.tun_flags;
+	int size, off;
+
+	if (!(flags & TUNNEL_OPTIONS_PRESENT))
+		return 0;
+
+	size = nla_total_size(0);		/* LWTUNNEL_IP_OPTS */
+	if (flags & TUNNEL_GENEVE_OPT) {
+		size += nla_total_size(0);	/* LWTUNNEL_IP_OPTS_GENEVE */
+		for (off = 0; info->options_len > off; ) {
+			struct geneve_opt *opt = ip_tunnel_info_opts(info) + off;
+			int data_len = opt->length * 4;
+
+			size += nla_total_size(2)	/* OPT_GENEVE_CLASS */
+				+ nla_total_size(1)	/* OPT_GENEVE_TYPE */
+				+ nla_total_size(data_len);
+							/* OPT_GENEVE_DATA */
+			off += sizeof(*opt) + data_len;
+		}
+	} else if (flags & TUNNEL_VXLAN_OPT) {
+		size += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_VXLAN */
+			+ nla_total_size(4);	/* OPT_VXLAN_GBP */
+	} else if (flags & TUNNEL_ERSPAN_OPT) {
+		struct erspan_metadata *md = ip_tunnel_info_opts(info);
+
+		size += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_ERSPAN */
+			+ nla_total_size(1);	/* OPT_ERSPAN_VER */
+		if (md->version == 1)
+			size += nla_total_size(4);	/* OPT_ERSPAN_INDEX (v1) */
+		else
+			size += 2 * nla_total_size(1);	/* OPT_ERSPAN_DIR + HWID (v2) */
+	}
+
+	return size;
+}
+
+/* lwtunnel get_encap_size hook: one term per attribute of ip_tun_fill_encap_info(). */
+static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	int size = ip_tun_opts_nlsize(lwt_tun_info(lwtstate));	/* LWTUNNEL_IP_OPTS */
+
+	size += nla_total_size_64bit(8);	/* LWTUNNEL_IP_ID */
+	size += 2 * nla_total_size(4);		/* LWTUNNEL_IP_DST + LWTUNNEL_IP_SRC */
+	size += 2 * nla_total_size(1);		/* LWTUNNEL_IP_TOS + LWTUNNEL_IP_TTL */
+	size += nla_total_size(2);		/* LWTUNNEL_IP_FLAGS */
+	return size;
+}
+
+/* Returns 0 when key, mode and options of both states match, 1 otherwise. */
+static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct ip_tunnel_info *x = lwt_tun_info(a), *y = lwt_tun_info(b);
+
+	if (memcmp(x, y, sizeof(x->key)) || x->mode != y->mode ||
+	    x->options_len != y->options_len)
+		return 1;
+
+	return !!memcmp(ip_tunnel_info_opts(x), ip_tunnel_info_opts(y), x->options_len);
+}
+
+static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {	/* for LWTUNNEL_ENCAP_IP */
+	.build_state = ip_tun_build_state,
+	.destroy_state = ip_tun_destroy_state,	/* releases the dst cache */
+	.fill_encap = ip_tun_fill_encap_info,
+	.get_encap_size = ip_tun_encap_nlsize,
+	.cmp_encap = ip_tun_cmp_encap,
+	.owner = THIS_MODULE,
+};
+
+static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
+	[LWTUNNEL_IP6_UNSPEC]	= { .strict_start_type = LWTUNNEL_IP6_OPTS },
+	[LWTUNNEL_IP6_ID]		= { .type = NLA_U64 },	/* read with nla_get_be64() */
+	[LWTUNNEL_IP6_DST]		= { .len = sizeof(struct in6_addr) },
+	[LWTUNNEL_IP6_SRC]		= { .len = sizeof(struct in6_addr) },
+	[LWTUNNEL_IP6_HOPLIMIT]		= { .type = NLA_U8 },
+	[LWTUNNEL_IP6_TC]		= { .type = NLA_U8 },
+	[LWTUNNEL_IP6_FLAGS]		= { .type = NLA_U16 },	/* read with nla_get_be16() */
+	[LWTUNNEL_IP6_OPTS]		= { .type = NLA_NESTED }, /* parsed by ip_tun_parse_opts() */
+};
+
+/* lwtunnel build_state hook: turn LWTUNNEL_IP6_* attributes into TX tunnel metadata. */
+static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
+			       unsigned int family, const void *cfg,
+			       struct lwtunnel_state **ts,
+			       struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
+	struct lwtunnel_state *lwt;
+	struct ip_tunnel_info *info;
+	struct ip_tunnel_key *key;
+	int err, opt_len;
+
+	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr,
+					  ip6_tun_policy, extack);
+	if (err < 0)
+		return err;
+
+	/* First pass only sizes the options so that the state can be allocated. */
+	opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack);
+	if (opt_len < 0)
+		return opt_len;
+
+	lwt = lwtunnel_state_alloc(sizeof(*info) + opt_len);
+	if (!lwt)
+		return -ENOMEM;
+
+	lwt->type = LWTUNNEL_ENCAP_IP6;
+	info = lwt_tun_info(lwt);
+	key = &info->key;
+
+	err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], info, extack);
+	if (err < 0) {
+		lwtstate_free(lwt);
+		return err;
+	}
+
+	/* Copy whichever optional attributes were supplied. */
+	if (tb[LWTUNNEL_IP6_ID])
+		key->tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);
+
+	if (tb[LWTUNNEL_IP6_DST])
+		key->u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]);
+	if (tb[LWTUNNEL_IP6_SRC])
+		key->u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]);
+
+	if (tb[LWTUNNEL_IP6_HOPLIMIT])
+		key->ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]);
+	if (tb[LWTUNNEL_IP6_TC])
+		key->tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
+
+	if (tb[LWTUNNEL_IP6_FLAGS])
+		key->tun_flags |= nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) &
+				  ~TUNNEL_OPTIONS_PRESENT;
+
+	info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
+	info->options_len = opt_len;
+	*ts = lwt;
+
+	return 0;
+}
+
+/* lwtunnel fill_encap hook: dump the IPv6 tunnel key and options into @skb. */
+static int ip6_tun_fill_encap_info(struct sk_buff *skb,
+				   struct lwtunnel_state *lwtstate)
+{
+	struct ip_tunnel_info *info = lwt_tun_info(lwtstate);
+	const struct ip_tunnel_key *key = &info->key;
+	bool failed;
+
+	failed = nla_put_be64(skb, LWTUNNEL_IP6_ID, key->tun_id, LWTUNNEL_IP6_PAD) ||
+		 nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &key->u.ipv6.dst) ||
+		 nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &key->u.ipv6.src) ||
+		 nla_put_u8(skb, LWTUNNEL_IP6_TC, key->tos) ||
+		 nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, key->ttl) ||
+		 nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, key->tun_flags) ||
+		 ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, info);
+	return failed ? -ENOMEM : 0;
+}
+
+/* lwtunnel get_encap_size hook: one term per attribute of ip6_tun_fill_encap_info(). */
+static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	int size = ip_tun_opts_nlsize(lwt_tun_info(lwtstate));	/* LWTUNNEL_IP6_OPTS */
+
+	size += nla_total_size_64bit(8);	/* LWTUNNEL_IP6_ID */
+	size += 2 * nla_total_size(16);		/* LWTUNNEL_IP6_DST + LWTUNNEL_IP6_SRC */
+	size += 2 * nla_total_size(1);		/* LWTUNNEL_IP6_HOPLIMIT + LWTUNNEL_IP6_TC */
+	size += nla_total_size(2);		/* LWTUNNEL_IP6_FLAGS */
+	return size;
+}
+
+static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {	/* for LWTUNNEL_ENCAP_IP6 */
+	.build_state = ip6_tun_build_state,	/* sets up no dst cache, so no destroy_state */
+	.fill_encap = ip6_tun_fill_encap_info,
+	.get_encap_size = ip6_tun_encap_nlsize,
+	.cmp_encap = ip_tun_cmp_encap,		/* shared with ip_tun_lwt_ops */
+	.owner = THIS_MODULE,
+};
+
+void __init ip_tunnel_core_init(void)
+{
+	/* If this assertion fires, check whether widening ip_tunnel_info's
+	 * options_len is a reasonable choice given how front ends use it
+	 * (e.g. it is part of flow keys).
+	 */
+	BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255);
+
+	lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
+	lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
+}
+
+DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);
+EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
+
+void ip_tunnel_need_metadata(void)
+{
+	static_branch_inc(&ip_tunnel_metadata_cnt);	/* undone by ip_tunnel_unneed_metadata() */
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
+
+void ip_tunnel_unneed_metadata(void)
+{
+	static_branch_dec(&ip_tunnel_metadata_cnt);	/* pairs with ip_tunnel_need_metadata() */
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
+
+/* Derive skb->protocol from the IP version field; 0 if neither IPv4 nor IPv6 matches. */
+__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb)
+{
+	const unsigned char *nh = skb_network_header(skb);
+
+	if (nh < skb->head)
+		return 0;
+	if (nh + sizeof(struct iphdr) <= skb_tail_pointer(skb) && ip_hdr(skb)->version == 4)
+		return htons(ETH_P_IP);
+	if (nh + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb) && ipv6_hdr(skb)->version == 6)
+		return htons(ETH_P_IPV6);
+	return 0;
+}
+EXPORT_SYMBOL(ip_tunnel_parse_protocol);
+
+const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol };
+EXPORT_SYMBOL(ip_tunnel_header_ops);
+
+/* Fill @encap from the ENCAP attributes in @data; true when at least one is present. */
+bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
+				   struct ip_tunnel_encap *encap)
+{
+	bool seen = false;
+
+	memset(encap, 0, sizeof(*encap));
+
+	if (!data)
+		return false;
+
+	if (data[IFLA_IPTUN_ENCAP_TYPE]) {
+		encap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
+		seen = true;
+	}
+
+	if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
+		encap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
+		seen = true;
+	}
+
+	if (data[IFLA_IPTUN_ENCAP_SPORT]) {
+		encap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
+		seen = true;
+	}
+
+	if (data[IFLA_IPTUN_ENCAP_DPORT]) {
+		encap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
+		seen = true;
+	}
+
+	return seen;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms);
+
+/* Copy the IFLA_IPTUN_* attributes present in @data into @parms. */
+void ip_tunnel_netlink_parms(struct nlattr *data[],
+			     struct ip_tunnel_parm *parms)
+{
+	struct iphdr *iph = &parms->iph;
+
+	if (data[IFLA_IPTUN_LINK])
+		parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
+	if (data[IFLA_IPTUN_FLAGS])
+		parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
+
+	if (data[IFLA_IPTUN_LOCAL])
+		iph->saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
+	if (data[IFLA_IPTUN_REMOTE])
+		iph->daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
+
+	if (data[IFLA_IPTUN_TOS])
+		iph->tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
+	if (data[IFLA_IPTUN_PROTO])
+		iph->protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+	if (data[IFLA_IPTUN_TTL])
+		iph->ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
+
+	/* DF is set for an explicit non-zero TTL, and whenever the PMTUDISC
+	 * attribute is absent or non-zero.
+	 */
+	if ((data[IFLA_IPTUN_TTL] && iph->ttl) ||
+	    !data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
+		iph->frag_off = htons(IP_DF);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_netlink_parms);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
new file mode 100644
index 0000000000..d1e7d0ceb7
--- /dev/null
+++ b/net/ipv4/ip_vti.c
@@ -0,0 +1,726 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *	Linux NET3: IP/IP protocol decoder modified to support
+ *		    virtual tunnel interface
+ *
+ *	Authors:
+ *		Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
+ */
+
+/*
+   This version of net/ipv4/ip_vti.c is a clone of net/ipv4/ipip.c
+
+   For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+#include <linux/icmpv6.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/ip_tunnels.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+static struct rtnl_link_ops vti_link_ops __read_mostly;
+/* pernet id: referenced by vti_net_ops.id and passed to net_generic() */
+static unsigned int vti_net_id __read_mostly;
+static int vti_tunnel_init(struct net_device *dev);
+
+/* Look up the vti tunnel matching the addresses in the IPv4 header of @skb
+ * and hand the packet to xfrm_input().  Returns -EINVAL when no tunnel
+ * matches; a packet failing the inbound policy check is freed, returning 0.
+ */
+static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
+		     int encap_type, bool update_skb_dev)
+{
+	struct ip_tunnel_net *itn = net_generic(dev_net(skb->dev), vti_net_id);
+	const struct iphdr *hdr = ip_hdr(skb);
+	struct ip_tunnel *t;
+
+	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+			     hdr->saddr, hdr->daddr, 0);
+	if (!t)
+		return -EINVAL;
+
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	/* remembered in the skb cb; vti_rcv_cb() reads it back */
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = t;
+	if (update_skb_dev)
+		skb->dev = t->dev;
+	return xfrm_input(skb, nexthdr, spi, encap_type);
+}
+
+/* .input_handler glue: vti_input() with update_skb_dev == false */
+static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+{
+	return vti_input(skb, nexthdr, spi, encap_type, false);
+}
+
+/* Record family and daddr offset in the skb cb, then run vti_input() */
+static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev)
+{
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev);
+}
+
+static int vti_rcv_proto(struct sk_buff *skb)
+{
+	return vti_rcv(skb, 0, false);	/* spi 0, skb->dev left untouched */
+}
+
+/* Callback for a packet tagged by vti_input().  Returns 1 when the skb
+ * carries no tunnel, 0 when the packet was accounted to the tunnel device
+ * (as an error/drop if @err is set), or a negative errno when the inner mode
+ * cannot be resolved or the inbound policy check fails.
+ */
+static int vti_rcv_cb(struct sk_buff *skb, int err)
+{
+	struct ip_tunnel *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
+	const struct xfrm_mode *mode;
+	const u32 saved_mark = skb->mark;
+	struct net_device *tdev;
+	struct xfrm_state *x;
+	int allowed;
+
+	if (!t)
+		return 1;
+	tdev = t->dev;
+
+	if (err) {
+		DEV_STATS_INC(tdev, rx_errors);
+		DEV_STATS_INC(tdev, rx_dropped);
+		return 0;
+	}
+
+	x = xfrm_input_state(skb);
+	if (x->sel.family != AF_UNSPEC) {
+		mode = &x->inner_mode;
+	} else {
+		/* selector has no family: pick the mode by the cb protocol */
+		mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+		if (!mode) {
+			XFRM_INC_STATS(dev_net(skb->dev),
+				       LINUX_MIB_XFRMINSTATEMODEERROR);
+			return -EINVAL;
+		}
+	}
+
+	/* policy check runs with the tunnel input key as mark; restore after */
+	skb->mark = be32_to_cpu(t->parms.i_key);
+	allowed = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, mode->family);
+	skb->mark = saved_mark;
+	if (!allowed)
+		return -EPERM;
+
+	skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev)));
+	skb->dev = tdev;
+	dev_sw_netstats_rx_add(tdev, skb->len);
+
+	return 0;
+}
+
+static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
+{
+	xfrm_address_t *remote = (xfrm_address_t *)&dst;
+	xfrm_address_t *local = (xfrm_address_t *)&src;
+
+	/* A tunnel without a transform is not functional. */
+	if (!x)
+		return false;
+
+	/* Only IPv4 tunnel-mode states are acceptable. */
+	if (x->props.mode != XFRM_MODE_TUNNEL || x->props.family != AF_INET)
+		return false;
+
+	/* No destination given: only the source has to match the state. */
+	if (!dst)
+		return xfrm_addr_equal(local, &x->props.saddr, AF_INET);
+
+	/* Otherwise both addresses are checked against the state. */
+	return xfrm_state_addr_check(x, remote, local, AF_INET) != 0;
+}
+
+/* Route @skb through xfrm for flow @fl and transmit it; always NETDEV_TX_OK. */
+static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct ip_tunnel_parm *parms = &tunnel->parms;
+	struct dst_entry *dst = skb_dst(skb);
+	struct net_device *tdev;	/* Device to other host */
+	int pkt_len = skb->len;	/* sampled before dst_output(), fed to the stats */
+	int err;
+	int mtu;
+
+	if (!dst) {
+		switch (skb->protocol) {
+		case htons(ETH_P_IP): {
+			struct rtable *rt;
+
+			fl->u.ip4.flowi4_oif = dev->ifindex;
+			fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+			rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
+			if (IS_ERR(rt)) {
+				DEV_STATS_INC(dev, tx_carrier_errors);
+				goto tx_error_icmp;
+			}
+			dst = &rt->dst;
+			skb_dst_set(skb, dst);
+			break;
+		}
+#if IS_ENABLED(CONFIG_IPV6)
+		case htons(ETH_P_IPV6):
+			fl->u.ip6.flowi6_oif = dev->ifindex;
+			fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
+			dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
+			if (dst->error) {
+				dst_release(dst);
+				dst = NULL;
+				DEV_STATS_INC(dev, tx_carrier_errors);
+				goto tx_error_icmp;
+			}
+			skb_dst_set(skb, dst);
+			break;
+#endif
+		default:
+			DEV_STATS_INC(dev, tx_carrier_errors);
+			goto tx_error_icmp;
+		}
+	}
+	/* NOTE(review): extra ref presumably consumed by xfrm_lookup_route() - confirm */
+	dst_hold(dst);
+	dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0);
+	if (IS_ERR(dst)) {
+		DEV_STATS_INC(dev, tx_carrier_errors);
+		goto tx_error_icmp;
+	}
+	/* DST_XFRM_QUEUE dsts skip the state, loop and MTU checks below */
+	if (dst->flags & DST_XFRM_QUEUE)
+		goto xmit;
+
+	if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
+		DEV_STATS_INC(dev, tx_carrier_errors);
+		dst_release(dst);
+		goto tx_error_icmp;
+	}
+	/* the xfrm route must not lead back to this very device */
+	tdev = dst->dev;
+
+	if (tdev == dev) {
+		dst_release(dst);
+		DEV_STATS_INC(dev, collisions);
+		goto tx_error;
+	}
+	/* too big: update PMTU; only IPv4 without DF is still transmitted */
+	mtu = dst_mtu(dst);
+	if (skb->len > mtu) {
+		skb_dst_update_pmtu_no_confirm(skb, mtu);
+		if (skb->protocol == htons(ETH_P_IP)) {
+			if (!(ip_hdr(skb)->frag_off & htons(IP_DF)))
+				goto xmit;
+			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+				      htonl(mtu));
+		} else {
+			if (mtu < IPV6_MIN_MTU)
+				mtu = IPV6_MIN_MTU;
+
+			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		}
+
+		dst_release(dst);
+		goto tx_error;
+	}
+
+xmit:
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
+	skb_dst_set(skb, dst);
+	skb->dev = skb_dst(skb)->dev;
+
+	err = dst_output(tunnel->net, skb->sk, skb);
+	if (net_xmit_eval(err) == 0)
+		err = pkt_len;
+	iptunnel_xmit_stats(dev, err);
+	return NETDEV_TX_OK;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	DEV_STATS_INC(dev, tx_errors);
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* ndo_start_xmit handler.  This function assumes it is being called from
+ * dev_queue_xmit() and that skb is filled properly by that function.
+ */
+static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ip_tunnel *t = netdev_priv(dev);
+	unsigned int family;
+	struct flowi flow;
+
+	if (!pskb_inet_may_pull(skb))
+		goto drop;
+
+	memset(&flow, 0, sizeof(flow));
+
+	/* clear the control block that belongs to the packet's family */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+		family = AF_INET;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+		family = AF_INET6;
+	} else {
+		goto drop;
+	}
+	xfrm_decode_session(skb, &flow, family);
+
+	/* override mark with tunnel output key */
+	flow.flowi_mark = be32_to_cpu(t->parms.o_key);
+
+	return vti_xmit(skb, dev, &flow);
+
+drop:
+	DEV_STATS_INC(dev, tx_errors);
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* ICMP error handler: -1 when no vti tunnel matches the IP header at
+ * skb->data, else 0 (PMTU/redirect applied only if the SPI finds a state).
+ */
+static int vti4_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	const unsigned int hlen = iph->ihl << 2;
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+	int protocol = iph->protocol;
+	struct ip_tunnel *tunnel;
+	struct xfrm_state *x;
+	bool pmtu;
+	__u32 mark;
+	__be32 spi;
+
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+				  iph->daddr, iph->saddr, 0);
+	if (!tunnel)
+		return -1;
+	mark = be32_to_cpu(tunnel->parms.o_key);
+
+	switch (protocol) {
+	case IPPROTO_ESP:
+		spi = ((struct ip_esp_hdr *)(skb->data + hlen))->spi;
+		break;
+	case IPPROTO_AH:
+		spi = ((struct ip_auth_hdr *)(skb->data + hlen))->spi;
+		break;
+	case IPPROTO_COMP:
+		spi = htonl(ntohs(((struct ip_comp_hdr *)(skb->data + hlen))->cpi));
+		break;
+	default:
+		return 0;
+	}
+
+	switch (icmp_hdr(skb)->type) {
+	case ICMP_DEST_UNREACH:
+		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+			return 0;
+		pmtu = true;
+		break;
+	case ICMP_REDIRECT:
+		pmtu = false;
+		break;
+	default:
+		return 0;
+	}
+
+	x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
+			      spi, protocol, AF_INET);
+	if (!x)
+		return 0;
+
+	if (pmtu)
+		ipv4_update_pmtu(skb, net, info, 0, protocol);
+	else
+		ipv4_redirect(skb, net, 0, protocol);
+	xfrm_state_put(x);
+
+	return 0;
+}
+
+/* ndo_tunnel_ctl: validate and normalise @p, then defer to ip_tunnel_ctl(). */
+static int
+vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+{
+	const bool add_or_change = cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL;
+	int err;
+
+	/* add/change require version 4, ihl 5 and protocol IPIP */
+	if (add_or_change && (p->iph.version != 4 || p->iph.ihl != 5 ||
+			      p->iph.protocol != IPPROTO_IPIP))
+		return -EINVAL;
+
+	if (!(p->i_flags & GRE_KEY))
+		p->i_key = 0;
+	if (!(p->o_flags & GRE_KEY))
+		p->o_key = 0;
+	p->i_flags = VTI_ISVTI;
+
+	err = ip_tunnel_ctl(dev, p, cmd);
+	if (err)
+		return err;
+
+	if (cmd == SIOCDELTUNNEL)
+		return 0;
+	p->i_flags |= GRE_KEY;
+	p->o_flags |= GRE_KEY;
+	return 0;
+}
+
+static const struct net_device_ops vti_netdev_ops = {
+	.ndo_init		= vti_tunnel_init,
+	.ndo_uninit		= ip_tunnel_uninit,
+	.ndo_start_xmit		= vti_tunnel_xmit,
+	.ndo_change_mtu		= ip_tunnel_change_mtu,
+	.ndo_get_stats64	= dev_get_tstats64,
+	.ndo_get_iflink		= ip_tunnel_get_iflink,
+	.ndo_siocdevprivate	= ip_tunnel_siocdevprivate,
+	.ndo_tunnel_ctl		= vti_tunnel_ctl,
+};
+
+static void vti_tunnel_setup(struct net_device *dev)
+{
+	dev->type = ARPHRD_TUNNEL;
+	dev->header_ops = &ip_tunnel_header_ops;
+	dev->netdev_ops = &vti_netdev_ops;
+	ip_tunnel_setup(dev, vti_net_id);	/* generic part runs last */
+}
+
+/* ndo_init: use saddr/daddr as device/broadcast address, then ip_tunnel_init() */
+static int vti_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *t = netdev_priv(dev);
+	struct iphdr *hdr = &t->parms.iph;
+
+	__dev_addr_set(dev, &hdr->saddr, 4);
+	memcpy(dev->broadcast, &hdr->daddr, 4);
+
+	dev->flags = IFF_NOARP;
+	dev->addr_len = 4;
+	dev->features |= NETIF_F_LLTX;
+	netif_keep_dst(dev);
+	return ip_tunnel_init(dev);
+}
+
+static void __net_init vti_fb_tunnel_init(struct net_device *dev)
+{
+	struct iphdr *hdr = &((struct ip_tunnel *)netdev_priv(dev))->parms.iph;
+
+	/* header template of the fallback device: v4, ihl 5, protocol IPIP */
+	hdr->ihl = 5;
+	hdr->version = 4;
+	hdr->protocol = IPPROTO_IPIP;
+}
+
+static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
+	.priority	= 100,
+	.handler	= vti_rcv_proto,
+	.input_handler	= vti_input_proto,
+	.cb_handler	= vti_rcv_cb,
+	.err_handler	= vti4_err,
+};
+
+static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
+	.priority	= 100,
+	.handler	= vti_rcv_proto,
+	.input_handler	= vti_input_proto,
+	.cb_handler	= vti_rcv_cb,
+	.err_handler	= vti4_err,
+};
+
+static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
+	.priority	= 100,
+	.handler	= vti_rcv_proto,
+	.input_handler	= vti_input_proto,
+	.cb_handler	= vti_rcv_cb,
+	.err_handler	= vti4_err,
+};
+
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+static int vti_rcv_tunnel(struct sk_buff *skb)
+{
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	/* the source address is passed in vti_input()'s spi argument */
+	return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false);
+}
+
+static struct xfrm_tunnel vti_ipip_handler __read_mostly = {
+	.priority	= 0,
+	.handler	= vti_rcv_tunnel,
+	.cb_handler	= vti_rcv_cb,
+	.err_handler	= vti4_err,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct xfrm_tunnel vti_ipip6_handler __read_mostly = {
+	.priority	= 0,
+	.handler	= vti_rcv_tunnel,
+	.cb_handler	= vti_rcv_cb,
+	.err_handler	= vti4_err,
+};
+#endif
+#endif
+
+/* Per-netns init: generic tunnel state, then the fallback device if present */
+static int __net_init vti_init_net(struct net *net)
+{
+	int rc = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
+	struct ip_tunnel_net *itn;
+
+	if (rc)
+		return rc;
+	itn = net_generic(net, vti_net_id);
+	if (itn->fb_tunnel_dev)
+		vti_fb_tunnel_init(itn->fb_tunnel_dev);
+	return 0;
+}
+
+static void __net_exit vti_exit_batch_net(struct list_head *list_net)
+{
+	ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops);	/* .exit_batch of vti_net_ops */
+}
+
+static struct pernet_operations vti_net_ops = {
+	.id		= &vti_net_id,
+	.size		= sizeof(struct ip_tunnel_net),
+	.init		= vti_init_net,
+	.exit_batch	= vti_exit_batch_net,
+};
+
+static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
+			       struct netlink_ext_ack *extack)
+{
+	return 0;	/* every request is accepted here */
+}
+
+/* Build @parms from the IFLA_VTI_* attributes in @data.  @parms is zeroed
+ * first and always gets protocol IPPROTO_IPIP; with a NULL @data nothing else
+ * is filled in.  *@fwmark is only written when IFLA_VTI_FWMARK is present.
+ */
+static void vti_netlink_parms(struct nlattr *data[],
+			      struct ip_tunnel_parm *parms,
+			      __u32 *fwmark)
+{
+	memset(parms, 0, sizeof(*parms));
+	parms->iph.protocol = IPPROTO_IPIP;
+	if (!data)
+		return;
+
+	parms->i_flags = VTI_ISVTI;
+
+	if (data[IFLA_VTI_LINK])
+		parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
+	if (data[IFLA_VTI_IKEY])
+		parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
+	if (data[IFLA_VTI_OKEY])
+		parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
+
+	if (data[IFLA_VTI_LOCAL])
+		parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]);
+	if (data[IFLA_VTI_REMOTE])
+		parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]);
+
+	/* the caller's value stays in place when the attribute is absent */
+	if (data[IFLA_VTI_FWMARK])
+		*fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]);
+}
+
+static int vti_newlink(struct net *src_net, struct net_device *dev,
+		       struct nlattr *tb[], struct nlattr *data[],
+		       struct netlink_ext_ack *extack)
+{
+	__u32 mark = 0;		/* used when IFLA_VTI_FWMARK is absent */
+	struct ip_tunnel_parm p;
+
+	vti_netlink_parms(data, &p, &mark);
+	return ip_tunnel_newlink(dev, tb, &p, mark);
+}
+
+static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
+			  struct nlattr *data[],
+			  struct netlink_ext_ack *extack)
+{
+	/* start from the current mark so an absent attribute keeps it */
+	__u32 mark = ((struct ip_tunnel *)netdev_priv(dev))->fwmark;
+	struct ip_tunnel_parm parms;
+
+	vti_netlink_parms(data, &parms, &mark);
+	return ip_tunnel_changelink(dev, tb, &parms, mark);
+}
+
+/* rtnl_link_ops.get_size: room for the attributes that vti_fill_info()
+ * emits.
+ */
+static size_t vti_get_size(const struct net_device *dev)
+{
+	/* One nla_total_size(4) entry for each of:
+	 *	IFLA_VTI_LINK
+	 *	IFLA_VTI_IKEY
+	 *	IFLA_VTI_OKEY
+	 *	IFLA_VTI_LOCAL
+	 *	IFLA_VTI_REMOTE
+	 *	IFLA_VTI_FWMARK
+	 */
+	const size_t per_attr = nla_total_size(4);
+
+	return 6 * per_attr;
+}
+
+static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	const struct ip_tunnel *tunnel = netdev_priv(dev);
+	const struct ip_tunnel_parm *parms = &tunnel->parms;
+	bool failed;
+
+	/* short-circuits at the first attribute that cannot be added */
+	failed = nla_put_u32(skb, IFLA_VTI_LINK, parms->link) ||
+		 nla_put_be32(skb, IFLA_VTI_IKEY, parms->i_key) ||
+		 nla_put_be32(skb, IFLA_VTI_OKEY, parms->o_key) ||
+		 nla_put_in_addr(skb, IFLA_VTI_LOCAL, parms->iph.saddr) ||
+		 nla_put_in_addr(skb, IFLA_VTI_REMOTE, parms->iph.daddr) ||
+		 nla_put_u32(skb, IFLA_VTI_FWMARK, tunnel->fwmark);
+	return failed ? -EMSGSIZE : 0;
+}
+
+static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
+	[IFLA_VTI_LINK]   = { .type = NLA_U32 },
+	[IFLA_VTI_IKEY]   = { .type = NLA_U32 },
+	[IFLA_VTI_OKEY]   = { .type = NLA_U32 },
+	[IFLA_VTI_FWMARK] = { .type = NLA_U32 },
+	[IFLA_VTI_LOCAL]  = { .len = sizeof_field(struct iphdr, saddr) },
+	[IFLA_VTI_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) },
+};
+
+static struct rtnl_link_ops vti_link_ops __read_mostly = {
+	.kind		= "vti",
+	.priv_size	= sizeof(struct ip_tunnel),
+	.maxtype	= IFLA_VTI_MAX,
+	.policy		= vti_policy,
+	.setup		= vti_tunnel_setup,
+	.validate	= vti_tunnel_validate,
+	.newlink	= vti_newlink,
+	.changelink	= vti_changelink,
+	.dellink	= ip_tunnel_dellink,
+	.get_size	= vti_get_size,
+	.fill_info	= vti_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
+};
+
+static int __init vti_init(void)
+{
+	const char *msg;
+	int err;
+
+	pr_info("IPv4 over IPsec tunneling driver\n");
+	/* msg tracks the stage in progress for the pr_err() at the end */
+	msg = "tunnel device";
+	err = register_pernet_device(&vti_net_ops);
+	if (err < 0)
+		goto pernet_dev_failed;
+
+	msg = "tunnel protocols";
+	err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
+	if (err < 0)
+		goto xfrm_proto_esp_failed;
+	err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
+	if (err < 0)
+		goto xfrm_proto_ah_failed;
+	err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
+	if (err < 0)
+		goto xfrm_proto_comp_failed;
+
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+	msg = "ipip tunnel";
+	err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET);
+	if (err < 0)
+		goto xfrm_tunnel_ipip_failed;
+#if IS_ENABLED(CONFIG_IPV6)
+	err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6);
+	if (err < 0)
+		goto xfrm_tunnel_ipip6_failed;
+#endif
+#endif
+
+	msg = "netlink interface";
+	err = rtnl_link_register(&vti_link_ops);
+	if (err < 0)
+		goto rtnl_link_failed;
+
+	return err;
+	/* unwind: each label undoes what was registered before the failing step */
+rtnl_link_failed:
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+#if IS_ENABLED(CONFIG_IPV6)
+	xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6);
+xfrm_tunnel_ipip6_failed:
+#endif
+	xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET);
+xfrm_tunnel_ipip_failed:
+#endif
+	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+xfrm_proto_comp_failed:
+	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+	unregister_pernet_device(&vti_net_ops);
+pernet_dev_failed:
+	pr_err("vti init: failed to register %s\n", msg);
+	return err;
+}
+
+static void __exit vti_fini(void)
+{
+	rtnl_link_unregister(&vti_link_ops);	/* teardown mirrors vti_init() in reverse */
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+#if IS_ENABLED(CONFIG_IPV6)
+	xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6);
+#endif
+	xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET);
+#endif
+	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+	unregister_pernet_device(&vti_net_ops);
+}
+
+module_init(vti_init);
+module_exit(vti_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("vti");
+MODULE_ALIAS_NETDEV("ip_vti0");
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
new file mode 100644
index 0000000000..5a4fb2539b
--- /dev/null
+++ b/net/ipv4/ipcomp.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * IP Payload Compression Protocol (IPComp) - RFC3173.
+ *
+ * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
+ *
+ * Todo:
+ *   - Tunable compression parameters.
+ *   - Compression stats.
+ *   - Adaptive compression.
+ */
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/rtnetlink.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+#include <net/ipcomp.h>
+#include <net/protocol.h>
+#include <net/sock.h>
+
+/* ICMP error handler for IPComp; always returns 0. */
+static int ipcomp4_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	const struct ip_comp_hdr *ipch;
+	const u8 type = icmp_hdr(skb)->type;
+	struct net *net = dev_net(skb->dev);
+	struct xfrm_state *x;
+	__be32 spi;
+
+	/* only FRAG_NEEDED and REDIRECT are acted upon */
+	if (type == ICMP_DEST_UNREACH) {
+		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+			return 0;
+	} else if (type != ICMP_REDIRECT) {
+		return 0;
+	}
+
+	/* the CPI found right after the IP header is widened into the SPI */
+	ipch = (const struct ip_comp_hdr *)(skb->data + (iph->ihl << 2));
+	spi = htonl(ntohs(ipch->cpi));
+	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+			      spi, IPPROTO_COMP, AF_INET);
+	if (!x)
+		return 0;
+
+	if (type == ICMP_DEST_UNREACH)
+		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP);
+	else
+		ipv4_redirect(skb, net, 0, IPPROTO_COMP);
+	xfrm_state_put(x);
+	return 0;
+}
+
+/* Allocate an IPPROTO_IPIP state derived from @x, copying addresses,
+ * selector, mode, flags, mark and if_id.  Returns NULL on failure.
+ * We always hold one tunnel user reference to indicate a tunnel.
+ */
+static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
+{
+	struct xfrm_state *t = xfrm_state_alloc(xs_net(x));
+
+	if (!t)
+		return NULL;
+
+	/* identity: the source address of @x is used as SPI */
+	t->id.proto = IPPROTO_IPIP;
+	t->id.spi = x->props.saddr.a4;
+	t->id.daddr.a4 = x->id.daddr.a4;
+
+	t->props.family = AF_INET;
+	t->props.mode = x->props.mode;
+	t->props.saddr.a4 = x->props.saddr.a4;
+	t->props.flags = x->props.flags;
+	t->props.extra_flags = x->props.extra_flags;
+
+	memcpy(&t->sel, &x->sel, sizeof(t->sel));
+	memcpy(&t->mark, &x->mark, sizeof(t->mark));
+	t->if_id = x->if_id;
+
+	if (xfrm_init_state(t)) {
+		/* mark the state dead, then drop our reference */
+		t->km.state = XFRM_STATE_DEAD;
+		xfrm_state_put(t);
+		return NULL;
+	}
+	atomic_set(&t->tunnel_users, 1);
+	return t;
+}
+
+/*
+ * Must be protected by xfrm_cfg_mutex.  State and tunnel user references are
+ * always incremented on success.  Returns 0, or -EINVAL when no IPIP state
+ * is found and a new one cannot be created.
+ */
+static int ipcomp_tunnel_attach(struct xfrm_state *x)
+{
+	const u32 mark = x->mark.v & x->mark.m;
+	struct xfrm_state *tun;
+
+	tun = xfrm_state_lookup(xs_net(x), mark,
+				(xfrm_address_t *)&x->id.daddr.a4,
+				x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
+	if (!tun) {
+		tun = ipcomp_tunnel_create(x);
+		if (!tun)
+			return -EINVAL;
+
+		/* insert the new state and take the state reference */
+		xfrm_state_insert(tun);
+		xfrm_state_hold(tun);
+	}
+
+	x->tunnel = tun;
+	atomic_inc(&tun->tunnel_users);
+	return 0;
+}
+
+/* xfrm_type.init_state for IPComp over IPv4.  Returns 0 or a negative errno;
+ * only transport and tunnel mode are accepted.
+ */
+static int ipcomp4_init_state(struct xfrm_state *x,
+			      struct netlink_ext_ack *extack)
+{
+	int err;
+
+	/* transport mode adds no header; tunnel mode adds a struct iphdr */
+	if (x->props.mode == XFRM_MODE_TRANSPORT) {
+		x->props.header_len = 0;
+	} else if (x->props.mode == XFRM_MODE_TUNNEL) {
+		x->props.header_len = sizeof(struct iphdr);
+	} else {
+		x->props.header_len = 0;
+		NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp");
+		return -EINVAL;
+	}
+
+	err = ipcomp_init_state(x, extack);
+	if (err)
+		return err;
+
+	if (x->props.mode != XFRM_MODE_TUNNEL)
+		return 0;
+
+	err = ipcomp_tunnel_attach(x);
+	if (err)
+		NL_SET_ERR_MSG(extack,
+			       "Kernel error: failed to initialize the associated state");
+
+	return err;
+}
+
+static int ipcomp4_rcv_cb(struct sk_buff *skb, int err)
+{
+	return 0;	/* both arguments are ignored */
+}
+
+static const struct xfrm_type ipcomp_type = {
+	.owner		= THIS_MODULE,
+	.proto		= IPPROTO_COMP,
+	.init_state	= ipcomp4_init_state,
+	.destructor	= ipcomp_destroy,
+	.input		= ipcomp_input,
+	.output		= ipcomp_output,
+};
+
+static struct xfrm4_protocol ipcomp4_protocol = {
+	.priority	= 0,
+	.handler	= xfrm4_rcv,
+	.input_handler	= xfrm_input,
+	.cb_handler	= ipcomp4_rcv_cb,
+	.err_handler	= ipcomp4_err,
+};
+
+static int __init ipcomp4_init(void)
+{
+	if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
+		pr_info("%s: can't add xfrm type\n", __func__);
+		return -EAGAIN;
+	}
+	if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) >= 0)
+		return 0;
+	/* roll back the type registration */
+	pr_info("%s: can't add protocol\n", __func__);
+	xfrm_unregister_type(&ipcomp_type, AF_INET);
+	return -EAGAIN;
+}
+
+static void __exit ipcomp4_fini(void)
+{
+	if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+		pr_info("%s: can't remove protocol\n", __func__);
+	xfrm_unregister_type(&ipcomp_type, AF_INET);	/* done even if the deregister failed */
+}
+
+module_init(ipcomp4_init);
+module_exit(ipcomp4_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp/IPv4) - RFC3173");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_COMP);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
new file mode 100644
index 0000000000..c56b6fe6f0
--- /dev/null
+++ b/net/ipv4/ipconfig.c
@@ -0,0 +1,1851 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or
+ *  user-supplied information to configure own IP address and routes.
+ *
+ *  Copyright (C) 1996-1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ *  Derived from network configuration code in fs/nfs/nfsroot.c,
+ *  originally Copyright (C) 1995, 1996 Gero Kuhlmann and me.
+ *
+ *  BOOTP rewritten to construct and analyse packets itself instead
+ *  of misusing the IP layer. num_bugs_causing_wrong_arp_replies--;
+ *					     -- MJ, December 1998
+ *
+ *  Fixed ip_auto_config_setup calling at startup in the new "Linker Magic"
+ *  initialization scheme.
+ *	- Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999
+ *
+ *  DHCP support added.  To users this looks like a whole separate
+ *  protocol, but we know it's just a bag on the side of BOOTP.
+ *		-- Chip Salzenberg <chip@valinux.com>, May 2000
+ *
+ *  Ported DHCP support from 2.2.16 to 2.4.0-test4
+ *              -- Eric Biederman <ebiederman@lnxi.com>, 30 Aug 2000
+ *
+ *  Merged changes from 2.2.19 into 2.4.3
+ *              -- Eric Biederman <ebiederman@lnxi.com>, 22 April Aug 2001
+ *
+ *  Multiple Nameservers in /proc/net/pnp
+ *              --  Josef Siemes <jsiemes@web.de>, Aug 2002
+ *
+ *  NTP servers in /proc/net/ipconfig/ntp_servers
+ *              --  Chris Novakovic <chris@chrisn.me.uk>, April 2018
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/socket.h>
+#include <linux/route.h>
+#include <linux/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/major.h>
+#include <linux/root_dev.h>
+#include <linux/delay.h>
+#include <linux/nfs_fs.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/ipconfig.h>
+#include <net/route.h>
+
+#include <linux/uaccess.h>
+#include <net/checksum.h>
+#include <asm/processor.h>
+
+#if defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_DHCP
+#endif
+#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_BOOTP
+#endif
+#if defined(CONFIG_IP_PNP_RARP)
+#define IPCONFIG_RARP
+#endif
+#if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP)
+#define IPCONFIG_DYNAMIC
+#endif
+
+/* Define the friendly delay before and after opening net devices */
+#define CONF_POST_OPEN		10	/* After opening: 10 msecs */
+
+/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
+#define CONF_OPEN_RETRIES 	2	/* (Re)open devices twice */
+#define CONF_SEND_RETRIES 	6	/* Send six requests per open */
+#define CONF_BASE_TIMEOUT	(HZ*2)	/* Initial timeout: 2 seconds */
+#define CONF_TIMEOUT_RANDOM	(HZ)	/* Maximum amount of randomization */
+#define CONF_TIMEOUT_MULT	*7/4	/* Rate of timeout growth */
+#define CONF_TIMEOUT_MAX	(HZ*30)	/* Maximum allowed timeout */
+#define CONF_NAMESERVERS_MAX   3       /* Maximum number of nameservers
+					   - '3' from resolv.h */
+#define CONF_NTP_SERVERS_MAX   3	/* Maximum number of NTP servers */
+
+#define NONE cpu_to_be32(INADDR_NONE)
+#define ANY cpu_to_be32(INADDR_ANY)
+
+/* Wait for carrier timeout default in seconds */
+static unsigned int carrier_timeout = 120;
+
+/*
+ * Public IP configuration
+ */
+
+/* This is used by platforms which might be able to set the ipconfig
+ * variables using firmware environment vars.  If this is set, it will
+ * ignore such firmware variables.
+ */
+int ic_set_manually __initdata = 0;		/* IPconfig parameters set manually */
+
+static int ic_enable __initdata;		/* IP config enabled? */
+
+/* Protocol choice */
+int ic_proto_enabled __initdata = 0
+#ifdef IPCONFIG_BOOTP
+			| IC_BOOTP
+#endif
+#ifdef CONFIG_IP_PNP_DHCP
+			| IC_USE_DHCP
+#endif
+#ifdef IPCONFIG_RARP
+			| IC_RARP
+#endif
+			;
+
+static int ic_host_name_set __initdata;	/* Host name set by us? */
+
+__be32 ic_myaddr = NONE;		/* My IP address */
+static __be32 ic_netmask = NONE;	/* Netmask for local subnet */
+__be32 ic_gateway = NONE;	/* Gateway IP address */
+
+#ifdef IPCONFIG_DYNAMIC
+static __be32 ic_addrservaddr = NONE;	/* IP address of the IP-address server */
+#endif
+
+__be32 ic_servaddr = NONE;	/* Boot server IP address */
+
+__be32 root_server_addr = NONE;	/* Address of NFS server */
+u8 root_server_path[256] = { 0, };	/* Path to mount as root */
+
+/* vendor class identifier */
+static char vendor_class_identifier[253] __initdata;
+
+#if defined(CONFIG_IP_PNP_DHCP)
+static char dhcp_client_identifier[253] __initdata;
+#endif
+
+/* Persistent data: */
+
+#ifdef IPCONFIG_DYNAMIC
+static int ic_proto_used;			/* Protocol used, if any */
+#else
+#define ic_proto_used 0
+#endif
+static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static __be32 ic_ntp_servers[CONF_NTP_SERVERS_MAX]; /* NTP server IP addresses */
+static u8 ic_domain[64];		/* DNS (not NIS) domain name */
+
+/*
+ * Private state.
+ */
+
+/* Name of user-selected boot device */
+static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
+
+/* Protocols supported by available interfaces */
+static int ic_proto_have_if __initdata;
+
+/* MTU for boot device */
+static int ic_dev_mtu __initdata;
+
+#ifdef IPCONFIG_DYNAMIC
+static DEFINE_SPINLOCK(ic_recv_lock);
+static volatile int ic_got_reply __initdata;    /* Proto(s) that replied */
+#endif
+#ifdef IPCONFIG_DHCP
+static int ic_dhcp_msgtype __initdata;	/* DHCP msg type received */
+#endif
+
+
+/*
+ *	Network devices
+ */
+
+struct ic_device {
+	struct ic_device *next;
+	struct net_device *dev;
+	unsigned short flags;	/* dev->flags as saved before IFF_UP was set */
+	short able;		/* IC_BOOTP/IC_RARP bits usable on this device */
+	__be32 xid;		/* random id if BOOTP-able, else 0 */
+};
+
+static struct ic_device *ic_first_dev __initdata;	/* List of open devices */
+static struct ic_device *ic_dev __initdata;		/* Selected device */
+
+static bool __init ic_is_init_dev(struct net_device *dev)
+{
+	if (dev->flags & IFF_LOOPBACK)
+		return false;
+	if (user_dev_name[0])
+		return !strcmp(dev->name, user_dev_name);
+	/* no user choice: any ptp/broadcast device whose name is not "dummy*" */
+	return (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && strncmp(dev->name, "dummy", 5);
+}
+
+/* Open loopback and candidate devices, wait for carrier; 0, -ENOMEM or -ENODEV. */
+static int __init ic_open_devs(void)
+{
+	struct ic_device *d, **last;
+	struct net_device *dev;
+	unsigned short oflags;
+	unsigned long start, next_msg;
+
+	last = &ic_first_dev;
+	rtnl_lock();
+
+	/* bring loopback device up first */
+	for_each_netdev(&init_net, dev) {
+		if (!(dev->flags & IFF_LOOPBACK))
+			continue;
+		if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0)
+			pr_err("IP-Config: Failed to open %s\n", dev->name);
+	}
+
+	for_each_netdev(&init_net, dev) {
+		if (ic_is_init_dev(dev)) {
+			int able = 0;
+			if (dev->mtu >= 364)
+				able |= IC_BOOTP;
+			else
+				pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small\n",
+					dev->name, dev->mtu);
+			if (!(dev->flags & IFF_NOARP))
+				able |= IC_RARP;
+			able &= ic_proto_enabled;
+			if (ic_proto_enabled && !able)
+				continue;
+			oflags = dev->flags;
+			if (dev_change_flags(dev, oflags | IFF_UP, NULL) < 0) {
+				pr_err("IP-Config: Failed to open %s\n",
+				       dev->name);
+				continue;
+			}
+			if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
+				rtnl_unlock();
+				return -ENOMEM;
+			}
+			d->dev = dev;
+			*last = d;
+			last = &d->next;
+			d->flags = oflags;	/* pre-open flags, reapplied by ic_close_devs() */
+			d->able = able;
+			if (able & IC_BOOTP)
+				get_random_bytes(&d->xid, sizeof(__be32));
+			else
+				d->xid = 0;
+			ic_proto_have_if |= able;
+			pr_debug("IP-Config: %s UP (able=%d, xid=%08x)\n",
+				 dev->name, able, d->xid);
+		}
+	}
+	/* Devices with a complex topology like SFP ethernet interfaces need
+	 * the rtnl_lock at init. The carrier wait-loop must therefore run
+	 * without holding it.
+	 */
+	rtnl_unlock();
+
+	/* no point in waiting if we could not bring up at least one device */
+	if (!ic_first_dev)
+		goto have_carrier;
+
+	/* wait for a carrier on at least one device */
+	start = jiffies;
+	next_msg = start + msecs_to_jiffies(20000);
+	while (time_before(jiffies, start +
+			   msecs_to_jiffies(carrier_timeout * 1000))) {
+		int wait, elapsed;
+
+		rtnl_lock();
+		for_each_netdev(&init_net, dev)
+			if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) {
+				rtnl_unlock();
+				goto have_carrier;
+			}
+		rtnl_unlock();
+
+		msleep(1);
+
+		if (time_before(jiffies, next_msg))
+			continue;
+
+		elapsed = jiffies_to_msecs(jiffies - start);
+		wait = (carrier_timeout * 1000 - elapsed + 500) / 1000;
+		pr_info("Waiting up to %d more seconds for network.\n", wait);
+		next_msg = jiffies + msecs_to_jiffies(20000);
+	}
+have_carrier:
+	*last = NULL;
+
+	if (!ic_first_dev) {
+		if (user_dev_name[0])
+			pr_err("IP-Config: Device `%s' not found\n",
+			       user_dev_name);
+		else
+			pr_err("IP-Config: No network devices available\n");
+		return -ENODEV;
+	}
+	return 0;
+}
+
+/* Close all network interfaces except the one we've autoconfigured, and its
+ * lowers, in case it's a stacked virtual interface.
+ */
+static void __init ic_close_devs(void)
+{
+	struct net_device *keep = ic_dev ? ic_dev->dev : NULL;
+	struct ic_device *cur, *following;
+
+	rtnl_lock();
+	for (cur = ic_first_dev; cur; cur = following) {
+		bool take_down = (cur != ic_dev);
+		struct net_device *ndev = cur->dev;
+		struct net_device *lower;
+		struct list_head *iter;
+
+		/* Fetch the link first: cur is freed at the end of this pass */
+		following = cur->next;
+
+		if (keep) {
+			/* Lowers of the configured device have to stay up */
+			netdev_for_each_lower_dev(keep, lower, iter) {
+				if (ndev == lower) {
+					take_down = false;
+					break;
+				}
+			}
+		}
+		if (take_down) {
+			pr_debug("IP-Config: Downing %s\n", ndev->name);
+			dev_change_flags(ndev, cur->flags, NULL);
+		}
+		kfree(cur);
+	}
+	rtnl_unlock();
+}
+
+/*
+ *	Interface to various network functions.
+ */
+
+/* Fill @sin with AF_INET, @addr and @port; the values are stored as given. */
+static inline void set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port)
+{
+	sin->sin_port = port;
+	sin->sin_addr.s_addr = addr;
+	sin->sin_family = AF_INET;
+}
+
+/*
+ *	Set up interface addresses and routes.
+ */
+
+static int __init ic_setup_if(void)
+{
+	struct ifreq ir;
+	struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr;
+	int err;
+
+	memset(&ir, 0, sizeof(ir));
+	strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->dev->name);
+	set_sockaddr(sin, ic_myaddr, 0);
+	err = devinet_ioctl(&init_net, SIOCSIFADDR, &ir);
+	if (err < 0) {
+		pr_err("IP-Config: Unable to set interface address (%d)\n", err);
+		return -1;
+	}
+	set_sockaddr(sin, ic_netmask, 0);
+	err = devinet_ioctl(&init_net, SIOCSIFNETMASK, &ir);
+	if (err < 0) {
+		pr_err("IP-Config: Unable to set interface netmask (%d)\n", err);
+		return -1;
+	}
+	set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
+	err = devinet_ioctl(&init_net, SIOCSIFBRDADDR, &ir);
+	if (err < 0) {
+		pr_err("IP-Config: Unable to set interface broadcast address (%d)\n", err);
+		return -1;
+	}
+	/* A non-standard MTU may be wanted on the boot link (jumbo frames, for
+	 * instance).  Failing to set it is logged but is not treated as fatal.
+	 */
+	if (ic_dev_mtu != 0) {
+		rtnl_lock();
+		err = dev_set_mtu(ic_dev->dev, ic_dev_mtu);
+		if (err < 0)
+			pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n",
+			       ic_dev_mtu, err);
+		rtnl_unlock();
+	}
+	return 0;
+}
+
+static int __init ic_setup_routes(void)
+{
+	struct rtentry rm;
+	int err;
+
+	/* No device routes are set up here, only the default route (if any) */
+	if (ic_gateway == NONE)
+		return 0;
+
+	if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
+		pr_err("IP-Config: Gateway not on directly connected network\n");
+		return -1;
+	}
+
+	memset(&rm, 0, sizeof(rm));
+	set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
+	set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0);
+	set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
+	rm.rt_flags = RTF_UP | RTF_GATEWAY;
+	err = ip_rt_ioctl(&init_net, SIOCADDRT, &rm);
+	if (err < 0) {
+		pr_err("IP-Config: Cannot add default route (%d)\n", err);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ *	Fill in default values for all missing parameters.
+ */
+
+static int __init ic_defaults(void)
+{
+	u32 haddr;
+
+	/* No userspace is running at this point, so there is no need to
+	 * claim locks on system_utsname
+	 */
+	if (!ic_host_name_set)
+		sprintf(init_utsname()->nodename, "%pI4", &ic_myaddr);
+
+	if (root_server_addr == NONE)
+		root_server_addr = ic_servaddr;
+
+	if (ic_netmask != NONE)
+		return 0;
+	/* No netmask supplied: pick the classful default for our address */
+	haddr = ntohl(ic_myaddr);
+	if (IN_CLASSA(haddr)) {
+		ic_netmask = htonl(IN_CLASSA_NET);
+	} else if (IN_CLASSB(haddr)) {
+		ic_netmask = htonl(IN_CLASSB_NET);
+	} else if (IN_CLASSC(haddr)) {
+		ic_netmask = htonl(IN_CLASSC_NET);
+	} else if (IN_CLASSE(haddr)) {
+		ic_netmask = htonl(IN_CLASSE_NET);
+	} else {
+		pr_err("IP-Config: Unable to guess netmask for address %pI4\n",
+		       &ic_myaddr);
+		return -1;
+	}
+	pr_notice("IP-Config: Guessing netmask %pI4\n", &ic_netmask);
+	return 0;
+}
+
+/*
+ *	RARP support.
+ */
+
+#ifdef IPCONFIG_RARP
+
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
+
+static struct packet_type rarp_packet_type __initdata = {
+	.type =	cpu_to_be16(ETH_P_RARP),	/* frames of RARP ethertype */
+	.func =	ic_rarp_recv,			/* are handed to this handler */
+};
+
+static inline void __init ic_rarp_init(void)
+{
+	dev_add_pack(&rarp_packet_type);	/* register the RARP handler */
+}
+
+static inline void __init ic_rarp_cleanup(void)
+{
+	dev_remove_pack(&rarp_packet_type);	/* unregister the RARP handler */
+}
+
+/* Process a received RARP packet.  The first reply accepted fills in ic_dev,
+ * ic_myaddr (unless already set), ic_servaddr and ic_addrservaddr.
+ */
+static int __init
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct arphdr *rarp;
+	unsigned char *rarp_ptr;
+	__be32 sip, tip;
+	unsigned char *tha;		/* t for "target" */
+	struct ic_device *d;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		goto drop;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb)
+		return NET_RX_DROP;
+
+	if (!pskb_may_pull(skb, sizeof(struct arphdr)))
+		goto drop;
+
+	/* Basic sanity checks can be done without the lock.  */
+	rarp = (struct arphdr *)skb_transport_header(skb);
+
+	/* If the hardware address length or the hardware type does not
+	 * match the receiving device, ignore the packet.
+	 */
+	if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd))
+		goto drop;
+
+	/* If it's not a RARP reply, delete it. */
+	if (rarp->ar_op != htons(ARPOP_RREPLY))
+		goto drop;
+
+	/* If the protocol address type is not IP, delete it. */
+	if (rarp->ar_pro != htons(ETH_P_IP))
+		goto drop;
+
+	if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+		goto drop;
+
+	/* OK, it is all there and looks valid, process... */
+	rarp = (struct arphdr *)skb_transport_header(skb);
+	rarp_ptr = (unsigned char *) (rarp + 1);
+
+	/* One reply at a time, please. */
+	spin_lock(&ic_recv_lock);
+
+	/* If we already have a reply, just drop the packet */
+	if (ic_got_reply)
+		goto drop_unlock;
+
+	/* Find the ic_device that the packet arrived on */
+	d = ic_first_dev;
+	while (d && d->dev != dev)
+		d = d->next;
+	if (!d)
+		goto drop_unlock;	/* should never happen */
+
+	/* Extract variable-width fields */
+	rarp_ptr += dev->addr_len;	/* step over the sender HW address */
+	memcpy(&sip, rarp_ptr, 4);	/* sender IP address */
+	rarp_ptr += 4;
+	tha = rarp_ptr;			/* target HW address */
+	rarp_ptr += dev->addr_len;
+	memcpy(&tip, rarp_ptr, 4);	/* target IP address */
+
+	/* Discard packets which are not meant for us. */
+	if (memcmp(tha, dev->dev_addr, dev->addr_len))
+		goto drop_unlock;
+
+	/* Discard packets which are not from specified server. */
+	if (ic_servaddr != NONE && ic_servaddr != sip)
+		goto drop_unlock;
+
+	/* We have a winner! */
+	ic_dev = d;
+	if (ic_myaddr == NONE)
+		ic_myaddr = tip;
+	ic_servaddr = sip;
+	ic_addrservaddr = sip;
+	ic_got_reply = IC_RARP;
+
+drop_unlock:
+	/* Show's over.  Nothing to see here.  */
+	spin_unlock(&ic_recv_lock);
+
+drop:
+	/* Throw the packet out. */
+	kfree_skb(skb);
+	return 0;
+}
+
+
+/*
+ *  Send RARP request packet over a single interface.
+ */
+static void __init ic_rarp_send_if(struct ic_device *d)
+{
+	const unsigned char *hwaddr = d->dev->dev_addr;
+
+	arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, d->dev, 0, NULL, hwaddr, hwaddr);
+}
+#endif
+
+/*
+ *  Predefine Nameservers
+ */
+static inline void __init ic_nameservers_predef(void)
+{
+	int idx = CONF_NAMESERVERS_MAX;
+
+	while (idx-- > 0)
+		ic_nameservers[idx] = NONE;	/* mark the slot as unset */
+}
+
+/* Predefine NTP servers */
+static inline void __init ic_ntp_servers_predef(void)
+{
+	int idx = CONF_NTP_SERVERS_MAX;
+
+	while (idx-- > 0)
+		ic_ntp_servers[idx] = NONE;	/* mark the slot as unset */
+}
+
+/*
+ *	DHCP/BOOTP support.
+ */
+
+#ifdef IPCONFIG_BOOTP
+
+struct bootp_pkt {		/* BOOTP packet format, IP header onwards */
+	struct iphdr iph;	/* IP header */
+	struct udphdr udph;	/* UDP header */
+	u8 op;			/* 1=request, 2=reply */
+	u8 htype;		/* HW address type */
+	u8 hlen;		/* HW address length */
+	u8 hops;		/* Used only by gateways */
+	__be32 xid;		/* Transaction ID */
+	__be16 secs;		/* Seconds since we started */
+	__be16 flags;		/* Flags field */
+	__be32 client_ip;		/* Client's IP address if known */
+	__be32 your_ip;		/* Assigned IP address */
+	__be32 server_ip;		/* (Next, e.g. NFS) Server's IP address */
+	__be32 relay_ip;		/* IP address of BOOTP relay */
+	u8 hw_addr[16];		/* Client's HW address */
+	u8 serv_name[64];	/* Server host name */
+	u8 boot_file[128];	/* Name of boot file */
+	u8 exten[312];		/* DHCP options / BOOTP vendor extensions */
+};
+
+/* packet ops */
+#define BOOTP_REQUEST	1
+#define BOOTP_REPLY	2
+
+/* DHCP message types */
+#define DHCPDISCOVER	1
+#define DHCPOFFER	2
+#define DHCPREQUEST	3
+#define DHCPDECLINE	4
+#define DHCPACK		5
+#define DHCPNAK		6
+#define DHCPRELEASE	7
+#define DHCPINFORM	8
+
+static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
+
+static struct packet_type bootp_packet_type __initdata = {
+	.type =	cpu_to_be16(ETH_P_IP),	/* every IP frame is offered ... */
+	.func =	ic_bootp_recv,		/* ... to this handler for filtering */
+};
+
+/* DHCPACK can overwrite DNS if fallback was set upon first BOOTP reply */
+static int ic_nameservers_fallback __initdata;
+
+/*
+ *  Initialize DHCP/BOOTP extension fields in the request.
+ */
+
+static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
+
+#ifdef IPCONFIG_DHCP
+
+/* Write the DHCP options for a DISCOVER or REQUEST into @options. */
+static void __init
+ic_dhcp_init_options(u8 *options, struct ic_device *d)
+{
+	static const u8 ic_req_params[] = {
+		1,	/* Subnet mask */
+		3,	/* Default gateway */
+		6,	/* DNS server */
+		12,	/* Host name */
+		15,	/* Domain name */
+		17,	/* Boot path */
+		26,	/* MTU */
+		40,	/* NIS domain name */
+		42,	/* NTP servers */
+	};
+	u8 mt = ((ic_servaddr == NONE)
+		 ? DHCPDISCOVER : DHCPREQUEST);
+	u8 *e = options;
+	u8 *limit = options + 312 - 1;	/* exten[] is 312 bytes; keep 1 for the end tag */
+	int len;
+
+	pr_debug("DHCP: Sending message type %d (%s)\n", mt, d->dev->name);
+
+	memcpy(e, ic_bootp_cookie, 4);	/* RFC1048 Magic Cookie */
+	e += 4;
+	*e++ = 53;		/* DHCP message type */
+	*e++ = 1;
+	*e++ = mt;
+
+	if (mt == DHCPREQUEST) {
+		*e++ = 54;	/* Server ID (IP address) */
+		*e++ = 4;
+		memcpy(e, &ic_servaddr, 4);
+		e += 4;
+
+		*e++ = 50;	/* Requested IP address */
+		*e++ = 4;
+		memcpy(e, &ic_myaddr, 4);
+		e += 4;
+	}
+
+	*e++ = 55;	/* Parameter request list */
+	*e++ = sizeof(ic_req_params);
+	memcpy(e, ic_req_params, sizeof(ic_req_params));
+	e += sizeof(ic_req_params);
+
+	/* The variable-length options below are left out when tag, length and
+	 * value would not fit into what remains of the options area.
+	 */
+	len = strlen(utsname()->nodename);
+	if (ic_host_name_set && len + 2 <= limit - e) {
+		*e++ = 12;	/* host-name */
+		*e++ = len;
+		memcpy(e, utsname()->nodename, len);
+		e += len;
+	}
+	len = strlen(vendor_class_identifier);
+	if (len > 0 && len <= 255 && len + 2 <= limit - e) {
+		pr_info("DHCP: sending class identifier \"%s\"\n",
+			vendor_class_identifier);
+		*e++ = 60;	/* Class-identifier */
+		*e++ = len;
+		memcpy(e, vendor_class_identifier, len);
+		e += len;
+	}
+	/* Client identifier: byte 0 holds the type and the value follows, so
+	 * the option needs at least one value byte and carries len + 1 bytes.
+	 */
+	len = strlen(dhcp_client_identifier + 1);
+	if (len >= 1 && len + 3 <= limit - e) {
+		*e++ = 61;
+		*e++ = len + 1;
+		memcpy(e, dhcp_client_identifier, len + 1);
+		e += len + 1;
+	}
+
+	*e++ = 255;	/* End of the list */
+}
+
+#endif /* IPCONFIG_DHCP */
+
+static void __init ic_bootp_init_ext(u8 *e)
+{
+	/* Each request is a tag and a length; its value bytes are skipped */
+	memcpy(e, ic_bootp_cookie, 4);	/* RFC1048 Magic Cookie */
+	e += 4;
+	e[0] = 1;		/* Subnet mask request */
+	e[1] = 4;
+	e += 2 + 4;
+	e[0] = 3;		/* Default gateway request */
+	e[1] = 4;
+	e += 2 + 4;
+#if CONF_NAMESERVERS_MAX > 0
+	e[0] = 6;		/* (DNS) name server request */
+	e[1] = 4 * CONF_NAMESERVERS_MAX;
+	e += 2 + 4 * CONF_NAMESERVERS_MAX;
+#endif
+	e[0] = 12;		/* Host name request */
+	e[1] = 32;
+	e += 2 + 32;
+	e[0] = 40;		/* NIS Domain name request */
+	e[1] = 32;
+	e += 2 + 32;
+	e[0] = 17;		/* Boot path */
+	e[1] = 40;
+	e += 2 + 40;
+
+	e[0] = 57;		/* set extension buffer size for reply */
+	e[1] = 2;
+	e[2] = 1;		/* 128+236+8+20+14, see dhcpd sources */
+	e[3] = 150;
+	e[4] = 255;		/* End of the list */
+}
+
+
+/*
+ *  Initialize the DHCP/BOOTP mechanism.
+ */
+static inline void __init ic_bootp_init(void)
+{
+	/* Name servers or NTP servers given via the "ip=" or "nfsaddrs="
+	 * kernel command line parameters have been decoded by now but are
+	 * no longer needed, so every entry is reset to NONE before the
+	 * receive handler is registered.
+	 */
+	ic_ntp_servers_predef();
+	ic_nameservers_predef();
+
+	dev_add_pack(&bootp_packet_type);
+}
+
+
+/*
+ *  DHCP/BOOTP cleanup.
+ */
+static inline void __init ic_bootp_cleanup(void)
+{
+	dev_remove_pack(&bootp_packet_type);	/* unregister the handler */
+}
+
+
+/* Build a DHCP/BOOTP request and broadcast it on one interface.  Nothing is
+ * returned; a header-build or transmit failure adds "E" to the progress line.
+ */
+static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff)
+{
+	struct net_device *dev = d->dev;
+	struct sk_buff *skb;
+	struct bootp_pkt *b;
+	struct iphdr *h;
+	int hlen = LL_RESERVED_SPACE(dev);
+	int tlen = dev->needed_tailroom;
+
+	/* Allocate packet; an allocation failure just skips this send */
+	skb = alloc_skb(sizeof(struct bootp_pkt) + hlen + tlen + 15,
+			GFP_KERNEL);
+	if (!skb)
+		return;
+	skb_reserve(skb, hlen);
+	b = skb_put_zero(skb, sizeof(struct bootp_pkt));
+
+	/* Construct IP header */
+	skb_reset_network_header(skb);
+	h = ip_hdr(skb);
+	h->version = 4;
+	h->ihl = 5;
+	h->tot_len = htons(sizeof(struct bootp_pkt));
+	h->frag_off = htons(IP_DF);
+	h->ttl = 64;
+	h->protocol = IPPROTO_UDP;
+	h->daddr = htonl(INADDR_BROADCAST);
+	h->check = ip_fast_csum((unsigned char *) h, h->ihl);
+
+	/* Construct UDP header */
+	b->udph.source = htons(68);
+	b->udph.dest = htons(67);
+	b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr));
+	/* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
+
+	/* Construct DHCP/BOOTP header */
+	b->op = BOOTP_REQUEST;
+	if (dev->type < 256) /* check for false types */
+		b->htype = dev->type;
+	else if (dev->type == ARPHRD_FDDI)
+		b->htype = ARPHRD_ETHER;
+	else {
+		pr_warn("Unknown ARP type 0x%04x for device %s\n", dev->type,
+			dev->name);
+		b->htype = dev->type; /* value does not fit the 8-bit field */
+	}
+
+	/* server_ip and your_ip address are both already zero per RFC2131 */
+	b->hlen = dev->addr_len;
+	memcpy(b->hw_addr, dev->dev_addr, dev->addr_len);
+	b->secs = htons(jiffies_diff / HZ);
+	b->xid = d->xid;
+
+	/* add DHCP options or BOOTP extensions */
+#ifdef IPCONFIG_DHCP
+	if (ic_proto_enabled & IC_USE_DHCP)
+		ic_dhcp_init_options(b->exten, d);
+	else
+#endif
+		ic_bootp_init_ext(b->exten);
+
+	/* Chain packet down the line... */
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_IP);
+	if (dev_hard_header(skb, dev, ntohs(skb->protocol),
+			    dev->broadcast, dev->dev_addr, skb->len) < 0) {
+		kfree_skb(skb);
+		pr_cont("E");	/* continue the line of progress dots */
+		return;
+	}
+
+	if (dev_queue_xmit(skb) < 0)
+		pr_cont("E");	/* continue the line of progress dots */
+}
+
+
+/*
+ *  Copy BOOTP-supplied string
+ */
+static int __init ic_bootp_string(char *dest, char *src, int len, int max)
+{
+	int copied = (len >= max) ? max - 1 : len;	/* room for the NUL */
+
+	if (len == 0)
+		return 0;	/* empty string: @dest is left untouched */
+	memcpy(dest, src, copied);
+	dest[copied] = '\0';
+	return 1;
+}
+
+
+/* Process one BOOTP extension / DHCP option.  @ext points at the tag byte,
+ * which is followed by a length byte and that many bytes of option data.
+ */
+static void __init ic_do_bootp_ext(u8 *ext)
+{
+	u8 optlen = ext[1];	/* number of data bytes in this option */
+	u8 *data = ext + 2, *c;
+	u8 servers;
+	__be16 mtu;
+	int i;
+
+	pr_debug("DHCP/BOOTP: Got extension %d:", *ext);
+	for (c = data; c < data + optlen; c++)
+		pr_debug(" %02x", *c);
+	pr_debug("\n");
+
+	switch (*ext) {
+	case 1:		/* Subnet mask */
+		if (optlen >= 4 && ic_netmask == NONE)
+			memcpy(&ic_netmask, data, 4);
+		break;
+	case 3:		/* Default gateway */
+		if (optlen >= 4 && ic_gateway == NONE)
+			memcpy(&ic_gateway, data, 4);
+		break;
+	case 6:		/* DNS server */
+		servers = optlen / 4;
+		if (servers > CONF_NAMESERVERS_MAX)
+			servers = CONF_NAMESERVERS_MAX;
+		for (i = 0; i < servers; i++) {
+			if (ic_nameservers[i] == NONE || ic_nameservers_fallback)
+				memcpy(&ic_nameservers[i], data + 4 * i, 4);
+		}
+		break;
+	case 12:	/* Host name */
+		/* An empty name stores nothing, so it must not count as set */
+		if (!ic_host_name_set &&
+		    ic_bootp_string(utsname()->nodename, data, optlen,
+				    __NEW_UTS_LEN))
+			ic_host_name_set = 1;
+		break;
+	case 15:	/* Domain name (DNS) */
+		if (!ic_domain[0])
+			ic_bootp_string(ic_domain, data, optlen, sizeof(ic_domain));
+		break;
+	case 17:	/* Root path */
+		if (!root_server_path[0])
+			ic_bootp_string(root_server_path, data, optlen,
+					sizeof(root_server_path));
+		break;
+	case 26:	/* Interface MTU */
+		if (optlen >= sizeof(mtu)) {	/* ignore a truncated value */
+			memcpy(&mtu, data, sizeof(mtu));
+			ic_dev_mtu = ntohs(mtu);
+		}
+		break;
+	case 40:	/* NIS Domain name (_not_ DNS) */
+		ic_bootp_string(utsname()->domainname, data, optlen, __NEW_UTS_LEN);
+		break;
+	case 42:	/* NTP servers */
+		servers = optlen / 4;
+		if (servers > CONF_NTP_SERVERS_MAX)
+			servers = CONF_NTP_SERVERS_MAX;
+		for (i = 0; i < servers; i++) {
+			if (ic_ntp_servers[i] == NONE)
+				memcpy(&ic_ntp_servers[i], data + 4 * i, 4);
+		}
+		break;
+	}
+}
+
+
+/* Receive BOOTP reply.  Packet handler for incoming IP frames: checks that the
+ * frame is a DHCP/BOOTP reply to our request and records what it carries.
+ */
+static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct bootp_pkt *b;
+	struct iphdr *h;
+	struct ic_device *d;
+	int len, ext_len;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		goto drop;
+
+	/* Perform verifications before taking the lock.  */
+	if (skb->pkt_type == PACKET_OTHERHOST)
+		goto drop;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb)
+		return NET_RX_DROP;
+
+	if (!pskb_may_pull(skb,
+			   sizeof(struct iphdr) +
+			   sizeof(struct udphdr)))
+		goto drop;
+
+	b = (struct bootp_pkt *)skb_network_header(skb);
+	h = &b->iph;
+
+	if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
+		goto drop;
+
+	/* Fragments are not supported */
+	if (ip_is_fragment(h)) {
+		net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n");
+		goto drop;
+	}
+
+	if (skb->len < ntohs(h->tot_len))
+		goto drop;
+
+	if (ip_fast_csum((char *) h, h->ihl))
+		goto drop;
+
+	if (b->udph.source != htons(67) || b->udph.dest != htons(68))
+		goto drop;
+
+	if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
+		goto drop;
+
+	len = ntohs(b->udph.len) - sizeof(struct udphdr);
+	ext_len = len - (sizeof(*b) -
+			 sizeof(struct iphdr) -
+			 sizeof(struct udphdr) -
+			 sizeof(b->exten));
+	if (ext_len < 0)
+		goto drop;
+
+	/* Ok the front looks good, make sure we can get at the rest.  */
+	if (!pskb_may_pull(skb, skb->len))
+		goto drop;
+
+	b = (struct bootp_pkt *)skb_network_header(skb);
+	h = &b->iph;
+
+	/* One reply at a time, please. */
+	spin_lock(&ic_recv_lock);
+
+	/* If we already have a reply, just drop the packet */
+	if (ic_got_reply)
+		goto drop_unlock;
+
+	/* Find the ic_device that the packet arrived on */
+	d = ic_first_dev;
+	while (d && d->dev != dev)
+		d = d->next;
+	if (!d)
+		goto drop_unlock;  /* should never happen */
+
+	/* Is it a reply to our BOOTP request? */
+	if (b->op != BOOTP_REPLY ||
+	    b->xid != d->xid) {
+		net_err_ratelimited("DHCP/BOOTP: Reply not for us on %s, op[%x] xid[%x]\n",
+				    d->dev->name, b->op, b->xid);
+		goto drop_unlock;
+	}
+
+	/* Parse extensions */
+	if (ext_len >= 4 &&
+	    !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */
+		u8 *end = (u8 *) b + ntohs(b->iph.tot_len);
+		u8 *ext;
+
+#ifdef IPCONFIG_DHCP
+		if (ic_proto_enabled & IC_USE_DHCP) {
+			__be32 server_id = NONE;
+			int mt = 0;
+
+			ext = &b->exten[4];
+			while (ext < end && *ext != 0xff) {
+				u8 *opt = ext++;
+				if (*opt == 0)	/* Padding */
+					continue;
+				ext += *ext + 1;	/* length byte + data */
+				if (ext >= end)
+					break;
+				switch (*opt) {
+				case 53:	/* Message type */
+					if (opt[1])
+						mt = opt[2];
+					break;
+				case 54:	/* Server ID (IP address) */
+					if (opt[1] >= 4)
+						memcpy(&server_id, opt + 2, 4);
+					break;
+				}
+			}
+
+			pr_debug("DHCP: Got message type %d (%s)\n", mt, d->dev->name);
+
+			switch (mt) {
+			case DHCPOFFER:
+				/* While in the process of accepting one offer,
+				 * ignore all others.
+				 */
+				if (ic_myaddr != NONE)
+					goto drop_unlock;
+
+				/* Let's accept that offer. */
+				ic_myaddr = b->your_ip;
+				ic_servaddr = server_id;
+				pr_debug("DHCP: Offered address %pI4 by server %pI4\n",
+					 &ic_myaddr, &b->iph.saddr);
+				/* The DHCP indicated server address takes
+				 * precedence over the bootp header one if
+				 * they are different.
+				 */
+				if ((server_id != NONE) &&
+				    (b->server_ip != server_id))
+					b->server_ip = ic_servaddr;
+				break;
+
+			case DHCPACK:
+				if (memcmp(dev->dev_addr, b->hw_addr, dev->addr_len) != 0)
+					goto drop_unlock;
+
+				/* Yeah! */
+				break;
+
+			default:
+				/* Any other message type: forget the offer */
+				ic_myaddr = NONE;
+				ic_servaddr = NONE;
+				goto drop_unlock;
+			}
+
+			ic_dhcp_msgtype = mt;
+
+		}
+#endif /* IPCONFIG_DHCP */
+
+		ext = &b->exten[4];
+		while (ext < end && *ext != 0xff) {
+			u8 *opt = ext++;
+			if (*opt == 0)	/* Padding */
+				continue;
+			ext += *ext + 1;	/* length byte + data */
+			if (ext < end)
+				ic_do_bootp_ext(opt);
+		}
+	}
+
+	/* We have a winner! */
+	ic_dev = d;
+	ic_myaddr = b->your_ip;
+	ic_servaddr = b->server_ip;
+	ic_addrservaddr = b->iph.saddr;
+	if (ic_gateway == NONE && b->relay_ip)
+		ic_gateway = b->relay_ip;
+	if (ic_nameservers[0] == NONE) {
+		ic_nameservers[0] = ic_servaddr;
+		ic_nameservers_fallback = 1;
+	}
+	ic_got_reply = IC_BOOTP;
+
+drop_unlock:
+	/* Show's over.  Nothing to see here.  */
+	spin_unlock(&ic_recv_lock);
+
+drop:
+	/* Throw the packet out. */
+	kfree_skb(skb);
+
+	return 0;
+}
+
+
+#endif
+
+
+/*
+ *	Dynamic IP configuration -- DHCP, BOOTP, RARP.
+ */
+
+#ifdef IPCONFIG_DYNAMIC
+
+static int __init ic_dynamic(void)
+{
+	int retries;
+	struct ic_device *d;
+	unsigned long start_jiffies, timeout, jiff;
+	int do_bootp = ic_proto_have_if & IC_BOOTP;
+	int do_rarp = ic_proto_have_if & IC_RARP;
+
+	/*
+	 * Returns 0 once a reply was accepted, -1 otherwise.  Called only
+	 * when some information is missing, so if none of DHCP/BOOTP/RARP
+	 * was selected there is no way to get it: return with an error.
+	 */
+	if (!ic_proto_enabled) {
+		pr_err("IP-Config: Incomplete network configuration information\n");
+		return -1;
+	}
+
+#ifdef IPCONFIG_BOOTP
+	if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
+		pr_err("DHCP/BOOTP: No suitable device found\n");
+#endif
+#ifdef IPCONFIG_RARP
+	if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
+		pr_err("RARP: No suitable device found\n");
+#endif
+
+	if (!ic_proto_have_if)
+		/* Error message already printed */
+		return -1;
+
+	/*
+	 * Setup protocols
+	 */
+#ifdef IPCONFIG_BOOTP
+	if (do_bootp)
+		ic_bootp_init();
+#endif
+#ifdef IPCONFIG_RARP
+	if (do_rarp)
+		ic_rarp_init();
+#endif
+
+	/*
+	 * Send requests and wait, until we get an answer. This loop
+	 * seems to be a terrible waste of CPU time, but actually there is
+	 * only one process running at all, so we don't need to use any
+	 * scheduler functions.
+	 * [Actually we could now, but the nothing else running note still
+	 *  applies.. - AC]
+	 */
+	pr_notice("Sending %s%s%s requests .",
+		  do_bootp
+		  ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
+		  (do_bootp && do_rarp) ? " and " : "",
+		  do_rarp ? "RARP" : "");
+
+	start_jiffies = jiffies;
+	d = ic_first_dev;
+	retries = CONF_SEND_RETRIES;
+	get_random_bytes(&timeout, sizeof(timeout));
+	timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
+	for (;;) {
+#ifdef IPCONFIG_BOOTP
+		if (do_bootp && (d->able & IC_BOOTP))
+			ic_bootp_send_if(d, jiffies - start_jiffies);
+#endif
+#ifdef IPCONFIG_RARP
+		if (do_rarp && (d->able & IC_RARP))
+			ic_rarp_send_if(d);
+#endif
+
+		if (!d->next) {	/* last device in the list: wait for a reply */
+			jiff = jiffies + timeout;
+			while (time_before(jiffies, jiff) && !ic_got_reply)
+				schedule_timeout_uninterruptible(1);
+		}
+#ifdef IPCONFIG_DHCP
+		/* DHCP isn't done until we get a DHCPACK. */
+		if ((ic_got_reply & IC_BOOTP) &&
+		    (ic_proto_enabled & IC_USE_DHCP) &&
+		    ic_dhcp_msgtype != DHCPACK) {
+			ic_got_reply = 0;
+			/* continue on device that got the reply */
+			d = ic_dev;
+			pr_cont(",");
+			continue;
+		}
+#endif /* IPCONFIG_DHCP */
+
+		if (ic_got_reply) {
+			pr_cont(" OK\n");
+			break;
+		}
+
+		if ((d = d->next))	/* more devices to send on this round */
+			continue;
+
+		if (! --retries) {
+			pr_cont(" timed out!\n");
+			break;
+		}
+
+		d = ic_first_dev;
+
+		timeout = timeout CONF_TIMEOUT_MULT;	/* macro supplies the operator */
+		if (timeout > CONF_TIMEOUT_MAX)
+			timeout = CONF_TIMEOUT_MAX;
+
+		pr_cont(".");
+	}
+
+#ifdef IPCONFIG_BOOTP
+	if (do_bootp)
+		ic_bootp_cleanup();
+#endif
+#ifdef IPCONFIG_RARP
+	if (do_rarp)
+		ic_rarp_cleanup();
+#endif
+
+	if (!ic_got_reply) {
+		ic_myaddr = NONE;
+		return -1;
+	}
+
+	pr_info("IP-Config: Got %s answer from %pI4, my address is %pI4\n",
+		((ic_got_reply & IC_RARP) ? "RARP"
+		: (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
+		&ic_addrservaddr, &ic_myaddr);
+
+	return 0;
+}
+
+#endif /* IPCONFIG_DYNAMIC */
+
+#ifdef CONFIG_PROC_FS
+/* proc_dir_entry for /proc/net/ipconfig */
+static struct proc_dir_entry *ipconfig_dir;
+
+/* Name servers: */
+static int pnp_seq_show(struct seq_file *seq, void *v)
+{
+	const char *proto = "BOOTP";
+	int idx;
+
+	/* First line: which protocol supplied the configuration, if any */
+	if (ic_proto_used & IC_RARP)
+		proto = "RARP";
+	else if (ic_proto_used & IC_USE_DHCP)
+		proto = "DHCP";
+	if (ic_proto_used & IC_PROTO)
+		seq_printf(seq, "#PROTO: %s\n", proto);
+	else
+		seq_puts(seq, "#MANUAL\n");
+	if (ic_domain[0])
+		seq_printf(seq, "domain %s\n", ic_domain);
+	for (idx = 0; idx < CONF_NAMESERVERS_MAX; idx++) {
+		if (ic_nameservers[idx] != NONE)
+			seq_printf(seq, "nameserver %pI4\n", &ic_nameservers[idx]);
+	}
+	if (ic_servaddr != NONE)
+		seq_printf(seq, "bootserver %pI4\n", &ic_servaddr);
+	return 0;
+}
+
+/* Create the /proc/net/ipconfig directory */
+static int __init ipconfig_proc_net_init(void)
+{
+	struct proc_dir_entry *dir;
+
+	dir = proc_net_mkdir(&init_net, "ipconfig", init_net.proc_net);
+	ipconfig_dir = dir;	/* stays NULL if the directory was not made */
+	return dir ? 0 : -ENOMEM;
+}
+
+/* Create a new file under /proc/net/ipconfig */
+static int ipconfig_proc_net_create(const char *name,
+				    const struct proc_ops *proc_ops)
+{
+	struct proc_dir_entry *entry;
+	char *path;
+
+	/* Without the ipconfig directory there is nowhere to put the file */
+	if (!ipconfig_dir)
+		return -ENOMEM;
+
+	path = kasprintf(GFP_KERNEL, "%s%s", "ipconfig/", name);
+	if (!path)
+		return -ENOMEM;
+
+	entry = proc_create(path, 0444, init_net.proc_net, proc_ops);
+	/* The path string is only needed for the proc_create() call */
+	kfree(path);
+
+	return entry ? 0 : -ENOMEM;
+}
+
+/* Emit each configured NTP server for /proc/net/ipconfig/ntp_servers, one per line */
+static int ntp_servers_show(struct seq_file *seq, void *v)
+{
+	int slot;
+
+	for (slot = 0; slot < CONF_NTP_SERVERS_MAX; ++slot) {
+		if (NONE != ic_ntp_servers[slot])
+			seq_printf(seq, "%pI4\n", &ic_ntp_servers[slot]);
+	}
+	return 0;
+}
+DEFINE_PROC_SHOW_ATTRIBUTE(ntp_servers);
+#endif /* CONFIG_PROC_FS */
+
+/* Extract an IP address from the start of the parameter string, if one is
+ * there: it is cut out of @name and returned, otherwise NONE is returned.
+ * Note that we need to have root_server_addr set _before_ IPConfig gets
+ * called as it can override it.
+ */
+__be32 __init root_nfs_parse_addr(char *name)
+{
+	__be32 addr;
+	int octets = 0;
+	char *cp, *cq;
+
+	cp = cq = name;
+	while (octets < 4) {
+		while (*cp >= '0' && *cp <= '9')
+			cp++;
+		if (cp == cq || cp - cq > 3)	/* need 1 to 3 digits */
+			break;
+		if (*cp == '.' || octets == 3)
+			octets++;
+		if (octets < 4)
+			cp++;	/* step over the separator */
+		cq = cp;
+	}
+	if (octets == 4 && (*cp == ':' || *cp == '\0')) {
+		if (*cp == ':')
+			*cp++ = '\0';	/* terminate the address text */
+		addr = in_aton(name);
+		memmove(name, cp, strlen(cp) + 1);	/* keep only the rest */
+	} else
+		addr = NONE;
+
+	return addr;
+}
+
+#define DEVICE_WAIT_MAX		12 /* 12 seconds */
+
+static int __init wait_for_devices(void)
+{
+	bool probe_init_devs = true;
+	int attempt;
+
+	for (attempt = 0; attempt < DEVICE_WAIT_MAX; attempt++) {
+		struct net_device *dev;
+		bool have_dev = false;
+
+		/* let any deferred device probes complete first */
+		wait_for_device_probe();
+
+		rtnl_lock();
+		for_each_netdev(&init_net, dev) {
+			have_dev = ic_is_init_dev(dev);
+			if (have_dev)
+				break;
+		}
+		rtnl_unlock();
+		if (have_dev)
+			return 0;
+		if (probe_init_devs &&
+		    (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) {
+			probe_init_devs = false;
+			wait_for_init_devices_probe();
+		}
+		/* no usable device so far: look again in a second */
+		ssleep(1);
+	}
+	return -ENODEV;
+}
+
+/* IP Autoconfig dispatcher (late initcall): opens the devices, completes the
+ * configuration dynamically if needed, applies it and closes unused devices.
+ */
+
+static int __init ip_auto_config(void)
+{
+	__be32 addr;
+#ifdef IPCONFIG_DYNAMIC
+	int retries = CONF_OPEN_RETRIES;
+#endif
+	int err;
+	unsigned int i, count;
+
+	/* Initialise all name servers and NTP servers to NONE (but only if the
+	 * "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded,
+	 * otherwise we'll overwrite the IP addresses specified there)
+	 */
+	if (ic_set_manually == 0) {
+		ic_nameservers_predef();
+		ic_ntp_servers_predef();
+	}
+
+#ifdef CONFIG_PROC_FS
+	proc_create_single("pnp", 0444, init_net.proc_net, pnp_seq_show);
+
+	if (ipconfig_proc_net_init() == 0)
+		ipconfig_proc_net_create("ntp_servers", &ntp_servers_proc_ops);
+#endif /* CONFIG_PROC_FS */
+
+	if (!ic_enable)
+		return 0;
+
+	pr_debug("IP-Config: Entered.\n");
+#ifdef IPCONFIG_DYNAMIC
+ try_try_again:
+#endif
+	/* Wait for devices to appear */
+	err = wait_for_devices();
+	if (err)
+		return err;
+
+	/* Setup all network devices */
+	err = ic_open_devs();
+	if (err)
+		return err;
+
+	/* Give drivers a chance to settle */
+	msleep(CONF_POST_OPEN);
+
+	/*
+	 * If the config information is insufficient (e.g., our IP address or
+	 * IP address of the boot server is missing or we have multiple network
+	 * interfaces and no default was set), use BOOTP or RARP to get the
+	 * missing values.
+	 */
+	if (ic_myaddr == NONE ||
+#if defined(CONFIG_ROOT_NFS) || defined(CONFIG_CIFS_ROOT)
+	    (root_server_addr == NONE &&
+	     ic_servaddr == NONE &&
+	     (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) ||
+#endif
+	    ic_first_dev->next) {
+#ifdef IPCONFIG_DYNAMIC
+		if (ic_dynamic() < 0) {
+			ic_close_devs();
+
+			/*
+			 * I don't know why, but sometimes the
+			 * eepro100 driver (at least) gets upset and
+			 * doesn't work the first time it's opened.
+			 * But then if you close it and reopen it, it
+			 * works just fine.  So we need to try that at
+			 * least once before giving up.
+			 *
+			 * Also, if the root will be NFS-mounted, we
+			 * have nowhere to go if DHCP fails.  So we
+			 * just have to keep trying forever.
+			 *
+			 * 				-- Chip
+			 */
+#ifdef CONFIG_ROOT_NFS
+			if (ROOT_DEV ==  Root_NFS) {
+				pr_err("IP-Config: Retrying forever (NFS root)...\n");
+				goto try_try_again;
+			}
+#endif
+#ifdef CONFIG_CIFS_ROOT
+			if (ROOT_DEV == Root_CIFS) {
+				pr_err("IP-Config: Retrying forever (CIFS root)...\n");
+				goto try_try_again;
+			}
+#endif
+
+			if (--retries) {
+				pr_err("IP-Config: Reopening network devices...\n");
+				goto try_try_again;
+			}
+
+			/* Oh, well.  At least we tried. */
+			pr_err("IP-Config: Auto-configuration of network failed\n");
+			return -1;
+		}
+#else /* !DYNAMIC */
+		pr_err("IP-Config: Incomplete network configuration information\n");
+		ic_close_devs();
+		return -1;
+#endif /* IPCONFIG_DYNAMIC */
+	} else {
+		/* Device selected manually or only one device -> use it */
+		ic_dev = ic_first_dev;
+	}
+
+	addr = root_nfs_parse_addr(root_server_path);
+	if (root_server_addr == NONE)
+		root_server_addr = addr;
+
+	/*
+	 * Use defaults wherever applicable.
+	 */
+	if (ic_defaults() < 0)
+		return -1;
+
+	/*
+	 * Record which protocol was actually used.
+	 */
+#ifdef IPCONFIG_DYNAMIC
+	ic_proto_used = ic_got_reply | (ic_proto_enabled & IC_USE_DHCP);
+#endif
+
+#ifndef IPCONFIG_SILENT
+	/*
+	 * Clue in the operator.
+	 */
+	pr_info("IP-Config: Complete:\n");
+
+	pr_info("     device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n",
+		ic_dev->dev->name, ic_dev->dev->addr_len, ic_dev->dev->dev_addr,
+		&ic_myaddr, &ic_netmask, &ic_gateway);
+	pr_info("     host=%s, domain=%s, nis-domain=%s\n",
+		utsname()->nodename, ic_domain, utsname()->domainname);
+	pr_info("     bootserver=%pI4, rootserver=%pI4, rootpath=%s",
+		&ic_servaddr, &root_server_addr, root_server_path);
+	if (ic_dev_mtu)
+		pr_cont(", mtu=%d", ic_dev_mtu);
+	/* Name servers (if any); the first one printed starts a new line: */
+	for (i = 0, count = 0; i < CONF_NAMESERVERS_MAX; i++) {
+		if (ic_nameservers[i] != NONE) {
+			if (count == 0)
+				pr_info("     nameserver%u=%pI4",
+					i, &ic_nameservers[i]);
+			else
+				pr_cont(", nameserver%u=%pI4",
+					i, &ic_nameservers[i]);
+
+			count++;
+		}
+		if ((i + 1 == CONF_NAMESERVERS_MAX) && count > 0)
+			pr_cont("\n");
+	}
+	/* NTP servers (if any); the first one printed starts a new line: */
+	for (i = 0, count = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+		if (ic_ntp_servers[i] != NONE) {
+			if (count == 0)
+				pr_info("     ntpserver%u=%pI4",
+					i, &ic_ntp_servers[i]);
+			else
+				pr_cont(", ntpserver%u=%pI4",
+					i, &ic_ntp_servers[i]);
+
+			count++;
+		}
+		if ((i + 1 == CONF_NTP_SERVERS_MAX) && count > 0)
+			pr_cont("\n");
+	}
+#endif /* !SILENT */
+
+	/*
+	 * Set up the interface and routes, then close all network devices
+	 * except the device we've autoconfigured.
+	 */
+	if (ic_setup_if() < 0 || ic_setup_routes() < 0)
+		err = -1;
+	else
+		err = 0;
+
+	ic_close_devs();
+
+	return err;
+}
+
+/* Register ip_auto_config() to run as a late initcall. */
+late_initcall(ip_auto_config);
+
+
+/*
+ *  Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
+ *  command line parameter.  See Documentation/admin-guide/nfs/nfsroot.rst.
+ */
+/*
+ * Interpret a protocol keyword.
+ *
+ * Returns 1 for "on"/"any" and for every protocol name handled below
+ * (each branch exists only when the matching config option is set);
+ * returns 0 for "off"/"none" and for any other string.  The protocol
+ * branches also clear some bits in ic_proto_enabled.
+ */
+static int __init ic_proto_name(char *name)
+{
+	if (!strcmp(name, "on") || !strcmp(name, "any")) {
+		return 1;
+	}
+	if (!strcmp(name, "off") || !strcmp(name, "none")) {
+		return 0;
+	}
+#ifdef CONFIG_IP_PNP_DHCP
+	else if (!strncmp(name, "dhcp", 4)) {
+		/* Prefix match: anything starting with "dhcp" lands here. */
+		char *client_id;
+
+		ic_proto_enabled &= ~IC_RARP;
+		client_id = strstr(name, "dhcp,");
+		if (client_id) {
+			/* Form "dhcp,<type>,<value>": split at the 2nd comma. */
+			char *v;
+
+			client_id = client_id + 5;
+			v = strchr(client_id, ',');
+			if (!v)
+				return 1;
+			/* Terminate <type> temporarily so it can be parsed. */
+			*v = 0;
+			/* Parsed type goes into the first identifier byte. */
+			if (kstrtou8(client_id, 0, dhcp_client_identifier))
+				pr_debug("DHCP: Invalid client identifier type\n");
+			/* <value> is stored right after the type byte. */
+			strncpy(dhcp_client_identifier + 1, v + 1, 251);
+			/* Put the comma back; the caller's string is unchanged. */
+			*v = ',';
+		}
+		return 1;
+	}
+#endif
+#ifdef CONFIG_IP_PNP_BOOTP
+	else if (!strcmp(name, "bootp")) {
+		ic_proto_enabled &= ~(IC_RARP | IC_USE_DHCP);
+		return 1;
+	}
+#endif
+#ifdef CONFIG_IP_PNP_RARP
+	else if (!strcmp(name, "rarp")) {
+		ic_proto_enabled &= ~(IC_BOOTP | IC_USE_DHCP);
+		return 1;
+	}
+#endif
+#ifdef IPCONFIG_DYNAMIC
+	else if (!strcmp(name, "both")) {
+		ic_proto_enabled &= ~IC_USE_DHCP; /* backward compat :-( */
+		return 1;
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Handler for the "ip=" kernel command line parameter.
+ *
+ * The argument is either a single keyword accepted by ic_proto_name(),
+ * or a ':'-separated list whose fields are, by position:
+ *   0 own address, 1 server address, 2 gateway, 3 netmask,
+ *   4 host name (text after the first '.' becomes the domain name),
+ *   5 device name, 6 protocol keyword, 7 and 8 name servers, 9 NTP server.
+ * Empty fields are skipped but still counted.  The string is split in
+ * place (each ':' is overwritten with NUL).  Always returns 1.
+ */
+static int __init ip_auto_config_setup(char *addrs)
+{
+	char *cp, *ip, *dp;
+	int num = 0;
+
+	ic_set_manually = 1;
+	ic_enable = 1;
+
+	/*
+	 * If any dhcp, bootp etc options are set, leave autoconfig on
+	 * and skip the below static IP processing.
+	 */
+	if (ic_proto_name(addrs))
+		return 1;
+
+	/* If no static IP is given, turn off autoconfig and bail.  */
+	if (*addrs == 0 ||
+	    strcmp(addrs, "off") == 0 ||
+	    strcmp(addrs, "none") == 0) {
+		ic_enable = 0;
+		return 1;
+	}
+
+	/* Initialise all name servers and NTP servers to NONE */
+	ic_nameservers_predef();
+	ic_ntp_servers_predef();
+
+	/* Parse string for static IP assignment.  */
+	ip = addrs;
+	while (ip && *ip) {
+		/* Cut the current field off at the next ':' (if any). */
+		if ((cp = strchr(ip, ':')))
+			*cp++ = '\0';
+		if (strlen(ip) > 0) {
+			pr_debug("IP-Config: Parameter #%d: `%s'\n", num, ip);
+			switch (num) {
+			case 0:
+				/* An address equal to ANY is stored as NONE. */
+				if ((ic_myaddr = in_aton(ip)) == ANY)
+					ic_myaddr = NONE;
+				break;
+			case 1:
+				if ((ic_servaddr = in_aton(ip)) == ANY)
+					ic_servaddr = NONE;
+				break;
+			case 2:
+				if ((ic_gateway = in_aton(ip)) == ANY)
+					ic_gateway = NONE;
+				break;
+			case 3:
+				if ((ic_netmask = in_aton(ip)) == ANY)
+					ic_netmask = NONE;
+				break;
+			case 4:
+				/* "host.domain": split at the first dot. */
+				if ((dp = strchr(ip, '.'))) {
+					*dp++ = '\0';
+					strscpy(utsname()->domainname, dp,
+						sizeof(utsname()->domainname));
+				}
+				strscpy(utsname()->nodename, ip,
+					sizeof(utsname()->nodename));
+				ic_host_name_set = 1;
+				break;
+			case 5:
+				strscpy(user_dev_name, ip, sizeof(user_dev_name));
+				break;
+			case 6:
+				/*
+				 * Unrecognised/disabled protocol and no own
+				 * address given: switch autoconfig off.
+				 */
+				if (ic_proto_name(ip) == 0 &&
+				    ic_myaddr == NONE) {
+					ic_enable = 0;
+				}
+				break;
+			case 7:
+				if (CONF_NAMESERVERS_MAX >= 1) {
+					ic_nameservers[0] = in_aton(ip);
+					if (ic_nameservers[0] == ANY)
+						ic_nameservers[0] = NONE;
+				}
+				break;
+			case 8:
+				if (CONF_NAMESERVERS_MAX >= 2) {
+					ic_nameservers[1] = in_aton(ip);
+					if (ic_nameservers[1] == ANY)
+						ic_nameservers[1] = NONE;
+				}
+				break;
+			case 9:
+				if (CONF_NTP_SERVERS_MAX >= 1) {
+					ic_ntp_servers[0] = in_aton(ip);
+					if (ic_ntp_servers[0] == ANY)
+						ic_ntp_servers[0] = NONE;
+				}
+				break;
+			}
+		}
+		/* cp is NULL after the last field, which ends the loop. */
+		ip = cp;
+		num++;
+	}
+
+	return 1;
+}
+__setup("ip=", ip_auto_config_setup);
+
+/* "nfsaddrs=" is handled exactly like "ip=": forward to its handler. */
+static int __init nfsaddrs_config_setup(char *addrs)
+{
+	return ip_auto_config_setup(addrs);
+}
+__setup("nfsaddrs=", nfsaddrs_config_setup);
+
+/*
+ * Handler for the "dhcpclass=" kernel command line parameter.
+ *
+ * Copies the argument into vendor_class_identifier and warns when it did
+ * not fit.  Always returns 1.
+ */
+static int __init vendor_class_identifier_setup(char *addrs)
+{
+	ssize_t copied;
+
+	copied = strscpy(vendor_class_identifier, addrs,
+			 sizeof(vendor_class_identifier));
+	/*
+	 * strscpy() reports truncation with a negative error code, never
+	 * with a length >= the buffer size, so test the sign explicitly
+	 * instead of relying on a signed-to-unsigned conversion.
+	 */
+	if (copied < 0)
+		pr_warn("DHCP: vendorclass too long, truncated to \"%s\"\n",
+			vendor_class_identifier);
+	return 1;
+}
+__setup("dhcpclass=", vendor_class_identifier_setup);
+
+/*
+ * Handler for the "carrier_timeout=" kernel command line parameter.
+ *
+ * Returns 1 when the value was parsed into carrier_timeout, 0 when no
+ * value was supplied or it is not a valid unsigned integer.
+ */
+static int __init set_carrier_timeout(char *str)
+{
+	if (str && !kstrtouint(str, 0, &carrier_timeout))
+		return 1;
+
+	return 0;
+}
+__setup("carrier_timeout=", set_carrier_timeout);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
new file mode 100644
index 0000000000..27b8f83c6e
--- /dev/null
+++ b/net/ipv4/ipip.c
@@ -0,0 +1,663 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *	Linux NET3:	IP/IP protocol decoder.
+ *
+ *	Authors:
+ *		Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
+ *
+ *	Fixes:
+ *		Alan Cox	:	Merged and made usable non modular (its so tiny its silly as
+ *					a module taking up 2 pages).
+ *		Alan Cox	: 	Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
+ *					to keep ip_forward happy.
+ *		Alan Cox	:	More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
+ *		Kai Schulte	:	Fixed #defines for IP_FIREWALL->FIREWALL
+ *              David Woodhouse :       Perform some basic ICMP handling.
+ *                                      IPIP Routing without decapsulation.
+ *              Carlos Picoto   :       GRE over IP support
+ *		Alexey Kuznetsov:	Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
+ *					I do not want to merge them together.
+ */
+
+/* tunnel.c: an IP tunnel driver
+
+	The purpose of this driver is to provide an IP tunnel through
+	which you can tunnel network traffic transparently across subnets.
+
+	This was written by looking at Nick Holloway's dummy driver
+	Thanks for the great code!
+
+		-Sam Lantinga	(slouken@cs.ucdavis.edu)  02/01/95
+
+	Minor tweaks:
+		Cleaned up the code a little and added some pre-1.3.0 tweaks.
+		dev->hard_header/hard_header_len changed to use no headers.
+		Comments/bracketing tweaked.
+		Made the tunnels use dev->name not tunnel: when error reporting.
+		Added tx_dropped stat
+
+		-Alan Cox	(alan@lxorguk.ukuu.org.uk) 21 March 95
+
+	Reworked:
+		Changed to tunnel to destination gateway in addition to the
+			tunnel's pointopoint address
+		Almost completely rewritten
+		Note:  There is currently no firewall or ICMP handling done.
+
+		-Sam Lantinga	(slouken@cs.ucdavis.edu) 02/13/96
+
+*/
+
+/* Things I wish I had known when writing the tunnel driver:
+
+	When the tunnel_xmit() function is called, the skb contains the
+	packet to be sent (plus a great deal of extra info), and dev
+	contains the tunnel device that _we_ are.
+
+	When we are passed a packet, we are expected to fill in the
+	source address with our source IP address.
+
+	What is the proper way to allocate, copy and free a buffer?
+	After you allocate it, it is a "0 length" chunk of memory
+	starting at zero.  If you want to add headers to the buffer
+	later, you'll have to call "skb_reserve(skb, amount)" with
+	the amount of memory you want reserved.  Then, you call
+	"skb_put(skb, amount)" with the amount of space you want in
+	the buffer.  skb_put() returns a pointer to the top (#0) of
+	that buffer.  skb->len is set to the amount of space you have
+	"allocated" with skb_put().  You can then write up to skb->len
+	bytes to that buffer.  If you need more, you can call skb_put()
+	again with the additional amount of space you need.  You can
+	find out how much more space you can allocate by calling
+	"skb_tailroom(skb)".
+	Now, to add header space, call "skb_push(skb, header_len)".
+	This creates space at the beginning of the buffer and returns
+	a pointer to this new space.  If later you need to strip a
+	header from a buffer, call "skb_pull(skb, header_len)".
+	skb_headroom() will return how much space is left at the top
+	of the buffer (before the main data).  Remember, this headroom
+	space must be reserved before the skb_put() function is called.
+	*/
+
+/*
+   This version of net/ipv4/ipip.c is cloned from net/ipv4/ip_gre.c
+
+   For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/ip_tunnels.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/dst_metadata.h>
+
+/* Module parameter (mode 0644); handed to ip_tunnel_rcv() on receive. */
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+/* Per-netns id: set via ipip_net_ops.id, used with net_generic(). */
+static unsigned int ipip_net_id __read_mostly;
+
+/* Forward declarations; both are defined further down in this file. */
+static int ipip_tunnel_init(struct net_device *dev);
+static struct rtnl_link_ops ipip_link_ops __read_mostly;
+
+/*
+ * ICMP error handler for IPIP tunnels.
+ *
+ * Looks up the tunnel the offending packet belonged to, then either
+ * updates the path MTU (fragmentation needed), processes a redirect, or
+ * records the error in the tunnel's err_count/err_time.
+ *
+ * Returns -ENOENT when no tunnel matches or the matched tunnel has no
+ * fixed destination; 0 in every other case.
+ */
+static int ipip_err(struct sk_buff *skb, u32 info)
+{
+	/* All the routers (except for Linux) return only
+	 * 8 bytes of packet payload. It means, that precise relaying of
+	 * ICMP in the real Internet is absolutely infeasible.
+	 */
+	const struct iphdr *outer = (const struct iphdr *)skb->data;
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+	const int icmp_type = icmp_hdr(skb)->type;
+	const int icmp_code = icmp_hdr(skb)->code;
+	struct ip_tunnel *tnl;
+
+	tnl = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+			       outer->daddr, outer->saddr, 0);
+	if (!tnl)
+		return -ENOENT;
+
+	if (icmp_type == ICMP_DEST_UNREACH) {
+		/* Impossible event. */
+		if (icmp_code == ICMP_SR_FAILED)
+			return 0;
+
+		if (icmp_code == ICMP_FRAG_NEEDED) {
+			ipv4_update_pmtu(skb, net, info, tnl->parms.link,
+					 outer->protocol);
+			return 0;
+		}
+		/* All others are translated to HOST_UNREACH.
+		 * rfc2003 contains "deep thoughts" about NET_UNREACH,
+		 * I believe they are just ether pollution. --ANK
+		 */
+	} else if (icmp_type == ICMP_TIME_EXCEEDED) {
+		if (icmp_code != ICMP_EXC_TTL)
+			return 0;
+	} else if (icmp_type == ICMP_REDIRECT) {
+		ipv4_redirect(skb, net, tnl->parms.link, outer->protocol);
+		return 0;
+	} else {
+		/* Any other ICMP type is ignored. */
+		return 0;
+	}
+
+	if (tnl->parms.iph.daddr == 0)
+		return -ENOENT;
+
+	if (tnl->parms.iph.ttl == 0 && icmp_type == ICMP_TIME_EXCEEDED)
+		return 0;
+
+	/* Count errors arriving within IPTUNNEL_ERR_TIMEO of the last one. */
+	if (time_before(jiffies, tnl->err_time + IPTUNNEL_ERR_TIMEO))
+		tnl->err_count++;
+	else
+		tnl->err_count = 1;
+	tnl->err_time = jiffies;
+
+	return 0;
+}
+
+/* Packet info used by ipip_tunnel_rcv() for IPv4-in-IPv4 payloads. */
+static const struct tnl_ptk_info ipip_tpi = {
+	/* no tunnel info required for ipip. */
+	.proto = htons(ETH_P_IP),
+};
+
+#if IS_ENABLED(CONFIG_MPLS)
+/* Packet info used by ipip_tunnel_rcv() for MPLS-in-IPv4 payloads. */
+static const struct tnl_ptk_info mplsip_tpi = {
+	/* no tunnel info required for mplsip. */
+	.proto = htons(ETH_P_MPLS_UC),
+};
+#endif
+
+/*
+ * Common receive path for IPIP and MPLS-over-IP packets.
+ *
+ * @skb:     received packet; the outer IPv4 header is read via ip_hdr()
+ * @ipproto: outer protocol the calling handler was registered for
+ *
+ * Returns -1 when no tunnel matches the packet (skb is left untouched).
+ * Otherwise the skb is consumed: either it is handed to ip_tunnel_rcv(),
+ * whose result is returned, or it is freed here and 0 is returned.
+ */
+static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
+{
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+	struct metadata_dst *tun_dst = NULL;
+	struct ip_tunnel *tunnel;
+	const struct iphdr *iph;
+
+	iph = ip_hdr(skb);
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+			iph->saddr, iph->daddr, 0);
+	if (tunnel) {
+		const struct tnl_ptk_info *tpi;
+
+		/* A tunnel protocol of 0 accepts every supported payload. */
+		if (tunnel->parms.iph.protocol != ipproto &&
+		    tunnel->parms.iph.protocol != 0)
+			goto drop;
+
+		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+			goto drop;
+#if IS_ENABLED(CONFIG_MPLS)
+		if (ipproto == IPPROTO_MPLS)
+			tpi = &mplsip_tpi;
+		else
+#endif
+			tpi = &ipip_tpi;
+		if (iptunnel_pull_header(skb, 0, tpi->proto, false))
+			goto drop;
+		if (tunnel->collect_md) {
+			tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
+			/*
+			 * Returning 0 tells the caller the packet was
+			 * consumed, so the skb must be freed here or it
+			 * would leak.
+			 */
+			if (!tun_dst)
+				goto drop;
+			ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info);
+		}
+		skb_reset_mac_header(skb);
+
+		return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+	}
+
+	return -1;
+
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Receive handler for IPPROTO_IPIP; see ipip_tunnel_rcv() for returns. */
+static int ipip_rcv(struct sk_buff *skb)
+{
+	return ipip_tunnel_rcv(skb, IPPROTO_IPIP);
+}
+
+#if IS_ENABLED(CONFIG_MPLS)
+/* Receive handler for IPPROTO_MPLS; see ipip_tunnel_rcv() for returns. */
+static int mplsip_rcv(struct sk_buff *skb)
+{
+	return ipip_tunnel_rcv(skb, IPPROTO_MPLS);
+}
+#endif
+
+/*
+ *	Transmit entry point of the tunnel device (ndo_start_xmit).
+ *
+ *	This function assumes it is being called from dev_queue_xmit()
+ *	and that skb is filled properly by that function.
+ *
+ *	Always returns NETDEV_TX_OK; packets that cannot be tunnelled are
+ *	freed and accounted in the tx_errors device statistic.
+ */
+static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
+				    struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	const struct iphdr *outer = &tunnel->parms.iph;
+	u8 inner_proto;
+
+	if (!pskb_inet_may_pull(skb))
+		goto tx_error;
+
+	/* Map the payload's ethertype to the outer IP protocol number. */
+	if (skb->protocol == htons(ETH_P_IP))
+		inner_proto = IPPROTO_IPIP;
+#if IS_ENABLED(CONFIG_MPLS)
+	else if (skb->protocol == htons(ETH_P_MPLS_UC))
+		inner_proto = IPPROTO_MPLS;
+#endif
+	else
+		goto tx_error;
+
+	/* A configured protocol of 0 lets every supported payload through. */
+	if (outer->protocol != 0 && outer->protocol != inner_proto)
+		goto tx_error;
+
+	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
+		goto tx_error;
+
+	skb_set_inner_ipproto(skb, inner_proto);
+
+	if (!tunnel->collect_md)
+		ip_tunnel_xmit(skb, dev, outer, inner_proto);
+	else
+		ip_md_tunnel_xmit(skb, dev, inner_proto, 0);
+	return NETDEV_TX_OK;
+
+tx_error:
+	kfree_skb(skb);
+
+	DEV_STATS_INC(dev, tx_errors);
+	return NETDEV_TX_OK;
+}
+
+/*
+ * Tell whether @ipproto may be configured on an ipip tunnel:
+ * 0, IPPROTO_IPIP and, when MPLS support is enabled, IPPROTO_MPLS.
+ */
+static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto)
+{
+	if (ipproto == 0 || ipproto == IPPROTO_IPIP)
+		return true;
+#if IS_ENABLED(CONFIG_MPLS)
+	if (ipproto == IPPROTO_MPLS)
+		return true;
+#endif
+	return false;
+}
+
+/*
+ * ndo_tunnel_ctl handler.
+ *
+ * For SIOCADDTUNNEL/SIOCCHGTUNNEL the supplied IPv4 header template is
+ * validated first (-EINVAL on failure).  Keys and flags are always
+ * zeroed before the request is passed on to ip_tunnel_ctl().
+ */
+static int
+ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+{
+	const bool modifies = cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL;
+
+	if (modifies) {
+		if (p->iph.version != 4)
+			return -EINVAL;
+		if (!ipip_tunnel_ioctl_verify_protocol(p->iph.protocol))
+			return -EINVAL;
+		if (p->iph.ihl != 5)
+			return -EINVAL;
+		/* Only the DF bit may be set in frag_off. */
+		if (p->iph.frag_off & htons(~IP_DF))
+			return -EINVAL;
+	}
+
+	p->i_key = 0;
+	p->o_key = 0;
+	p->i_flags = 0;
+	p->o_flags = 0;
+
+	return ip_tunnel_ctl(dev, p, cmd);
+}
+
+/*
+ * Device operations installed by ipip_tunnel_setup().  Only init, xmit
+ * and tunnel_ctl are ipip-specific; the rest are shared ip_tunnel/dev
+ * helpers.
+ */
+static const struct net_device_ops ipip_netdev_ops = {
+	.ndo_init       = ipip_tunnel_init,
+	.ndo_uninit     = ip_tunnel_uninit,
+	.ndo_start_xmit	= ipip_tunnel_xmit,
+	.ndo_siocdevprivate = ip_tunnel_siocdevprivate,
+	.ndo_change_mtu = ip_tunnel_change_mtu,
+	.ndo_get_stats64 = dev_get_tstats64,
+	.ndo_get_iflink = ip_tunnel_get_iflink,
+	.ndo_tunnel_ctl	= ipip_tunnel_ctl,
+};
+
+/* Feature bits set in both dev->features and dev->hw_features at setup. */
+#define IPIP_FEATURES (NETIF_F_SG |		\
+		       NETIF_F_FRAGLIST |	\
+		       NETIF_F_HIGHDMA |	\
+		       NETIF_F_GSO_SOFTWARE |	\
+		       NETIF_F_HW_CSUM)
+
+/*
+ * rtnl_link_ops ->setup callback: fill in the device fields specific to
+ * ipip, then let ip_tunnel_setup() do the generic part.
+ */
+static void ipip_tunnel_setup(struct net_device *dev)
+{
+	dev->netdev_ops = &ipip_netdev_ops;
+	dev->header_ops = &ip_tunnel_header_ops;
+
+	dev->type = ARPHRD_TUNNEL;
+	dev->flags = IFF_NOARP;
+	dev->addr_len = 4;
+
+	dev->features |= NETIF_F_LLTX | IPIP_FEATURES;
+	dev->hw_features |= IPIP_FEATURES;
+	netif_keep_dst(dev);
+
+	ip_tunnel_setup(dev, ipip_net_id);
+}
+
+/*
+ * ndo_init callback: use the tunnel's local/remote IPv4 addresses as the
+ * device address and broadcast address, set the header lengths (ipip
+ * adds no tunnel header of its own) and finish with ip_tunnel_init().
+ */
+static int ipip_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *t = netdev_priv(dev);
+	const struct iphdr *outer = &t->parms.iph;
+
+	__dev_addr_set(dev, &outer->saddr, 4);
+	memcpy(dev->broadcast, &outer->daddr, 4);
+
+	t->tun_hlen = 0;
+	t->hlen = t->tun_hlen + t->encap_hlen;
+
+	return ip_tunnel_init(dev);
+}
+
+/*
+ * rtnl_link_ops ->validate callback.
+ *
+ * When IFLA_IPTUN_PROTO is present it must be 0, IPPROTO_IPIP or
+ * IPPROTO_MPLS; otherwise -EINVAL is returned.  A missing attribute is
+ * accepted.
+ *
+ * NOTE(review): IPPROTO_MPLS is accepted here regardless of CONFIG_MPLS,
+ * unlike ipip_tunnel_ioctl_verify_protocol() - confirm this is intended.
+ */
+static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
+				struct netlink_ext_ack *extack)
+{
+	if (!data || !data[IFLA_IPTUN_PROTO])
+		return 0;
+
+	switch (nla_get_u8(data[IFLA_IPTUN_PROTO])) {
+	case 0:
+	case IPPROTO_IPIP:
+	case IPPROTO_MPLS:
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Build tunnel parameters from netlink attributes.
+ *
+ * @parms is reset to the ipip defaults (IPv4, ihl 5, IPPROTO_IPIP) and
+ * @collect_md to false.  If @data is given, the generic tunnel attributes
+ * are applied, @collect_md is set when IFLA_IPTUN_COLLECT_METADATA is
+ * present, and @fwmark is overwritten only when IFLA_IPTUN_FWMARK is
+ * present.
+ */
+static void ipip_netlink_parms(struct nlattr *data[],
+			       struct ip_tunnel_parm *parms, bool *collect_md,
+			       __u32 *fwmark)
+{
+	memset(parms, 0, sizeof(*parms));
+	parms->iph.ihl = 5;
+	parms->iph.version = 4;
+	parms->iph.protocol = IPPROTO_IPIP;
+	*collect_md = false;
+
+	if (data) {
+		ip_tunnel_netlink_parms(data, parms);
+
+		if (data[IFLA_IPTUN_COLLECT_METADATA])
+			*collect_md = true;
+
+		if (data[IFLA_IPTUN_FWMARK])
+			*fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
+	}
+}
+
+/*
+ * rtnl_link_ops ->newlink callback: apply optional encapsulation
+ * settings, parse the tunnel attributes and create the link through
+ * ip_tunnel_newlink().  Returns a negative errno if the encapsulation
+ * setup fails, otherwise the result of ip_tunnel_newlink().
+ */
+static int ipip_newlink(struct net *src_net, struct net_device *dev,
+			struct nlattr *tb[], struct nlattr *data[],
+			struct netlink_ext_ack *extack)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct ip_tunnel_encap encap;
+	struct ip_tunnel_parm parms;
+	__u32 mark = 0;
+	int err;
+
+	if (ip_tunnel_netlink_encap_parms(data, &encap)) {
+		err = ip_tunnel_encap_setup(tunnel, &encap);
+		if (err < 0)
+			return err;
+	}
+
+	ipip_netlink_parms(data, &parms, &tunnel->collect_md, &mark);
+
+	return ip_tunnel_newlink(dev, tb, &parms, mark);
+}
+
+/*
+ * rtnl_link_ops ->changelink callback.
+ *
+ * Rejects (-EINVAL) requests that ask for collect-metadata mode, and
+ * requests where the presence of a remote address does not agree with
+ * the device's IFF_POINTOPOINT flag.  The firmware mark defaults to the
+ * tunnel's current one.
+ */
+static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
+			   struct nlattr *data[],
+			   struct netlink_ext_ack *extack)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct ip_tunnel_encap encap;
+	struct ip_tunnel_parm parms;
+	__u32 mark = tunnel->fwmark;
+	bool want_md, is_p2p, has_remote;
+	int err;
+
+	if (ip_tunnel_netlink_encap_parms(data, &encap)) {
+		err = ip_tunnel_encap_setup(tunnel, &encap);
+		if (err < 0)
+			return err;
+	}
+
+	ipip_netlink_parms(data, &parms, &want_md, &mark);
+	if (want_md)
+		return -EINVAL;
+
+	is_p2p = !!(dev->flags & IFF_POINTOPOINT);
+	has_remote = parms.iph.daddr != 0;
+	if (is_p2p != has_remote)
+		return -EINVAL;
+
+	return ip_tunnel_changelink(dev, tb, &parms, mark);
+}
+
+/*
+ * rtnl_link_ops ->get_size callback: sum of the netlink attribute sizes
+ * for the attributes listed below.
+ */
+static size_t ipip_get_size(const struct net_device *dev)
+{
+	size_t sz = 0;
+
+	sz += nla_total_size(4);	/* IFLA_IPTUN_LINK */
+	sz += nla_total_size(4);	/* IFLA_IPTUN_LOCAL */
+	sz += nla_total_size(4);	/* IFLA_IPTUN_REMOTE */
+	sz += nla_total_size(1);	/* IFLA_IPTUN_TTL */
+	sz += nla_total_size(1);	/* IFLA_IPTUN_TOS */
+	sz += nla_total_size(1);	/* IFLA_IPTUN_PROTO */
+	sz += nla_total_size(1);	/* IFLA_IPTUN_PMTUDISC */
+	sz += nla_total_size(2);	/* IFLA_IPTUN_ENCAP_TYPE */
+	sz += nla_total_size(2);	/* IFLA_IPTUN_ENCAP_FLAGS */
+	sz += nla_total_size(2);	/* IFLA_IPTUN_ENCAP_SPORT */
+	sz += nla_total_size(2);	/* IFLA_IPTUN_ENCAP_DPORT */
+	sz += nla_total_size(0);	/* IFLA_IPTUN_COLLECT_METADATA */
+	sz += nla_total_size(4);	/* IFLA_IPTUN_FWMARK */
+
+	return sz;
+}
+
+/*
+ * rtnl_link_ops ->fill_info callback: dump the tunnel configuration as
+ * netlink attributes into @skb.  Returns 0 on success, -EMSGSIZE as soon
+ * as one attribute cannot be added.
+ */
+static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	const struct ip_tunnel *tunnel = netdev_priv(dev);
+	const struct ip_tunnel_parm *parm = &tunnel->parms;
+	const u8 pmtudisc = !!(parm->iph.frag_off & htons(IP_DF));
+
+	/* Basic tunnel parameters. */
+	if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
+	    nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
+	    nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
+	    nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
+	    nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
+	    nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) ||
+	    nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, pmtudisc) ||
+	    nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark))
+		return -EMSGSIZE;
+
+	/* Encapsulation settings. */
+	if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) ||
+	    nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) ||
+	    nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) ||
+	    nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags))
+		return -EMSGSIZE;
+
+	/* The flag attribute is emitted only in collect-metadata mode. */
+	if (tunnel->collect_md &&
+	    nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/* Netlink attribute types for IFLA_IPTUN_*; used as ipip_link_ops.policy. */
+static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
+	[IFLA_IPTUN_LINK]		= { .type = NLA_U32 },
+	[IFLA_IPTUN_LOCAL]		= { .type = NLA_U32 },
+	[IFLA_IPTUN_REMOTE]		= { .type = NLA_U32 },
+	[IFLA_IPTUN_TTL]		= { .type = NLA_U8 },
+	[IFLA_IPTUN_TOS]		= { .type = NLA_U8 },
+	[IFLA_IPTUN_PROTO]		= { .type = NLA_U8 },
+	[IFLA_IPTUN_PMTUDISC]		= { .type = NLA_U8 },
+	[IFLA_IPTUN_ENCAP_TYPE]		= { .type = NLA_U16 },
+	[IFLA_IPTUN_ENCAP_FLAGS]	= { .type = NLA_U16 },
+	[IFLA_IPTUN_ENCAP_SPORT]	= { .type = NLA_U16 },
+	[IFLA_IPTUN_ENCAP_DPORT]	= { .type = NLA_U16 },
+	[IFLA_IPTUN_COLLECT_METADATA]	= { .type = NLA_FLAG },
+	[IFLA_IPTUN_FWMARK]		= { .type = NLA_U32 },
+};
+
+/* rtnetlink link type "ipip"; registered in ipip_init(). */
+static struct rtnl_link_ops ipip_link_ops __read_mostly = {
+	.kind		= "ipip",
+	.maxtype	= IFLA_IPTUN_MAX,
+	.policy		= ipip_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= ipip_tunnel_setup,
+	.validate	= ipip_tunnel_validate,
+	.newlink	= ipip_newlink,
+	.changelink	= ipip_changelink,
+	.dellink	= ip_tunnel_dellink,
+	.get_size	= ipip_get_size,
+	.fill_info	= ipip_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
+};
+
+/* Tunnel handler registered for AF_INET in ipip_init(). */
+static struct xfrm_tunnel ipip_handler __read_mostly = {
+	.handler	=	ipip_rcv,
+	.err_handler	=	ipip_err,
+	.priority	=	1,
+};
+
+#if IS_ENABLED(CONFIG_MPLS)
+/* Tunnel handler registered for AF_MPLS in ipip_init(). */
+static struct xfrm_tunnel mplsip_handler __read_mostly = {
+	.handler	=	mplsip_rcv,
+	.err_handler	=	ipip_err,
+	.priority	=	1,
+};
+#endif
+
+/* Per-netns init: set up ipip tunnel state, passing "tunl0" as the name. */
+static int __net_init ipip_init_net(struct net *net)
+{
+	return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
+}
+
+/* Batched per-netns exit: hand the list to ip_tunnel_delete_nets(). */
+static void __net_exit ipip_exit_batch_net(struct list_head *list_net)
+{
+	ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops);
+}
+
+/* Per-netns operations; .id/.size describe the struct ip_tunnel_net slot. */
+static struct pernet_operations ipip_net_ops = {
+	.init = ipip_init_net,
+	.exit_batch = ipip_exit_batch_net,
+	.id   = &ipip_net_id,
+	.size = sizeof(struct ip_tunnel_net),
+};
+
+/*
+ * Module init: register the pernet operations, the tunnel handler(s) and
+ * the rtnetlink link type, in that order.  On failure everything that
+ * was registered so far is undone in reverse order and the error is
+ * returned.
+ */
+static int __init ipip_init(void)
+{
+	int err;
+
+	pr_info("ipip: IPv4 and MPLS over IPv4 tunneling driver\n");
+
+	err = register_pernet_device(&ipip_net_ops);
+	if (err < 0)
+		return err;
+
+	err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
+	if (err < 0) {
+		pr_info("%s: can't register tunnel\n", __func__);
+		goto xfrm_tunnel_ipip_failed;
+	}
+#if IS_ENABLED(CONFIG_MPLS)
+	err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS);
+	if (err < 0) {
+		pr_info("%s: can't register tunnel\n", __func__);
+		goto xfrm_tunnel_mplsip_failed;
+	}
+#endif
+	err = rtnl_link_register(&ipip_link_ops);
+	if (err >= 0)
+		return err;
+
+	/* rtnl_link_register() failed: fall through the unwind chain. */
+#if IS_ENABLED(CONFIG_MPLS)
+	xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS);
+xfrm_tunnel_mplsip_failed:
+#endif
+	xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
+xfrm_tunnel_ipip_failed:
+	unregister_pernet_device(&ipip_net_ops);
+	return err;
+}
+
+/*
+ * Module exit: undo ipip_init() - unregister the link type, the tunnel
+ * handler(s) and finally the pernet operations.  A failing handler
+ * deregistration is only logged.
+ */
+static void __exit ipip_fini(void)
+{
+	rtnl_link_unregister(&ipip_link_ops);
+	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
+		pr_info("%s: can't deregister tunnel\n", __func__);
+#if IS_ENABLED(CONFIG_MPLS)
+	if (xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS))
+		pr_info("%s: can't deregister tunnel\n", __func__);
+#endif
+	unregister_pernet_device(&ipip_net_ops);
+}
+
+/* Module entry/exit points, licence and aliases ("ipip" link, "tunl0"). */
+module_init(ipip_init);
+module_exit(ipip_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("ipip");
+MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
new file mode 100644
index 0000000000..0063a23725
--- /dev/null
+++ b/net/ipv4/ipmr.c
@@ -0,0 +1,3181 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *	IP multicast routing support for mrouted 3.6/3.8
+ *
+ *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ *	  Linux Consultancy and Custom Driver Development
+ *
+ *	Fixes:
+ *	Michael Chastain	:	Incorrect size of copying.
+ *	Alan Cox		:	Added the cache manager code
+ *	Alan Cox		:	Fixed the clone/copy bug and device race.
+ *	Mike McLagan		:	Routing by source
+ *	Malcolm Beattie		:	Buffer handling fixes.
+ *	Alexey Kuznetsov	:	Double buffer free and other fixes.
+ *	SVR Anand		:	Fixed several multicast bugs and problems.
+ *	Alexey Kuznetsov	:	Status, optimisations and more.
+ *	Brad Parker		:	Better behaviour on mrouted upcall
+ *					overflow.
+ *      Carlos Picoto           :       PIMv1 Support
+ *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
+ *					Relax this requirement to work with older peers.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/if_ether.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/route.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
+#include <linux/export.h>
+#include <linux/rhashtable.h>
+#include <net/ip_tunnels.h>
+#include <net/checksum.h>
+#include <net/netlink.h>
+#include <net/fib_rules.h>
+#include <linux/netconf.h>
+#include <net/rtnh.h>
+
+#include <linux/nospec.h>
+
+/* IPMR policy rule: carries only the generic fib_rule part. */
+struct ipmr_rule {
+	struct fib_rule		common;
+};
+
+/* Result of a rule lookup: the table selected by ipmr_rule_action(). */
+struct ipmr_result {
+	struct mr_table		*mrt;
+};
+
+/* Big lock, protecting vif table, mrt cache and mroute socket state.
+ * Note that the changes are semaphored via rtnl_lock.
+ */
+
+static DEFINE_SPINLOCK(mrt_lock);
+
+/* Fetch vif->dev through rcu_dereference(). */
+static struct net_device *vif_dev_read(const struct vif_device *vif)
+{
+	return rcu_dereference(vif->dev);
+}
+
+/* Multicast router control variables */
+
+/* Special spinlock for queue of unresolved entries */
+static DEFINE_SPINLOCK(mfc_unres_lock);
+
+/* We return to Alan's original scheme. The hash table of resolved
+ * entries is changed only in process context and protected
+ * with the weak lock mrt_lock. The queue of unresolved entries is
+ * protected with the strong spinlock mfc_unres_lock.
+ *
+ * In this case the data path is entirely free of exclusive locks.
+ */
+
+static struct kmem_cache *mrt_cachep __ro_after_init;
+
+static struct mr_table *ipmr_new_table(struct net *net, u32 id);
+static void ipmr_free_table(struct mr_table *mrt);
+
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+			  struct net_device *dev, struct sk_buff *skb,
+			  struct mfc_cache *cache, int local);
+static int ipmr_cache_report(const struct mr_table *mrt,
+			     struct sk_buff *pkt, vifi_t vifi, int assert);
+static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
+				 int cmd);
+static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt);
+static void mroute_clean_tables(struct mr_table *mrt, int flags);
+static void ipmr_expire_process(struct timer_list *t);
+
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+/*
+ * Iterate over all tables of @net.  The last argument is the lockdep
+ * condition passed to list_for_each_entry_rcu(): RTNL held, or the
+ * table list empty.
+ */
+#define ipmr_for_each_table(mrt, net)					\
+	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list,	\
+				lockdep_rtnl_is_held() ||		\
+				list_empty(&net->ipv4.mr_tables))
+
+/*
+ * Table iterator: return the table following @mrt in @net's table list,
+ * or the first table when @mrt is NULL.  Returns NULL once the list head
+ * is reached.
+ */
+static struct mr_table *ipmr_mr_table_iter(struct net *net,
+					   struct mr_table *mrt)
+{
+	struct list_head *head = &net->ipv4.mr_tables;
+	struct mr_table *next;
+
+	if (mrt)
+		next = list_entry_rcu(mrt->list.next,
+				      struct mr_table, list);
+	else
+		next = list_entry_rcu(head->next,
+				      struct mr_table, list);
+
+	return &next->list != head ? next : NULL;
+}
+
+/* Find the table of @net whose id equals @id; NULL if there is none. */
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+	struct mr_table *mrt;
+
+	ipmr_for_each_table(mrt, net) {
+		if (mrt->id == id)
+			return mrt;
+	}
+	return NULL;
+}
+
+/*
+ * Select the multicast routing table for flow @flp4 by running the IPMR
+ * policy rules.  On success stores the table in *@mrt and returns 0;
+ * otherwise returns the negative error from fib_rules_lookup() and
+ * leaves *@mrt untouched.
+ */
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
+			   struct mr_table **mrt)
+{
+	struct ipmr_result res;
+	struct fib_lookup_arg arg = {
+		.result = &res,
+		.flags = FIB_LOOKUP_NOREF,
+	};
+	struct flowi *fl = flowi4_to_flowi(flp4);
+	int err;
+
+	/* update flow if oif or iif point to device enslaved to l3mdev */
+	l3mdev_update_flow(net, fl);
+
+	err = fib_rules_lookup(net->ipv4.mr_rules_ops, fl, 0, &arg);
+	if (err < 0)
+		return err;
+
+	*mrt = res.mrt;
+	return 0;
+}
+
+/*
+ * fib_rules ->action callback.
+ *
+ * FR_ACT_UNREACHABLE yields -ENETUNREACH and FR_ACT_PROHIBIT -EACCES;
+ * any action other than FR_ACT_TO_TBL yields -EINVAL.  For
+ * FR_ACT_TO_TBL the rule's table is looked up and stored in the lookup
+ * result; -EAGAIN is returned when that table does not exist.
+ */
+static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
+{
+	struct ipmr_result *res = arg->result;
+	struct mr_table *table;
+
+	if (rule->action == FR_ACT_UNREACHABLE)
+		return -ENETUNREACH;
+	if (rule->action == FR_ACT_PROHIBIT)
+		return -EACCES;
+	if (rule->action != FR_ACT_TO_TBL)
+		return -EINVAL;
+
+	arg->table = fib_rule_get_table(rule, arg);
+
+	table = ipmr_get_table(rule->fr_net, arg->table);
+	if (!table)
+		return -EAGAIN;
+
+	res->mrt = table;
+	return 0;
+}
+
+/* fib_rules ->match callback: unconditionally returns 1. */
+static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+	return 1;
+}
+
+/* fib_rules ->configure callback: nothing to configure, returns 0. */
+static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+			       struct fib_rule_hdr *frh, struct nlattr **tb,
+			       struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+/* fib_rules ->compare callback: unconditionally returns 1. */
+static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+			     struct nlattr **tb)
+{
+	return 1;
+}
+
+/* fib_rules ->fill callback: zero dst_len, src_len and tos in the header. */
+static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+			  struct fib_rule_hdr *frh)
+{
+	frh->dst_len = 0;
+	frh->src_len = 0;
+	frh->tos     = 0;
+	return 0;
+}
+
+/* Template handed to fib_rules_register() in ipmr_rules_init(). */
+static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
+	.family		= RTNL_FAMILY_IPMR,
+	.rule_size	= sizeof(struct ipmr_rule),
+	.addr_size	= sizeof(u32),
+	.action		= ipmr_rule_action,
+	.match		= ipmr_rule_match,
+	.configure	= ipmr_rule_configure,
+	.compare	= ipmr_rule_compare,
+	.fill		= ipmr_rule_fill,
+	.nlgroup	= RTNLGRP_IPV4_RULE,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * Per-netns setup for multiple-table mode: register the IPMR rule ops,
+ * create the default table and add the default rule pointing at it.
+ * Returns 0 on success or a negative errno after undoing the completed
+ * steps.
+ */
+static int __net_init ipmr_rules_init(struct net *net)
+{
+	struct fib_rules_ops *ops;
+	struct mr_table *mrt;
+	int err;
+
+	ops = fib_rules_register(&ipmr_rules_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+
+	INIT_LIST_HEAD(&net->ipv4.mr_tables);
+
+	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+	if (IS_ERR(mrt)) {
+		err = PTR_ERR(mrt);
+	} else {
+		err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
+		if (err >= 0) {
+			net->ipv4.mr_rules_ops = ops;
+			return 0;
+		}
+
+		/* The table is freed with the RTNL lock held. */
+		rtnl_lock();
+		ipmr_free_table(mrt);
+		rtnl_unlock();
+	}
+
+	fib_rules_unregister(ops);
+	return err;
+}
+
+/*
+ * Per-netns teardown for multiple-table mode: unlink and free every
+ * table, then unregister the rule ops.  Asserts that RTNL is held.
+ */
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+	struct mr_table *mrt, *next;
+
+	ASSERT_RTNL();
+	/* _safe variant: entries are removed while walking the list. */
+	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
+		list_del(&mrt->list);
+		ipmr_free_table(mrt);
+	}
+	fib_rules_unregister(net->ipv4.mr_rules_ops);
+}
+
+/* Dump the RTNL_FAMILY_IPMR rules of @net via fib_rules_dump(). */
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
+			   struct netlink_ext_ack *extack)
+{
+	return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack);
+}
+
+/* Return fib_rules_seq_read() for the RTNL_FAMILY_IPMR rules of @net. */
+static unsigned int ipmr_rules_seq_read(struct net *net)
+{
+	return fib_rules_seq_read(net, RTNL_FAMILY_IPMR);
+}
+
+/* True if @rule is a match-all rule whose table is RT_TABLE_DEFAULT. */
+bool ipmr_rule_default(const struct fib_rule *rule)
+{
+	return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT;
+}
+EXPORT_SYMBOL(ipmr_rule_default);
+#else
+/* Iterate over the (single) multicast routing table of @net: the body
+ * runs once with @pos set to (net)->ipv4.mrt if that pointer is
+ * non-NULL, and not at all otherwise.
+ *
+ * The cursor parameter is deliberately not called "mrt": a macro
+ * parameter of that name would also be substituted into the ->ipv4.mrt
+ * member access, so the macro would only compile for callers whose
+ * variable happens to be named "mrt". @net is parenthesized so that any
+ * pointer expression can be passed.
+ */
+#define ipmr_for_each_table(pos, net) \
+	for (pos = (net)->ipv4.mrt; pos; pos = NULL)
+
+/* Table iterator for the one-table configuration: a NULL cursor yields
+ * net->ipv4.mrt, any other cursor ends the iteration with NULL.
+ */
+static struct mr_table *ipmr_mr_table_iter(struct net *net,
+					   struct mr_table *mrt)
+{
+	return mrt ? NULL : net->ipv4.mrt;
+}
+
+/* One-table configuration: @id is ignored and the netns' only table
+ * (net->ipv4.mrt, possibly NULL) is returned.
+ */
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+	return net->ipv4.mrt;
+}
+
+/* One-table configuration: no rule lookup is performed. *@mrt is set to
+ * net->ipv4.mrt regardless of @flp4 and 0 is returned.
+ */
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
+			   struct mr_table **mrt)
+{
+	*mrt = net->ipv4.mrt;
+	return 0;
+}
+
+/* Per-netns initialisation for the one-table configuration: create the
+ * RT_TABLE_DEFAULT table and store it in net->ipv4.mrt.
+ * Returns 0, or the errno encoded in ipmr_new_table()'s error pointer.
+ */
+static int __net_init ipmr_rules_init(struct net *net)
+{
+	struct mr_table *table = ipmr_new_table(net, RT_TABLE_DEFAULT);
+
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	net->ipv4.mrt = table;
+	return 0;
+}
+
+/* Per-netns teardown for the one-table configuration: free the table
+ * and clear net->ipv4.mrt. Must be called with RTNL held.
+ */
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+	struct mr_table *table;
+
+	ASSERT_RTNL();
+
+	table = net->ipv4.mrt;
+	ipmr_free_table(table);
+	net->ipv4.mrt = NULL;
+}
+
+/* One-table configuration: there are no rules to dump; always returns 0. */
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
+			   struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+/* One-table configuration: the rules sequence value is always 0. */
+static unsigned int ipmr_rules_seq_read(struct net *net)
+{
+	return 0;
+}
+
+/* One-table configuration: every rule is reported as the default rule;
+ * @rule is not inspected.
+ */
+bool ipmr_rule_default(const struct fib_rule *rule)
+{
+	return true;
+}
+EXPORT_SYMBOL(ipmr_rule_default);
+#endif
+
+/* rhashtable object-compare callback (see ipmr_rht_params.obj_cmpfn).
+ * Returns 0 when the lookup key in @arg and the mfc_cache at @ptr have
+ * the same multicast group and origin, and 1 otherwise.
+ */
+static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
+				const void *ptr)
+{
+	const struct mfc_cache_cmp_arg *key = arg->key;
+	const struct mfc_cache *entry = ptr;
+
+	if (key->mfc_mcastgrp != entry->mfc_mcastgrp)
+		return 1;
+	if (key->mfc_origin != entry->mfc_origin)
+		return 1;
+
+	return 0;
+}
+
+/* Hash table parameters for the MFC cache. Entries are linked through
+ * mr_mfc.mnode and keyed on the mfc_cache.cmparg member, compared with
+ * ipmr_hash_cmp(). Referenced from ipmr_mr_table_ops and passed to the
+ * rhltable insert/remove calls in this file.
+ */
+static const struct rhashtable_params ipmr_rht_params = {
+	.head_offset = offsetof(struct mr_mfc, mnode),
+	.key_offset = offsetof(struct mfc_cache, cmparg),
+	.key_len = sizeof(struct mfc_cache_cmp_arg),
+	.nelem_hint = 3,
+	.obj_cmpfn = ipmr_hash_cmp,
+	.automatic_shrinking = true,
+};
+
+/* Callback passed to mr_table_alloc() by ipmr_new_table(). With
+ * CONFIG_IP_MROUTE_MULTIPLE_TABLES the table is appended to the netns
+ * table list; without it this function does nothing.
+ */
+static void ipmr_new_table_set(struct mr_table *mrt,
+			       struct net *net)
+{
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+	list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
+#endif
+}
+
+/* Wildcard lookup key: both group and origin are INADDR_ANY. Exposed to
+ * the generic code through ipmr_mr_table_ops.cmparg_any.
+ */
+static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = {
+	.mfc_mcastgrp = htonl(INADDR_ANY),
+	.mfc_origin = htonl(INADDR_ANY),
+};
+
+/* IPv4 table operations given to mr_table_alloc(): the hash table
+ * parameters and the wildcard comparison key.
+ */
+static struct mr_table_ops ipmr_mr_table_ops = {
+	.rht_params = &ipmr_rht_params,
+	.cmparg_any = &ipmr_mr_table_ops_cmparg_any,
+};
+
+/* Return the multicast routing table @id of @net, allocating it with
+ * mr_table_alloc() if ipmr_get_table() does not find one.
+ *
+ * Returns ERR_PTR(-EINVAL) for ids of 1000000000 and above (other than
+ * RT_TABLE_DEFAULT); otherwise the existing table or whatever
+ * mr_table_alloc() returns.
+ */
+static struct mr_table *ipmr_new_table(struct net *net, u32 id)
+{
+	struct mr_table *table;
+
+	/* Keeps "pimreg%u" within 16 bytes (IFNAMSIZ): at most 9 digits */
+	if (id >= 1000000000 && id != RT_TABLE_DEFAULT)
+		return ERR_PTR(-EINVAL);
+
+	table = ipmr_get_table(net, id);
+	if (!table)
+		table = mr_table_alloc(net, id, &ipmr_mr_table_ops,
+				       ipmr_expire_process,
+				       ipmr_new_table_set);
+
+	return table;
+}
+
+/* Destroy a multicast routing table: shut down its expiry timer, flush
+ * all VIFs and MFC entries (static and non-static), destroy the MFC
+ * hash table and free the table itself.
+ */
+static void ipmr_free_table(struct mr_table *mrt)
+{
+	const int flush_all = MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC |
+			      MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC;
+
+	/* Stop the timer first so it cannot run during the flush below */
+	timer_shutdown_sync(&mrt->ipmr_expire_timer);
+	mroute_clean_tables(mrt, flush_all);
+
+	rhltable_destroy(&mrt->mfc_hash);
+	kfree(mrt);
+}
+
+/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
+
+/* Prepare the in_device of a pimreg/tunnel VIF device: mark all devconf
+ * and neighbour-parameter entries as explicitly set and turn RP_FILTER
+ * off. Requires RTNL.
+ *
+ * Returns false if @dev has no in_device, true otherwise.
+ */
+static bool ipmr_init_vif_indev(const struct net_device *dev)
+{
+	struct in_device *idev;
+
+	ASSERT_RTNL();
+
+	idev = __in_dev_get_rtnl(dev);
+	if (idev) {
+		ipv4_devconf_setall(idev);
+		neigh_parms_data_state_setall(idev->arp_parms);
+		IPV4_DEVCONF(idev->cnf, RP_FILTER) = 0;
+	}
+
+	return idev != NULL;
+}
+
+/* Create the IPIP tunnel device "dvmrp<vifi>" backing a VIFF_TUNNEL vif.
+ *
+ * The tunnel is created by asking "tunl0" (via its ndo_tunnel_ctl hook,
+ * SIOCADDTUNNEL) for a tunnel between v->vifc_lcl_addr and
+ * v->vifc_rmt_addr. The new device is then flagged IFF_MULTICAST, its
+ * in_device is initialised, it is opened, a reference is taken and
+ * allmulti is enabled.
+ *
+ * ipmr_init_vif_indev() asserts RTNL, so the caller must hold it.
+ *
+ * Returns the new device with a reference held, ERR_PTR(err) if
+ * dev_set_allmulti() fails (the tunnel is closed and deleted again), or
+ * ERR_PTR(-ENOBUFS) for every other failure.
+ */
+static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
+{
+	struct net_device *tunnel_dev, *new_dev;
+	struct ip_tunnel_parm p = { };
+	int err;
+
+	tunnel_dev = __dev_get_by_name(net, "tunl0");
+	if (!tunnel_dev)
+		goto out;
+
+	p.iph.daddr = v->vifc_rmt_addr.s_addr;
+	p.iph.saddr = v->vifc_lcl_addr.s_addr;
+	p.iph.version = 4;
+	p.iph.ihl = 5;
+	p.iph.protocol = IPPROTO_IPIP;
+	sprintf(p.name, "dvmrp%d", v->vifc_vifi);
+
+	if (!tunnel_dev->netdev_ops->ndo_tunnel_ctl)
+		goto out;
+	err = tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p,
+			SIOCADDTUNNEL);
+	if (err)
+		goto out;
+
+	/* Look the freshly created tunnel up by the name requested above */
+	new_dev = __dev_get_by_name(net, p.name);
+	if (!new_dev)
+		goto out;
+
+	new_dev->flags |= IFF_MULTICAST;
+	if (!ipmr_init_vif_indev(new_dev))
+		goto out_unregister;
+	if (dev_open(new_dev, NULL))
+		goto out_unregister;
+	dev_hold(new_dev);
+	err = dev_set_allmulti(new_dev, 1);
+	if (err) {
+		/* Undo open, tunnel creation and the reference taken above */
+		dev_close(new_dev);
+		tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p,
+				SIOCDELTUNNEL);
+		dev_put(new_dev);
+		new_dev = ERR_PTR(err);
+	}
+	return new_dev;
+
+out_unregister:
+	unregister_netdevice(new_dev);
+out:
+	return ERR_PTR(-ENOBUFS);
+}
+
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+/* ndo_start_xmit handler of the PIM register device.
+ *
+ * The table is selected with ipmr_fib_lookup() using the device ifindex
+ * as output interface, the skb's incoming interface (or
+ * LOOPBACK_IFINDEX if unset) and the skb mark. The packet is then
+ * reported to the routing daemon as IGMPMSG_WHOLEPKT and freed.
+ *
+ * Returns NETDEV_TX_OK, or the negative lookup error (the skb is freed
+ * in that case as well).
+ */
+static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct mr_table *mrt;
+	struct flowi4 fl4 = {
+		.flowi4_oif	= dev->ifindex,
+		.flowi4_iif	= skb->skb_iif ? : LOOPBACK_IFINDEX,
+		.flowi4_mark	= skb->mark,
+	};
+	int err;
+
+	err = ipmr_fib_lookup(net, &fl4, &mrt);
+	if (err < 0) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	DEV_STATS_ADD(dev, tx_bytes, skb->len);
+	DEV_STATS_INC(dev, tx_packets);
+	/* ipmr_cache_report() must be called under rcu_read_lock() */
+	rcu_read_lock();
+
+	/* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
+	ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
+			  IGMPMSG_WHOLEPKT);
+
+	rcu_read_unlock();
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* ndo_get_iflink handler of the PIM register device: always reports 0. */
+static int reg_vif_get_iflink(const struct net_device *dev)
+{
+	return 0;
+}
+
+/* Device operations of the PIM register device; installed by
+ * reg_vif_setup().
+ */
+static const struct net_device_ops reg_vif_netdev_ops = {
+	.ndo_start_xmit	= reg_vif_xmit,
+	.ndo_get_iflink = reg_vif_get_iflink,
+};
+
+/* alloc_netdev() setup callback for the PIM register device (see
+ * ipmr_reg_vif()): installs the device ops and fills in type, flags,
+ * features and MTU.
+ */
+static void reg_vif_setup(struct net_device *dev)
+{
+	dev->netdev_ops		= &reg_vif_netdev_ops;
+	dev->needs_free_netdev	= true;
+
+	dev->type		= ARPHRD_PIMREG;
+	dev->flags		= IFF_NOARP;
+	dev->features		|= NETIF_F_NETNS_LOCAL;
+
+	/* Ethernet payload minus an IPv4 header and 8 further bytes
+	 * (presumably the PIM register header - TODO confirm)
+	 */
+	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
+}
+
+/* Create, register and open the PIM register device for table @mrt.
+ *
+ * The device is called "pimreg" for RT_TABLE_DEFAULT and
+ * "pimreg<table id>" otherwise. After registration its in_device is
+ * initialised and the device is opened; a reference is taken before it
+ * is returned. ipmr_init_vif_indev() asserts RTNL, so the caller must
+ * hold it.
+ *
+ * Returns the device with a reference held, or NULL on any failure
+ * (the device is freed or unregistered again as appropriate).
+ */
+static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
+{
+	struct net_device *dev;
+	char name[IFNAMSIZ];
+
+	/* Format with an explicit bound. A full u32 id would need 17 bytes
+	 * ("pimreg" + 10 digits + NUL); the range check that prevents this
+	 * lives in ipmr_new_table(), so do not rely on it alone here.
+	 */
+	if (mrt->id == RT_TABLE_DEFAULT)
+		snprintf(name, sizeof(name), "pimreg");
+	else
+		snprintf(name, sizeof(name), "pimreg%u", mrt->id);
+
+	dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
+
+	if (!dev)
+		return NULL;
+
+	dev_net_set(dev, net);
+
+	/* Not yet registered: plain free_netdev() is the right cleanup */
+	if (register_netdevice(dev)) {
+		free_netdev(dev);
+		return NULL;
+	}
+
+	if (!ipmr_init_vif_indev(dev))
+		goto failure;
+	if (dev_open(dev, NULL))
+		goto failure;
+
+	dev_hold(dev);
+
+	return dev;
+
+failure:
+	unregister_netdevice(dev);
+	return NULL;
+}
+
+/* Decapsulate a PIM register message and re-inject the inner packet as
+ * if it had been received on the table's register VIF device.
+ *
+ * @pimlen is the number of bytes between the transport header and the
+ * encapsulated IPv4 header.
+ *
+ * Returns 1 if the packet is rejected (see the checks below) or the
+ * table has no register device, and NET_RX_SUCCESS once the inner
+ * packet has been handed to netif_rx().
+ *
+ * called with rcu_read_lock()
+ */
+static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
+		     unsigned int pimlen)
+{
+	struct net_device *reg_dev = NULL;
+	struct iphdr *encap;
+	int vif_num;
+
+	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
+	/* Check that:
+	 * a. packet is really sent to a multicast group
+	 * b. packet is not a NULL-REGISTER
+	 * c. packet is not truncated
+	 */
+	if (!ipv4_is_multicast(encap->daddr) ||
+	    encap->tot_len == 0 ||
+	    ntohs(encap->tot_len) + pimlen > skb->len)
+		return 1;
+
+	/* Pairs with WRITE_ONCE() in vif_add()/vif_delete() */
+	vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
+	if (vif_num >= 0)
+		reg_dev = vif_dev_read(&mrt->vif_table[vif_num]);
+	if (!reg_dev)
+		return 1;
+
+	/* Strip everything in front of the inner IPv4 header and make that
+	 * header the network header of the skb.
+	 */
+	skb->mac_header = skb->network_header;
+	skb_pull(skb, (u8 *)encap - skb->data);
+	skb_reset_network_header(skb);
+	skb->protocol = htons(ETH_P_IP);
+	skb->ip_summed = CHECKSUM_NONE;
+
+	skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
+
+	netif_rx(skb);
+
+	return NET_RX_SUCCESS;
+}
+#else
+/* Stub used when neither CONFIG_IP_PIMSM_V1 nor CONFIG_IP_PIMSM_V2 is
+ * enabled: no register device can be created, so NULL is returned.
+ */
+static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
+{
+	return NULL;
+}
+#endif
+
+/* Forward a VIF event to mr_call_vif_notifiers() for the
+ * RTNL_FAMILY_IPMR family, passing the netns' ipmr_seq counter along.
+ * Returns that function's result.
+ */
+static int call_ipmr_vif_entry_notifiers(struct net *net,
+					 enum fib_event_type event_type,
+					 struct vif_device *vif,
+					 struct net_device *vif_dev,
+					 vifi_t vif_index, u32 tb_id)
+{
+	return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type,
+				     vif, vif_dev, vif_index, tb_id,
+				     &net->ipv4.ipmr_seq);
+}
+
+/* Forward an MFC entry event to mr_call_mfc_notifiers() for the
+ * RTNL_FAMILY_IPMR family, passing the generic part of @mfc and the
+ * netns' ipmr_seq counter. Returns that function's result.
+ */
+static int call_ipmr_mfc_entry_notifiers(struct net *net,
+					 enum fib_event_type event_type,
+					 struct mfc_cache *mfc, u32 tb_id)
+{
+	return mr_call_mfc_notifiers(net, RTNL_FAMILY_IPMR, event_type,
+				     &mfc->_c, tb_id, &net->ipv4.ipmr_seq);
+}
+
+/**
+ *	vif_delete - Delete a VIF entry
+ *	@mrt: Table to delete from
+ *	@vifi: VIF identifier to delete
+ *	@notify: Set to 1, if the caller is a notifier_call
+ *	@head: if unregistering the VIF, place it on this queue
+ *
+ *	The device pointer is read with rtnl_dereference(), so RTNL must be
+ *	held. Tunnel and register devices are queued for unregistration
+ *	unless @notify is set.
+ *
+ *	Return: 0 on success, -EADDRNOTAVAIL if @vifi is out of range or
+ *	the slot has no device.
+ */
+static int vif_delete(struct mr_table *mrt, int vifi, int notify,
+		      struct list_head *head)
+{
+	struct net *net = read_pnet(&mrt->net);
+	struct vif_device *v;
+	struct net_device *dev;
+	struct in_device *in_dev;
+
+	if (vifi < 0 || vifi >= mrt->maxvif)
+		return -EADDRNOTAVAIL;
+
+	v = &mrt->vif_table[vifi];
+
+	dev = rtnl_dereference(v->dev);
+	if (!dev)
+		return -EADDRNOTAVAIL;
+
+	/* Notify and clear the slot under mrt_lock */
+	spin_lock(&mrt_lock);
+	call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, dev,
+				      vifi, mrt->id);
+	RCU_INIT_POINTER(v->dev, NULL);
+
+	if (vifi == mrt->mroute_reg_vif_num) {
+		/* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
+		WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
+	}
+	if (vifi + 1 == mrt->maxvif) {
+		int tmp;
+
+		/* The highest slot went away: shrink maxvif down to one
+		 * past the highest slot that is still in use.
+		 */
+		for (tmp = vifi - 1; tmp >= 0; tmp--) {
+			if (VIF_EXISTS(mrt, tmp))
+				break;
+		}
+		WRITE_ONCE(mrt->maxvif, tmp + 1);
+	}
+
+	spin_unlock(&mrt_lock);
+
+	/* Undo the allmulti and MC_FORWARDING increments done by vif_add() */
+	dev_set_allmulti(dev, -1);
+
+	in_dev = __in_dev_get_rtnl(dev);
+	if (in_dev) {
+		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
+		inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
+					    NETCONFA_MC_FORWARDING,
+					    dev->ifindex, &in_dev->cnf);
+		ip_rt_multicast_event(in_dev);
+	}
+
+	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
+		unregister_netdevice_queue(dev, head);
+
+	/* Drop the device reference tracked in the VIF slot */
+	netdev_put(dev, &v->dev_tracker);
+	return 0;
+}
+
+/* RCU callback: return the cache entry that embeds @head to the
+ * mrt_cachep slab cache.
+ */
+static void ipmr_cache_free_rcu(struct rcu_head *head)
+{
+	struct mr_mfc *entry;
+
+	entry = container_of(head, struct mr_mfc, rcu);
+	kmem_cache_free(mrt_cachep, (struct mfc_cache *)entry);
+}
+
+/* Schedule @c to be freed by ipmr_cache_free_rcu() through call_rcu(). */
+static void ipmr_cache_free(struct mfc_cache *c)
+{
+	call_rcu(&c->_c.rcu, ipmr_cache_free_rcu);
+}
+
+/* Destroy an unresolved cache entry, killing queued skbs
+ * and reporting error to netlink readers.
+ *
+ * Decrements the table's cache_resolve_queue_len, drains the entry's
+ * queue and finally frees the entry via ipmr_cache_free(). Unlinking
+ * the entry from mfc_unres_queue is left to the caller.
+ */
+static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
+{
+	struct net *net = read_pnet(&mrt->net);
+	struct sk_buff *skb;
+	struct nlmsgerr *e;
+
+	atomic_dec(&mrt->cache_resolve_queue_len);
+
+	while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) {
+		if (ip_hdr(skb)->version == 0) {
+			/* IP version 0 marks a queued netlink request rather
+			 * than a data packet: turn it into an NLMSG_ERROR
+			 * reply carrying -ETIMEDOUT and send it back.
+			 */
+			struct nlmsghdr *nlh = skb_pull(skb,
+							sizeof(struct iphdr));
+			nlh->nlmsg_type = NLMSG_ERROR;
+			nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
+			skb_trim(skb, nlh->nlmsg_len);
+			e = nlmsg_data(nlh);
+			e->error = -ETIMEDOUT;
+			memset(&e->msg, 0, sizeof(e->msg));
+
+			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
+		} else {
+			kfree_skb(skb);
+		}
+	}
+
+	ipmr_cache_free(c);
+}
+
+/* Timer process for the unresolved queue.
+ *
+ * Destroys every unresolved entry whose expiry time has passed and
+ * re-arms the timer for the earliest remaining expiry (at most 10
+ * seconds ahead). If mfc_unres_lock is contended the work is retried
+ * HZ/10 jiffies later instead of spinning.
+ */
+static void ipmr_expire_process(struct timer_list *t)
+{
+	struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
+	struct mr_mfc *c, *next;
+	unsigned long expires;
+	unsigned long now;
+
+	if (!spin_trylock(&mfc_unres_lock)) {
+		mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
+		return;
+	}
+
+	if (list_empty(&mrt->mfc_unres_queue))
+		goto out;
+
+	now = jiffies;
+	expires = 10*HZ;
+
+	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+		if (time_after(c->mfc_un.unres.expires, now)) {
+			/* Not expired yet: remember the shortest remaining
+			 * interval for re-arming the timer.
+			 */
+			unsigned long interval = c->mfc_un.unres.expires - now;
+			if (interval < expires)
+				expires = interval;
+			continue;
+		}
+
+		list_del(&c->list);
+		mroute_netlink_event(mrt, (struct mfc_cache *)c, RTM_DELROUTE);
+		ipmr_destroy_unres(mrt, (struct mfc_cache *)c);
+	}
+
+	if (!list_empty(&mrt->mfc_unres_queue))
+		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
+
+out:
+	spin_unlock(&mfc_unres_lock);
+}
+
+/* Rebuild the output interface list of a resolved cache entry from the
+ * user supplied per-VIF TTL thresholds. Called with mrt_lock held.
+ *
+ * Every threshold starts out as 255. A VIF is taken over only if it
+ * exists and its entry in @ttls is neither 0 nor 255; minvif/maxvif are
+ * updated to span the VIFs taken over (maxvif is exclusive). lastuse is
+ * set to the current jiffies.
+ */
+static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
+				   unsigned char *ttls)
+{
+	int vifi;
+
+	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
+	cache->mfc_un.res.maxvif = 0;
+	cache->mfc_un.res.minvif = MAXVIFS;
+
+	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
+		unsigned char ttl;
+
+		if (!VIF_EXISTS(mrt, vifi))
+			continue;
+
+		ttl = ttls[vifi];
+		if (ttl == 0 || ttl == 255)
+			continue;
+
+		cache->mfc_un.res.ttls[vifi] = ttl;
+		if (vifi < cache->mfc_un.res.minvif)
+			cache->mfc_un.res.minvif = vifi;
+		if (vifi >= cache->mfc_un.res.maxvif)
+			cache->mfc_un.res.maxvif = vifi + 1;
+	}
+
+	cache->mfc_un.res.lastuse = jiffies;
+}
+
+/* Add the virtual interface described by @vifc to @mrt.
+ *
+ * @mrtsock is non-zero when the request was issued on the table's own
+ * mroute socket; otherwise the VIF is additionally flagged VIFF_STATIC.
+ *
+ * The backing device depends on vifc->vifc_flags:
+ *   VIFF_REGISTER    - a new PIM register device (one per table),
+ *   VIFF_TUNNEL      - a new "dvmrp" tunnel device,
+ *   VIFF_USE_IFINDEX - the existing device with index vifc_lcl_ifindex,
+ *   0                - the existing device owning address vifc_lcl_addr.
+ * Any other flag combination is rejected.
+ *
+ * The slot &mrt->vif_table[vifc->vifc_vifi] is computed before any
+ * check, so the caller must have validated vifc_vifi against MAXVIFS.
+ * __in_dev_get_rtnl() is used, so RTNL must be held.
+ *
+ * Returns 0 on success or a negative errno (-EADDRINUSE, -EINVAL,
+ * -ENOBUFS, -EADDRNOTAVAIL, or an error from the device helpers).
+ */
+static int vif_add(struct net *net, struct mr_table *mrt,
+		   struct vifctl *vifc, int mrtsock)
+{
+	struct netdev_phys_item_id ppid = { };
+	int vifi = vifc->vifc_vifi;
+	struct vif_device *v = &mrt->vif_table[vifi];
+	struct net_device *dev;
+	struct in_device *in_dev;
+	int err;
+
+	/* Is vif busy ? */
+	if (VIF_EXISTS(mrt, vifi))
+		return -EADDRINUSE;
+
+	switch (vifc->vifc_flags) {
+	case VIFF_REGISTER:
+		if (!ipmr_pimsm_enabled())
+			return -EINVAL;
+		/* Special Purpose VIF in PIM
+		 * All the packets will be sent to the daemon
+		 */
+		if (mrt->mroute_reg_vif_num >= 0)
+			return -EADDRINUSE;
+		dev = ipmr_reg_vif(net, mrt);
+		if (!dev)
+			return -ENOBUFS;
+		err = dev_set_allmulti(dev, 1);
+		if (err) {
+			unregister_netdevice(dev);
+			dev_put(dev);
+			return err;
+		}
+		break;
+	case VIFF_TUNNEL:
+		/* ipmr_new_tunnel() already enables allmulti on success */
+		dev = ipmr_new_tunnel(net, vifc);
+		if (IS_ERR(dev))
+			return PTR_ERR(dev);
+		break;
+	case VIFF_USE_IFINDEX:
+	case 0:
+		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
+			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
+			if (dev && !__in_dev_get_rtnl(dev)) {
+				dev_put(dev);
+				return -EADDRNOTAVAIL;
+			}
+		} else {
+			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
+		}
+		if (!dev)
+			return -EADDRNOTAVAIL;
+		err = dev_set_allmulti(dev, 1);
+		if (err) {
+			dev_put(dev);
+			return err;
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	in_dev = __in_dev_get_rtnl(dev);
+	if (!in_dev) {
+		dev_put(dev);
+		return -EADDRNOTAVAIL;
+	}
+	/* Account for one more multicast forwarding user on this device */
+	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
+	inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING,
+				    dev->ifindex, &in_dev->cnf);
+	ip_rt_multicast_event(in_dev);
+
+	/* Fill in the VIF structures */
+	vif_device_init(v, dev, vifc->vifc_rate_limit,
+			vifc->vifc_threshold,
+			vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0),
+			(VIFF_TUNNEL | VIFF_REGISTER));
+
+	/* Record the port parent id if the device reports one */
+	err = dev_get_port_parent_id(dev, &ppid, true);
+	if (err == 0) {
+		memcpy(v->dev_parent_id.id, ppid.id, ppid.id_len);
+		v->dev_parent_id.id_len = ppid.id_len;
+	} else {
+		v->dev_parent_id.id_len = 0;
+	}
+
+	v->local = vifc->vifc_lcl_addr.s_addr;
+	v->remote = vifc->vifc_rmt_addr.s_addr;
+
+	/* And finish update writing critical data */
+	spin_lock(&mrt_lock);
+	rcu_assign_pointer(v->dev, dev);
+	netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
+	if (v->flags & VIFF_REGISTER) {
+		/* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
+		WRITE_ONCE(mrt->mroute_reg_vif_num, vifi);
+	}
+	if (vifi+1 > mrt->maxvif)
+		WRITE_ONCE(mrt->maxvif, vifi + 1);
+	spin_unlock(&mrt_lock);
+	call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev,
+				      vifi, mrt->id);
+	return 0;
+}
+
+/* Look up the cache entry for (@origin, @mcastgrp) in @mrt via
+ * mr_mfc_find() and return its result.
+ *
+ * called with rcu_read_lock()
+ */
+static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
+					 __be32 origin,
+					 __be32 mcastgrp)
+{
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = mcastgrp,
+			.mfc_origin = origin
+	};
+
+	return mr_mfc_find(mrt, &arg);
+}
+
+/* Look for a (*,G) entry for @mcastgrp and @vifi. A group of INADDR_ANY
+ * is delegated to mr_mfc_find_any_parent(); any other group is looked
+ * up with mr_mfc_find_any() using an INADDR_ANY origin.
+ */
+static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
+					     __be32 mcastgrp, int vifi)
+{
+	if (mcastgrp != htonl(INADDR_ANY)) {
+		struct mfc_cache_cmp_arg key = {
+			.mfc_mcastgrp = mcastgrp,
+			.mfc_origin = htonl(INADDR_ANY),
+		};
+
+		return mr_mfc_find_any(mrt, vifi, &key);
+	}
+
+	return mr_mfc_find_any_parent(mrt, vifi);
+}
+
+/* Look for a (S,G,iif) entry if parent != -1.
+ * Builds the (@origin, @mcastgrp) key and returns the result of
+ * mr_mfc_find_parent() for it and @parent.
+ */
+static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
+						__be32 origin, __be32 mcastgrp,
+						int parent)
+{
+	struct mfc_cache_cmp_arg arg = {
+			.mfc_mcastgrp = mcastgrp,
+			.mfc_origin = origin,
+	};
+
+	return mr_mfc_find_parent(mrt, &arg, parent);
+}
+
+/* Allocate a zeroed, resolved multicast cache entry (GFP_KERNEL).
+ *
+ * last_assert is placed just beyond MFC_ASSERT_THRESH in the past,
+ * minvif starts at MAXVIFS, the free callback is ipmr_cache_free_rcu()
+ * and the reference count is 1. Returns NULL if the allocation fails.
+ */
+static struct mfc_cache *ipmr_cache_alloc(void)
+{
+	struct mfc_cache *entry;
+
+	entry = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
+	if (!entry)
+		return NULL;
+
+	entry->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
+	entry->_c.mfc_un.res.minvif = MAXVIFS;
+	entry->_c.free = ipmr_cache_free_rcu;
+	refcount_set(&entry->_c.mfc_un.res.refcount, 1);
+
+	return entry;
+}
+
+static struct mfc_cache *ipmr_cache_alloc_unres(void)
+{
+	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
+
+	if (c) {
+		skb_queue_head_init(&c->_c.mfc_un.unres.unresolved);
+		c->_c.mfc_un.unres.expires = jiffies + 10 * HZ;
+	}
+	return c;
+}
+
+/* A cache entry has gone into a resolved state from queued.
+ *
+ * Drains the skb queue of the unresolved entry @uc using the resolved
+ * entry @c: queued netlink requests (IP version 0) are answered with
+ * the route filled in from @c, or with an -EMSGSIZE error if filling
+ * fails; real packets are forwarded through ip_mr_forward().
+ * @uc itself is not freed here.
+ */
+static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
+			       struct mfc_cache *uc, struct mfc_cache *c)
+{
+	struct sk_buff *skb;
+	struct nlmsgerr *e;
+
+	/* Play the pending entries through our router */
+	while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) {
+		if (ip_hdr(skb)->version == 0) {
+			struct nlmsghdr *nlh = skb_pull(skb,
+							sizeof(struct iphdr));
+
+			if (mr_fill_mroute(mrt, skb, &c->_c,
+					   nlmsg_data(nlh)) > 0) {
+				nlh->nlmsg_len = skb_tail_pointer(skb) -
+						 (u8 *)nlh;
+			} else {
+				/* Could not fill the reply: report an error */
+				nlh->nlmsg_type = NLMSG_ERROR;
+				nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
+				skb_trim(skb, nlh->nlmsg_len);
+				e = nlmsg_data(nlh);
+				e->error = -EMSGSIZE;
+				memset(&e->msg, 0, sizeof(e->msg));
+			}
+
+			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
+		} else {
+			rcu_read_lock();
+			ip_mr_forward(net, mrt, skb->dev, skb, c, 0);
+			rcu_read_unlock();
+		}
+	}
+}
+
+/* Bounce a cache query up to mrouted and netlink.
+ *
+ * Builds an igmpmsg for @pkt and queues it on the table's mroute
+ * socket. @assert selects the message type:
+ *  - IGMPMSG_WHOLEPKT / IGMPMSG_WRVIFWHOLE: the whole packet is copied
+ *    and prefixed with a duplicated IP header that carries the igmpmsg
+ *    fields;
+ *  - anything else: only the IP header of @pkt is copied into a fresh
+ *    128-byte skb, followed by an igmphdr whose type is @assert.
+ * @pkt itself is not consumed.
+ *
+ * Returns -EINVAL if the table has no mroute socket, -ENOBUFS if no skb
+ * could be allocated, otherwise the result of sock_queue_rcv_skb().
+ *
+ * Called under rcu_read_lock().
+ */
+static int ipmr_cache_report(const struct mr_table *mrt,
+			     struct sk_buff *pkt, vifi_t vifi, int assert)
+{
+	const int ihl = ip_hdrlen(pkt);
+	struct sock *mroute_sk;
+	struct igmphdr *igmp;
+	struct igmpmsg *msg;
+	struct sk_buff *skb;
+	int ret;
+
+	mroute_sk = rcu_dereference(mrt->mroute_sk);
+	if (!mroute_sk)
+		return -EINVAL;
+
+	if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
+		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
+	else
+		skb = alloc_skb(128, GFP_ATOMIC);
+
+	if (!skb)
+		return -ENOBUFS;
+
+	if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) {
+		/* Ugly, but we have no choice with this interface.
+		 * Duplicate old header, fix ihl, length etc.
+		 * And all this only to mangle msg->im_msgtype and
+		 * to set msg->im_mbz to "mbz" :-)
+		 */
+		skb_push(skb, sizeof(struct iphdr));
+		skb_reset_network_header(skb);
+		skb_reset_transport_header(skb);
+		msg = (struct igmpmsg *)skb_network_header(skb);
+		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
+		msg->im_msgtype = assert;
+		msg->im_mbz = 0;
+		if (assert == IGMPMSG_WRVIFWHOLE) {
+			msg->im_vif = vifi;
+			msg->im_vif_hi = vifi >> 8;
+		} else {
+			/* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
+			int vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
+
+			msg->im_vif = vif_num;
+			msg->im_vif_hi = vif_num >> 8;
+		}
+		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
+		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
+					     sizeof(struct iphdr));
+	} else {
+		/* Copy the IP header */
+		skb_set_network_header(skb, skb->len);
+		skb_put(skb, ihl);
+		skb_copy_to_linear_data(skb, pkt->data, ihl);
+		/* Flag to the kernel this is a route add */
+		ip_hdr(skb)->protocol = 0;
+		msg = (struct igmpmsg *)skb_network_header(skb);
+		msg->im_vif = vifi;
+		msg->im_vif_hi = vifi >> 8;
+		ipv4_pktinfo_prepare(mroute_sk, pkt);
+		memcpy(skb->cb, pkt->cb, sizeof(skb->cb));
+		/* Add our header */
+		igmp = skb_put(skb, sizeof(struct igmphdr));
+		igmp->type = assert;
+		msg->im_msgtype = assert;
+		igmp->code = 0;
+		ip_hdr(skb)->tot_len = htons(skb->len);	/* Fix the length */
+		skb->transport_header = skb->network_header;
+	}
+
+	igmpmsg_netlink_event(mrt, skb);
+
+	/* Deliver to mrouted */
+	ret = sock_queue_rcv_skb(mroute_sk, skb);
+
+	if (ret < 0) {
+		net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
+		kfree_skb(skb);
+	}
+
+	return ret;
+}
+
+/* Queue a packet on the unresolved entry for its (source, group) pair,
+ * creating that entry first if none exists.
+ *
+ * A newly created entry is reported to the routing daemon as
+ * IGMPMSG_NOCACHE; if that report fails the entry is discarded. The
+ * expiry timer is armed when the first entry is queued. Once an entry
+ * already holds more than 3 packets, further packets are dropped.
+ *
+ * @skb is consumed on every path (queued or freed). If @dev is given it
+ * becomes the skb's device and incoming interface.
+ *
+ * Returns 0 if the packet was queued, -ENOBUFS if no entry could be
+ * allocated or the queue is full, or the error from the cache report.
+ *
+ * Called under rcu_read_lock()
+ */
+static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
+				 struct sk_buff *skb, struct net_device *dev)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct mfc_cache *c;
+	bool found = false;
+	int err;
+
+	spin_lock_bh(&mfc_unres_lock);
+	list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) {
+		if (c->mfc_mcastgrp == iph->daddr &&
+		    c->mfc_origin == iph->saddr) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		/* Create a new entry if allowable */
+		c = ipmr_cache_alloc_unres();
+		if (!c) {
+			spin_unlock_bh(&mfc_unres_lock);
+
+			kfree_skb(skb);
+			return -ENOBUFS;
+		}
+
+		/* Fill in the new cache entry */
+		c->_c.mfc_parent = -1;
+		c->mfc_origin	= iph->saddr;
+		c->mfc_mcastgrp	= iph->daddr;
+
+		/* Reflect first query at mrouted. */
+		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
+
+		if (err < 0) {
+			/* If the report failed throw the cache entry
+			   out - Brad Parker
+			 */
+			spin_unlock_bh(&mfc_unres_lock);
+
+			ipmr_cache_free(c);
+			kfree_skb(skb);
+			return err;
+		}
+
+		atomic_inc(&mrt->cache_resolve_queue_len);
+		list_add(&c->_c.list, &mrt->mfc_unres_queue);
+		mroute_netlink_event(mrt, c, RTM_NEWROUTE);
+
+		/* First unresolved entry: start the expiry timer */
+		if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
+			mod_timer(&mrt->ipmr_expire_timer,
+				  c->_c.mfc_un.unres.expires);
+	}
+
+	/* See if we can append the packet */
+	if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
+		kfree_skb(skb);
+		err = -ENOBUFS;
+	} else {
+		if (dev) {
+			skb->dev = dev;
+			skb->skb_iif = dev->ifindex;
+		}
+		skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
+		err = 0;
+	}
+
+	spin_unlock_bh(&mfc_unres_lock);
+	return err;
+}
+
+/* MFC cache manipulation by user space mroute daemon */
+
+/* Remove the resolved cache entry matching @mfc (origin, group) and
+ * @parent from @mrt: unlink it from the hash table and the cache list,
+ * emit the FIB and netlink delete notifications and drop the table's
+ * reference with mr_cache_put().
+ *
+ * Returns 0 on success or -ENOENT if no such entry exists.
+ */
+static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
+{
+	struct net *net = read_pnet(&mrt->net);
+	struct mfc_cache *c;
+
+	/* The entries are added/deleted only under RTNL */
+	rcu_read_lock();
+	c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+				   mfc->mfcc_mcastgrp.s_addr, parent);
+	rcu_read_unlock();
+	if (!c)
+		return -ENOENT;
+	rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ipmr_rht_params);
+	list_del_rcu(&c->_c.list);
+	call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id);
+	mroute_netlink_event(mrt, c, RTM_DELROUTE);
+	mr_cache_put(&c->_c);
+
+	return 0;
+}
+
+/* Add or replace the resolved cache entry described by @mfc.
+ *
+ * If an entry for (origin, group, @parent) already exists, its parent
+ * VIF and TTL thresholds are updated in place under mrt_lock and a
+ * REPLACE notification is sent. Otherwise a new entry is allocated,
+ * inserted into the hash table and cache list, and any matching
+ * unresolved entry is removed from the unresolved queue so that its
+ * pending packets can be played through the new route.
+ *
+ * @mrtsock is non-zero when the request came from the table's own
+ * mroute socket; otherwise the entry is flagged MFC_STATIC.
+ *
+ * Returns 0 on success, -ENFILE if mfcc_parent >= MAXVIFS, -EINVAL if a
+ * new entry's group is neither INADDR_ANY nor multicast, -ENOMEM on
+ * allocation failure, or the hash table insertion error.
+ */
+static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
+			struct mfcctl *mfc, int mrtsock, int parent)
+{
+	struct mfc_cache *uc, *c;
+	struct mr_mfc *_uc;
+	bool found;
+	int ret;
+
+	if (mfc->mfcc_parent >= MAXVIFS)
+		return -ENFILE;
+
+	/* The entries are added/deleted only under RTNL */
+	rcu_read_lock();
+	c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+				   mfc->mfcc_mcastgrp.s_addr, parent);
+	rcu_read_unlock();
+	if (c) {
+		/* Existing entry: update it in place */
+		spin_lock(&mrt_lock);
+		c->_c.mfc_parent = mfc->mfcc_parent;
+		ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
+		if (!mrtsock)
+			c->_c.mfc_flags |= MFC_STATIC;
+		spin_unlock(&mrt_lock);
+		call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
+					      mrt->id);
+		mroute_netlink_event(mrt, c, RTM_NEWROUTE);
+		return 0;
+	}
+
+	if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) &&
+	    !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
+		return -EINVAL;
+
+	c = ipmr_cache_alloc();
+	if (!c)
+		return -ENOMEM;
+
+	c->mfc_origin = mfc->mfcc_origin.s_addr;
+	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
+	c->_c.mfc_parent = mfc->mfcc_parent;
+	ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
+	if (!mrtsock)
+		c->_c.mfc_flags |= MFC_STATIC;
+
+	ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode,
+				  ipmr_rht_params);
+	if (ret) {
+		pr_err("ipmr: rhtable insert error %d\n", ret);
+		ipmr_cache_free(c);
+		return ret;
+	}
+	list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list);
+	/* Check to see if we resolved a queued list. If so we
+	 * need to send on the frames and tidy up.
+	 */
+	found = false;
+	spin_lock_bh(&mfc_unres_lock);
+	list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) {
+		uc = (struct mfc_cache *)_uc;
+		if (uc->mfc_origin == c->mfc_origin &&
+		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
+			list_del(&_uc->list);
+			atomic_dec(&mrt->cache_resolve_queue_len);
+			found = true;
+			break;
+		}
+	}
+	/* Nothing left to expire: stop the timer */
+	if (list_empty(&mrt->mfc_unres_queue))
+		del_timer(&mrt->ipmr_expire_timer);
+	spin_unlock_bh(&mfc_unres_lock);
+
+	/* uc is only valid here when found is true */
+	if (found) {
+		ipmr_cache_resolve(net, mrt, uc, c);
+		ipmr_cache_free(uc);
+	}
+	call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id);
+	mroute_netlink_event(mrt, c, RTM_NEWROUTE);
+	return 0;
+}
+
+/* Close the multicast socket, and clear the vif tables etc.
+ *
+ * @flags is a mask of MRT_FLUSH_* bits selecting what is removed:
+ *   MRT_FLUSH_VIFS        - VIFs without VIFF_STATIC
+ *   MRT_FLUSH_VIFS_STATIC - VIFs with VIFF_STATIC
+ *   MRT_FLUSH_MFC         - cache entries without MFC_STATIC, plus all
+ *                           unresolved entries
+ *   MRT_FLUSH_MFC_STATIC  - cache entries with MFC_STATIC
+ */
+static void mroute_clean_tables(struct mr_table *mrt, int flags)
+{
+	struct net *net = read_pnet(&mrt->net);
+	struct mr_mfc *c, *tmp;
+	struct mfc_cache *cache;
+	LIST_HEAD(list);
+	int i;
+
+	/* Shut down all active vif entries */
+	if (flags & (MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC)) {
+		for (i = 0; i < mrt->maxvif; i++) {
+			/* Skip VIFs whose static/non-static class was not
+			 * selected by @flags.
+			 */
+			if (((mrt->vif_table[i].flags & VIFF_STATIC) &&
+			     !(flags & MRT_FLUSH_VIFS_STATIC)) ||
+			    (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS)))
+				continue;
+			vif_delete(mrt, i, 0, &list);
+		}
+		/* Unregister the devices queued by vif_delete() in one go */
+		unregister_netdevice_many(&list);
+	}
+
+	/* Wipe the cache */
+	if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) {
+		list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+			if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) ||
+			    (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC)))
+				continue;
+			rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+			list_del_rcu(&c->list);
+			cache = (struct mfc_cache *)c;
+			call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache,
+						      mrt->id);
+			mroute_netlink_event(mrt, cache, RTM_DELROUTE);
+			mr_cache_put(c);
+		}
+	}
+
+	/* Drop every entry still waiting for resolution */
+	if (flags & MRT_FLUSH_MFC) {
+		if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+			spin_lock_bh(&mfc_unres_lock);
+			list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
+				list_del(&c->list);
+				cache = (struct mfc_cache *)c;
+				mroute_netlink_event(mrt, cache, RTM_DELROUTE);
+				ipmr_destroy_unres(mrt, cache);
+			}
+			spin_unlock_bh(&mfc_unres_lock);
+		}
+	}
+}
+
+/* Destructor registered with ip_ra_control() by MRT_INIT. For every
+ * table whose mroute socket is @sk, the global MC_FORWARDING count is
+ * decremented (with a netconf notification), the socket pointer is
+ * cleared and the non-static VIFs and MFC entries are flushed.
+ * Takes and releases RTNL itself.
+ *
+ * called from ip_ra_control(), before an RCU grace period,
+ * we don't need to call synchronize_rcu() here
+ */
+static void mrtsock_destruct(struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+	struct mr_table *mrt;
+
+	rtnl_lock();
+	ipmr_for_each_table(mrt, net) {
+		if (sk != rtnl_dereference(mrt->mroute_sk))
+			continue;
+
+		IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
+		inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+					    NETCONFA_MC_FORWARDING,
+					    NETCONFA_IFINDEX_ALL,
+					    net->ipv4.devconf_all);
+
+		RCU_INIT_POINTER(mrt->mroute_sk, NULL);
+		mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC);
+	}
+	rtnl_unlock();
+}
+
+/* Socket options and virtual interface manipulation. The whole
+ * virtual interface system is a complete heap, but unfortunately
+ * that's how BSD mrouted happens to think. Maybe one day with a proper
+ * MOSPF/PIM router set up we can clean this up.
+ */
+
+/* Handle the MRT_* setsockopt() requests of the multicast routing API.
+ *
+ * Only SOCK_RAW sockets bound to IPPROTO_IGMP are accepted
+ * (-EOPNOTSUPP otherwise). The table operated on is the one selected by
+ * raw_sk(sk)->ipmr_table, or RT_TABLE_DEFAULT if that is 0 (-ENOENT if
+ * it does not exist). Every option except MRT_INIT requires @sk to be
+ * the table's mroute socket or the caller to have CAP_NET_ADMIN in the
+ * netns' user namespace (-EACCES otherwise).
+ *
+ * RTNL is taken on entry and released before returning; MRT_DONE drops
+ * it early because ip_ra_control() ends up in mrtsock_destruct(), which
+ * takes RTNL itself.
+ *
+ * Returns 0 on success or a negative errno; unknown options yield
+ * -ENOPROTOOPT.
+ */
+int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
+			 unsigned int optlen)
+{
+	struct net *net = sock_net(sk);
+	int val, ret = 0, parent = 0;
+	struct mr_table *mrt;
+	struct vifctl vif;
+	struct mfcctl mfc;
+	bool do_wrvifwhole;
+	u32 uval;
+
+	/* There's one exception to the lock - MRT_DONE which needs to unlock */
+	rtnl_lock();
+	if (sk->sk_type != SOCK_RAW ||
+	    inet_sk(sk)->inet_num != IPPROTO_IGMP) {
+		ret = -EOPNOTSUPP;
+		goto out_unlock;
+	}
+
+	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+	if (!mrt) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+	if (optname != MRT_INIT) {
+		if (sk != rcu_access_pointer(mrt->mroute_sk) &&
+		    !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+			ret = -EACCES;
+			goto out_unlock;
+		}
+	}
+
+	switch (optname) {
+	case MRT_INIT:
+		/* Make @sk the mroute socket of the table */
+		if (optlen != sizeof(int)) {
+			ret = -EINVAL;
+			break;
+		}
+		if (rtnl_dereference(mrt->mroute_sk)) {
+			ret = -EADDRINUSE;
+			break;
+		}
+
+		ret = ip_ra_control(sk, 1, mrtsock_destruct);
+		if (ret == 0) {
+			rcu_assign_pointer(mrt->mroute_sk, sk);
+			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
+			inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
+						    NETCONFA_MC_FORWARDING,
+						    NETCONFA_IFINDEX_ALL,
+						    net->ipv4.devconf_all);
+		}
+		break;
+	case MRT_DONE:
+		if (sk != rcu_access_pointer(mrt->mroute_sk)) {
+			ret = -EACCES;
+		} else {
+			/* We need to unlock here because mrtsock_destruct takes
+			 * care of rtnl itself and we can't change that due to
+			 * the IP_ROUTER_ALERT setsockopt which runs without it.
+			 */
+			rtnl_unlock();
+			ret = ip_ra_control(sk, 0, NULL);
+			goto out;
+		}
+		break;
+	case MRT_ADD_VIF:
+	case MRT_DEL_VIF:
+		if (optlen != sizeof(vif)) {
+			ret = -EINVAL;
+			break;
+		}
+		if (copy_from_sockptr(&vif, optval, sizeof(vif))) {
+			ret = -EFAULT;
+			break;
+		}
+		/* vif_add()/vif_delete() index vif_table[] with this value */
+		if (vif.vifc_vifi >= MAXVIFS) {
+			ret = -ENFILE;
+			break;
+		}
+		if (optname == MRT_ADD_VIF) {
+			ret = vif_add(net, mrt, &vif,
+				      sk == rtnl_dereference(mrt->mroute_sk));
+		} else {
+			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
+		}
+		break;
+	/* Manipulate the forwarding caches. These live
+	 * in a sort of kernel/user symbiosis.
+	 */
+	case MRT_ADD_MFC:
+	case MRT_DEL_MFC:
+		/* Non-proxy variants match entries regardless of parent */
+		parent = -1;
+		fallthrough;
+	case MRT_ADD_MFC_PROXY:
+	case MRT_DEL_MFC_PROXY:
+		if (optlen != sizeof(mfc)) {
+			ret = -EINVAL;
+			break;
+		}
+		if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) {
+			ret = -EFAULT;
+			break;
+		}
+		/* parent is still 0 only for the _PROXY options */
+		if (parent == 0)
+			parent = mfc.mfcc_parent;
+		if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
+			ret = ipmr_mfc_delete(mrt, &mfc, parent);
+		else
+			ret = ipmr_mfc_add(net, mrt, &mfc,
+					   sk == rtnl_dereference(mrt->mroute_sk),
+					   parent);
+		break;
+	case MRT_FLUSH:
+		/* val is passed on as the MRT_FLUSH_* mask */
+		if (optlen != sizeof(val)) {
+			ret = -EINVAL;
+			break;
+		}
+		if (copy_from_sockptr(&val, optval, sizeof(val))) {
+			ret = -EFAULT;
+			break;
+		}
+		mroute_clean_tables(mrt, val);
+		break;
+	/* Control PIM assert. */
+	case MRT_ASSERT:
+		if (optlen != sizeof(val)) {
+			ret = -EINVAL;
+			break;
+		}
+		if (copy_from_sockptr(&val, optval, sizeof(val))) {
+			ret = -EFAULT;
+			break;
+		}
+		mrt->mroute_do_assert = val;
+		break;
+	case MRT_PIM:
+		if (!ipmr_pimsm_enabled()) {
+			ret = -ENOPROTOOPT;
+			break;
+		}
+		if (optlen != sizeof(val)) {
+			ret = -EINVAL;
+			break;
+		}
+		if (copy_from_sockptr(&val, optval, sizeof(val))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		/* Any non-zero value enables PIM; IGMPMSG_WRVIFWHOLE
+		 * additionally selects whole-packet wrong-VIF reports.
+		 * The settings only change when the on/off state changes.
+		 */
+		do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE);
+		val = !!val;
+		if (val != mrt->mroute_do_pim) {
+			mrt->mroute_do_pim = val;
+			mrt->mroute_do_assert = val;
+			mrt->mroute_do_wrvifwhole = do_wrvifwhole;
+		}
+		break;
+	case MRT_TABLE:
+		if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) {
+			ret = -ENOPROTOOPT;
+			break;
+		}
+		if (optlen != sizeof(uval)) {
+			ret = -EINVAL;
+			break;
+		}
+		if (copy_from_sockptr(&uval, optval, sizeof(uval))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		/* A socket that already owns a table cannot switch tables */
+		if (sk == rtnl_dereference(mrt->mroute_sk)) {
+			ret = -EBUSY;
+		} else {
+			mrt = ipmr_new_table(net, uval);
+			if (IS_ERR(mrt))
+				ret = PTR_ERR(mrt);
+			else
+				raw_sk(sk)->ipmr_table = uval;
+		}
+		break;
+	/* Spurious command, or MRT_VERSION which you cannot set. */
+	default:
+		ret = -ENOPROTOOPT;
+	}
+out_unlock:
+	rtnl_unlock();
+out:
+	return ret;
+}
+
+/* Execute the ioctl if it is one of the special mroute ioctls. */
+int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+	/* Each request is staged in a kernel buffer of the matching type;
+	 * these buffers will be consumed by ipmr_ioctl().
+	 */
+	if (cmd == SIOCGETVIFCNT) {
+		struct sioc_vif_req vif_req;
+
+		return sock_ioctl_inout(sk, cmd, arg, &vif_req,
+					sizeof(vif_req));
+	}
+	if (cmd == SIOCGETSGCNT) {
+		struct sioc_sg_req sg_req;
+
+		return sock_ioctl_inout(sk, cmd, arg, &sg_req,
+					sizeof(sg_req));
+	}
+	/* return code > 0 means that the ioctl was not executed */
+	return 1;
+}
+
+/* Getsockopt support (MRT_VERSION, MRT_PIM, MRT_ASSERT) for multicast routing. */
+int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
+			 sockptr_t optlen)
+{
+	int olr, val;
+	struct net *net = sock_net(sk);
+	struct mr_table *mrt;
+
+	if (sk->sk_type != SOCK_RAW ||
+	    inet_sk(sk)->inet_num != IPPROTO_IGMP)
+		return -EOPNOTSUPP;
+
+	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+	if (!mrt)
+		return -ENOENT;
+
+	switch (optname) {
+	case MRT_VERSION:
+		val = 0x0305;
+		break;
+	case MRT_PIM:
+		if (!ipmr_pimsm_enabled())
+			return -ENOPROTOOPT;
+		val = mrt->mroute_do_pim;
+		break;
+	case MRT_ASSERT:
+		val = mrt->mroute_do_assert;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (copy_from_sockptr(&olr, optlen, sizeof(int)))
+		return -EFAULT;
+	/* Must precede min_t(): the unsigned clamp would hide a negative length. */
+	if (olr < 0)
+		return -EINVAL;
+	olr = min_t(unsigned int, olr, sizeof(int));
+	if (copy_to_sockptr(optlen, &olr, sizeof(int)))
+		return -EFAULT;
+	if (copy_to_sockptr(optval, &val, olr))
+		return -EFAULT;
+	return 0;
+}
+
+/* The IP multicast ioctl support routines; @arg is a kernel-space request. */
+int ipmr_ioctl(struct sock *sk, int cmd, void *arg)
+{
+	struct net *net = sock_net(sk);
+	int ret = -EADDRNOTAVAIL;
+	struct sioc_vif_req *vr;
+	struct sioc_sg_req *sr;
+	struct mfc_cache *cache;
+	struct vif_device *vif;
+	struct mr_table *mrt;
+
+	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+	if (!mrt)
+		return -ENOENT;
+
+	switch (cmd) {
+	case SIOCGETVIFCNT:
+		vr = (struct sioc_vif_req *)arg;
+		if (vr->vifi >= mrt->maxvif)
+			return -EINVAL;
+		vr->vifi = array_index_nospec(vr->vifi, mrt->maxvif);
+		/* Copy the vif counters; -EADDRNOTAVAIL if the vif is unused. */
+		rcu_read_lock();
+		vif = &mrt->vif_table[vr->vifi];
+		if (VIF_EXISTS(mrt, vr->vifi)) {
+			vr->icount = READ_ONCE(vif->pkt_in);
+			vr->ocount = READ_ONCE(vif->pkt_out);
+			vr->ibytes = READ_ONCE(vif->bytes_in);
+			vr->obytes = READ_ONCE(vif->bytes_out);
+			ret = 0;
+		}
+		rcu_read_unlock();
+		return ret;
+	case SIOCGETSGCNT:
+		sr = (struct sioc_sg_req *)arg;
+
+		/* Copy the counters of the matching cache entry, if any. */
+		rcu_read_lock();
+		cache = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr);
+		if (cache) {
+			sr->pktcnt = cache->_c.mfc_un.res.pkt;
+			sr->bytecnt = cache->_c.mfc_un.res.bytes;
+			sr->wrong_if = cache->_c.mfc_un.res.wrong_if;
+			ret = 0;
+		}
+		rcu_read_unlock();
+		return ret;
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_sioc_sg_req {		/* compat layout used for SIOCGETSGCNT */
+	struct in_addr src;		/* lookup key: source address */
+	struct in_addr grp;		/* lookup key: group address */
+	compat_ulong_t pktcnt;		/* from the cache entry's pkt counter */
+	compat_ulong_t bytecnt;		/* from the cache entry's bytes counter */
+	compat_ulong_t wrong_if;	/* from the cache entry's wrong_if counter */
+};
+
+struct compat_sioc_vif_req {		/* compat layout used for SIOCGETVIFCNT */
+	vifi_t	vifi;		/* Which iface */
+	compat_ulong_t icount;		/* from the vif's pkt_in */
+	compat_ulong_t ocount;		/* from the vif's pkt_out */
+	compat_ulong_t ibytes;		/* from the vif's bytes_in */
+	compat_ulong_t obytes;		/* from the vif's bytes_out */
+};
+
+int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+	struct compat_sioc_sg_req sr;
+	struct compat_sioc_vif_req vr;
+	struct vif_device *vif;
+	struct mfc_cache *c;
+	struct net *net = sock_net(sk);
+	struct mr_table *mrt;
+
+	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+	if (!mrt)
+		return -ENOENT;
+
+	switch (cmd) {
+	case SIOCGETVIFCNT:
+		if (copy_from_user(&vr, arg, sizeof(vr)))
+			return -EFAULT;
+		if (vr.vifi >= mrt->maxvif)
+			return -EINVAL;
+		vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
+		rcu_read_lock();
+		vif = &mrt->vif_table[vr.vifi];
+		if (VIF_EXISTS(mrt, vr.vifi)) {
+			vr.icount = READ_ONCE(vif->pkt_in);
+			vr.ocount = READ_ONCE(vif->pkt_out);
+			vr.ibytes = READ_ONCE(vif->bytes_in);
+			vr.obytes = READ_ONCE(vif->bytes_out);
+			rcu_read_unlock();
+			/* the user copy is done after leaving the RCU section */
+			if (copy_to_user(arg, &vr, sizeof(vr)))
+				return -EFAULT;
+			return 0;
+		}
+		rcu_read_unlock();
+		return -EADDRNOTAVAIL;
+	case SIOCGETSGCNT:
+		if (copy_from_user(&sr, arg, sizeof(sr)))
+			return -EFAULT;
+
+		rcu_read_lock();
+		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
+		if (c) {
+			sr.pktcnt = c->_c.mfc_un.res.pkt;
+			sr.bytecnt = c->_c.mfc_un.res.bytes;
+			sr.wrong_if = c->_c.mfc_un.res.wrong_if;
+			rcu_read_unlock();
+			/* the user copy is done after leaving the RCU section */
+			if (copy_to_user(arg, &sr, sizeof(sr)))
+				return -EFAULT;
+			return 0;
+		}
+		rcu_read_unlock();
+		return -EADDRNOTAVAIL;
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+#endif
+
+static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct net *net = dev_net(dev);
+	struct mr_table *mrt;
+	int vifi;
+
+	/* On unregister, delete every vif that is still bound to the device. */
+	if (event == NETDEV_UNREGISTER) {
+		ipmr_for_each_table(mrt, net) {
+			for (vifi = 0; vifi < mrt->maxvif; vifi++) {
+				struct vif_device *v = &mrt->vif_table[vifi];
+
+				if (rcu_access_pointer(v->dev) == dev)
+					vif_delete(mrt, vifi, 1, NULL);
+			}
+		}
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ip_mr_notifier = {
+	.notifier_call = ipmr_device_event,	/* acts on NETDEV_UNREGISTER only */
+};
+
+/* Prepend an outer IPIP header (saddr -> daddr) to the packet, copying
+ * TOS and TTL from the inner header. Doing this here avoids tunnel
+ * drivers and keeps the multicast forwarding path fast.
+ */
+static void ip_encap(struct net *net, struct sk_buff *skb,
+		     __be32 saddr, __be32 daddr)
+{
+	const struct iphdr *inner = ip_hdr(skb);
+	struct iphdr *outer;
+
+	skb_push(skb, sizeof(*outer));
+	skb->transport_header = skb->network_header;
+	skb_reset_network_header(skb);
+	outer = ip_hdr(skb);
+
+	outer->version	=	4;
+	outer->ihl	=	5;
+	outer->tos	=	inner->tos;
+	outer->tot_len	=	htons(skb->len);
+	outer->frag_off	=	0;
+	outer->ttl	=	inner->ttl;
+	outer->protocol	=	IPPROTO_IPIP;
+	outer->saddr	=	saddr;
+	outer->daddr	=	daddr;
+	ip_select_ident(net, skb, NULL);
+	ip_send_check(outer);
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	nf_reset_ct(skb);
+}
+
+static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
+				      struct sk_buff *skb)
+{
+	/* Count the datagram as forwarded before passing it to dst_output(). */
+	IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
+
+	/* IP options present (unlikely): run ip_forward_options() on them. */
+	if (unlikely(IPCB(skb)->opt.optlen))
+		ip_forward_options(skb);
+
+	return dst_output(net, sk, skb);
+}
+
+#ifdef CONFIG_NET_SWITCHDEV
+static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
+				   int in_vifi, int out_vifi)
+{
+	struct vif_device *vifs = mrt->vif_table;
+
+	/* True only if the skb is marked and both vifs have the same parent id. */
+	if (skb->offload_l3_fwd_mark &&
+	    vifs[out_vifi].dev_parent_id.id_len &&
+	    vifs[in_vifi].dev_parent_id.id_len)
+		return netdev_phys_item_id_same(&vifs[out_vifi].dev_parent_id,
+						&vifs[in_vifi].dev_parent_id);
+	return false;
+}
+#else
+static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
+				   int in_vifi, int out_vifi)
+{
+	return false;	/* without CONFIG_NET_SWITCHDEV nothing is offloaded */
+}
+#endif
+
+/* Processing handlers for ipmr_forward, under rcu_read_lock() */
+/* Send skb out via vif @vifi; the skb is either passed to NF_HOOK() or freed. */
+static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
+			    int in_vifi, struct sk_buff *skb, int vifi)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct vif_device *vif = &mrt->vif_table[vifi];
+	struct net_device *vif_dev;
+	struct net_device *dev;
+	struct rtable *rt;
+	struct flowi4 fl4;
+	int    encap = 0;	/* headroom later requested from skb_cow() */
+
+	vif_dev = vif_dev_read(vif);
+	if (!vif_dev)
+		goto out_free;
+
+	if (vif->flags & VIFF_REGISTER) {
+		WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+		WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
+		DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
+		DEV_STATS_INC(vif_dev, tx_packets);
+		ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
+		goto out_free;
+	}
+
+	if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
+		goto out_free;
+
+	if (vif->flags & VIFF_TUNNEL) {
+		rt = ip_route_output_ports(net, &fl4, NULL,
+					   vif->remote, vif->local,
+					   0, 0,
+					   IPPROTO_IPIP,
+					   RT_TOS(iph->tos), vif->link);
+		if (IS_ERR(rt))
+			goto out_free;
+		encap = sizeof(struct iphdr);
+	} else {
+		rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
+					   0, 0,
+					   IPPROTO_IPIP,
+					   RT_TOS(iph->tos), vif->link);
+		if (IS_ERR(rt))
+			goto out_free;
+	}
+
+	dev = rt->dst.dev;
+
+	if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
+		/* Do not fragment multicasts. Alas, IPv4 does not
+		 * allow to send ICMP, so that packets will disappear
+		 * to blackhole.
+		 */
+		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
+		ip_rt_put(rt);
+		goto out_free;
+	}
+
+	encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
+
+	if (skb_cow(skb, encap)) {
+		ip_rt_put(rt);
+		goto out_free;
+	}
+
+	WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+	WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+	ip_decrease_ttl(ip_hdr(skb));
+
+	/* FIXME: forward and output firewalls used to be called here.
+	 * What do we do with netfilter? -- RR
+	 */
+	if (vif->flags & VIFF_TUNNEL) {
+		ip_encap(net, skb, vif->local, vif->remote);
+		/* FIXME: extra output firewall step used to be here. --RR */
+		DEV_STATS_INC(vif_dev, tx_packets);
+		DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
+	}
+
+	IPCB(skb)->flags |= IPSKB_FORWARDED;
+
+	/* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
+	 * not only before forwarding, but after forwarding on all output
+	 * interfaces. It is clear, if mrouter runs a multicasting
+	 * program, it should receive packets not depending to what interface
+	 * program is joined.
+	 * If we will not make it, the program will have to join on all
+	 * interfaces. On the other hand, multihoming host (or router, but
+	 * not mrouter) cannot join to more than one interface - it will
+	 * result in receiving multiple packets.
+	 */
+	NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
+		net, NULL, skb, skb->dev, dev,
+		ipmr_forward_finish);
+	return;
+
+out_free:
+	kfree_skb(skb);
+}
+
+/* Index of the vif bound to @dev, or -1. Needs mrt_lock or rcu_read_lock() */
+static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev)
+{
+	int vifi;
+	/* READ_ONCE() pairs with WRITE_ONCE() in vif_delete()/vif_add() */
+	for (vifi = READ_ONCE(mrt->maxvif) - 1; vifi >= 0; vifi--) {
+		if (rcu_access_pointer(mrt->vif_table[vifi].dev) == dev)
+			return vifi;
+	}
+	return -1;
+}
+
+/* "local" means that we should preserve one skb (for local delivery) */
+/* Called under rcu_read_lock() */
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+			  struct net_device *dev, struct sk_buff *skb,
+			  struct mfc_cache *c, int local)
+{
+	int true_vifi = ipmr_find_vif(mrt, dev);
+	int psend = -1;	/* vif whose transmit is still pending, -1 if none */
+	int vif, ct;
+
+	vif = c->_c.mfc_parent;
+	c->_c.mfc_un.res.pkt++;
+	c->_c.mfc_un.res.bytes += skb->len;
+	c->_c.mfc_un.res.lastuse = jiffies;
+
+	if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
+		struct mfc_cache *cache_proxy;
+
+		/* For an (*,G) entry, we only check that the incoming
+		 * interface is part of the static tree.
+		 */
+		cache_proxy = mr_mfc_find_any_parent(mrt, vif);
+		if (cache_proxy &&
+		    cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255)
+			goto forward;
+	}
+
+	/* Wrong interface: drop packet and (maybe) send PIM assert. */
+	if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
+		if (rt_is_output_route(skb_rtable(skb))) {
+			/* It is our own packet, looped back.
+			 * Very complicated situation...
+			 *
+			 * The best workaround until routing daemons will be
+			 * fixed is not to redistribute packet, if it was
+			 * sent through wrong interface. It means, that
+			 * multicast applications WILL NOT work for
+			 * (S,G), which have default multicast route pointing
+			 * to wrong oif. In any case, it is not a good
+			 * idea to use multicasting applications on router.
+			 */
+			goto dont_forward;
+		}
+
+		c->_c.mfc_un.res.wrong_if++;
+
+		if (true_vifi >= 0 && mrt->mroute_do_assert &&
+		    /* pimsm uses asserts, when switching from RPT to SPT,
+		     * so that we cannot check that packet arrived on an oif.
+		     * It is bad, but otherwise we would need to move pretty
+		     * large chunk of pimd to kernel. Ough... --ANK
+		     */
+		    (mrt->mroute_do_pim ||
+		     c->_c.mfc_un.res.ttls[true_vifi] < 255) &&
+		    time_after(jiffies,
+			       c->_c.mfc_un.res.last_assert +
+			       MFC_ASSERT_THRESH)) {
+			c->_c.mfc_un.res.last_assert = jiffies;
+			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
+			if (mrt->mroute_do_wrvifwhole)
+				ipmr_cache_report(mrt, skb, true_vifi,
+						  IGMPMSG_WRVIFWHOLE);
+		}
+		goto dont_forward;
+	}
+
+forward:
+	WRITE_ONCE(mrt->vif_table[vif].pkt_in,
+		   mrt->vif_table[vif].pkt_in + 1);
+	WRITE_ONCE(mrt->vif_table[vif].bytes_in,
+		   mrt->vif_table[vif].bytes_in + skb->len);
+
+	/* Forward the frame */
+	if (c->mfc_origin == htonl(INADDR_ANY) &&
+	    c->mfc_mcastgrp == htonl(INADDR_ANY)) {
+		if (true_vifi >= 0 &&
+		    true_vifi != c->_c.mfc_parent &&
+		    ip_hdr(skb)->ttl >
+				c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
+			/* It's an (*,*) entry and the packet is not coming from
+			 * the upstream: forward the packet to the upstream
+			 * only.
+			 */
+			psend = c->_c.mfc_parent;
+			goto last_forward;
+		}
+		goto dont_forward;
+	}
+	for (ct = c->_c.mfc_un.res.maxvif - 1;
+	     ct >= c->_c.mfc_un.res.minvif; ct--) {
+		/* For (*,G) entry, don't forward to the incoming interface */
+		if ((c->mfc_origin != htonl(INADDR_ANY) ||
+		     ct != true_vifi) &&
+		    ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) {
+			if (psend != -1) {
+				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+				if (skb2)
+					ipmr_queue_xmit(net, mrt, true_vifi,
+							skb2, psend);
+			}
+			psend = ct;
+		}
+	}
+last_forward:
+	if (psend != -1) {
+		if (local) {
+			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+			if (skb2)
+				ipmr_queue_xmit(net, mrt, true_vifi, skb2,
+						psend);
+		} else {
+			ipmr_queue_xmit(net, mrt, true_vifi, skb, psend);
+			return;
+		}
+	}
+
+dont_forward:
+	if (!local)
+		kfree_skb(skb);
+}
+
+static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
+{
+	bool is_output = rt_is_output_route(skb_rtable(skb));
+	const struct iphdr *iph = ip_hdr(skb);
+	struct flowi4 fl4 = {
+		.daddr = iph->daddr,
+		.saddr = iph->saddr,
+		.flowi4_tos = RT_TOS(iph->tos),
+		.flowi4_oif = is_output ? skb->dev->ifindex : 0,
+		.flowi4_iif = is_output ? LOOPBACK_IFINDEX : skb->dev->ifindex,
+		.flowi4_mark = skb->mark,
+	};
+	struct mr_table *mrt;
+	int err;
+
+	/* For an output route the flow is keyed as leaving via skb->dev with
+	 * loopback as input; otherwise skb->dev is the input interface.
+	 */
+	err = ipmr_fib_lookup(net, &fl4, &mrt);
+	if (err)
+		return ERR_PTR(err);
+	return mrt;
+}
+
+/* Multicast packets for forwarding arrive here; returns 0 or a negative errno.
+ * Called with rcu_read_lock().
+ */
+int ip_mr_input(struct sk_buff *skb)
+{
+	struct mfc_cache *cache;
+	struct net *net = dev_net(skb->dev);
+	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
+	struct mr_table *mrt;
+	struct net_device *dev;
+
+	/* skb->dev passed in is the loX master dev for vrfs.
+	 * As there are no vifs associated with loopback devices,
+	 * get the proper interface that does have a vif associated with it.
+	 */
+	dev = skb->dev;
+	if (netif_is_l3_master(skb->dev)) {
+		dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
+		if (!dev) {
+			kfree_skb(skb);
+			return -ENODEV;
+		}
+	}
+
+	/* Packet is looped back after forward, it should not be
+	 * forwarded second time, but still can be delivered locally.
+	 */
+	if (IPCB(skb)->flags & IPSKB_FORWARDED)
+		goto dont_forward;
+
+	mrt = ipmr_rt_fib_lookup(net, skb);
+	if (IS_ERR(mrt)) {
+		kfree_skb(skb);
+		return PTR_ERR(mrt);
+	}
+	if (!local) {
+		if (IPCB(skb)->opt.router_alert) {
+			if (ip_call_ra_chain(skb))
+				return 0;
+		} else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
+			/* IGMPv1 (and broken IGMPv2 implementations sort of
+			 * Cisco IOS <= 11.2(8)) do not put router alert
+			 * option to IGMP packets destined to routable
+			 * groups. It is very bad, because it means
+			 * that we can forward NO IGMP messages.
+			 */
+			struct sock *mroute_sk;
+
+			mroute_sk = rcu_dereference(mrt->mroute_sk);
+			if (mroute_sk) {
+				nf_reset_ct(skb);
+				raw_rcv(mroute_sk, skb);
+				return 0;
+			}
+		}
+	}
+
+	/* already under rcu_read_lock() */
+	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+	if (!cache) {
+		int vif = ipmr_find_vif(mrt, dev);
+
+		if (vif >= 0)
+			cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
+						    vif);
+	}
+
+	/* No usable cache entry */
+	if (!cache) {
+		int vif;
+
+		if (local) {
+			/* clone first: the original is given to local delivery */
+			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+			ip_local_deliver(skb);
+			if (!skb2)
+				return -ENOBUFS;
+			skb = skb2;
+		}
+
+		vif = ipmr_find_vif(mrt, dev);
+		if (vif >= 0)
+			return ipmr_cache_unresolved(mrt, vif, skb, dev);
+		kfree_skb(skb);
+		return -ENODEV;
+	}
+
+	ip_mr_forward(net, mrt, dev, skb, cache, local);
+
+	if (local)
+		return ip_local_deliver(skb);
+
+	return 0;
+
+dont_forward:
+	if (local)
+		return ip_local_deliver(skb);
+	kfree_skb(skb);
+	return 0;
+}
+
+#ifdef CONFIG_IP_PIMSM_V1
+/* Handle IGMP messages of PIMv1 */
+int pim_rcv_v1(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	struct mr_table *mrt;
+	struct igmphdr *pim;
+
+	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
+		goto drop;
+	pim = igmp_hdr(skb);
+
+	mrt = ipmr_rt_fib_lookup(net, skb);
+	if (IS_ERR(mrt) || !mrt->mroute_do_pim)
+		goto drop;
+	/* Only PIMv1 REGISTER messages are processed; the rest is dropped. */
+	if (pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
+		goto drop;
+
+	/* A non-zero return from __pim_rcv() means the skb must be freed here. */
+	if (!__pim_rcv(mrt, skb, sizeof(*pim)))
+		return 0;
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+static int pim_rcv(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	struct pimreghdr *pim;
+	struct mr_table *mrt;
+
+	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
+		goto drop;
+
+	pim = (struct pimreghdr *)skb_transport_header(skb);
+	if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) ||
+	    (pim->flags & PIM_NULL_REGISTER))
+		goto drop;
+	/* Drop only if neither the header-only nor the full checksum verifies. */
+	if (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
+	    csum_fold(skb_checksum(skb, 0, skb->len, 0)))
+		goto drop;
+
+	mrt = ipmr_rt_fib_lookup(net, skb);
+	if (!IS_ERR(mrt) && !__pim_rcv(mrt, skb, sizeof(*pim)))
+		return 0;
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+#endif
+
+int ipmr_get_route(struct net *net, struct sk_buff *skb,
+		   __be32 saddr, __be32 daddr,
+		   struct rtmsg *rtm, u32 portid)
+{
+	struct mfc_cache *cache;
+	struct mr_table *mrt;
+	int err;
+
+	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+	if (!mrt)
+		return -ENOENT;
+
+	rcu_read_lock();
+	cache = ipmr_cache_find(mrt, saddr, daddr);
+	if (!cache && skb->dev) {
+		int vif = ipmr_find_vif(mrt, skb->dev);
+
+		if (vif >= 0)
+			cache = ipmr_cache_find_any(mrt, daddr, vif);
+	}
+	if (!cache) {
+		struct sk_buff *skb2;
+		struct iphdr *iph;
+		struct net_device *dev;
+		int vif = -1;
+
+		dev = skb->dev;
+		if (dev)
+			vif = ipmr_find_vif(mrt, dev);
+		if (vif < 0) {
+			rcu_read_unlock();
+			return -ENODEV;
+		}
+
+		skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr));
+		if (!skb2) {
+			rcu_read_unlock();
+			return -ENOMEM;
+		}
+		/* Build a pseudo IP header carrying only the (saddr, daddr) key. */
+		NETLINK_CB(skb2).portid = portid;
+		skb_push(skb2, sizeof(struct iphdr));
+		skb_reset_network_header(skb2);
+		iph = ip_hdr(skb2);
+		iph->ihl = sizeof(struct iphdr) >> 2;
+		iph->saddr = saddr;
+		iph->daddr = daddr;
+		iph->version = 0;	/* NOTE(review): presumably tags a netlink request - confirm */
+		err = ipmr_cache_unresolved(mrt, vif, skb2, dev);
+		rcu_read_unlock();
+		return err;
+	}
+
+	err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
+	rcu_read_unlock();
+	return err;
+}
+
+static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+			    u32 portid, u32 seq, struct mfc_cache *c, int cmd,
+			    int flags)
+{
+	struct nlmsghdr *nlh;
+	struct rtmsg *rtm;
+	int err;
+
+	nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	/* Fixed part of the route message. */
+	rtm = nlmsg_data(nlh);
+	rtm->rtm_family   = RTNL_FAMILY_IPMR;
+	rtm->rtm_dst_len  = 32;
+	rtm->rtm_src_len  = 32;
+	rtm->rtm_tos      = 0;
+	rtm->rtm_table    = mrt->id;
+	rtm->rtm_type     = RTN_MULTICAST;
+	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
+	rtm->rtm_protocol = (c->_c.mfc_flags & MFC_STATIC) ? RTPROT_STATIC :
+							      RTPROT_MROUTED;
+	rtm->rtm_flags    = 0;
+
+	/* Attributes: table id first, then source and group addresses. */
+	if (nla_put_u32(skb, RTA_TABLE, mrt->id) ||
+	    nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
+	    nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
+		goto nla_put_failure;
+
+	err = mr_fill_mroute(mrt, skb, &c->_c, rtm);
+	/* do not break the dump if cache is unresolved */
+	if (err < 0 && err != -ENOENT)
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid,
+			     u32 seq, struct mr_mfc *c, int cmd, int flags)
+{
+	struct mfc_cache *mfc = (struct mfc_cache *)c;	/* generic -> IPv4 entry */
+
+	return ipmr_fill_mroute(mrt, skb, portid, seq, mfc, cmd, flags);
+}
+
+static size_t mroute_msgsize(bool unresolved, int maxvif)
+{
+	size_t len = NLMSG_ALIGN(sizeof(struct rtmsg));
+
+	len += nla_total_size(4);	/* RTA_TABLE */
+	len += nla_total_size(4);	/* RTA_SRC */
+	len += nla_total_size(4);	/* RTA_DST */
+
+	if (unresolved)
+		return len;
+
+	/* Resolved entries additionally need room for the attributes below. */
+	len += nla_total_size(4);	/* RTA_IIF */
+	len += nla_total_size(0);	/* RTA_MULTIPATH */
+	len += maxvif * NLA_ALIGN(sizeof(struct rtnexthop));
+	/* RTA_MFC_STATS */
+	len += nla_total_size_64bit(sizeof(struct rta_mfc_stats));
+
+	return len;
+}
+
+static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
+				 int cmd)
+{
+	bool unresolved = mfc->_c.mfc_parent >= MAXVIFS;
+	struct net *net = read_pnet(&mrt->net);
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	/* An entry whose parent is >= MAXVIFS is sized as unresolved. */
+	skb = nlmsg_new(mroute_msgsize(unresolved, mrt->maxvif), GFP_ATOMIC);
+	if (!skb)
+		goto errout;
+
+	err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
+	if (err < 0)
+		goto errout;
+
+	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
+	return;
+
+errout:
+	kfree_skb(skb);
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
+}
+
+static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
+{
+	size_t len = NLMSG_ALIGN(sizeof(struct rtgenmsg));
+
+	/* One attribute per reported field, plus the packet payload. */
+	len += nla_total_size(1);	/* IPMRA_CREPORT_MSGTYPE */
+	len += nla_total_size(4);	/* IPMRA_CREPORT_VIF_ID */
+	len += nla_total_size(4);	/* IPMRA_CREPORT_SRC_ADDR */
+	len += nla_total_size(4);	/* IPMRA_CREPORT_DST_ADDR */
+	len += nla_total_size(4);	/* IPMRA_CREPORT_TABLE */
+	/* IPMRA_CREPORT_PKT */
+	len += nla_total_size(payloadlen);
+
+	return len;
+}
+
+static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
+{
+	struct net *net = read_pnet(&mrt->net);
+	struct nlmsghdr *nlh;
+	struct rtgenmsg *rtgenm;
+	struct igmpmsg *msg;
+	struct sk_buff *skb;
+	struct nlattr *nla;
+	int payloadlen;
+
+	payloadlen = pkt->len - sizeof(struct igmpmsg);	/* bytes after the igmpmsg */
+	msg = (struct igmpmsg *)skb_network_header(pkt);
+
+	skb = nlmsg_new(igmpmsg_netlink_msgsize(payloadlen), GFP_ATOMIC);
+	if (!skb)
+		goto errout;
+
+	nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT,
+			sizeof(struct rtgenmsg), 0);
+	if (!nlh)
+		goto errout;
+	rtgenm = nlmsg_data(nlh);
+	rtgenm->rtgen_family = RTNL_FAMILY_IPMR;
+	if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) ||
+	    nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif | (msg->im_vif_hi << 8)) ||
+	    nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR,
+			    msg->im_src.s_addr) ||
+	    nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR,
+			    msg->im_dst.s_addr) ||
+	    nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id))
+		goto nla_put_failure;
+	/* The rest of the packet is copied verbatim into IPMRA_CREPORT_PKT. */
+	nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen);
+	if (!nla || skb_copy_bits(pkt, sizeof(struct igmpmsg),
+				  nla_data(nla), payloadlen))
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+
+	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC);
+	return;
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+errout:
+	kfree_skb(skb);	/* skb is still NULL when nlmsg_new() failed */
+	rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS);
+}
+
+static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb,
+				       const struct nlmsghdr *nlh,
+				       struct nlattr **tb,
+				       struct netlink_ext_ack *extack)
+{
+	struct rtmsg *rtm;
+	int i, err;
+
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+		NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request");
+		return -EINVAL;
+	}
+
+	if (!netlink_strict_get_check(skb))
+		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+					      rtm_ipv4_policy, extack);
+	/* Strict checking: validate the header fields before the attributes. */
+	rtm = nlmsg_data(nlh);
+	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
+	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
+	    rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol ||
+	    rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) {
+		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for multicast route get request");
+		return -EINVAL;
+	}
+
+	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+					    rtm_ipv4_policy, extack);
+	if (err)
+		return err;
+
+	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
+		return -EINVAL;
+	}
+	/* In strict mode only RTA_SRC, RTA_DST and RTA_TABLE are accepted. */
+	for (i = 0; i <= RTA_MAX; i++) {
+		if (!tb[i])
+			continue;
+
+		switch (i) {
+		case RTA_SRC:
+		case RTA_DST:
+		case RTA_TABLE:
+			break;
+		default:
+			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in multicast route get request");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* RTM_GETROUTE handler: look up one (source, group) entry in the selected
+ * table and unicast it back to the requester as an RTM_NEWROUTE message.
+ * Returns 0 or a negative errno (-ENOENT when the table or entry is missing,
+ * -ENOBUFS when the reply cannot be allocated).
+ */
+static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+			     struct netlink_ext_ack *extack)
+{
+	u32 portid = NETLINK_CB(in_skb).portid;
+	struct net *net = sock_net(in_skb->sk);
+	struct nlattr *tb[RTA_MAX + 1];
+	__be32 src = 0, grp = 0;
+	struct mfc_cache *cache;
+	struct sk_buff *skb;
+	struct mr_table *mrt;
+	u32 tableid = 0;
+	int err;
+
+	err = ipmr_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
+	if (err < 0)
+		return err;
+
+	/* Absent attributes leave the corresponding key at zero. */
+	if (tb[RTA_SRC])
+		src = nla_get_in_addr(tb[RTA_SRC]);
+	if (tb[RTA_DST])
+		grp = nla_get_in_addr(tb[RTA_DST]);
+	if (tb[RTA_TABLE])
+		tableid = nla_get_u32(tb[RTA_TABLE]);
+
+	mrt = ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT);
+	if (!mrt)
+		return -ENOENT;
+
+	/* entries are added/deleted only under RTNL */
+	rcu_read_lock();
+	cache = ipmr_cache_find(mrt, src, grp);
+	rcu_read_unlock();
+	if (!cache)
+		return -ENOENT;
+
+	/* The reply is sized for a resolved entry. */
+	skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	err = ipmr_fill_mroute(mrt, skb, portid, nlh->nlmsg_seq, cache,
+			       RTM_NEWROUTE, 0);
+	if (err < 0) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	/* The reply goes to the port id the request came from. */
+	return rtnl_unicast(skb, net, portid);
+}
+
+static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_dump_filter filter = {};
+	struct mr_table *mrt;
+	int err;
+
+	if (cb->strict_check) {
+		err = ip_valid_fib_dump_req(net, cb->nlh, &filter, cb);
+		if (err < 0)
+			return err;
+	}
+
+	/* Without a table filter the dump goes through mr_rtm_dumproute(). */
+	if (!filter.table_id)
+		return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
+					_ipmr_fill_mroute, &mfc_unres_lock, &filter);
+
+	mrt = ipmr_get_table(net, filter.table_id);
+	if (mrt) {
+		err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute,
+				    &mfc_unres_lock, &filter);
+		return skb->len ? : err;
+	}
+
+	/* A missing table is an error only for RTNL_FAMILY_IPMR requests. */
+	if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR)
+		return skb->len;
+	NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist");
+	return -ENOENT;
+}
+
+static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
+	[RTA_SRC]	= { .type = NLA_U32 },	/* read into mfcc_origin */
+	[RTA_DST]	= { .type = NLA_U32 },	/* read into mfcc_mcastgrp */
+	[RTA_IIF]	= { .type = NLA_U32 },	/* ifindex of the input device */
+	[RTA_TABLE]	= { .type = NLA_U32 },	/* multicast routing table id */
+	[RTA_MULTIPATH]	= { .len = sizeof(struct rtnexthop) },
+};
+
+static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol)
+{
+	/* Only RTPROT_STATIC and RTPROT_MROUTED are accepted. */
+	if (rtm_protocol == RTPROT_STATIC)
+		return true;
+	if (rtm_protocol == RTPROT_MROUTED)
+		return true;
+	return false;
+}
+
+static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc)
+{
+	struct rtnexthop *rtnh = nla_data(nla);
+	int remaining = nla_len(nla), vifi;
+
+	for (vifi = 0; rtnh_ok(rtnh, remaining); rtnh = rtnh_next(rtnh, &remaining)) {
+		mfcc->mfcc_ttls[vifi++] = rtnh->rtnh_hops;
+		if (vifi == MAXVIFS)
+			break;
+	}
+
+	/* Unconsumed nexthop data is rejected; otherwise report the count. */
+	return remaining > 0 ? -EINVAL : vifi;
+}
+
+/* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */
+static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
+			    struct mfcctl *mfcc, int *mrtsock,
+			    struct mr_table **mrtret,
+			    struct netlink_ext_ack *extack)
+{
+	struct net_device *dev = NULL;
+	u32 tblid = RT_TABLE_DEFAULT;
+	struct mr_table *mrt;
+	struct nlattr *attr;
+	struct rtmsg *rtm;
+	int ret, rem;
+
+	ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
+					rtm_ipmr_policy, extack);
+	if (ret < 0)
+		goto out;
+	rtm = nlmsg_data(nlh);
+
+	ret = -EINVAL;
+	if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 ||
+	    rtm->rtm_type != RTN_MULTICAST ||
+	    rtm->rtm_scope != RT_SCOPE_UNIVERSE ||
+	    !ipmr_rtm_validate_proto(rtm->rtm_protocol))
+		goto out;
+
+	memset(mfcc, 0, sizeof(*mfcc));
+	mfcc->mfcc_parent = -1;	/* stays -1 unless RTA_IIF names a device */
+	ret = 0;
+	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) {
+		switch (nla_type(attr)) {
+		case RTA_SRC:
+			mfcc->mfcc_origin.s_addr = nla_get_be32(attr);
+			break;
+		case RTA_DST:
+			mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr);
+			break;
+		case RTA_IIF:
+			dev = __dev_get_by_index(net, nla_get_u32(attr));
+			if (!dev) {
+				ret = -ENODEV;
+				goto out;
+			}
+			break;
+		case RTA_MULTIPATH:
+			if (ipmr_nla_get_ttls(attr, mfcc) < 0) {
+				ret = -EINVAL;
+				goto out;
+			}
+			break;
+		case RTA_PREFSRC:
+			ret = 1;	/* RTA_PREFSRC present: the proxy variant */
+			break;
+		case RTA_TABLE:
+			tblid = nla_get_u32(attr);
+			break;
+		}
+	}
+	mrt = ipmr_get_table(net, tblid);
+	if (!mrt) {
+		ret = -ENOENT;
+		goto out;
+	}
+	*mrtret = mrt;
+	*mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0;
+	if (dev)
+		mfcc->mfcc_parent = ipmr_find_vif(mrt, dev);
+
+out:
+	return ret;
+}
+
+/* takes care of both newroute and delroute */
+static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	int ret, parent = -1, mrtsock = 0;
+	struct mr_table *tbl = NULL;
+	struct mfcctl mfcc;
+
+	ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack);
+	if (ret < 0)
+		return ret;
+
+	/* A non-zero result is the proxy variant, which uses the parsed parent. */
+	if (ret)
+		parent = mfcc.mfcc_parent;
+
+	if (nlh->nlmsg_type != RTM_NEWROUTE)
+		return ipmr_mfc_delete(tbl, &mfcc, parent);
+	return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent);
+}
+
+static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
+{
+	u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len);
+	bool failed;
+
+	/* The puts short-circuit: the first one that fails stops the chain. */
+	failed = nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) ||
+		 nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) ||
+		 nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM,
+			     mrt->mroute_reg_vif_num) ||
+		 nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
+			    mrt->mroute_do_assert) ||
+		 nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) ||
+		 nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE,
+			    mrt->mroute_do_wrvifwhole);
+	return !failed;
+}
+
+/* Emit one IPMRA_VIF nest for VIF @vifid; false when an attribute was not added */
+static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
+{
+	struct vif_device *vif = &mrt->vif_table[vifid];
+	struct net_device *dev = rtnl_dereference(vif->dev);
+	struct nlattr *nest;
+
+	/* an unused VIF slot is not an error: there is nothing to emit */
+	if (!dev)
+		return true;
+
+	nest = nla_nest_start_noflag(skb, IPMRA_VIF);
+	if (!nest)
+		return false;
+
+	if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, dev->ifindex) ||
+	    nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) ||
+	    nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) ||
+	    nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in,
+			      IPMRA_VIFA_PAD) ||
+	    nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out,
+			      IPMRA_VIFA_PAD) ||
+	    nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in,
+			      IPMRA_VIFA_PAD) ||
+	    nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out,
+			      IPMRA_VIFA_PAD) ||
+	    nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) ||
+	    nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) {
+		nla_nest_cancel(skb, nest);
+		return false;
+	}
+
+	nla_nest_end(skb, nest);
+
+	return true;
+}
+
+/* Strict check: no attributes, and pad/type/flags/change/index all zero. */
+static int ipmr_valid_dumplink(const struct nlmsghdr *nlh,
+			       struct netlink_ext_ack *extack)
+{
+	const struct ifinfomsg *hdr;
+
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*hdr))) {
+		NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump");
+		return -EINVAL;
+	}
+
+	if (nlmsg_attrlen(nlh, sizeof(*hdr))) {
+		NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump");
+		return -EINVAL;
+	}
+
+	hdr = nlmsg_data(nlh);
+	if (hdr->ifi_index || hdr->ifi_change || hdr->ifi_flags ||
+	    hdr->ifi_type || hdr->__ifi_pad) {
+		NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlmsghdr *nlh = NULL;
+	unsigned int t = 0, s_t;
+	unsigned int e = 0, s_e;
+	struct mr_table *mrt;
+
+	if (cb->strict_check) {
+		int err = ipmr_valid_dumplink(cb->nlh, cb->extack);
+
+		if (err < 0)
+			return err;
+	}
+	/* resume position saved by the previous call: table index, VIF index */
+	s_t = cb->args[0];
+	s_e = cb->args[1];
+	/* one RTM_NEWLINK message per table, carrying that table's VIFs */
+	ipmr_for_each_table(mrt, net) {
+		struct nlattr *vifs, *af;
+		struct ifinfomsg *hdr;
+		u32 i;
+
+		if (t < s_t)
+			goto skip_table;
+		nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+				cb->nlh->nlmsg_seq, RTM_NEWLINK,
+				sizeof(*hdr), NLM_F_MULTI);
+		if (!nlh)
+			break;
+
+		hdr = nlmsg_data(nlh);
+		memset(hdr, 0, sizeof(*hdr));
+		hdr->ifi_family = RTNL_FAMILY_IPMR;
+		/* table attributes and the VIF list both nest inside IFLA_AF_SPEC */
+		af = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
+		if (!af) {
+			nlmsg_cancel(skb, nlh);
+			goto out;
+		}
+
+		if (!ipmr_fill_table(mrt, skb)) {
+			nlmsg_cancel(skb, nlh);
+			goto out;
+		}
+
+		vifs = nla_nest_start_noflag(skb, IPMRA_TABLE_VIFS);
+		if (!vifs) {
+			nla_nest_end(skb, af);
+			nlmsg_end(skb, nlh);
+			goto out;
+		}
+		for (i = 0; i < mrt->maxvif; i++) {
+			if (e < s_e)
+				goto skip_entry;
+			if (!ipmr_fill_vif(mrt, i, skb)) {
+				nla_nest_end(skb, vifs);
+				nla_nest_end(skb, af);
+				nlmsg_end(skb, nlh);
+				goto out;	/* e is not advanced: this VIF is retried */
+			}
+skip_entry:
+			e++;
+		}
+		s_e = 0;
+		e = 0;
+		nla_nest_end(skb, vifs);
+		nla_nest_end(skb, af);
+		nlmsg_end(skb, nlh);
+skip_table:
+		t++;
+	}
+	/* store the position at which the next call has to continue */
+out:
+	cb->args[1] = e;
+	cb->args[0] = t;
+
+	return skb->len;
+}
+
+#ifdef CONFIG_PROC_FS
+/* The /proc interfaces to multicast routing :
+ * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
+ */
+
+static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	struct mr_vif_iter *iter = seq->private;
+	struct mr_table *mrt;
+
+	/* seq_file calls ->stop() even when ->start() returns an ERR_PTR, and
+	 * ->stop() drops the RCU read lock: take it before any return.
+	 */
+	rcu_read_lock();
+	mrt = ipmr_get_table(seq_file_net(seq), RT_TABLE_DEFAULT);
+	if (!mrt)
+		return ERR_PTR(-ENOENT);
+	iter->mrt = mrt;
+	return mr_vif_seq_start(seq, pos);
+}
+
+static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	rcu_read_unlock();	/* pairs with rcu_read_lock() in ipmr_vif_seq_start() */
+}
+
+/* Emit one line of /proc/net/ip_mr_vif: the column header for the start
+ * token, otherwise the counters of the VIF that @v points to.
+ */
+static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
+{
+	struct mr_vif_iter *iter = seq->private;
+	struct mr_table *mrt = iter->mrt;
+	const struct net_device *vif_dev;
+	const struct vif_device *vif = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq,
+			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
+		return 0;
+	}
+
+	vif_dev = vif_dev_read(vif);
+	seq_printf(seq, "%2td %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
+		   vif - mrt->vif_table, vif_dev ? vif_dev->name : "none",
+		   vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out,
+		   vif->flags, vif->local, vif->remote);
+
+	return 0;
+}
+
+static const struct seq_operations ipmr_vif_seq_ops = {
+	.start = ipmr_vif_seq_start,	/* takes rcu_read_lock() */
+	.next  = mr_vif_seq_next,
+	.stop  = ipmr_vif_seq_stop,	/* calls rcu_read_unlock() */
+	.show  = ipmr_vif_seq_show,
+};
+
+static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct mr_table *mrt = ipmr_get_table(seq_file_net(seq),
+					      RT_TABLE_DEFAULT);
+
+	/* only the RT_TABLE_DEFAULT table is shown through this seq_file */
+	if (!mrt)
+		return ERR_PTR(-ENOENT);
+
+	return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock);
+}
+
+/* Emit one line of /proc/net/ip_mr_cache: header or a single MFC entry. */
+static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
+{
+	const struct mr_mfc_iter *it = seq->private;
+	const struct mfc_cache *mfc = v;
+	const struct mr_table *mrt;
+	int n;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq,
+		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
+		return 0;
+	}
+
+	mrt = it->mrt;
+	seq_printf(seq, "%08X %08X %-3hd",
+		   (__force u32) mfc->mfc_mcastgrp,
+		   (__force u32) mfc->mfc_origin,
+		   mfc->_c.mfc_parent);
+
+	if (it->cache == &mrt->mfc_unres_queue) {
+		/* unresolved mfc_caches have no pkt, bytes and wrong_if values */
+		seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
+		seq_putc(seq, '\n');
+		return 0;
+	}
+
+	seq_printf(seq, " %8lu %8lu %8lu",
+		   mfc->_c.mfc_un.res.pkt,
+		   mfc->_c.mfc_un.res.bytes,
+		   mfc->_c.mfc_un.res.wrong_if);
+	for (n = mfc->_c.mfc_un.res.minvif;
+	     n < mfc->_c.mfc_un.res.maxvif; n++) {
+		if (!VIF_EXISTS(mrt, n) || mfc->_c.mfc_un.res.ttls[n] >= 255)
+			continue;
+		seq_printf(seq, " %2d:%-3d", n, mfc->_c.mfc_un.res.ttls[n]);
+	}
+	seq_putc(seq, '\n');
+	return 0;
+}
+
+static const struct seq_operations ipmr_mfc_seq_ops = {
+	.start = ipmr_mfc_seq_start,	/* looks up RT_TABLE_DEFAULT, passes mfc_unres_lock */
+	.next  = mr_mfc_seq_next,	/* helper defined in ipmr_base.c */
+	.stop  = mr_mfc_seq_stop,
+	.show  = ipmr_mfc_seq_show,
+};
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+static const struct net_protocol pim_protocol = {
+	.handler	=	pim_rcv,	/* added for IPPROTO_PIM in ip_mr_init() */
+};
+#endif
+
+static unsigned int ipmr_seq_read(struct net *net)
+{
+	ASSERT_RTNL();
+	/* this netns' ipmr_seq plus whatever ipmr_rules_seq_read() reports */
+	return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net);
+}
+
+/* .fib_dump hook: run mr_dump() with the IPv4 rules dumper and table iterator */
+static int ipmr_dump(struct net *net, struct notifier_block *nb,
+		     struct netlink_ext_ack *extack)
+{
+	return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, ipmr_mr_table_iter, extack);
+}
+
+static const struct fib_notifier_ops ipmr_notifier_ops_template = {
+	.family		= RTNL_FAMILY_IPMR,
+	.fib_seq_read	= ipmr_seq_read,	/* asserts that RTNL is held */
+	.fib_dump	= ipmr_dump,		/* thin wrapper around mr_dump() */
+	.owner		= THIS_MODULE,
+};
+
+/* Per-netns setup: reset ipmr_seq and register a copy of the notifier ops */
+static int __net_init ipmr_notifier_init(struct net *net)
+{
+	struct fib_notifier_ops *ops;
+
+	net->ipv4.ipmr_seq = 0;
+	ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
+	if (!IS_ERR(ops)) {
+		net->ipv4.ipmr_notifier_ops = ops;
+		return 0;
+	}
+	return PTR_ERR(ops);
+}
+
+static void __net_exit ipmr_notifier_exit(struct net *net)
+{
+	fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
+	net->ipv4.ipmr_notifier_ops = NULL;	/* forget the unregistered ops */
+}
+
+/* Per-netns setup for IP multicast routing: notifier, rules, then /proc files */
+static int __net_init ipmr_net_init(struct net *net)
+{
+	int err;
+
+	err = ipmr_notifier_init(net);
+	if (err)
+		goto ipmr_notifier_fail;
+
+	err = ipmr_rules_init(net);
+	if (err < 0)
+		goto ipmr_rules_fail;
+
+#ifdef CONFIG_PROC_FS
+	err = -ENOMEM;
+	if (!proc_create_net("ip_mr_vif", 0, net->proc_net, &ipmr_vif_seq_ops,
+			sizeof(struct mr_vif_iter)))
+		goto proc_vif_fail;
+	if (!proc_create_net("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops,
+			sizeof(struct mr_mfc_iter)))
+		goto proc_cache_fail;
+#endif
+	return 0;
+	/* error unwinding: undo the completed steps in reverse order */
+#ifdef CONFIG_PROC_FS
+proc_cache_fail:
+	remove_proc_entry("ip_mr_vif", net->proc_net);
+proc_vif_fail:
+	rtnl_lock();
+	ipmr_rules_exit(net);
+	rtnl_unlock();
+#endif
+ipmr_rules_fail:
+	ipmr_notifier_exit(net);
+ipmr_notifier_fail:
+	return err;
+}
+
+static void __net_exit ipmr_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("ip_mr_cache", net->proc_net);
+	remove_proc_entry("ip_mr_vif", net->proc_net);
+#endif
+	ipmr_notifier_exit(net);	/* ipmr_rules_exit() is left to ipmr_net_exit_batch() */
+}
+
+static void __net_exit ipmr_net_exit_batch(struct list_head *net_list)
+{
+	struct net *net;
+
+	rtnl_lock();	/* taken once for the whole list of namespaces */
+	list_for_each_entry(net, net_list, exit_list)
+		ipmr_rules_exit(net);
+	rtnl_unlock();
+}
+
+static struct pernet_operations ipmr_net_ops = {
+	.init = ipmr_net_init,
+	.exit = ipmr_net_exit,			/* proc entries and notifier ops */
+	.exit_batch = ipmr_net_exit_batch,	/* ipmr_rules_exit() under rtnl_lock() */
+};
+
+int __init ip_mr_init(void)
+{
+	int err;
+
+	mrt_cachep = kmem_cache_create("ip_mrt_cache",
+				       sizeof(struct mfc_cache),
+				       0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+				       NULL);
+	/* SLAB_PANIC is passed and mrt_cachep is not NULL-checked here */
+	err = register_pernet_subsys(&ipmr_net_ops);
+	if (err)
+		goto reg_pernet_fail;
+
+	err = register_netdevice_notifier(&ip_mr_notifier);
+	if (err)
+		goto reg_notif_fail;
+#ifdef CONFIG_IP_PIMSM_V2
+	if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
+		pr_err("%s: can't add PIM protocol\n", __func__);
+		err = -EAGAIN;
+		goto add_proto_fail;
+	}
+#endif
+	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
+		      ipmr_rtm_getroute, ipmr_rtm_dumproute, 0);
+	rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE,
+		      ipmr_rtm_route, NULL, 0);
+	rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE,
+		      ipmr_rtm_route, NULL, 0);
+	/* link requests are dump-only: no doit handler is registered */
+	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK,
+		      NULL, ipmr_rtm_dumplink, 0);
+	return 0;
+	/* error unwinding, newest registration first */
+#ifdef CONFIG_IP_PIMSM_V2
+add_proto_fail:
+	unregister_netdevice_notifier(&ip_mr_notifier);
+#endif
+reg_notif_fail:
+	unregister_pernet_subsys(&ipmr_net_ops);
+reg_pernet_fail:
+	kmem_cache_destroy(mrt_cachep);
+	return err;
+}
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
new file mode 100644
index 0000000000..271dc03fc6
--- /dev/null
+++ b/net/ipv4/ipmr_base.c
@@ -0,0 +1,448 @@
+/* Linux multicast routing support
+ * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation
+ */
+
+#include <linux/rhashtable.h>
+#include <linux/mroute_base.h>
+
+/* Initialise a VIF slot; v->dev stays NULL, since that is set under locking */
+void vif_device_init(struct vif_device *v,
+		     struct net_device *dev,
+		     unsigned long rate_limit,
+		     unsigned char threshold,
+		     unsigned short flags,
+		     unsigned short get_iflink_mask)
+{
+	RCU_INIT_POINTER(v->dev, NULL);
+	v->rate_limit = rate_limit;
+	v->threshold = threshold;
+	v->flags = flags;
+	v->bytes_in = 0;
+	v->bytes_out = 0;
+	v->pkt_in = 0;
+	v->pkt_out = 0;
+
+	/* flags hitting get_iflink_mask select dev_get_iflink(), else ifindex */
+	v->link = (flags & get_iflink_mask) ? dev_get_iflink(dev) :
+					      dev->ifindex;
+}
+EXPORT_SYMBOL(vif_device_init);
+
+struct mr_table *
+mr_table_alloc(struct net *net, u32 id,
+	       struct mr_table_ops *ops,
+	       void (*expire_func)(struct timer_list *t),
+	       void (*table_set)(struct mr_table *mrt,
+				 struct net *net))
+{
+	struct mr_table *tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
+	int rc;
+
+	if (!tbl)
+		return ERR_PTR(-ENOMEM);
+	tbl->id = id;
+	write_pnet(&tbl->net, net);
+
+	/* the table keeps its own copy of *ops and hashes with its rht_params */
+	tbl->ops = *ops;
+	rc = rhltable_init(&tbl->mfc_hash, tbl->ops.rht_params);
+	if (rc) {
+		kfree(tbl);
+		return ERR_PTR(rc);
+	}
+
+	INIT_LIST_HEAD(&tbl->mfc_cache_list);
+	INIT_LIST_HEAD(&tbl->mfc_unres_queue);
+	timer_setup(&tbl->ipmr_expire_timer, expire_func, 0);
+	tbl->mroute_reg_vif_num = -1;
+	/* the caller's table_set() hook runs last, after the fields above */
+	table_set(tbl, net);
+	return tbl;
+}
+EXPORT_SYMBOL(mr_table_alloc);
+
+/* First entry hashed under @hasharg with parent @parent; -1 matches any */
+void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent)
+{
+	struct rhlist_head *bucket, *pos;
+	struct mr_mfc *mfc;
+
+	bucket = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params);
+	rhl_for_each_entry_rcu(mfc, pos, bucket, mnode)
+		if (parent == -1 || mfc->mfc_parent == parent)
+			return mfc;
+	return NULL;
+}
+EXPORT_SYMBOL(mr_mfc_find_parent);
+
+/* First ops.cmparg_any entry whose ttl for @vifi is below 255, or NULL */
+void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi)
+{
+	struct rhlist_head *bucket, *pos;
+	struct mr_mfc *mfc;
+
+	bucket = rhltable_lookup(&mrt->mfc_hash, mrt->ops.cmparg_any,
+				 *mrt->ops.rht_params);
+	rhl_for_each_entry_rcu(mfc, pos, bucket, mnode)
+		if (mfc->mfc_un.res.ttls[vifi] < 255)
+			return mfc;
+	return NULL;
+}
+EXPORT_SYMBOL(mr_mfc_find_any_parent);
+
+/* @hasharg entry with ttl < 255 on @vifi, directly or via a proxy entry */
+void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg)
+{
+	struct rhlist_head *bucket, *pos;
+	struct mr_mfc *mfc, *wild;
+
+	bucket = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params);
+	rhl_for_each_entry_rcu(mfc, pos, bucket, mnode) {
+		if (mfc->mfc_un.res.ttls[vifi] < 255)
+			return mfc;
+		/* It's ok if the vifi is part of the static tree */
+		wild = mr_mfc_find_any_parent(mrt, mfc->mfc_parent);
+		if (wild && wild->mfc_un.res.ttls[vifi] < 255)
+			return mfc;
+	}
+
+	return mr_mfc_find_any_parent(mrt, vifi);
+}
+EXPORT_SYMBOL(mr_mfc_find_any);
+
+#ifdef CONFIG_PROC_FS
+/* Return the @pos'th existing VIF (0-based), leaving its index in iter->ct */
+void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos)
+{
+	struct mr_table *mrt = iter->mrt;
+
+	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
+		if (VIF_EXISTS(mrt, iter->ct) && pos-- == 0)
+			return &mrt->vif_table[iter->ct];
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(mr_vif_seq_idx);
+
+/* seq_file ->next: step to the next existing VIF slot after iter->ct */
+void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct mr_vif_iter *iter = seq->private;
+	struct mr_table *mrt = iter->mrt;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return mr_vif_seq_idx(seq_file_net(seq), iter, 0);
+
+	for (++iter->ct; iter->ct < mrt->maxvif; ++iter->ct) {
+		if (VIF_EXISTS(mrt, iter->ct))
+			return &mrt->vif_table[iter->ct];
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(mr_vif_seq_next);
+
+void *mr_mfc_seq_idx(struct net *net,
+		     struct mr_mfc_iter *it, loff_t pos)
+{
+	struct mr_table *mrt = it->mrt;
+	struct mr_mfc *mfc;
+
+	rcu_read_lock();
+	it->cache = &mrt->mfc_cache_list;
+	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+		if (pos-- == 0)
+			return mfc;	/* returns with the RCU read lock held */
+	rcu_read_unlock();
+	/* not in the resolved list: keep counting through the unresolved queue */
+	spin_lock_bh(it->lock);
+	it->cache = &mrt->mfc_unres_queue;
+	list_for_each_entry(mfc, it->cache, list)
+		if (pos-- == 0)
+			return mfc;	/* returns with it->lock held */
+	spin_unlock_bh(it->lock);
+	/* past the end: both locks are released and it->cache is cleared */
+	it->cache = NULL;
+	return NULL;
+}
+EXPORT_SYMBOL(mr_mfc_seq_idx);
+
+void *mr_mfc_seq_next(struct seq_file *seq, void *v,
+		      loff_t *pos)
+{
+	struct mr_mfc_iter *it = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt = it->mrt;
+	struct mr_mfc *c = v;
+
+	++*pos;
+
+	if (v == SEQ_START_TOKEN)
+		return mr_mfc_seq_idx(net, seq->private, 0);
+	/* more entries left in the list that is currently being walked */
+	if (c->list.next != it->cache)
+		return list_entry(c->list.next, struct mr_mfc, list);
+	/* the unresolved queue is the last list: nothing follows it */
+	if (it->cache == &mrt->mfc_unres_queue)
+		goto end_of_list;
+
+	/* resolved list exhausted: trade RCU for it->lock, show unresolved */
+	rcu_read_unlock();
+	it->cache = &mrt->mfc_unres_queue;
+
+	spin_lock_bh(it->lock);
+	if (!list_empty(it->cache))
+		return list_first_entry(it->cache, struct mr_mfc, list);
+
+end_of_list:
+	spin_unlock_bh(it->lock);
+	it->cache = NULL;
+
+	return NULL;
+}
+EXPORT_SYMBOL(mr_mfc_seq_next);
+#endif
+
+int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+		   struct mr_mfc *c, struct rtmsg *rtm)
+{
+	struct net_device *vif_dev;
+	struct rta_mfc_stats mfcs;
+	struct nlattr *mp_attr;
+	struct rtnexthop *nhp;
+	unsigned long lastuse;
+	int ct;
+
+	/* If cache is unresolved, don't try to parse IIF and OIF */
+	if (c->mfc_parent >= MAXVIFS) {
+		rtm->rtm_flags |= RTNH_F_UNRESOLVED;
+		return -ENOENT;
+	}
+	/* RTA_IIF is emitted only while the parent VIF has a device attached */
+	rcu_read_lock();
+	vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev);
+	if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) {
+		rcu_read_unlock();
+		return -EMSGSIZE;
+	}
+	rcu_read_unlock();
+
+	if (c->mfc_flags & MFC_OFFLOAD)
+		rtm->rtm_flags |= RTNH_F_OFFLOAD;
+	/* one rtnexthop per VIF in [minvif, maxvif) with a device and ttl < 255 */
+	mp_attr = nla_nest_start_noflag(skb, RTA_MULTIPATH);
+	if (!mp_attr)
+		return -EMSGSIZE;
+
+	rcu_read_lock();
+	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+		struct vif_device *vif = &mrt->vif_table[ct];
+
+		vif_dev = rcu_dereference(vif->dev);
+		if (vif_dev && c->mfc_un.res.ttls[ct] < 255) {
+
+			nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
+			if (!nhp) {
+				rcu_read_unlock();
+				nla_nest_cancel(skb, mp_attr);
+				return -EMSGSIZE;
+			}
+
+			nhp->rtnh_flags = 0;
+			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
+			nhp->rtnh_ifindex = vif_dev->ifindex;
+			nhp->rtnh_len = sizeof(*nhp);
+		}
+	}
+	rcu_read_unlock();
+
+	nla_nest_end(skb, mp_attr);
+	/* report time since last use; a stamp ahead of jiffies counts as 0 */
+	lastuse = READ_ONCE(c->mfc_un.res.lastuse);
+	lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
+
+	mfcs.mfcs_packets = c->mfc_un.res.pkt;
+	mfcs.mfcs_bytes = c->mfc_un.res.bytes;
+	mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
+	if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
+	    nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
+			      RTA_PAD))
+		return -EMSGSIZE;
+
+	rtm->rtm_type = RTN_MULTICAST;
+	return 1;	/* success is reported as 1, not 0 */
+}
+EXPORT_SYMBOL(mr_fill_mroute);
+
+/* True when a VIF in @c's [minvif, maxvif) range with ttl < 255 holds @dev */
+static bool mr_mfc_uses_dev(const struct mr_table *mrt,
+			    const struct mr_mfc *c,
+			    const struct net_device *dev)
+{
+	int i;
+
+	for (i = c->mfc_un.res.minvif; i < c->mfc_un.res.maxvif; i++) {
+		const struct net_device *cur;
+
+		cur = rcu_access_pointer(mrt->vif_table[i].dev);
+		if (!cur || c->mfc_un.res.ttls[i] >= 255)
+			continue;
+		if (cur == dev)
+			return true;
+	}
+	return false;
+}
+
+/* Dump one table's MFC entries into @skb: the resolved list first, then the
+ * unresolved queue under @lock. cb->args[1] is the entry to resume at.
+ */
+int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
+		  struct netlink_callback *cb,
+		  int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
+			      u32 portid, u32 seq, struct mr_mfc *c,
+			      int cmd, int flags),
+		  spinlock_t *lock, struct fib_dump_filter *filter)
+{
+	unsigned int e = 0, s_e = cb->args[1];
+	unsigned int flags = NLM_F_MULTI;
+	struct mr_mfc *mfc;
+	int err;
+
+	if (filter->filter_set)
+		flags |= NLM_F_DUMP_FILTERED;
+
+	/* NOTE: a caller may hold RTNL rather than rcu_read_lock(); tell lockdep */
+	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list, lockdep_rtnl_is_held()) {
+		if (e < s_e)
+			goto next_entry;
+		if (filter->dev && !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+			goto next_entry;
+		err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+			   cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+		if (err < 0)
+			goto out;
+next_entry:
+		e++;
+	}
+
+	spin_lock_bh(lock);
+	list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
+		if (e < s_e)
+			goto next_entry2;
+		if (filter->dev && !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+			goto next_entry2;
+		err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+			   cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+		if (err < 0) {
+			spin_unlock_bh(lock);
+			goto out;
+		}
+next_entry2:
+		e++;
+	}
+	spin_unlock_bh(lock);
+	err = 0;
+out:
+	cb->args[1] = e;
+	return err;
+}
+EXPORT_SYMBOL(mr_table_dump);
+
+int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
+		     struct mr_table *(*iter)(struct net *net,
+					      struct mr_table *mrt),
+		     int (*fill)(struct mr_table *mrt,
+				 struct sk_buff *skb,
+				 u32 portid, u32 seq, struct mr_mfc *c,
+				 int cmd, int flags),
+		     spinlock_t *lock, struct fib_dump_filter *filter)
+{
+	unsigned int t = 0, s_t = cb->args[0];
+	struct net *net = sock_net(skb->sk);
+	struct mr_table *mrt;
+	int err;
+
+	/* multicast does not track protocol or have route type other
+	 * than RTN_MULTICAST
+	 */
+	if (filter->filter_set) {
+		if (filter->protocol || filter->flags ||
+		    (filter->rt_type && filter->rt_type != RTN_MULTICAST))
+			return skb->len;
+	}
+	/* cb->args[0] is the table to resume at, cb->args[1] the entry in it */
+	rcu_read_lock();
+	for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
+		if (t < s_t)
+			goto next_table;
+
+		err = mr_table_dump(mrt, skb, cb, fill, lock, filter);
+		if (err < 0)
+			break;	/* t is not advanced: this table is resumed */
+		cb->args[1] = 0;	/* the next table starts at its first entry */
+next_table:
+		t++;
+	}
+	rcu_read_unlock();
+
+	cb->args[0] = t;
+
+	return skb->len;
+}
+EXPORT_SYMBOL(mr_rtm_dumproute);
+
+int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
+	    int (*rules_dump)(struct net *net,
+			      struct notifier_block *nb,
+			      struct netlink_ext_ack *extack),
+	    struct mr_table *(*mr_iter)(struct net *net,
+					struct mr_table *mrt),
+	    struct netlink_ext_ack *extack)
+{
+	struct mr_table *mrt;
+	int err;
+
+	err = rules_dump(net, nb, extack);
+	if (err)
+		return err;
+	/* then replay every table to @nb: first its VIFs, then its MFC entries */
+	for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) {
+		struct vif_device *v = &mrt->vif_table[0];
+		struct net_device *vif_dev;
+		struct mr_mfc *mfc;
+		int vifi;
+
+		/* Notify on table VIF entries */
+		rcu_read_lock();
+		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
+			vif_dev = rcu_dereference(v->dev);
+			if (!vif_dev)
+				continue;
+
+			err = mr_call_vif_notifier(nb, family,
+						   FIB_EVENT_VIF_ADD, v,
+						   vif_dev, vifi,
+						   mrt->id, extack);
+			if (err)
+				break;
+		}
+		rcu_read_unlock();
+
+		if (err)
+			return err;
+
+		/* Notify on table MFC entries */
+		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+			err = mr_call_mfc_notifier(nb, family,
+						   FIB_EVENT_ENTRY_ADD,
+						   mfc, mrt->id, extack);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(mr_dump);
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
new file mode 100644
index 0000000000..0e3ee15328
--- /dev/null
+++ b/net/ipv4/metrics.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/netlink.h>
+#include <linux/nospec.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/ip.h>
+#include <net/net_namespace.h>
+#include <net/tcp.h>
+
+static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
+			      int fc_mx_len, u32 *metrics,
+			      struct netlink_ext_ack *extack)
+{
+	bool ecn_ca = false;
+	struct nlattr *nla;
+	int remaining;
+
+	nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) {
+		int type = nla_type(nla);
+		u32 val;
+		/* type 0 is skipped, anything above RTAX_MAX is rejected */
+		if (!type)
+			continue;
+		if (type > RTAX_MAX) {
+			NL_SET_ERR_MSG(extack, "Invalid metric type");
+			return -EINVAL;
+		}
+		/* clamp the index against speculation before it reaches metrics[] */
+		type = array_index_nospec(type, RTAX_MAX + 1);
+		if (type == RTAX_CC_ALGO) {
+			char tmp[TCP_CA_NAME_MAX];
+
+			nla_strscpy(tmp, nla, sizeof(tmp));
+			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
+			if (val == TCP_CA_UNSPEC) {
+				NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm");
+				return -EINVAL;
+			}
+		} else {
+			if (nla_len(nla) != sizeof(u32)) {
+				NL_SET_ERR_MSG_ATTR(extack, nla,
+						    "Invalid attribute in metrics");
+				return -EINVAL;
+			}
+			val = nla_get_u32(nla);
+		}
+		if (type == RTAX_ADVMSS && val > 65535 - 40)
+			val = 65535 - 40;
+		if (type == RTAX_MTU && val > 65535 - 15)
+			val = 65535 - 15;
+		if (type == RTAX_HOPLIMIT && val > 255)
+			val = 255;
+		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) {
+			NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute");
+			return -EINVAL;
+		}
+		metrics[type - 1] = val;	/* metrics[] is indexed by type - 1 */
+	}
+	/* ecn_ca may have been set by tcp_ca_get_key_by_name() for RTAX_CC_ALGO */
+	if (ecn_ca)
+		metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+
+	return 0;
+}
+
+/* Build a dst_metrics from the netlink attributes at @fc_mx.
+ * A NULL @fc_mx yields the shared dst_default_metrics, not an allocation.
+ */
+struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
+					int fc_mx_len,
+					struct netlink_ext_ack *extack)
+{
+	struct dst_metrics *m;
+	int rc;
+
+	if (!fc_mx)
+		return (struct dst_metrics *)&dst_default_metrics;
+
+	m = kzalloc(sizeof(*m), GFP_KERNEL);
+	if (unlikely(!m))
+		return ERR_PTR(-ENOMEM);
+
+	rc = ip_metrics_convert(net, fc_mx, fc_mx_len, m->metrics, extack);
+	if (rc) {
+		kfree(m);
+		return ERR_PTR(rc);
+	}
+	refcount_set(&m->refcnt, 1);
+	return m;
+}
+EXPORT_SYMBOL_GPL(ip_fib_metrics_init);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
new file mode 100644
index 0000000000..bd13516548
--- /dev/null
+++ b/net/ipv4/netfilter.c
@@ -0,0 +1,95 @@
+/*
+ * IPv4 specific functions of netfilter core
+ *
+ * Rusty Russell (C) 2000 -- This code is GPL.
+ * Patrick McHardy (C) 2006-2012
+ */
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/netfilter/nf_queue.h>
+
+/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
+int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct rtable *rt;
+	struct flowi4 fl4 = {};
+	__be32 saddr = iph->saddr;
+	__u8 flags;
+	struct net_device *dev = skb_dst(skb)->dev;
+	struct flow_keys flkeys;
+	unsigned int hh_len;
+
+	sk = sk_to_full_sk(sk);
+	flags = sk ? inet_sk_flowi_flags(sk) : 0;
+	/* a local or unicast source is kept as-is; any other type is cleared */
+	if (addr_type == RTN_UNSPEC)
+		addr_type = inet_addr_type_dev_table(net, dev, saddr);
+	if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
+		flags |= FLOWI_FLAG_ANYSRC;
+	else
+		saddr = 0;
+
+	/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
+	 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
+	 */
+	fl4.daddr = iph->daddr;
+	fl4.saddr = saddr;
+	fl4.flowi4_tos = RT_TOS(iph->tos);
+	fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
+	fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev);
+	fl4.flowi4_mark = skb->mark;
+	fl4.flowi4_flags = flags;
+	fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys);
+	rt = ip_route_output_key(net, &fl4);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+
+	/* Drop old route. */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+	/* the lookup can succeed yet hand back a dst that carries an error */
+	if (skb_dst(skb)->error)
+		return skb_dst(skb)->error;
+
+#ifdef CONFIG_XFRM
+	if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+	    xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
+		struct dst_entry *dst = skb_dst(skb);
+		skb_dst_set(skb, NULL);
+		dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0);
+		if (IS_ERR(dst))
+			return PTR_ERR(dst);
+		skb_dst_set(skb, dst);
+	}
+#endif
+
+	/* Change in oif may mean change in hh_len. */
+	hh_len = skb_dst(skb)->dev->hard_header_len;
+	if (skb_headroom(skb) < hh_len &&
+	    pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
+				0, GFP_ATOMIC))
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(ip_route_me_harder);
+
+/* Output-route lookup for @fl; on success *dst is set to the route's dst */
+int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
+		bool strict __always_unused)
+{
+	struct rtable *route = ip_route_output_key(net, &fl->u.ip4);
+
+	if (!IS_ERR(route))
+		*dst = &route->dst;
+	return PTR_ERR_OR_ZERO(route);
+}
+EXPORT_SYMBOL_GPL(nf_ip_route);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
new file mode 100644
index 0000000000..f71a7e9a7d
--- /dev/null
+++ b/net/ipv4/netfilter/Kconfig
@@ -0,0 +1,344 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# IP netfilter configuration
+#
+
+menu "IP: Netfilter Configuration"
+	depends on INET && NETFILTER
+
+config NF_DEFRAG_IPV4
+	tristate
+	default n
+
+config NF_SOCKET_IPV4
+	tristate "IPv4 socket lookup support"
+	help
+	  This option enables the IPv4 socket lookup infrastructure. This
+	  is required by the {ip,nf}tables socket match.
+
+config NF_TPROXY_IPV4
+	tristate "IPv4 tproxy support"
+
+if NF_TABLES
+
+config NF_TABLES_IPV4
+	bool "IPv4 nf_tables support"
+	help
+	  This option enables the IPv4 support for nf_tables.
+
+if NF_TABLES_IPV4
+
+config NFT_REJECT_IPV4
+	select NF_REJECT_IPV4
+	default NFT_REJECT
+	tristate
+
+config NFT_DUP_IPV4
+	tristate "IPv4 nf_tables packet duplication support"
+	depends on !NF_CONNTRACK || NF_CONNTRACK
+	select NF_DUP_IPV4
+	help
+	  This module enables IPv4 packet duplication support for nf_tables.
+
+config NFT_FIB_IPV4
+	select NFT_FIB
+	tristate "nf_tables fib / ip route lookup support"
+	help
+	  This module enables IPv4 FIB lookups, e.g. for reverse path filtering.
+	  It also allows query of the FIB for the route type, e.g. local, unicast,
+	  multicast or blackhole.
+
+endif # NF_TABLES_IPV4
+
+config NF_TABLES_ARP
+	bool "ARP nf_tables support"
+	select NETFILTER_FAMILY_ARP
+	help
+	  This option enables the ARP support for nf_tables.
+
+endif # NF_TABLES
+
+config NF_DUP_IPV4
+	tristate "Netfilter IPv4 packet duplication to alternate destination"
+	depends on !NF_CONNTRACK || NF_CONNTRACK
+	help
+	  This option enables the nf_dup_ipv4 core, which duplicates an IPv4
+	  packet to be rerouted to another destination.
+
+config NF_LOG_ARP
+	tristate "ARP packet logging"
+	default m if NETFILTER_ADVANCED=n
+	select NF_LOG_SYSLOG
+	help
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG.
+
+config NF_LOG_IPV4
+	tristate "IPv4 packet logging"
+	default m if NETFILTER_ADVANCED=n
+	select NF_LOG_SYSLOG
+	help
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG.
+
+config NF_REJECT_IPV4
+	tristate "IPv4 packet rejection"
+	default m if NETFILTER_ADVANCED=n
+
+if NF_NAT
+config NF_NAT_SNMP_BASIC
+	tristate "Basic SNMP-ALG support"
+	depends on NF_CONNTRACK_SNMP
+	depends on NETFILTER_ADVANCED
+	default NF_NAT && NF_CONNTRACK_SNMP
+	select ASN1
+	help
+
+	  This module implements an Application Layer Gateway (ALG) for
+	  SNMP payloads.  In conjunction with NAT, it allows a network
+	  management system to access multiple private networks with
+	  conflicting addresses.  It works by modifying IP addresses
+	  inside SNMP payloads to match IP-layer NAT mapping.
+
+	  This is the "basic" form of SNMP-ALG, as described in RFC 2962
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_NAT_PPTP
+	tristate
+	depends on NF_CONNTRACK
+	default NF_CONNTRACK_PPTP
+
+config NF_NAT_H323
+	tristate
+	depends on NF_CONNTRACK
+	default NF_CONNTRACK_H323
+
+endif # NF_NAT
+
+config IP_NF_IPTABLES
+	tristate "IP tables support (required for filtering/masq/NAT)"
+	default m if NETFILTER_ADVANCED=n
+	select NETFILTER_XTABLES
+	help
+	  iptables is a general, extensible packet identification framework.
+	  The packet filtering and full NAT (masquerading, port forwarding,
+	  etc) subsystems now use this: say `Y' or `M' here if you want to use
+	  either of those.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+if IP_NF_IPTABLES
+
+# The matches.
+config IP_NF_MATCH_AH
+	tristate '"ah" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This match extension allows you to match a range of SPIs
+	  inside AH header of IPSec packets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_ECN
+	tristate '"ecn" match support'
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_MATCH_ECN
+	help
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_MATCH_ECN.
+
+config IP_NF_MATCH_RPFILTER
+	tristate '"rpfilter" reverse path filter match support'
+	depends on NETFILTER_ADVANCED
+	depends on IP_NF_MANGLE || IP_NF_RAW
+	help
+	  This option allows you to match packets whose replies would
+	  go out via the interface the packet came in.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+	  The module will be called ipt_rpfilter.
+
+config IP_NF_MATCH_TTL
+	tristate '"ttl" match support'
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_MATCH_HL
+	help
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_MATCH_HL.
+
+# `filter', generic and specific targets
+config IP_NF_FILTER
+	tristate "Packet filtering"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  Packet filtering defines a table `filter', which has a series of
+	  rules for simple packet filtering at local input, forwarding and
+	  local output.  See the man page for iptables(8).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_REJECT
+	tristate "REJECT target support"
+	depends on IP_NF_FILTER
+	select NF_REJECT_IPV4
+	default m if NETFILTER_ADVANCED=n
+	help
+	  The REJECT target allows a filtering rule to specify that an ICMP
+	  error should be issued in response to an incoming packet, rather
+	  than silently being dropped.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_SYNPROXY
+	tristate "SYNPROXY target support"
+	depends on NF_CONNTRACK && NETFILTER_ADVANCED
+	select NETFILTER_SYNPROXY
+	select SYN_COOKIES
+	help
+	  The SYNPROXY target allows you to intercept TCP connections and
+	  establish them using syncookies before they are passed on to the
+	  server. This avoids conntrack and server resource usage
+	  during SYN-flood attacks.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+# NAT + specific targets: nf_conntrack
+config IP_NF_NAT
+	tristate "iptables NAT support"
+	depends on NF_CONNTRACK
+	default m if NETFILTER_ADVANCED=n
+	select NF_NAT
+	select NETFILTER_XT_NAT
+	help
+	  This enables the `nat' table in iptables. This allows masquerading,
+	  port forwarding and other forms of full Network Address Port
+	  Translation.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+if IP_NF_NAT
+
+config IP_NF_TARGET_MASQUERADE
+	tristate "MASQUERADE target support"
+	select NETFILTER_XT_TARGET_MASQUERADE
+	help
+	  This is a backwards-compat option for the user's convenience
+	  (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE.
+
+config IP_NF_TARGET_NETMAP
+	tristate "NETMAP target support"
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_TARGET_NETMAP
+	help
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_TARGET_NETMAP.
+
+config IP_NF_TARGET_REDIRECT
+	tristate "REDIRECT target support"
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_TARGET_REDIRECT
+	help
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_TARGET_REDIRECT.
+
+endif # IP_NF_NAT
+
+# mangle + specific targets
+config IP_NF_MANGLE
+	tristate "Packet mangling"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  This option adds a `mangle' table to iptables: see the man page for
+	  iptables(8).  This table is used for various packet alterations
+	  which can affect how the packet is routed.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_ECN
+	tristate "ECN target support"
+	depends on IP_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `ECN' target, which can be used in the iptables mangle
+	  table.
+
+	  You can use this target to remove the ECN bits from the IPv4 header of
+	  an IP packet.  This is particularly useful, if you need to work around
+	  existing ECN blackholes on the internet, but don't want to disable
+	  ECN support in general.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_TTL
+	tristate '"TTL" target support'
+	depends on NETFILTER_ADVANCED && IP_NF_MANGLE
+	select NETFILTER_XT_TARGET_HL
+	help
+	This is a backwards-compatible option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_TARGET_HL.
+
+# raw + specific targets
+config IP_NF_RAW
+	tristate  'raw table support (required for NOTRACK/TRACE)'
+	help
+	  This option adds a `raw' table to iptables. This table is the very
+	  first in the netfilter framework and hooks in at the PREROUTING
+	  and OUTPUT chains.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.rst>.  If unsure, say `N'.
+
+# security table for MAC policy
+config IP_NF_SECURITY
+	tristate "Security table"
+	depends on SECURITY
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `security' table to iptables, for use
+	  with Mandatory Access Control (MAC) policy.
+
+	  If unsure, say N.
+
+endif # IP_NF_IPTABLES
+
+# ARP tables
+config IP_NF_ARPTABLES
+	tristate "ARP tables support"
+	select NETFILTER_XTABLES
+	select NETFILTER_FAMILY_ARP
+	depends on NETFILTER_ADVANCED
+	help
+	  arptables is a general, extensible packet identification framework.
+	  The ARP packet filtering and mangling (manipulation) subsystems
+	  use this: say Y or M here if you want to use either of those.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+if IP_NF_ARPTABLES
+
+config IP_NF_ARPFILTER
+	tristate "ARP packet filtering"
+	help
+	  ARP packet filtering defines a table `filter', which has a series of
+	  rules for simple ARP packet filtering at local input and
+	  local output.  On a bridge, you can also specify filtering rules
+	  for forwarded ARP packets. See the man page for arptables(8).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_ARP_MANGLE
+	tristate "ARP payload mangling"
+	help
+	  Allows altering the ARP packet payload: source and destination
+	  hardware and network addresses.
+
+endif # IP_NF_ARPTABLES
+
+endmenu
+
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
new file mode 100644
index 0000000000..5a26f9de1a
--- /dev/null
+++ b/net/ipv4/netfilter/Makefile
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the netfilter modules on top of IPv4.
+#
+
+# defrag
+obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
+
+obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o
+obj-$(CONFIG_NF_TPROXY_IPV4) += nf_tproxy_ipv4.o
+
+# reject
+obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o
+
+# NAT helpers (nf_conntrack)
+obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
+obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
+
+nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
+$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
+obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
+
+obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
+obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
+
+# generic IP tables
+obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
+
+# the table instances of ip_tables
+obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
+obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
+obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
+obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
+
+# matches
+obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
+obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
+
+# targets
+obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
+obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
+obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
+
+# generic ARP tables
+obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
+obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
+
+# just filtering instance of ARP tables for now
+obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
+
+obj-$(CONFIG_NF_DUP_IPV4) += nf_dup_ipv4.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
new file mode 100644
index 0000000000..2407066b0f
--- /dev/null
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -0,0 +1,1667 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Packet matching code for ARP packets.
+ *
+ * Based heavily, if not almost entirely, upon ip_tables.c framework.
+ *
+ * Some ARP specific bits are:
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (C) 2006-2009 Patrick McHardy <kaber@trash.net>
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/capability.h>
+#include <linux/if_arp.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/err.h>
+#include <net/compat.h>
+#include <net/sock.h>
+#include <linux/uaccess.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_arp/arp_tables.h>
+#include "../../netfilter/xt_repldata.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
+MODULE_DESCRIPTION("arptables core");
+
+void *arpt_alloc_initial_table(const struct xt_table *info)
+{
+	return xt_alloc_initial_table(arpt, ARPT); /* macro; presumably uses `info` implicitly - confirm */
+}
+EXPORT_SYMBOL_GPL(arpt_alloc_initial_table);
+
+static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
+				      const char *hdr_addr, int len)
+{
+	/* Returns 1 if any masked byte of hdr_addr differs from ap->addr, else 0. */
+	int diff = 0;
+	int idx = 0;
+
+	if (len > ARPT_DEV_ADDR_LEN_MAX)
+		len = ARPT_DEV_ADDR_LEN_MAX;	/* clamp the compared length */
+	for (; idx < len; idx++)
+		diff |= (ap->addr[idx] ^ hdr_addr[idx]) & ap->mask[idx];
+
+	return diff ? 1 : 0;
+}
+
+/*
+ * Unfortunately, _b and _mask are not aligned to an int (or long int).
+ * Some arches don't care; unrolling the loop is a win on them.
+ * For other arches, we only have a 16-bit alignment.
+ */
+static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask)
+{
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	unsigned long ret = ifname_compare_aligned(_a, _b, _mask);
+#else
+	unsigned long ret = 0;
+	const u16 *a = (const u16 *)_a;
+	const u16 *b = (const u16 *)_b;
+	const u16 *mask = (const u16 *)_mask;
+	int i;
+
+	for (i = 0; i < IFNAMSIZ/sizeof(u16); i++)
+		ret |= (a[i] ^ b[i]) & mask[i];	/* collect masked differences */
+#endif
+	return ret;	/* callers treat non-zero as "names differ under the mask" */
+}
+
+/* Returns 1 if the ARP packet matches the rule selector in arpinfo, 0 if not. */
+static inline int arp_packet_match(const struct arphdr *arphdr,
+				   struct net_device *dev,
+				   const char *indev,
+				   const char *outdev,
+				   const struct arpt_arp *arpinfo)
+{
+	const char *arpptr = (char *)(arphdr + 1); /* first byte after the fixed header */
+	const char *src_devaddr, *tgt_devaddr;
+	__be32 src_ipaddr, tgt_ipaddr;
+	long ret;
+
+	if (NF_INVF(arpinfo, ARPT_INV_ARPOP,
+		    (arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop))
+		return 0;
+
+	if (NF_INVF(arpinfo, ARPT_INV_ARPHRD,
+		    (arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd))
+		return 0;
+
+	if (NF_INVF(arpinfo, ARPT_INV_ARPPRO,
+		    (arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro))
+		return 0;
+
+	if (NF_INVF(arpinfo, ARPT_INV_ARPHLN,
+		    (arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln))
+		return 0;
+
+	src_devaddr = arpptr;	/* walk: src hw addr, src IP, tgt hw addr, tgt IP */
+	arpptr += dev->addr_len;
+	memcpy(&src_ipaddr, arpptr, sizeof(u32)); /* presumably memcpy because arpptr may be unaligned */
+	arpptr += sizeof(u32);
+	tgt_devaddr = arpptr;
+	arpptr += dev->addr_len;
+	memcpy(&tgt_ipaddr, arpptr, sizeof(u32));
+
+	if (NF_INVF(arpinfo, ARPT_INV_SRCDEVADDR,
+		    arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr,
+					dev->addr_len)) ||
+	    NF_INVF(arpinfo, ARPT_INV_TGTDEVADDR,
+		    arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr,
+					dev->addr_len)))
+		return 0;
+
+	if (NF_INVF(arpinfo, ARPT_INV_SRCIP,
+		    (src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr) ||
+	    NF_INVF(arpinfo, ARPT_INV_TGTIP,
+		    (tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr))
+		return 0;
+
+	/* Compare the in/out interface names under the rule's name masks. */
+	ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask);
+
+	if (NF_INVF(arpinfo, ARPT_INV_VIA_IN, ret != 0))
+		return 0;
+
+	ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask);
+
+	if (NF_INVF(arpinfo, ARPT_INV_VIA_OUT, ret != 0))
+		return 0;
+
+	return 1;	/* every selector field matched */
+}
+
+static inline int arp_checkentry(const struct arpt_arp *arp)
+{
+	/* An entry is valid (returns 1) only when neither field carries a bit
+	 * outside its known mask; anything else is rejected (returns 0). */
+	const bool stray_flags = (arp->flags & ~ARPT_F_MASK) != 0;
+	const bool stray_inv = (arp->invflags & ~ARPT_INV_MASK) != 0;
+
+	return (stray_flags || stray_inv) ? 0 : 1;
+}
+
+static unsigned int
+arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const char *errname = par->targinfo;	/* target payload, printed as text */
+
+	net_err_ratelimited("arp_tables: error: '%s'\n", errname);
+	return NF_DROP;
+}
+
+static inline const struct xt_entry_target *
+arpt_get_target_c(const struct arpt_entry *e)
+{
+	return arpt_get_target((struct arpt_entry *)e); /* const in, const out; cast is only for the callee */
+}
+
+static inline struct arpt_entry *
+get_entry(const void *base, unsigned int offset)
+{
+	return (struct arpt_entry *)(base + offset); /* offset is in bytes (void * arithmetic) */
+}
+
+static inline
+struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
+{
+	return (void *)entry + entry->next_offset; /* next_offset: bytes from this entry to the next */
+}
+
+unsigned int arpt_do_table(void *priv,
+			   struct sk_buff *skb,
+			   const struct nf_hook_state *state)
+{
+	const struct xt_table *table = priv;	/* hook private data is the table */
+	unsigned int hook = state->hook;
+	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+	unsigned int verdict = NF_DROP;
+	const struct arphdr *arp;
+	struct arpt_entry *e, **jumpstack;
+	const char *indev, *outdev;
+	const void *table_base;
+	unsigned int cpu, stackidx = 0;
+	const struct xt_table_info *private;
+	struct xt_action_param acpar;
+	unsigned int addend;
+
+	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
+		return NF_DROP;
+
+	indev = state->in ? state->in->name : nulldevname;	/* all-zero name if no device */
+	outdev = state->out ? state->out->name : nulldevname;
+
+	local_bh_disable();
+	addend = xt_write_recseq_begin();
+	private = READ_ONCE(table->private); /* Address dependency. */
+	cpu     = smp_processor_id();
+	table_base = private->entries;
+	jumpstack  = (struct arpt_entry **)private->jumpstack[cpu];
+
+	/* No TEE support for arptables, so no need to switch to alternate
+	 * stack.  All targets that reenter must return absolute verdicts.
+	 */
+	e = get_entry(table_base, private->hook_entry[hook]);
+
+	acpar.state   = state;
+	acpar.hotdrop = false;
+
+	arp = arp_hdr(skb);
+	do {
+		const struct xt_entry_target *t;
+		struct xt_counters *counter;
+
+		if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
+			e = arpt_next_entry(e);	/* no match: try the following rule */
+			continue;
+		}
+
+		counter = xt_get_this_cpu_counter(&e->counters);
+		ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1);
+
+		t = arpt_get_target_c(e);
+
+		/* Standard target?  (It has no ->target callback.) */
+		if (!t->u.kernel.target->target) {
+			int v;
+
+			v = ((struct xt_standard_target *)t)->verdict;
+			if (v < 0) {
+				/* Negative: absolute verdict, or RETURN (pop from stack). */
+				if (v != XT_RETURN) {
+					verdict = (unsigned int)(-v) - 1;
+					break;
+				}
+				if (stackidx == 0) {
+					e = get_entry(table_base,
+						      private->underflow[hook]);
+				} else {
+					e = jumpstack[--stackidx];
+					e = arpt_next_entry(e);
+				}
+				continue;
+			}
+			if (table_base + v
+			    != arpt_next_entry(e)) {
+				if (unlikely(stackidx >= private->stacksize)) {
+					verdict = NF_DROP;
+					break;
+				}
+				jumpstack[stackidx++] = e;	/* resume after e on RETURN */
+			}
+
+			e = get_entry(table_base, v);	/* v >= 0 is a byte offset into the table */
+			continue;
+		}
+
+		acpar.target   = t->u.kernel.target;
+		acpar.targinfo = t->data;
+		verdict = t->u.kernel.target->target(skb, &acpar);
+
+		if (verdict == XT_CONTINUE) {
+			/* Target might have changed stuff. */
+			arp = arp_hdr(skb);
+			e = arpt_next_entry(e);
+		} else {
+			/* Verdict */
+			break;
+		}
+	} while (!acpar.hotdrop);
+	xt_write_recseq_end(addend);
+	local_bh_enable();
+
+	if (acpar.hotdrop)
+		return NF_DROP;
+	else
+		return verdict;
+}
+
+/* Unconditional rule: target right after the header and an all-zero selector. */
+static inline bool unconditional(const struct arpt_entry *e)
+{
+	static const struct arpt_arp zero_arp;
+
+	return !(e->target_offset != sizeof(*e) ||
+		 memcmp(&zero_arp, &e->arp, sizeof(zero_arp)) != 0);
+}
+
+/* Figures out from what hook each rule can be called: returns 0 on a loop
+ * or an out-of-range jump/fallthrough, 1 otherwise.  Hook bitmask goes in comefrom.
+ */
+static int mark_source_chains(const struct xt_table_info *newinfo,
+			      unsigned int valid_hooks, void *entry0,
+			      unsigned int *offsets)
+{
+	unsigned int hook;
+
+	/* No recursion; use packet counter to save back ptrs (reset
+	 * to 0 as we leave), and comefrom to save source hook bitmask.
+	 */
+	for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
+		unsigned int pos = newinfo->hook_entry[hook];
+		struct arpt_entry *e = entry0 + pos;
+
+		if (!(valid_hooks & (1 << hook)))
+			continue;
+
+		/* Set initial back pointer. */
+		e->counters.pcnt = pos;
+
+		for (;;) {
+			const struct xt_standard_target *t
+				= (void *)arpt_get_target_c(e);
+			int visited = e->comefrom & (1 << hook);
+
+			if (e->comefrom & (1 << NF_ARP_NUMHOOKS))
+				return 0;	/* entry is already on the current path: loop */
+
+			e->comefrom
+				|= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
+
+			/* Unconditional return/END. */
+			if ((unconditional(e) &&
+			     (strcmp(t->target.u.user.name,
+				     XT_STANDARD_TARGET) == 0) &&
+			     t->verdict < 0) || visited) {
+				unsigned int oldpos, size;
+
+				/* Return: backtrack through the last
+				 * big jump.
+				 */
+				do {
+					e->comefrom ^= (1<<NF_ARP_NUMHOOKS);
+					oldpos = pos;
+					pos = e->counters.pcnt;
+					e->counters.pcnt = 0;
+
+					/* We're at the start. */
+					if (pos == oldpos)
+						goto next;
+
+					e = entry0 + pos;
+				} while (oldpos == pos + e->next_offset);
+
+				/* Move along one */
+				size = e->next_offset;
+				e = entry0 + pos + size;
+				if (pos + size >= newinfo->size)
+					return 0;
+				e->counters.pcnt = pos;
+				pos += size;
+			} else {
+				int newpos = t->verdict;
+
+				if (strcmp(t->target.u.user.name,
+					   XT_STANDARD_TARGET) == 0 &&
+				    newpos >= 0) {
+					/* This is a jump; chase it. */
+					if (!xt_find_jump_offset(offsets, newpos,
+								 newinfo->number))
+						return 0;
+				} else {
+					/* ... this is a fallthru */
+					newpos = pos + e->next_offset;
+					if (newpos >= newinfo->size)
+						return 0;
+				}
+				e = entry0 + newpos;
+				e->counters.pcnt = pos;	/* back pointer to where we came from */
+				pos = newpos;
+			}
+		}
+next:		;
+	}
+	return 1;
+}
+
+static int check_target(struct arpt_entry *e, struct net *net, const char *name)
+{
+	struct xt_entry_target *tgt = arpt_get_target(e);
+	struct xt_tgchk_param chk = {	/* check context for entry e's target */
+		.family    = NFPROTO_ARP,
+		.net       = net,
+		.table     = name,
+		.hook_mask = e->comefrom,
+		.entryinfo = e,
+		.target    = tgt->u.kernel.target,
+		.targinfo  = tgt->data,
+	};
+
+	return xt_check_target(&chk, tgt->u.target_size - sizeof(*tgt), 0, false);
+}
+
+static int
+find_check_entry(struct arpt_entry *e, struct net *net, const char *name,
+		 unsigned int size,	/* not used in this function */
+		 struct xt_percpu_counter_alloc_state *alloc_state)
+{
+	struct xt_entry_target *t;
+	struct xt_target *target;
+	int ret;
+
+	if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
+		return -ENOMEM;
+
+	t = arpt_get_target(e);
+	target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
+					t->u.user.revision);
+	if (IS_ERR(target)) {
+		ret = PTR_ERR(target);
+		goto out;	/* no target bound yet: only the counter to undo */
+	}
+	t->u.kernel.target = target;	/* user name/revision resolved to the kernel target */
+
+	ret = check_target(e, net, name);
+	if (ret)
+		goto err;
+	return 0;
+err:
+	module_put(t->u.kernel.target->me); /* presumably balances the ref taken by the lookup */
+out:
+	xt_percpu_counter_free(&e->counters);
+
+	return ret;
+}
+
+static bool check_underflow(const struct arpt_entry *e)
+{
+	const struct xt_standard_target *std;
+	unsigned int policy;
+
+	if (!unconditional(e))
+		return false;
+	std = (const struct xt_standard_target *)arpt_get_target_c(e);
+	if (strcmp(std->target.u.user.name, XT_STANDARD_TARGET) != 0)
+		return false;
+	/* Standard verdicts are stored as -(NF_xxx) - 1; undo that encoding. */
+	policy = -(unsigned int)std->verdict - 1;
+	return policy == NF_ACCEPT || policy == NF_DROP;
+}
+
+static inline int check_entry_size_and_hooks(struct arpt_entry *e,
+					     struct xt_table_info *newinfo,
+					     const unsigned char *base,
+					     const unsigned char *limit,
+					     const unsigned int *hook_entries,
+					     const unsigned int *underflows,
+					     unsigned int valid_hooks)
+{
+	unsigned int h;
+	int err;
+
+	if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct arpt_entry) >= limit ||
+	    (unsigned char *)e + e->next_offset > limit)
+		return -EINVAL;	/* misaligned, or the entry runs past limit */
+
+	if (e->next_offset
+	    < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target))
+		return -EINVAL;	/* too small for an entry header plus a target header */
+
+	if (!arp_checkentry(&e->arp))
+		return -EINVAL;
+
+	err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+				     e->next_offset);
+	if (err)
+		return err;
+
+	/* Record hook entry points / underflows that sit exactly at this entry. */
+	for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+		if (!(valid_hooks & (1 << h)))
+			continue;
+		if ((unsigned char *)e - base == hook_entries[h])
+			newinfo->hook_entry[h] = hook_entries[h];
+		if ((unsigned char *)e - base == underflows[h]) {
+			if (!check_underflow(e))
+				return -EINVAL;
+
+			newinfo->underflow[h] = underflows[h];
+		}
+	}
+
+	/* Clear counters and comefrom */
+	e->counters = ((struct xt_counters) { 0, 0 });
+	e->comefrom = 0;
+	return 0;
+}
+
+static void cleanup_entry(struct arpt_entry *e, struct net *net)
+{
+	struct xt_entry_target *tgt = arpt_get_target(e);
+	struct xt_tgdtor_param dtor;
+
+	/* Let the target tear down, then release what the entry holds. */
+	dtor.family   = NFPROTO_ARP;
+	dtor.net      = net;
+	dtor.targinfo = tgt->data;
+	dtor.target   = tgt->u.kernel.target;
+	if (dtor.target->destroy)
+		dtor.target->destroy(&dtor);
+	module_put(dtor.target->me);
+	xt_percpu_counter_free(&e->counters);
+}
+
+/* Checks and translates the user-supplied table segment (held in
+ * newinfo).  Returns 0 on success or a negative errno.
+ */
+static int translate_table(struct net *net,
+			   struct xt_table_info *newinfo,
+			   void *entry0,
+			   const struct arpt_replace *repl)
+{
+	struct xt_percpu_counter_alloc_state alloc_state = { 0 };
+	struct arpt_entry *iter;
+	unsigned int *offsets;
+	unsigned int i;
+	int ret = 0;
+
+	newinfo->size = repl->size;
+	newinfo->number = repl->num_entries;
+
+	/* Init all hooks to impossible value. */
+	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = 0xFFFFFFFF;
+		newinfo->underflow[i] = 0xFFFFFFFF;
+	}
+
+	offsets = xt_alloc_entry_offsets(newinfo->number);
+	if (!offsets)
+		return -ENOMEM;
+	i = 0;
+
+	/* Walk through entries, checking offsets. */
+	xt_entry_foreach(iter, entry0, newinfo->size) {
+		ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+						 entry0 + repl->size,
+						 repl->hook_entry,
+						 repl->underflow,
+						 repl->valid_hooks);
+		if (ret != 0)
+			goto out_free;
+		if (i < repl->num_entries)
+			offsets[i] = (void *)iter - entry0;	/* byte offset of entry i */
+		++i;
+		if (strcmp(arpt_get_target(iter)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;	/* one jump-stack slot per ERROR target */
+	}
+
+	ret = -EINVAL;
+	if (i != repl->num_entries)
+		goto out_free;	/* entries walked != entries claimed by userspace */
+
+	ret = xt_check_table_hooks(newinfo, repl->valid_hooks);
+	if (ret)
+		goto out_free;
+
+	if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
+		ret = -ELOOP;
+		goto out_free;
+	}
+	kvfree(offsets);
+
+	/* Finally, each sanity check must pass */
+	i = 0;
+	xt_entry_foreach(iter, entry0, newinfo->size) {
+		ret = find_check_entry(iter, net, repl->name, repl->size,
+				       &alloc_state);
+		if (ret != 0)
+			break;
+		++i;	/* i counts the entries that passed find_check_entry() */
+	}
+
+	if (ret != 0) {
+		xt_entry_foreach(iter, entry0, newinfo->size) {
+			if (i-- == 0)
+				break;
+			cleanup_entry(iter, net);	/* undo only the entries that passed */
+		}
+		return ret;
+	}
+
+	return ret;
+ out_free:
+	kvfree(offsets);
+	return ret;
+}
+
+static void get_counters(const struct xt_table_info *t,
+			 struct xt_counters counters[])
+{
+	struct arpt_entry *iter;
+	unsigned int cpu;
+	unsigned int i;
+
+	for_each_possible_cpu(cpu) {
+		seqcount_t *s = &per_cpu(xt_recseq, cpu);	/* this CPU's xt_recseq */
+
+		i = 0;	/* counters[] is indexed by entry position, restarted per CPU */
+		xt_entry_foreach(iter, t->entries, t->size) {
+			struct xt_counters *tmp;
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+			do {
+				start = read_seqcount_begin(s);
+				bcnt = tmp->bcnt;
+				pcnt = tmp->pcnt;
+			} while (read_seqcount_retry(s, start));	/* re-read if the sequence moved */
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
+			++i;
+			cond_resched();
+		}
+	}
+}
+
+static void get_old_counters(const struct xt_table_info *t,
+			     struct xt_counters counters[])
+{
+	struct arpt_entry *entry;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {	/* fold every CPU's counters into counters[] */
+		struct xt_counters *sum = counters;
+
+		xt_entry_foreach(entry, t->entries, t->size) {
+			const struct xt_counters *pc =
+				xt_get_per_cpu_counter(&entry->counters, cpu);
+			ADD_COUNTER(*sum, pc->bcnt, pc->pcnt);
+			++sum;
+		}
+		cond_resched();
+	}
+}
+
+static struct xt_counters *alloc_counters(const struct xt_table *table)
+{
+	const struct xt_table_info *info = table->private;
+	struct xt_counters *snap;
+	unsigned int nbytes;
+
+	/* Userspace wants an atomic snapshot of the counters; nothing else
+	 * in the table changes (apart from comefrom, which userspace does
+	 * not care about).  vzalloc() hands back zeroed memory for
+	 * get_counters() to fill.
+	 */
+	nbytes = sizeof(*snap) * info->number;
+	snap = vzalloc(nbytes);
+	if (!snap)
+		return ERR_PTR(-ENOMEM);
+
+	get_counters(info, snap);
+
+	return snap;
+}
+
+static int copy_entries_to_user(unsigned int total_size,
+				const struct xt_table *table,
+				void __user *userptr)
+{
+	unsigned int off, num;
+	const struct arpt_entry *e;
+	struct xt_counters *counters;
+	struct xt_table_info *private = table->private;
+	int ret = 0;
+	void *loc_cpu_entry;
+
+	counters = alloc_counters(table);	/* snapshot, freed at free_counters */
+	if (IS_ERR(counters))
+		return PTR_ERR(counters);
+
+	loc_cpu_entry = private->entries;
+
+	/* FIXME: use iterator macros --RR */
+	/* For each entry: copy the header, then fix up its counters and target. */
+	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+		const struct xt_entry_target *t;
+
+		e = loc_cpu_entry + off;
+		if (copy_to_user(userptr + off, e, sizeof(*e))) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+		if (copy_to_user(userptr + off
+				 + offsetof(struct arpt_entry, counters),
+				 &counters[num],
+				 sizeof(counters[num])) != 0) {
+			ret = -EFAULT;	/* could not overwrite counters with the snapshot */
+			goto free_counters;
+		}
+
+		t = arpt_get_target_c(e);
+		if (xt_target_to_user(t, userptr + off + e->target_offset)) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+	}
+
+ free_counters:
+	vfree(counters);
+	return ret;	/* 0 on success, -EFAULT if any user copy failed */
+}
+
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+static void compat_standard_from_user(void *dst, const void *src)
+{
+	int verdict = *(compat_int_t *)src;
+
+	/* Positive verdicts are jump offsets; add the compat jump delta to them. */
+	verdict += (verdict > 0) ? xt_compat_calc_jump(NFPROTO_ARP, verdict) : 0;
+	memcpy(dst, &verdict, sizeof(verdict));
+}
+
+static int compat_standard_to_user(void __user *dst, const void *src)
+{
+	compat_int_t verdict = *(int *)src;
+
+	/* Positive verdicts are jump offsets; subtract the compat jump delta. */
+	verdict -= (verdict > 0) ? xt_compat_calc_jump(NFPROTO_ARP, verdict) : 0;
+	return copy_to_user(dst, &verdict, sizeof(verdict)) != 0 ? -EFAULT : 0;
+}
+
+static int compat_calc_entry(const struct arpt_entry *e,
+			     const struct xt_table_info *info,
+			     const void *base, struct xt_table_info *newinfo)
+{
+	const struct xt_entry_target *t;
+	unsigned int entry_offset;
+	int off, i, ret;
+
+	off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+	entry_offset = (void *)e - base;	/* byte offset of e within the table */
+
+	t = arpt_get_target_c(e);
+	off += xt_compat_target_offset(t->u.kernel.target);
+	newinfo->size -= off;	/* off: native-vs-compat size delta for this entry */
+	ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+		if (info->hook_entry[i] &&
+		    (e < (struct arpt_entry *)(base + info->hook_entry[i])))
+			newinfo->hook_entry[i] -= off;	/* hook lies after e: shift it down */
+		if (info->underflow[i] &&
+		    (e < (struct arpt_entry *)(base + info->underflow[i])))
+			newinfo->underflow[i] -= off;
+	}
+	return 0;
+}
+
+static int compat_table_info(const struct xt_table_info *info,
+			     struct xt_table_info *newinfo)
+{
+	struct arpt_entry *entry;
+	int err;
+
+	if (!info || !newinfo)
+		return -EINVAL;
+
+	/* Copy just the header fields that precede ->entries; the rule blob
+	 * itself is not needed in newinfo.
+	 */
+	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+	newinfo->initial_entries = 0;
+	err = xt_compat_init_offsets(NFPROTO_ARP, info->number);
+	if (err)
+		return err;
+	xt_entry_foreach(entry, info->entries, info->size) {
+		err = compat_calc_entry(entry, info, info->entries, newinfo);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+#endif
+
+/*
+ * ARPT_SO_GET_INFO: report valid hooks, hook entry points, underflows, entry
+ * count and blob size of the table whose name userspace placed at the start
+ * of @user.
+ *
+ * @len must point to exactly sizeof(struct arpt_getinfo).  For a compat
+ * caller the figures are first converted with compat_table_info(); if that
+ * conversion fails, its error is returned instead of copying a partially
+ * converted description to userspace.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int get_info(struct net *net, void __user *user, const int *len)
+{
+	char name[XT_TABLE_MAXNAMELEN];
+	struct xt_table *t;
+	int ret;
+
+	if (*len != sizeof(struct arpt_getinfo))
+		return -EINVAL;
+
+	if (copy_from_user(name, user, sizeof(name)) != 0)
+		return -EFAULT;
+
+	/* The name comes from userspace; force termination before using it. */
+	name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+	if (in_compat_syscall())
+		xt_compat_lock(NFPROTO_ARP);
+#endif
+	t = xt_request_find_table_lock(net, NFPROTO_ARP, name);
+	if (!IS_ERR(t)) {
+		struct arpt_getinfo info;
+		const struct xt_table_info *private = t->private;
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+		struct xt_table_info tmp;
+#endif
+
+		ret = 0;
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+		if (in_compat_syscall()) {
+			ret = compat_table_info(private, &tmp);
+			xt_compat_flush_offsets(NFPROTO_ARP);
+			private = &tmp;
+		}
+#endif
+		/* Only report the table when the (compat) view is complete. */
+		if (!ret) {
+			memset(&info, 0, sizeof(info));
+			info.valid_hooks = t->valid_hooks;
+			memcpy(info.hook_entry, private->hook_entry,
+			       sizeof(info.hook_entry));
+			memcpy(info.underflow, private->underflow,
+			       sizeof(info.underflow));
+			info.num_entries = private->number;
+			info.size = private->size;
+			strcpy(info.name, name);
+
+			if (copy_to_user(user, &info, *len) != 0)
+				ret = -EFAULT;
+		}
+		xt_table_unlock(t);
+		module_put(t->me);
+	} else
+		ret = PTR_ERR(t);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+	if (in_compat_syscall())
+		xt_compat_unlock(NFPROTO_ARP);
+#endif
+	return ret;
+}
+
+/*
+ * ARPT_SO_GET_ENTRIES for native callers: copy the rule blob of the table
+ * named in the user-supplied header to @uptr->entrytable.
+ *
+ * *@len must equal the header size plus the size announced in the header,
+ * and that size must match the table's current size (-EAGAIN otherwise).
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
+		       const int *len)
+{
+	const struct xt_table_info *private;
+	struct arpt_get_entries get;
+	struct xt_table *t;
+	int ret;
+
+	if (*len < sizeof(get))
+		return -EINVAL;
+	if (copy_from_user(&get, uptr, sizeof(get)))
+		return -EFAULT;
+	if (*len != sizeof(struct arpt_get_entries) + get.size)
+		return -EINVAL;
+
+	get.name[sizeof(get.name) - 1] = '\0';
+
+	t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
+	private = t->private;
+	if (get.size != private->size)
+		ret = -EAGAIN;
+	else
+		ret = copy_entries_to_user(private->size, t, uptr->entrytable);
+
+	module_put(t->me);
+	xt_table_unlock(t);
+	return ret;
+}
+
+/*
+ * Install @newinfo as the rule set of the ARP table called @name.
+ *
+ * @valid_hooks must match the hooks of the existing table.  On success the
+ * previous xt_table_info has its entries cleaned up and is freed here, and
+ * its counters (@num_counters elements) are copied to @counters_ptr; a
+ * failure of that final copy is only logged.
+ *
+ * On failure a negative errno is returned and @newinfo is not freed by this
+ * function.
+ */
+static int __do_replace(struct net *net, const char *name,
+			unsigned int valid_hooks,
+			struct xt_table_info *newinfo,
+			unsigned int num_counters,
+			void __user *counters_ptr)
+{
+	int ret;
+	struct xt_table *t;
+	struct xt_table_info *oldinfo;
+	struct xt_counters *counters;
+	void *loc_cpu_old_entry;
+	struct arpt_entry *iter;
+
+	ret = 0;
+	/* Buffer that will receive the counters of the table being replaced. */
+	counters = xt_counters_alloc(num_counters);
+	if (!counters) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	t = xt_request_find_table_lock(net, NFPROTO_ARP, name);
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
+		goto free_newinfo_counters_untrans;
+	}
+
+	/* You lied! */
+	if (valid_hooks != t->valid_hooks) {
+		ret = -EINVAL;
+		goto put_module;
+	}
+
+	/* On failure xt_replace_table() returns NULL and reports via &ret. */
+	oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
+	if (!oldinfo)
+		goto put_module;
+
+	/* Update module usage count based on number of rules */
+	if ((oldinfo->number > oldinfo->initial_entries) ||
+	    (newinfo->number <= oldinfo->initial_entries))
+		module_put(t->me);
+	if ((oldinfo->number > oldinfo->initial_entries) &&
+	    (newinfo->number <= oldinfo->initial_entries))
+		module_put(t->me);
+
+	xt_table_unlock(t);
+
+	get_old_counters(oldinfo, counters);
+
+	/* Decrease module usage counts and free resource */
+	loc_cpu_old_entry = oldinfo->entries;
+	xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+		cleanup_entry(iter, net);
+
+	xt_free_table_info(oldinfo);
+	if (copy_to_user(counters_ptr, counters,
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/* Silent error, can't fail, new table is already in place */
+		net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
+	}
+	vfree(counters);
+	return ret;
+
+	/* Error unwinding: each label undoes one more acquisition step. */
+ put_module:
+	module_put(t->me);
+	xt_table_unlock(t);
+ free_newinfo_counters_untrans:
+	vfree(counters);
+ out:
+	return ret;
+}
+
+/*
+ * ARPT_SO_SET_REPLACE for native callers.
+ *
+ * @arg holds a struct arpt_replace header immediately followed by
+ * header.size bytes of rule entries; @len is the size of that buffer as
+ * given to setsockopt().  @len is checked against both parts before they are
+ * copied, so a short buffer is rejected with -EINVAL instead of being read
+ * past its end.
+ *
+ * The entries are copied into a fresh xt_table_info, validated by
+ * translate_table() and installed by __do_replace().
+ *
+ * Returns 0 on success or a negative errno; on failure the new table info
+ * is released here.
+ */
+static int do_replace(struct net *net, sockptr_t arg, unsigned int len)
+{
+	int ret;
+	struct arpt_replace tmp;
+	struct xt_table_info *newinfo;
+	void *loc_cpu_entry;
+	struct arpt_entry *iter;
+
+	/* The buffer must at least contain the fixed header. */
+	if (len < sizeof(tmp))
+		return -EINVAL;
+	if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
+		return -EFAULT;
+
+	/* overflow check */
+	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+		return -ENOMEM;
+	if (tmp.num_counters == 0)
+		return -EINVAL;
+	/* ... and the entry blob the header announces (64-bit: no wrap). */
+	if ((u64)len < (u64)tmp.size + sizeof(tmp))
+		return -EINVAL;
+
+	tmp.name[sizeof(tmp.name)-1] = 0;
+
+	newinfo = xt_alloc_table_info(tmp.size);
+	if (!newinfo)
+		return -ENOMEM;
+
+	loc_cpu_entry = newinfo->entries;
+	if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+			tmp.size) != 0) {
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+
+	ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
+	if (ret != 0)
+		goto free_newinfo;
+
+	ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+			   tmp.num_counters, tmp.counters);
+	if (ret)
+		goto free_newinfo_untrans;
+	return 0;
+
+	/* Translated entries need cleanup_entry() before the blob is freed. */
+ free_newinfo_untrans:
+	xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+		cleanup_entry(iter, net);
+ free_newinfo:
+	xt_free_table_info(newinfo);
+	return ret;
+}
+
+/*
+ * ARPT_SO_SET_ADD_COUNTERS: add the user-supplied byte/packet counts to the
+ * counters of every entry of the named table.
+ *
+ * The number of supplied counters must equal the number of entries in the
+ * table, otherwise -EINVAL is returned.  Returns 0 on success or a negative
+ * errno.
+ */
+static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
+{
+	const struct xt_table_info *private;
+	struct xt_counters_info tmp;
+	struct xt_counters *paddc;
+	struct arpt_entry *iter;
+	struct xt_table *t;
+	unsigned int addend;
+	unsigned int i = 0;
+	int ret = 0;
+
+	paddc = xt_copy_counters(arg, len, &tmp);
+	if (IS_ERR(paddc))
+		return PTR_ERR(paddc);
+
+	t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name);
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
+		goto free;
+	}
+
+	local_bh_disable();
+	private = t->private;
+	if (private->number != tmp.num_counters) {
+		ret = -EINVAL;
+		goto unlock_up_free;
+	}
+
+	addend = xt_write_recseq_begin();
+	xt_entry_foreach(iter, private->entries, private->size) {
+		struct xt_counters *cpu_ctr;
+
+		cpu_ctr = xt_get_this_cpu_counter(&iter->counters);
+		ADD_COUNTER(*cpu_ctr, paddc[i].bcnt, paddc[i].pcnt);
+		i++;
+	}
+	xt_write_recseq_end(addend);
+ unlock_up_free:
+	local_bh_enable();
+	xt_table_unlock(t);
+	module_put(t->me);
+ free:
+	vfree(paddc);
+
+	return ret;
+}
+
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+/*
+ * Header of the ARPT_SO_SET_REPLACE payload as read by compat_do_replace():
+ * fixed-width u32 fields and a compat_uptr_t counters pointer, followed by
+ * the compat rule entries.
+ */
+struct compat_arpt_replace {
+	char				name[XT_TABLE_MAXNAMELEN];
+	u32				valid_hooks;
+	u32				num_entries;
+	u32				size;
+	u32				hook_entry[NF_ARP_NUMHOOKS];
+	u32				underflow[NF_ARP_NUMHOOKS];
+	u32				num_counters;
+	compat_uptr_t			counters;
+	struct compat_arpt_entry	entries[];
+};
+
+/* Drop the module reference held on the target of compat entry @e. */
+static inline void compat_release_entry(struct compat_arpt_entry *e)
+{
+	struct xt_entry_target *t = compat_arpt_get_target(e);
+
+	module_put(t->u.kernel.target->me);
+}
+
+/*
+ * Validate one compat entry and resolve its target.
+ *
+ * @e must be suitably aligned and lie, together with its next_offset, within
+ * [@base, @limit); it must be large enough for an entry header plus a target
+ * header and pass arp_checkentry() and xt_compat_check_entry_offsets().
+ *
+ * The target named in the entry is looked up with xt_request_find_target()
+ * and stored in t->u.kernel.target.  The number of bytes by which the entry
+ * grows in native form is added to *@size and recorded with
+ * xt_compat_add_offset().
+ *
+ * @newinfo is not used by this function.
+ *
+ * Returns 0 on success or a negative errno.  The module reference on the
+ * target is dropped again if xt_compat_add_offset() fails; on success it is
+ * kept.
+ */
+static int
+check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
+				  struct xt_table_info *newinfo,
+				  unsigned int *size,
+				  const unsigned char *base,
+				  const unsigned char *limit)
+{
+	struct xt_entry_target *t;
+	struct xt_target *target;
+	unsigned int entry_offset;
+	int ret, off;
+
+	/* Alignment and bounds of the entry itself. */
+	if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit ||
+	    (unsigned char *)e + e->next_offset > limit)
+		return -EINVAL;
+
+	/* There must be room for at least a target header. */
+	if (e->next_offset < sizeof(struct compat_arpt_entry) +
+			     sizeof(struct compat_xt_entry_target))
+		return -EINVAL;
+
+	if (!arp_checkentry(&e->arp))
+		return -EINVAL;
+
+	ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset,
+					    e->next_offset);
+	if (ret)
+		return ret;
+
+	/* Growth of the entry header when converted to the native layout. */
+	off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+	entry_offset = (void *)e - (void *)base;
+
+	t = compat_arpt_get_target(e);
+	target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
+					t->u.user.revision);
+	if (IS_ERR(target)) {
+		ret = PTR_ERR(target);
+		goto out;
+	}
+	t->u.kernel.target = target;
+
+	/* Add the growth of the target payload as well. */
+	off += xt_compat_target_offset(target);
+	*size += off;
+	ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
+	if (ret)
+		goto release_target;
+
+	return 0;
+
+release_target:
+	module_put(t->u.kernel.target->me);
+out:
+	return ret;
+}
+
+/*
+ * Convert compat entry @e into a native entry written at *@dstptr.
+ *
+ * *@dstptr is advanced past the converted entry and *@size is increased by
+ * the number of bytes the entry grew.  The target and next offsets of the
+ * new entry are rewritten for the native layout, and every hook entry and
+ * underflow offset in @newinfo that lies behind the new entry (relative to
+ * @base) is shifted by the same growth.
+ */
+static void
+compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
+			    unsigned int *size,
+			    struct xt_table_info *newinfo, unsigned char *base)
+{
+	struct xt_entry_target *t;
+	struct arpt_entry *de;
+	unsigned int origsize;
+	int h;
+
+	origsize = *size;
+	de = *dstptr;
+	/* Copy the header, then place the counters at their native offset. */
+	memcpy(de, e, sizeof(struct arpt_entry));
+	memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+	*dstptr += sizeof(struct arpt_entry);
+	*size += sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+
+	/* origsize - *size is the (wrapped) negative growth so far. */
+	de->target_offset = e->target_offset - (origsize - *size);
+	t = compat_arpt_get_target(e);
+	xt_compat_target_from_user(t, dstptr, size);
+
+	de->next_offset = e->next_offset - (origsize - *size);
+	for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+		if ((unsigned char *)de - base < newinfo->hook_entry[h])
+			newinfo->hook_entry[h] -= origsize - *size;
+		if ((unsigned char *)de - base < newinfo->underflow[h])
+			newinfo->underflow[h] -= origsize - *size;
+	}
+}
+
+/*
+ * Convert a compat rule blob into a native, translated table.
+ *
+ * On entry *@pinfo / *@pentry0 describe the table info holding the compat
+ * entries copied from userspace and @compatr is the compat header.  Every
+ * entry is checked with check_compat_entry_size_and_hooks(), a native-sized
+ * table info is allocated, the entries are converted into it and the result
+ * is run through translate_table().
+ *
+ * On success *@pinfo and *@pentry0 are switched to the new table and the
+ * original table info is freed.  On failure a negative errno is returned and
+ * *@pinfo is left pointing at the original table info.
+ */
+static int translate_compat_table(struct net *net,
+				  struct xt_table_info **pinfo,
+				  void **pentry0,
+				  const struct compat_arpt_replace *compatr)
+{
+	unsigned int i, j;
+	struct xt_table_info *newinfo, *info;
+	void *pos, *entry0, *entry1;
+	struct compat_arpt_entry *iter0;
+	struct arpt_replace repl;
+	unsigned int size;
+	int ret;
+
+	info = *pinfo;
+	entry0 = *pentry0;
+	size = compatr->size;
+	info->number = compatr->num_entries;
+
+	/* j counts the entries that passed the check (and hold a target ref). */
+	j = 0;
+	xt_compat_lock(NFPROTO_ARP);
+	ret = xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries);
+	if (ret)
+		goto out_unlock;
+	/* Walk through entries, checking offsets. */
+	xt_entry_foreach(iter0, entry0, compatr->size) {
+		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+							entry0,
+							entry0 + compatr->size);
+		if (ret != 0)
+			goto out_unlock;
+		++j;
+	}
+
+	ret = -EINVAL;
+	if (j != compatr->num_entries)
+		goto out_unlock;
+
+	/* size now holds the native size accumulated by the checks above. */
+	ret = -ENOMEM;
+	newinfo = xt_alloc_table_info(size);
+	if (!newinfo)
+		goto out_unlock;
+
+	memset(newinfo->entries, 0, size);
+
+	newinfo->number = compatr->num_entries;
+	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = compatr->hook_entry[i];
+		newinfo->underflow[i] = compatr->underflow[i];
+	}
+	entry1 = newinfo->entries;
+	pos = entry1;
+	size = compatr->size;
+	xt_entry_foreach(iter0, entry0, compatr->size)
+		compat_copy_entry_from_user(iter0, &pos, &size,
+					    newinfo, entry1);
+
+	/* all module references in entry0 are now gone */
+
+	xt_compat_flush_offsets(NFPROTO_ARP);
+	xt_compat_unlock(NFPROTO_ARP);
+
+	/* Build a native replace header describing the converted table. */
+	memcpy(&repl, compatr, sizeof(*compatr));
+
+	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+		repl.hook_entry[i] = newinfo->hook_entry[i];
+		repl.underflow[i] = newinfo->underflow[i];
+	}
+
+	repl.num_counters = 0;
+	repl.counters = NULL;
+	repl.size = newinfo->size;
+	ret = translate_table(net, newinfo, entry1, &repl);
+	if (ret)
+		goto free_newinfo;
+
+	*pinfo = newinfo;
+	*pentry0 = entry1;
+	xt_free_table_info(info);
+	return 0;
+
+free_newinfo:
+	xt_free_table_info(newinfo);
+	return ret;
+out_unlock:
+	xt_compat_flush_offsets(NFPROTO_ARP);
+	xt_compat_unlock(NFPROTO_ARP);
+	/* Release the target references of the first j (checked) entries. */
+	xt_entry_foreach(iter0, entry0, compatr->size) {
+		if (j-- == 0)
+			break;
+		compat_release_entry(iter0);
+	}
+	return ret;
+}
+
+/*
+ * ARPT_SO_SET_REPLACE for compat callers.
+ *
+ * @arg holds a struct compat_arpt_replace header immediately followed by
+ * header.size bytes of compat rule entries; @len is the size of that buffer
+ * as given to setsockopt().  @len is checked against both parts before they
+ * are copied, so a short buffer is rejected with -EINVAL instead of being
+ * read past its end.
+ *
+ * The entries are converted and validated by translate_compat_table() and
+ * installed by __do_replace().
+ *
+ * Returns 0 on success or a negative errno; on failure the table info is
+ * released here.
+ */
+static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
+{
+	int ret;
+	struct compat_arpt_replace tmp;
+	struct xt_table_info *newinfo;
+	void *loc_cpu_entry;
+	struct arpt_entry *iter;
+
+	/* The buffer must at least contain the fixed header. */
+	if (len < sizeof(tmp))
+		return -EINVAL;
+	if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
+		return -EFAULT;
+
+	/* overflow check */
+	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+		return -ENOMEM;
+	if (tmp.num_counters == 0)
+		return -EINVAL;
+	/* ... and the entry blob the header announces (64-bit: no wrap). */
+	if ((u64)len < (u64)tmp.size + sizeof(tmp))
+		return -EINVAL;
+
+	tmp.name[sizeof(tmp.name)-1] = 0;
+
+	newinfo = xt_alloc_table_info(tmp.size);
+	if (!newinfo)
+		return -ENOMEM;
+
+	loc_cpu_entry = newinfo->entries;
+	if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+			tmp.size) != 0) {
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+
+	/* May replace newinfo/loc_cpu_entry with the native-sized table. */
+	ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp);
+	if (ret != 0)
+		goto free_newinfo;
+
+	ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+			   tmp.num_counters, compat_ptr(tmp.counters));
+	if (ret)
+		goto free_newinfo_untrans;
+	return 0;
+
+	/* Translated entries need cleanup_entry() before the blob is freed. */
+ free_newinfo_untrans:
+	xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+		cleanup_entry(iter, net);
+ free_newinfo:
+	xt_free_table_info(newinfo);
+	return ret;
+}
+
+/*
+ * Write native entry @e to userspace at *@dstptr in compat layout.
+ *
+ * The entry header and counters[@i] are copied first, then the target is
+ * converted with xt_compat_target_to_user(), and finally the target and next
+ * offsets of the user copy are overwritten with values adjusted for the
+ * bytes saved so far.  *@dstptr is advanced and *@size reduced accordingly.
+ *
+ * Returns 0 on success, -EFAULT on a failed user access, or the error from
+ * xt_compat_target_to_user().
+ */
+static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
+				     compat_uint_t *size,
+				     struct xt_counters *counters,
+				     unsigned int i)
+{
+	struct xt_entry_target *t;
+	struct compat_arpt_entry __user *ce;
+	u_int16_t target_offset, next_offset;
+	compat_uint_t origsize;
+	int ret;
+
+	origsize = *size;
+	ce = *dstptr;
+	if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 ||
+	    copy_to_user(&ce->counters, &counters[i],
+	    sizeof(counters[i])) != 0)
+		return -EFAULT;
+
+	*dstptr += sizeof(struct compat_arpt_entry);
+	*size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+
+	/* origsize - *size is the number of bytes saved up to this point. */
+	target_offset = e->target_offset - (origsize - *size);
+
+	t = arpt_get_target(e);
+	ret = xt_compat_target_to_user(t, dstptr, size);
+	if (ret)
+		return ret;
+	next_offset = e->next_offset - (origsize - *size);
+	if (put_user(target_offset, &ce->target_offset) != 0 ||
+	    put_user(next_offset, &ce->next_offset) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Copy every entry of @table to @userptr in compat layout, together with a
+ * snapshot of the counters obtained from alloc_counters().
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int compat_copy_entries_to_user(unsigned int total_size,
+				       struct xt_table *table,
+				       void __user *userptr)
+{
+	const struct xt_table_info *private = table->private;
+	struct xt_counters *counters;
+	struct arpt_entry *iter;
+	void __user *pos = userptr;
+	unsigned int remaining = total_size;
+	unsigned int idx = 0;
+	int ret = 0;
+
+	counters = alloc_counters(table);
+	if (IS_ERR(counters))
+		return PTR_ERR(counters);
+
+	xt_entry_foreach(iter, private->entries, total_size) {
+		ret = compat_copy_entry_to_user(iter, &pos, &remaining,
+						counters, idx);
+		if (ret)
+			break;
+		idx++;
+	}
+
+	vfree(counters);
+	return ret;
+}
+
+/*
+ * Header of the ARPT_SO_GET_ENTRIES buffer as read by compat_get_entries():
+ * table name and expected blob size, followed by room for the compat
+ * entries.
+ */
+struct compat_arpt_get_entries {
+	char name[XT_TABLE_MAXNAMELEN];
+	compat_uint_t size;
+	struct compat_arpt_entry entrytable[];
+};
+
+/*
+ * ARPT_SO_GET_ENTRIES for compat callers: copy the rule blob of the table
+ * named in the user-supplied header to @uptr->entrytable in compat layout.
+ *
+ * *@len must equal the header size plus the size announced in the header,
+ * and that size must match the compat size of the table (-EAGAIN
+ * otherwise).
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int compat_get_entries(struct net *net,
+			      struct compat_arpt_get_entries __user *uptr,
+			      int *len)
+{
+	const struct xt_table_info *private;
+	struct compat_arpt_get_entries get;
+	struct xt_table_info info;
+	struct xt_table *t;
+	int ret;
+
+	if (*len < sizeof(get))
+		return -EINVAL;
+	if (copy_from_user(&get, uptr, sizeof(get)))
+		return -EFAULT;
+	if (*len != sizeof(struct compat_arpt_get_entries) + get.size)
+		return -EINVAL;
+
+	get.name[sizeof(get.name) - 1] = '\0';
+
+	xt_compat_lock(NFPROTO_ARP);
+	t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
+		goto out_unlock;
+	}
+
+	private = t->private;
+	ret = compat_table_info(private, &info);
+	if (ret == 0) {
+		if (get.size == info.size)
+			ret = compat_copy_entries_to_user(private->size, t,
+							  uptr->entrytable);
+		else
+			ret = -EAGAIN;
+	}
+
+	xt_compat_flush_offsets(NFPROTO_ARP);
+	module_put(t->me);
+	xt_table_unlock(t);
+out_unlock:
+	xt_compat_unlock(NFPROTO_ARP);
+	return ret;
+}
+#endif
+
+/*
+ * setsockopt() entry point for arptables.  Requires CAP_NET_ADMIN in the
+ * user namespace of the socket's network namespace and dispatches on @cmd.
+ *
+ * Returns the handler's result, -EPERM without the capability, or -EINVAL
+ * for an unknown command.
+ */
+static int do_arpt_set_ctl(struct sock *sk, int cmd, sockptr_t arg,
+		unsigned int len)
+{
+	struct net *net = sock_net(sk);
+
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case ARPT_SO_SET_REPLACE:
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+		if (in_compat_syscall())
+			return compat_do_replace(net, arg, len);
+#endif
+		return do_replace(net, arg, len);
+
+	case ARPT_SO_SET_ADD_COUNTERS:
+		return do_add_counters(net, arg, len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * getsockopt() entry point for arptables.  Requires CAP_NET_ADMIN in the
+ * user namespace of the socket's network namespace and dispatches on @cmd.
+ *
+ * Returns the handler's result, -EPERM without the capability, or -EINVAL
+ * for an unknown command.
+ */
+static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	struct net *net = sock_net(sk);
+	int ret;
+
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case ARPT_SO_GET_INFO:
+		return get_info(net, user, len);
+
+	case ARPT_SO_GET_ENTRIES:
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+		if (in_compat_syscall())
+			return compat_get_entries(net, user, len);
+#endif
+		return get_entries(net, user, len);
+
+	case ARPT_SO_GET_REVISION_TARGET: {
+		struct xt_get_revision rev;
+
+		if (*len != sizeof(rev))
+			return -EINVAL;
+		if (copy_from_user(&rev, user, sizeof(rev)))
+			return -EFAULT;
+		rev.name[sizeof(rev.name)-1] = 0;
+
+		/* The result is delivered through &ret by xt_find_revision(). */
+		try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
+							 rev.revision, 1, &ret),
+					"arpt_%s", rev.name);
+		return ret;
+	}
+
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Unregister @table and release everything attached to it: every entry is
+ * passed to cleanup_entry(), the owner's module reference is dropped when
+ * the table held more than its initial entries, and the table info is freed.
+ */
+static void __arpt_unregister_table(struct net *net, struct xt_table *table)
+{
+	/* Read the owner before xt_unregister_table() is called on @table. */
+	struct module *owner = table->me;
+	struct xt_table_info *private = xt_unregister_table(table);
+	struct arpt_entry *entry;
+
+	/* Decrease module usage counts and free resources */
+	xt_entry_foreach(entry, private->entries, private->size)
+		cleanup_entry(entry, net);
+
+	if (private->number > private->initial_entries)
+		module_put(owner);
+
+	xt_free_table_info(private);
+}
+
+/*
+ * Register the ARP table described by @table in @net.
+ *
+ * The initial rule set @repl is copied into a new table info and validated
+ * by translate_table(); the table is then registered and one hook per bit
+ * set in @table->valid_hooks is registered from a private copy of
+ * @template_ops whose ->priv points at the new table.
+ *
+ * Returns 0 on success or a negative errno; on failure everything set up so
+ * far is torn down again.
+ */
+int arpt_register_table(struct net *net,
+			const struct xt_table *table,
+			const struct arpt_replace *repl,
+			const struct nf_hook_ops *template_ops)
+{
+	struct xt_table_info bootstrap = {0};
+	struct xt_table_info *newinfo;
+	struct xt_table *new_table;
+	struct nf_hook_ops *ops;
+	unsigned int num_ops, n;
+	void *entries;
+	int err;
+
+	newinfo = xt_alloc_table_info(repl->size);
+	if (!newinfo)
+		return -ENOMEM;
+
+	entries = newinfo->entries;
+	memcpy(entries, repl->entries, repl->size);
+
+	err = translate_table(net, newinfo, entries, repl);
+	if (err) {
+		xt_free_table_info(newinfo);
+		return err;
+	}
+
+	new_table = xt_register_table(net, table, &bootstrap, newinfo);
+	if (IS_ERR(new_table)) {
+		struct arpt_entry *iter;
+
+		xt_entry_foreach(iter, entries, newinfo->size)
+			cleanup_entry(iter, net);
+		xt_free_table_info(newinfo);
+		return PTR_ERR(new_table);
+	}
+
+	num_ops = hweight32(table->valid_hooks);
+	if (!num_ops) {
+		err = -EINVAL;
+		goto unregister;
+	}
+
+	ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL);
+	if (!ops) {
+		err = -ENOMEM;
+		goto unregister;
+	}
+
+	for (n = 0; n < num_ops; n++)
+		ops[n].priv = new_table;
+	new_table->ops = ops;
+
+	err = nf_register_net_hooks(net, ops, num_ops);
+	if (err)
+		goto unregister;
+
+	return 0;
+
+unregister:
+	__arpt_unregister_table(net, new_table);
+	return err;
+}
+
+/*
+ * Unregister the netfilter hooks of the ARP table called @name in @net, if
+ * such a table exists.  The table itself stays registered.
+ */
+void arpt_unregister_table_pre_exit(struct net *net, const char *name)
+{
+	struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
+
+	if (!table)
+		return;
+
+	nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
+}
+EXPORT_SYMBOL(arpt_unregister_table_pre_exit);
+
+/*
+ * Unregister and free the ARP table called @name in @net, if such a table
+ * exists.
+ */
+void arpt_unregister_table(struct net *net, const char *name)
+{
+	struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
+
+	if (!table)
+		return;
+
+	__arpt_unregister_table(net, table);
+}
+
+/*
+ * The built-in targets: standard (NULL) and error.
+ *
+ * The standard target carries a single int; when compat support is built in
+ * it is converted by compat_standard_from_user() / compat_standard_to_user().
+ * The error target is handled by arpt_error() and carries a name of up to
+ * XT_FUNCTION_MAXNAMELEN bytes.
+ */
+static struct xt_target arpt_builtin_tg[] __read_mostly = {
+	{
+		.name             = XT_STANDARD_TARGET,
+		.targetsize       = sizeof(int),
+		.family           = NFPROTO_ARP,
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+		.compatsize       = sizeof(compat_int_t),
+		.compat_from_user = compat_standard_from_user,
+		.compat_to_user   = compat_standard_to_user,
+#endif
+	},
+	{
+		.name             = XT_ERROR_TARGET,
+		.target           = arpt_error,
+		.targetsize       = XT_FUNCTION_MAXNAMELEN,
+		.family           = NFPROTO_ARP,
+	},
+};
+
+/*
+ * Socket option registration: routes the ARPT_BASE_CTL..ARPT_SO_SET_MAX set
+ * options to do_arpt_set_ctl() and the ARPT_BASE_CTL..ARPT_SO_GET_MAX get
+ * options to do_arpt_get_ctl().  The optmax fields are set one past the
+ * last option number.
+ */
+static struct nf_sockopt_ops arpt_sockopts = {
+	.pf		= PF_INET,
+	.set_optmin	= ARPT_BASE_CTL,
+	.set_optmax	= ARPT_SO_SET_MAX+1,
+	.set		= do_arpt_set_ctl,
+	.get_optmin	= ARPT_BASE_CTL,
+	.get_optmax	= ARPT_SO_GET_MAX+1,
+	.get		= do_arpt_get_ctl,
+	.owner		= THIS_MODULE,
+};
+
+/* Per-netns init: delegate to xt_proto_init() for the ARP family. */
+static int __net_init arp_tables_net_init(struct net *net)
+{
+	return xt_proto_init(net, NFPROTO_ARP);
+}
+
+/* Per-netns exit: undo arp_tables_net_init() via xt_proto_fini(). */
+static void __net_exit arp_tables_net_exit(struct net *net)
+{
+	xt_proto_fini(net, NFPROTO_ARP);
+}
+
+/* Per-netns hooks registered by arp_tables_init(). */
+static struct pernet_operations arp_tables_net_ops = {
+	.init = arp_tables_net_init,
+	.exit = arp_tables_net_exit,
+};
+
+/*
+ * Module init: register the per-netns operations, the built-in targets and
+ * the socket option handlers, in that order.  A failure undoes the steps
+ * already completed and returns the error.
+ */
+static int __init arp_tables_init(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&arp_tables_net_ops);
+	if (err < 0)
+		return err;
+
+	/* No one else will be downing sem now, so we won't sleep */
+	err = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+	if (err < 0)
+		goto unreg_pernet;
+
+	/* Register setsockopt */
+	err = nf_register_sockopt(&arpt_sockopts);
+	if (err < 0)
+		goto unreg_targets;
+
+	return 0;
+
+unreg_targets:
+	xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+unreg_pernet:
+	unregister_pernet_subsys(&arp_tables_net_ops);
+	return err;
+}
+
+/* Module exit: undo arp_tables_init() in reverse order of registration. */
+static void __exit arp_tables_fini(void)
+{
+	nf_unregister_sockopt(&arpt_sockopts);
+	xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+	unregister_pernet_subsys(&arp_tables_net_ops);
+}
+
+/* Symbols exported to other modules. */
+EXPORT_SYMBOL(arpt_register_table);
+EXPORT_SYMBOL(arpt_unregister_table);
+EXPORT_SYMBOL(arpt_do_table);
+
+module_init(arp_tables_init);
+module_exit(arp_tables_fini);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
new file mode 100644
index 0000000000..a4e07e5e9c
--- /dev/null
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* module that allows mangling of the arp payload */
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_arp/arpt_mangle.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("arptables arp payload mangle target");
+
+/*
+ * Target hook: overwrite the address fields of the ARP payload selected by
+ * the flags in the rule's struct arpt_mangle.
+ *
+ * The four address fields follow the ARP header in the order source device,
+ * source IP, target device, target IP; their lengths are taken from the
+ * header's ar_hln / ar_pln.  The packet is dropped when it cannot be made
+ * writable, or when a field selected for mangling is longer than the
+ * permitted maximum or would extend past the end of the skb data.
+ *
+ * Returns NF_DROP in those cases, otherwise the verdict configured in the
+ * rule.
+ */
+static unsigned int
+target(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct arpt_mangle *mangle = par->targinfo;
+	const struct arphdr *arp;
+	unsigned char *cursor;
+	int hw_len, proto_len;
+
+	if (skb_ensure_writable(skb, skb->len))
+		return NF_DROP;
+
+	arp = arp_hdr(skb);
+	cursor = skb_network_header(skb) + sizeof(*arp);
+	proto_len = arp->ar_pln;
+	hw_len = arp->ar_hln;
+
+	/* We assume that pln and hln were checked in the match */
+
+	/* Source device address */
+	if (mangle->flags & ARPT_MANGLE_SDEV) {
+		if (hw_len > ARPT_DEV_ADDR_LEN_MAX ||
+		    cursor + hw_len > skb_tail_pointer(skb))
+			return NF_DROP;
+		memcpy(cursor, mangle->src_devaddr, hw_len);
+	}
+	cursor += hw_len;
+
+	/* Source IP address */
+	if (mangle->flags & ARPT_MANGLE_SIP) {
+		if (proto_len > ARPT_MANGLE_ADDR_LEN_MAX ||
+		    cursor + proto_len > skb_tail_pointer(skb))
+			return NF_DROP;
+		memcpy(cursor, &mangle->u_s.src_ip, proto_len);
+	}
+	cursor += proto_len;
+
+	/* Target device address */
+	if (mangle->flags & ARPT_MANGLE_TDEV) {
+		if (hw_len > ARPT_DEV_ADDR_LEN_MAX ||
+		    cursor + hw_len > skb_tail_pointer(skb))
+			return NF_DROP;
+		memcpy(cursor, mangle->tgt_devaddr, hw_len);
+	}
+	cursor += hw_len;
+
+	/* Target IP address */
+	if (mangle->flags & ARPT_MANGLE_TIP) {
+		if (proto_len > ARPT_MANGLE_ADDR_LEN_MAX ||
+		    cursor + proto_len > skb_tail_pointer(skb))
+			return NF_DROP;
+		memcpy(cursor, &mangle->u_t.tgt_ip, proto_len);
+	}
+
+	return mangle->target;
+}
+
+/*
+ * Validate a mangle rule: at least one flag within ARPT_MANGLE_MASK must be
+ * set and none outside it, and the verdict must be NF_DROP, NF_ACCEPT or
+ * XT_CONTINUE.
+ *
+ * Returns 0 if the rule is acceptable, -EINVAL otherwise.
+ */
+static int checkentry(const struct xt_tgchk_param *par)
+{
+	const struct arpt_mangle *mangle = par->targinfo;
+
+	if (!(mangle->flags & ARPT_MANGLE_MASK) ||
+	    (mangle->flags & ~ARPT_MANGLE_MASK))
+		return -EINVAL;
+
+	if (mangle->target == NF_DROP || mangle->target == NF_ACCEPT ||
+	    mangle->target == XT_CONTINUE)
+		return 0;
+
+	return -EINVAL;
+}
+
+/*
+ * Registration record of the "mangle" target for the ARP family: rules carry
+ * a struct arpt_mangle, are validated by checkentry() and executed by
+ * target().
+ */
+static struct xt_target arpt_mangle_reg __read_mostly = {
+	.name		= "mangle",
+	.family		= NFPROTO_ARP,
+	.target		= target,
+	.targetsize	= sizeof(struct arpt_mangle),
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+/* Module init: register the mangle target; returns xt_register_target()'s result. */
+static int __init arpt_mangle_init(void)
+{
+	return xt_register_target(&arpt_mangle_reg);
+}
+
+/* Module exit: unregister the mangle target again. */
+static void __exit arpt_mangle_fini(void)
+{
+	xt_unregister_target(&arpt_mangle_reg);
+}
+
+module_init(arpt_mangle_init);
+module_exit(arpt_mangle_fini);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
new file mode 100644
index 0000000000..78cd5ee244
--- /dev/null
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Filtering ARP tables module.
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_arp/arp_tables.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
+MODULE_DESCRIPTION("arptables filter table");
+
+/* Hooks served by the filter table: ARP input, output and forward. */
+#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
+			   (1 << NF_ARP_FORWARD))
+
+/* Description of the "filter" table, used both as template and for lookup. */
+static const struct xt_table packet_filter = {
+	.name		= "filter",
+	.valid_hooks	= FILTER_VALID_HOOKS,
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_ARP,
+	.priority	= NF_IP_PRI_FILTER,
+};
+
+/*
+ * Hook ops template allocated in arptable_filter_init() and handed to
+ * arpt_register_table() for every table instance.
+ */
+static struct nf_hook_ops *arpfilter_ops __read_mostly;
+
+/*
+ * Create the filter table in @net: build the initial rule set, register the
+ * table with it and free the temporary rule set again.
+ *
+ * Returns 0 on success, -ENOMEM if the initial rule set cannot be allocated,
+ * or the error from arpt_register_table().
+ */
+static int arptable_filter_table_init(struct net *net)
+{
+	struct arpt_replace *repl = arpt_alloc_initial_table(&packet_filter);
+	int err;
+
+	if (!repl)
+		return -ENOMEM;
+
+	err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops);
+	kfree(repl);
+	return err;
+}
+
+/* Per-netns pre-exit: calls arpt_unregister_table_pre_exit() for "filter". */
+static void __net_exit arptable_filter_net_pre_exit(struct net *net)
+{
+	arpt_unregister_table_pre_exit(net, "filter");
+}
+
+/* Per-netns exit: calls arpt_unregister_table() for "filter". */
+static void __net_exit arptable_filter_net_exit(struct net *net)
+{
+	arpt_unregister_table(net, "filter");
+}
+
+/*
+ * Per-netns teardown hooks; there is no .init here, the table is created
+ * through arptable_filter_table_init() instead.
+ */
+static struct pernet_operations arptable_filter_net_ops = {
+	.exit = arptable_filter_net_exit,
+	.pre_exit = arptable_filter_net_pre_exit,
+};
+
+/*
+ * Module init for the ARP filter table.
+ *
+ * The hook ops are allocated before the template is registered:
+ * arptable_filter_table_init() passes arpfilter_ops on to
+ * arpt_register_table(), which duplicates it, so the pointer has to be
+ * valid by the time the template's init callback can be invoked.
+ *
+ * Returns 0 on success or a negative errno, with every completed step
+ * undone.
+ */
+static int __init arptable_filter_init(void)
+{
+	int ret;
+
+	arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arpt_do_table);
+	if (IS_ERR(arpfilter_ops))
+		return PTR_ERR(arpfilter_ops);
+
+	ret = xt_register_template(&packet_filter,
+				   arptable_filter_table_init);
+	if (ret < 0) {
+		kfree(arpfilter_ops);
+		return ret;
+	}
+
+	ret = register_pernet_subsys(&arptable_filter_net_ops);
+	if (ret < 0) {
+		xt_unregister_template(&packet_filter);
+		kfree(arpfilter_ops);
+		return ret;
+	}
+
+	return ret;
+}
+
+/*
+ * Module exit: unregister the per-netns operations and the template, then
+ * free the hook ops template.
+ */
+static void __exit arptable_filter_fini(void)
+{
+	unregister_pernet_subsys(&arptable_filter_net_ops);
+	xt_unregister_template(&packet_filter);
+	kfree(arpfilter_ops);
+}
+
+module_init(arptable_filter_init);
+module_exit(arptable_filter_fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
new file mode 100644
index 0000000000..7da1df4997
--- /dev/null
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -0,0 +1,1886 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Packet matching code.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/cache.h>
+#include <linux/capability.h>
+#include <linux/skbuff.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/compat.h>
+#include <linux/uaccess.h>
+#include <linux/mutex.h>
+#include <linux/proc_fs.h>
+#include <linux/err.h>
+#include <linux/cpumask.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <net/netfilter/nf_log.h>
+#include "../../netfilter/xt_repldata.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("IPv4 packet filter");
+
+void *ipt_alloc_initial_table(const struct xt_table *info)
+{	/* NOTE(review): 'info' looks unused; presumably the macro below reads it - confirm */
+	return xt_alloc_initial_table(ipt, IPT);
+}
+EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
+
+/* Returns true when the packet matches the IP part (ipinfo) of a rule. */
+/* Performance critical - called for every packet */
+static inline bool
+ip_packet_match(const struct iphdr *ip,
+		const char *indev,
+		const char *outdev,
+		const struct ipt_ip *ipinfo,
+		int isfrag)
+{
+	unsigned long ret;
+
+	if (NF_INVF(ipinfo, IPT_INV_SRCIP,
+		    (ip->saddr & ipinfo->smsk.s_addr) != ipinfo->src.s_addr) ||
+	    NF_INVF(ipinfo, IPT_INV_DSTIP,
+		    (ip->daddr & ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr))
+		return false;	/* masked source or destination address mismatch */
+
+	ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask);
+
+	if (NF_INVF(ipinfo, IPT_INV_VIA_IN, ret != 0))
+		return false;	/* input interface name mismatch */
+
+	ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask);
+
+	if (NF_INVF(ipinfo, IPT_INV_VIA_OUT, ret != 0))
+		return false;	/* output interface name mismatch */
+
+	/* Check specific protocol */
+	if (ipinfo->proto &&
+	    NF_INVF(ipinfo, IPT_INV_PROTO, ip->protocol != ipinfo->proto))
+		return false;	/* proto == 0 skips this check entirely */
+
+	/* If we have a fragment rule but the packet is not a fragment
+	 * then we return false */
+	if (NF_INVF(ipinfo, IPT_INV_FRAG,
+		    (ipinfo->flags & IPT_F_FRAG) && !isfrag))
+		return false;
+
+	return true;
+}
+
+/* Accept an ipt_ip only if it sets no flag or inversion bit outside the masks. */
+static bool ip_checkentry(const struct ipt_ip *ip)
+{
+	const bool flags_ok = !(ip->flags & ~IPT_F_MASK);
+	const bool invflags_ok = !(ip->invflags & ~IPT_INV_MASK);
+
+	/* Both fields must be clean for the entry to pass. */
+	return flags_ok && invflags_ok;
+}
+
+static unsigned int
+ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);	/* targinfo is logged as a string */
+
+	return NF_DROP;	/* this target always drops */
+}
+
+/* Performance critical: returns the entry located offset bytes past base. */
+static inline struct ipt_entry *
+get_entry(const void *base, unsigned int offset)
+{
+	return (struct ipt_entry *)(base + offset);	/* void * arithmetic, byte granularity */
+}
+
+/* Unconditional rule == all-zero IP part and the target right after the header. */
+/* Mildly perf critical (only if packet tracing is on) */
+static inline bool unconditional(const struct ipt_entry *e)
+{
+	static const struct ipt_ip uncond;	/* static storage, so zero-filled */
+
+	return e->target_offset == sizeof(struct ipt_entry) &&	/* no room for any match */
+	       memcmp(&e->ip, &uncond, sizeof(uncond)) == 0;
+}
+
+/* Const-preserving wrapper around ipt_get_target(), for const-correctness. */
+static inline const struct xt_entry_target *
+ipt_get_target_c(const struct ipt_entry *e)
+{
+	return ipt_get_target((struct ipt_entry *)e);	/* cast only drops const for the call */
+}
+
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+static const char *const hooknames[] = {	/* indexed by hook number; used by trace_packet() */
+	[NF_INET_PRE_ROUTING]		= "PREROUTING",
+	[NF_INET_LOCAL_IN]		= "INPUT",
+	[NF_INET_FORWARD]		= "FORWARD",
+	[NF_INET_LOCAL_OUT]		= "OUTPUT",
+	[NF_INET_POST_ROUTING]		= "POSTROUTING",
+};
+
+enum nf_ip_trace_comments {	/* indexes into the comments[] string table */
+	NF_IP_TRACE_COMMENT_RULE,
+	NF_IP_TRACE_COMMENT_RETURN,
+	NF_IP_TRACE_COMMENT_POLICY,
+};
+
+static const char *const comments[] = {	/* third field of the "TRACE:" log prefix */
+	[NF_IP_TRACE_COMMENT_RULE]	= "rule",
+	[NF_IP_TRACE_COMMENT_RETURN]	= "return",
+	[NF_IP_TRACE_COMMENT_POLICY]	= "policy",
+};
+
+static const struct nf_loginfo trace_loginfo = {	/* log settings handed to nf_log_trace() */
+	.type = NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level = 4,
+			.logflags = NF_LOG_DEFAULT_MASK,
+		},
+	},
+};
+
+/* One step of trace_packet()'s walk; mildly perf critical (tracing only) */
+static inline int
+get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
+		      const char *hookname, const char **chainname,
+		      const char **comment, unsigned int *rulenum)
+{
+	const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
+
+	if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
+		/* Head of user chain: ERROR target with chainname */
+		*chainname = t->target.data;
+		(*rulenum) = 0;	/* rule numbering restarts in each chain */
+	} else if (s == e) {	/* reached the entry being traced */
+		(*rulenum)++;
+
+		if (unconditional(s) &&
+		    strcmp(t->target.u.kernel.target->name,
+			   XT_STANDARD_TARGET) == 0 &&
+		   t->verdict < 0) {
+			/* Tail of chains: STANDARD target (return/policy) */
+			*comment = *chainname == hookname
+				? comments[NF_IP_TRACE_COMMENT_POLICY]
+				: comments[NF_IP_TRACE_COMMENT_RETURN];
+		}
+		return 1;	/* non-zero makes the caller stop iterating */
+	} else
+		(*rulenum)++;
+
+	return 0;	/* not there yet: keep iterating */
+}
+
+static void trace_packet(struct net *net,
+			 const struct sk_buff *skb,
+			 unsigned int hook,
+			 const struct net_device *in,
+			 const struct net_device *out,
+			 const char *tablename,
+			 const struct xt_table_info *private,
+			 const struct ipt_entry *e)
+{	/* Logs "TRACE: table:chain:comment:rulenum" for matched entry e. */
+	const struct ipt_entry *root;
+	const char *hookname, *chainname, *comment;
+	const struct ipt_entry *iter;
+	unsigned int rulenum = 0;
+
+	root = get_entry(private->entries, private->hook_entry[hook]);
+
+	hookname = chainname = hooknames[hook];	/* start in the chain named after the hook */
+	comment = comments[NF_IP_TRACE_COMMENT_RULE];
+
+	xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
+		if (get_chainname_rulenum(iter, e, hookname,
+		    &chainname, &comment, &rulenum) != 0)
+			break;	/* iter reached e; chainname/rulenum are final */
+
+	nf_log_trace(net, AF_INET, hook, skb, in, out, &trace_loginfo,
+		     "TRACE: %s:%s:%s:%u ",
+		     tablename, chainname, comment, rulenum);
+}
+#endif
+
+static inline
+struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
+{
+	return (void *)entry + entry->next_offset;	/* next_offset is a byte distance from entry */
+}
+
+/* Runs skb through the table for state->hook; returns an NF_* verdict like NF_ACCEPT. */
+unsigned int
+ipt_do_table(void *priv,
+	     struct sk_buff *skb,
+	     const struct nf_hook_state *state)
+{
+	const struct xt_table *table = priv;
+	unsigned int hook = state->hook;
+	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+	const struct iphdr *ip;
+	/* Initializing verdict to NF_DROP keeps gcc happy. */
+	unsigned int verdict = NF_DROP;
+	const char *indev, *outdev;
+	const void *table_base;
+	struct ipt_entry *e, **jumpstack;
+	unsigned int stackidx, cpu;
+	const struct xt_table_info *private;
+	struct xt_action_param acpar;
+	unsigned int addend;
+
+	/* Initialization */
+	stackidx = 0;
+	ip = ip_hdr(skb);
+	indev = state->in ? state->in->name : nulldevname;	/* all-zero name when no device */
+	outdev = state->out ? state->out->name : nulldevname;
+	/* We handle fragments by dealing with the first fragment as
+	 * if it was a normal packet.  All other fragments are treated
+	 * normally, except that they will NEVER match rules that ask
+	 * things we don't know, ie. tcp syn flag or ports).  If the
+	 * rule is also a fragment-specific rule, non-fragments won't
+	 * match it. */
+	acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
+	acpar.thoff   = ip_hdrlen(skb);
+	acpar.hotdrop = false;
+	acpar.state   = state;
+
+	WARN_ON(!(table->valid_hooks & (1 << hook)));
+	local_bh_disable();
+	addend = xt_write_recseq_begin();
+	private = READ_ONCE(table->private); /* Address dependency. */
+	cpu        = smp_processor_id();
+	table_base = private->entries;
+	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
+
+	/* Switch to alternate jumpstack if we're being invoked via TEE.
+	 * TEE issues XT_CONTINUE verdict on original skb so we must not
+	 * clobber the jumpstack.
+	 *
+	 * For recursion via REJECT or SYNPROXY the stack will be clobbered
+	 * but it is no problem since absolute verdict is issued by these.
+	 */
+	if (static_key_false(&xt_tee_enabled))
+		jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
+
+	e = get_entry(table_base, private->hook_entry[hook]);
+
+	do {
+		const struct xt_entry_target *t;
+		const struct xt_entry_match *ematch;
+		struct xt_counters *counter;
+
+		WARN_ON(!e);
+		if (!ip_packet_match(ip, indev, outdev,
+		    &e->ip, acpar.fragoff)) {
+ no_match:
+			e = ipt_next_entry(e);	/* rule does not apply: try the next one */
+			continue;
+		}
+
+		xt_ematch_foreach(ematch, e) {
+			acpar.match     = ematch->u.kernel.match;
+			acpar.matchinfo = ematch->data;
+			if (!acpar.match->match(skb, &acpar))
+				goto no_match;
+		}
+
+		counter = xt_get_this_cpu_counter(&e->counters);
+		ADD_COUNTER(*counter, skb->len, 1);	/* bytes += skb->len, packets += 1 */
+
+		t = ipt_get_target_c(e);
+		WARN_ON(!t->u.kernel.target);
+
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+		/* The packet is traced: log it */
+		if (unlikely(skb->nf_trace))
+			trace_packet(state->net, skb, hook, state->in,
+				     state->out, table->name, private, e);
+#endif
+		/* Standard target? */
+		if (!t->u.kernel.target->target) {
+			int v;
+
+			v = ((struct xt_standard_target *)t)->verdict;
+			if (v < 0) {
+				/* Pop from stack? */
+				if (v != XT_RETURN) {
+					verdict = (unsigned int)(-v) - 1;	/* decode -(NF_xxx) - 1 */
+					break;
+				}
+				if (stackidx == 0) {	/* RETURN with an empty stack */
+					e = get_entry(table_base,
+					    private->underflow[hook]);
+				} else {
+					e = jumpstack[--stackidx];
+					e = ipt_next_entry(e);	/* resume after the jumping rule */
+				}
+				continue;
+			}
+			if (table_base + v != ipt_next_entry(e) &&
+			    !(e->ip.flags & IPT_F_GOTO)) {
+				if (unlikely(stackidx >= private->stacksize)) {
+					verdict = NF_DROP;	/* no room left on the jump stack */
+					break;
+				}
+				jumpstack[stackidx++] = e;
+			}
+
+			e = get_entry(table_base, v);	/* v >= 0 is a byte offset into the table */
+			continue;
+		}
+
+		acpar.target   = t->u.kernel.target;
+		acpar.targinfo = t->data;
+
+		verdict = t->u.kernel.target->target(skb, &acpar);
+		if (verdict == XT_CONTINUE) {
+			/* Target might have changed stuff. */
+			ip = ip_hdr(skb);
+			e = ipt_next_entry(e);
+		} else {
+			/* Verdict */
+			break;
+		}
+	} while (!acpar.hotdrop);
+
+	xt_write_recseq_end(addend);
+	local_bh_enable();
+
+	if (acpar.hotdrop)	/* hotdrop overrides whatever verdict was computed */
+		return NF_DROP;
+	else return verdict;
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+   there are loops.  Puts hook bitmask in comefrom. */
+static int
+mark_source_chains(const struct xt_table_info *newinfo,
+		   unsigned int valid_hooks, void *entry0,
+		   unsigned int *offsets)
+{
+	unsigned int hook;
+
+	/* No recursion; use packet counter to save back ptrs (reset
+	   to 0 as we leave), and comefrom to save source hook bitmask */
+	for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
+		unsigned int pos = newinfo->hook_entry[hook];
+		struct ipt_entry *e = entry0 + pos;
+
+		if (!(valid_hooks & (1 << hook)))
+			continue;
+
+		/* Set initial back pointer. */
+		e->counters.pcnt = pos;
+
+		for (;;) {
+			const struct xt_standard_target *t
+				= (void *)ipt_get_target_c(e);
+			int visited = e->comefrom & (1 << hook);
+
+			if (e->comefrom & (1 << NF_INET_NUMHOOKS))
+				return 0;	/* entry is already on the current path: loop */
+
+			e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
+
+			/* Unconditional return/END. */
+			if ((unconditional(e) &&
+			     (strcmp(t->target.u.user.name,
+				     XT_STANDARD_TARGET) == 0) &&
+			     t->verdict < 0) || visited) {
+				unsigned int oldpos, size;
+
+				/* Return: backtrack through the last
+				   big jump. */
+				do {
+					e->comefrom ^= (1<<NF_INET_NUMHOOKS);	/* leave the path */
+					oldpos = pos;
+					pos = e->counters.pcnt;
+					e->counters.pcnt = 0;
+
+					/* We're at the start. */
+					if (pos == oldpos)
+						goto next;
+
+					e = entry0 + pos;
+				} while (oldpos == pos + e->next_offset);
+
+				/* Move along one */
+				size = e->next_offset;
+				e = entry0 + pos + size;
+				if (pos + size >= newinfo->size)
+					return 0;	/* would step past the end of the table */
+				e->counters.pcnt = pos;
+				pos += size;
+			} else {
+				int newpos = t->verdict;
+
+				if (strcmp(t->target.u.user.name,
+					   XT_STANDARD_TARGET) == 0 &&
+				    newpos >= 0) {
+					/* This is a jump; chase it. */
+					if (!xt_find_jump_offset(offsets, newpos,
+								 newinfo->number))
+						return 0;
+				} else {
+					/* ... this is a fallthru */
+					newpos = pos + e->next_offset;
+					if (newpos >= newinfo->size)
+						return 0;
+				}
+				e = entry0 + newpos;
+				e->counters.pcnt = pos;	/* back pointer to where we came from */
+				pos = newpos;
+			}
+		}
+next:		;
+	}
+	return 1;
+}
+
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
+{
+	const struct xt_match *match = m->u.kernel.match;
+	struct xt_mtdtor_param dtor = { .net = net, .family = NFPROTO_IPV4 };
+
+	dtor.match     = match;
+	dtor.matchinfo = m->data;
+
+	if (match->destroy)	/* the destructor hook is optional */
+		match->destroy(&dtor);
+	module_put(match->me);
+}
+
+static int
+check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
+{
+	const struct ipt_ip *ip = par->entryinfo;	/* callers store &e->ip here */
+
+	par->match     = m->u.kernel.match;
+	par->matchinfo = m->data;
+
+	return xt_check_match(par, m->u.match_size - sizeof(*m),
+			      ip->proto, ip->invflags & IPT_INV_PROTO);
+}
+
+static int
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
+{
+	struct xt_match *found;
+	int err;
+
+	/* Look the extension up by the user-supplied name and revision. */
+	found = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+				      m->u.user.revision);
+	if (IS_ERR(found))
+		return PTR_ERR(found);
+
+	/* From here on the union carries the kernel-side match pointer. */
+	m->u.kernel.match = found;
+
+	err = check_match(m, par);
+	if (err)	/* validation failed: give the module reference back */
+		module_put(found->me);
+
+	return err;
+}
+
+static int check_target(struct ipt_entry *e, struct net *net, const char *name)
+{
+	struct xt_entry_target *t = ipt_get_target(e);
+	struct xt_tgchk_param par = {
+		.net       = net,
+		.table     = name,
+		.entryinfo = e,
+		.target    = t->u.kernel.target,	/* must already be resolved by the caller */
+		.targinfo  = t->data,
+		.hook_mask = e->comefrom,
+		.family    = NFPROTO_IPV4,
+	};
+
+	return xt_check_target(&par, t->u.target_size - sizeof(*t),
+			       e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
+}
+
+static int
+find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
+		 unsigned int size,
+		 struct xt_percpu_counter_alloc_state *alloc_state)
+{
+	struct xt_entry_target *t;
+	struct xt_target *target;
+	int ret;
+	unsigned int j;
+	struct xt_mtchk_param mtpar;
+	struct xt_entry_match *ematch;
+
+	if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
+		return -ENOMEM;
+
+	j = 0;	/* counts matches that were set up successfully */
+	memset(&mtpar, 0, sizeof(mtpar));
+	mtpar.net	= net;
+	mtpar.table     = name;
+	mtpar.entryinfo = &e->ip;
+	mtpar.hook_mask = e->comefrom;
+	mtpar.family    = NFPROTO_IPV4;
+	xt_ematch_foreach(ematch, e) {
+		ret = find_check_match(ematch, &mtpar);
+		if (ret != 0)
+			goto cleanup_matches;
+		++j;
+	}
+
+	t = ipt_get_target(e);
+	target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+					t->u.user.revision);
+	if (IS_ERR(target)) {
+		ret = PTR_ERR(target);
+		goto cleanup_matches;
+	}
+	t->u.kernel.target = target;
+
+	ret = check_target(e, net, name);
+	if (ret)
+		goto err;
+
+	return 0;
+ err:
+	module_put(t->u.kernel.target->me);
+ cleanup_matches:
+	xt_ematch_foreach(ematch, e) {
+		if (j-- == 0)	/* only undo the first j matches */
+			break;
+		cleanup_match(ematch, net);
+	}
+
+	xt_percpu_counter_free(&e->counters);
+
+	return ret;
+}
+
+static bool check_underflow(const struct ipt_entry *e)
+{
+	const struct xt_standard_target *std;
+	unsigned int policy;
+
+	if (!unconditional(e))
+		return false;
+	std = (const struct xt_standard_target *)ipt_get_target_c(e);
+	if (strcmp(std->target.u.user.name, XT_STANDARD_TARGET) != 0)
+		return false;
+	/* Verdicts are stored as -(NF_xxx) - 1; recover the NF_xxx value. */
+	policy = -(unsigned int)std->verdict - 1;
+	return policy == NF_ACCEPT || policy == NF_DROP;
+}
+
+static int
+check_entry_size_and_hooks(struct ipt_entry *e,
+			   struct xt_table_info *newinfo,
+			   const unsigned char *base,
+			   const unsigned char *limit,
+			   const unsigned int *hook_entries,
+			   const unsigned int *underflows,
+			   unsigned int valid_hooks)
+{
+	unsigned int h;
+	int err;
+
+	if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct ipt_entry) >= limit ||
+	    (unsigned char *)e + e->next_offset > limit)
+		return -EINVAL;	/* misaligned, or the entry runs past limit */
+
+	if (e->next_offset
+	    < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target))
+		return -EINVAL;	/* too small to hold a header plus a target */
+
+	if (!ip_checkentry(&e->ip))
+		return -EINVAL;
+
+	err = xt_check_entry_offsets(e, e->elems, e->target_offset,
+				     e->next_offset);
+	if (err)
+		return err;
+
+	/* Check hooks & underflows */
+	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+		if (!(valid_hooks & (1 << h)))
+			continue;
+		if ((unsigned char *)e - base == hook_entries[h])
+			newinfo->hook_entry[h] = hook_entries[h];
+		if ((unsigned char *)e - base == underflows[h]) {
+			if (!check_underflow(e))
+				return -EINVAL;
+
+			newinfo->underflow[h] = underflows[h];
+		}
+	}
+
+	/* Clear counters and comefrom */
+	e->counters = ((struct xt_counters) { 0, 0 });
+	e->comefrom = 0;
+	return 0;
+}
+
+static void
+cleanup_entry(struct ipt_entry *e, struct net *net)
+{
+	struct xt_tgdtor_param par;
+	struct xt_entry_target *t;
+	struct xt_entry_match *ematch;
+
+	/* Cleanup all matches */
+	xt_ematch_foreach(ematch, e)
+		cleanup_match(ematch, net);
+	t = ipt_get_target(e);
+
+	par.net      = net;
+	par.target   = t->u.kernel.target;
+	par.targinfo = t->data;
+	par.family   = NFPROTO_IPV4;
+	if (par.target->destroy != NULL)	/* the destructor hook is optional */
+		par.target->destroy(&par);
+	module_put(par.target->me);
+	xt_percpu_counter_free(&e->counters);
+}
+
+/* Checks and translates the user-supplied table segment (held in
+   newinfo): returns 0 on success or a negative errno. */
+static int
+translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
+		const struct ipt_replace *repl)
+{
+	struct xt_percpu_counter_alloc_state alloc_state = { 0 };
+	struct ipt_entry *iter;
+	unsigned int *offsets;
+	unsigned int i;
+	int ret = 0;
+
+	newinfo->size = repl->size;
+	newinfo->number = repl->num_entries;
+
+	/* Init all hooks to impossible value. */
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = 0xFFFFFFFF;
+		newinfo->underflow[i] = 0xFFFFFFFF;
+	}
+
+	offsets = xt_alloc_entry_offsets(newinfo->number);
+	if (!offsets)
+		return -ENOMEM;
+	i = 0;
+	/* Walk through entries, checking offsets. */
+	xt_entry_foreach(iter, entry0, newinfo->size) {
+		ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+						 entry0 + repl->size,
+						 repl->hook_entry,
+						 repl->underflow,
+						 repl->valid_hooks);
+		if (ret != 0)
+			goto out_free;
+		if (i < repl->num_entries)	/* never write past the offsets array */
+			offsets[i] = (void *)iter - entry0;
+		++i;
+		if (strcmp(ipt_get_target(iter)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
+	}
+
+	ret = -EINVAL;
+	if (i != repl->num_entries)	/* entry count claimed by the caller was wrong */
+		goto out_free;
+
+	ret = xt_check_table_hooks(newinfo, repl->valid_hooks);
+	if (ret)
+		goto out_free;
+
+	if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
+		ret = -ELOOP;
+		goto out_free;
+	}
+	kvfree(offsets);
+
+	/* Finally, each sanity check must pass */
+	i = 0;
+	xt_entry_foreach(iter, entry0, newinfo->size) {
+		ret = find_check_entry(iter, net, repl->name, repl->size,
+				       &alloc_state);
+		if (ret != 0)
+			break;
+		++i;
+	}
+
+	if (ret != 0) {
+		xt_entry_foreach(iter, entry0, newinfo->size) {
+			if (i-- == 0)	/* only undo the entries that passed */
+				break;
+			cleanup_entry(iter, net);
+		}
+		return ret;
+	}
+
+	return ret;
+ out_free:
+	kvfree(offsets);
+	return ret;
+}
+
+static void
+get_counters(const struct xt_table_info *t,
+	     struct xt_counters counters[])
+{	/* Adds every possible CPU's per-entry counters into counters[]. */
+	struct ipt_entry *iter;
+	unsigned int cpu;
+	unsigned int i;
+
+	for_each_possible_cpu(cpu) {
+		seqcount_t *s = &per_cpu(xt_recseq, cpu);
+
+		i = 0;
+		xt_entry_foreach(iter, t->entries, t->size) {
+			struct xt_counters *tmp;
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+			do {	/* retry until both values come from one stable read */
+				start = read_seqcount_begin(s);
+				bcnt = tmp->bcnt;
+				pcnt = tmp->pcnt;
+			} while (read_seqcount_retry(s, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
+			++i; /* macro does multi eval of i */
+			cond_resched();
+		}
+	}
+}
+
+static void get_old_counters(const struct xt_table_info *t,
+			     struct xt_counters counters[])
+{	/* Like get_counters(), but reads without the seqcount retry loop. */
+	struct ipt_entry *iter;
+	unsigned int cpu, i;
+
+	for_each_possible_cpu(cpu) {
+		i = 0;
+		xt_entry_foreach(iter, t->entries, t->size) {
+			const struct xt_counters *tmp;
+
+			tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
+			ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt);
+			++i; /* macro does multi eval of i */
+		}
+
+		cond_resched();
+	}
+}
+
+static struct xt_counters *alloc_counters(const struct xt_table *table)
+{
+	const struct xt_table_info *info = table->private;
+	struct xt_counters *snap;
+	unsigned int nbytes;
+
+	/* Userspace wants a coherent counter snapshot; nothing else in
+	 * the table changes under us (except comefrom, which userspace
+	 * ignores), so only the counters are gathered here. */
+	nbytes = sizeof(struct xt_counters) * info->number;
+	snap = vzalloc(nbytes);
+	if (!snap)
+		return ERR_PTR(-ENOMEM);
+
+	/* Sum every CPU's counters into the zeroed array. */
+	get_counters(info, snap);
+
+	return snap;
+}
+
+static int
+copy_entries_to_user(unsigned int total_size,
+		     const struct xt_table *table,
+		     void __user *userptr)
+{
+	unsigned int off, num;
+	const struct ipt_entry *e;
+	struct xt_counters *counters;
+	const struct xt_table_info *private = table->private;
+	int ret = 0;
+	const void *loc_cpu_entry;
+
+	counters = alloc_counters(table);	/* summed snapshot, freed at free_counters */
+	if (IS_ERR(counters))
+		return PTR_ERR(counters);
+
+	loc_cpu_entry = private->entries;
+
+	/* FIXME: use iterator macros --RR */
+	/* ... then go back and fix counters and names */
+	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+		unsigned int i;
+		const struct xt_entry_match *m;
+		const struct xt_entry_target *t;
+
+		e = loc_cpu_entry + off;
+		if (copy_to_user(userptr + off, e, sizeof(*e))) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+		if (copy_to_user(userptr + off
+				 + offsetof(struct ipt_entry, counters),
+				 &counters[num],
+				 sizeof(counters[num])) != 0) {
+			ret = -EFAULT;	/* could not overwrite the counters field */
+			goto free_counters;
+		}
+
+		for (i = sizeof(struct ipt_entry);
+		     i < e->target_offset;
+		     i += m->u.match_size) {
+			m = (void *)e + i;	/* matches sit between the header and the target */
+
+			if (xt_match_to_user(m, userptr + off + i)) {
+				ret = -EFAULT;
+				goto free_counters;
+			}
+		}
+
+		t = ipt_get_target_c(e);
+		if (xt_target_to_user(t, userptr + off + e->target_offset)) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+	}
+
+ free_counters:
+	vfree(counters);
+	return ret;
+}
+
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+static void compat_standard_from_user(void *dst, const void *src)
+{
+	int verdict = *(compat_int_t *)src;
+
+	/* Only positive values get the compat jump delta added. */
+	verdict += (verdict > 0) ? xt_compat_calc_jump(AF_INET, verdict) : 0;
+	memcpy(dst, &verdict, sizeof(verdict));
+}
+
+static int compat_standard_to_user(void __user *dst, const void *src)
+{
+	compat_int_t out = *(int *)src;
+
+	/* Only positive values get the compat jump delta subtracted. */
+	out -= (out > 0) ? xt_compat_calc_jump(AF_INET, out) : 0;
+	return copy_to_user(dst, &out, sizeof(out)) ? -EFAULT : 0;
+}
+
+static int compat_calc_entry(const struct ipt_entry *e,
+			     const struct xt_table_info *info,
+			     const void *base, struct xt_table_info *newinfo)
+{
+	const struct xt_entry_match *ematch;
+	const struct xt_entry_target *t;
+	unsigned int entry_offset;
+	int off, i, ret;
+
+	off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);	/* header size delta */
+	entry_offset = (void *)e - base;
+	xt_ematch_foreach(ematch, e)
+		off += xt_compat_match_offset(ematch->u.kernel.match);
+	t = ipt_get_target_c(e);
+	off += xt_compat_target_offset(t->u.kernel.target);
+	newinfo->size -= off;	/* the compat image is smaller by off bytes */
+	ret = xt_compat_add_offset(AF_INET, entry_offset, off);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		if (info->hook_entry[i] &&
+		    (e < (struct ipt_entry *)(base + info->hook_entry[i])))
+			newinfo->hook_entry[i] -= off;	/* hook starts after e: shift it */
+		if (info->underflow[i] &&
+		    (e < (struct ipt_entry *)(base + info->underflow[i])))
+			newinfo->underflow[i] -= off;
+	}
+	return 0;
+}
+
+static int compat_table_info(const struct xt_table_info *info,
+			     struct xt_table_info *newinfo)
+{
+	struct ipt_entry *iter;
+	const void *loc_cpu_entry;
+	int ret;
+
+	if (!newinfo || !info)
+		return -EINVAL;
+
+	/* we don't care about newinfo->entries */
+	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+	newinfo->initial_entries = 0;
+	loc_cpu_entry = info->entries;
+	ret = xt_compat_init_offsets(AF_INET, info->number);
+	if (ret)
+		return ret;
+	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+		if (ret != 0)
+			return ret;	/* newinfo is left only partly adjusted */
+	}
+	return 0;
+}
+#endif
+
+/* Copy the named table's valid hooks, hook entry/underflow offsets, entry
+ * count and size to @user; *len must equal sizeof(struct ipt_getinfo). */
+static int get_info(struct net *net, void __user *user, const int *len)
+{
+	char name[XT_TABLE_MAXNAMELEN];
+	struct xt_table *t;
+	int ret = 0;
+
+	if (*len != sizeof(struct ipt_getinfo))
+		return -EINVAL;
+	if (copy_from_user(name, user, sizeof(name)) != 0)
+		return -EFAULT;
+
+	name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+	if (in_compat_syscall())
+		xt_compat_lock(AF_INET);
+#endif
+	t = xt_request_find_table_lock(net, AF_INET, name);
+	if (!IS_ERR(t)) {
+		struct ipt_getinfo info;
+		const struct xt_table_info *private = t->private;
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+		struct xt_table_info tmp;
+
+		if (in_compat_syscall()) {
+			/* A conversion failure is kept in ret and returned
+			 * below rather than handing back native values. */
+			ret = compat_table_info(private, &tmp);
+			xt_compat_flush_offsets(AF_INET);
+			private = &tmp;
+		}
+#endif
+		memset(&info, 0, sizeof(info));
+		info.valid_hooks = t->valid_hooks;
+		memcpy(info.hook_entry, private->hook_entry,
+		       sizeof(info.hook_entry));
+		memcpy(info.underflow, private->underflow,
+		       sizeof(info.underflow));
+		info.num_entries = private->number;
+		info.size = private->size;
+		strcpy(info.name, name);
+		if (ret == 0 && copy_to_user(user, &info, *len) != 0)
+			ret = -EFAULT;
+
+		xt_table_unlock(t);
+		module_put(t->me);
+	} else
+		ret = PTR_ERR(t);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+	if (in_compat_syscall())
+		xt_compat_unlock(AF_INET);
+#endif
+	return ret;
+}
+
+static int
+get_entries(struct net *net, struct ipt_get_entries __user *uptr,
+	    const int *len)
+{
+	int ret;
+	struct ipt_get_entries get;
+	struct xt_table *t;
+
+	if (*len < sizeof(get))
+		return -EINVAL;
+	if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+		return -EFAULT;
+	if (*len != sizeof(struct ipt_get_entries) + get.size)
+		return -EINVAL;	/* buffer must be exactly header + get.size bytes */
+	get.name[sizeof(get.name) - 1] = '\0';
+
+	t = xt_find_table_lock(net, AF_INET, get.name);
+	if (!IS_ERR(t)) {
+		const struct xt_table_info *private = t->private;
+		if (get.size == private->size)
+			ret = copy_entries_to_user(private->size,
+						   t, uptr->entrytable);
+		else
+			ret = -EAGAIN;	/* caller's size does not match the current table */
+
+		module_put(t->me);
+		xt_table_unlock(t);
+	} else
+		ret = PTR_ERR(t);
+
+	return ret;
+}
+
+static int
+__do_replace(struct net *net, const char *name, unsigned int valid_hooks,
+	     struct xt_table_info *newinfo, unsigned int num_counters,
+	     void __user *counters_ptr)
+{
+	int ret;
+	struct xt_table *t;
+	struct xt_table_info *oldinfo;
+	struct xt_counters *counters;
+	struct ipt_entry *iter;
+
+	counters = xt_counters_alloc(num_counters);
+	if (!counters) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	t = xt_request_find_table_lock(net, AF_INET, name);
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
+		goto free_newinfo_counters_untrans;
+	}
+
+	/* You lied! (caller's valid_hooks differ from the table's) */
+	if (valid_hooks != t->valid_hooks) {
+		ret = -EINVAL;
+		goto put_module;
+	}
+
+	oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
+	if (!oldinfo)
+		goto put_module;	/* ret was filled in by xt_replace_table() */
+
+	/* Update module usage count based on number of rules */
+	if ((oldinfo->number > oldinfo->initial_entries) ||
+	    (newinfo->number <= oldinfo->initial_entries))
+		module_put(t->me);
+	if ((oldinfo->number > oldinfo->initial_entries) &&
+	    (newinfo->number <= oldinfo->initial_entries))
+		module_put(t->me);
+
+	xt_table_unlock(t);
+
+	get_old_counters(oldinfo, counters);
+
+	/* Decrease module usage counts and free resource */
+	xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
+		cleanup_entry(iter, net);
+
+	xt_free_table_info(oldinfo);
+	if (copy_to_user(counters_ptr, counters,
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/* Silent error, can't fail, new table is already in place */
+		net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
+	}
+	vfree(counters);
+	return 0;
+
+ put_module:
+	module_put(t->me);
+	xt_table_unlock(t);
+ free_newinfo_counters_untrans:
+	vfree(counters);	/* newinfo itself is left for the caller to free */
+ out:
+	return ret;
+}
+
+/* Replace a table with the user-supplied ruleset in @arg (@len bytes). */
+static int do_replace(struct net *net, sockptr_t arg, unsigned int len)
+{
+	int ret;
+	struct ipt_replace tmp;
+	struct xt_table_info *newinfo;
+	void *loc_cpu_entry;
+	struct ipt_entry *iter;
+
+	if (len < sizeof(tmp))	/* header must fit in the caller's buffer */
+		return -EINVAL;
+	if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
+		return -EFAULT;
+	/* overflow check */
+	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+		return -ENOMEM;
+	if (tmp.num_counters == 0)
+		return -EINVAL;
+	/* The tmp.size bytes of entries must fit in the buffer as well. */
+	if ((u64)len < (u64)tmp.size + sizeof(tmp))
+		return -EINVAL;
+
+	tmp.name[sizeof(tmp.name)-1] = 0;
+	newinfo = xt_alloc_table_info(tmp.size);
+	if (!newinfo)
+		return -ENOMEM;
+
+	loc_cpu_entry = newinfo->entries;
+	if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+			tmp.size) != 0) {
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+	ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
+	if (ret != 0)
+		goto free_newinfo;
+
+	ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+			   tmp.num_counters, tmp.counters);
+	if (ret == 0)
+		return 0;
+
+	xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)	/* undo translate_table() */
+		cleanup_entry(iter, net);
+ free_newinfo:
+	xt_free_table_info(newinfo);
+	return ret;
+}
+
+static int
+do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
+{
+	unsigned int i;
+	struct xt_counters_info tmp;
+	struct xt_counters *paddc;
+	struct xt_table *t;
+	const struct xt_table_info *private;
+	int ret = 0;
+	struct ipt_entry *iter;
+	unsigned int addend;
+
+	paddc = xt_copy_counters(arg, len, &tmp);	/* vfree()d at the free label */
+	if (IS_ERR(paddc))
+		return PTR_ERR(paddc);
+
+	t = xt_find_table_lock(net, AF_INET, tmp.name);
+	if (IS_ERR(t)) {
+		ret = PTR_ERR(t);
+		goto free;
+	}
+
+	local_bh_disable();
+	private = t->private;
+	if (private->number != tmp.num_counters) {
+		ret = -EINVAL;	/* need exactly one counter pair per table entry */
+		goto unlock_up_free;
+	}
+
+	i = 0;
+	addend = xt_write_recseq_begin();
+	xt_entry_foreach(iter, private->entries, private->size) {
+		struct xt_counters *tmp;
+
+		tmp = xt_get_this_cpu_counter(&iter->counters);	/* this 'tmp' shadows the outer one */
+		ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
+		++i;
+	}
+	xt_write_recseq_end(addend);
+ unlock_up_free:
+	local_bh_enable();
+	xt_table_unlock(t);
+	module_put(t->me);
+ free:
+	vfree(paddc);
+
+	return ret;
+}
+
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+struct compat_ipt_replace {	/* fixed-width counterpart of struct ipt_replace - TODO confirm layout */
+	char			name[XT_TABLE_MAXNAMELEN];
+	u32			valid_hooks;
+	u32			num_entries;
+	u32			size;
+	u32			hook_entry[NF_INET_NUMHOOKS];
+	u32			underflow[NF_INET_NUMHOOKS];
+	u32			num_counters;
+	compat_uptr_t		counters;	/* struct xt_counters * */
+	struct compat_ipt_entry	entries[];	/* flexible array of compat entries */
+};
+
+static int
+compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
+			  unsigned int *size, struct xt_counters *counters,
+			  unsigned int i)
+{
+	struct xt_entry_target *t;
+	struct compat_ipt_entry __user *ce;
+	u_int16_t target_offset, next_offset;
+	compat_uint_t origsize;
+	const struct xt_entry_match *ematch;
+	int ret = 0;
+
+	origsize = *size;	/* kept to measure how much this entry shrinks */
+	ce = *dstptr;
+	if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 ||
+	    copy_to_user(&ce->counters, &counters[i],
+	    sizeof(counters[i])) != 0)
+		return -EFAULT;
+
+	*dstptr += sizeof(struct compat_ipt_entry);
+	*size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+
+	xt_ematch_foreach(ematch, e) {
+		ret = xt_compat_match_to_user(ematch, dstptr, size);
+		if (ret != 0)
+			return ret;
+	}
+	target_offset = e->target_offset - (origsize - *size);	/* shrink so far: header + matches */
+	t = ipt_get_target(e);
+	ret = xt_compat_target_to_user(t, dstptr, size);
+	if (ret)
+		return ret;
+	next_offset = e->next_offset - (origsize - *size);	/* shrink including the target */
+	if (put_user(target_offset, &ce->target_offset) != 0 ||
+	    put_user(next_offset, &ce->next_offset) != 0)
+		return -EFAULT;
+	return 0;
+}
+/* Look up the extension for match @m and add its compat offset to *size; @ip is not used. */
+static int
+compat_find_calc_match(struct xt_entry_match *m,
+		       const struct ipt_ip *ip,
+		       int *size)
+{
+	struct xt_match *match;
+
+	match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+				      m->u.user.revision);
+	if (IS_ERR(match))
+		return PTR_ERR(match);
+
+	m->u.kernel.match = match;	/* later released via module_put(match->me) */
+	*size += xt_compat_match_offset(match);
+	return 0;
+}
+/* Drop the module references reachable through compat entry @e. */
+static void compat_release_entry(struct compat_ipt_entry *e)
+{
+	struct xt_entry_target *t;
+	struct xt_entry_match *ematch;
+
+	/* Put the owning module of every match, then of the target */
+	xt_ematch_foreach(ematch, e)
+		module_put(ematch->u.kernel.match->me);
+	t = compat_ipt_get_target(e);
+	module_put(t->u.kernel.target->me);
+}
+/* Validate compat entry @e, look up its extensions and add its size delta to *size. */
+static int
+check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
+				  struct xt_table_info *newinfo,
+				  unsigned int *size,
+				  const unsigned char *base,
+				  const unsigned char *limit)
+{
+	struct xt_entry_match *ematch;
+	struct xt_entry_target *t;
+	struct xt_target *target;
+	unsigned int entry_offset;
+	unsigned int j;
+	int ret, off;
+
+	if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit ||
+	    (unsigned char *)e + e->next_offset > limit)
+		return -EINVAL;	/* misaligned, or header/entry reaches past @limit */
+
+	if (e->next_offset < sizeof(struct compat_ipt_entry) +
+			     sizeof(struct compat_xt_entry_target))
+		return -EINVAL;	/* too small for a header plus a target */
+
+	if (!ip_checkentry(&e->ip))
+		return -EINVAL;
+
+	ret = xt_compat_check_entry_offsets(e, e->elems,
+					    e->target_offset, e->next_offset);
+	if (ret)
+		return ret;
+
+	off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+	entry_offset = (void *)e - (void *)base;	/* byte offset of @e from @base */
+	j = 0;	/* matches resolved so far, for unwinding */
+	xt_ematch_foreach(ematch, e) {
+		ret = compat_find_calc_match(ematch, &e->ip, &off);
+		if (ret != 0)
+			goto release_matches;
+		++j;
+	}
+
+	t = compat_ipt_get_target(e);
+	target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+					t->u.user.revision);
+	if (IS_ERR(target)) {
+		ret = PTR_ERR(target);
+		goto release_matches;
+	}
+	t->u.kernel.target = target;
+
+	off += xt_compat_target_offset(target);
+	*size += off;	/* off now holds the header, match and target deltas */
+	ret = xt_compat_add_offset(AF_INET, entry_offset, off);
+	if (ret)
+		goto out;
+
+	return 0;
+
+out:	/* target was resolved: put its module before the matches */
+	module_put(t->u.kernel.target->me);
+release_matches:	/* put only the first j matches */
+	xt_ematch_foreach(ematch, e) {
+		if (j-- == 0)
+			break;
+		module_put(ematch->u.kernel.match->me);
+	}
+	return ret;
+}
+/* Expand compat entry @e into native layout at *dstptr and fix hook offsets behind it. */
+static void
+compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
+			    unsigned int *size,
+			    struct xt_table_info *newinfo, unsigned char *base)
+{
+	struct xt_entry_target *t;
+	struct ipt_entry *de;
+	unsigned int origsize;
+	int h;
+	struct xt_entry_match *ematch;
+
+	origsize = *size;	/* kept to measure how far *size has changed */
+	de = *dstptr;
+	memcpy(de, e, sizeof(struct ipt_entry));
+	memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+	*dstptr += sizeof(struct ipt_entry);	/* continue after the native header */
+	*size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+
+	xt_ematch_foreach(ematch, e)
+		xt_compat_match_from_user(ematch, dstptr, size);
+
+	de->target_offset = e->target_offset - (origsize - *size);
+	t = compat_ipt_get_target(e);
+	xt_compat_target_from_user(t, dstptr, size);
+
+	de->next_offset = e->next_offset - (origsize - *size);
+
+	for (h = 0; h < NF_INET_NUMHOOKS; h++) {	/* only offsets beyond this entry change */
+		if ((unsigned char *)de - base < newinfo->hook_entry[h])
+			newinfo->hook_entry[h] -= origsize - *size;
+		if ((unsigned char *)de - base < newinfo->underflow[h])
+			newinfo->underflow[h] -= origsize - *size;
+	}
+}
+/* Convert the compat ruleset at *pentry0 to native layout; updates *pinfo/*pentry0 on success. */
+static int
+translate_compat_table(struct net *net,
+		       struct xt_table_info **pinfo,
+		       void **pentry0,
+		       const struct compat_ipt_replace *compatr)
+{
+	unsigned int i, j;
+	struct xt_table_info *newinfo, *info;
+	void *pos, *entry0, *entry1;
+	struct compat_ipt_entry *iter0;
+	struct ipt_replace repl;
+	unsigned int size;
+	int ret;
+
+	info = *pinfo;
+	entry0 = *pentry0;
+	size = compatr->size;	/* the checking pass adds each entry's delta */
+	info->number = compatr->num_entries;
+
+	j = 0;	/* entries checked so far, released again on error */
+	xt_compat_lock(AF_INET);
+	ret = xt_compat_init_offsets(AF_INET, compatr->num_entries);
+	if (ret)
+		goto out_unlock;
+	/* Walk through entries, checking offsets. */
+	xt_entry_foreach(iter0, entry0, compatr->size) {
+		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+							entry0,
+							entry0 + compatr->size);
+		if (ret != 0)
+			goto out_unlock;
+		++j;
+	}
+
+	ret = -EINVAL;
+	if (j != compatr->num_entries)	/* entry count disagrees with the header */
+		goto out_unlock;
+
+	ret = -ENOMEM;
+	newinfo = xt_alloc_table_info(size);
+	if (!newinfo)
+		goto out_unlock;
+
+	memset(newinfo->entries, 0, size);
+
+	newinfo->number = compatr->num_entries;
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = compatr->hook_entry[i];
+		newinfo->underflow[i] = compatr->underflow[i];
+	}
+	entry1 = newinfo->entries;
+	pos = entry1;
+	size = compatr->size;	/* reset: the copy pass adjusts it again per entry */
+	xt_entry_foreach(iter0, entry0, compatr->size)
+		compat_copy_entry_from_user(iter0, &pos, &size,
+					    newinfo, entry1);
+
+	/* all module references in entry0 are now gone.
+	 * entry1/newinfo contains a 64bit ruleset that looks exactly as
+	 * generated by 64bit userspace.
+	 *
+	 * Call standard translate_table() to validate all hook_entrys,
+	 * underflows, check for loops, etc.
+	 */
+	xt_compat_flush_offsets(AF_INET);
+	xt_compat_unlock(AF_INET);
+
+	memcpy(&repl, compatr, sizeof(*compatr));	/* hooks, counters and size are set below */
+
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		repl.hook_entry[i] = newinfo->hook_entry[i];
+		repl.underflow[i] = newinfo->underflow[i];
+	}
+
+	repl.num_counters = 0;
+	repl.counters = NULL;
+	repl.size = newinfo->size;
+	ret = translate_table(net, newinfo, entry1, &repl);
+	if (ret)
+		goto free_newinfo;
+
+	*pinfo = newinfo;
+	*pentry0 = entry1;
+	xt_free_table_info(info);	/* the compat-layout copy is no longer needed */
+	return 0;
+
+free_newinfo:	/* *pinfo is left untouched; the caller still owns the old info */
+	xt_free_table_info(newinfo);
+	return ret;
+out_unlock:
+	xt_compat_flush_offsets(AF_INET);
+	xt_compat_unlock(AF_INET);
+	xt_entry_foreach(iter0, entry0, compatr->size) {
+		if (j-- == 0)	/* stop after the entries that were checked */
+			break;
+		compat_release_entry(iter0);
+	}
+	return ret;
+}
+/* Compat IPT_SO_SET_REPLACE: copy the ruleset from @arg, translate it and install it. */
+static int
+compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
+{
+	int ret;
+	struct compat_ipt_replace tmp;
+	struct xt_table_info *newinfo;
+	void *loc_cpu_entry;
+	struct ipt_entry *iter;
+
+	if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
+		return -EFAULT;
+
+	/* overflow check */
+	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+		return -ENOMEM;
+	if (tmp.num_counters == 0)
+		return -EINVAL;
+
+	tmp.name[sizeof(tmp.name)-1] = 0;	/* force NUL termination of the user-supplied name */
+
+	newinfo = xt_alloc_table_info(tmp.size);
+	if (!newinfo)
+		return -ENOMEM;
+
+	loc_cpu_entry = newinfo->entries;
+	if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+			tmp.size) != 0) {	/* entry blob starts right after the header */
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+
+	ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp);
+	if (ret != 0)
+		goto free_newinfo;
+
+	ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+			   tmp.num_counters, compat_ptr(tmp.counters));
+	if (ret)
+		goto free_newinfo_untrans;
+	return 0;
+
+ free_newinfo_untrans:	/* translated table was not installed: clean up each entry */
+	xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+		cleanup_entry(iter, net);
+ free_newinfo:
+	xt_free_table_info(newinfo);
+	return ret;
+}
+
+struct compat_ipt_get_entries {	/* header copied in by compat_get_entries() */
+	char name[XT_TABLE_MAXNAMELEN];	/* table to dump */
+	compat_uint_t size;	/* bytes the caller provides for entrytable[] */
+	struct compat_ipt_entry entrytable[];	/* filled by compat_copy_entries_to_user() */
+};
+/* Copy every entry of @table to @userptr in compat layout, with a counter snapshot. */
+static int
+compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
+			    void __user *userptr)
+{
+	struct xt_counters *counters;
+	const struct xt_table_info *private = table->private;
+	void __user *pos;
+	unsigned int size;
+	int ret = 0;
+	unsigned int i = 0;
+	struct ipt_entry *iter;
+
+	counters = alloc_counters(table);
+	if (IS_ERR(counters))
+		return PTR_ERR(counters);
+
+	pos = userptr;
+	size = total_size;
+	xt_entry_foreach(iter, private->entries, total_size) {
+		ret = compat_copy_entry_to_user(iter, &pos,
+						&size, counters, i++);
+		if (ret != 0)
+			break;	/* stop at the first failing entry and report its error */
+	}
+
+	vfree(counters);
+	return ret;
+}
+/* Compat IPT_SO_GET_ENTRIES: dump the named table to @uptr in compat layout. */
+static int
+compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
+		   int *len)
+{
+	int ret;
+	struct compat_ipt_get_entries get;
+	struct xt_table *t;
+
+	if (*len < sizeof(get))
+		return -EINVAL;
+
+	if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+		return -EFAULT;
+
+	if (*len != sizeof(struct compat_ipt_get_entries) + get.size)
+		return -EINVAL;	/* buffer must be exactly header + get.size bytes */
+
+	get.name[sizeof(get.name) - 1] = '\0';	/* force NUL termination */
+
+	xt_compat_lock(AF_INET);
+	t = xt_find_table_lock(net, AF_INET, get.name);
+	if (!IS_ERR(t)) {
+		const struct xt_table_info *private = t->private;
+		struct xt_table_info info;
+		ret = compat_table_info(private, &info);
+		if (!ret && get.size == info.size)
+			ret = compat_copy_entries_to_user(private->size,
+							  t, uptr->entrytable);
+		else if (!ret)
+			ret = -EAGAIN;	/* get.size differs from the size in info */
+
+		xt_compat_flush_offsets(AF_INET);
+		module_put(t->me);
+		xt_table_unlock(t);
+	} else
+		ret = PTR_ERR(t);
+
+	xt_compat_unlock(AF_INET);
+	return ret;
+}
+#endif
+/* setsockopt handler for iptables; the caller needs CAP_NET_ADMIN in the netns' user_ns. */
+static int
+do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len)
+{
+	int ret;
+
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case IPT_SO_SET_REPLACE:
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+		if (in_compat_syscall())	/* compat callers pass struct compat_ipt_replace */
+			ret = compat_do_replace(sock_net(sk), arg, len);
+		else
+#endif
+			ret = do_replace(sock_net(sk), arg, len);
+		break;
+
+	case IPT_SO_SET_ADD_COUNTERS:
+		ret = do_add_counters(sock_net(sk), arg, len);
+		break;
+
+	default:
+		ret = -EINVAL;	/* unknown command */
+	}
+
+	return ret;
+}
+/* getsockopt handler for iptables; the caller needs CAP_NET_ADMIN in the netns' user_ns. */
+static int
+do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	int ret;
+
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case IPT_SO_GET_INFO:
+		ret = get_info(sock_net(sk), user, len);
+		break;
+
+	case IPT_SO_GET_ENTRIES:
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+		if (in_compat_syscall())	/* compat callers get the compat entry layout */
+			ret = compat_get_entries(sock_net(sk), user, len);
+		else
+#endif
+			ret = get_entries(sock_net(sk), user, len);
+		break;
+
+	case IPT_SO_GET_REVISION_MATCH:
+	case IPT_SO_GET_REVISION_TARGET: {
+		struct xt_get_revision rev;
+		int target;	/* 1 for a target lookup, 0 for a match lookup */
+
+		if (*len != sizeof(rev)) {
+			ret = -EINVAL;
+			break;
+		}
+		if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
+			ret = -EFAULT;
+			break;
+		}
+		rev.name[sizeof(rev.name)-1] = 0;	/* force NUL termination */
+
+		if (cmd == IPT_SO_GET_REVISION_TARGET)
+			target = 1;
+		else
+			target = 0;
+
+		try_then_request_module(xt_find_revision(AF_INET, rev.name,
+							 rev.revision,
+							 target, &ret),
+					"ipt_%s", rev.name);	/* ret is set through &ret */
+		break;
+	}
+
+	default:
+		ret = -EINVAL;	/* unknown command */
+	}
+
+	return ret;
+}
+/* Unregister @table from x_tables, clean up every entry and free the table info. */
+static void __ipt_unregister_table(struct net *net, struct xt_table *table)
+{
+	struct xt_table_info *private;
+	void *loc_cpu_entry;
+	struct module *table_owner = table->me;	/* read before the table is unregistered */
+	struct ipt_entry *iter;
+
+	private = xt_unregister_table(table);
+
+	/* Decrease module usage counts and free resources */
+	loc_cpu_entry = private->entries;
+	xt_entry_foreach(iter, loc_cpu_entry, private->size)
+		cleanup_entry(iter, net);
+	if (private->number > private->initial_entries)	/* NOTE(review): presumably pairs with a get in __do_replace - confirm */
+		module_put(table_owner);
+	xt_free_table_info(private);
+}
+/* Build a table from @repl, register it in @net and, if @template_ops is set, its hooks. */
+int ipt_register_table(struct net *net, const struct xt_table *table,
+		       const struct ipt_replace *repl,
+		       const struct nf_hook_ops *template_ops)
+{
+	struct nf_hook_ops *ops;
+	unsigned int num_ops;
+	int ret, i;
+	struct xt_table_info *newinfo;
+	struct xt_table_info bootstrap = {0};
+	void *loc_cpu_entry;
+	struct xt_table *new_table;
+
+	newinfo = xt_alloc_table_info(repl->size);
+	if (!newinfo)
+		return -ENOMEM;
+
+	loc_cpu_entry = newinfo->entries;
+	memcpy(loc_cpu_entry, repl->entries, repl->size);	/* work on a private copy of the entries */
+
+	ret = translate_table(net, newinfo, loc_cpu_entry, repl);
+	if (ret != 0) {
+		xt_free_table_info(newinfo);
+		return ret;
+	}
+
+	new_table = xt_register_table(net, table, &bootstrap, newinfo);
+	if (IS_ERR(new_table)) {
+		struct ipt_entry *iter;
+
+		xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+			cleanup_entry(iter, net);	/* undo what translate_table() set up per entry */
+		xt_free_table_info(newinfo);
+		return PTR_ERR(new_table);
+	}
+
+	/* No template? No need to do anything. This is used by 'nat' table, it registers
+	 * with the nat core instead of the netfilter core.
+	 */
+	if (!template_ops)
+		return 0;
+
+	num_ops = hweight32(table->valid_hooks);	/* one hook op per bit set in valid_hooks */
+	if (num_ops == 0) {
+		ret = -EINVAL;
+		goto out_free;
+	}
+
+	ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL);
+	if (!ops) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	for (i = 0; i < num_ops; i++)
+		ops[i].priv = new_table;	/* each copied op points back at this table */
+
+	new_table->ops = ops;
+
+	ret = nf_register_net_hooks(net, ops, num_ops);
+	if (ret != 0)
+		goto out_free;
+
+	return ret;
+
+out_free:	/* the table is already registered: tear it down again */
+	__ipt_unregister_table(net, new_table);
+	return ret;
+}
+/* Unregister the netfilter hooks of table @name in @net, if that table exists. */
+void ipt_unregister_table_pre_exit(struct net *net, const char *name)
+{
+	struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);
+
+	if (table)	/* hook count matches the number of bits in valid_hooks */
+		nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
+}
+/* Unregister and free table @name in @net, if that table exists. */
+void ipt_unregister_table_exit(struct net *net, const char *name)
+{
+	struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);
+
+	if (table)
+		__ipt_unregister_table(net, table);
+}
+/* Targets registered by ip_tables_init(): XT_STANDARD_TARGET and XT_ERROR_TARGET. */
+static struct xt_target ipt_builtin_tg[] __read_mostly = {
+	{
+		.name             = XT_STANDARD_TARGET,
+		.targetsize       = sizeof(int),
+		.family           = NFPROTO_IPV4,
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+		.compatsize       = sizeof(compat_int_t),
+		.compat_from_user = compat_standard_from_user,
+		.compat_to_user   = compat_standard_to_user,
+#endif
+	},
+	{
+		.name             = XT_ERROR_TARGET,
+		.target           = ipt_error,
+		.targetsize       = XT_FUNCTION_MAXNAMELEN,
+		.family           = NFPROTO_IPV4,
+	},
+};
+/* sockopt registration: routes PF_INET iptables options to do_ipt_set_ctl/do_ipt_get_ctl. */
+static struct nf_sockopt_ops ipt_sockopts = {
+	.pf		= PF_INET,
+	.set_optmin	= IPT_BASE_CTL,
+	.set_optmax	= IPT_SO_SET_MAX+1,
+	.set		= do_ipt_set_ctl,
+	.get_optmin	= IPT_BASE_CTL,
+	.get_optmax	= IPT_SO_GET_MAX+1,
+	.get		= do_ipt_get_ctl,
+	.owner		= THIS_MODULE,
+};
+/* Per-netns init: calls xt_proto_init() for NFPROTO_IPV4. */
+static int __net_init ip_tables_net_init(struct net *net)
+{
+	return xt_proto_init(net, NFPROTO_IPV4);
+}
+/* Per-netns exit: calls xt_proto_fini() for NFPROTO_IPV4. */
+static void __net_exit ip_tables_net_exit(struct net *net)
+{
+	xt_proto_fini(net, NFPROTO_IPV4);
+}
+/* Per-netns callbacks registered by ip_tables_init(). */
+static struct pernet_operations ip_tables_net_ops = {
+	.init = ip_tables_net_init,
+	.exit = ip_tables_net_exit,
+};
+/* Module init: register pernet ops, built-in targets and sockopts; unwind on failure. */
+static int __init ip_tables_init(void)
+{
+	int ret;
+
+	ret = register_pernet_subsys(&ip_tables_net_ops);
+	if (ret < 0)
+		goto err1;
+
+	/* No one else will be downing sem now, so we won't sleep */
+	ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+	if (ret < 0)
+		goto err2;
+
+	/* Register setsockopt */
+	ret = nf_register_sockopt(&ipt_sockopts);
+	if (ret < 0)
+		goto err4;
+
+	return 0;
+
+err4:	/* sockopt registration failed: undo the target registration */
+	xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+err2:	/* target registration failed: undo the pernet registration */
+	unregister_pernet_subsys(&ip_tables_net_ops);
+err1:
+	return ret;
+}
+/* Module exit: undo ip_tables_init() in reverse order. */
+static void __exit ip_tables_fini(void)
+{
+	nf_unregister_sockopt(&ipt_sockopts);
+
+	xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+	unregister_pernet_subsys(&ip_tables_net_ops);
+}
+
+EXPORT_SYMBOL(ipt_register_table);
+EXPORT_SYMBOL(ipt_unregister_table_pre_exit);
+EXPORT_SYMBOL(ipt_unregister_table_exit);
+EXPORT_SYMBOL(ipt_do_table);
+module_init(ip_tables_init);
+module_exit(ip_tables_fini);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
new file mode 100644
index 0000000000..5930d3b025
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* iptables module for the IPv4 and TCP ECN bits, Version 1.5
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/tcp.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_ECN.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag modification");
+
+/* Set the IPT_ECN_IP_MASK bits of iph->tos from einfo->ip_ect and fix the IP checksum.
+ * Returns false if the header could not be made writable, true otherwise. */
+static inline bool
+set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
+{
+	struct iphdr *iph = ip_hdr(skb);
+
+	if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
+		__u8 oldtos;
+		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
+			return false;
+		iph = ip_hdr(skb);	/* re-read the header pointer after skb_ensure_writable() */
+		oldtos = iph->tos;
+		iph->tos &= ~IPT_ECN_IP_MASK;
+		iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK);
+		csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
+	}
+	return true;	/* also reached when the bits already had the requested value */
+}
+
+/* Set the TCP ECE/CWR flags selected by einfo->operation; returns false on error. */
+static inline bool
+set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
+{
+	struct tcphdr _tcph, *tcph;
+	__be16 oldval;
+
+	/* Not enough header? */
+	tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
+	if (!tcph)
+		return false;
+
+	if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) ||
+	     tcph->ece == einfo->proto.tcp.ece) &&
+	    (!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
+	     tcph->cwr == einfo->proto.tcp.cwr))
+		return true;	/* every requested flag already has the wanted value */
+
+	if (skb_ensure_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
+		return false;
+	tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb);	/* recompute after skb_ensure_writable() */
+
+	oldval = ((__be16 *)tcph)[6];	/* 16-bit word at byte offset 12, before the flags change */
+	if (einfo->operation & IPT_ECN_OP_SET_ECE)
+		tcph->ece = einfo->proto.tcp.ece;
+	if (einfo->operation & IPT_ECN_OP_SET_CWR)
+		tcph->cwr = einfo->proto.tcp.cwr;
+
+	inet_proto_csum_replace2(&tcph->check, skb,
+				 oldval, ((__be16 *)tcph)[6], false);
+	return true;
+}
+/* ECN target: apply the requested IP and/or TCP changes; NF_DROP if a helper fails. */
+static unsigned int
+ecn_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ipt_ECN_info *einfo = par->targinfo;
+
+	if (einfo->operation & IPT_ECN_OP_SET_IP)
+		if (!set_ect_ip(skb, einfo))
+			return NF_DROP;
+
+	if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) &&
+	    ip_hdr(skb)->protocol == IPPROTO_TCP)	/* TCP flags are only touched on TCP packets */
+		if (!set_ect_tcp(skb, einfo))
+			return NF_DROP;
+
+	return XT_CONTINUE;
+}
+/* Rule-insertion check for the ECN target; returns 0 or -EINVAL. */
+static int ecn_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct ipt_ECN_info *einfo = par->targinfo;
+	const struct ipt_entry *e = par->entryinfo;
+
+	if (einfo->operation & IPT_ECN_OP_MASK)	/* any bit inside IPT_ECN_OP_MASK is rejected */
+		return -EINVAL;
+
+	if (einfo->ip_ect & ~IPT_ECN_IP_MASK)	/* bits outside IPT_ECN_IP_MASK are rejected */
+		return -EINVAL;
+
+	if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) &&
+	    (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
+		pr_info_ratelimited("cannot use operation on non-tcp rule\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+/* x_tables registration record for the "ECN" target. */
+static struct xt_target ecn_tg_reg __read_mostly = {
+	.name		= "ECN",
+	.family		= NFPROTO_IPV4,
+	.target		= ecn_tg,
+	.targetsize	= sizeof(struct ipt_ECN_info),
+	.table		= "mangle",
+	.checkentry	= ecn_tg_check,
+	.me		= THIS_MODULE,
+};
+/* Module init: register the ECN target. */
+static int __init ecn_tg_init(void)
+{
+	return xt_register_target(&ecn_tg_reg);
+}
+/* Module exit: unregister the ECN target. */
+static void __exit ecn_tg_exit(void)
+{
+	xt_unregister_target(&ecn_tg_reg);
+}
+
+module_init(ecn_tg_init);
+module_exit(ecn_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
new file mode 100644
index 0000000000..4b88407347
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This is a module which is used for rejecting packets.
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/icmp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_REJECT.h>
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+#include <linux/netfilter_bridge.h>
+#endif
+
+#include <net/netfilter/ipv4/nf_reject.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4");
+/* REJECT target: send the reply selected by reject->with, then drop the packet. */
+static unsigned int
+reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ipt_reject_info *reject = par->targinfo;
+	int hook = xt_hooknum(par);
+
+	switch (reject->with) {
+	case IPT_ICMP_NET_UNREACHABLE:
+		nf_send_unreach(skb, ICMP_NET_UNREACH, hook);
+		break;
+	case IPT_ICMP_HOST_UNREACHABLE:
+		nf_send_unreach(skb, ICMP_HOST_UNREACH, hook);
+		break;
+	case IPT_ICMP_PROT_UNREACHABLE:
+		nf_send_unreach(skb, ICMP_PROT_UNREACH, hook);
+		break;
+	case IPT_ICMP_PORT_UNREACHABLE:
+		nf_send_unreach(skb, ICMP_PORT_UNREACH, hook);
+		break;
+	case IPT_ICMP_NET_PROHIBITED:
+		nf_send_unreach(skb, ICMP_NET_ANO, hook);
+		break;
+	case IPT_ICMP_HOST_PROHIBITED:
+		nf_send_unreach(skb, ICMP_HOST_ANO, hook);
+		break;
+	case IPT_ICMP_ADMIN_PROHIBITED:
+		nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
+		break;
+	case IPT_TCP_RESET:
+		nf_send_reset(xt_net(par), par->state->sk, skb, hook);
+		break;
+	case IPT_ICMP_ECHOREPLY:
+		/* Doesn't happen: reject_tg_check() refuses this value. */
+		break;
+	}
+
+	return NF_DROP;	/* the packet is dropped for every value of reject->with */
+}
+/* Rule-insertion check for REJECT; returns 0 or -EINVAL. */
+static int reject_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct ipt_reject_info *rejinfo = par->targinfo;
+	const struct ipt_entry *e = par->entryinfo;
+
+	if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
+		pr_info_ratelimited("ECHOREPLY no longer supported.\n");
+		return -EINVAL;
+	} else if (rejinfo->with == IPT_TCP_RESET) {
+		/* Must specify that it's a TCP packet */
+		if (e->ip.proto != IPPROTO_TCP ||
+		    (e->ip.invflags & XT_INV_PROTO)) {
+			pr_info_ratelimited("TCP_RESET invalid for non-tcp\n");
+			return -EINVAL;
+		}
+	}
+	return 0;	/* all other values of 'with' are accepted unchecked */
+}
+/* x_tables registration record for the "REJECT" target. */
+static struct xt_target reject_tg_reg __read_mostly = {
+	.name		= "REJECT",
+	.family		= NFPROTO_IPV4,
+	.target		= reject_tg,
+	.targetsize	= sizeof(struct ipt_reject_info),
+	.table		= "filter",
+	.hooks		= (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_LOCAL_OUT),
+	.checkentry	= reject_tg_check,
+	.me		= THIS_MODULE,
+};
+/* Module init: register the REJECT target. */
+static int __init reject_tg_init(void)
+{
+	return xt_register_target(&reject_tg_reg);
+}
+/* Module exit: unregister the REJECT target. */
+static void __exit reject_tg_exit(void)
+{
+	xt_unregister_target(&reject_tg_reg);
+}
+
+module_init(reject_tg_init);
+module_exit(reject_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
new file mode 100644
index 0000000000..f2984c7eef
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ */
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+
+#include <net/netfilter/nf_synproxy.h>
+/* SYNPROXY target: handles a client's bare SYN or bare ACK; other segments continue. */
+static unsigned int
+synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_synproxy_info *info = par->targinfo;
+	struct net *net = xt_net(par);
+	struct synproxy_net *snet = synproxy_pernet(net);
+	struct synproxy_options opts = {};
+	struct tcphdr *th, _th;
+
+	if (nf_ip_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
+		return NF_DROP;	/* checksum helper reported a failure */
+
+	th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
+	if (th == NULL)
+		return NF_DROP;
+
+	if (!synproxy_parse_options(skb, par->thoff, th, &opts))
+		return NF_DROP;
+
+	if (th->syn && !(th->ack || th->fin || th->rst)) {
+		/* Initial SYN from client */
+		this_cpu_inc(snet->stats->syn_received);
+
+		if (th->ece && th->cwr)
+			opts.options |= XT_SYNPROXY_OPT_ECN;
+
+		opts.options &= info->options;	/* keep only options the rule enables */
+		opts.mss_encode = opts.mss_option;	/* remember the parsed MSS ... */
+		opts.mss_option = info->mss;	/* ... then substitute the rule's MSS */
+		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+			synproxy_init_timestamp_cookie(info, &opts);
+		else
+			opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
+					  XT_SYNPROXY_OPT_SACK_PERM |
+					  XT_SYNPROXY_OPT_ECN);
+
+		synproxy_send_client_synack(net, skb, th, &opts);
+		consume_skb(skb);	/* the skb is released here, hence NF_STOLEN */
+		return NF_STOLEN;
+	} else if (th->ack && !(th->fin || th->rst || th->syn)) {
+		/* ACK from client */
+		if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) {
+			consume_skb(skb);
+			return NF_STOLEN;
+		} else {
+			return NF_DROP;
+		}
+	}
+
+	return XT_CONTINUE;	/* neither a bare SYN nor a bare ACK */
+}
+/* Rule-insertion check for SYNPROXY: TCP-only rule, then conntrack and synproxy setup. */
+static int synproxy_tg4_check(const struct xt_tgchk_param *par)
+{
+	struct synproxy_net *snet = synproxy_pernet(par->net);
+	const struct ipt_entry *e = par->entryinfo;
+	int err;
+
+	if (e->ip.proto != IPPROTO_TCP ||
+	    e->ip.invflags & XT_INV_PROTO)
+		return -EINVAL;
+
+	err = nf_ct_netns_get(par->net, par->family);
+	if (err)
+		return err;
+
+	err = nf_synproxy_ipv4_init(snet, par->net);
+	if (err) {
+		nf_ct_netns_put(par->net, par->family);	/* undo the netns_get above */
+		return err;
+	}
+
+	return err;	/* err is 0 here */
+}
+/* Rule-removal hook: undo what synproxy_tg4_check() set up, in reverse order. */
+static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
+{
+	struct synproxy_net *snet = synproxy_pernet(par->net);
+
+	nf_synproxy_ipv4_fini(snet, par->net);
+	nf_ct_netns_put(par->net, par->family);
+}
+/* x_tables registration record for the "SYNPROXY" target. */
+static struct xt_target synproxy_tg4_reg __read_mostly = {
+	.name		= "SYNPROXY",
+	.family		= NFPROTO_IPV4,
+	.hooks		= (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD),
+	.target		= synproxy_tg4,
+	.targetsize	= sizeof(struct xt_synproxy_info),
+	.checkentry	= synproxy_tg4_check,
+	.destroy	= synproxy_tg4_destroy,
+	.me		= THIS_MODULE,
+};
+/* Module init: register the SYNPROXY target. */
+static int __init synproxy_tg4_init(void)
+{
+	return xt_register_target(&synproxy_tg4_reg);
+}
+/* Module exit: unregister the SYNPROXY target. */
+static void __exit synproxy_tg4_exit(void)
+{
+	xt_unregister_target(&synproxy_tg4_reg);
+}
+
+module_init(synproxy_tg4_init);
+module_exit(synproxy_tg4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Intercept TCP connections and establish them using syncookies");
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
new file mode 100644
index 0000000000..161ba412cb
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Kernel module to match AH parameters. */
+/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv4/ipt_ah.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
+MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match");
+
+/* Returns true if min <= spi <= max, with the result flipped when invert is set */
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
+{
+	bool r;
+	pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+		 invert ? '!' : ' ', min, spi, max);
+	r = (spi >= min && spi <= max) ^ invert;	/* XOR applies the inversion */
+	pr_debug(" result %s\n", r ? "PASS" : "FAILED");
+	return r;
+}
+/* "ah" match: compare the SPI of the header at par->thoff with the configured range. */
+static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct ip_auth_hdr _ahdr;
+	const struct ip_auth_hdr *ah;
+	const struct ipt_ah *ahinfo = par->matchinfo;
+
+	/* Must not be a fragment. */
+	if (par->fragoff != 0)
+		return false;
+
+	ah = skb_header_pointer(skb, par->thoff, sizeof(_ahdr), &_ahdr);
+	if (ah == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop.
+		 */
+		pr_debug("Dropping evil AH tinygram.\n");
+		par->hotdrop = true;
+		return false;
+	}
+
+	return spi_match(ahinfo->spis[0], ahinfo->spis[1],	/* spis[0] = min, spis[1] = max */
+			 ntohl(ah->spi),
+			 !!(ahinfo->invflags & IPT_AH_INV_SPI));
+}
+/* Rule-insertion check for "ah": -EINVAL if invflags has bits outside IPT_AH_INV_MASK. */
+static int ah_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct ipt_ah *ahinfo = par->matchinfo;
+
+	/* Must specify no unknown invflags */
+	if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
+		pr_debug("unknown flags %X\n", ahinfo->invflags);
+		return -EINVAL;
+	}
+	return 0;
+}
+/* x_tables registration record for the "ah" match. */
+static struct xt_match ah_mt_reg __read_mostly = {
+	.name		= "ah",
+	.family		= NFPROTO_IPV4,
+	.match		= ah_mt,
+	.matchsize	= sizeof(struct ipt_ah),
+	.proto		= IPPROTO_AH,
+	.checkentry	= ah_mt_check,
+	.me		= THIS_MODULE,
+};
+/* Module init: register the "ah" match. */
+static int __init ah_mt_init(void)
+{
+	return xt_register_match(&ah_mt_reg);
+}
+/* Module exit: unregister the "ah" match. */
+static void __exit ah_mt_exit(void)
+{
+	xt_unregister_match(&ah_mt_reg);
+}
+
+module_init(ah_mt_init);
+module_exit(ah_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
new file mode 100644
index 0000000000..ded5bef02f
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2011 Florian Westphal <fw@strlen.de>
+ *
+ * based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <net/ip_fib.h>
+#include <net/route.h>
+
+#include <linux/netfilter/xt_rpfilter.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match");
+
+/* Return 0 for multicast, lbcast and zeronet addresses (no route lookup), else addr */
+static __be32 rpfilter_get_saddr(__be32 addr)
+{
+	if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
+	    ipv4_is_zeronet(addr))
+		return 0;
+	return addr;
+}
+/* FIB lookup for @fl4; true if the route type is acceptable and uses @dev (or LOOSE is set). */
+static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
+				const struct net_device *dev, u8 flags)
+{
+	struct fib_result res;
+
+	if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
+		return false;	/* lookup failed */
+
+	if (res.type != RTN_UNICAST) {	/* the only other accepted type is RTN_LOCAL, and only on request */
+		if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
+			return false;
+	}
+	return fib_info_nh_uses_dev(res.fi, dev) || flags & XT_RPFILTER_LOOSE;
+}
+/* True if @skb is marked PACKET_LOOPBACK or arrived on a device with IFF_LOOPBACK set. */
+static bool
+rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
+{
+	return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
+}
+/* "rpfilter" match: route back to the packet's source and test the route; honours INVERT. */
+static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_rpfilter_info *info;
+	const struct iphdr *iph;
+	struct flowi4 flow;
+	bool invert;
+
+	info = par->matchinfo;
+	invert = info->flags & XT_RPFILTER_INVERT;
+
+	if (rpfilter_is_loopback(skb, xt_in(par)))
+		return true ^ invert;	/* loopback traffic skips the lookup */
+
+	iph = ip_hdr(skb);
+	if (ipv4_is_zeronet(iph->saddr)) {
+		if (ipv4_is_lbcast(iph->daddr) ||
+		    ipv4_is_local_multicast(iph->daddr))
+			return true ^ invert;	/* zeronet source to lbcast/local multicast skips it too */
+	}
+
+	memset(&flow, 0, sizeof(flow));
+	flow.flowi4_iif = LOOPBACK_IFINDEX;
+	flow.daddr = iph->saddr;	/* reverse direction: look up the packet's source */
+	flow.saddr = rpfilter_get_saddr(iph->daddr);
+	flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
+	flow.flowi4_tos = iph->tos & IPTOS_RT_MASK;
+	flow.flowi4_scope = RT_SCOPE_UNIVERSE;
+	flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par));
+	flow.flowi4_uid = sock_net_uid(xt_net(par), NULL);
+
+	return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert;
+}
+/* Rule-insertion check: known flags only, and only in the "raw" or "mangle" table. */
+static int rpfilter_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_rpfilter_info *info = par->matchinfo;
+	unsigned int options = ~XT_RPFILTER_OPTION_MASK;	/* every bit that is not a known option */
+	if (info->flags & options) {
+		pr_info_ratelimited("unknown options\n");
+		return -EINVAL;
+	}
+
+	if (strcmp(par->table, "mangle") != 0 &&
+	    strcmp(par->table, "raw") != 0) {
+		pr_info_ratelimited("only valid in \'raw\' or \'mangle\' table, not \'%s\'\n",
+				    par->table);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+/* x_tables registration record for the "rpfilter" match. */
+static struct xt_match rpfilter_mt_reg __read_mostly = {
+	.name		= "rpfilter",
+	.family		= NFPROTO_IPV4,
+	.checkentry	= rpfilter_check,
+	.match		= rpfilter_mt,
+	.matchsize	= sizeof(struct xt_rpfilter_info),
+	.hooks		= (1 << NF_INET_PRE_ROUTING),
+	.me		= THIS_MODULE
+};
+/* Module init: register the "rpfilter" match. */
+static int __init rpfilter_mt_init(void)
+{
+	return xt_register_match(&rpfilter_mt_reg);
+}
+/* Module exit: unregister the "rpfilter" match. */
+static void __exit rpfilter_mt_exit(void)
+{
+	xt_unregister_match(&rpfilter_mt_reg);
+}
+
+module_init(rpfilter_mt_init);
+module_exit(rpfilter_mt_exit);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
new file mode 100644
index 0000000000..b9062f4552
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables filter table");
+
+#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
+			    (1 << NF_INET_FORWARD) | \
+			    (1 << NF_INET_LOCAL_OUT))
+
+static const struct xt_table packet_filter = {
+	.name		= "filter",
+	.valid_hooks	= FILTER_VALID_HOOKS,
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV4,
+	.priority	= NF_IP_PRI_FILTER,
+};
+
+static struct nf_hook_ops *filter_ops __read_mostly;
+
+/* Default to forward because I got too much mail already. */
+static bool forward __read_mostly = true;
+module_param(forward, bool, 0000);
+
+static int iptable_filter_table_init(struct net *net)
+{
+	struct ipt_replace *repl;
+	int err;
+
+	repl = ipt_alloc_initial_table(&packet_filter);
+	if (repl == NULL)
+		return -ENOMEM;
+	/* Entry 1 is the FORWARD hook: its policy follows the "forward" module parameter */
+	((struct ipt_standard *)repl->entries)[1].target.verdict =
+		forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
+
+	err = ipt_register_table(net, &packet_filter, repl, filter_ops);
+	kfree(repl);	/* freed whether or not registration succeeded */
+	return err;
+}
+
+static int __net_init iptable_filter_net_init(struct net *net)
+{
+	if (!forward)
+		return iptable_filter_table_init(net);
+
+	return 0;
+}
+
+static void __net_exit iptable_filter_net_pre_exit(struct net *net)
+{
+	ipt_unregister_table_pre_exit(net, "filter");
+}
+
+static void __net_exit iptable_filter_net_exit(struct net *net)
+{
+	ipt_unregister_table_exit(net, "filter");
+}
+
+static struct pernet_operations iptable_filter_net_ops = {
+	.init = iptable_filter_net_init,
+	.pre_exit = iptable_filter_net_pre_exit,
+	.exit = iptable_filter_net_exit,
+};
+
+static int __init iptable_filter_init(void)
+{
+	int ret = xt_register_template(&packet_filter,
+				       iptable_filter_table_init);
+
+	if (ret < 0)
+		return ret;
+	/* NOTE(review): the template is registered while filter_ops is still unset */
+	filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table);
+	if (IS_ERR(filter_ops)) {
+		xt_unregister_template(&packet_filter);
+		return PTR_ERR(filter_ops);
+	}
+
+	ret = register_pernet_subsys(&iptable_filter_net_ops);
+	if (ret < 0) {
+		xt_unregister_template(&packet_filter);	/* undo both earlier steps */
+		kfree(filter_ops);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void __exit iptable_filter_fini(void)
+{
+	unregister_pernet_subsys(&iptable_filter_net_ops);
+	xt_unregister_template(&packet_filter);
+	kfree(filter_ops);
+}
+
+module_init(iptable_filter_init);
+module_exit(iptable_filter_fini);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
new file mode 100644
index 0000000000..3abb430af9
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables mangle table");
+
+#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
+			    (1 << NF_INET_LOCAL_IN) | \
+			    (1 << NF_INET_FORWARD) | \
+			    (1 << NF_INET_LOCAL_OUT) | \
+			    (1 << NF_INET_POST_ROUTING))
+
+static const struct xt_table packet_mangler = {
+	.name		= "mangle",
+	.valid_hooks	= MANGLE_VALID_HOOKS,
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV4,
+	.priority	= NF_IP_PRI_MANGLE,
+};
+
+/* LOCAL_OUT handler: run the mangle rules and, unless the packet was dropped
+ * or stolen, ask for a new route when the rules changed a field saved before
+ * the run (source/destination address, mark or TOS). A failing re-route turns
+ * the verdict into NF_DROP_ERR(err).
+ */
+static unsigned int
+ipt_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
+{
+	const struct iphdr *hdr = ip_hdr(skb);
+	/* Save things which could affect route */
+	const __be32 old_saddr = hdr->saddr;
+	const __be32 old_daddr = hdr->daddr;
+	const u_int32_t old_mark = skb->mark;
+	const u_int8_t old_tos = hdr->tos;
+	unsigned int verdict;
+	int err;
+
+	verdict = ipt_do_table(priv, skb, state);
+	if (verdict == NF_DROP || verdict == NF_STOLEN)
+		return verdict;
+
+	/* Look the header up again now that the rules have run. */
+	hdr = ip_hdr(skb);
+	if (hdr->saddr == old_saddr && hdr->daddr == old_daddr &&
+	    skb->mark == old_mark && hdr->tos == old_tos)
+		return verdict;
+
+	/* Reroute for ANY change. */
+	err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
+	if (err < 0)
+		verdict = NF_DROP_ERR(err);
+
+	return verdict;
+}
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+iptable_mangle_hook(void *priv,
+		     struct sk_buff *skb,
+		     const struct nf_hook_state *state)
+{
+	if (state->hook == NF_INET_LOCAL_OUT)
+		return ipt_mangle_out(priv, skb, state);
+	return ipt_do_table(priv, skb, state);
+}
+
+static struct nf_hook_ops *mangle_ops __read_mostly;
+static int iptable_mangle_table_init(struct net *net)
+{
+	struct ipt_replace *repl;
+	int ret;
+
+	repl = ipt_alloc_initial_table(&packet_mangler);
+	if (repl == NULL)
+		return -ENOMEM;
+	ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops);
+	kfree(repl);
+	return ret;
+}
+
+static void __net_exit iptable_mangle_net_pre_exit(struct net *net)
+{
+	ipt_unregister_table_pre_exit(net, "mangle");
+}
+
+static void __net_exit iptable_mangle_net_exit(struct net *net)
+{
+	ipt_unregister_table_exit(net, "mangle");
+}
+
+static struct pernet_operations iptable_mangle_net_ops = {
+	.pre_exit = iptable_mangle_net_pre_exit,
+	.exit = iptable_mangle_net_exit,
+};
+
+static int __init iptable_mangle_init(void)
+{
+	int ret = xt_register_template(&packet_mangler,
+				       iptable_mangle_table_init);
+	if (ret < 0)
+		return ret;
+
+	mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
+	if (IS_ERR(mangle_ops)) {
+		xt_unregister_template(&packet_mangler);
+		ret = PTR_ERR(mangle_ops);
+		return ret;
+	}
+
+	ret = register_pernet_subsys(&iptable_mangle_net_ops);
+	if (ret < 0) {
+		xt_unregister_template(&packet_mangler);
+		kfree(mangle_ops);
+		return ret;
+	}
+
+	return ret;
+}
+
+static void __exit iptable_mangle_fini(void)
+{
+	unregister_pernet_subsys(&iptable_mangle_net_ops);
+	xt_unregister_template(&packet_mangler);
+	kfree(mangle_ops);
+}
+
+module_init(iptable_mangle_init);
+module_exit(iptable_mangle_fini);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
new file mode 100644
index 0000000000..56f6ecc434
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_nat.h>
+
+struct iptable_nat_pernet {
+	struct nf_hook_ops *nf_nat_ops;
+};
+
+static unsigned int iptable_nat_net_id __read_mostly;
+
+static const struct xt_table nf_nat_ipv4_table = {
+	.name		= "nat",
+	.valid_hooks	= (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_LOCAL_IN),
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV4,
+};
+
+static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
+	{
+		.hook		= ipt_do_table,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP_PRI_NAT_DST,
+	},
+	{
+		.hook		= ipt_do_table,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_NAT_SRC,
+	},
+	{
+		.hook		= ipt_do_table,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST,
+	},
+	{
+		.hook		= ipt_do_table,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC,
+	},
+};
+
+/* Clone nf_nat_ipv4_ops for @net, bind each copy to the "nat" table and
+ * register it; on failure everything registered so far is rolled back.
+ */
+static int ipt_nat_register_lookups(struct net *net)
+{
+	struct iptable_nat_pernet *pernet = net_generic(net, iptable_nat_net_id);
+	struct xt_table *nat_table = xt_find_table(net, NFPROTO_IPV4, "nat");
+	struct nf_hook_ops *hooks;
+	int n, err;
+
+	if (WARN_ON_ONCE(!nat_table))
+		return -ENOENT;
+
+	hooks = kmemdup(nf_nat_ipv4_ops, sizeof(nf_nat_ipv4_ops), GFP_KERNEL);
+	if (!hooks)
+		return -ENOMEM;
+
+	for (n = 0; n < ARRAY_SIZE(nf_nat_ipv4_ops); n++) {
+		hooks[n].priv = nat_table;
+		err = nf_nat_ipv4_register_fn(net, &hooks[n]);
+		if (err)
+			goto unwind;
+	}
+	pernet->nf_nat_ops = hooks;
+	return 0;
+unwind:
+	while (n > 0)
+		nf_nat_ipv4_unregister_fn(net, &hooks[--n]);
+	kfree(hooks);
+	return err;
+}
+
+static void ipt_nat_unregister_lookups(struct net *net)
+{
+	struct iptable_nat_pernet *xt_nat_net = net_generic(net, iptable_nat_net_id);
+	struct nf_hook_ops *ops = xt_nat_net->nf_nat_ops;
+	int i;
+
+	if (!ops)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++)
+		nf_nat_ipv4_unregister_fn(net, &ops[i]);
+
+	kfree(ops);
+}
+
+/* Create the "nat" table for @net from ipt_alloc_initial_table()'s blob and
+ * register the NAT lookups; the table is removed again if that fails.
+ */
+static int iptable_nat_table_init(struct net *net)
+{
+	struct ipt_replace *initial = ipt_alloc_initial_table(&nf_nat_ipv4_table);
+	int err;
+
+	if (!initial)
+		return -ENOMEM;
+
+	err = ipt_register_table(net, &nf_nat_ipv4_table, initial, NULL);
+	if (err >= 0) {
+		err = ipt_nat_register_lookups(net);
+		if (err < 0)
+			ipt_unregister_table_exit(net, "nat");
+	}
+
+	/* The replace blob is released on every path once it was allocated. */
+	kfree(initial);
+	return err;
+}
+
+static void __net_exit iptable_nat_net_pre_exit(struct net *net)
+{
+	ipt_nat_unregister_lookups(net);
+}
+
+static void __net_exit iptable_nat_net_exit(struct net *net)
+{
+	ipt_unregister_table_exit(net, "nat");
+}
+
+static struct pernet_operations iptable_nat_net_ops = {
+	.pre_exit = iptable_nat_net_pre_exit,
+	.exit	= iptable_nat_net_exit,
+	.id	= &iptable_nat_net_id,
+	.size	= sizeof(struct iptable_nat_pernet),
+};
+
+static int __init iptable_nat_init(void)
+{
+	int ret;
+
+	/* iptable_nat_table_init() reads net_generic(net, iptable_nat_net_id),
+	 * so the pernet ops (which own that slot) must be registered first.
+	 */
+	ret = register_pernet_subsys(&iptable_nat_net_ops);
+	if (ret < 0)
+		return ret;
+
+	ret = xt_register_template(&nf_nat_ipv4_table, iptable_nat_table_init);
+	if (ret < 0)
+		unregister_pernet_subsys(&iptable_nat_net_ops);
+	return ret;
+}
+
+static void __exit iptable_nat_exit(void)
+{
+	xt_unregister_template(&nf_nat_ipv4_table);	/* first: no new tables from here on */
+	unregister_pernet_subsys(&iptable_nat_net_ops);
+}
+
+module_init(iptable_nat_init);
+module_exit(iptable_nat_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
new file mode 100644
index 0000000000..ca5e5b2158
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT .
+ *
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
+
+static bool raw_before_defrag __read_mostly;
+MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
+module_param(raw_before_defrag, bool, 0000);
+
+static const struct xt_table packet_raw = {
+	.name = "raw",
+	.valid_hooks =  RAW_VALID_HOOKS,
+	.me = THIS_MODULE,
+	.af = NFPROTO_IPV4,
+	.priority = NF_IP_PRI_RAW,
+};
+
+static const struct xt_table packet_raw_before_defrag = {
+	.name = "raw",
+	.valid_hooks =  RAW_VALID_HOOKS,
+	.me = THIS_MODULE,
+	.af = NFPROTO_IPV4,
+	.priority = NF_IP_PRI_RAW_BEFORE_DEFRAG,
+};
+
+static struct nf_hook_ops *rawtable_ops __read_mostly;
+
+static int iptable_raw_table_init(struct net *net)
+{
+	struct ipt_replace *repl;
+	const struct xt_table *table = &packet_raw;
+	int ret;
+
+	if (raw_before_defrag)
+		table = &packet_raw_before_defrag;
+
+	repl = ipt_alloc_initial_table(table);
+	if (repl == NULL)
+		return -ENOMEM;
+	ret = ipt_register_table(net, table, repl, rawtable_ops);
+	kfree(repl);
+	return ret;
+}
+
+static void __net_exit iptable_raw_net_pre_exit(struct net *net)
+{
+	ipt_unregister_table_pre_exit(net, "raw");
+}
+
+static void __net_exit iptable_raw_net_exit(struct net *net)
+{
+	ipt_unregister_table_exit(net, "raw");
+}
+
+static struct pernet_operations iptable_raw_net_ops = {
+	.pre_exit = iptable_raw_net_pre_exit,
+	.exit = iptable_raw_net_exit,
+};
+
+static int __init iptable_raw_init(void)
+{
+	int ret;
+	const struct xt_table *table = &packet_raw;
+
+	if (raw_before_defrag) {
+		table = &packet_raw_before_defrag;
+
+		pr_info("Enabling raw table before defrag\n");
+	}
+
+	ret = xt_register_template(table,
+				   iptable_raw_table_init);
+	if (ret < 0)
+		return ret;
+
+	rawtable_ops = xt_hook_ops_alloc(table, ipt_do_table);
+	if (IS_ERR(rawtable_ops)) {
+		xt_unregister_template(table);
+		return PTR_ERR(rawtable_ops);
+	}
+
+	ret = register_pernet_subsys(&iptable_raw_net_ops);
+	if (ret < 0) {
+		xt_unregister_template(table);
+		kfree(rawtable_ops);
+		return ret;
+	}
+
+	return ret;
+}
+
+static void __exit iptable_raw_fini(void)
+{
+	unregister_pernet_subsys(&iptable_raw_net_ops);
+	kfree(rawtable_ops);
+	xt_unregister_template(&packet_raw);	/* NOTE(review): also used when the before_defrag variant was registered */
+}
+
+module_init(iptable_raw_init);
+module_exit(iptable_raw_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
new file mode 100644
index 0000000000..d885443cb2
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * "security" table
+ *
+ * This is for use by Mandatory Access Control (MAC) security models,
+ * which need to be able to manage security policy in separate context
+ * to DAC.
+ *
+ * Based on iptable_mangle.c
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org>
+ * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com>
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>");
+MODULE_DESCRIPTION("iptables security table, for MAC rules");
+
+#define SECURITY_VALID_HOOKS	((1 << NF_INET_LOCAL_IN) | \
+				 (1 << NF_INET_FORWARD) | \
+				 (1 << NF_INET_LOCAL_OUT))	/* hooks of the security table */
+
+static const struct xt_table security_table = {
+	.name		= "security",
+	.valid_hooks	= SECURITY_VALID_HOOKS,
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV4,
+	.priority	= NF_IP_PRI_SECURITY,
+};
+
+static struct nf_hook_ops *sectbl_ops __read_mostly;
+
+static int iptable_security_table_init(struct net *net)
+{
+	struct ipt_replace *repl;
+	int ret;
+
+	repl = ipt_alloc_initial_table(&security_table);
+	if (repl == NULL)
+		return -ENOMEM;
+	ret = ipt_register_table(net, &security_table, repl, sectbl_ops);
+	kfree(repl);
+	return ret;
+}
+
+static void __net_exit iptable_security_net_pre_exit(struct net *net)
+{
+	ipt_unregister_table_pre_exit(net, "security");
+}
+
+static void __net_exit iptable_security_net_exit(struct net *net)
+{
+	ipt_unregister_table_exit(net, "security");
+}
+
+static struct pernet_operations iptable_security_net_ops = {
+	.pre_exit = iptable_security_net_pre_exit,
+	.exit = iptable_security_net_exit,
+};
+
+static int __init iptable_security_init(void)
+{
+	int ret = xt_register_template(&security_table,
+				       iptable_security_table_init);
+
+	if (ret < 0)
+		return ret;
+
+	sectbl_ops = xt_hook_ops_alloc(&security_table, ipt_do_table);
+	if (IS_ERR(sectbl_ops)) {
+		xt_unregister_template(&security_table);
+		return PTR_ERR(sectbl_ops);
+	}
+
+	ret = register_pernet_subsys(&iptable_security_net_ops);
+	if (ret < 0) {
+		xt_unregister_template(&security_table);
+		kfree(sectbl_ops);
+		return ret;
+	}
+
+	return ret;
+}
+
+static void __exit iptable_security_fini(void)
+{
+	unregister_pernet_subsys(&iptable_security_net_ops);
+	kfree(sectbl_ops);
+	xt_unregister_template(&security_table);
+}
+
+module_init(iptable_security_init);
+module_exit(iptable_security_fini);
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
new file mode 100644
index 0000000000..265b39bc43
--- /dev/null
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <net/netns/generic.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static DEFINE_MUTEX(defrag4_mutex);
+
+static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
+				   u_int32_t user)
+{
+	int err;
+
+	local_bh_disable();
+	err = ip_defrag(net, skb, user);
+	local_bh_enable();
+
+	if (!err)
+		skb->ignore_df = 1;
+
+	return err;
+}
+
+/* Select the ip_defrag() user for @skb: bridge-prerouting, PRE_ROUTING or
+ * output base value, offset by the conntrack zone id (default zone if none). */
+static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
+					      struct sk_buff *skb)
+{
+	u16 zone = NF_CT_DEFAULT_ZONE_ID;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	if (skb_nfct(skb)) {
+		enum ip_conntrack_info ctinfo;
+		const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		zone = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+	}
+#endif
+	if (nf_bridge_in_prerouting(skb))
+		return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+
+	return (hooknum == NF_INET_PRE_ROUTING ? IP_DEFRAG_CONNTRACK_IN :
+						 IP_DEFRAG_CONNTRACK_OUT) + zone;
+}
+
+static unsigned int ipv4_conntrack_defrag(void *priv,
+					  struct sk_buff *skb,
+					  const struct nf_hook_state *state)
+{
+	struct sock *sk = skb->sk;
+
+	if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) &&
+	    inet_test_bit(NODEFRAG, sk))
+		return NF_ACCEPT;	/* owning socket has NODEFRAG set */
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#if !IS_ENABLED(CONFIG_NF_NAT)
+	/* Previously seen (loopback)?  Ignore.  Do this before
+	   fragment check. */
+	if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
+		return NF_ACCEPT;
+#endif
+	if (skb->_nfct == IP_CT_UNTRACKED)
+		return NF_ACCEPT;	/* untracked packets are passed through as-is */
+#endif
+	/* Gather fragments. A non-zero ip_defrag() result is reported as NF_STOLEN. */
+	if (ip_is_fragment(ip_hdr(skb))) {
+		enum ip_defrag_users user =
+			nf_ct_defrag_user(state->hook, skb);
+
+		if (nf_ct_ipv4_gather_frags(state->net, skb, user))
+			return NF_STOLEN;
+	}
+	return NF_ACCEPT;
+}
+
+static const struct nf_hook_ops ipv4_defrag_ops[] = {
+	{
+		.hook		= ipv4_conntrack_defrag,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,
+	},
+	{
+		.hook           = ipv4_conntrack_defrag,
+		.pf             = NFPROTO_IPV4,
+		.hooknum        = NF_INET_LOCAL_OUT,
+		.priority       = NF_IP_PRI_CONNTRACK_DEFRAG,
+	},
+};
+
+static void __net_exit defrag4_net_exit(struct net *net)
+{
+	if (net->nf.defrag_ipv4_users) {
+		nf_unregister_net_hooks(net, ipv4_defrag_ops,
+					ARRAY_SIZE(ipv4_defrag_ops));
+		net->nf.defrag_ipv4_users = 0;
+	}
+}
+
+static const struct nf_defrag_hook defrag_hook = {
+	.owner = THIS_MODULE,
+	.enable = nf_defrag_ipv4_enable,
+	.disable = nf_defrag_ipv4_disable,
+};
+
+static struct pernet_operations defrag4_net_ops = {
+	.exit = defrag4_net_exit,
+};
+
+static int __init nf_defrag_init(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&defrag4_net_ops);
+	if (err)
+		return err;
+
+	rcu_assign_pointer(nf_defrag_v4_hook, &defrag_hook);
+	return err;
+}
+
+static void __exit nf_defrag_fini(void)
+{
+	rcu_assign_pointer(nf_defrag_v4_hook, NULL);
+	unregister_pernet_subsys(&defrag4_net_ops);
+}
+
+/* Take one per-netns reference on IPv4 defrag; the first reference
+ * registers ipv4_defrag_ops. Returns 0, -EOVERFLOW when the counter is
+ * saturated, or the error from nf_register_net_hooks().
+ */
+int nf_defrag_ipv4_enable(struct net *net)
+{
+	int ret = 0;
+
+	mutex_lock(&defrag4_mutex);
+
+	if (net->nf.defrag_ipv4_users == UINT_MAX) {
+		ret = -EOVERFLOW;
+	} else if (net->nf.defrag_ipv4_users) {
+		net->nf.defrag_ipv4_users++;
+	} else {
+		ret = nf_register_net_hooks(net, ipv4_defrag_ops,
+					    ARRAY_SIZE(ipv4_defrag_ops));
+		if (!ret)
+			net->nf.defrag_ipv4_users = 1;
+	}
+
+	mutex_unlock(&defrag4_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
+
+/* Drop one reference taken by nf_defrag_ipv4_enable(); the last one
+ * unregisters ipv4_defrag_ops. A zero count is left untouched.
+ */
+void nf_defrag_ipv4_disable(struct net *net)
+{
+	mutex_lock(&defrag4_mutex);
+	if (net->nf.defrag_ipv4_users && --net->nf.defrag_ipv4_users == 0)
+		nf_unregister_net_hooks(net, ipv4_defrag_ops,
+					ARRAY_SIZE(ipv4_defrag_ops));
+
+	mutex_unlock(&defrag4_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv4_disable);
+
+module_init(nf_defrag_init);
+module_exit(nf_defrag_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
new file mode 100644
index 0000000000..6cc5743c55
--- /dev/null
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * (C) 2007 by Sebastian Claßen <sebastian.classen@freenet.ag>
+ * (C) 2007-2010 by Jan Engelhardt <jengelh@medozas.de>
+ *
+ * Extracted from xt_TEE.c
+ */
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+/* Route @skb to @gw (via @oif unless it is -1) and attach that route to it. */
+static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb,
+			      const struct in_addr *gw, int oif)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct flowi4 key;
+	struct rtable *route;
+
+	memset(&key, 0, sizeof(key));
+	key.flowi4_oif = (oif != -1) ? oif : 0;
+	key.daddr = gw->s_addr;
+	key.flowi4_tos = RT_TOS(iph->tos);
+	key.flowi4_scope = RT_SCOPE_UNIVERSE;
+	key.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
+
+	route = ip_route_output_key(net, &key);
+	if (IS_ERR(route))
+		return false;
+
+	/* Replace whatever dst the skb carried with the freshly found route. */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &route->dst);
+	skb->dev      = route->dst.dev;
+	skb->protocol = htons(ETH_P_IP);
+	return true;
+}
+
+void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
+		 const struct in_addr *gw, int oif)
+{
+	struct iphdr *iph;
+
+	/* nf_skb_duplicated is per-CPU: keep BHs off so the recursion check
+	 * and the set/clear around ip_local_out() stay on one CPU.
+	 */
+	local_bh_disable();
+	if (this_cpu_read(nf_skb_duplicated))
+		goto out;
+	/* Copy the skb and route the copy. The caller later returns
+	 * %XT_CONTINUE for the original skb, which continues on its way as if
+	 * nothing happened; the copy is delivered independently to the gateway.
+	 */
+	skb = pskb_copy(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		goto out;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	/* Avoid counting cloned packets towards the original connection. */
+	nf_reset_ct(skb);
+	nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+#endif
+	/* In PREROUTING/INPUT, decrease the TTL to mitigate potential loops
+	 * between two hosts. Set %IP_DF so that the original source is notified
+	 * of a potentially decreased MTU on the clone route (IPv6 does this
+	 * too). The IP header checksum is recalculated at ip_local_out.
+	 */
+	iph = ip_hdr(skb);
+	iph->frag_off |= htons(IP_DF);
+	if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_IN)
+		--iph->ttl;
+
+	if (nf_dup_ipv4_route(net, skb, gw, oif)) {
+		__this_cpu_write(nf_skb_duplicated, true);
+		ip_local_out(net, skb->sk, skb);
+		__this_cpu_write(nf_skb_duplicated, false);
+	} else {
+		kfree_skb(skb);
+	}
+out:
+	local_bh_enable();
+}
+EXPORT_SYMBOL_GPL(nf_dup_ipv4);
+
+MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("nf_dup_ipv4: Duplicate IPv4 packet");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
new file mode 100644
index 0000000000..faee20af48
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -0,0 +1,567 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * H.323 extension for NAT alteration.
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * Based on the 'brute force' H.323 NAT module by
+ * Jozsef Kadlecsik <kadlec@netfilter.org>
+ */
+
+#include <linux/module.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_h323.h>
+
+/****************************************************************************/
+static int set_addr(struct sk_buff *skb, unsigned int protoff,
+		    unsigned char **data, int dataoff,
+		    unsigned int addroff, __be32 ip, __be16 port)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct {
+		__be32 ip;
+		__be16 port;
+	} __attribute__ ((__packed__)) buf;
+	const struct tcphdr *th;
+	struct tcphdr _tcph;
+
+	buf.ip = ip;
+	buf.port = port;
+	addroff += dataoff;
+
+	if (ip_hdr(skb)->protocol == IPPROTO_TCP) {
+		if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+					      protoff, addroff, sizeof(buf),
+					      (char *) &buf, sizeof(buf))) {
+			net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
+			return -1;
+		}
+
+		/* Relocate data pointer */
+		th = skb_header_pointer(skb, ip_hdrlen(skb),
+					sizeof(_tcph), &_tcph);
+		if (th == NULL)
+			return -1;
+		*data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
+	} else {
+		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
+					      protoff, addroff, sizeof(buf),
+					      (char *) &buf, sizeof(buf))) {
+			net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
+			return -1;
+		}
+		/* nf_nat_mangle_udp_packet uses skb_ensure_writable() to copy
+		 * or pull everything in a linear buffer, so we can safely
+		 * use the skb pointers now */
+		*data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr);
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int set_h225_addr(struct sk_buff *skb, unsigned int protoff,
+			 unsigned char **data, int dataoff,
+			 TransportAddress *taddr,
+			 union nf_inet_addr *addr, __be16 port)
+{
+	return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip,
+			addr->ip, port);
+}
+
+/****************************************************************************/
+static int set_h245_addr(struct sk_buff *skb, unsigned protoff,
+			 unsigned char **data, int dataoff,
+			 H245_TransportAddress *taddr,
+			 union nf_inet_addr *addr, __be16 port)
+{
+	return set_addr(skb, protoff, data, dataoff,
+			taddr->unicastAddress.iPAddress.network,
+			addr->ip, port);
+}
+
+/****************************************************************************/
+/* Rewrite the first taddr[] entry that names this connection's signal address
+ * (port sig_port[dir]) to the opposite direction's address and sig_port[!dir]. */
+static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned int protoff, unsigned char **data,
+			TransportAddress *taddr, int count)
+{
+	const struct nf_ct_h323_master *info = nfct_help_data(ct);
+	int dir = CTINFO2DIR(ctinfo);
+	int i;
+	__be16 port;
+	union nf_inet_addr addr;
+
+	for (i = 0; i < count; i++) {
+		if (get_h225_addr(ct, *data, &taddr[i], &addr, &port)) {
+			if (addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
+			    port == info->sig_port[dir]) {
+				/* GW->GK */
+				/* Fix for Gnomemeeting: if taddr[0] is a
+				 * loopback (127/8) address, rewrite it instead */
+				if (i > 0 &&
+				    get_h225_addr(ct, *data, &taddr[0],
+						  &addr, &port) &&
+				    (ntohl(addr.ip) & 0xff000000) == 0x7f000000)
+					i = 0;
+
+				pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
+					 &addr.ip, ntohs(port),
+					 &ct->tuplehash[!dir].tuple.dst.u3.ip,
+					 ntohs(info->sig_port[!dir]));
+				return set_h225_addr(skb, protoff, data, 0,
+						     &taddr[i],
+						     &ct->tuplehash[!dir].tuple.dst.u3,
+						     info->sig_port[!dir]);
+			} else if (addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
+				   port == info->sig_port[dir]) {
+				/* GK->GW */
+				pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
+					 &addr.ip, ntohs(port),
+					 &ct->tuplehash[!dir].tuple.src.u3.ip,
+					 ntohs(info->sig_port[!dir]));
+				return set_h225_addr(skb, protoff, data, 0,
+						     &taddr[i],
+						     &ct->tuplehash[!dir].tuple.src.u3,
+						     info->sig_port[!dir]);
+			}
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/* Point the first RAS address equal to this direction's source address and
+ * UDP port at the destination address/port of the opposite direction. */
+static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned int protoff, unsigned char **data,
+			TransportAddress *taddr, int count)
+{
+	const int dir = CTINFO2DIR(ctinfo);
+	struct nf_conntrack_tuple *self = &ct->tuplehash[dir].tuple;
+	struct nf_conntrack_tuple *peer = &ct->tuplehash[!dir].tuple;
+	union nf_inet_addr addr;
+	__be16 port;
+	int idx;
+
+	for (idx = 0; idx < count; idx++) {
+		if (!get_h225_addr(ct, *data, &taddr[idx], &addr, &port))
+			continue;
+		if (addr.ip != self->src.u3.ip || port != self->src.u.udp.port)
+			continue;
+		pr_debug("nf_nat_ras: set rasAddress %pI4:%hu->%pI4:%hu\n",
+			 &addr.ip, ntohs(port),
+			 &peer->dst.u3.ip, ntohs(peer->dst.u.udp.port));
+		return set_h225_addr(skb, protoff, data, 0, &taddr[idx],
+				     &peer->dst.u3, peer->dst.u.udp.port);
+	}
+	return 0;
+}
+
+/****************************************************************************/
+static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned int protoff, unsigned char **data, int dataoff,
+			H245_TransportAddress *taddr,
+			__be16 port, __be16 rtp_port,
+			struct nf_conntrack_expect *rtp_exp,
+			struct nf_conntrack_expect *rtcp_exp)
+{
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
+	int dir = CTINFO2DIR(ctinfo);
+	int i;
+	u_int16_t nated_port;
+
+	/* Set expectations for NAT: save the original destination ports before they are rewritten below */
+	rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
+	rtp_exp->expectfn = nf_nat_follow_master;
+	rtp_exp->dir = !dir;
+	rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
+	rtcp_exp->expectfn = nf_nat_follow_master;
+	rtcp_exp->dir = !dir;
+
+	/* Lookup existing expects: scan info->rtp_port[][dir] for rtp_port or a free (0) slot */
+	for (i = 0; i < H323_RTP_CHANNEL_MAX; i++) {
+		if (info->rtp_port[i][dir] == rtp_port) {
+			/* Expected */
+
+			/* Use allocated ports first. This will refresh
+			 * the expects */
+			rtp_exp->tuple.dst.u.udp.port = info->rtp_port[i][dir];
+			rtcp_exp->tuple.dst.u.udp.port =
+			    htons(ntohs(info->rtp_port[i][dir]) + 1);
+			break;
+		} else if (info->rtp_port[i][dir] == 0) {
+			/* Not expected */
+			break;
+		}
+	}
+
+	/* Run out of expectations: no slot matched and none was free; packet is left unmodified */
+	if (i >= H323_RTP_CHANNEL_MAX) {
+		net_notice_ratelimited("nf_nat_h323: out of expectations\n");
+		return 0;
+	}
+
+	/* Try to get a pair of ports: RTP on nated_port, RTCP on nated_port + 1; 0 means failure. */
+	for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
+	     nated_port != 0; nated_port += 2) {
+		int ret;
+
+		rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
+		ret = nf_ct_expect_related(rtp_exp, 0);
+		if (ret == 0) {
+			rtcp_exp->tuple.dst.u.udp.port =
+			    htons(nated_port + 1);
+			ret = nf_ct_expect_related(rtcp_exp, 0);
+			if (ret == 0)
+				break;
+			else if (ret == -EBUSY) {
+				nf_ct_unexpect_related(rtp_exp);	/* RTCP port taken: try the next pair */
+				continue;
+			} else if (ret < 0) {
+				nf_ct_unexpect_related(rtp_exp);
+				nated_port = 0;
+				break;
+			}
+		} else if (ret != -EBUSY) {
+			nated_port = 0;	/* any error other than -EBUSY ends the search */
+			break;
+		}
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		net_notice_ratelimited("nf_nat_h323: out of RTP ports\n");
+		return 0;
+	}
+
+	/* Modify signal: write nated_port + 1 when the original port is odd, else nated_port */
+	if (set_h245_addr(skb, protoff, data, dataoff, taddr,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  htons((port & htons(1)) ? nated_port + 1 :
+						    nated_port))) {
+		nf_ct_unexpect_related(rtp_exp);
+		nf_ct_unexpect_related(rtcp_exp);
+		return -1;
+	}
+
+	/* Save ports: slot i maps rtp_port (this direction) to nated_port (other direction) */
+	info->rtp_port[i][dir] = rtp_port;
+	info->rtp_port[i][!dir] = htons(nated_port);
+
+	/* Success */
+	pr_debug("nf_nat_h323: expect RTP %pI4:%hu->%pI4:%hu\n",
+		 &rtp_exp->tuple.src.u3.ip,
+		 ntohs(rtp_exp->tuple.src.u.udp.port),
+		 &rtp_exp->tuple.dst.u3.ip,
+		 ntohs(rtp_exp->tuple.dst.u.udp.port));
+	pr_debug("nf_nat_h323: expect RTCP %pI4:%hu->%pI4:%hu\n",
+		 &rtcp_exp->tuple.src.u3.ip,
+		 ntohs(rtcp_exp->tuple.src.u.udp.port),
+		 &rtcp_exp->tuple.dst.u3.ip,
+		 ntohs(rtcp_exp->tuple.dst.u.udp.port));
+
+	return 0;
+}
+
+/****************************************************************************/
+static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
+		    enum ip_conntrack_info ctinfo,
+		    unsigned int protoff, unsigned char **data, int dataoff,
+		    H245_TransportAddress *taddr, __be16 port,
+		    struct nf_conntrack_expect *exp)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port = ntohs(port);	/* host byte order */
+
+	/* NAT a T.120 expectation: save its port, expect direction !dir */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = nf_nat_follow_master;
+	exp->dir = !dir;
+
+	nated_port = nf_nat_exp_find_port(exp, nated_port);
+	if (nated_port == 0) {	/* No port available */
+		net_notice_ratelimited("nf_nat_h323: out of TCP ports\n");
+		return 0;	/* message is left unmodified */
+	}
+
+	/* Rewrite the H.245 address to tuplehash[!dir]'s dst and nated_port */
+	if (set_h245_addr(skb, protoff, data, dataoff, taddr,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  htons(nated_port)) < 0) {
+		nf_ct_unexpect_related(exp);
+		return -1;	/* rewrite failed; expectation removed */
+	}
+
+	pr_debug("nf_nat_h323: expect T.120 %pI4:%hu->%pI4:%hu\n",
+		 &exp->tuple.src.u3.ip,
+		 ntohs(exp->tuple.src.u.tcp.port),
+		 &exp->tuple.dst.u3.ip,
+		 ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
+/****************************************************************************/
+static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
+		    enum ip_conntrack_info ctinfo,
+		    unsigned int protoff, unsigned char **data, int dataoff,
+		    TransportAddress *taddr, __be16 port,
+		    struct nf_conntrack_expect *exp)
+{
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
+	int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port = ntohs(port);	/* host byte order */
+
+	/* NAT an H.245 expectation: save its port, expect direction !dir */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = nf_nat_follow_master;
+	exp->dir = !dir;
+
+	/* Same port as recorded before: start from the port mapped back then */
+	if (info->sig_port[dir] == port)
+		nated_port = ntohs(info->sig_port[!dir]);
+
+	nated_port = nf_nat_exp_find_port(exp, nated_port);
+	if (nated_port == 0) {	/* No port available */
+		net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
+		return 0;	/* message is left unmodified */
+	}
+
+	/* Rewrite the H.225 address to tuplehash[!dir]'s dst and nated_port */
+	if (set_h225_addr(skb, protoff, data, dataoff, taddr,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  htons(nated_port))) {
+		nf_ct_unexpect_related(exp);
+		return -1;	/* rewrite failed; expectation removed */
+	}
+
+	/* Save ports for the "same port" check above on later messages */
+	info->sig_port[dir] = port;
+	info->sig_port[!dir] = htons(nated_port);
+
+	pr_debug("nf_nat_q931: expect H.245 %pI4:%hu->%pI4:%hu\n",
+		 &exp->tuple.src.u3.ip,
+		 ntohs(exp->tuple.src.u.tcp.port),
+		 &exp->tuple.dst.u3.ip,
+		 ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
+/****************************************************************************
+ * This conntrack expect function replaces nf_conntrack_q931_expect()
+ * which was set by nf_conntrack_h323.c.
+ ****************************************************************************/
+static void ip_nat_q931_expect(struct nf_conn *new,
+			       struct nf_conntrack_expect *this)
+{
+	struct nf_nat_range2 range;	/* reused for the SRC and DST setup */
+
+	if (this->tuple.src.u3.ip != 0) {	/* Only accept calls from GK */
+		nf_nat_follow_master(new, this);
+		return;
+	}
+
+	/* No NAT setup may have been done on the new conntrack yet. */
+	BUG_ON(new->status & IPS_NAT_DONE_MASK);
+
+	/* Change src to where master sends to */
+	range.flags = NF_NAT_RANGE_MAP_IPS;
+	range.min_addr = range.max_addr =
+	    new->tuplehash[!this->dir].tuple.src.u3;
+	nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
+
+	/* For DST manip, use the saved port and the master's src address. */
+	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+	range.min_proto = range.max_proto = this->saved_proto;
+	range.min_addr = range.max_addr =
+	    new->master->tuplehash[!this->dir].tuple.src.u3;
+	nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
+}
+
+/****************************************************************************/
+static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
+		    enum ip_conntrack_info ctinfo,
+		    unsigned int protoff, unsigned char **data,
+		    TransportAddress *taddr, int idx,
+		    __be16 port, struct nf_conntrack_expect *exp)
+{
+	struct nf_ct_h323_master *info = nfct_help_data(ct);
+	int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port = ntohs(port);	/* host byte order */
+	union nf_inet_addr addr;	/* only used by the taddr[0] check below */
+
+	/* NAT a Q.931 expectation; new connections run ip_nat_q931_expect() */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = ip_nat_q931_expect;
+	exp->dir = !dir;
+
+	/* Same port as recorded before: start from the port mapped back then */
+	if (info->sig_port[dir] == port)
+		nated_port = ntohs(info->sig_port[!dir]);
+
+	nated_port = nf_nat_exp_find_port(exp, nated_port);
+	if (nated_port == 0) {	/* No port available */
+		net_notice_ratelimited("nf_nat_ras: out of TCP ports\n");
+		return 0;	/* message is left unmodified */
+	}
+
+	/* Rewrite taddr[idx] to tuplehash[!dir]'s dst and nated_port */
+	if (set_h225_addr(skb, protoff, data, 0, &taddr[idx],
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  htons(nated_port))) {
+		nf_ct_unexpect_related(exp);
+		return -1;	/* rewrite failed; expectation removed */
+	}
+
+	/* Save ports for the "same port" check above on later messages */
+	info->sig_port[dir] = port;
+	info->sig_port[!dir] = htons(nated_port);
+
+	/* Fix for Gnomemeeting: also rewrite taddr[0] when it is 127.x.x.x */
+	if (idx > 0 &&
+	    get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
+	    (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
+		if (set_h225_addr(skb, protoff, data, 0, &taddr[0],
+				  &ct->tuplehash[!dir].tuple.dst.u3,
+				  info->sig_port[!dir])) {
+			nf_ct_unexpect_related(exp);
+			return -1;
+		}
+	}
+
+	/* Success */
+	pr_debug("nf_nat_ras: expect Q.931 %pI4:%hu->%pI4:%hu\n",
+		 &exp->tuple.src.u3.ip,
+		 ntohs(exp->tuple.src.u.tcp.port),
+		 &exp->tuple.dst.u3.ip,
+		 ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
+/****************************************************************************/
+static void ip_nat_callforwarding_expect(struct nf_conn *new,
+					 struct nf_conntrack_expect *this)
+{
+	struct nf_nat_range2 range;	/* reused for the SRC and DST setup */
+
+	/* No NAT setup may have been done on the new conntrack yet. */
+	BUG_ON(new->status & IPS_NAT_DONE_MASK);
+
+	/* Change src to where master sends to */
+	range.flags = NF_NAT_RANGE_MAP_IPS;
+	range.min_addr = range.max_addr =
+	    new->tuplehash[!this->dir].tuple.src.u3;
+	nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
+
+	/* For DST manip, restore the address and port saved in the expect. */
+	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+	range.min_proto = range.max_proto = this->saved_proto;
+	range.min_addr = range.max_addr = this->saved_addr;
+	nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
+}
+
+/****************************************************************************/
+static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
+			      enum ip_conntrack_info ctinfo,
+			      unsigned int protoff,
+			      unsigned char **data, int dataoff,
+			      TransportAddress *taddr, __be16 port,
+			      struct nf_conntrack_expect *exp)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port;	/* host byte order; 0 = none available */
+
+	/* Save the real destination, then expect tuplehash[!dir]'s dst */
+	exp->saved_addr = exp->tuple.dst.u3;
+	exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = ip_nat_callforwarding_expect;
+	exp->dir = !dir;
+
+	nated_port = nf_nat_exp_find_port(exp, ntohs(port));
+	if (nated_port == 0) {	/* No port available */
+		net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
+		return 0;	/* message is left unmodified */
+	}
+
+	/* Rewrite the H.225 address to tuplehash[!dir]'s dst and nated_port */
+	if (set_h225_addr(skb, protoff, data, dataoff, taddr,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  htons(nated_port))) {
+		nf_ct_unexpect_related(exp);
+		return -1;	/* rewrite failed; expectation removed */
+	}
+
+	/* Success */
+	pr_debug("nf_nat_q931: expect Call Forwarding %pI4:%hu->%pI4:%hu\n",
+		 &exp->tuple.src.u3.ip,
+		 ntohs(exp->tuple.src.u.tcp.port),
+		 &exp->tuple.dst.u3.ip,
+		 ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
+static struct nf_ct_helper_expectfn q931_nat = {
+	.name		= "Q.931",	/* name paired with the callback below */
+	.expectfn	= ip_nat_q931_expect,	/* also set by nat_q931() */
+};
+
+static struct nf_ct_helper_expectfn callforwarding_nat = {
+	.name		= "callforwarding",	/* name paired with the callback */
+	.expectfn	= ip_nat_callforwarding_expect,
+};
+
+static const struct nfct_h323_nat_hooks nathooks = {
+	.set_h245_addr = set_h245_addr,	/* address rewriters */
+	.set_h225_addr = set_h225_addr,
+	.set_sig_addr = set_sig_addr,
+	.set_ras_addr = set_ras_addr,
+	.nat_rtp_rtcp = nat_rtp_rtcp,	/* expectation NAT handlers */
+	.nat_t120 = nat_t120,
+	.nat_h245 = nat_h245,
+	.nat_callforwarding = nat_callforwarding,
+	.nat_q931 = nat_q931,
+};
+
+/****************************************************************************/
+static int __init nf_nat_h323_init(void)
+{
+	RCU_INIT_POINTER(nfct_h323_nat_hook, &nathooks);	/* publish hooks */
+	nf_ct_helper_expectfn_register(&q931_nat);
+	nf_ct_helper_expectfn_register(&callforwarding_nat);
+	return 0;	/* nothing here can report failure */
+}
+
+/****************************************************************************/
+static void __exit nf_nat_h323_fini(void)
+{
+	RCU_INIT_POINTER(nfct_h323_nat_hook, NULL);	/* withdraw the hooks */
+	nf_ct_helper_expectfn_unregister(&q931_nat);
+	nf_ct_helper_expectfn_unregister(&callforwarding_nat);
+	synchronize_rcu();	/* wait for RCU readers before returning */
+}
+
+/****************************************************************************/
+module_init(nf_nat_h323_init);
+module_exit(nf_nat_h323_fini);
+
+MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
+MODULE_DESCRIPTION("H.323 NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_NAT_HELPER("h323");
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
new file mode 100644
index 0000000000..fab357cc85
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * nf_nat_pptp.c
+ *
+ * NAT support for PPTP (Point to Point Tunneling Protocol).
+ * PPTP is a protocol for creating virtual private networks.
+ * It is a specification defined by Microsoft and some vendors
+ * working with Microsoft.  PPTP is built on top of a modified
+ * version of the Internet Generic Routing Encapsulation Protocol.
+ * GRE is defined in RFC 1701 and RFC 1702.  Documentation of
+ * PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * TODO: - NAT to a unique tuple, not to TCP source port
+ * 	   (needs netfilter tuple reservation)
+ */
+
+#include <linux/module.h>
+#include <linux/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_pptp.h>
+
+#define NF_NAT_PPTP_VERSION "3.0"
+
+#define REQ_CID(req, off)		(*(__be16 *)((char *)(req) + (off)))
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
+MODULE_ALIAS_NF_NAT_HELPER("pptp");
+
+static void pptp_nat_expected(struct nf_conn *ct,
+			      struct nf_conntrack_expect *exp)
+{
+	struct net *net = nf_ct_net(ct);
+	const struct nf_conn *master = ct->master;
+	struct nf_conntrack_expect *other_exp;
+	struct nf_conntrack_tuple t = {};	/* tuple of the other direction */
+	const struct nf_ct_pptp_master *ct_pptp_info;
+	const struct nf_nat_pptp *nat_pptp_info;
+	struct nf_nat_range2 range;	/* reused for the SRC and DST setup */
+	struct nf_conn_nat *nat;
+
+	nat = nf_ct_nat_ext_add(ct);
+	if (WARN_ON_ONCE(!nat))
+		return;
+
+	nat_pptp_info = &nat->help.nat_pptp_info;
+	ct_pptp_info = nfct_help_data(master);
+
+	/* Build the GRE tuple of the expectation for the other direction. */
+	if (exp->dir == IP_CT_DIR_ORIGINAL) {
+		pr_debug("we are PNS->PAC\n");
+		/* therefore, build tuple for PAC->PNS */
+		t.src.l3num = AF_INET;
+		t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip;
+		t.src.u.gre.key = ct_pptp_info->pac_call_id;
+		t.dst.u3.ip = master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+		t.dst.u.gre.key = ct_pptp_info->pns_call_id;
+		t.dst.protonum = IPPROTO_GRE;
+	} else {
+		pr_debug("we are PAC->PNS\n");
+		/* build tuple for PNS->PAC */
+		t.src.l3num = AF_INET;
+		t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip;
+		t.src.u.gre.key = nat_pptp_info->pns_call_id;
+		t.dst.u3.ip = master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+		t.dst.u.gre.key = nat_pptp_info->pac_call_id;
+		t.dst.protonum = IPPROTO_GRE;
+	}
+
+	pr_debug("trying to unexpect other dir: ");
+	nf_ct_dump_tuple_ip(&t);
+	other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t);
+	if (other_exp) {
+		nf_ct_unexpect_related(other_exp);
+		nf_ct_expect_put(other_exp);	/* drop the lookup reference */
+		pr_debug("success\n");
+	} else {
+		pr_debug("not found!\n");
+	}
+
+	/* No NAT setup may have been done on this conntrack yet. */
+	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+	/* Change src to where master sends to */
+	range.flags = NF_NAT_RANGE_MAP_IPS;
+	range.min_addr = range.max_addr
+		= ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+	if (exp->dir == IP_CT_DIR_ORIGINAL) {	/* SRC also gets saved_proto */
+		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+		range.min_proto = range.max_proto = exp->saved_proto;
+	}
+	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+
+	/* For DST manip, map port here to where it's expected. */
+	range.flags = NF_NAT_RANGE_MAP_IPS;
+	range.min_addr = range.max_addr
+		= ct->master->tuplehash[!exp->dir].tuple.src.u3;
+	if (exp->dir == IP_CT_DIR_REPLY) {	/* DST also gets saved_proto */
+		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+		range.min_proto = range.max_proto = exp->saved_proto;
+	}
+	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+
+/* outbound packets == from PNS to PAC */
+static int
+pptp_outbound_pkt(struct sk_buff *skb,
+		  struct nf_conn *ct,
+		  enum ip_conntrack_info ctinfo,
+		  unsigned int protoff,
+		  struct PptpControlHeader *ctlh,
+		  union pptp_ctrl_union *pptpReq)
+
+{
+	struct nf_ct_pptp_master *ct_pptp_info;
+	struct nf_conn_nat *nat = nfct_nat(ct);
+	struct nf_nat_pptp *nat_pptp_info;
+	u_int16_t msg;	/* message type, host byte order */
+	__be16 new_callid;	/* value written into the packet */
+	unsigned int cid_off;	/* offset of the call ID inside *pptpReq */
+
+	if (WARN_ON_ONCE(!nat))
+		return NF_DROP;
+
+	nat_pptp_info = &nat->help.nat_pptp_info;
+	ct_pptp_info = nfct_help_data(ct);
+
+	new_callid = ct_pptp_info->pns_call_id;	/* default for all types */
+
+	switch (msg = ntohs(ctlh->messageType)) {
+	case PPTP_OUT_CALL_REQUEST:
+		cid_off = offsetof(union pptp_ctrl_union, ocreq.callID);
+		/* FIXME: ideally we would want to reserve a call ID
+		 * here.  current netfilter NAT core is not able to do
+		 * this :( For now we use TCP source port. This breaks
+		 * multiple calls within one control session */
+
+		/* save original call ID in nat_info */
+		nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id;
+
+		/* don't use tcph->source since we are at a DSTmanip
+		 * hook (e.g. PREROUTING) and pkt is not mangled yet */
+		new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
+
+		/* save new call ID in ct info */
+		ct_pptp_info->pns_call_id = new_callid;
+		break;
+	case PPTP_IN_CALL_REPLY:
+		cid_off = offsetof(union pptp_ctrl_union, icack.callID);
+		break;
+	case PPTP_CALL_CLEAR_REQUEST:
+		cid_off = offsetof(union pptp_ctrl_union, clrreq.callID);
+		break;
+	default:
+		pr_debug("unknown outbound packet 0x%04x:%s\n", msg,
+			 pptp_msg_name(msg));
+		fallthrough;
+	case PPTP_SET_LINK_INFO:
+		/* only need to NAT in case PAC is behind NAT box */
+	case PPTP_START_SESSION_REQUEST:
+	case PPTP_START_SESSION_REPLY:
+	case PPTP_STOP_SESSION_REQUEST:
+	case PPTP_STOP_SESSION_REPLY:
+	case PPTP_ECHO_REQUEST:
+	case PPTP_ECHO_REPLY:
+		/* no need to alter packet */
+		return NF_ACCEPT;
+	}
+
+	/* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass
+	 * down to here */
+	pr_debug("altering call id from 0x%04x to 0x%04x\n",
+		 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
+
+	/* overwrite the 2-byte call ID that follows both PPTP headers */
+	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
+				      cid_off + sizeof(struct pptp_pkt_hdr) +
+				      sizeof(struct PptpControlHeader),
+				      sizeof(new_callid), (char *)&new_callid,
+				      sizeof(new_callid)))
+		return NF_DROP;
+	return NF_ACCEPT;
+}
+
+static void
+pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
+	     struct nf_conntrack_expect *expect_reply)
+{
+	const struct nf_conn *ct = expect_orig->master;
+	struct nf_conn_nat *nat = nfct_nat(ct);
+	struct nf_ct_pptp_master *ct_pptp_info;
+	struct nf_nat_pptp *nat_pptp_info;
+
+	if (WARN_ON_ONCE(!nat))	/* both expectations are left untouched */
+		return;
+
+	nat_pptp_info = &nat->help.nat_pptp_info;
+	ct_pptp_info = nfct_help_data(ct);
+
+	/* save original PAC call ID in nat_info */
+	nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
+
+	/* alter expectation for PNS->PAC direction */
+	expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id;
+	expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id;
+	expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id;
+	expect_orig->dir = IP_CT_DIR_ORIGINAL;
+
+	/* alter expectation for PAC->PNS direction */
+	expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id;
+	expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id;
+	expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id;
+	expect_reply->dir = IP_CT_DIR_REPLY;
+}
+
+/* inbound packets == from PAC to PNS */
+static int
+pptp_inbound_pkt(struct sk_buff *skb,
+		 struct nf_conn *ct,
+		 enum ip_conntrack_info ctinfo,
+		 unsigned int protoff,
+		 struct PptpControlHeader *ctlh,
+		 union pptp_ctrl_union *pptpReq)
+{
+	const struct nf_nat_pptp *nat_pptp_info;
+	struct nf_conn_nat *nat = nfct_nat(ct);
+	u_int16_t msg;	/* message type, host byte order */
+	__be16 new_pcid;	/* value written into the packet */
+	unsigned int pcid_off;	/* offset of the call ID inside *pptpReq */
+
+	if (WARN_ON_ONCE(!nat))
+		return NF_DROP;
+
+	nat_pptp_info = &nat->help.nat_pptp_info;
+	new_pcid = nat_pptp_info->pns_call_id;	/* same for every type */
+
+	switch (msg = ntohs(ctlh->messageType)) {
+	case PPTP_OUT_CALL_REPLY:
+		pcid_off = offsetof(union pptp_ctrl_union, ocack.peersCallID);
+		break;
+	case PPTP_IN_CALL_CONNECT:
+		pcid_off = offsetof(union pptp_ctrl_union, iccon.peersCallID);
+		break;
+	case PPTP_IN_CALL_REQUEST:
+		/* only need to nat in case PAC is behind NAT box */
+		return NF_ACCEPT;
+	case PPTP_WAN_ERROR_NOTIFY:
+		pcid_off = offsetof(union pptp_ctrl_union, wanerr.peersCallID);
+		break;
+	case PPTP_CALL_DISCONNECT_NOTIFY:
+		pcid_off = offsetof(union pptp_ctrl_union, disc.callID);
+		break;
+	case PPTP_SET_LINK_INFO:
+		pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID);
+		break;
+	default:
+		pr_debug("unknown inbound packet %s\n", pptp_msg_name(msg));
+		fallthrough;
+	case PPTP_START_SESSION_REQUEST:
+	case PPTP_START_SESSION_REPLY:
+	case PPTP_STOP_SESSION_REQUEST:
+	case PPTP_STOP_SESSION_REPLY:
+	case PPTP_ECHO_REQUEST:
+	case PPTP_ECHO_REPLY:
+		/* no need to alter packet */
+		return NF_ACCEPT;
+	}
+
+	/* only OUT_CALL_REPLY, IN_CALL_CONNECT, WAN_ERROR_NOTIFY,
+	 * CALL_DISCONNECT_NOTIFY, SET_LINK_INFO pass down here */
+
+	/* overwrite the 2-byte call ID that follows both PPTP headers */
+	pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
+		 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
+
+	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
+				      pcid_off + sizeof(struct pptp_pkt_hdr) +
+				      sizeof(struct PptpControlHeader),
+				      sizeof(new_pcid), (char *)&new_pcid,
+				      sizeof(new_pcid)))
+		return NF_DROP;
+	return NF_ACCEPT;
+}
+
+static const struct nf_nat_pptp_hook pptp_hooks = {
+	.outbound = pptp_outbound_pkt,	/* PNS -> PAC control packets */
+	.inbound = pptp_inbound_pkt,	/* PAC -> PNS control packets */
+	.exp_gre = pptp_exp_gre,	/* adjusts the two GRE expectations */
+	.expectfn = pptp_nat_expected,	/* NAT setup for an expected ct */
+};
+
+static int __init nf_nat_helper_pptp_init(void)
+{
+	WARN_ON(nf_nat_pptp_hook != NULL);	/* warn only; still overwritten */
+	RCU_INIT_POINTER(nf_nat_pptp_hook, &pptp_hooks);
+
+	return 0;
+}
+
+static void __exit nf_nat_helper_pptp_fini(void)
+{
+	RCU_INIT_POINTER(nf_nat_pptp_hook, NULL);	/* withdraw the hooks */
+	synchronize_rcu();	/* wait for RCU readers before returning */
+}
+
+module_init(nf_nat_helper_pptp_init);
+module_exit(nf_nat_helper_pptp_fini);
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1
new file mode 100644
index 0000000000..24b73268f3
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1
@@ -0,0 +1,177 @@
+Message ::=
+	SEQUENCE {
+		version
+			INTEGER ({snmp_version}),
+
+		community
+			OCTET STRING,
+
+		pdu
+			PDUs
+	}
+
+
+ObjectName ::=
+	OBJECT IDENTIFIER
+
+ObjectSyntax ::=
+	CHOICE {
+		simple
+			SimpleSyntax,
+
+		application-wide
+			ApplicationSyntax
+	}
+
+SimpleSyntax ::=
+	CHOICE {
+		integer-value
+			INTEGER,
+
+		string-value
+			OCTET STRING,
+
+		objectID-value
+			OBJECT IDENTIFIER
+	}
+
+ApplicationSyntax ::=
+	CHOICE {
+		ipAddress-value
+			IpAddress,
+
+		counter-value
+			Counter32,
+
+		timeticks-value
+			TimeTicks,
+
+		arbitrary-value
+			Opaque,
+
+		big-counter-value
+			Counter64,
+
+		unsigned-integer-value
+			Unsigned32
+	}
+
+IpAddress ::=
+	[APPLICATION 0]
+		IMPLICIT OCTET STRING OPTIONAL ({snmp_helper})
+
+Counter32 ::=
+	[APPLICATION 1]
+		IMPLICIT INTEGER OPTIONAL
+
+Unsigned32 ::=
+	[APPLICATION 2]
+		IMPLICIT INTEGER OPTIONAL
+
+Gauge32 ::= Unsigned32 OPTIONAL
+
+TimeTicks ::=
+	[APPLICATION 3]
+		IMPLICIT INTEGER OPTIONAL
+
+Opaque ::=
+	[APPLICATION 4]
+		IMPLICIT OCTET STRING OPTIONAL
+
+Counter64 ::=
+	[APPLICATION 6]
+		IMPLICIT INTEGER OPTIONAL
+
+PDUs ::=
+	CHOICE {
+		get-request
+			GetRequest-PDU,
+
+		get-next-request
+			GetNextRequest-PDU,
+
+		get-bulk-request
+			GetBulkRequest-PDU,
+
+		response
+			Response-PDU,
+
+		set-request
+			SetRequest-PDU,
+
+		inform-request
+			InformRequest-PDU,
+
+		snmpV2-trap
+			SNMPv2-Trap-PDU,
+
+		report
+			Report-PDU
+	}
+
+GetRequest-PDU ::=
+	[0] IMPLICIT PDU OPTIONAL
+
+GetNextRequest-PDU ::=
+	[1] IMPLICIT PDU OPTIONAL
+
+Response-PDU ::=
+	[2] IMPLICIT PDU OPTIONAL
+
+SetRequest-PDU ::=
+	[3] IMPLICIT PDU OPTIONAL
+
+-- [4] is obsolete
+
+GetBulkRequest-PDU ::=
+	[5] IMPLICIT PDU OPTIONAL
+
+InformRequest-PDU ::=
+	[6] IMPLICIT PDU OPTIONAL
+
+SNMPv2-Trap-PDU ::=
+	[7] IMPLICIT PDU OPTIONAL
+
+Report-PDU ::=
+	[8] IMPLICIT PDU OPTIONAL
+
+PDU ::=
+	SEQUENCE {
+		request-id
+			INTEGER,
+
+		error-status
+			INTEGER,
+
+		error-index
+			INTEGER,
+
+		variable-bindings
+			VarBindList
+	}
+
+
+VarBind ::=
+	SEQUENCE {
+		name
+			ObjectName,
+
+	CHOICE {
+		value
+			ObjectSyntax,
+
+		unSpecified
+			NULL,
+
+		noSuchObject
+			[0] IMPLICIT NULL,
+
+		noSuchInstance
+			[1] IMPLICIT NULL,
+
+		endOfMibView
+			[2] IMPLICIT NULL
+	}
+}
+
+VarBindList ::= SEQUENCE OF VarBind
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
new file mode 100644
index 0000000000..717b726504
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * nf_nat_snmp_basic.c
+ *
+ * Basic SNMP Application Layer Gateway
+ *
+ * This IP NAT module is intended for use with SNMP network
+ * discovery and monitoring applications where target networks use
+ * conflicting private address realms.
+ *
+ * Static NAT is used to remap the networks from the view of the network
+ * management system at the IP layer, and this module remaps some application
+ * layer addresses to match.
+ *
+ * The simplest form of ALG is performed, where only tagged IP addresses
+ * are modified.  The module does not need to be MIB aware and only scans
+ * messages at the ASN.1/BER level.
+ *
+ * Currently, only SNMPv1 and SNMPv2 are supported.
+ *
+ * More information on ALG and associated issues can be found in
+ * RFC 2962
+ *
+ * The ASN.1/BER parsing code is derived from the gxsnmp package by Gregory
+ * McLean & Jochen Friedrich, stripped down for use in the kernel.
+ *
+ * Copyright (c) 2000 RP Internet (www.rpi.net.au).
+ *
+ * Author: James Morris <jmorris@intercode.com.au>
+ *
+ * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
+#include "nf_nat_snmp_basic.asn1.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
+MODULE_ALIAS("ip_nat_snmp_basic");
+MODULE_ALIAS_NFCT_HELPER("snmp_trap");
+
+#define SNMP_PORT 161
+#define SNMP_TRAP_PORT 162
+
+static DEFINE_SPINLOCK(snmp_lock);
+
+struct snmp_ctx {
+	unsigned char *begin;	/* start of the UDP payload */
+	__sum16 *check;		/* UDP checksum field to fix up */
+	__be32 from;		/* address to look for */
+	__be32 to;		/* address written in its place */
+};
+
+static void fast_csum(struct snmp_ctx *ctx, unsigned char offset)
+{
+	unsigned char s[12] = {0,};	/* inverted "from", then "to" */
+	int size;	/* number of bytes of s[] fed to csum_partial() */
+
+	if (offset & 1) {	/* address sits at an odd payload offset */
+		memcpy(&s[1], &ctx->from, 4);
+		memcpy(&s[7], &ctx->to, 4);
+		s[0] = ~0;	/* s[0..5] = inverse of 0, from[0..3], 0 */
+		s[1] = ~s[1];
+		s[2] = ~s[2];
+		s[3] = ~s[3];
+		s[4] = ~s[4];
+		s[5] = ~0;
+		size = 12;	/* s[6] and s[11] stay 0 around "to" */
+	} else {
+		memcpy(&s[0], &ctx->from, 4);
+		memcpy(&s[4], &ctx->to, 4);
+		s[0] = ~s[0];	/* s[0..3] = ~from, s[4..7] = to */
+		s[1] = ~s[1];
+		s[2] = ~s[2];
+		s[3] = ~s[3];
+		size = 8;
+	}
+	*ctx->check = csum_fold(csum_partial(s, size,	/* seeded with old sum */
+					     ~csum_unfold(*ctx->check)));
+}
+
+int snmp_version(void *context, size_t hdrlen, unsigned char tag,
+		 const void *data, size_t datalen)
+{
+	const unsigned char *version = data;	/* only read if datalen == 1 */
+
+	if (datalen != 1)	/* the version must be a single octet */
+		return -EINVAL;
+	return *version > 1 ? -ENOTSUPP : 1;	/* accept 0 and 1 only */
+}
+
+int snmp_helper(void *context, size_t hdrlen, unsigned char tag,
+		const void *data, size_t datalen)
+{
+	struct snmp_ctx *ctx = (struct snmp_ctx *)context;
+	__be32 *pdata;	/* the address value inside the packet */
+
+	if (datalen != 4)	/* only 4-byte values are accepted */
+		return -EINVAL;
+	pdata = (__be32 *)data;
+	if (*pdata == ctx->from) {	/* other addresses are left alone */
+		pr_debug("%s: %pI4 to %pI4\n", __func__,
+			 (void *)&ctx->from, (void *)&ctx->to);
+
+		if (*ctx->check)	/* a zero checksum field is not updated */
+			fast_csum(ctx, (unsigned char *)data - ctx->begin);
+		*pdata = ctx->to;	/* rewrite in place */
+	}
+
+	return 1;
+}
+
+static int snmp_translate(struct nf_conn *ct, int dir, struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+	struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
+	u16 datalen = ntohs(udph->len) - sizeof(struct udphdr);
+	char *data = (unsigned char *)udph + sizeof(struct udphdr);
+	struct snmp_ctx ctx;	/* handed to the decoder callbacks */
+	int ret;
+
+	if (dir == IP_CT_DIR_ORIGINAL) {	/* from/to depend on dir */
+		ctx.from = ct->tuplehash[dir].tuple.src.u3.ip;
+		ctx.to = ct->tuplehash[!dir].tuple.dst.u3.ip;
+	} else {
+		ctx.from = ct->tuplehash[!dir].tuple.src.u3.ip;
+		ctx.to = ct->tuplehash[dir].tuple.dst.u3.ip;
+	}
+
+	if (ctx.from == ctx.to)
+		return NF_ACCEPT;	/* nothing to rewrite */
+
+	ctx.begin = (unsigned char *)udph + sizeof(struct udphdr);
+	ctx.check = &udph->check;
+	ret = asn1_ber_decoder(&nf_nat_snmp_basic_decoder, &ctx, data, datalen);
+	if (ret < 0) {	/* decoder or one of its callbacks failed */
+		nf_ct_helper_log(skb, ct, "parser failed\n");
+		return NF_DROP;
+	}
+
+	return NF_ACCEPT;
+}
+
+/* We don't actually set up expectations, just adjust internal IP
+ * addresses if this is being NATted.  Returns NF_ACCEPT or NF_DROP.
+ */
+static int help(struct sk_buff *skb, unsigned int protoff,
+		struct nf_conn *ct,
+		enum ip_conntrack_info ctinfo)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	unsigned int ret;
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
+
+	/* SNMP replies and originating SNMP traps get mangled */
+	if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
+		return NF_ACCEPT;
+	if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
+		return NF_ACCEPT;
+
+	/* No NAT? */
+	if (!(ct->status & IPS_NAT_MASK))
+		return NF_ACCEPT;
+
+	/* Make sure the packet length is ok.  So far, we were only guaranteed
+	 * to have a valid length IP header plus 8 bytes, which means we have
+	 * enough room for a UDP header.  Just verify the UDP length field so we
+	 * can mess around with the payload.
+	 */
+	if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
+		nf_ct_helper_log(skb, ct, "dropping malformed packet\n");
+		return NF_DROP;
+	}
+
+	if (skb_ensure_writable(skb, skb->len)) {	/* non-zero: failure */
+		nf_ct_helper_log(skb, ct, "cannot mangle packet");
+		return NF_DROP;
+	}
+
+	spin_lock_bh(&snmp_lock);	/* held for the whole translation */
+	ret = snmp_translate(ct, dir, skb);
+	spin_unlock_bh(&snmp_lock);
+	return ret;
+}
+
+static const struct nf_conntrack_expect_policy snmp_exp_policy = {
+	.max_expected	= 0,	/* help() sets up no expectations */
+	.timeout	= 180,	/* presumably seconds - TODO confirm */
+};
+
+static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
+	.me			= THIS_MODULE,
+	.help			= help,	/* same function as nf_nat_snmp_hook */
+	.expect_policy		= &snmp_exp_policy,
+	.name			= "snmp_trap",
+	.tuple.src.l3num	= AF_INET,	/* IPv4 only */
+	.tuple.src.u.udp.port	= cpu_to_be16(SNMP_TRAP_PORT),
+	.tuple.dst.protonum	= IPPROTO_UDP,
+};
+
+static int __init nf_nat_snmp_basic_init(void)
+{
+	BUG_ON(nf_nat_snmp_hook != NULL);	/* hook must be unclaimed */
+	RCU_INIT_POINTER(nf_nat_snmp_hook, help);
+	/* NOTE(review): the hook is not reset if the registration fails */
+	return nf_conntrack_helper_register(&snmp_trap_helper);
+}
+
+static void __exit nf_nat_snmp_basic_fini(void)
+{
+	RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);	/* withdraw the hook */
+	synchronize_rcu();	/* wait for RCU readers of the hook */
+	nf_conntrack_helper_unregister(&snmp_trap_helper);
+}
+
+module_init(nf_nat_snmp_basic_init);
+module_exit(nf_nat_snmp_basic_fini);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
new file mode 100644
index 0000000000..fc761915c5
--- /dev/null
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -0,0 +1,341 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_bridge.h>
+
+static int nf_reject_iphdr_validate(struct sk_buff *skb)
+{
+	unsigned int hdr_bytes;
+	struct iphdr *iph;
+	u32 tot_len;
+
+	/* The fixed-size header must be pullable before it can be read. */
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		return 0;
+
+	iph = ip_hdr(skb);
+	if (iph->version != 4 || iph->ihl < 5)
+		return 0;
+
+	/* tot_len has to fit inside the skb and cover the header itself. */
+	hdr_bytes = iph->ihl * 4;
+	tot_len = ntohs(iph->tot_len);
+	if (tot_len > skb->len || tot_len < hdr_bytes)
+		return 0;
+
+	/* Returns 1 once the full header, options included, can be pulled. */
+	return pskb_may_pull(skb, hdr_bytes) ? 1 : 0;
+}
+
+struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net,
+					   struct sk_buff *oldskb,
+					   const struct net_device *dev,
+					   int hook)
+{
+	struct tcphdr tcph_copy;
+	const struct tcphdr *otcph;
+	struct sk_buff *rst;
+	struct iphdr *rst_iph;
+	int ttl;
+
+	/* Only answer packets with a valid IPv4 header and a usable TCP header. */
+	if (!nf_reject_iphdr_validate(oldskb))
+		return NULL;
+	otcph = nf_reject_ip_tcphdr_get(oldskb, &tcph_copy, hook);
+	if (!otcph)
+		return NULL;
+
+	rst = alloc_skb(LL_MAX_HEADER + sizeof(struct iphdr) +
+			sizeof(struct tcphdr), GFP_ATOMIC);
+	if (!rst)
+		return NULL;
+
+	rst->dev = (struct net_device *)dev;
+	skb_reserve(rst, LL_MAX_HEADER);
+	ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
+	rst_iph = nf_reject_iphdr_put(rst, oldskb, IPPROTO_TCP, ttl);
+	nf_reject_ip_tcphdr_put(rst, oldskb, otcph);
+	/* Length and header checksum are filled in once both headers exist. */
+	rst_iph->tot_len = htons(rst->len);
+	ip_send_check(rst_iph);
+	return rst;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset);
+
+struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
+					 struct sk_buff *oldskb,
+					 const struct net_device *dev,
+					 int hook, u8 code)
+{
+	struct sk_buff *nskb;
+	struct iphdr *niph;
+	struct icmphdr *icmph;
+	unsigned int len;
+	int dataoff;
+	__wsum csum;
+	u8 proto;
+	/* Build (but do not send) an ICMP dest-unreachable reply; NULL if none. */
+	if (!nf_reject_iphdr_validate(oldskb))
+		return NULL;
+
+	/* Non-first fragments (non-zero fragment offset) get no reply. */
+	if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
+		return NULL;
+
+	/* RFC says return as much as we can without exceeding 576 bytes. */
+	len = min_t(unsigned int, 536, oldskb->len);
+	/* The quoted bytes are copied from the header pointer below: pull them. */
+	if (!pskb_may_pull(oldskb, len))
+		return NULL;
+	/* NOTE(review): len was computed before this trim to tot_len -- confirm. */
+	if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len)))
+		return NULL;
+
+	dataoff = ip_hdrlen(oldskb);
+	proto = ip_hdr(oldskb)->protocol;
+	/* Build no reply if the checksum needs verifying and fails to verify. */
+	if (!skb_csum_unnecessary(oldskb) &&
+	    nf_reject_verify_csum(oldskb, dataoff, proto) &&
+	    nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto))
+		return NULL;
+
+	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) +
+			 LL_MAX_HEADER + len, GFP_ATOMIC);
+	if (!nskb)
+		return NULL;
+
+	nskb->dev = (struct net_device *)dev;
+
+	skb_reserve(nskb, LL_MAX_HEADER);
+	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
+				   READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
+	/* Zeroed ICMP header, then type dest-unreachable and the caller's code. */
+	skb_reset_transport_header(nskb);
+	icmph = skb_put_zero(nskb, sizeof(struct icmphdr));
+	icmph->type     = ICMP_DEST_UNREACH;
+	icmph->code	= code;
+	/* Quote the first len bytes of the original packet, IP header first. */
+	skb_put_data(nskb, skb_network_header(oldskb), len);
+	/* The ICMP checksum covers the ICMP header and the quoted bytes. */
+	csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0);
+	icmph->checksum = csum_fold(csum);
+
+	niph->tot_len	= htons(nskb->len);
+	ip_send_check(niph);
+
+	return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach);
+
+const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+					     struct tcphdr *_oth, int hook)
+{
+	const struct iphdr *oiph = ip_hdr(oldskb);
+	unsigned int thoff = ip_hdrlen(oldskb);
+	const struct tcphdr *tcph;
+
+	/* Only TCP packets with a zero fragment offset are considered. */
+	if (oiph->frag_off & htons(IP_OFFSET))
+		return NULL;
+	if (oiph->protocol != IPPROTO_TCP)
+		return NULL;
+
+	/* _oth is the caller's scratch buffer for the header lookup. */
+	tcph = skb_header_pointer(oldskb, thoff, sizeof(*tcph), _oth);
+	if (!tcph)
+		return NULL;
+
+	/* Never answer a RST with another RST. */
+	if (tcph->rst)
+		return NULL;
+
+	/* A non-zero checksum result means: do not reply. */
+	if (nf_ip_checksum(oldskb, hook, thoff, IPPROTO_TCP) != 0)
+		return NULL;
+	return tcph;
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
+
+struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+				  const struct sk_buff *oldskb,
+				  __u8 protocol, int ttl)
+{
+	const struct iphdr *orig_iph = ip_hdr(oldskb);
+	struct iphdr *hdr;
+
+	skb_reset_network_header(nskb);
+	hdr = skb_put(nskb, sizeof(*hdr));
+	nskb->protocol = htons(ETH_P_IP);
+	/* Option-less IPv4 header with DF set; tot_len is not written here. */
+	hdr->version	= 4;
+	hdr->ihl	= sizeof(*hdr) / 4;
+	hdr->tos	= 0;
+	hdr->id		= 0;
+	hdr->frag_off	= htons(IP_DF);
+	hdr->ttl	= ttl;
+	hdr->protocol	= protocol;
+	hdr->check	= 0;
+	hdr->saddr	= orig_iph->daddr;	/* reply source: original destination */
+	hdr->daddr	= orig_iph->saddr;	/* reply destination: original source */
+	return hdr;
+}
+EXPORT_SYMBOL_GPL(nf_reject_iphdr_put);
+
+void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+			  const struct tcphdr *oth)
+{
+	struct iphdr *niph = ip_hdr(nskb);
+	struct tcphdr *th;
+	u32 acked;
+
+	skb_reset_transport_header(nskb);
+	th = skb_put_zero(nskb, sizeof(*th));
+	th->rst		= 1;
+	th->doff	= sizeof(*th) / 4;
+	th->source	= oth->dest;
+	th->dest	= oth->source;
+
+	if (!oth->ack) {
+		/* Acknowledge all sequence space used: payload plus SYN/FIN. */
+		acked = oth->syn + oth->fin + oldskb->len -
+			ip_hdrlen(oldskb) - (oth->doff << 2);
+		th->ack_seq = htonl(ntohl(oth->seq) + acked);
+		th->ack = 1;
+	} else {
+		th->seq = oth->ack_seq;
+	}
+	th->check = ~tcp_v4_check(sizeof(*th), niph->saddr, niph->daddr, 0);
+	nskb->ip_summed = CHECKSUM_PARTIAL;
+	nskb->csum_start = (unsigned char *)th - nskb->head;
+	nskb->csum_offset = offsetof(struct tcphdr, check);
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
+
+static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
+{
+	struct dst_entry *route = NULL;
+	struct flowi key;
+
+	memset(&key, 0, sizeof(key));
+	key.u.ip4.daddr = ip_hdr(skb_in)->saddr;	/* route towards the sender */
+	nf_ip_route(dev_net(skb_in->dev), &route, &key, false);
+	if (route) {
+		skb_dst_set(skb_in, route);
+		return 0;
+	}
+	return -1;	/* the lookup left route unset */
+}
+
+/* Send a TCP RST in reply to oldskb; returns silently when no reply is due. */
+void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
+		   int hook)
+{
+	struct sk_buff *nskb;
+	struct iphdr *niph;
+	const struct tcphdr *oth;
+	struct tcphdr _oth;
+
+	oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
+	if (!oth)
+		return;
+	/* At these hooks oldskb first gets a dst from a route lookup. */
+	if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) &&
+	    nf_reject_fill_skb_dst(oldskb) < 0)
+		return;
+	/* No reset for packets whose route is flagged broadcast or multicast. */
+	if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		return;
+
+	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+			 LL_MAX_HEADER, GFP_ATOMIC);
+	if (!nskb)
+		return;
+
+	/* ip_route_me_harder expects skb->dst to be set */
+	skb_dst_set_noref(nskb, skb_dst(oldskb));
+
+	nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);
+
+	skb_reserve(nskb, LL_MAX_HEADER);
+	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+				   ip4_dst_hoplimit(skb_dst(nskb)));
+	nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
+	if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC))
+		goto free_nskb;
+	/* The header pointer is re-read after the reroute above. */
+	niph = ip_hdr(nskb);
+
+	/* "Never happens" */
+	if (nskb->len > dst_mtu(skb_dst(nskb)))
+		goto free_nskb;
+
+	nf_ct_attach(nskb, oldskb);
+	nf_ct_set_closing(skb_nfct(oldskb));
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+	/* If we use ip_local_out for bridged traffic, the MAC source on
+	 * the RST will be ours, instead of the destination's.  This confuses
+	 * some routers/firewalls, and they drop the packet.  So we need to
+	 * build the eth header using the original destination's MAC as the
+	 * source, and send the RST packet directly.
+	 */
+	if (nf_bridge_info_exists(oldskb)) {
+		struct ethhdr *oeth = eth_hdr(oldskb);
+		struct net_device *br_indev;
+
+		br_indev = nf_bridge_get_physindev(oldskb, net);
+		if (!br_indev)
+			goto free_nskb;
+
+		nskb->dev = br_indev;
+		niph->tot_len = htons(nskb->len);
+		ip_send_check(niph);
+		if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
+				    oeth->h_source, oeth->h_dest, nskb->len) < 0)
+			goto free_nskb;
+		dev_queue_xmit(nskb);
+	} else
+#endif
+		ip_local_out(net, nskb->sk, nskb);
+
+	return;
+
+ free_nskb:
+	kfree_skb(nskb);
+}
+EXPORT_SYMBOL_GPL(nf_send_reset);
+
+void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
+{
+	const struct iphdr *iph = ip_hdr(skb_in);
+	int dataoff = ip_hdrlen(skb_in);
+	u8 proto = iph->protocol;
+	bool early_hook;
+
+	/* Packets with a non-zero fragment offset are never answered. */
+	if (iph->frag_off & htons(IP_OFFSET))
+		return;
+	/* At these early hooks a route back to the sender is attached first. */
+	early_hook = hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS;
+	if (early_hook && nf_reject_fill_skb_dst(skb_in) < 0)
+		return;
+
+	/* Stay silent only if the checksum must be verified and fails. */
+	if (!skb_csum_unnecessary(skb_in) &&
+	    nf_reject_verify_csum(skb_in, dataoff, proto) &&
+	    nf_ip_checksum(skb_in, hook, dataoff, proto) != 0)
+		return;
+	icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+}
+EXPORT_SYMBOL_GPL(nf_send_unreach);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
new file mode 100644
index 0000000000..a1350fc258
--- /dev/null
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/netfilter/nf_socket.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static int
+extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol,
+		     __be32 *raddr, __be32 *laddr,
+		     __be16 *rport, __be16 *lport)
+{
+	unsigned int off = ip_hdrlen(skb);
+	struct icmphdr icmp_buf, *icmph;
+	struct iphdr ip_buf, *quoted;
+	__be16 port_buf[2], *ports;
+
+	/* Only ICMP error messages are of interest here. */
+	icmph = skb_header_pointer(skb, off, sizeof(icmp_buf), &icmp_buf);
+	if (!icmph || !icmp_is_err(icmph->type))
+		return 1;
+
+	/* The quoted IPv4 header sits right behind the ICMP header. */
+	off += sizeof(struct icmphdr);
+	quoted = skb_header_pointer(skb, off, sizeof(ip_buf), &ip_buf);
+	if (!quoted)
+		return 1;
+
+	/* Ports are only extracted for quoted TCP and UDP packets. */
+	switch (quoted->protocol) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+		break;
+	default:
+		return 1;
+	}
+
+	/* Two 16-bit ports start right behind the quoted IPv4 header. */
+	off += quoted->ihl << 2;
+	ports = skb_header_pointer(skb, off, sizeof(port_buf), &port_buf);
+	if (!ports)
+		return 1;
+
+	/* The quoted packet was sent from our side, so its source is local. */
+	*protocol = quoted->protocol;
+	*laddr = quoted->saddr;
+	*lport = ports[0];
+	*raddr = quoted->daddr;
+	*rport = ports[1];
+
+	return 0;
+}
+
+static struct sock *
+nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
+		      const u8 protocol,
+		      const __be32 saddr, const __be32 daddr,
+		      const __be16 sport, const __be16 dport,
+		      const struct net_device *in)
+{
+	/* TCP goes through the netns TCP hashinfo; UDP has its own lookup. */
+	if (protocol == IPPROTO_TCP)
+		return inet_lookup(net, net->ipv4.tcp_death_row.hashinfo,
+				   skb, doff, saddr, sport, daddr, dport,
+				   in->ifindex);
+	if (protocol == IPPROTO_UDP)
+		return udp4_lib_lookup(net, saddr, sport, daddr, dport,
+				       in->ifindex);
+	/* No lookup is done here for any other protocol. */
+	return NULL;
+}
+
+struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
+				  const struct net_device *indev)
+{
+	__be32 daddr, saddr;
+	__be16 dport, sport;
+	const struct iphdr *iph = ip_hdr(skb);
+	struct sk_buff *data_skb = NULL;
+	u8 protocol;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn const *ct;
+#endif
+	int doff = 0;
+	/* TCP/UDP: addresses and ports come straight from the packet headers. */
+	if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
+		struct tcphdr _hdr;
+		struct udphdr *hp;	/* source/dest are read through this for TCP too */
+
+		hp = skb_header_pointer(skb, ip_hdrlen(skb),
+					iph->protocol == IPPROTO_UDP ?
+					sizeof(*hp) : sizeof(_hdr), &_hdr);
+		if (hp == NULL)
+			return NULL;
+
+		protocol = iph->protocol;
+		saddr = iph->saddr;
+		sport = hp->source;
+		daddr = iph->daddr;
+		dport = hp->dest;
+		data_skb = (struct sk_buff *)skb;
+		doff = iph->protocol == IPPROTO_TCP ?
+			ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) :
+			ip_hdrlen(skb) + sizeof(*hp);
+
+	} else if (iph->protocol == IPPROTO_ICMP) {	/* tuple comes from the quoted packet */
+		if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
+					 &sport, &dport))	/* s* = remote side, d* = local side */
+			return NULL;
+	} else {
+		return NULL;
+	}
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	/* Do the lookup with the original socket address in
+	 * case this is a reply packet of an established
+	 * SNAT-ted connection.
+	 */
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct &&
+	    ((iph->protocol != IPPROTO_ICMP &&
+	      ctinfo == IP_CT_ESTABLISHED_REPLY) ||
+	     (iph->protocol == IPPROTO_ICMP &&
+	      ctinfo == IP_CT_RELATED_REPLY)) &&
+	    (ct->status & IPS_SRC_NAT_DONE)) {
+
+		daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
+		dport = (iph->protocol == IPPROTO_TCP) ?
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+	}
+#endif
+	/* data_skb and doff stay NULL/0 on the ICMP path. */
+	return nf_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
+				     daddr, sport, dport, indev);
+}
+EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v4);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
+MODULE_DESCRIPTION("Netfilter IPv4 socket lookup infrastructure");
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
new file mode 100644
index 0000000000..69e3317996
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ */
+
+#include <net/netfilter/nf_tproxy.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <linux/inetdevice.h>
+
+struct sock *
+nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
+			 __be32 laddr, __be16 lport, struct sock *sk)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct tcphdr buf, *th;
+	struct sock *listener;
+
+	th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(buf), &buf);
+	if (!th) {
+		/* No TCP header to inspect: put the TIME_WAIT socket. */
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	/* Anything but a plain SYN keeps using the TIME_WAIT socket. */
+	if (!th->syn || th->rst || th->ack || th->fin)
+		return sk;
+
+	/* SYN to a TIME_WAIT socket: prefer a listener socket if there is one. */
+	listener = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr,
+					 laddr ? laddr : iph->daddr, th->source,
+					 lport ? lport : th->dest, skb->dev,
+					 NF_TPROXY_LOOKUP_LISTENER);
+	if (!listener)
+		return sk;
+
+	nf_tproxy_twsk_deschedule_put(inet_twsk(sk));
+	return listener;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4);
+
+__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+	const struct in_ifaddr *ifa;
+	struct in_device *indev;
+	__be32 laddr = 0;
+
+	if (user_laddr)
+		return user_laddr;
+	/* A NULL in_device (no IPv4 state on skb->dev) must not be walked. */
+	indev = __in_dev_get_rcu(skb->dev);
+	if (!indev)
+		return daddr;
+	/* Otherwise take the first address not flagged IFA_F_SECONDARY. */
+	in_dev_for_each_ifa_rcu(ifa, indev) {
+		if (ifa->ifa_flags & IFA_F_SECONDARY)
+			continue;
+		laddr = ifa->ifa_local;
+		break;
+	}
+
+	return laddr ? laddr : daddr;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_laddr4);
+
+struct sock *
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
+		      const u8 protocol,
+		      const __be32 saddr, const __be32 daddr,
+		      const __be16 sport, const __be16 dport,
+		      const struct net_device *in,
+		      const enum nf_tproxy_lookup_t lookup_type)
+{
+	struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
+	struct sock *sk;
+	/* TCP picks the lookup by lookup_type; UDP filters one lookup's result. */
+	switch (protocol) {
+	case IPPROTO_TCP: {
+		struct tcphdr _hdr, *hp;
+
+		hp = skb_header_pointer(skb, ip_hdrlen(skb),
+					sizeof(struct tcphdr), &_hdr);
+		if (hp == NULL)
+			return NULL;
+
+		switch (lookup_type) {
+		case NF_TPROXY_LOOKUP_LISTENER:
+			sk = inet_lookup_listener(net, hinfo, skb,
+						  ip_hdrlen(skb) + __tcp_hdrlen(hp),
+						  saddr, sport, daddr, dport,
+						  in->ifindex, 0);
+			/* Keep the listener only if a reference could be taken. */
+			if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
+				sk = NULL;
+			/* NOTE: we return listeners even if bound to
+			 * 0.0.0.0, those are filtered out in
+			 * xt_socket, since xt_TPROXY needs 0 bound
+			 * listeners too
+			 */
+			break;
+		case NF_TPROXY_LOOKUP_ESTABLISHED:
+			sk = inet_lookup_established(net, hinfo, saddr, sport,
+						     daddr, dport, in->ifindex);
+			break;
+		default:
+			BUG();
+		}
+		break;
+		}
+	case IPPROTO_UDP:
+		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
+				     in->ifindex);
+		if (sk) {
+			int connected = (sk->sk_state == TCP_ESTABLISHED);
+			int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
+			/* Put sockets whose connected/wildcard state mismatches lookup_type. */
+			/* NOTE: we return listeners even if bound to
+			 * 0.0.0.0, those are filtered out in
+			 * xt_socket, since xt_TPROXY needs 0 bound
+			 * listeners too
+			 */
+			if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED &&
+			      (!connected || wildcard)) ||
+			    (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
+				sock_put(sk);
+				sk = NULL;
+			}
+		}
+		break;
+	default:
+		WARN_ON(1);
+		sk = NULL;
+	}
+
+	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
+		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
+
+	return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
+MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support");
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
new file mode 100644
index 0000000000..a522c3a3be
--- /dev/null
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/ipv4/nf_dup_ipv4.h>
+
+struct nft_dup_ipv4 {
+	u8	sreg_addr;	/* register index read as the gateway address in eval */
+	u8	sreg_dev;	/* register index read as the oif in eval; 0 means unset */
+};
+
+static void nft_dup_ipv4_eval(const struct nft_expr *expr,
+			      struct nft_regs *regs,
+			      const struct nft_pktinfo *pkt)
+{
+	const struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+	struct in_addr gw = { .s_addr = (__force __be32)regs->data[priv->sreg_addr] };
+	int oif = -1;	/* stays -1 when no device register is configured */
+
+	if (priv->sreg_dev)
+		oif = regs->data[priv->sreg_dev];
+	nf_dup_ipv4(nft_net(pkt), pkt->skb, nft_hook(pkt), &gw, oif);
+}
+
+static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr,
+			     const struct nlattr * const tb[])
+{
+	struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+	int err;
+
+	/* The address register attribute is mandatory. */
+	if (!tb[NFTA_DUP_SREG_ADDR])
+		return -EINVAL;
+	err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr,
+				      sizeof(struct in_addr));
+	if (err < 0)
+		return err;
+
+	/* The device register attribute is optional. */
+	if (!tb[NFTA_DUP_SREG_DEV])
+		return err;
+	return nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev,
+				       sizeof(int));
+}
+
+static int nft_dup_ipv4_dump(struct sk_buff *skb,
+			     const struct nft_expr *expr, bool reset)
+{
+	const struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
+
+	/* The address register is always dumped. */
+	if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr))
+		return -1;
+
+	/* The device register is dumped only when it is non-zero. */
+	if (!priv->sreg_dev)
+		return 0;
+	if (nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev))
+		return -1;
+	return 0;
+}
+
+static struct nft_expr_type nft_dup_ipv4_type;	/* defined below; ops and type reference each other */
+static const struct nft_expr_ops nft_dup_ipv4_ops = {
+	.type		= &nft_dup_ipv4_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv4)),
+	.init		= nft_dup_ipv4_init,
+	.eval		= nft_dup_ipv4_eval,
+	.dump		= nft_dup_ipv4_dump,
+	.reduce		= NFT_REDUCE_READONLY,
+};
+
+static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = {
+	[NFTA_DUP_SREG_DEV]	= { .type = NLA_U32 },	/* optional in init */
+	[NFTA_DUP_SREG_ADDR]	= { .type = NLA_U32 },	/* mandatory in init */
+};
+
+static struct nft_expr_type nft_dup_ipv4_type __read_mostly = {
+	.name		= "dup",
+	.family		= NFPROTO_IPV4,
+	.owner		= THIS_MODULE,
+	.ops		= &nft_dup_ipv4_ops,
+	.policy		= nft_dup_ipv4_policy,
+	.maxattr	= NFTA_DUP_MAX,
+};
+
+static int __init nft_dup_ipv4_module_init(void)
+{
+	return nft_register_expr(&nft_dup_ipv4_type);	/* result becomes the init status */
+}
+
+static void __exit nft_dup_ipv4_module_exit(void)
+{
+	nft_unregister_expr(&nft_dup_ipv4_type);	/* undoes the registration from init */
+}
+
+module_init(nft_dup_ipv4_module_init);
+module_exit(nft_dup_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup");
+MODULE_DESCRIPTION("IPv4 nftables packet duplication support");
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
new file mode 100644
index 0000000000..9eee535c64
--- /dev/null
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_fib.h>
+
+#include <net/ip_fib.h>
+#include <net/route.h>
+
+/* Source address for the route lookup: 0 for mcast/bcast/zeronet, else addr. */
+static __be32 get_saddr(__be32 addr)
+{
+	bool unusable = ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
+			ipv4_is_zeronet(addr);
+
+	return unusable ? 0 : addr;
+}
+
+#define DSCP_BITS     0xfc	/* upper six bits of a byte; masks iph->tos in nft_fib4_eval() */
+
+void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
+			const struct nft_pktinfo *pkt)
+{
+	const struct nft_fib *priv = nft_expr_priv(expr);
+	int nhoff = skb_network_offset(pkt->skb);
+	const struct net_device *dev = NULL;
+	struct iphdr hdr_buf, *iph;
+	__be32 addr;
+
+	/* dev stays NULL unless one of the interface flags is set. */
+	if (priv->flags & NFTA_FIB_F_IIF)
+		dev = nft_in(pkt);
+	else if (priv->flags & NFTA_FIB_F_OIF)
+		dev = nft_out(pkt);
+
+	iph = skb_header_pointer(pkt->skb, nhoff, sizeof(hdr_buf),
+				 &hdr_buf);
+	if (!iph) {
+		/* The IPv4 header cannot be read: set NFT_BREAK and stop. */
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
+	/* Classify the destination or the source address, as configured. */
+	addr = (priv->flags & NFTA_FIB_F_DADDR) ? iph->daddr : iph->saddr;
+
+	regs->data[priv->dreg] = inet_dev_addr_type(nft_net(pkt), dev, addr);
+}
+EXPORT_SYMBOL_GPL(nft_fib4_eval_type);
+
+void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
+		   const struct nft_pktinfo *pkt)
+{
+	const struct nft_fib *priv = nft_expr_priv(expr);
+	int noff = skb_network_offset(pkt->skb);
+	u32 *dest = &regs->data[priv->dreg];
+	struct iphdr *iph, _iph;
+	struct fib_result res;
+	struct flowi4 fl4 = {
+		.flowi4_scope = RT_SCOPE_UNIVERSE,
+		.flowi4_iif = LOOPBACK_IFINDEX,
+		.flowi4_uid = sock_net_uid(nft_net(pkt), NULL),
+	};
+	const struct net_device *oif;
+	const struct net_device *found;
+	/* FIB lookup for the packet; the result lands in register priv->dreg. */
+	/*
+	 * Do not set flowi4_oif, it restricts results (for example, asking
+	 * for oif 3 will get RTN_UNICAST result even if the daddr exists
+	 * on another interface).
+	 *
+	 * Search results for the desired outinterface instead.
+	 */
+	if (priv->flags & NFTA_FIB_F_OIF)
+		oif = nft_out(pkt);
+	else if (priv->flags & NFTA_FIB_F_IIF)
+		oif = nft_in(pkt);
+	else
+		oif = NULL;
+
+	if (priv->flags & NFTA_FIB_F_IIF)
+		fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(oif);
+	/* Loopback traffic at prerouting is answered with the input device. */
+	if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
+	    nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
+		nft_fib_store_result(dest, priv, nft_in(pkt));
+		return;
+	}
+
+	iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
+	if (!iph) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+	/* Zeronet source to lbcast/local multicast: report the skb's own device. */
+	if (ipv4_is_zeronet(iph->saddr)) {
+		if (ipv4_is_lbcast(iph->daddr) ||
+		    ipv4_is_local_multicast(iph->daddr)) {
+			nft_fib_store_result(dest, priv, pkt->skb->dev);
+			return;
+		}
+	}
+
+	if (priv->flags & NFTA_FIB_F_MARK)
+		fl4.flowi4_mark = pkt->skb->mark;
+
+	fl4.flowi4_tos = iph->tos & DSCP_BITS;
+	/* Look up the route towards daddr, or the reverse path towards saddr. */
+	if (priv->flags & NFTA_FIB_F_DADDR) {
+		fl4.daddr = iph->daddr;
+		fl4.saddr = get_saddr(iph->saddr);
+	} else {
+		if (nft_hook(pkt) == NF_INET_FORWARD &&
+		    priv->flags & NFTA_FIB_F_IIF)
+			fl4.flowi4_iif = nft_out(pkt)->ifindex;
+
+		fl4.daddr = iph->saddr;
+		fl4.saddr = get_saddr(iph->daddr);
+	}
+	/* The result defaults to 0 and stays 0 on every early return below. */
+	*dest = 0;
+
+	if (fib_lookup(nft_net(pkt), &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
+		return;
+
+	switch (res.type) {
+	case RTN_UNICAST:
+		break;
+	case RTN_LOCAL: /* Should not see RTN_LOCAL here */
+		return;
+	default:
+		break;
+	}
+	/* With a device requested, the route found must use that device. */
+	if (!oif) {
+		found = FIB_RES_DEV(res);
+	} else {
+		if (!fib_info_nh_uses_dev(res.fi, oif))
+			return;
+		found = oif;
+	}
+
+	nft_fib_store_result(dest, priv, found);
+}
+EXPORT_SYMBOL_GPL(nft_fib4_eval);
+
+static struct nft_expr_type nft_fib4_type;	/* defined below; both ops tables point at it */
+
+static const struct nft_expr_ops nft_fib4_type_ops = {
+	.type		= &nft_fib4_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+	.init		= nft_fib_init,
+	.validate	= nft_fib_validate,
+	.eval		= nft_fib4_eval_type,	/* chosen for NFT_FIB_RESULT_ADDRTYPE */
+	.dump		= nft_fib_dump,
+	.reduce		= nft_fib_reduce,
+};
+
+static const struct nft_expr_ops nft_fib4_ops = {
+	.type		= &nft_fib4_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_fib)),
+	.init		= nft_fib_init,
+	.validate	= nft_fib_validate,
+	.eval		= nft_fib4_eval,	/* chosen for NFT_FIB_RESULT_OIF and _OIFNAME */
+	.dump		= nft_fib_dump,
+	.reduce		= nft_fib_reduce,
+};
+
+static const struct nft_expr_ops *
+nft_fib4_select_ops(const struct nft_ctx *ctx,
+		    const struct nlattr * const tb[])
+{
+	const struct nlattr *attr = tb[NFTA_FIB_RESULT];
+	enum nft_fib_result wanted;
+
+	if (!attr)
+		return ERR_PTR(-EINVAL);
+
+	wanted = ntohl(nla_get_be32(attr));
+
+	/* Address-type results have their own eval; interface results share one. */
+	if (wanted == NFT_FIB_RESULT_ADDRTYPE)
+		return &nft_fib4_type_ops;
+	if (wanted == NFT_FIB_RESULT_OIF ||
+	    wanted == NFT_FIB_RESULT_OIFNAME)
+		return &nft_fib4_ops;
+
+	/* Every other result type is rejected. */
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static struct nft_expr_type nft_fib4_type __read_mostly = {
+	.name		= "fib",
+	.family		= NFPROTO_IPV4,
+	.owner		= THIS_MODULE,
+	.select_ops	= nft_fib4_select_ops,	/* ops are picked per expression */
+	.policy		= nft_fib_policy,
+	.maxattr	= NFTA_FIB_MAX,
+};
+
+static int __init nft_fib4_module_init(void)
+{
+	return nft_register_expr(&nft_fib4_type);	/* result becomes the init status */
+}
+
+static void __exit nft_fib4_module_exit(void)
+{
+	nft_unregister_expr(&nft_fib4_type);	/* undoes the registration from init */
+}
+
+module_init(nft_fib4_module_init);
+module_exit(nft_fib4_module_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_ALIAS_NFT_AF_EXPR(2, "fib");
+MODULE_DESCRIPTION("nftables fib / ip route lookup support");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
new file mode 100644
index 0000000000..6cb213bb72
--- /dev/null
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Eric Leblond <eric@regit.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/nft_reject.h>
+
+static void nft_reject_ipv4_eval(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt)
+{
+	const struct nft_reject *priv = nft_expr_priv(expr);
+	const int hook = nft_hook(pkt);
+
+	if (priv->type == NFT_REJECT_ICMP_UNREACH) {
+		/* ICMP destination unreachable with the configured code. */
+		nf_send_unreach(pkt->skb, priv->icmp_code, hook);
+	} else if (priv->type == NFT_REJECT_TCP_RST) {
+		/* TCP reset. */
+		nf_send_reset(nft_net(pkt), nft_sk(pkt), pkt->skb, hook);
+	}
+
+	/* The packet itself is always dropped, including for reject types
+	 * that send nothing above.
+	 */
+	regs->verdict.code = NF_DROP;
+}
+
+static struct nft_expr_type nft_reject_ipv4_type;	/* defined below; ops and type reference each other */
+static const struct nft_expr_ops nft_reject_ipv4_ops = {
+	.type		= &nft_reject_ipv4_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+	.init		= nft_reject_init,
+	.validate	= nft_reject_validate,
+	.eval		= nft_reject_ipv4_eval,
+	.dump		= nft_reject_dump,
+	.reduce		= NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
+	.name		= "reject",
+	.family		= NFPROTO_IPV4,
+	.owner		= THIS_MODULE,
+	.ops		= &nft_reject_ipv4_ops,
+	.policy		= nft_reject_policy,
+	.maxattr	= NFTA_REJECT_MAX,
+};
+
+static int __init nft_reject_ipv4_module_init(void)
+{
+	return nft_register_expr(&nft_reject_ipv4_type);	/* result becomes the init status */
+}
+
+static void __exit nft_reject_ipv4_module_exit(void)
+{
+	nft_unregister_expr(&nft_reject_ipv4_type);	/* undoes the registration from init */
+}
+
+module_init(nft_reject_ipv4_module_init);
+module_exit(nft_reject_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject");
+MODULE_DESCRIPTION("IPv4 packet rejection for nftables");
diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c
new file mode 100644
index 0000000000..b920e1bdcf
--- /dev/null
+++ b/net/ipv4/netlink.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <linux/in6.h>
+#include <net/ip.h>
+
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family,
+				struct netlink_ext_ack *extack)
+{
+	const u8 proto = nla_get_u8(attr);
+	bool supported = false;
+
+	/* The parsed value is stored even when it ends up being rejected. */
+	*ip_proto = proto;
+
+	if (proto == IPPROTO_TCP || proto == IPPROTO_UDP)
+		supported = true;
+	else if (proto == IPPROTO_ICMP)
+		supported = family == AF_INET;
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (proto == IPPROTO_ICMPV6)
+		supported = family == AF_INET6;
+#endif
+	if (supported)
+		return 0;
+
+	NL_SET_ERR_MSG(extack, "Unsupported ip proto");
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto);
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
new file mode 100644
index 0000000000..bbff68b5b5
--- /dev/null
+++ b/net/ipv4/nexthop.c
@@ -0,0 +1,3792 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Generic nexthop implementation
+ *
+ * Copyright (c) 2017-19 Cumulus Networks
+ * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
+ */
+
+#include <linux/nexthop.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/arp.h>
+#include <net/ipv6_stubs.h>
+#include <net/lwtunnel.h>
+#include <net/ndisc.h>
+#include <net/nexthop.h>
+#include <net/route.h>
+#include <net/sock.h>
+
+#define NH_RES_DEFAULT_IDLE_TIMER	(120 * HZ)
+#define NH_RES_DEFAULT_UNBALANCED_TIMER	0	/* No forced rebalancing. */
+
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+			   struct nl_info *nlinfo);
+
+#define NH_DEV_HASHBITS  8
+#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
+
+static const struct nla_policy rtm_nh_policy_new[] = {
+	[NHA_ID]		= { .type = NLA_U32 },
+	[NHA_GROUP]		= { .type = NLA_BINARY }, /* len checked by nh_check_attr_group() */
+	[NHA_GROUP_TYPE]	= { .type = NLA_U16 },
+	[NHA_BLACKHOLE]		= { .type = NLA_FLAG },
+	[NHA_OIF]		= { .type = NLA_U32 },
+	[NHA_GATEWAY]		= { .type = NLA_BINARY },
+	[NHA_ENCAP_TYPE]	= { .type = NLA_U16 },
+	[NHA_ENCAP]		= { .type = NLA_NESTED },
+	[NHA_FDB]		= { .type = NLA_FLAG },
+	[NHA_RES_GROUP]		= { .type = NLA_NESTED },
+};
+
+static const struct nla_policy rtm_nh_policy_get[] = {
+	[NHA_ID]		= { .type = NLA_U32 },
+};
+
+static const struct nla_policy rtm_nh_policy_dump[] = {
+	[NHA_OIF]		= { .type = NLA_U32 },
+	[NHA_GROUPS]		= { .type = NLA_FLAG },
+	[NHA_MASTER]		= { .type = NLA_U32 },
+	[NHA_FDB]		= { .type = NLA_FLAG },
+};
+
+static const struct nla_policy rtm_nh_res_policy_new[] = {
+	[NHA_RES_GROUP_BUCKETS]			= { .type = NLA_U16 },
+	[NHA_RES_GROUP_IDLE_TIMER]		= { .type = NLA_U32 },
+	[NHA_RES_GROUP_UNBALANCED_TIMER]	= { .type = NLA_U32 },
+};
+
+static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
+	[NHA_ID]		= { .type = NLA_U32 },
+	[NHA_OIF]		= { .type = NLA_U32 },
+	[NHA_MASTER]		= { .type = NLA_U32 },
+	[NHA_RES_BUCKET]	= { .type = NLA_NESTED },
+};
+
+static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
+	[NHA_RES_BUCKET_NH_ID]	= { .type = NLA_U32 },
+};
+
+static const struct nla_policy rtm_nh_policy_get_bucket[] = {
+	[NHA_ID]		= { .type = NLA_U32 },
+	[NHA_RES_BUCKET]	= { .type = NLA_NESTED },
+};
+
+static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
+	[NHA_RES_BUCKET_INDEX]	= { .type = NLA_U16 },
+};
+
+static bool nexthop_notifiers_is_empty(struct net *net)
+{
+	return net->nexthop.notifier_chain.head == NULL;	/* chain has no entry */
+}
+
+static void
+__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
+			       const struct nh_info *nhi)
+{
+	/* Flags first; none of them depends on the gateway fields below. */
+	nh_info->is_reject = nhi->reject_nh;
+	nh_info->is_fdb = nhi->fdb_nh;
+	nh_info->has_encap = nhi->fib_nhc.nhc_lwtstate != NULL;
+	nh_info->dev = nhi->fib_nhc.nhc_dev;
+	nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
+	if (nh_info->gw_family == AF_INET)
+		nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
+	else if (nh_info->gw_family == AF_INET6)
+		nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;
+}
+
+static int nh_notifier_single_info_init(struct nh_notifier_info *info,
+					const struct nexthop *nh)
+{
+	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+	struct nh_notifier_single_info *single;
+
+	single = kzalloc(sizeof(*single), GFP_KERNEL);
+	info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
+	info->nh = single;	/* NULL when the allocation failed */
+	if (!single)
+		return -ENOMEM;
+	__nh_notifier_single_info_init(single, nhi);
+	return 0;
+}
+
+static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
+{
+	kfree(info->nh);	/* kzalloc()ed in nh_notifier_single_info_init() */
+}
+
+static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
+				       struct nh_group *nhg)
+{
+	u16 num_nh = nhg->num_nh;
+	int i;
+
+	info->type = NH_NOTIFIER_INFO_TYPE_GRP;
+	info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh),
+			       GFP_KERNEL);
+	if (!info->nh_grp)
+		return -ENOMEM;
+
+	info->nh_grp->num_nh = num_nh;
+	info->nh_grp->is_fdb = nhg->fdb_nh;
+
+	for (i = 0; i < num_nh; i++) {
+		const struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+		const struct nexthop *member = nhge->nh;
+
+		/* Snapshot the member's id, weight and nexthop details. */
+		info->nh_grp->nh_entries[i].id = member->id;
+		info->nh_grp->nh_entries[i].weight = nhge->weight;
+		__nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
+					       rtnl_dereference(member->nh_info));
+	}
+
+	return 0;
+}
+
+static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
+					   struct nh_group *nhg)
+{
+	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
+	u16 num_nh_buckets = res_table->num_nh_buckets;
+	unsigned long size;
+	u16 i;
+
+	size = struct_size(info->nh_res_table, nhs, num_nh_buckets);
+	info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;
+	info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO |
+				       __GFP_NOWARN);
+	if (!info->nh_res_table)
+		return -ENOMEM;
+
+	info->nh_res_table->num_nh_buckets = num_nh_buckets;
+
+	for (i = 0; i < num_nh_buckets; i++) {
+		struct nh_grp_entry *nhge;
+		struct nh_info *nhi;
+
+		/* One single-nexthop snapshot per bucket, in bucket order. */
+		nhge = rtnl_dereference(res_table->nh_buckets[i].nh_entry);
+		nhi = rtnl_dereference(nhge->nh->nh_info);
+		__nh_notifier_single_info_init(&info->nh_res_table->nhs[i],
+					       nhi);
+	}
+
+	return 0;
+}
+
+static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
+				     const struct nexthop *nh)
+{
+	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+	if (nhg->hash_threshold)
+		return nh_notifier_mpath_info_init(info, nhg);
+	if (nhg->resilient)
+		return nh_notifier_res_table_info_init(info, nhg);
+	return -EINVAL;	/* neither group type flag is set */
+}
+
+static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
+				      const struct nexthop *nh)
+{
+	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+	if (nhg->hash_threshold)
+		kfree(info->nh_grp);		/* kzalloc()ed by the mpath init */
+	else if (nhg->resilient)
+		vfree(info->nh_res_table);	/* __vmalloc()ed by the res table init */
+}
+
+static int nh_notifier_info_init(struct nh_notifier_info *info,
+				 const struct nexthop *nh)
+{
+	/* The id is filled in for single nexthops and groups alike. */
+	info->id = nh->id;
+
+	if (!nh->is_group)
+		return nh_notifier_single_info_init(info, nh);
+	return nh_notifier_grp_info_init(info, nh);
+}
+
+static void nh_notifier_info_fini(struct nh_notifier_info *info,
+				  const struct nexthop *nh)
+{
+	if (!nh->is_group)	/* same split as nh_notifier_info_init() */
+		nh_notifier_single_info_fini(info);
+	else
+		nh_notifier_grp_info_fini(info, nh);
+}
+
+static int call_nexthop_notifiers(struct net *net,
+				  enum nexthop_event_type event_type,
+				  struct nexthop *nh,
+				  struct netlink_ext_ack *extack)
+{
+	struct nh_notifier_info info = {
+		.extack = extack,
+		.net = net,
+	};
+	int rc;
+
+	ASSERT_RTNL();
+
+	/* Skip building the info when the chain has no listener. */
+	if (nexthop_notifiers_is_empty(net))
+		return 0;
+
+	rc = nh_notifier_info_init(&info, nh);
+	if (rc) {
+		NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
+		return rc;
+	}
+	rc = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+					  event_type, &info);
+	nh_notifier_info_fini(&info, nh);
+
+	return notifier_to_errno(rc);
+}
+
+static int
+nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
+				      bool force, unsigned int *p_idle_timer_ms)
+{
+	struct nh_res_table *res_table;
+	struct nh_group *nhg;
+	struct nexthop *nh;
+	int err = 0;
+
+	/* A forced replacement carries no idle timer: report 0.
+	 *
+	 * When 'force' is false the bucket is being replaced because it
+	 * was deemed idle, and capable listeners may choose an atomic
+	 * replacement that only succeeds if the bucket is inactive. That
+	 * should not be tried when the idle timer interval is shorter
+	 * than the interval at which the listener polls the device for
+	 * bucket activity, so hand the idle timer to listeners and let
+	 * them decide which kind of replacement to perform.
+	 */
+	if (force) {
+		*p_idle_timer_ms = 0;
+		return 0;
+	}
+
+	rcu_read_lock();
+
+	nh = nexthop_find_by_id(info->net, info->id);
+	if (nh) {
+		nhg = rcu_dereference(nh->nh_grp);
+		res_table = rcu_dereference(nhg->res_table);
+		*p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer);
+	} else {
+		/* info->id does not resolve to a nexthop in info->net. */
+		err = -EINVAL;
+	}
+
+	rcu_read_unlock();
+
+	return err;
+}
+
+static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
+					    u16 bucket_index, bool force,
+					    struct nh_info *oldi,
+					    struct nh_info *newi)
+{
+	unsigned int idle_timer_ms;
+	int err;
+
+	err = nh_notifier_res_bucket_idle_timer_get(info, force,
+						    &idle_timer_ms);
+	if (err)
+		return err;	/* nothing has been allocated yet */
+
+	info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
+	info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket),
+				      GFP_KERNEL);
+	if (!info->nh_res_bucket)
+		return -ENOMEM;
+
+	info->nh_res_bucket->bucket_index = bucket_index;
+	info->nh_res_bucket->idle_timer_ms = idle_timer_ms;	/* 0 when forced */
+	info->nh_res_bucket->force = force;
+	__nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
+	__nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);
+	return 0;
+}
+
+static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
+{
+	kfree(info->nh_res_bucket);	/* from nh_notifier_res_bucket_info_init() */
+}
+
+static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
+					       u16 bucket_index, bool force,
+					       struct nh_info *oldi,
+					       struct nh_info *newi,
+					       struct netlink_ext_ack *extack)
+{
+	struct nh_notifier_info info = {
+		.id = nhg_id,
+		.extack = extack,
+		.net = net,
+	};
+	int rc;
+
+	if (nexthop_notifiers_is_empty(net))
+		return 0;
+
+	rc = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
+					      oldi, newi);
+	if (rc)
+		return rc;
+
+	/* The bucket info is freed right after the chain has run. */
+	rc = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+					  NEXTHOP_EVENT_BUCKET_REPLACE, &info);
+	nh_notifier_res_bucket_info_fini(&info);
+	return notifier_to_errno(rc);
+}
+
+/* There are three users of RES_TABLE, and NHs etc. referenced from there:
+ *
+ * 1) a collection of callbacks for NH maintenance. This operates under
+ *    RTNL,
+ * 2) the delayed work that gradually balances the resilient table,
+ * 3) and nexthop_select_path(), operating under RCU.
+ *
+ * Both the delayed work and the RTNL block are writers, and need to
+ * maintain mutual exclusion. Since there are only two and well-known
+ * writers for each table, the RTNL code can make sure it has exclusive
+ * access thus:
+ *
+ * - Have the DW operate without locking;
+ * - synchronously cancel the DW;
+ * - do the writing;
+ * - if the write was not actually a delete, call upkeep, which schedules
+ *   DW again if necessary.
+ *
+ * The functions that are always called from the RTNL context use
+ * rtnl_dereference(). The functions that can also be called from the DW do
+ * a raw dereference and rely on the above mutual exclusion scheme.
+ */
+#define nh_res_dereference(p) (rcu_dereference_raw(p))
+
+static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
+					     u16 bucket_index, bool force,
+					     struct nexthop *old_nh,
+					     struct nexthop *new_nh,
+					     struct netlink_ext_ack *extack)
+{
+	struct nh_info *new_info = nh_res_dereference(new_nh->nh_info);
+	struct nh_info *old_info = nh_res_dereference(old_nh->nh_info);
+
+	return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
+						   force, old_info, new_info, extack);
+}
+
+static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
+					    struct netlink_ext_ack *extack)
+{
+	struct nh_notifier_info info = {
+		.net = net,
+		.extack = extack,
+		.id = nh->id,	/* not set by nh_notifier_mpath_info_init() */
+	};
+	struct nh_group *nhg;
+	int err;
+
+	ASSERT_RTNL();
+
+	if (nexthop_notifiers_is_empty(net))
+		return 0;
+
+	/* The buckets are not populated yet: only notify the logical
+	 * nexthops so a listener can veto an unsupported configuration.
+	 */
+	nhg = rtnl_dereference(nh->nh_grp);
+	err = nh_notifier_mpath_info_init(&info, nhg);
+	if (err) {
+		NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
+		return err;
+	}
+
+	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+					   NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
+					   &info);
+	kfree(info.nh_grp);
+
+	return notifier_to_errno(err);
+}
+
+static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
+				 enum nexthop_event_type event_type,
+				 struct nexthop *nh,
+				 struct netlink_ext_ack *extack)
+{
+	struct nh_notifier_info info = {
+		.extack = extack,
+		.net = net,
+	};
+	int rc = nh_notifier_info_init(&info, nh);
+
+	if (rc)
+		return rc;
+
+	/* Invoke only this notifier block, not the whole chain. */
+	rc = nb->notifier_call(nb, event_type, &info);
+	nh_notifier_info_fini(&info, nh);
+
+	return notifier_to_errno(rc);
+}
+
+static unsigned int nh_dev_hashfn(unsigned int val)
+{
+	/* XOR-fold the three lowest NH_DEV_HASHBITS-wide chunks of val
+	 * and keep the low NH_DEV_HASHBITS bits as the bucket index.
+	 */
+	val ^= (val >> NH_DEV_HASHBITS) ^ (val >> (NH_DEV_HASHBITS * 2));
+	return val & (NH_DEV_HASHSIZE - 1);
+}
+
+static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
+{
+	struct net_device *dev = nhi->fib_nhc.nhc_dev;
+
+	WARN_ON(!dev);
+
+	/* The bucket in net->nexthop.devhash is chosen by hashing the
+	 * device's ifindex.
+	 */
+	hlist_add_head(&nhi->dev_hash,
+		       &net->nexthop.devhash[nh_dev_hashfn(dev->ifindex)]);
+}
+
+static void nexthop_free_group(struct nexthop *nh)
+{
+	struct nh_group *nhg = rcu_dereference_raw(nh->nh_grp);
+	int i;
+
+	/* Put every member nexthop; its nh_list must already be empty. */
+	for (i = 0; i < nhg->num_nh; ++i) {
+		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+		WARN_ON(!list_empty(&nhge->nh_list));
+		nexthop_put(nhge->nh);
+	}
+
+	WARN_ON(nhg->spare == nhg);
+
+	if (nhg->resilient)
+		vfree(rcu_dereference_raw(nhg->res_table));
+
+	kfree(nhg->spare);
+	kfree(nhg);
+}
+
+static void nexthop_free_single(struct nexthop *nh)
+{
+	struct nh_info *nhi;
+
+	nhi = rcu_dereference_raw(nh->nh_info);
+	/* Release the family-specific nexthop state before freeing its
+	 * container; any other family has nothing to release here.
+	 */
+	if (nhi->family == AF_INET)
+		fib_nh_release(nh->net, &nhi->fib_nh);
+	else if (nhi->family == AF_INET6)
+		ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
+
+	kfree(nhi);
+}
+
+void nexthop_free_rcu(struct rcu_head *head)
+{
+	struct nexthop *nh = container_of(head, struct nexthop, rcu);
+
+	if (!nh->is_group)
+		nexthop_free_single(nh);
+	else
+		nexthop_free_group(nh);
+
+	kfree(nh);	/* payload freed above; now the nexthop itself */
+}
+EXPORT_SYMBOL_GPL(nexthop_free_rcu);
+
+static struct nexthop *nexthop_alloc(void)
+{
+	struct nexthop *nh = kzalloc(sizeof(*nh), GFP_KERNEL);
+
+	if (!nh)
+		return NULL;
+
+	INIT_LIST_HEAD(&nh->fi_list);
+	INIT_LIST_HEAD(&nh->f6i_list);
+	INIT_LIST_HEAD(&nh->grp_list);
+	INIT_LIST_HEAD(&nh->fdb_list);
+	return nh;
+}
+
+static struct nh_group *nexthop_grp_alloc(u16 num_nh)
+{
+	struct nh_group *nhg;
+
+	nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
+	if (!nhg)
+		return NULL;
+	nhg->num_nh = num_nh;	/* everything else stays zeroed by kzalloc() */
+	return nhg;
+}
+
+static void nh_res_table_upkeep_dw(struct work_struct *work);
+
+static struct nh_res_table *
+nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg)
+{
+	const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets;
+	struct nh_res_table *tbl;
+
+	/* Zeroed allocation sized for the header plus all buckets. */
+	tbl = __vmalloc(struct_size(tbl, nh_buckets, num_nh_buckets),
+			GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
+	if (!tbl)
+		return NULL;
+
+	tbl->num_nh_buckets = num_nh_buckets;
+	tbl->idle_timer = cfg->nh_grp_res_idle_timer;
+	tbl->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer;
+	tbl->nhg_id = nhg_id;
+	tbl->net = net;
+	INIT_LIST_HEAD(&tbl->uw_nh_entries);
+	INIT_DELAYED_WORK(&tbl->upkeep_dw, &nh_res_table_upkeep_dw);
+	return tbl;
+}
+
+static void nh_base_seq_inc(struct net *net)
+{
+	if (++net->nexthop.seq == 0)	/* skip 0 when the counter wraps */
+		net->nexthop.seq = 1;
+}
+
+/* Look up a nexthop by id in the rbtree of @net. No reference is taken
+ * on the result; rcu lock or rtnl must be held by the caller.
+ */
+struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
+{
+	struct rb_node **link = &net->nexthop.rb_root.rb_node;
+	struct rb_node *node;
+
+	/* Descend until the link is empty or the id matches. */
+	while ((node = rcu_dereference_raw(*link)) != NULL) {
+		struct nexthop *nh = rb_entry(node, struct nexthop, rb_node);
+
+		if (id == nh->id)
+			return nh;
+
+		/* Smaller ids are searched for in the left subtree. */
+		if (id < nh->id)
+			link = &node->rb_left;
+		else
+			link = &node->rb_right;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_find_by_id);
+
+/* Used for auto id allocation; called with rtnl held. Returns an id with
+ * no nexthop attached, or 0 once every candidate has been tried.
+ */
+static u32 nh_find_unused_id(struct net *net)
+{
+	u32 id_start = net->nexthop.last_id_allocated;
+	u32 id;
+
+	while ((id = ++net->nexthop.last_id_allocated) != id_start) {
+		/* 0 is the failure value, so never hand it out on wrap. */
+		if (id && !nexthop_find_by_id(net, id))
+			return id;
+	}
+	return 0;
+}
+
+static void nh_res_time_set_deadline(unsigned long next_time,
+				     unsigned long *deadline)
+{
+	if (time_before(next_time, *deadline))
+		*deadline = next_time;	/* only ever moves the deadline earlier */
+}
+
+static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table)
+{
+	if (list_empty(&res_table->uw_nh_entries))
+		return 0;	/* uw_nh_entries is empty: no unbalanced time */
+	return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since);
+}
+
+static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg)
+{
+	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, NHA_RES_GROUP);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS,
+			res_table->num_nh_buckets) ||
+	    nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER,
+			jiffies_to_clock_t(res_table->idle_timer)) ||	/* jiffies -> clock_t */
+	    nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER,
+			jiffies_to_clock_t(res_table->unbalanced_timer)) ||
+	    nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME,
+			      nh_res_table_unbalanced_time(res_table),
+			      NHA_RES_GROUP_PAD))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);	/* drop the partially filled nest */
+	return -EMSGSIZE;
+}
+
+/* Emit NHA_GROUP_TYPE, NHA_GROUP and, for resilient groups, NHA_RES_GROUP. */
+static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
+{
+	struct nexthop_grp *p;
+	size_t len = nhg->num_nh * sizeof(*p);
+	struct nlattr *nla;
+	u16 group_type = 0;
+	int i;
+
+	if (nhg->hash_threshold)
+		group_type = NEXTHOP_GRP_TYPE_MPATH;
+	else if (nhg->resilient)
+		group_type = NEXTHOP_GRP_TYPE_RES;
+
+	if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
+		return -EMSGSIZE;
+
+	nla = nla_reserve(skb, NHA_GROUP, len);
+	if (!nla)
+		return -EMSGSIZE;
+
+	/* Whole-struct stores zero resvd1/resvd2; nla_reserve() does not. */
+	p = nla_data(nla);
+	for (i = 0; i < nhg->num_nh; ++i) {
+		*p++ = (struct nexthop_grp) {
+			.id = nhg->nh_entries[i].nh->id,
+			.weight = nhg->nh_entries[i].weight - 1,
+		};
+	}
+
+	if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
+			int event, u32 portid, u32 seq, unsigned int nlflags)
+{
+	struct fib6_nh *fib6_nh;
+	struct fib_nh *fib_nh;
+	struct nlmsghdr *nlh;
+	struct nh_info *nhi;
+	struct nhmsg *nhm;
+
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	nhm = nlmsg_data(nlh);
+	nhm->nh_family = AF_UNSPEC;	/* overwritten below for single nexthops */
+	nhm->nh_flags = nh->nh_flags;
+	nhm->nh_protocol = nh->protocol;
+	nhm->nh_scope = 0;
+	nhm->resvd = 0;
+
+	if (nla_put_u32(skb, NHA_ID, nh->id))
+		goto nla_put_failure;
+
+	if (nh->is_group) {
+		struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+		if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
+			goto nla_put_failure;
+		if (nla_put_nh_group(skb, nhg))
+			goto nla_put_failure;
+		goto out;	/* groups emit none of the attributes below */
+	}
+
+	nhi = rtnl_dereference(nh->nh_info);
+	nhm->nh_family = nhi->family;
+	if (nhi->reject_nh) {
+		if (nla_put_flag(skb, NHA_BLACKHOLE))
+			goto nla_put_failure;
+		goto out;	/* blackhole: nothing else is emitted */
+	} else if (nhi->fdb_nh) {
+		if (nla_put_flag(skb, NHA_FDB))
+			goto nla_put_failure;
+	} else {
+		const struct net_device *dev;
+
+		dev = nhi->fib_nhc.nhc_dev;
+		if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
+			goto nla_put_failure;
+	}
+
+	nhm->nh_scope = nhi->fib_nhc.nhc_scope;
+	switch (nhi->family) {
+	case AF_INET:
+		fib_nh = &nhi->fib_nh;
+		if (fib_nh->fib_nh_gw_family &&
+		    nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
+			goto nla_put_failure;
+		break;
+
+	case AF_INET6:
+		fib6_nh = &nhi->fib6_nh;
+		if (fib6_nh->fib_nh_gw_family &&
+		    nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
+			goto nla_put_failure;
+		break;
+	}
+
+	if (nhi->fib_nhc.nhc_lwtstate &&
+	    lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
+				NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
+		goto nla_put_failure;
+
+out:
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);	/* discard the partially built message */
+	return -EMSGSIZE;
+}
+
+static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg)
+{
+	/* Mirrors the attributes emitted by nla_put_nh_group_res(). */
+	return nla_total_size(0) +	/* NHA_RES_GROUP */
+		nla_total_size(2) +	/* NHA_RES_GROUP_BUCKETS */
+		2 * nla_total_size(4) +	/* ..._IDLE_TIMER, ..._UNBALANCED_TIMER */
+		nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */
+}
+
+static size_t nh_nlmsg_size_grp(struct nexthop *nh)
+{
+	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+	size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
+	size_t tot = nla_total_size(sz) +	/* NHA_GROUP */
+		nla_total_size(2) +	/* NHA_GROUP_TYPE */
+		nla_total_size(0);	/* NHA_FDB, emitted for fdb groups */
+
+	if (nhg->resilient)
+		tot += nh_nlmsg_size_grp_res(nhg);
+	return tot;
+}
+
+static size_t nh_nlmsg_size_single(struct nexthop *nh)
+{
+	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+	size_t sz;
+
+	/* covers NHA_BLACKHOLE and NHA_FDB too: nh_fill_node() emits
+	 * at most one of NHA_OIF, NHA_BLACKHOLE and NHA_FDB
+	 */
+	sz = nla_total_size(4);  /* NHA_OIF */
+
+	switch (nhi->family) {
+	case AF_INET:
+		if (nhi->fib_nh.fib_nh_gw_family)
+			sz += nla_total_size(4);  /* NHA_GATEWAY */
+		break;
+
+	case AF_INET6:
+		/* NHA_GATEWAY */
+		if (nhi->fib6_nh.fib_nh_gw_family)
+			sz += nla_total_size(sizeof(const struct in6_addr));
+		break;
+	}
+
+	if (nhi->fib_nhc.nhc_lwtstate) {
+		sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
+		sz += nla_total_size(2);  /* NHA_ENCAP_TYPE */
+	}
+
+	return sz;
+}
+
+static size_t nh_nlmsg_size(struct nexthop *nh)
+{
+	size_t sz;
+
+	/* Fixed part: the aligned nhmsg header plus NHA_ID. */
+	sz = NLMSG_ALIGN(sizeof(struct nhmsg)) + nla_total_size(4);
+	if (!nh->is_group)
+		sz += nh_nlmsg_size_single(nh);
+	else
+		sz += nh_nlmsg_size_grp(nh);
+
+	return sz;
+}
+
+static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
+{
+	unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
+	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+	struct sk_buff *skb;
+	int err = -ENOBUFS;	/* reported if the skb allocation fails */
+
+	skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
+	if (!skb)
+		goto errout;
+
+	err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
+	if (err < 0) {
+		/* -EMSGSIZE means nh_nlmsg_size() under-estimated the message */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
+		    info->nlh, gfp_any());
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
+}
+
+static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket)
+{
+	return (unsigned long)atomic_long_read(&bucket->used_time);
+}
+
+static unsigned long
+nh_res_bucket_idle_point(const struct nh_res_table *res_table,
+			 const struct nh_res_bucket *bucket,
+			 unsigned long now)
+{
+	unsigned long used = nh_res_bucket_used_time(bucket);
+
+	/* A bucket not used since its migration is idle as of now;
+	 * otherwise it turns idle one idle_timer after its last use.
+	 */
+	return used == bucket->migrated_time ? now :
+					       used + res_table->idle_timer;
+}
+
+/* The point in time unbalanced_timer after unbalanced_since. */
+static unsigned long nh_res_table_unb_point(const struct nh_res_table *res_table)
+{
+	return res_table->unbalanced_since + res_table->unbalanced_timer;
+}
+
+static void nh_res_bucket_set_idle(const struct nh_res_table *res_table,
+				   struct nh_res_bucket *bucket)
+{
+	unsigned long now = jiffies;
+
+	atomic_long_set(&bucket->used_time, (long)now);
+	bucket->migrated_time = now;	/* equal stamps read as "unused since migration" */
+}
+
+static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket)
+{
+	atomic_long_set(&bucket->used_time, (long)jiffies);	/* last-use stamp */
+}
+
+static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket)
+{
+	unsigned long used_time = nh_res_bucket_used_time(bucket);
+
+	return jiffies_delta_to_clock_t(jiffies - used_time);	/* time since last use */
+}
+
+static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh,
+			      struct nh_res_bucket *bucket, u16 bucket_index,
+			      int event, u32 portid, u32 seq,
+			      unsigned int nlflags,
+			      struct netlink_ext_ack *extack)
+{
+	struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
+	struct nlmsghdr *nlh;
+	struct nlattr *nest;
+	struct nhmsg *nhm;
+
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	nhm = nlmsg_data(nlh);
+	nhm->nh_family = AF_UNSPEC;
+	nhm->nh_flags = bucket->nh_flags;	/* the bucket's flags, not nh->nh_flags */
+	nhm->nh_protocol = nh->protocol;
+	nhm->nh_scope = 0;
+	nhm->resvd = 0;
+
+	if (nla_put_u32(skb, NHA_ID, nh->id))
+		goto nla_put_failure;
+
+	nest = nla_nest_start(skb, NHA_RES_BUCKET);
+	if (!nest)
+		goto nla_put_failure;
+
+	if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) ||
+	    nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) ||
+	    nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME,
+			      nh_res_bucket_idle_time(bucket),
+			      NHA_RES_BUCKET_PAD))
+		goto nla_put_failure_nest;
+
+	nla_nest_end(skb, nest);
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure_nest:
+	nla_nest_cancel(skb, nest);	/* then fall through to cancel the message */
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static void nexthop_bucket_notify(struct nh_res_table *res_table,
+				  u16 bucket_index)
+{
+	struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
+	struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
+	struct nexthop *nh = nhge->nh_parent;
+	struct sk_buff *skb;
+	int err = -ENOBUFS;	/* reported if alloc_skb() fails */
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		goto errout;
+
+	err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
+				 RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE,
+				 NULL);	/* portid 0, seq 0, no extack */
+	if (err < 0) {
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err);
+}
+
+static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
+			   bool *is_fdb, struct netlink_ext_ack *extack)
+{
+	if (!nh->is_group) {
+		struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+		if (nhi->reject_nh && npaths > 1) {
+			NL_SET_ERR_MSG(extack,
+				       "Blackhole nexthop can not be used in a group with more than 1 path");
+			return false;
+		}
+		*is_fdb = nhi->fdb_nh;
+	} else {
+		struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+		/* A group can not itself be a member of another group. */
+		if (nhg->hash_threshold) {
+			NL_SET_ERR_MSG(extack,
+				       "Hash-threshold group can not be a nexthop within a group");
+			return false;
+		}
+		if (nhg->resilient) {
+			NL_SET_ERR_MSG(extack,
+				       "Resilient group can not be a nexthop within a group");
+			return false;
+		}
+		*is_fdb = nhg->fdb_nh;
+	}
+
+	return true;
+}
+
+static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
+				   struct netlink_ext_ack *extack)
+{
+	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+	if (!nhi->fdb_nh) {
+		NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
+		return -EINVAL;
+	}
+
+	/* The first nexthop checked fixes the family for the rest. */
+	if (*nh_family == AF_UNSPEC)
+		*nh_family = nhi->family;
+
+	if (*nh_family != nhi->family) {
+		NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nh_check_attr_group(struct net *net,
+			       struct nlattr *tb[], size_t tb_size,
+			       u16 nh_grp_type, struct netlink_ext_ack *extack)
+{
+	unsigned int len = nla_len(tb[NHA_GROUP]);
+	u8 nh_family = AF_UNSPEC;
+	struct nexthop_grp *nhg;
+	unsigned int i, j;
+	u8 nhg_fdb = 0;
+
+	if (!len || len & (sizeof(struct nexthop_grp) - 1)) {
+		NL_SET_ERR_MSG(extack,
+			       "Invalid length for nexthop group attribute");
+		return -EINVAL;
+	}
+
+	/* convert len from bytes to the number of nexthop_grp entries */
+	len /= sizeof(*nhg);
+
+	nhg = nla_data(tb[NHA_GROUP]);
+	for (i = 0; i < len; ++i) {
+		if (nhg[i].resvd1 || nhg[i].resvd2) {
+			NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
+			return -EINVAL;
+		}
+		if (nhg[i].weight > 254) {
+			NL_SET_ERR_MSG(extack, "Invalid value for weight");
+			return -EINVAL;
+		}
+		for (j = i + 1; j < len; ++j) {	/* pairwise duplicate-id check */
+			if (nhg[i].id == nhg[j].id) {
+				NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
+				return -EINVAL;
+			}
+		}
+	}
+
+	if (tb[NHA_FDB])
+		nhg_fdb = 1;
+	nhg = nla_data(tb[NHA_GROUP]);	/* same pointer as computed above */
+	for (i = 0; i < len; ++i) {
+		struct nexthop *nh;
+		bool is_fdb_nh;
+
+		nh = nexthop_find_by_id(net, nhg[i].id);
+		if (!nh) {
+			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+			return -EINVAL;
+		}
+		if (!valid_group_nh(nh, len, &is_fdb_nh, extack))
+			return -EINVAL;
+
+		if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
+			return -EINVAL;
+
+		if (!nhg_fdb && is_fdb_nh) {
+			NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
+			return -EINVAL;
+		}
+	}
+	for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {	/* attrs after NHA_GROUP_TYPE */
+		if (!tb[i])
+			continue;
+		switch (i) {
+		case NHA_FDB:
+			continue;	/* always allowed alongside a group */
+		case NHA_RES_GROUP:
+			if (nh_grp_type == NEXTHOP_GRP_TYPE_RES)
+				continue;
+			break;
+		}
+		NL_SET_ERR_MSG(extack,
+			       "No other attributes can be set in nexthop groups");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static bool ipv6_good_nh(const struct fib6_nh *nh)
+{
+	struct neighbour *n;
+	int state;
+
+	rcu_read_lock();
+
+	/* Without a neighbour entry the state defaults to NUD_REACHABLE. */
+	n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
+	state = n ? READ_ONCE(n->nud_state) : NUD_REACHABLE;
+
+	rcu_read_unlock();
+
+	return (state & NUD_VALID) != 0;
+}
+
+static bool ipv4_good_nh(const struct fib_nh *nh)
+{
+	struct neighbour *n;
+	int state;
+
+	rcu_read_lock();
+
+	/* Without a neighbour entry the state defaults to NUD_REACHABLE. */
+	n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
+				      (__force u32)nh->fib_nh_gw4);
+	state = n ? READ_ONCE(n->nud_state) : NUD_REACHABLE;
+
+	rcu_read_unlock();
+
+	return (state & NUD_VALID) != 0;
+}
+
+static bool nexthop_is_good_nh(const struct nexthop *nh)
+{
+	struct nh_info *nhi = rcu_dereference(nh->nh_info);
+
+	if (nhi->family == AF_INET)
+		return ipv4_good_nh(&nhi->fib_nh);
+
+	if (nhi->family == AF_INET6)
+		return ipv6_good_nh(&nhi->fib6_nh);
+
+	/* Any other family is never reported as good. */
+	return false;
+}
+
+static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash)
+{
+	int i;
+
+	/* Pick the first entry whose upper bound covers the hash. */
+	for (i = 0; i < nhg->num_nh; i++) {
+		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+		if (hash <= atomic_read(&nhge->hthr.upper_bound))
+			return nhge->nh;
+	}
+
+	/* No upper bound covered the hash. */
+	WARN_ON_ONCE(1);
+	return NULL;
+}
+
+static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
+{
+	struct nexthop *first_good = NULL;
+	int i;
+
+	if (nhg->fdb_nh)
+		return nexthop_select_path_fdb(nhg, hash);
+
+	for (i = 0; i < nhg->num_nh; ++i) {
+		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+		/* Whether a nexthop is good is always checked here; this
+		 * behavior does not rely on a sysctl.
+		 */
+		if (!nexthop_is_good_nh(nhge->nh))
+			continue;
+
+		/* Remember the first good entry as a fallback. */
+		if (!first_good)
+			first_good = nhge->nh;
+
+		if (hash <= atomic_read(&nhge->hthr.upper_bound))
+			return nhge->nh;
+	}
+
+	/* No good entry covered the hash: first good one, else entry 0. */
+	return first_good ? first_good : nhg->nh_entries[0].nh;
+}
+
+static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
+{
+	struct nh_res_table *res_table = rcu_dereference(nhg->res_table);
+	u16 bucket_index = hash % res_table->num_nh_buckets;
+	struct nh_res_bucket *bucket;
+	struct nh_grp_entry *nhge;
+
+	/* nexthop_select_path() is expected to return a non-NULL value, so
+	 * skip protocol validation and just hand out whatever there is.
+	 */
+	bucket = &res_table->nh_buckets[bucket_index];
+	nh_res_bucket_set_busy(bucket);	/* stamps used_time with jiffies */
+	nhge = rcu_dereference(bucket->nh_entry);
+	return nhge->nh;
+}
+
+struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
+{
+	struct nh_group *nhg;
+
+	if (!nh->is_group)	/* a non-group nexthop is returned as is */
+		return nh;
+
+	nhg = rcu_dereference(nh->nh_grp);
+	if (nhg->hash_threshold)
+		return nexthop_select_path_hthr(nhg, hash);
+	if (nhg->resilient)
+		return nexthop_select_path_res(nhg, hash);
+
+	/* Unreachable. */
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_select_path);
+
+/* Invoke @cb on the fib6_nh of @nh, or on the fib6_nh of every member when
+ * @nh is a group. Iteration stops at the first non-zero return from @cb,
+ * which is then returned to the caller; otherwise 0 is returned.
+ */
+int nexthop_for_each_fib6_nh(struct nexthop *nh,
+			     int (*cb)(struct fib6_nh *nh, void *arg),
+			     void *arg)
+{
+	struct nh_group *grp;
+	struct nh_info *info;
+	int idx, ret;
+
+	if (!nh->is_group) {
+		info = rcu_dereference_rtnl(nh->nh_info);
+		return cb(&info->fib6_nh, arg);
+	}
+
+	grp = rcu_dereference_rtnl(nh->nh_grp);
+	for (idx = 0; idx < grp->num_nh; idx++) {
+		info = rcu_dereference_rtnl(grp->nh_entries[idx].nh->nh_info);
+		ret = cb(&info->fib6_nh, arg);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);
+
+/* Return 0 when @saddr is the unspecified address; otherwise set an extack
+ * message and return -EINVAL.
+ */
+static int check_src_addr(const struct in6_addr *saddr,
+			  struct netlink_ext_ack *extack)
+{
+	if (ipv6_addr_any(saddr))
+		return 0;
+
+	NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
+	return -EINVAL;
+}
+
+/* Check that @nh may be used by an IPv6 route. @cfg may be NULL, in which
+ * case the source-address check is skipped. Returns 0 if usable, -EINVAL
+ * (with an extack message) otherwise.
+ */
+int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
+		       struct netlink_ext_ack *extack)
+{
+	bool uses_v4, is_fdb;
+
+	/* fib6_src is unique to a fib6_info and limits the ability to cache
+	 * routes in fib6_nh within a nexthop that is potentially shared
+	 * across multiple fib entries. If the config wants to use source
+	 * routing it can not use nexthop objects. mlxsw also does not allow
+	 * fib6_src on routes.
+	 */
+	if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
+		return -EINVAL;
+
+	if (nh->is_group) {
+		struct nh_group *grp = rtnl_dereference(nh->nh_grp);
+
+		uses_v4 = grp->has_v4;
+		is_fdb = grp->fdb_nh;
+	} else {
+		struct nh_info *info = rtnl_dereference(nh->nh_info);
+
+		uses_v4 = info->family == AF_INET;
+		is_fdb = info->fdb_nh;
+	}
+
+	/* The IPv4 check takes precedence over the fdb check. */
+	if (uses_v4) {
+		NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
+		return -EINVAL;
+	}
+
+	if (is_fdb) {
+		NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fib6_check_nexthop);
+
+/* If the existing nexthop @old has IPv6 routes linked to it, verify that
+ * the replacement spec @new works with IPv6: none of those routes may use
+ * a source address, and @new must pass fib6_check_nexthop().
+ */
+static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
+			      struct netlink_ext_ack *extack)
+{
+	struct fib6_info *rt;
+	int err;
+
+	if (list_empty(&old->f6i_list))
+		return 0;
+
+	list_for_each_entry(rt, &old->f6i_list, nh_list) {
+		err = check_src_addr(&rt->fib6_src.addr, extack);
+		if (err < 0)
+			return -EINVAL;
+	}
+
+	return fib6_check_nexthop(new, NULL, extack);
+}
+
+/* Validate a route @scope against the nexthop described by @nhi: a host
+ * scope route may not have a gateway, and an ONLINK nexthop may not be used
+ * with a scope of RT_SCOPE_LINK or above. Returns 0 or -EINVAL.
+ */
+static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
+			       struct netlink_ext_ack *extack)
+{
+	bool has_gw = nhi->fib_nhc.nhc_gw_family;
+	bool onlink = nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK;
+
+	if (has_gw && scope == RT_SCOPE_HOST) {
+		NL_SET_ERR_MSG(extack,
+			       "Route with host scope can not have a gateway");
+		return -EINVAL;
+	}
+
+	if (onlink && scope >= RT_SCOPE_LINK) {
+		NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Invoked by fib add code to verify that a nexthop referenced by id is
+ * compatible with the prefix configuration; covers the parts of
+ * fib_check_nh that are not done when a nexthop object is used.
+ * Returns 0 or -EINVAL with an extack message.
+ */
+int fib_check_nexthop(struct nexthop *nh, u8 scope,
+		      struct netlink_ext_ack *extack)
+{
+	struct nh_group *grp;
+	struct nh_info *nhi;
+
+	if (!nh->is_group) {
+		nhi = rtnl_dereference(nh->nh_info);
+		if (nhi->fdb_nh) {
+			NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+			return -EINVAL;
+		}
+
+		return nexthop_check_scope(nhi, scope, extack);
+	}
+
+	grp = rtnl_dereference(nh->nh_grp);
+	if (grp->fdb_nh) {
+		NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+		return -EINVAL;
+	}
+
+	if (scope == RT_SCOPE_HOST) {
+		NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
+		return -EINVAL;
+	}
+
+	/* all nexthops in a group have the same scope */
+	nhi = rtnl_dereference(grp->nh_entries[0].nh->nh_info);
+	return nexthop_check_scope(nhi, scope, extack);
+}
+
+/* Run fib_check_nexthop() on @new with the scope of every fib_info linked
+ * to @old. Returns the first error encountered, or 0.
+ */
+static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
+			     struct netlink_ext_ack *extack)
+{
+	struct fib_info *fi;
+	int err = 0;
+
+	list_for_each_entry(fi, &old->fi_list, nh_list) {
+		err = fib_check_nexthop(new, fi->fib_scope, extack);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+/* True when the entry holds exactly as many buckets as it wants. */
+static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge)
+{
+	return nhge->res.wants_buckets == nhge->res.count_buckets;
+}
+
+/* True when the entry is overweight: it holds more buckets than it wants. */
+static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge)
+{
+	return nhge->res.wants_buckets < nhge->res.count_buckets;
+}
+
+/* True when the entry is underweight: it holds fewer buckets than it wants. */
+static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge)
+{
+	return nhge->res.wants_buckets > nhge->res.count_buckets;
+}
+
+/* The table is balanced when its list of underweight entries is empty. */
+static bool nh_res_table_is_balanced(const struct nh_res_table *res_table)
+{
+	const struct list_head *underweight = &res_table->uw_nh_entries;
+
+	return list_empty(underweight);
+}
+
+/* Mark @bucket as unoccupied and drop it from the bucket count of the
+ * entry it referenced. Does nothing if the bucket is already unoccupied;
+ * the bucket's nh_entry pointer itself is left untouched.
+ */
+static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket)
+{
+	struct nh_grp_entry *owner;
+
+	if (!bucket->occupied)
+		return;
+
+	owner = nh_res_dereference(bucket->nh_entry);
+	owner->res.count_buckets--;
+	bucket->occupied = false;
+}
+
+/* Make @bucket reference @nhge and account for it in @nhge's bucket count. */
+static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket,
+				 struct nh_grp_entry *nhge)
+{
+	/* Release the accounting of the previous occupant, if any. */
+	nh_res_bucket_unset_nh(bucket);
+
+	bucket->occupied = true;
+	rcu_assign_pointer(bucket->nh_entry, nhge);
+	nhge->res.count_buckets += 1;
+}
+
+/* Decide whether @bucket should be handed over to another nexthop.
+ *
+ * Returns true when the bucket should be migrated, with *@force telling
+ * whether the migration is forced. When the bucket belongs to an overweight
+ * nexthop but no timer has expired yet, false is returned and the pending
+ * time points are fed to nh_res_time_set_deadline() together with @deadline.
+ * *@force is only written on the paths that return true.
+ */
+static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table,
+					 struct nh_res_bucket *bucket,
+					 unsigned long *deadline, bool *force)
+{
+	unsigned long now = jiffies;
+	struct nh_grp_entry *owner;
+	unsigned long idle_at;
+	unsigned long unb_at;
+
+	/* An unoccupied bucket has a NHGE pointer that is either NULL or
+	 * obsolete. We _have to_ migrate: set force.
+	 */
+	if (!bucket->occupied) {
+		*force = true;
+		return true;
+	}
+
+	/* Buckets populated by an underweight or balanced nexthop are
+	 * never migrated.
+	 */
+	owner = nh_res_dereference(bucket->nh_entry);
+	if (!nh_res_nhge_is_ow(owner))
+		return false;
+
+	/* From here on the bucket is known to be populated by an
+	 * overweight nexthop. It is migrated if the idle timer or the
+	 * unbalanced timer expired.
+	 */
+	idle_at = nh_res_bucket_idle_point(res_table, bucket, now);
+	if (time_after_eq(now, idle_at)) {
+		/* The bucket is idle. We _can_ migrate: unset force. */
+		*force = false;
+		return true;
+	}
+
+	/* Unbalanced timer of 0 means "never force". */
+	if (res_table->unbalanced_timer) {
+		unb_at = nh_res_table_unb_point(res_table);
+		if (time_after(now, unb_at)) {
+			/* The bucket is not idle, but the unbalanced timer
+			 * expired. We _can_ migrate, but set force anyway,
+			 * so that drivers know to ignore activity reports
+			 * from the HW.
+			 */
+			*force = true;
+			return true;
+		}
+
+		nh_res_time_set_deadline(unb_at, deadline);
+	}
+
+	nh_res_time_set_deadline(idle_at, deadline);
+	return false;
+}
+
+/* Move bucket @bucket_index of @res_table to the first underweight entry.
+ *
+ * @notify:    call the bucket notifiers before migrating
+ * @notify_nl: call nexthop_bucket_notify() after migrating
+ * @force:     passed on to the notifiers; a forced migration proceeds even
+ *             when a notifier returns an error
+ *
+ * Returns true if the bucket was migrated, false if there was no
+ * underweight entry or a non-forced migration was vetoed by a notifier.
+ */
+static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
+				  u16 bucket_index, bool notify,
+				  bool notify_nl, bool force)
+{
+	struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
+	/* Zero-initialized so that _msg is NULL, not stack garbage, when a
+	 * notifier fails without setting an extack message.
+	 */
+	struct netlink_ext_ack extack = {};
+	struct nh_grp_entry *new_nhge;
+	int err;
+
+	new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries,
+					    struct nh_grp_entry,
+					    res.uw_nh_entry);
+	if (WARN_ON_ONCE(!new_nhge))
+		/* If this function is called, "bucket" is either not
+		 * occupied, or it belongs to a next hop that is
+		 * overweight. In either case, there ought to be a
+		 * corresponding underweight next hop.
+		 */
+		return false;
+
+	if (notify) {
+		struct nh_grp_entry *old_nhge;
+
+		old_nhge = nh_res_dereference(bucket->nh_entry);
+		err = call_nexthop_res_bucket_notifiers(res_table->net,
+							res_table->nhg_id,
+							bucket_index, force,
+							old_nhge->nh,
+							new_nhge->nh, &extack);
+		if (err) {
+			pr_err_ratelimited("%s\n", extack._msg);
+			if (!force)
+				return false;
+			/* It is not possible to veto a forced replacement, so
+			 * just clear the hardware flags from the nexthop
+			 * bucket to indicate to user space that this bucket is
+			 * not correctly populated in hardware.
+			 */
+			bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
+		}
+	}
+
+	nh_res_bucket_set_nh(bucket, new_nhge);
+	nh_res_bucket_set_idle(res_table, bucket);
+
+	if (notify_nl)
+		nexthop_bucket_notify(res_table, bucket_index);
+
+	/* Once the new owner has all the buckets it wants, it no longer
+	 * belongs on the underweight list.
+	 */
+	if (nh_res_nhge_is_balanced(new_nhge))
+		list_del(&new_nhge->res.uw_nh_entry);
+	return true;
+}
+
+/* Lower bound on the delay before the upkeep work is run again. */
+#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2)
+
+/* Walk all buckets of @res_table, migrate those that should be migrated
+ * and, if the table is still unbalanced afterwards, queue the upkeep
+ * delayed work to run again. @notify and @notify_nl are passed through to
+ * nh_res_bucket_migrate().
+ */
+static void nh_res_table_upkeep(struct nh_res_table *res_table,
+				bool notify, bool notify_nl)
+{
+	unsigned long start = jiffies;
+	unsigned long resched_at;
+	unsigned long earliest;
+	unsigned long deadline;
+	u16 idx;
+
+	/* Deadline is the next time that upkeep should be run. It is the
+	 * earliest time at which one of the buckets might be migrated.
+	 * Start at the most pessimistic estimate: either unbalanced_timer
+	 * from now, or if there is none, idle_timer from now. Every time
+	 * point encountered below refines the estimate through
+	 * nh_res_time_set_deadline().
+	 */
+	deadline = start + (res_table->unbalanced_timer ? :
+			    res_table->idle_timer);
+
+	for (idx = 0; idx < res_table->num_nh_buckets; idx++) {
+		struct nh_res_bucket *bucket = &res_table->nh_buckets[idx];
+		unsigned long idle_point;
+		bool force;
+
+		if (!nh_res_bucket_should_migrate(res_table, bucket,
+						  &deadline, &force))
+			continue;
+
+		if (nh_res_bucket_migrate(res_table, idx, notify,
+					  notify_nl, force))
+			continue;
+
+		/* A driver can override the migration decision if the HW
+		 * reports that the bucket is actually not idle. Therefore
+		 * remark the bucket as busy again and update the deadline.
+		 */
+		nh_res_bucket_set_busy(bucket);
+		idle_point = nh_res_bucket_idle_point(res_table, bucket,
+						      start);
+		nh_res_time_set_deadline(idle_point, &deadline);
+	}
+
+	if (nh_res_table_is_balanced(res_table))
+		return;
+
+	/* The group is still unbalanced: schedule the next upkeep to
+	 * either the deadline computed above, or the minimum deadline,
+	 * whichever comes later. jiffies is sampled again for this.
+	 */
+	resched_at = jiffies;
+	earliest = resched_at + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL;
+	if (time_before(deadline, earliest))
+		deadline = earliest;
+
+	queue_delayed_work(system_power_efficient_wq,
+			   &res_table->upkeep_dw, deadline - resched_at);
+}
+
+/* Delayed-work handler: recover the resilient table that embeds the work
+ * item and run upkeep on it with both kinds of notification enabled.
+ */
+static void nh_res_table_upkeep_dw(struct work_struct *work)
+{
+	struct nh_res_table *res_table;
+
+	res_table = container_of(to_delayed_work(work), struct nh_res_table,
+				 upkeep_dw);
+	nh_res_table_upkeep(res_table, true, true);
+}
+
+/* Synchronously cancel the upkeep delayed work of @res_table. */
+static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
+{
+	struct delayed_work *upkeep = &res_table->upkeep_dw;
+
+	cancel_delayed_work_sync(upkeep);
+}
+
+/* Recompute how many buckets of @res_table each entry of @nhg wants, in
+ * proportion to the entry weights, and rebuild the list of underweight
+ * entries from scratch. unbalanced_since is stamped with the current
+ * jiffies when the first underweight entry is found.
+ */
+static void nh_res_group_rebalance(struct nh_group *nhg,
+				   struct nh_res_table *res_table)
+{
+	int weight_sum = 0;
+	int prev_bound = 0;
+	int cum_weight = 0;
+	int idx;
+
+	INIT_LIST_HEAD(&res_table->uw_nh_entries);
+
+	for (idx = 0; idx < nhg->num_nh; idx++)
+		weight_sum += nhg->nh_entries[idx].weight;
+
+	for (idx = 0; idx < nhg->num_nh; idx++) {
+		struct nh_grp_entry *entry = &nhg->nh_entries[idx];
+		int bound;
+
+		/* Each entry wants the difference between consecutive
+		 * rounded cumulative shares of the bucket count.
+		 */
+		cum_weight += entry->weight;
+		bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * cum_weight,
+					  weight_sum);
+		entry->res.wants_buckets = bound - prev_bound;
+		prev_bound = bound;
+
+		if (!nh_res_nhge_is_uw(entry))
+			continue;
+
+		if (list_empty(&res_table->uw_nh_entries))
+			res_table->unbalanced_since = jiffies;
+		list_add(&entry->res.uw_nh_entry, &res_table->uw_nh_entries);
+	}
+}
+
+/* Re-point every bucket of @res_table at the entry of @nhg that carries
+ * the same nexthop ID as the entry the bucket references now. Buckets with
+ * no matching entry in @nhg are marked as not occupied.
+ */
+static void nh_res_table_migrate_buckets(struct nh_res_table *res_table,
+					 struct nh_group *nhg)
+{
+	u16 slot;
+
+	for (slot = 0; slot < res_table->num_nh_buckets; slot++) {
+		struct nh_res_bucket *bucket = &res_table->nh_buckets[slot];
+		u32 id = rtnl_dereference(bucket->nh_entry)->nh->id;
+		struct nh_grp_entry *match = NULL;
+		int idx;
+
+		for (idx = 0; idx < nhg->num_nh; idx++) {
+			if (nhg->nh_entries[idx].nh->id == id) {
+				match = &nhg->nh_entries[idx];
+				break;
+			}
+		}
+
+		if (match)
+			nh_res_bucket_set_nh(bucket, match);
+		else
+			nh_res_bucket_unset_nh(bucket);
+	}
+}
+
+/* Re-home the resilient table of @oldg onto the entries of @newg, then
+ * rebalance it and run upkeep with bucket notifications enabled and
+ * netlink bucket notifications disabled.
+ */
+static void replace_nexthop_grp_res(struct nh_group *oldg,
+				    struct nh_group *newg)
+{
+	/* For NH group replacement, the new NHG might only have a stub
+	 * hash table with 0 buckets, because the number of buckets was not
+	 * specified. For NH removal, oldg and newg both reference the same
+	 * res_table. So in any case, the table to work with is
+	 * oldg->res_table.
+	 */
+	struct nh_res_table *table = rtnl_dereference(oldg->res_table);
+	unsigned long saved_since = table->unbalanced_since;
+	bool was_unbalanced = !list_empty(&table->uw_nh_entries);
+
+	nh_res_table_cancel_upkeep(table);
+	nh_res_table_migrate_buckets(table, newg);
+	nh_res_group_rebalance(newg, table);
+
+	/* Rebalancing restamps unbalanced_since; keep the earlier stamp if
+	 * the table was already unbalanced and still is.
+	 */
+	if (was_unbalanced && !nh_res_table_is_balanced(table))
+		table->unbalanced_since = saved_since;
+
+	nh_res_table_upkeep(table, true, false);
+}
+
+/* Recompute the hash-threshold upper bound of every entry of @nhg from the
+ * entry weights: each bound is the cumulative weight scaled to 2^31 and
+ * divided by the total weight (rounded to closest), minus one.
+ */
+static void nh_hthr_group_rebalance(struct nh_group *nhg)
+{
+	int weight_sum = 0;
+	int cum_weight = 0;
+	int idx;
+
+	for (idx = 0; idx < nhg->num_nh; idx++)
+		weight_sum += nhg->nh_entries[idx].weight;
+
+	for (idx = 0; idx < nhg->num_nh; idx++) {
+		struct nh_grp_entry *entry = &nhg->nh_entries[idx];
+		int bound;
+
+		cum_weight += entry->weight;
+		bound = DIV_ROUND_CLOSEST_ULL((u64)cum_weight << 31,
+					      weight_sum) - 1;
+		atomic_set(&entry->hthr.upper_bound, bound);
+	}
+}
+
+/* Remove the member described by @nhge from its parent group.
+ *
+ * If it is the last member, the parent group itself is removed. Otherwise
+ * the remaining entries are copied into the group's spare, which is then
+ * published in place of the current group. @nlinfo may be NULL, in which
+ * case no netlink notification is sent.
+ */
+static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
+				struct nl_info *nlinfo)
+{
+	struct nh_grp_entry *nhges, *new_nhges;
+	struct nexthop *nhp = nhge->nh_parent;
+	/* Zero-initialized so that _msg is NULL, not stack garbage, when a
+	 * notifier fails without setting an extack message.
+	 */
+	struct netlink_ext_ack extack = {};
+	struct nexthop *nh = nhge->nh;
+	struct nh_group *nhg, *newg;
+	int i, j, err;
+
+	WARN_ON(!nh);
+
+	nhg = rtnl_dereference(nhp->nh_grp);
+	newg = nhg->spare;
+
+	/* last entry, keep it visible and remove the parent */
+	if (nhg->num_nh == 1) {
+		remove_nexthop(net, nhp, nlinfo);
+		return;
+	}
+
+	newg->has_v4 = false;
+	newg->is_multipath = nhg->is_multipath;
+	newg->hash_threshold = nhg->hash_threshold;
+	newg->resilient = nhg->resilient;
+	newg->fdb_nh = nhg->fdb_nh;
+	newg->num_nh = nhg->num_nh;
+
+	/* copy old entries to new except the one getting removed */
+	nhges = nhg->nh_entries;
+	new_nhges = newg->nh_entries;
+	for (i = 0, j = 0; i < nhg->num_nh; ++i) {
+		struct nh_info *nhi;
+
+		/* current nexthop getting removed */
+		if (nhg->nh_entries[i].nh == nh) {
+			newg->num_nh--;
+			continue;
+		}
+
+		/* has_v4 is recomputed from the surviving members */
+		nhi = rtnl_dereference(nhges[i].nh->nh_info);
+		if (nhi->family == AF_INET)
+			newg->has_v4 = true;
+
+		/* move the member's group-list linkage to the new entry */
+		list_del(&nhges[i].nh_list);
+		new_nhges[j].nh_parent = nhges[i].nh_parent;
+		new_nhges[j].nh = nhges[i].nh;
+		new_nhges[j].weight = nhges[i].weight;
+		list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
+		j++;
+	}
+
+	if (newg->hash_threshold)
+		nh_hthr_group_rebalance(newg);
+	else if (newg->resilient)
+		replace_nexthop_grp_res(nhg, newg);
+
+	rcu_assign_pointer(nhp->nh_grp, newg);
+
+	list_del(&nhge->nh_list);
+	nexthop_put(nhge->nh);
+
+	/* Removal of a NH from a resilient group is notified through
+	 * bucket notifications.
+	 */
+	if (newg->hash_threshold) {
+		err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
+					     &extack);
+		if (err)
+			pr_err("%s\n", extack._msg);
+	}
+
+	if (nlinfo)
+		nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
+}
+
+/* Drop @nh from every group it is a member of, then wait in
+ * synchronize_net().
+ */
+static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+				       struct nl_info *nlinfo)
+{
+	struct nh_grp_entry *member, *next;
+
+	list_for_each_entry_safe(member, next, &nh->grp_list, nh_list) {
+		remove_nh_grp_entry(net, member, nlinfo);
+	}
+
+	/* make sure all see the newly published array before releasing rtnl */
+	synchronize_net();
+}
+
+/* Unlink every entry of the group @nh from its member's group list and,
+ * for a resilient group, cancel the table upkeep work. @nlinfo is not used.
+ */
+static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
+{
+	struct nh_group *grp = rcu_dereference_rtnl(nh->nh_grp);
+	int count = grp->num_nh;
+	int idx;
+
+	for (idx = 0; idx < count; idx++) {
+		struct nh_grp_entry *entry = &grp->nh_entries[idx];
+
+		if (!WARN_ON(!entry->nh))
+			list_del_init(&entry->nh_list);
+	}
+
+	if (!grp->resilient)
+		return;
+
+	nh_res_table_cancel_upkeep(rtnl_dereference(grp->res_table));
+}
+
+/* Mark the IPv4 fib_infos using @nh as dead and flush them, then delete
+ * every IPv6 route using @nh. Not called for nexthop replace.
+ */
+static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
+{
+	struct fib6_info *rt6, *next;
+	struct fib_info *fi;
+
+	if (!list_empty(&nh->fi_list)) {
+		list_for_each_entry(fi, &nh->fi_list, nh_list)
+			fi->fib_flags |= RTNH_F_DEAD;
+
+		fib_flush(net);
+	}
+
+	/* ip6_del_rt removes the entry from this list hence the _safe */
+	list_for_each_entry_safe(rt6, next, &nh->f6i_list, nh_list) {
+		bool skip_notify;
+
+		/* __ip6_del_rt does a release, so do a hold here */
+		fib6_info_hold(rt6);
+
+		skip_notify = !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode);
+		ipv6_stub->ip6_del_rt(net, rt6, skip_notify);
+	}
+}
+
+/* Detach @nh from the FIB and from the nexthop bookkeeping: a group drops
+ * its member links, a single nexthop leaves the device hash (if it has a
+ * device) and every group it belongs to.
+ */
+static void __remove_nexthop(struct net *net, struct nexthop *nh,
+			     struct nl_info *nlinfo)
+{
+	struct nh_info *info;
+
+	__remove_nexthop_fib(net, nh);
+
+	if (nh->is_group) {
+		remove_nexthop_group(nh, nlinfo);
+		return;
+	}
+
+	info = rtnl_dereference(nh->nh_info);
+	if (info->fib_nhc.nhc_dev)
+		hlist_del(&info->dev_hash);
+
+	remove_nexthop_from_groups(net, nh, nlinfo);
+}
+
+/* Delete @nh: notify listeners, erase it from the per-netns tree, send the
+ * netlink notification when @nlinfo is given, tear down its links, bump
+ * the base sequence and drop a reference with nexthop_put().
+ */
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+			   struct nl_info *nlinfo)
+{
+	struct rb_root *tree = &net->nexthop.rb_root;
+
+	call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL);
+
+	/* remove from the tree */
+	rb_erase(&nh->rb_node, tree);
+
+	if (nlinfo)
+		nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
+
+	__remove_nexthop(net, nh, nlinfo);
+	nh_base_seq_inc(net);
+
+	nexthop_put(nh);
+}
+
+/* If any FIB entries reference @nh, their dst entries need to be
+ * regenerated: flush the IPv4 route cache and update the sernum of every
+ * linked IPv6 route. If @replaced_nh is a group, additionally release the
+ * dsts held by its IPv6 members.
+ */
+static void nh_rt_cache_flush(struct net *net, struct nexthop *nh,
+			      struct nexthop *replaced_nh)
+{
+	struct nh_group *grp;
+	struct fib6_info *rt6;
+	int idx;
+
+	if (!list_empty(&nh->fi_list))
+		rt_cache_flush(net);
+
+	list_for_each_entry(rt6, &nh->f6i_list, nh_list)
+		ipv6_stub->fib6_update_sernum(net, rt6);
+
+	/* if an IPv6 group was replaced, we have to release all old
+	 * dsts to make sure all refcounts are released
+	 */
+	if (!replaced_nh->is_group)
+		return;
+
+	grp = rtnl_dereference(replaced_nh->nh_grp);
+	for (idx = 0; idx < grp->num_nh; idx++) {
+		struct nh_info *info;
+
+		info = rtnl_dereference(grp->nh_entries[idx].nh->nh_info);
+		if (info->family != AF_INET6)
+			continue;
+
+		ipv6_stub->fib6_nh_release_dsts(&info->fib6_nh);
+	}
+}
+
+/* Replace the group held by @old with the group held by @new by swapping
+ * the nh_grp pointers of the two nexthops. For resilient groups the live
+ * resilient table stays with @old and the table allocated for @new is
+ * handed to the outgoing group. Returns 0 or a negative errno.
+ */
+static int replace_nexthop_grp(struct net *net, struct nexthop *old,
+			       struct nexthop *new, const struct nh_config *cfg,
+			       struct netlink_ext_ack *extack)
+{
+	struct nh_res_table *stub_table = NULL;
+	struct nh_res_table *live_table;
+	struct nh_group *oldg, *newg;
+	int idx, err;
+
+	if (!new->is_group) {
+		NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
+		return -EINVAL;
+	}
+
+	oldg = rtnl_dereference(old->nh_grp);
+	newg = rtnl_dereference(new->nh_grp);
+
+	if (oldg->hash_threshold != newg->hash_threshold) {
+		NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type.");
+		return -EINVAL;
+	}
+
+	if (newg->hash_threshold) {
+		err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new,
+					     extack);
+		if (err)
+			return err;
+	} else if (newg->resilient) {
+		stub_table = rtnl_dereference(newg->res_table);
+		live_table = rtnl_dereference(oldg->res_table);
+
+		/* Accept if num_nh_buckets was not given, but if it was
+		 * given, demand that the value be correct.
+		 */
+		if (cfg->nh_grp_res_has_num_buckets &&
+		    live_table->num_nh_buckets !=
+		    cfg->nh_grp_res_num_buckets) {
+			NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group.");
+			return -EINVAL;
+		}
+
+		/* Emit a pre-replace notification so that listeners could veto
+		 * a potentially unsupported configuration. Otherwise,
+		 * individual bucket replacement notifications would need to be
+		 * vetoed, which is something that should only happen if the
+		 * bucket is currently active.
+		 */
+		err = call_nexthop_res_table_notifiers(net, new, extack);
+		if (err)
+			return err;
+
+		if (cfg->nh_grp_res_has_idle_timer)
+			live_table->idle_timer = cfg->nh_grp_res_idle_timer;
+		if (cfg->nh_grp_res_has_unbalanced_timer)
+			live_table->unbalanced_timer =
+				cfg->nh_grp_res_unbalanced_timer;
+
+		replace_nexthop_grp_res(oldg, newg);
+
+		/* The incoming group and its spare take over the live table. */
+		rcu_assign_pointer(newg->res_table, live_table);
+		rcu_assign_pointer(newg->spare->res_table, live_table);
+	}
+
+	/* update parents - used by nexthop code for cleanup */
+	for (idx = 0; idx < newg->num_nh; idx++)
+		newg->nh_entries[idx].nh_parent = old;
+
+	rcu_assign_pointer(old->nh_grp, newg);
+
+	/* Make sure concurrent readers are not using 'oldg' anymore. */
+	synchronize_net();
+
+	if (newg->resilient) {
+		rcu_assign_pointer(oldg->res_table, stub_table);
+		rcu_assign_pointer(oldg->spare->res_table, stub_table);
+	}
+
+	for (idx = 0; idx < oldg->num_nh; idx++)
+		oldg->nh_entries[idx].nh_parent = new;
+
+	rcu_assign_pointer(new->nh_grp, oldg);
+
+	return 0;
+}
+
+/* Recompute nhg->has_v4: true iff at least one member is an AF_INET
+ * nexthop.
+ */
+static void nh_group_v4_update(struct nh_group *nhg)
+{
+	bool seen_v4 = false;
+	int idx;
+
+	for (idx = 0; idx < nhg->num_nh; idx++) {
+		struct nh_info *info;
+
+		info = rtnl_dereference(nhg->nh_entries[idx].nh->nh_info);
+		seen_v4 = seen_v4 || info->family == AF_INET;
+	}
+
+	nhg->has_v4 = seen_v4;
+}
+
+/* Send a bucket replace notification (@oldi -> @newi) for every bucket of
+ * @res_table that references @old. If one of them fails, the buckets at
+ * lower indices are notified again with the roles of @oldi and @newi
+ * swapped, and the error is returned.
+ */
+static int replace_nexthop_single_notify_res(struct net *net,
+					     struct nh_res_table *res_table,
+					     struct nexthop *old,
+					     struct nh_info *oldi,
+					     struct nh_info *newi,
+					     struct netlink_ext_ack *extack)
+{
+	u32 group_id = res_table->nhg_id;
+	struct nh_grp_entry *entry;
+	u16 slot;
+	int err;
+
+	for (slot = 0; slot < res_table->num_nh_buckets; slot++) {
+		entry = rtnl_dereference(res_table->nh_buckets[slot].nh_entry);
+		if (entry->nh != old)
+			continue;
+
+		err = __call_nexthop_res_bucket_notifiers(net, group_id, slot,
+							  true, oldi, newi,
+							  extack);
+		if (err)
+			goto rollback;
+	}
+
+	return 0;
+
+rollback:
+	while (slot-- > 0) {
+		entry = rtnl_dereference(res_table->nh_buckets[slot].nh_entry);
+		if (entry->nh != old)
+			continue;
+
+		__call_nexthop_res_bucket_notifiers(net, group_id, slot, true,
+						    newi, oldi, extack);
+	}
+	return err;
+}
+
+/* Notify listeners that member @old of group @group_nh had its nh_info
+ * changed from @oldi to @newi. Hash-threshold groups get a whole-group
+ * replace notification, resilient groups get per-bucket notifications.
+ * Returns -EINVAL for a group of neither type.
+ */
+static int replace_nexthop_single_notify(struct net *net,
+					 struct nexthop *group_nh,
+					 struct nexthop *old,
+					 struct nh_info *oldi,
+					 struct nh_info *newi,
+					 struct netlink_ext_ack *extack)
+{
+	struct nh_group *grp = rtnl_dereference(group_nh->nh_grp);
+
+	if (grp->hash_threshold)
+		return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE,
+					      group_nh, extack);
+
+	if (grp->resilient)
+		return replace_nexthop_single_notify_res(net,
+				rtnl_dereference(grp->res_table),
+				old, oldi, newi, extack);
+
+	return -EINVAL;
+}
+
+/* Replace the single nexthop @old with the definition carried by @new by
+ * swapping their nh_info, protocol and flags, then notify every group that
+ * uses @old. On a notification failure the swap is undone and the groups
+ * already notified are notified again with the original definition.
+ * Returns 0 or a negative errno.
+ */
+static int replace_nexthop_single(struct net *net, struct nexthop *old,
+				  struct nexthop *new,
+				  struct netlink_ext_ack *extack)
+{
+	u8 old_protocol, old_nh_flags;
+	struct nh_info *oldi, *newi;
+	struct nh_grp_entry *nhge;
+	int err;
+
+	if (new->is_group) {
+		NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
+		return -EINVAL;
+	}
+
+	/* A group's fdb_nh flag is what route validation and path selection
+	 * consult; changing the FDB status of a member behind the group's
+	 * back would leave that flag inconsistent with its members.
+	 */
+	if (!list_empty(&old->grp_list) &&
+	    rtnl_dereference(new->nh_info)->fdb_nh !=
+	    rtnl_dereference(old->nh_info)->fdb_nh) {
+		NL_SET_ERR_MSG(extack, "Cannot change nexthop FDB status while in a group");
+		return -EINVAL;
+	}
+
+	err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
+	if (err)
+		return err;
+
+	/* Hardware flags were set on 'old' as 'new' is not in the red-black
+	 * tree. Therefore, inherit the flags from 'old' to 'new'.
+	 */
+	new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);
+
+	oldi = rtnl_dereference(old->nh_info);
+	newi = rtnl_dereference(new->nh_info);
+
+	newi->nh_parent = old;
+	oldi->nh_parent = new;
+
+	/* remembered so the swap can be undone on notification failure */
+	old_protocol = old->protocol;
+	old_nh_flags = old->nh_flags;
+
+	old->protocol = new->protocol;
+	old->nh_flags = new->nh_flags;
+
+	rcu_assign_pointer(old->nh_info, newi);
+	rcu_assign_pointer(new->nh_info, oldi);
+
+	/* Send a replace notification for all the groups using the nexthop. */
+	list_for_each_entry(nhge, &old->grp_list, nh_list) {
+		struct nexthop *nhp = nhge->nh_parent;
+
+		err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
+						    extack);
+		if (err)
+			goto err_notify;
+	}
+
+	/* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
+	 * update IPv4 indication in all the groups using the nexthop.
+	 */
+	if (oldi->family == AF_INET && newi->family == AF_INET6) {
+		list_for_each_entry(nhge, &old->grp_list, nh_list) {
+			struct nexthop *nhp = nhge->nh_parent;
+			struct nh_group *nhg;
+
+			nhg = rtnl_dereference(nhp->nh_grp);
+			nh_group_v4_update(nhg);
+		}
+	}
+
+	return 0;
+
+err_notify:
+	rcu_assign_pointer(new->nh_info, newi);
+	rcu_assign_pointer(old->nh_info, oldi);
+	old->nh_flags = old_nh_flags;
+	old->protocol = old_protocol;
+	oldi->nh_parent = old;
+	newi->nh_parent = new;
+	/* walk back over the groups notified before the failing one */
+	list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
+		struct nexthop *nhp = nhge->nh_parent;
+
+		replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL);
+	}
+	call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
+	return err;
+}
+
+/* Notify about every FIB entry linked to @nh: IPv4 entries through a
+ * single fib_info_notify_update() pass over marked fib_infos, IPv6 entries
+ * one by one through fib6_rt_update().
+ */
+static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
+				     struct nl_info *info)
+{
+	struct fib6_info *rt6;
+	struct fib_info *fi;
+
+	if (list_empty(&nh->fi_list))
+		goto notify_v6;
+
+	/* expectation is a few fib_info per nexthop and then
+	 * a lot of routes per fib_info. So mark the fib_info
+	 * and then walk the fib tables once
+	 */
+	list_for_each_entry(fi, &nh->fi_list, nh_list)
+		fi->nh_updated = true;
+
+	fib_info_notify_update(net, info);
+
+	list_for_each_entry(fi, &nh->fi_list, nh_list)
+		fi->nh_updated = false;
+
+notify_v6:
+	list_for_each_entry(rt6, &nh->f6i_list, nh_list)
+		ipv6_stub->fib6_rt_update(net, rt6, info);
+}
+
+/* Send RTM_NEWROUTE with the REPLACE flag set for all FIB entries linked
+ * to this nexthop, and for all FIB entries linked to the groups that the
+ * nexthop is a member of.
+ */
+static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
+				   struct nl_info *info)
+{
+	struct nh_grp_entry *member;
+
+	__nexthop_replace_notify(net, nh, info);
+
+	list_for_each_entry(member, &nh->grp_list, nh_list) {
+		struct nexthop *group = member->nh_parent;
+
+		__nexthop_replace_notify(net, group, info);
+	}
+}
+
+/* Replace the definition of the existing nexthop @old with @new. All FIB
+ * entries using @old, directly or through a group, are validated against
+ * @new first. On success @new is torn down and a reference to it is
+ * dropped. Returns 0 or a negative errno.
+ */
+static int replace_nexthop(struct net *net, struct nexthop *old,
+			   struct nexthop *new, const struct nh_config *cfg,
+			   struct netlink_ext_ack *extack)
+{
+	struct nh_grp_entry *member;
+	bool blackhole = false;
+	int err;
+
+	/* check that existing FIB entries are ok with the
+	 * new nexthop definition
+	 */
+	err = fib_check_nh_list(old, new, extack);
+	if (err)
+		return err;
+
+	err = fib6_check_nh_list(old, new, extack);
+	if (err)
+		return err;
+
+	if (!new->is_group)
+		blackhole = rtnl_dereference(new->nh_info)->reject_nh;
+
+	list_for_each_entry(member, &old->grp_list, nh_list) {
+		struct nexthop *group = member->nh_parent;
+
+		/* if new nexthop is a blackhole, any groups using this
+		 * nexthop cannot have more than 1 path
+		 */
+		if (blackhole && nexthop_num_path(group) > 1) {
+			NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
+			return -EINVAL;
+		}
+
+		err = fib_check_nh_list(group, new, extack);
+		if (err)
+			return err;
+
+		err = fib6_check_nh_list(group, new, extack);
+		if (err)
+			return err;
+	}
+
+	err = old->is_group ?
+	      replace_nexthop_grp(net, old, new, cfg, extack) :
+	      replace_nexthop_single(net, old, new, extack);
+	if (err)
+		return err;
+
+	nh_rt_cache_flush(net, old, new);
+
+	__remove_nexthop(net, new, NULL);
+	nexthop_put(new);
+
+	return 0;
+}
+
+/* Insert @new_nh into the per-netns nexthop tree, or replace the existing
+ * nexthop with the same id when NLM_F_REPLACE is set. Returns 0 on
+ * success, -EEXIST if the id exists and this is not a replace, or another
+ * negative errno. Called with rtnl_lock held.
+ */
+static int insert_nexthop(struct net *net, struct nexthop *new_nh,
+			  struct nh_config *cfg, struct netlink_ext_ack *extack)
+{
+	struct rb_root *root = &net->nexthop.rb_root;
+	struct rb_node **link = &root->rb_node;
+	bool replace = cfg->nlflags & NLM_F_REPLACE;
+	bool create = cfg->nlflags & NLM_F_CREATE;
+	struct rb_node *parent = NULL;
+	bool replace_notify = false;
+	u32 new_id = new_nh->id;
+	int rc = -EEXIST;
+
+	while (*link) {
+		struct nexthop *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct nexthop, rb_node);
+
+		if (new_id < cur->id) {
+			link = &parent->rb_left;
+			continue;
+		}
+
+		if (new_id > cur->id) {
+			link = &parent->rb_right;
+			continue;
+		}
+
+		/* id already exists: without replace, rc stays -EEXIST */
+		if (replace) {
+			rc = replace_nexthop(net, cur, new_nh, cfg, extack);
+			if (!rc) {
+				new_nh = cur; /* send notification with old nh */
+				replace_notify = true;
+			}
+		}
+		goto out;
+	}
+
+	if (replace && !create) {
+		NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
+		rc = -ENOENT;
+		goto out;
+	}
+
+	if (new_nh->is_group) {
+		struct nh_group *grp = rtnl_dereference(new_nh->nh_grp);
+
+		if (grp->resilient) {
+			struct nh_res_table *table;
+
+			table = rtnl_dereference(grp->res_table);
+
+			/* Not passing the number of buckets is OK when
+			 * replacing, but not when creating a new group.
+			 */
+			if (!cfg->nh_grp_res_has_num_buckets) {
+				NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion");
+				rc = -EINVAL;
+				goto out;
+			}
+
+			nh_res_group_rebalance(grp, table);
+
+			/* Do not send bucket notifications, we do full
+			 * notification below.
+			 */
+			nh_res_table_upkeep(table, false, false);
+		}
+	}
+
+	rb_link_node_rcu(&new_nh->rb_node, parent, link);
+	rb_insert_color(&new_nh->rb_node, root);
+
+	/* The initial insertion is a full notification for hash-threshold as
+	 * well as resilient groups.
+	 */
+	rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
+	if (rc)
+		rb_erase(&new_nh->rb_node, root);
+
+out:
+	if (rc)
+		return rc;
+
+	nh_base_seq_inc(net);
+	nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
+	if (replace_notify &&
+	    READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode))
+		nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
+
+	return 0;
+}
+
+/* rtnl */
+/* Remove all nexthops tied to @dev. Reject nexthops are kept when @event
+ * is NETDEV_DOWN or NETDEV_CHANGE.
+ */
+static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
+{
+	bool keep_reject = event == NETDEV_DOWN || event == NETDEV_CHANGE;
+	unsigned int slot = nh_dev_hashfn(dev->ifindex);
+	struct net *net = dev_net(dev);
+	struct hlist_head *chain = &net->nexthop.devhash[slot];
+	struct hlist_node *next;
+	struct nh_info *info;
+
+	hlist_for_each_entry_safe(info, next, chain, dev_hash) {
+		/* other devices can share the same hash chain */
+		if (info->fib_nhc.nhc_dev != dev)
+			continue;
+
+		if (keep_reject && info->reject_nh)
+			continue;
+
+		remove_nexthop(net, info->nh_parent, NULL);
+	}
+}
+
+/* rtnl; called when net namespace is deleted. Removes nexthops one at a
+ * time, always taking the first node of the tree, until the tree is empty.
+ */
+static void flush_all_nexthops(struct net *net)
+{
+	struct rb_root *root = &net->nexthop.rb_root;
+	struct rb_node *first;
+
+	for (;;) {
+		first = rb_first(root);
+		if (!first)
+			break;
+
+		remove_nexthop(net, rb_entry(first, struct nexthop, rb_node),
+			       NULL);
+		cond_resched();
+	}
+}
+
+static struct nexthop *nexthop_create_group(struct net *net,
+					    struct nh_config *cfg)
+{	/* Build a group nexthop from cfg->nh_grp; returns it or an ERR_PTR() */
+	struct nlattr *grps_attr = cfg->nh_grp;
+	struct nexthop_grp *entry = nla_data(grps_attr);
+	u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
+	struct nh_group *nhg;
+	struct nexthop *nh;
+	int err;
+	int i;
+
+	if (WARN_ON(!num_nh))
+		return ERR_PTR(-EINVAL);
+
+	nh = nexthop_alloc();
+	if (!nh)
+		return ERR_PTR(-ENOMEM);
+
+	nh->is_group = 1;
+
+	nhg = nexthop_grp_alloc(num_nh);
+	if (!nhg) {
+		kfree(nh);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* spare group used for removals */
+	nhg->spare = nexthop_grp_alloc(num_nh);
+	if (!nhg->spare) {
+		kfree(nhg);
+		kfree(nh);
+		return ERR_PTR(-ENOMEM);
+	}
+	nhg->spare->spare = nhg;	/* the two groups point at each other */
+
+	for (i = 0; i < nhg->num_nh; ++i) {
+		struct nexthop *nhe;
+		struct nh_info *nhi;
+
+		nhe = nexthop_find_by_id(net, entry[i].id);
+		if (!nhe || !nexthop_get(nhe)) {	/* never hand a failed lookup to nexthop_get() */
+			err = -ENOENT;
+			goto out_no_nh;
+		}
+
+		nhi = rtnl_dereference(nhe->nh_info);
+		if (nhi->family == AF_INET)
+			nhg->has_v4 = true;
+
+		nhg->nh_entries[i].nh = nhe;
+		nhg->nh_entries[i].weight = entry[i].weight + 1;	/* stored weight is the attribute value plus one */
+		list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
+		nhg->nh_entries[i].nh_parent = nh;
+	}
+
+	if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
+		nhg->hash_threshold = 1;
+		nhg->is_multipath = true;
+	} else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
+		struct nh_res_table *res_table;
+
+		res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg);
+		if (!res_table) {
+			err = -ENOMEM;
+			goto out_no_nh;	/* i == num_nh here, so every entry is undone */
+		}
+
+		rcu_assign_pointer(nhg->spare->res_table, res_table);
+		rcu_assign_pointer(nhg->res_table, res_table);
+		nhg->resilient = true;
+		nhg->is_multipath = true;
+	}
+
+	WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1);	/* exactly one group type must be set */
+
+	if (nhg->hash_threshold)
+		nh_hthr_group_rebalance(nhg);
+
+	if (cfg->nh_fdb)
+		nhg->fdb_nh = 1;
+
+	rcu_assign_pointer(nh->nh_grp, nhg);
+
+	return nh;
+
+out_no_nh:
+	for (i--; i >= 0; --i) {	/* unwind only the entries linked so far */
+		list_del(&nhg->nh_entries[i].nh_list);
+		nexthop_put(nhg->nh_entries[i].nh);
+	}
+
+	kfree(nhg->spare);
+	kfree(nhg);
+	kfree(nh);
+
+	return ERR_PTR(err);
+}
+
+static int nh_create_ipv4(struct net *net, struct nexthop *nh,
+			  struct nh_info *nhi, struct nh_config *cfg,
+			  struct netlink_ext_ack *extack)
+{	/* Initialise nhi->fib_nh from @cfg; returns 0 or the error of the failing helper */
+	struct fib_nh *fib_nh = &nhi->fib_nh;
+	struct fib_config fib_cfg = {
+		.fc_oif   = cfg->nh_ifindex,
+		.fc_gw4   = cfg->gw.ipv4,
+		.fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
+		.fc_flags = cfg->nh_flags,
+		.fc_nlinfo = cfg->nlinfo,
+		.fc_encap = cfg->nh_encap,
+		.fc_encap_type = cfg->nh_encap_type,
+	};
+	u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);	/* no device: use the main table */
+	int err;
+
+	err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
+	if (err) {
+		fib_nh_release(net, fib_nh);	/* release is called even though init failed */
+		goto out;
+	}
+
+	if (nhi->fdb_nh)	/* fdb nexthops skip fib_check_nh() below */
+		goto out;
+
+	/* sets nh_dev if successful */
+	err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
+	if (!err) {
+		nh->nh_flags = fib_nh->fib_nh_flags;	/* nexthop flags now mirror the fib_nh flags */
+		fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
+					  !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1);
+	} else {
+		fib_nh_release(net, fib_nh);
+	}
+out:
+	return err;
+}
+
+static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
+			  struct nh_info *nhi, struct nh_config *cfg,
+			  struct netlink_ext_ack *extack)
+{	/* Initialise nhi->fib6_nh from @cfg through the ipv6_stub hooks; returns 0 or an error */
+	struct fib6_nh *fib6_nh = &nhi->fib6_nh;
+	struct fib6_config fib6_cfg = {
+		.fc_table = l3mdev_fib_table(cfg->dev),
+		.fc_ifindex = cfg->nh_ifindex,
+		.fc_gateway = cfg->gw.ipv6,
+		.fc_flags = cfg->nh_flags,
+		.fc_nlinfo = cfg->nlinfo,
+		.fc_encap = cfg->nh_encap,
+		.fc_encap_type = cfg->nh_encap_type,
+		.fc_is_fdb = cfg->nh_fdb,
+	};
+	int err;
+
+	if (!ipv6_addr_any(&cfg->gw.ipv6))	/* a non-zero gateway address marks a gateway route */
+		fib6_cfg.fc_flags |= RTF_GATEWAY;
+
+	/* sets nh_dev if successful */
+	err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
+				      extack);
+	if (err) {
+		/* IPv6 is not enabled, don't call fib6_nh_release */
+		if (err == -EAFNOSUPPORT)
+			goto out;
+		ipv6_stub->fib6_nh_release(fib6_nh);
+	} else {
+		nh->nh_flags = fib6_nh->fib_nh_flags;	/* nexthop flags now mirror the fib6_nh flags */
+	}
+out:
+	return err;
+}
+
+static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
+				      struct netlink_ext_ack *extack)
+{	/* Allocate a single (non-group) nexthop described by @cfg; returns it or an ERR_PTR() */
+	struct nh_info *nhi;
+	struct nexthop *nh;
+	int err = 0;
+
+	nh = nexthop_alloc();
+	if (!nh)
+		return ERR_PTR(-ENOMEM);
+
+	nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
+	if (!nhi) {
+		kfree(nh);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nh->nh_flags = cfg->nh_flags;
+	nh->net = net;
+
+	nhi->nh_parent = nh;
+	nhi->family = cfg->nh_family;
+	nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
+
+	if (cfg->nh_fdb)
+		nhi->fdb_nh = 1;
+
+	if (cfg->nh_blackhole) {
+		nhi->reject_nh = 1;
+		cfg->nh_ifindex = net->loopback_dev->ifindex;	/* blackhole nexthops are bound to the loopback device */
+	}
+
+	switch (cfg->nh_family) {	/* other families fall through with err == 0 */
+	case AF_INET:
+		err = nh_create_ipv4(net, nh, nhi, cfg, extack);
+		break;
+	case AF_INET6:
+		err = nh_create_ipv6(net, nh, nhi, cfg, extack);
+		break;
+	}
+
+	if (err) {
+		kfree(nhi);
+		kfree(nh);
+		return ERR_PTR(err);
+	}
+
+	/* add the entry to the device based hash */
+	if (!nhi->fdb_nh)
+		nexthop_devhash_add(net, nhi);
+
+	rcu_assign_pointer(nh->nh_info, nhi);
+
+	return nh;
+}
+
+/* called with rtnl lock held; creates the nexthop or group from @cfg and inserts it, returning it or an ERR_PTR() */
+static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
+				   struct netlink_ext_ack *extack)
+{
+	struct nexthop *nh;
+	int err;
+
+	if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
+		NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!cfg->nh_id) {	/* no id supplied: pick an unused one */
+		cfg->nh_id = nh_find_unused_id(net);
+		if (!cfg->nh_id) {
+			NL_SET_ERR_MSG(extack, "No unused id");
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	if (cfg->nh_grp)
+		nh = nexthop_create_group(net, cfg);
+	else
+		nh = nexthop_create(net, cfg, extack);
+
+	if (IS_ERR(nh))
+		return nh;
+
+	refcount_set(&nh->refcnt, 1);	/* initial reference, dropped below if insertion fails */
+	nh->id = cfg->nh_id;
+	nh->protocol = cfg->nh_protocol;
+	nh->net = net;
+
+	err = insert_nexthop(net, nh, cfg, extack);
+	if (err) {
+		__remove_nexthop(net, nh, NULL);
+		nexthop_put(nh);
+		nh = ERR_PTR(err);
+	}
+
+	return nh;
+}
+
+/*
+ * Read an optional timer attribute given in clock_t units.  When @attr is
+ * absent, *timer_p gets @fallback and *has_p is false.  Otherwise the value is
+ * converted to jiffies and *has_p is true.  Returns 0, or -EINVAL when the
+ * conversion yields ~0UL; the outputs are left untouched in that case.
+ */
+static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback,
+			    unsigned long *timer_p, bool *has_p,
+			    struct netlink_ext_ack *extack)
+{
+	unsigned long ticks = fallback;
+
+	if (attr) {
+		ticks = clock_t_to_jiffies(nla_get_u32(attr));
+		if (ticks == ~0UL) {
+			NL_SET_ERR_MSG(extack, "Timer value too large");
+			return -EINVAL;
+		}
+	}
+
+	*timer_p = ticks;
+	*has_p = attr != NULL;
+	return 0;
+}
+
+static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg,
+				    struct netlink_ext_ack *extack)
+{	/* Fill the resilient-group fields of @cfg from the nested @res attribute, which may be NULL */
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {};	/* zeroed so a missing @res reads as "no attributes" */
+	int err;
+
+	if (res) {
+		err = nla_parse_nested(tb,
+				       ARRAY_SIZE(rtm_nh_res_policy_new) - 1,
+				       res, rtm_nh_res_policy_new, extack);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NHA_RES_GROUP_BUCKETS]) {
+		cfg->nh_grp_res_num_buckets =
+			nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]);
+		cfg->nh_grp_res_has_num_buckets = true;
+		if (!cfg->nh_grp_res_num_buckets) {
+			NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0");
+			return -EINVAL;
+		}
+	}
+
+	err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER],
+			       NH_RES_DEFAULT_IDLE_TIMER,
+			       &cfg->nh_grp_res_idle_timer,
+			       &cfg->nh_grp_res_has_idle_timer,
+			       extack);
+	if (err)
+		return err;
+
+	return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER],
+				NH_RES_DEFAULT_UNBALANCED_TIMER,
+				&cfg->nh_grp_res_unbalanced_timer,
+				&cfg->nh_grp_res_has_unbalanced_timer,
+				extack);
+}
+
+static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
+			    struct nlmsghdr *nlh, struct nh_config *cfg,
+			    struct netlink_ext_ack *extack)
+{	/* Validate an RTM_NEWNEXTHOP request and translate it into @cfg; returns 0 or a negative errno */
+	struct nhmsg *nhm = nlmsg_data(nlh);
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*nhm), tb,
+			  ARRAY_SIZE(rtm_nh_policy_new) - 1,
+			  rtm_nh_policy_new, extack);
+	if (err < 0)
+		return err;
+
+	err = -EINVAL;	/* default for every "goto out" until a check sets something else */
+	if (nhm->resvd || nhm->nh_scope) {
+		NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
+		goto out;
+	}
+	if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
+		NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
+		goto out;
+	}
+
+	switch (nhm->nh_family) {
+	case AF_INET:
+	case AF_INET6:
+		break;
+	case AF_UNSPEC:
+		if (tb[NHA_GROUP])	/* AF_UNSPEC is only accepted for groups */
+			break;
+		fallthrough;
+	default:
+		NL_SET_ERR_MSG(extack, "Invalid address family");
+		goto out;
+	}
+
+	memset(cfg, 0, sizeof(*cfg));	/* @cfg is not initialised on the error paths above */
+	cfg->nlflags = nlh->nlmsg_flags;
+	cfg->nlinfo.portid = NETLINK_CB(skb).portid;
+	cfg->nlinfo.nlh = nlh;
+	cfg->nlinfo.nl_net = net;
+
+	cfg->nh_family = nhm->nh_family;
+	cfg->nh_protocol = nhm->nh_protocol;
+	cfg->nh_flags = nhm->nh_flags;
+
+	if (tb[NHA_ID])
+		cfg->nh_id = nla_get_u32(tb[NHA_ID]);
+
+	if (tb[NHA_FDB]) {
+		if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
+		    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
+			NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
+			goto out;
+		}
+		if (nhm->nh_flags) {
+			NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
+			goto out;
+		}
+		cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
+	}
+
+	if (tb[NHA_GROUP]) {
+		if (nhm->nh_family != AF_UNSPEC) {
+			NL_SET_ERR_MSG(extack, "Invalid family for group");
+			goto out;
+		}
+		cfg->nh_grp = tb[NHA_GROUP];
+
+		cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;	/* default when NHA_GROUP_TYPE is absent */
+		if (tb[NHA_GROUP_TYPE])
+			cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
+
+		if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
+			NL_SET_ERR_MSG(extack, "Invalid group type");
+			goto out;
+		}
+		err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb),
+					  cfg->nh_grp_type, extack);
+		if (err)
+			goto out;
+
+		if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES)
+			err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
+						       cfg, extack);
+
+		/* no other attributes should be set */
+		goto out;
+	}
+
+	if (tb[NHA_BLACKHOLE]) {
+		if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
+		    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
+			NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
+			goto out;
+		}
+
+		cfg->nh_blackhole = 1;
+		err = 0;
+		goto out;
+	}
+
+	if (!cfg->nh_fdb && !tb[NHA_OIF]) {
+		NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
+		goto out;
+	}
+
+	if (!cfg->nh_fdb && tb[NHA_OIF]) {
+		cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
+		if (cfg->nh_ifindex)
+			cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
+
+		if (!cfg->dev) {
+			NL_SET_ERR_MSG(extack, "Invalid device index");
+			goto out;
+		} else if (!(cfg->dev->flags & IFF_UP)) {
+			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+			err = -ENETDOWN;
+			goto out;
+		} else if (!netif_carrier_ok(cfg->dev)) {
+			NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
+			err = -ENETDOWN;
+			goto out;
+		}
+	}
+
+	err = -EINVAL;	/* restate the default for the remaining checks */
+	if (tb[NHA_GATEWAY]) {
+		struct nlattr *gwa = tb[NHA_GATEWAY];
+
+		switch (cfg->nh_family) {	/* the gateway length must match the address family */
+		case AF_INET:
+			if (nla_len(gwa) != sizeof(u32)) {
+				NL_SET_ERR_MSG(extack, "Invalid gateway");
+				goto out;
+			}
+			cfg->gw.ipv4 = nla_get_be32(gwa);
+			break;
+		case AF_INET6:
+			if (nla_len(gwa) != sizeof(struct in6_addr)) {
+				NL_SET_ERR_MSG(extack, "Invalid gateway");
+				goto out;
+			}
+			cfg->gw.ipv6 = nla_get_in6_addr(gwa);
+			break;
+		default:
+			NL_SET_ERR_MSG(extack,
+				       "Unknown address family for gateway");
+			goto out;
+		}
+	} else {
+		/* device only nexthop (no gateway) */
+		if (cfg->nh_flags & RTNH_F_ONLINK) {
+			NL_SET_ERR_MSG(extack,
+				       "ONLINK flag can not be set for nexthop without a gateway");
+			goto out;
+		}
+	}
+
+	if (tb[NHA_ENCAP]) {	/* NHA_ENCAP and NHA_ENCAP_TYPE must come as a pair */
+		cfg->nh_encap = tb[NHA_ENCAP];
+
+		if (!tb[NHA_ENCAP_TYPE]) {
+			NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
+			goto out;
+		}
+
+		cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
+		err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
+		if (err < 0)
+			goto out;
+
+	} else if (tb[NHA_ENCAP_TYPE]) {
+		NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
+		goto out;
+	}
+
+
+	err = 0;
+out:
+	return err;
+}
+
+/* rtnl; RTM_NEWNEXTHOP handler: parse the request, then add the nexthop */
+static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nexthop *created;
+	struct nh_config cfg;
+	int rc;
+
+	rc = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
+	if (rc)
+		return rc;
+
+	/* nexthop_add() hands back either the nexthop or an ERR_PTR(). */
+	created = nexthop_add(net, &cfg, extack);
+
+	return IS_ERR(created) ? PTR_ERR(created) : 0;
+}
+
+static int __nh_valid_get_del_req(const struct nlmsghdr *nlh,
+				  struct nlattr **tb, u32 *id,
+				  struct netlink_ext_ack *extack)
+{	/* Check the header of a get/del request and extract the mandatory non-zero NHA_ID into *id */
+	struct nhmsg *nhm = nlmsg_data(nlh);
+
+	if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {	/* these header fields must be zero */
+		NL_SET_ERR_MSG(extack, "Invalid values in header");
+		return -EINVAL;
+	}
+
+	if (!tb[NHA_ID]) {
+		NL_SET_ERR_MSG(extack, "Nexthop id is missing");
+		return -EINVAL;
+	}
+
+	*id = nla_get_u32(tb[NHA_ID]);	/* note: *id is written even when the value is then rejected */
+	if (!(*id)) {
+		NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nh_valid_get_del_req(const struct nlmsghdr *nlh, u32 *id,
+				struct netlink_ext_ack *extack)
+{	/* Parse a get/del request against rtm_nh_policy_get, then validate it and fetch the id */
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)];
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+			  ARRAY_SIZE(rtm_nh_policy_get) - 1,
+			  rtm_nh_policy_get, extack);
+	if (err < 0)
+		return err;
+
+	return __nh_valid_get_del_req(nlh, tb, id, extack);
+}
+
+/* rtnl; RTM_DELNEXTHOP handler: remove the nexthop named by NHA_ID, -ENOENT if there is none */
+static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nl_info nlinfo = {	/* identifies the requester for remove_nexthop() */
+		.nlh = nlh,
+		.nl_net = net,
+		.portid = NETLINK_CB(skb).portid,
+	};
+	struct nexthop *nh;
+	int err;
+	u32 id;
+
+	err = nh_valid_get_del_req(nlh, &id, extack);
+	if (err)
+		return err;
+
+	nh = nexthop_find_by_id(net, id);
+	if (!nh)
+		return -ENOENT;
+
+	remove_nexthop(net, nh, &nlinfo);
+
+	return 0;
+}
+
+/* rtnl; RTM_GETNEXTHOP handler: unicast an RTM_NEWNEXTHOP message describing the requested id back to the sender */
+static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct sk_buff *skb = NULL;
+	struct nexthop *nh;
+	int err;
+	u32 id;
+
+	err = nh_valid_get_del_req(nlh, &id, extack);
+	if (err)
+		return err;
+
+	err = -ENOBUFS;
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);	/* reply buffer, allocated before the lookup */
+	if (!skb)
+		goto out;
+
+	err = -ENOENT;
+	nh = nexthop_find_by_id(net, id);
+	if (!nh)
+		goto errout_free;
+
+	err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
+			   nlh->nlmsg_seq, 0);
+	if (err < 0) {
+		WARN_ON(err == -EMSGSIZE);	/* a single nexthop is expected to fit in the reply */
+		goto errout_free;
+	}
+
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);	/* skb is not freed here after this call */
+out:
+	return err;
+errout_free:
+	kfree_skb(skb);
+	goto out;
+}
+
+struct nh_dump_filter {	/* selection criteria parsed from a dump request */
+	u32 nh_id;		/* bucket dumps: only this group id, 0 = walk all */
+	int dev_idx;		/* NHA_OIF ifindex to match, 0 = any */
+	int master_idx;		/* NHA_MASTER ifindex to match, 0 = any */
+	bool group_filter;	/* NHA_GROUPS flag: report groups only */
+	bool fdb_filter;	/* NHA_FDB flag as parsed from the request */
+	u32 res_bucket_nh_id;	/* bucket dumps: only buckets using this nexthop id, 0 = any */
+};
+
+static bool nh_dump_filtered(struct nexthop *nh,
+			     struct nh_dump_filter *filter, u8 family)
+{	/* Returns true when @nh must be left out of the dump under @filter and @family */
+	const struct net_device *dev;
+	const struct nh_info *nhi;
+
+	if (filter->group_filter && !nh->is_group)
+		return true;
+
+	if (!filter->dev_idx && !filter->master_idx && !family)	/* no device or family criteria: keep */
+		return false;
+
+	if (nh->is_group)	/* device and family criteria never match a group */
+		return true;
+
+	nhi = rtnl_dereference(nh->nh_info);
+	if (family && nhi->family != family)
+		return true;
+
+	dev = nhi->fib_nhc.nhc_dev;
+	if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx))
+		return true;
+
+	if (filter->master_idx) {
+		struct net_device *master;
+
+		if (!dev)
+			return true;
+
+		master = netdev_master_upper_dev_get((struct net_device *)dev);	/* cast drops const for the helper's signature */
+		if (!master || master->ifindex != filter->master_idx)
+			return true;
+	}
+
+	return false;	/* NOTE(review): filter->fdb_filter is not consulted here - confirm that is intended */
+}
+
+static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb,
+			       struct nh_dump_filter *filter,
+			       struct netlink_ext_ack *extack)
+{	/* Copy the device, master, groups and fdb criteria from @tb into @filter and check the header */
+	struct nhmsg *nhm;
+	u32 idx;
+
+	if (tb[NHA_OIF]) {
+		idx = nla_get_u32(tb[NHA_OIF]);
+		if (idx > INT_MAX) {	/* filter->dev_idx is an int */
+			NL_SET_ERR_MSG(extack, "Invalid device index");
+			return -EINVAL;
+		}
+		filter->dev_idx = idx;
+	}
+	if (tb[NHA_MASTER]) {
+		idx = nla_get_u32(tb[NHA_MASTER]);
+		if (idx > INT_MAX) {	/* filter->master_idx is an int */
+			NL_SET_ERR_MSG(extack, "Invalid master device index");
+			return -EINVAL;
+		}
+		filter->master_idx = idx;
+	}
+	filter->group_filter = nla_get_flag(tb[NHA_GROUPS]);
+	filter->fdb_filter = nla_get_flag(tb[NHA_FDB]);
+
+	nhm = nlmsg_data(nlh);
+	if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {	/* nh_family is the only header field left unchecked */
+		NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nh_valid_dump_req(const struct nlmsghdr *nlh,
+			     struct nh_dump_filter *filter,
+			     struct netlink_callback *cb)
+{	/* Parse a nexthop dump request against rtm_nh_policy_dump and fill @filter */
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)];
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+			  ARRAY_SIZE(rtm_nh_policy_dump) - 1,
+			  rtm_nh_policy_dump, cb->extack);
+	if (err < 0)
+		return err;
+
+	return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
+}
+
+struct rtm_dump_nh_ctx {	/* dump state kept in cb->ctx */
+	u32 idx;	/* id of the nexthop most recently handed to the walk callback */
+};
+
+static struct rtm_dump_nh_ctx *
+rtm_dump_nh_ctx(struct netlink_callback *cb)
+{	/* View cb->ctx as the nexthop dump context */
+	struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx;
+
+	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));	/* the context must fit in cb->ctx */
+	return ctx;
+}
+
+static int rtm_dump_walk_nexthops(struct sk_buff *skb,
+				  struct netlink_callback *cb,
+				  struct rb_root *root,
+				  struct rtm_dump_nh_ctx *ctx,
+				  int (*nh_cb)(struct sk_buff *skb,
+					       struct netlink_callback *cb,
+					       struct nexthop *nh, void *data),
+				  void *data)
+{
+	u32 s_idx = ctx->idx;	/* ids are u32; keep the resume point unsigned */
+	struct rb_node *node;
+	int err;
+
+	/* Call @nh_cb on every nexthop whose id is >= the saved resume id. */
+	for (node = rb_first(root); node; node = rb_next(node)) {
+		struct nexthop *nh;
+
+		nh = rb_entry(node, struct nexthop, rb_node);
+		if (nh->id < s_idx)
+			continue;
+
+		ctx->idx = nh->id;	/* recorded before the callback so a failed entry is retried */
+		err = nh_cb(skb, cb, nh, data);
+		if (err)
+			return err;	/* first callback error ends the walk */
+	}
+
+	return 0;
+}
+
+static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb,
+			       struct nexthop *nh, void *data)
+{	/* Walk callback: emit @nh as RTM_NEWNEXTHOP unless the filter in @data excludes it */
+	struct nhmsg *nhm = nlmsg_data(cb->nlh);
+	struct nh_dump_filter *filter = data;
+
+	if (nh_dump_filtered(nh, filter, nhm->nh_family))
+		return 0;	/* filtered out: not an error, the walk continues */
+
+	return nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
+			    NETLINK_CB(cb->skb).portid,
+			    cb->nlh->nlmsg_seq, NLM_F_MULTI);
+}
+
+/* rtnl; RTM_GETNEXTHOP dump handler: walk the netns nexthop tree and fill @skb */
+static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb);
+	struct net *net = sock_net(skb->sk);
+	struct rb_root *root = &net->nexthop.rb_root;
+	struct nh_dump_filter filter = {};
+	int err;
+
+	err = nh_valid_dump_req(cb->nlh, &filter, cb);
+	if (err < 0)
+		return err;
+
+	err = rtm_dump_walk_nexthops(skb, cb, root, ctx,
+				     &rtm_dump_nexthop_cb, &filter);
+	if (err < 0) {
+		if (likely(skb->len))	/* something was already written: return its length instead of the error */
+			err = skb->len;
+	}
+
+	cb->seq = net->nexthop.seq;
+	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+	return err;
+}
+
+static struct nexthop *
+nexthop_find_group_resilient(struct net *net, u32 id,
+			     struct netlink_ext_ack *extack)
+{	/* Look up @id and require a resilient group; returns it, ERR_PTR(-ENOENT) or ERR_PTR(-EINVAL) */
+	struct nh_group *nhg;
+	struct nexthop *nh;
+
+	nh = nexthop_find_by_id(net, id);
+	if (!nh)
+		return ERR_PTR(-ENOENT);	/* no extack message is set for this case */
+
+	if (!nh->is_group) {
+		NL_SET_ERR_MSG(extack, "Not a nexthop group");
+		return ERR_PTR(-EINVAL);
+	}
+
+	nhg = rtnl_dereference(nh->nh_grp);
+	if (!nhg->resilient) {
+		NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient");
+		return ERR_PTR(-EINVAL);
+	}
+
+	return nh;
+}
+
+/* Optional nexthop id criterion: *nh_id_p is 0 when @attr is absent. */
+static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p,
+			      struct netlink_ext_ack *extack)
+{
+	u32 nh_id = 0;
+
+	if (attr) {
+		nh_id = nla_get_u32(attr);
+		/* An explicit id of 0 is rejected; *nh_id_p stays untouched. */
+		if (nh_id == 0) {
+			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+			return -EINVAL;
+		}
+	}
+
+	*nh_id_p = nh_id;
+	return 0;
+}
+
+static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh,
+				    struct nh_dump_filter *filter,
+				    struct netlink_callback *cb)
+{	/* Validate a bucket dump request and fill @filter; returns 0 or a negative errno */
+	struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)];
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)];
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+			  ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1,
+			  rtm_nh_policy_dump_bucket, cb->extack);	/* extack passed so policy errors are reported */
+	if (err < 0)
+		return err;
+
+	err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack);	/* optional group id */
+	if (err)
+		return err;
+
+	if (tb[NHA_RES_BUCKET]) {
+		size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1;
+
+		err = nla_parse_nested(res_tb, max,
+				       tb[NHA_RES_BUCKET],
+				       rtm_nh_res_bucket_policy_dump,
+				       cb->extack);
+		if (err < 0)
+			return err;
+
+		err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID],
+					 &filter->res_bucket_nh_id,
+					 cb->extack);	/* optional nexthop id to match inside buckets */
+		if (err)
+			return err;
+	}
+
+	return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
+}
+
+struct rtm_dump_res_bucket_ctx {	/* bucket dump state kept in cb->ctx */
+	struct rtm_dump_nh_ctx nh;	/* position in the nexthop walk */
+	u16 bucket_index;		/* bucket to start from inside the current group */
+};
+
+static struct rtm_dump_res_bucket_ctx *
+rtm_dump_res_bucket_ctx(struct netlink_callback *cb)
+{	/* View cb->ctx as the bucket dump context */
+	struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx;
+
+	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));	/* the context must fit in cb->ctx */
+	return ctx;
+}
+
+struct rtm_dump_nexthop_bucket_data {	/* argument bundle for the bucket walk callback */
+	struct rtm_dump_res_bucket_ctx *ctx;
+	struct nh_dump_filter filter;
+};
+
+static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
+				      struct netlink_callback *cb,
+				      struct nexthop *nh,
+				      struct rtm_dump_nexthop_bucket_data *dd)
+{	/* Emit the buckets of resilient group @nh, starting at dd->ctx->bucket_index */
+	u32 portid = NETLINK_CB(cb->skb).portid;
+	struct nhmsg *nhm = nlmsg_data(cb->nlh);
+	struct nh_res_table *res_table;
+	struct nh_group *nhg;
+	u16 bucket_index;
+	int err;
+
+	nhg = rtnl_dereference(nh->nh_grp);
+	res_table = rtnl_dereference(nhg->res_table);
+	for (bucket_index = dd->ctx->bucket_index;
+	     bucket_index < res_table->num_nh_buckets;
+	     bucket_index++) {
+		struct nh_res_bucket *bucket;
+		struct nh_grp_entry *nhge;
+
+		bucket = &res_table->nh_buckets[bucket_index];
+		nhge = rtnl_dereference(bucket->nh_entry);
+		if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family))	/* the filter applies to the bucket's nexthop */
+			continue;
+
+		if (dd->filter.res_bucket_nh_id &&
+		    dd->filter.res_bucket_nh_id != nhge->nh->id)
+			continue;
+
+		dd->ctx->bucket_index = bucket_index;	/* saved before filling; presumably the resume point if the fill fails */
+		err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
+					 RTM_NEWNEXTHOPBUCKET, portid,
+					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
+					 cb->extack);
+		if (err)
+			return err;
+	}
+
+	dd->ctx->bucket_index = 0;	/* group finished: the next group starts at bucket 0 */
+
+	return 0;
+}
+
+/* Walk callback: dump the buckets of resilient groups, skip all else. */
+static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
+				      struct netlink_callback *cb,
+				      struct nexthop *nh, void *data)
+{
+	struct nh_group *grp;
+
+	if (nh->is_group) {
+		grp = rtnl_dereference(nh->nh_grp);
+		if (grp->resilient)
+			return rtm_dump_nexthop_bucket_nh(skb, cb, nh, data);
+	}
+
+	/* Nothing to emit for this nexthop; let the walk continue. */
+	return 0;
+}
+
+/* rtnl; RTM_GETNEXTHOPBUCKET dump handler: buckets of one resilient group (NHA_ID given) or of all of them */
+static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb);
+	struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx };
+	struct net *net = sock_net(skb->sk);
+	struct nexthop *nh;
+	int err;
+
+	err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb);
+	if (err)
+		return err;
+
+	if (dd.filter.nh_id) {	/* a specific group was requested */
+		nh = nexthop_find_group_resilient(net, dd.filter.nh_id,
+						  cb->extack);
+		if (IS_ERR(nh))
+			return PTR_ERR(nh);
+		err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd);
+	} else {
+		struct rb_root *root = &net->nexthop.rb_root;
+
+		err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh,
+					     &rtm_dump_nexthop_bucket_cb, &dd);
+	}
+
+	if (err < 0) {
+		if (likely(skb->len))	/* something was already written: return its length instead of the error */
+			err = skb->len;
+	}
+
+	cb->seq = net->nexthop.seq;
+	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+	return err;
+}
+
+static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res,
+					      u16 *bucket_index,
+					      struct netlink_ext_ack *extack)
+{	/* Parse the nested NHA_RES_BUCKET attribute and return its mandatory bucket index */
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)];
+	int err;
+
+	err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1,
+			       res, rtm_nh_res_bucket_policy_get, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[NHA_RES_BUCKET_INDEX]) {
+		NL_SET_ERR_MSG(extack, "Bucket index is missing");
+		return -EINVAL;
+	}
+
+	*bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]);	/* the range is checked by the caller against the table */
+	return 0;
+}
+
+static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
+				   u32 *id, u16 *bucket_index,
+				   struct netlink_ext_ack *extack)
+{	/* Validate a bucket get request and extract the group id and bucket index */
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)];
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+			  ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1,
+			  rtm_nh_policy_get_bucket, extack);
+	if (err < 0)
+		return err;
+
+	err = __nh_valid_get_del_req(nlh, tb, id, extack);	/* same header and NHA_ID rules as get/del */
+	if (err)
+		return err;
+
+	if (!tb[NHA_RES_BUCKET]) {
+		NL_SET_ERR_MSG(extack, "Bucket information is missing");
+		return -EINVAL;
+	}
+
+	err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET],
+						 bucket_index, extack);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+/* rtnl; RTM_GETNEXTHOPBUCKET handler: unicast one bucket of a resilient group back to the sender */
+static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+				  struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct nh_res_table *res_table;
+	struct sk_buff *skb = NULL;
+	struct nh_group *nhg;
+	struct nexthop *nh;
+	u16 bucket_index;
+	int err;
+	u32 id;
+
+	err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack);
+	if (err)
+		return err;
+
+	nh = nexthop_find_group_resilient(net, id, extack);
+	if (IS_ERR(nh))
+		return PTR_ERR(nh);
+
+	nhg = rtnl_dereference(nh->nh_grp);
+	res_table = rtnl_dereference(nhg->res_table);
+	if (bucket_index >= res_table->num_nh_buckets) {	/* bounds check before indexing nh_buckets[] */
+		NL_SET_ERR_MSG(extack, "Bucket index out of bounds");
+		return -ENOENT;
+	}
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index],
+				 bucket_index, RTM_NEWNEXTHOPBUCKET,
+				 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
+				 0, extack);
+	if (err < 0) {
+		WARN_ON(err == -EMSGSIZE);	/* a single bucket is expected to fit in the reply */
+		goto errout_free;
+	}
+
+	return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);	/* skb is not freed here after this call */
+
+errout_free:
+	kfree_skb(skb);
+	return err;
+}
+
+/* Call fib_nhc_update_mtu() for each IPv4 nexthop hashed under @dev. */
+static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+	struct net *net = dev_net(dev);
+	struct hlist_head *bucket;
+	struct hlist_node *tmp;
+	struct nh_info *info;
+
+	bucket = &net->nexthop.devhash[nh_dev_hashfn(dev->ifindex)];
+	hlist_for_each_entry_safe(info, tmp, bucket, dev_hash) {
+		if (info->fib_nhc.nhc_dev != dev || info->family != AF_INET)
+			continue;
+
+		fib_nhc_update_mtu(&info->fib_nhc, dev->mtu, orig_mtu);
+	}
+}
+
+/* rtnl; netdevice notifier: flush nexthops when a device goes away or loses carrier, resync on MTU change */
+static int nh_netdev_event(struct notifier_block *this,
+			   unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_notifier_info_ext *info_ext;
+
+	switch (event) {
+	case NETDEV_DOWN:
+	case NETDEV_UNREGISTER:
+		nexthop_flush_dev(dev, event);
+		break;
+	case NETDEV_CHANGE:
+		if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))	/* flush only when both flags are clear */
+			nexthop_flush_dev(dev, event);
+		break;
+	case NETDEV_CHANGEMTU:
+		info_ext = ptr;	/* for this event @ptr is used as the extended info carrying ext.mtu */
+		nexthop_sync_mtu(dev, info_ext->ext.mtu);
+		rt_cache_flush(dev_net(dev));
+		break;
+	}
+	return NOTIFY_DONE;	/* returned for every event, handled or not */
+}
+
+static struct notifier_block nh_netdev_notifier = {
+	.notifier_call = nh_netdev_event,
+};
+
+static int nexthops_dump(struct net *net, struct notifier_block *nb,
+			 enum nexthop_event_type event_type,
+			 struct netlink_ext_ack *extack)
+{
+	struct rb_root *root = &net->nexthop.rb_root;
+	struct rb_node *node;
+	int err = 0;
+
+	for (node = rb_first(root); node; node = rb_next(node)) {
+		struct nexthop *nh;
+
+		nh = rb_entry(node, struct nexthop, rb_node);
+		err = call_nexthop_notifier(nb, net, event_type, nh, extack);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
+			      struct netlink_ext_ack *extack)
+{	/* Replay existing nexthops to @nb as REPLACE events, then add @nb to the netns chain; takes RTNL itself */
+	int err;
+
+	rtnl_lock();
+	err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack);
+	if (err)	/* NOTE(review): nexthops already replayed are not rolled back on this path */
+		goto unlock;
+	err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
+					       nb);
+unlock:
+	rtnl_unlock();
+	return err;
+}
+EXPORT_SYMBOL(register_nexthop_notifier);
+
+int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
+{	/* Remove @nb from the netns chain, then send it a DEL event per existing nexthop; takes RTNL itself */
+	int err;
+
+	rtnl_lock();
+	err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
+						 nb);
+	if (err)
+		goto unlock;
+	nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL);	/* result ignored, no extack */
+unlock:
+	rtnl_unlock();
+	return err;
+}
+EXPORT_SYMBOL(unregister_nexthop_notifier);
+
+void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap)
+{	/* Set RTNH_F_OFFLOAD/RTNH_F_TRAP on nexthop @id per @offload/@trap; unknown ids are ignored */
+	struct nexthop *nexthop;
+
+	rcu_read_lock();
+
+	nexthop = nexthop_find_by_id(net, id);
+	if (!nexthop)
+		goto out;
+
+	nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);	/* clear both first, then set as requested */
+	if (offload)
+		nexthop->nh_flags |= RTNH_F_OFFLOAD;
+	if (trap)
+		nexthop->nh_flags |= RTNH_F_TRAP;
+
+out:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(nexthop_set_hw_flags);
+
+/* Set RTNH_F_OFFLOAD/RTNH_F_TRAP on one bucket of resilient group @id; no-op if the group or bucket is absent. */
+void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
+				 bool offload, bool trap)
+{
+	struct nh_res_table *res_table;
+	struct nh_res_bucket *bucket;
+	struct nexthop *nexthop;
+	struct nh_group *nhg;
+
+	rcu_read_lock();
+	nexthop = nexthop_find_by_id(net, id);
+	if (!nexthop || !nexthop->is_group)
+		goto out;
+
+	nhg = rcu_dereference(nexthop->nh_grp);
+	if (!nhg->resilient)
+		goto out;
+
+	res_table = rcu_dereference(nhg->res_table);	/* fetch once, before the bounds check uses it */
+	if (bucket_index >= res_table->num_nh_buckets)
+		goto out;
+
+	bucket = &res_table->nh_buckets[bucket_index];
+	bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);	/* clear both first, then set as requested */
+	if (offload)
+		bucket->nh_flags |= RTNH_F_OFFLOAD;
+	if (trap)
+		bucket->nh_flags |= RTNH_F_TRAP;
+
+out:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(nexthop_bucket_set_hw_flags);
+
+void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
+				     unsigned long *activity)
+{	/* Mark as busy every bucket of resilient group @id whose bit is set in the @activity bitmap */
+	struct nh_res_table *res_table;
+	struct nexthop *nexthop;
+	struct nh_group *nhg;
+	u16 i;
+
+	rcu_read_lock();
+
+	nexthop = nexthop_find_by_id(net, id);
+	if (!nexthop || !nexthop->is_group)
+		goto out;
+
+	nhg = rcu_dereference(nexthop->nh_grp);
+	if (!nhg->resilient)
+		goto out;
+
+	/* Instead of silently ignoring some buckets, demand that the sizes
+	 * be the same.
+	 */
+	res_table = rcu_dereference(nhg->res_table);
+	if (num_buckets != res_table->num_nh_buckets)
+		goto out;
+
+	for (i = 0; i < num_buckets; i++) {
+		if (test_bit(i, activity))	/* bit i corresponds to bucket i */
+			nh_res_bucket_set_busy(&res_table->nh_buckets[i]);
+	}
+
+out:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(nexthop_res_grp_activity_update);
+
+static void __net_exit nexthop_net_exit_batch(struct list_head *net_list)
+{	/* Per-netns teardown for a batch: flush all nexthops and free the device hash, under one RTNL hold */
+	struct net *net;
+
+	rtnl_lock();
+	list_for_each_entry(net, net_list, exit_list) {
+		flush_all_nexthops(net);
+		kfree(net->nexthop.devhash);	/* allocated in nexthop_net_init() */
+	}
+	rtnl_unlock();
+}
+
+static int __net_init nexthop_net_init(struct net *net)
+{	/* Per-netns setup: empty nexthop tree, zeroed device hash, notifier chain head; -ENOMEM on allocation failure */
+	size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
+
+	net->nexthop.rb_root = RB_ROOT;
+	net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
+	if (!net->nexthop.devhash)
+		return -ENOMEM;
+	BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
+
+	return 0;
+}
+
+static struct pernet_operations nexthop_net_ops = {
+	.init = nexthop_net_init,
+	.exit_batch = nexthop_net_exit_batch,
+};
+
+static int __init nexthop_init(void)
+{	/* Boot-time setup: pernet ops, netdevice notifier and the rtnetlink nexthop handlers */
+	register_pernet_subsys(&nexthop_net_ops);	/* NOTE(review): return value is not checked */
+
+	register_netdevice_notifier(&nh_netdev_notifier);	/* NOTE(review): return value is not checked */
+
+	rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+	rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
+	rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
+		      rtm_dump_nexthop, 0);
+
+	rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+	rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);	/* dump only for this family */
+
+	rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+	rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);	/* dump only for this family */
+
+	rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket,
+		      rtm_dump_nexthop_bucket, 0);
+
+	return 0;
+}
+subsys_initcall(nexthop_init);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
new file mode 100644
index 0000000000..4cb0c896ca
--- /dev/null
+++ b/net/ipv4/ping.c
@@ -0,0 +1,1196 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		"Ping" sockets
+ *
+ * Based on ipv4/udp.c code.
+ *
+ * Authors:	Vasiliy Kulikov / Openwall (for Linux 2.6),
+ *		Pavel Kankovsky (for Linux 2.4.32)
+ *
+ * Pavel gave all rights to bugs to Vasiliy,
+ * none of the bugs are Pavel's now.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/export.h>
+#include <linux/bpf-cgroup.h>
+#include <net/sock.h>
+#include <net/ping.h>
+#include <net/udp.h>
+#include <net/route.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/transp_v6.h>
+#endif
+
+struct ping_table {
+	struct hlist_head	hash[PING_HTABLE_SIZE];	/* chains picked by ping_hashslot() */
+	spinlock_t		lock;	/* held while chains are modified or walked for /proc */
+};
+
+static struct ping_table ping_table;	/* one table for every netns and both families */
+struct pingv6_ops pingv6_ops;	/* IPv6 callbacks used below; presumably filled in by the IPv6 side */
+EXPORT_SYMBOL_GPL(pingv6_ops);
+
+static u16 ping_port_rover;	/* last ident ping_get_port() picked for an ident == 0 request */
+
+static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
+{
+	u32 bucket = (net_hash_mix(net) + num) & mask;	/* per-netns salt + ident */
+
+	pr_debug("hash(%u) = %u\n", num, bucket);
+	return bucket;
+}
+EXPORT_SYMBOL_GPL(ping_hash);
+
+static inline struct hlist_head *ping_hashslot(struct ping_table *table,
+					       struct net *net, unsigned int num)
+{
+	return table->hash + ping_hashfn(net, num, PING_HTABLE_MASK);	/* chain head for (net, num) */
+}
+
+int ping_get_port(struct sock *sk, unsigned short ident)
+{
+	struct inet_sock *isk = inet_sk(sk), *isk2;
+	struct net *net = sock_net(sk);
+	struct hlist_head *hlist;
+	struct sock *sk2 = NULL;
+
+	/* Claim @ident, or any free non-zero ident if 0; -EADDRINUSE on clash. */
+	spin_lock(&ping_table.lock);
+	if (ident == 0) {
+		u32 i;
+		u16 result = ping_port_rover + 1;
+
+		for (i = 0; i < (1L << 16); i++, result++) {
+			if (!result)
+				result++; /* avoid zero */
+			hlist = ping_hashslot(&ping_table, net, result);
+			sk_for_each(sk2, hlist) {
+				isk2 = inet_sk(sk2);
+				if (net_eq(sock_net(sk2), net) &&
+				    isk2->inet_num == result)
+					goto next_port;
+			}
+
+			/* found: unused in this netns (buckets are shared) */
+			ping_port_rover = ident = result;
+			break;
+next_port:
+			;
+		}
+		if (i >= (1L << 16))
+			goto fail;
+	} else {
+		hlist = ping_hashslot(&ping_table, net, ident);
+		sk_for_each(sk2, hlist) {
+			isk2 = inet_sk(sk2);
+			/* BUG? Why is this reuse and not reuseaddr? ping.c
+			 * doesn't turn off SO_REUSEADDR, and it doesn't expect
+			 * that other ping processes can steal its packets.
+			 */
+			if ((isk2->inet_num == ident) &&
+			    net_eq(sock_net(sk2), net) &&
+			    (sk2 != sk) &&
+			    (!sk2->sk_reuse || !sk->sk_reuse))
+				goto fail;
+		}
+	}
+
+	pr_debug("found port/ident = %d\n", ident);
+	isk->inet_num = ident;
+	if (sk_unhashed(sk)) {
+		pr_debug("was not hashed\n");
+		sk_add_node_rcu(sk, hlist);
+		sock_set_flag(sk, SOCK_RCU_FREE);
+		sock_prot_inuse_add(net, sk->sk_prot, 1);
+	}
+	spin_unlock(&ping_table.lock);
+	return 0;
+
+fail:
+	spin_unlock(&ping_table.lock);
+	return -EADDRINUSE;
+}
+EXPORT_SYMBOL_GPL(ping_get_port);
+
+int ping_hash(struct sock *sk)
+{
+	/* Sockets are hashed by ping_get_port(); this hook must never run. */
+	pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
+	BUG(); /* "Please do not press this button again." */
+	return 0;
+}
+
+void ping_unhash(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", inet, inet->inet_num);
+	spin_lock(&ping_table.lock);
+	if (sk_del_node_init_rcu(sk)) {	/* state is reset only if removal reports success */
+		inet->inet_num = 0;
+		inet->inet_sport = 0;
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	}
+	spin_unlock(&ping_table.lock);
+}
+EXPORT_SYMBOL_GPL(ping_unhash);
+
+/* Return the @net socket bound to @ident whose family, bound address and
+ * bound device accept @skb, or NULL.  Called under rcu_read_lock()
+ */
+static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
+{
+	struct hlist_head *hslot = ping_hashslot(&ping_table, net, ident);
+	struct sock *sk = NULL;
+	struct inet_sock *isk;
+	int dif, sdif;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		dif = inet_iif(skb);
+		sdif = inet_sdif(skb);
+		pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
+			 (int)ident, &ip_hdr(skb)->daddr, dif);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		dif = inet6_iif(skb);
+		sdif = inet6_sdif(skb);
+		pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n",
+			 (int)ident, &ipv6_hdr(skb)->daddr, dif);
+#endif
+	} else {
+		return NULL;
+	}
+
+	sk_for_each_rcu(sk, hslot) {
+		isk = inet_sk(sk);
+
+		pr_debug("iterate\n");
+		/* ping_table is global: skip same-ident sockets of other netns */
+		if (isk->inet_num != ident || !net_eq(sock_net(sk), net))
+			continue;
+
+		if (skb->protocol == htons(ETH_P_IP) &&
+		    sk->sk_family == AF_INET) {
+			pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk,
+				 (int) isk->inet_num, &isk->inet_rcv_saddr,
+				 sk->sk_bound_dev_if);
+
+			if (isk->inet_rcv_saddr &&
+			    isk->inet_rcv_saddr != ip_hdr(skb)->daddr)
+				continue;
+#if IS_ENABLED(CONFIG_IPV6)
+		} else if (skb->protocol == htons(ETH_P_IPV6) &&
+			   sk->sk_family == AF_INET6) {
+
+			pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk,
+				 (int) isk->inet_num,
+				 &sk->sk_v6_rcv_saddr,
+				 sk->sk_bound_dev_if);
+
+			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
+			    !ipv6_addr_equal(&sk->sk_v6_rcv_saddr,
+					     &ipv6_hdr(skb)->daddr))
+				continue;
+#endif
+		} else {
+			continue;
+		}
+
+		if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
+		    sk->sk_bound_dev_if != sdif)
+			continue;
+
+		return sk;
+	}
+
+	return NULL;
+}
+
+static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
+					  kgid_t *high)
+{
+	kgid_t *bounds = net->ipv4.ping_group_range.range;
+	unsigned int snap;
+
+	do {	/* retry until both bounds come from one consistent snapshot */
+		snap = read_seqbegin(&net->ipv4.ping_group_range.lock);
+
+		*low = bounds[0];
+		*high = bounds[1];
+	} while (read_seqretry(&net->ipv4.ping_group_range.lock, snap));
+}
+
+
+int ping_init_sock(struct sock *sk)
+{
+	struct group_info *groups;
+	kgid_t egid = current_egid();
+	kgid_t low, high;
+	int ret = -EACCES;
+	int idx;
+
+	if (sk->sk_family == AF_INET6)
+		sk->sk_ipv6only = 1;
+
+	/* Allowed when the effective gid lies inside [low, high] ... */
+	inet_get_ping_group_range_net(sock_net(sk), &low, &high);
+	if (gid_lte(low, egid) && gid_lte(egid, high))
+		return 0;
+
+	/* ... or when any supplementary group of the caller does. */
+	groups = get_current_groups();
+	for (idx = 0; idx < groups->ngroups; idx++) {
+		kgid_t gid = groups->gid[idx];
+
+		if (gid_lte(low, gid) && gid_lte(gid, high)) {
+			ret = 0;
+			break;
+		}
+	}
+
+	put_group_info(groups);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ping_init_sock);
+
+void ping_close(struct sock *sk, long timeout)
+{
+	struct inet_sock *isk = inet_sk(sk);
+
+	pr_debug("ping_close(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
+	pr_debug("isk->refcnt = %d\n", refcount_read(&sk->sk_refcnt));
+	sk_common_release(sk);	/* @timeout is not used */
+}
+EXPORT_SYMBOL_GPL(ping_close);
+
+static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+			    int addr_len)
+{
+	/* Same length check as __ip4_datagram_connect(): it keeps the BPF
+	 * program run below from reading bytes beyond the addr_len supplied
+	 * by the user.
+	 */
+	if (addr_len >= sizeof(struct sockaddr_in))
+		return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
+
+	return -EINVAL;
+}
+
+/* Checks the bind address; returns 0 or -errno. May set sk->sk_bound_dev_if. */
+static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
+				struct sockaddr *uaddr, int addr_len)
+{
+	struct net *net = sock_net(sk);
+	if (sk->sk_family == AF_INET) {
+		struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+		u32 tb_id = RT_TABLE_LOCAL;
+		int chk_addr_ret;
+
+		if (addr_len < sizeof(*addr))
+			return -EINVAL;
+		/* AF_UNSPEC is tolerated only together with INADDR_ANY. */
+		if (addr->sin_family != AF_INET &&
+		    !(addr->sin_family == AF_UNSPEC &&
+		      addr->sin_addr.s_addr == htonl(INADDR_ANY)))
+			return -EAFNOSUPPORT;
+
+		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
+			 sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
+		/* The wildcard address needs no address-type lookup. */
+		if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
+			return 0;
+		/* Use the l3mdev table of the bound device, else RT_TABLE_LOCAL. */
+		tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
+		chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
+		/* Never multicast/broadcast; non-local only if nonlocal bind is allowed. */
+		if (chk_addr_ret == RTN_MULTICAST ||
+		    chk_addr_ret == RTN_BROADCAST ||
+		    (chk_addr_ret != RTN_LOCAL &&
+		     !inet_can_nonlocal_bind(net, isk)))
+			return -EADDRNOTAVAIL;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (sk->sk_family == AF_INET6) {
+		struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr;
+		int addr_type, scoped, has_addr;
+		struct net_device *dev = NULL;
+
+		if (addr_len < sizeof(*addr))
+			return -EINVAL;
+
+		if (addr->sin6_family != AF_INET6)
+			return -EAFNOSUPPORT;
+
+		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n",
+			 sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port));
+		/* Only the any-address or unicast; scoped ones need a scope id. */
+		addr_type = ipv6_addr_type(&addr->sin6_addr);
+		scoped = __ipv6_addr_needs_scope_id(addr_type);
+		if ((addr_type != IPV6_ADDR_ANY &&
+		     !(addr_type & IPV6_ADDR_UNICAST)) ||
+		    (scoped && !addr->sin6_scope_id))
+			return -EINVAL;
+		/* dev: the scope-id device if given, else the bound device, else NULL. */
+		rcu_read_lock();
+		if (addr->sin6_scope_id) {
+			dev = dev_get_by_index_rcu(net, addr->sin6_scope_id);
+			if (!dev) {
+				rcu_read_unlock();
+				return -ENODEV;
+			}
+		}
+
+		if (!dev && sk->sk_bound_dev_if) {
+			dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+			if (!dev) {
+				rcu_read_unlock();
+				return -ENODEV;
+			}
+		}
+		has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev,
+						    scoped);
+		rcu_read_unlock();
+
+		if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr ||
+		      addr_type == IPV6_ADDR_ANY))
+			return -EADDRNOTAVAIL;
+		/* A scoped address pins the socket to the scope-id device. */
+		if (scoped)
+			sk->sk_bound_dev_if = addr->sin6_scope_id;
+#endif
+	} else {
+		return -EAFNOSUPPORT;
+	}
+	return 0;
+}
+
+static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
+{
+	/* Store the bound address as both receive filter and source address. */
+	if (saddr->sa_family == AF_INET) {
+		const struct sockaddr_in *sin = (struct sockaddr_in *)saddr;
+		inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr =
+			sin->sin_addr.s_addr;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (saddr->sa_family == AF_INET6) {
+		const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)saddr;
+		sk->sk_v6_rcv_saddr = inet6_sk(sk)->saddr = sin6->sin6_addr;
+#endif
+	}
+}
+
+/*
+ * We need our own bind because there are no privileged id's == local ports.
+ * Moreover, we don't allow binding to multi- and broadcast addresses.
+ */
+
+int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct inet_sock *isk = inet_sk(sk);
+	unsigned short snum;
+	int err;
+	int dif = sk->sk_bound_dev_if;	/* saved so a failed ident claim can undo the override */
+
+	err = ping_check_bind_addr(sk, isk, uaddr, addr_len);
+	if (err)
+		return err;
+
+	lock_sock(sk);
+	/* NOTE(review): this -EINVAL path does not restore sk_bound_dev_if. */
+	err = -EINVAL;
+	if (isk->inet_num != 0)	/* the socket already owns an ident */
+		goto out;
+
+	err = -EADDRINUSE;
+	snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port);
+	if (ping_get_port(sk, snum) != 0) {
+		/* Restore possibly modified sk->sk_bound_dev_if by ping_check_bind_addr(). */
+		sk->sk_bound_dev_if = dif;
+		goto out;
+	}
+	ping_set_saddr(sk, uaddr);
+
+	pr_debug("after bind(): num = %hu, dif = %d\n",
+		 isk->inet_num,
+		 sk->sk_bound_dev_if);
+	/* Record which parts of the binding the user chose explicitly. */
+	err = 0;
+	if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
+		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#endif
+
+	if (snum)
+		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+	isk->inet_sport = htons(isk->inet_num);	/* the ident doubles as source port */
+	isk->inet_daddr = 0;
+	isk->inet_dport = 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr));
+#endif
+
+	sk_dst_reset(sk);
+out:
+	release_sock(sk);
+	pr_debug("ping_v4_bind -> %d\n", err);
+	return err;
+}
+EXPORT_SYMBOL_GPL(ping_bind);
+
+/*
+ * Is this a supported type of ICMP message?
+ */
+
+static inline int ping_supported(int family, int type, int code)
+{
+	if (family == AF_INET)	/* echo and extended echo requests, code 0 only */
+		return !code && (type == ICMP_ECHO || type == ICMP_EXT_ECHO);
+	return family == AF_INET6 && !code &&
+	       (type == ICMPV6_ECHO_REQUEST || type == ICMPV6_EXT_ECHO_REQUEST);
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.
+ */
+
+void ping_err(struct sk_buff *skb, int offset, u32 info)
+{
+	int family;
+	struct icmphdr *icmph;	/* header located at skb->data + offset */
+	struct inet_sock *inet_sock;
+	int type;	/* type and code of the ICMP error itself */
+	int code;
+	struct net *net = dev_net(skb->dev);
+	struct sock *sk;
+	int harderr;
+	int err;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		family = AF_INET;
+		type = icmp_hdr(skb)->type;
+		code = icmp_hdr(skb)->code;
+		icmph = (struct icmphdr *)(skb->data + offset);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		family = AF_INET6;
+		type = icmp6_hdr(skb)->icmp6_type;
+		code = icmp6_hdr(skb)->icmp6_code;
+		icmph = (struct icmphdr *) (skb->data + offset);
+	} else {
+		BUG();
+	}
+
+	/* We assume the packet has already been checked by icmp_unreach */
+	/* Only act when the header at @offset is a supported echo request. */
+	if (!ping_supported(family, icmph->type, icmph->code))
+		return;
+
+	pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n",
+		 skb->protocol, type, code, ntohs(icmph->un.echo.id),
+		 ntohs(icmph->un.echo.sequence));
+
+	sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
+	if (!sk) {
+		pr_debug("no socket, dropping\n");
+		return;	/* No socket for error */
+	}
+	pr_debug("err on socket %p\n", sk);
+
+	err = 0;
+	harderr = 0;
+	inet_sock = inet_sk(sk);
+	/* Translate the ICMP error into an errno and a "hard error" flag. */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		switch (type) {
+		default:
+		case ICMP_TIME_EXCEEDED:
+			err = EHOSTUNREACH;
+			break;
+		case ICMP_SOURCE_QUENCH:
+			/* This is not a real error but ping wants to see it.
+			 * Report it with some fake errno.
+			 */
+			err = EREMOTEIO;
+			break;
+		case ICMP_PARAMETERPROB:
+			err = EPROTO;
+			harderr = 1;
+			break;
+		case ICMP_DEST_UNREACH:
+			if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+				ipv4_sk_update_pmtu(skb, sk, info);
+				if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+					err = EMSGSIZE;
+					harderr = 1;
+					break;
+				}
+				goto out;
+			}
+			err = EHOSTUNREACH;
+			if (code <= NR_ICMP_UNREACH) {
+				harderr = icmp_err_convert[code].fatal;
+				err = icmp_err_convert[code].errno;
+			}
+			break;
+		case ICMP_REDIRECT:
+			/* See ICMP_SOURCE_QUENCH */
+			ipv4_sk_redirect(skb, sk);
+			err = EREMOTEIO;
+			break;
+		}
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		harderr = pingv6_ops.icmpv6_err_convert(type, code, &err);
+#endif
+	}
+
+	/*
+	 *      RFC1122: OK.  Passes ICMP errors back to application, as per
+	 *	4.1.3.3.
+	 */
+	if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) ||
+	    (family == AF_INET6 && !inet6_sk(sk)->recverr)) {
+		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+			goto out;	/* soft error or unconnected socket: report nothing */
+	} else {
+		if (family == AF_INET) {
+			ip_icmp_error(sk, skb, err, 0 /* no remote port */,
+				      info, (u8 *)icmph);
+#if IS_ENABLED(CONFIG_IPV6)
+		} else if (family == AF_INET6) {
+			pingv6_ops.ipv6_icmp_error(sk, skb, err, 0,
+						   info, (u8 *)icmph);
+#endif
+		}
+	}
+	sk->sk_err = err;
+	sk_error_report(sk);
+out:
+	return;
+}
+EXPORT_SYMBOL_GPL(ping_err);
+
+/*
+ *	Copy and checksum an ICMP Echo packet from user space into a buffer
+ *	starting from the payload.
+ */
+
+int ping_getfrag(void *from, char *to,
+		 int offset, int fraglen, int odd, struct sk_buff *skb)
+{
+	struct pingfakehdr *hdr = from;
+	bool copied;
+
+	copied = csum_and_copy_from_iter_full(to, fraglen, &hdr->wcheck,
+					      &hdr->msg->msg_iter);
+	if (!copied)
+		return -EFAULT;
+#if IS_ENABLED(CONFIG_IPV6)
+	/* IPv6: fold the running sum into each skb as it is filled, which is
+	 * what icmpv6_push_pending_frames expects. IPv4: keep accumulating in
+	 * wcheck; ping_v4_push_pending_frames finalizes it.
+	 */
+	if (hdr->family == AF_INET6) {
+		skb->csum = csum_block_add(skb->csum, hdr->wcheck, odd);
+		skb->ip_summed = CHECKSUM_NONE;
+		hdr->wcheck = 0;
+	}
+#endif
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ping_getfrag);
+
+static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
+				       struct flowi4 *fl4)
+{
+	struct sk_buff *head = skb_peek(&sk->sk_write_queue);
+
+	if (head == NULL)
+		return 0;	/* nothing is queued */
+	/* Add the header to the payload sum kept in wcheck, then fold it. */
+	pfh->wcheck = csum_partial((char *)&pfh->icmph, sizeof(struct icmphdr), pfh->wcheck);
+	pfh->icmph.checksum = csum_fold(pfh->wcheck);
+	memcpy(icmp_hdr(head), &pfh->icmph, sizeof(struct icmphdr));
+	head->ip_summed = CHECKSUM_NONE;
+	return ip_push_pending_frames(sk, fl4);
+}
+
+int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
+			void *user_icmph, size_t icmph_len)
+{
+	u8 type, code;
+
+	/* Size limits first: at most 0xFFFF bytes, at least a whole header. */
+	if (len > 0xFFFF)
+		return -EMSGSIZE;
+	if (len < icmph_len)
+		return -EINVAL;
+
+	/* MSG_OOB is refused, mirroring the BSD error for compatibility. */
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	/* Pull the user's ICMP header out of the message.  The iovec is
+	 * modified by this: the header is consumed from @msg.
+	 */
+	if (memcpy_from_msg(user_icmph, msg, icmph_len) != 0)
+		return -EFAULT;
+
+	switch (family) {
+	case AF_INET: {
+		const struct icmphdr *hdr4 = user_icmph;
+
+		type = hdr4->type;
+		code = hdr4->code;
+		break;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6: {
+		const struct icmp6hdr *hdr6 = user_icmph;
+
+		type = hdr6->icmp6_type;
+		code = hdr6->icmp6_code;
+		break;
+	}
+#endif
+	default:
+		BUG();
+	}
+
+	return ping_supported(family, type, code) ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ping_common_sendmsg);
+
+static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+	struct net *net = sock_net(sk);
+	struct flowi4 fl4;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipcm_cookie ipc;
+	struct icmphdr user_icmph;
+	struct pingfakehdr pfh;
+	struct rtable *rt = NULL;
+	struct ip_options_data opt_copy;
+	int free = 0;	/* set when ipc.opt was supplied by ip_cmsg_send() and gets kfree()d here */
+	__be32 saddr, daddr, faddr;
+	u8 tos, scope;
+	int err;
+
+	pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
+	/* Validates sizes/flags and consumes the ICMP header from @msg. */
+	err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph,
+				  sizeof(user_icmph));
+	if (err)
+		return err;
+
+	/*
+	 *	Get and verify the address.
+	 */
+
+	if (msg->msg_name) {
+		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+		if (msg->msg_namelen < sizeof(*usin))
+			return -EINVAL;
+		if (usin->sin_family != AF_INET)
+			return -EAFNOSUPPORT;
+		daddr = usin->sin_addr.s_addr;
+		/* no remote port */
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EDESTADDRREQ;
+		daddr = inet->inet_daddr;
+		/* no remote port */
+	}
+
+	ipcm_init_sk(&ipc, inet);
+
+	if (msg->msg_controllen) {
+		err = ip_cmsg_send(sk, msg, &ipc, false);
+		if (unlikely(err)) {
+			kfree(ipc.opt);
+			return err;
+		}
+		if (ipc.opt)
+			free = 1;
+	}
+	if (!ipc.opt) {	/* no options from cmsg: use a private copy of the socket's */
+		struct ip_options_rcu *inet_opt;
+
+		rcu_read_lock();
+		inet_opt = rcu_dereference(inet->inet_opt);
+		if (inet_opt) {
+			memcpy(&opt_copy, inet_opt,
+			       sizeof(*inet_opt) + inet_opt->opt.optlen);
+			ipc.opt = &opt_copy.opt;
+		}
+		rcu_read_unlock();
+	}
+
+	saddr = ipc.addr;
+	ipc.addr = faddr = daddr;
+	/* With source routing the flow is routed to the option's faddr. */
+	if (ipc.opt && ipc.opt->opt.srr) {
+		if (!daddr) {
+			err = -EINVAL;
+			goto out_free;
+		}
+		faddr = ipc.opt->opt.faddr;
+	}
+	tos = get_rttos(&ipc, inet);
+	scope = ip_sendmsg_scope(inet, &ipc, msg);
+
+	if (ipv4_is_multicast(daddr)) {
+		if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
+			ipc.oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+	} else if (!ipc.oif)
+		ipc.oif = inet->uc_index;
+
+	flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope,
+			   sk->sk_protocol, inet_sk_flowi_flags(sk), faddr,
+			   saddr, 0, 0, sk->sk_uid);
+
+	fl4.fl4_icmp_type = user_icmph.type;
+	fl4.fl4_icmp_code = user_icmph.code;
+
+	security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4));
+	rt = ip_route_output_flow(net, &fl4, sk);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		rt = NULL;
+		if (err == -ENETUNREACH)
+			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+		goto out;
+	}
+
+	err = -EACCES;
+	if ((rt->rt_flags & RTCF_BROADCAST) &&
+	    !sock_flag(sk, SOCK_BROADCAST))
+		goto out;
+
+	if (msg->msg_flags & MSG_CONFIRM)
+		goto do_confirm;
+back_from_confirm:
+
+	if (!ipc.addr)
+		ipc.addr = fl4.daddr;
+
+	lock_sock(sk);
+	/* Header template; ping_getfrag() copies and checksums the payload. */
+	pfh.icmph.type = user_icmph.type; /* already checked */
+	pfh.icmph.code = user_icmph.code; /* ditto */
+	pfh.icmph.checksum = 0;
+	pfh.icmph.un.echo.id = inet->inet_sport;	/* id is taken from the socket, not from the user's header */
+	pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
+	pfh.msg = msg;
+	pfh.wcheck = 0;
+	pfh.family = AF_INET;
+
+	err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len,
+			     sizeof(struct icmphdr), &ipc, &rt,
+			     msg->msg_flags);
+	if (err)
+		ip_flush_pending_frames(sk);
+	else
+		err = ping_v4_push_pending_frames(sk, &pfh, &fl4);
+	release_sock(sk);
+
+out:
+	ip_rt_put(rt);
+out_free:
+	if (free)
+		kfree(ipc.opt);
+	if (!err) {
+		icmp_out_count(sock_net(sk), user_icmph.type);
+		return len;
+	}
+	return err;
+
+do_confirm:
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(&rt->dst, &fl4.daddr);
+	if (!(msg->msg_flags & MSG_PROBE) || len)
+		goto back_from_confirm;
+	err = 0;
+	goto out;
+}
+
+int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+		 int *addr_len)
+{
+	struct inet_sock *isk = inet_sk(sk);
+	int family = sk->sk_family;
+	struct sk_buff *skb;
+	int copied, err;
+
+	pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
+
+	err = -EOPNOTSUPP;
+	if (flags & MSG_OOB)
+		goto out;
+
+	if (flags & MSG_ERRQUEUE)
+		return inet_recv_error(sk, msg, len, addr_len);
+
+	skb = skb_recv_datagram(sk, flags, &err);
+	if (!skb)
+		goto out;
+	/* Copy at most @len bytes; a longer datagram is flagged MSG_TRUNC. */
+	copied = skb->len;
+	if (copied > len) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+
+	/* Don't bother checking the checksum */
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
+	if (err)
+		goto done;
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	/* Copy the address and add cmsg data. */
+	if (family == AF_INET) {
+		DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+
+		if (sin) {
+			sin->sin_family = AF_INET;
+			sin->sin_port = 0 /* skb->h.uh->source */;
+			sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+			*addr_len = sizeof(*sin);
+		}
+
+		if (inet_cmsg_flags(isk))
+			ip_cmsg_recv(msg, skb);
+
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (family == AF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+		struct ipv6hdr *ip6 = ipv6_hdr(skb);
+		DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+
+		if (sin6) {
+			sin6->sin6_family = AF_INET6;
+			sin6->sin6_port = 0;
+			sin6->sin6_addr = ip6->saddr;
+			sin6->sin6_flowinfo = 0;
+			if (np->sndflow)
+				sin6->sin6_flowinfo = ip6_flowinfo(ip6);
+			sin6->sin6_scope_id =
+				ipv6_iface_scope_id(&sin6->sin6_addr,
+						    inet6_iif(skb));
+			*addr_len = sizeof(*sin6);
+		}
+
+		if (inet6_sk(sk)->rxopt.all)
+			pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb);
+		if (skb->protocol == htons(ETH_P_IPV6) &&
+		    inet6_sk(sk)->rxopt.all)
+			pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb);
+		else if (skb->protocol == htons(ETH_P_IP) &&
+			 inet_cmsg_flags(isk))
+			ip_cmsg_recv(msg, skb);
+#endif
+	} else {
+		BUG();
+	}
+
+	err = copied;	/* success: report the number of bytes copied */
+
+done:
+	skb_free_datagram(sk, skb);
+out:
+	pr_debug("ping_recvmsg -> %d\n", err);
+	return err;
+}
+EXPORT_SYMBOL_GPL(ping_recvmsg);
+
+static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk,
+						 struct sk_buff *skb)
+{
+	enum skb_drop_reason reason;
+
+	pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
+		 inet_sk(sk), inet_sk(sk)->inet_num, skb);
+	if (sock_queue_rcv_skb_reason(sk, skb, &reason) >= 0)
+		return SKB_NOT_DROPPED_YET;
+	/* Queueing failed: free the skb here, tagged with the reported reason. */
+	kfree_skb_reason(skb, reason);
+	pr_debug("ping_queue_rcv_skb -> failed\n");
+	return reason;
+}
+
+int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	return (__ping_queue_rcv_skb(sk, skb) != 0) ? -1 : 0;	/* any nonzero reason maps to -1 */
+}
+EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
+
+
+/*
+ *	All we need to do is get the socket.
+ */
+
+enum skb_drop_reason ping_rcv(struct sk_buff *skb)
+{
+	struct icmphdr *icmph = icmp_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+	enum skb_drop_reason reason;
+	struct sk_buff *clone;
+	struct sock *sk;
+
+	/* Assumes icmp_rcv has already checked the packet. */
+	pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
+		 skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+
+	/* Make skb->data point at the ICMP header again. */
+	skb_push(skb, skb->data - (u8 *)icmph);
+
+	sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
+	if (!sk) {
+		reason = SKB_DROP_REASON_NO_SOCKET;
+	} else {
+		/* The socket is handed a clone; @skb itself is not queued. */
+		clone = skb_clone(skb, GFP_ATOMIC);
+		pr_debug("rcv on socket %p\n", sk);
+		reason = clone ? __ping_queue_rcv_skb(sk, clone) :
+				 SKB_DROP_REASON_NOMEM;
+	}
+
+	if (reason)
+		pr_debug("no socket, dropping\n");
+
+	return reason;
+}
+EXPORT_SYMBOL_GPL(ping_rcv);
+
+struct proto ping_prot = {
+	.name =		"PING",
+	.owner =	THIS_MODULE,
+	.init =		ping_init_sock,	/* group-range permission check */
+	.close =	ping_close,
+	.pre_connect =	ping_pre_connect,
+	.connect =	ip4_datagram_connect,
+	.disconnect =	__udp_disconnect,
+	.setsockopt =	ip_setsockopt,
+	.getsockopt =	ip_getsockopt,
+	.sendmsg =	ping_v4_sendmsg,
+	.recvmsg =	ping_recvmsg,
+	.bind =		ping_bind,
+	.backlog_rcv =	ping_queue_rcv_skb,
+	.release_cb =	ip4_datagram_release_cb,
+	.hash =		ping_hash,	/* BUG()s: hashing is done by .get_port */
+	.unhash =	ping_unhash,
+	.get_port =	ping_get_port,
+	.put_port =	ping_unhash,	/* same routine as .unhash */
+	.obj_size =	sizeof(struct inet_sock),
+};
+EXPORT_SYMBOL(ping_prot);
+
+#ifdef CONFIG_PROC_FS
+
+static struct sock *ping_get_first(struct seq_file *seq, int start)
+{
+	struct ping_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct hlist_head *chain;
+	struct sock *sk;
+
+	/* Scan the buckets from @start onwards and return the first socket
+	 * that belongs to this netns and to the family being dumped.  The
+	 * bucket reached is kept in state->bucket for ping_get_next().
+	 */
+	state->bucket = start;
+	while (state->bucket < PING_HTABLE_SIZE) {
+		chain = &ping_table.hash[state->bucket];
+
+		sk_for_each(sk, chain) {
+			if (sk->sk_family != state->family)
+				continue;
+			if (net_eq(sock_net(sk), net))
+				return sk;
+		}
+		++state->bucket;
+	}
+	return NULL;
+}
+
+static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk)
+{
+	struct ping_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+
+	/* Skip foreign netns and, like ping_get_first(), the other family. */
+	do {
+		sk = sk_next(sk);
+	} while (sk && (!net_eq(sock_net(sk), net) ||
+			sk->sk_family != state->family));
+
+	return sk ? sk : ping_get_first(seq, state->bucket + 1);
+}
+
+static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct sock *sk;
+
+	/* Step @pos entries past the first one; NULL if the table ends first. */
+	for (sk = ping_get_first(seq, 0); sk && pos; pos--)
+		sk = ping_get_next(seq, sk);
+	return pos ? NULL : sk;
+}
+
+void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
+	__acquires(ping_table.lock)
+{
+	struct ping_iter_state *state = seq->private;
+
+	state->family = family;
+	state->bucket = 0;
+	/* Taken here; ping_seq_stop() releases it. */
+	spin_lock(&ping_table.lock);
+	return *pos == 0 ? SEQ_START_TOKEN : ping_get_idx(seq, *pos - 1);
+}
+EXPORT_SYMBOL_GPL(ping_seq_start);
+
+static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	return ping_seq_start(seq, pos, AF_INET);	/* iterate AF_INET sockets */
+}
+
+void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock *next;
+
+	/* The start token stands for the header line, so the entry after it
+	 * is the very first socket.
+	 */
+	next = (v == SEQ_START_TOKEN) ? ping_get_idx(seq, 0) :
+					ping_get_next(seq, v);
+	++*pos;
+	return next;
+}
+EXPORT_SYMBOL_GPL(ping_seq_next);
+
+void ping_seq_stop(struct seq_file *seq, void *v)
+	__releases(ping_table.lock)
+{
+	spin_unlock(&ping_table.lock);	/* pairs with spin_lock() in ping_seq_start() */
+}
+EXPORT_SYMBOL_GPL(ping_seq_stop);
+
+static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
+		int bucket)
+{
+	struct inet_sock *inet = inet_sk(sp);
+	__be32 dest = inet->inet_daddr;
+	__be32 src = inet->inet_rcv_saddr;
+	__u16 destp = ntohs(inet->inet_dport);
+	__u16 srcp = ntohs(inet->inet_sport);
+	/* Fields follow the header row printed by ping_v4_seq_show(). */
+	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u",
+		bucket, src, srcp, dest, destp, sp->sk_state,
+		sk_wmem_alloc_get(sp),	/* tx_queue */
+		sk_rmem_alloc_get(sp),	/* rx_queue */
+		0, 0L, 0,	/* tr, tm->when, retrnsmt: constant zeros */
+		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+		0, sock_i_ino(sp),	/* timeout (constant zero), inode */
+		refcount_read(&sp->sk_refcnt), sp,
+		atomic_read(&sp->sk_drops));
+}
+
+static int ping_v4_seq_show(struct seq_file *seq, void *v)
+{
+	seq_setwidth(seq, 127);
+	if (v != SEQ_START_TOKEN) {
+		struct ping_iter_state *state = seq->private;
+
+		ping_v4_format_sock(v, seq, state->bucket);
+	} else {
+		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
+			   "rx_queue tr tm->when retrnsmt   uid  timeout "
+			   "inode ref pointer drops");
+	}
+	seq_pad(seq, '\n');
+	return 0;
+}
+
+static const struct seq_operations ping_v4_seq_ops = {
+	.start		= ping_v4_seq_start,	/* takes ping_table.lock */
+	.next		= ping_seq_next,
+	.stop		= ping_seq_stop,	/* drops ping_table.lock */
+	.show		= ping_v4_seq_show,
+};
+
+static int __net_init ping_v4_proc_init_net(struct net *net)
+{
+	struct proc_dir_entry *pde = proc_create_net("icmp", 0444, net->proc_net,
+				&ping_v4_seq_ops, sizeof(struct ping_iter_state));
+
+	return pde ? 0 : -ENOMEM;	/* mode 0444 "icmp" entry under net->proc_net */
+}
+
+static void __net_exit ping_v4_proc_exit_net(struct net *net)
+{
+	remove_proc_entry("icmp", net->proc_net);	/* undoes ping_v4_proc_init_net() */
+}
+
+static struct pernet_operations ping_v4_net_ops = {
+	.exit = ping_v4_proc_exit_net,	/* removes the per-netns "icmp" entry */
+	.init = ping_v4_proc_init_net,	/* creates it */
+};
+
+int __init ping_proc_init(void)
+{
+	return register_pernet_subsys(&ping_v4_net_ops);	/* result is passed to the caller */
+}
+
+void ping_proc_exit(void)
+{
+	unregister_pernet_subsys(&ping_v4_net_ops);	/* counterpart of ping_proc_init() */
+}
+
+#endif
+
+void __init ping_init(void)
+{
+	unsigned int slot = PING_HTABLE_SIZE;
+
+	while (slot--)	/* every bucket starts out empty */
+		INIT_HLIST_HEAD(&ping_table.hash[slot]);
+	spin_lock_init(&ping_table.lock);
+}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
new file mode 100644
index 0000000000..eaf1d3113b
--- /dev/null
+++ b/net/ipv4/proc.c
@@ -0,0 +1,560 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		This file implements the various access functions for the
+ *		PROC file system.  It is mainly used for debugging and
+ *		statistics.
+ *
+ * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
+ *		Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
+ *		Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de>
+ *
+ * Fixes:
+ *		Alan Cox	:	UDP sockets show the rxqueue/txqueue
+ *					using hint flag for the netinfo.
+ *	Pauline Middelink	:	identd support
+ *		Alan Cox	:	Make /proc safer.
+ *	Erik Schoenfelder	:	/proc/net/snmp
+ *		Alan Cox	:	Handle dead sockets properly.
+ *	Gerhard Koerting	:	Show both timers
+ *		Alan Cox	:	Allow inode to be NULL (kernel socket)
+ *	Andi Kleen		:	Add support for open_requests and
+ *					split functions for more readability.
+ *	Andi Kleen		:	Add support for /proc/net/netstat
+ *	Arnaldo C. Melo		:	Convert to seq_file
+ */
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/mptcp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <linux/bottom_half.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+#include <net/sock.h>
+#include <net/raw.h>
+
+#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX)	/* sizes the one buffer reused for the Tcp, Udp and UdpLite dumps */
+
+/*
+ *	Report socket allocation statistics for the "sockstat" proc file [mea@utu.fi]
+ */
+static int sockstat_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq->private;
+	int orphans, sockets;
+
+	orphans = tcp_orphan_count_sum();	/* sampled before anything is printed */
+	sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
+
+	socket_seq_show(seq);	/* emitted ahead of the per-protocol lines below */
+	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
+		   sock_prot_inuse_get(net, &tcp_prot), orphans,
+		   refcount_read(&net->ipv4.tcp_death_row.tw_refcount) - 1,	/* NOTE(review): "- 1" presumably drops a base reference - confirm */
+		   sockets, proto_memory_allocated(&tcp_prot));
+	seq_printf(seq, "UDP: inuse %d mem %ld\n",
+		   sock_prot_inuse_get(net, &udp_prot),
+		   proto_memory_allocated(&udp_prot));
+	seq_printf(seq, "UDPLITE: inuse %d\n",
+		   sock_prot_inuse_get(net, &udplite_prot));
+	seq_printf(seq, "RAW: inuse %d\n",
+		   sock_prot_inuse_get(net, &raw_prot));
+	seq_printf(seq,  "FRAG: inuse %u memory %lu\n",
+		   atomic_read(&net->ipv4.fqdir->rhashtable.nelems),	/* element count of the fragment hash table */
+		   frag_mem_limit(net->ipv4.fqdir));
+	return 0;
+}
+
+/* snmp items: "Ip:" columns printed by snmp_seq_show_ipstats(), in table order */
+static const struct snmp_mib snmp4_ipstats_list[] = {
+	SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS),
+	SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS),
+	SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS),
+	SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
+	SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
+	SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS),
+	SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS),
+	SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTPKTS),
+	SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS),
+	SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
+	SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
+	SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS),
+	SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS),
+	SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS),
+	SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
+	SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
+	SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
+	SNMP_MIB_SENTINEL
+};
+
+/* Following items are displayed as the "IpExt:" lines of /proc/net/netstat */
+static const struct snmp_mib snmp4_ipextstats_list[] = {
+	SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
+	SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
+	SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS),
+	SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS),
+	SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS),
+	SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS),
+	SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS),
+	SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS),
+	SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS),
+	SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
+	SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
+	SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+	/* Non RFC4293 fields */
+	SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
+	SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
+	SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+	SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+	SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
+	SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
+	SNMP_MIB_SENTINEL
+};
+
+static const struct {
+	const char *name;	/* column suffix; icmp_put() prints it after "In" and "Out" */
+	int index;		/* ICMP_* value used as offset into the icmpmsg counters */
+} icmpmibmap[] = {
+	{ "DestUnreachs", ICMP_DEST_UNREACH },
+	{ "TimeExcds", ICMP_TIME_EXCEEDED },
+	{ "ParmProbs", ICMP_PARAMETERPROB },
+	{ "SrcQuenchs", ICMP_SOURCE_QUENCH },
+	{ "Redirects", ICMP_REDIRECT },
+	{ "Echos", ICMP_ECHO },
+	{ "EchoReps", ICMP_ECHOREPLY },
+	{ "Timestamps", ICMP_TIMESTAMP },
+	{ "TimestampReps", ICMP_TIMESTAMPREPLY },
+	{ "AddrMasks", ICMP_ADDRESS },
+	{ "AddrMaskReps", ICMP_ADDRESSREPLY },
+	{ NULL, 0 }	/* terminator: the loops in icmp_put() stop at name == NULL */
+};
+
+
+static const struct snmp_mib snmp4_tcp_list[] = {	/* "Tcp:" columns of the snmp file, in table order */
+	SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
+	SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
+	SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX),
+	SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN),	/* printed signed by snmp_seq_show_tcp_udp() */
+	SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS),
+	SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS),
+	SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS),
+	SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS),
+	SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB),
+	SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS),
+	SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS),
+	SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
+	SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
+	SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
+	SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),
+	SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp4_udp_list[] = {	/* columns shared by the "Udp:" and "UdpLite:" lines */
+	SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS),
+	SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS),
+	SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS),
+	SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+	SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+	SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
+	SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
+	SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI),
+	SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS),
+	SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp4_net_list[] = {	/* "TcpExt:" columns printed by netstat_seq_show(), in table order */
+	SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT),
+	SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV),
+	SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED),
+	SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS),
+	SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED),
+	SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED),
+	SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED),
+	SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS),
+	SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS),
+	SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER),
+	SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED),
+	SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED),
+	SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
+	SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
+	SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
+	SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
+	SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
+	SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
+	SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
+	SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
+	SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
+	SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
+	SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
+	SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
+	SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
+	SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
+	SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
+	SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
+	SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
+	SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO),
+	SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
+	SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
+	SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
+	SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
+	SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
+	SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
+	SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
+	SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
+	SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
+	SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
+	SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
+	SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
+	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
+	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
+	SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+	SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
+	SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
+	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
+	SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
+	SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
+	SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
+	SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
+	SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
+	SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT),
+	SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
+	SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
+	SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
+	SNMP_MIB_ITEM("TCPMemoryPressuresChrono", LINUX_MIB_TCPMEMORYPRESSURESCHRONO),
+	SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD),
+	SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD),
+	SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO),
+	SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
+	SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
+	SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
+	SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE),
+	SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
+	SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
+	SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
+	SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
+	SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP),
+	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
+	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
+	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
+	SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
+	SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
+	SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
+	SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
+	SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
+	SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
+	SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
+	SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
+	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
+	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
+	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+	SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL),
+	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
+	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
+	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
+	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
+	SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE),
+	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
+	SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
+	SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
+	SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV),
+	SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV),
+	SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV),
+	SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS),
+	SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),
+	SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT),
+	SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND),
+	SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT),
+	SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND),
+	SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV),
+	SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS),
+	SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ),
+	SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
+	SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
+	SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
+	SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE),
+	SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
+	SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
+	SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
+	SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
+	SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
+	SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
+	SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
+	SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
+	SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
+	SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY),
+	SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH),
+	SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH),
+	SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS),
+	SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS),
+	SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS),
+	SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE),
+	SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH),
+	SNMP_MIB_SENTINEL
+};
+
+/* Emit an "IcmpMsg:" name row, then a value row, for @count counters. */
+static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
+			     unsigned short *type, int count)
+{
+	int k;
+
+	if (!count)
+		return;
+	seq_puts(seq, "\nIcmpMsg:");
+	for (k = 0; k < count; k++)	/* bit 0x100 selects "Out"; low byte is the number shown */
+		seq_printf(seq, " %sType%u", (type[k] & 0x100) ? "Out" : "In",
+			   type[k] & 0xff);
+	seq_puts(seq, "\nIcmpMsg:");
+	for (k = 0; k < count; k++)
+		seq_printf(seq, " %lu", vals[k]);
+}
+
+/* Print every non-zero icmpmsg counter as "IcmpMsg:" rows, PERLINE per row. */
+static void icmpmsg_put(struct seq_file *seq)
+{
+#define PERLINE	16
+
+	unsigned short kinds[PERLINE];
+	unsigned long counts[PERLINE], cur;
+	struct net *net = seq->private;
+	int i, filled = 0;
+
+	for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
+		cur = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]);
+		if (!cur)
+			continue;
+		kinds[filled] = i;
+		counts[filled] = cur;
+		if (++filled < PERLINE)
+			continue;
+		icmpmsg_put_line(seq, counts, kinds, filled);
+		filled = 0;
+	}
+	icmpmsg_put_line(seq, counts, kinds, filled);
+
+#undef PERLINE
+}
+
+static void icmp_put(struct seq_file *seq)
+{
+	int i;
+	struct net *net = seq->private;
+	atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;	/* per-type counters; "Out" values are read at index | 0x100 */
+
+	seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");	/* name row starts here */
+	for (i = 0; icmpmibmap[i].name; i++)
+		seq_printf(seq, " In%s", icmpmibmap[i].name);
+	seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost");
+	for (i = 0; icmpmibmap[i].name; i++)
+		seq_printf(seq, " Out%s", icmpmibmap[i].name);
+	seq_printf(seq, "\nIcmp: %lu %lu %lu",	/* value row, same column order as the names */
+		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
+		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
+	for (i = 0; icmpmibmap[i].name; i++)
+		seq_printf(seq, " %lu",
+			   atomic_long_read(ptr + icmpmibmap[i].index));	/* "In" counter for this type */
+	seq_printf(seq, " %lu %lu %lu %lu",
+		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS),
+		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL),
+		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST));
+	for (i = 0; icmpmibmap[i].name; i++)
+		seq_printf(seq, " %lu",
+			   atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));	/* "Out" counter for this type */
+}
+
+/*
+ *	Called first by snmp_seq_show(): emits the "Ip:" name and value lines.
+ */
+static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
+{
+	struct net *net = seq->private;
+	u64 buff64[IPSTATS_MIB_MAX];
+	int i;
+
+	memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64));	/* cleared before the batch read below */
+
+	seq_puts(seq, "Ip: Forwarding DefaultTTL");
+	for (i = 0; snmp4_ipstats_list[i].name; i++)
+		seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
+
+	seq_printf(seq, "\nIp: %d %d",
+		   IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,	/* 1 when the FORWARDING devconf is set, otherwise 2 */
+		   READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
+
+	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);	/* compile-time check: mibs is the first member */
+	snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list,
+				   net->mib.ip_statistics,
+				   offsetof(struct ipstats_mib, syncp));
+	for (i = 0; snmp4_ipstats_list[i].name; i++)
+		seq_printf(seq, " %llu", buff64[i]);	/* buff64[i] pairs with snmp4_ipstats_list[i] */
+
+	return 0;
+}
+
+static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
+{
+	unsigned long buff[TCPUDP_MIB_MAX];	/* reused for the Tcp, Udp and UdpLite passes */
+	struct net *net = seq->private;
+	int i;
+
+	memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+
+	seq_puts(seq, "\nTcp:");	/* name row */
+	for (i = 0; snmp4_tcp_list[i].name; i++)
+		seq_printf(seq, " %s", snmp4_tcp_list[i].name);
+
+	seq_puts(seq, "\nTcp:");	/* value row */
+	snmp_get_cpu_field_batch(buff, snmp4_tcp_list,
+				 net->mib.tcp_statistics);
+	for (i = 0; snmp4_tcp_list[i].name; i++) {
+		/* MaxConn field is signed, RFC 2012 */
+		if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
+			seq_printf(seq, " %ld", buff[i]);
+		else
+			seq_printf(seq, " %lu", buff[i]);
+	}
+
+	memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));	/* clear the Tcp values before the Udp pass */
+
+	snmp_get_cpu_field_batch(buff, snmp4_udp_list,
+				 net->mib.udp_statistics);
+	seq_puts(seq, "\nUdp:");
+	for (i = 0; snmp4_udp_list[i].name; i++)
+		seq_printf(seq, " %s", snmp4_udp_list[i].name);
+	seq_puts(seq, "\nUdp:");
+	for (i = 0; snmp4_udp_list[i].name; i++)
+		seq_printf(seq, " %lu", buff[i]);
+
+	memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));	/* clear the Udp values before the UdpLite pass */
+
+	/* the UDP and UDP-Lite MIBs are the same */
+	seq_puts(seq, "\nUdpLite:");
+	snmp_get_cpu_field_batch(buff, snmp4_udp_list,
+				 net->mib.udplite_statistics);
+	for (i = 0; snmp4_udp_list[i].name; i++)
+		seq_printf(seq, " %s", snmp4_udp_list[i].name);
+	seq_puts(seq, "\nUdpLite:");
+	for (i = 0; snmp4_udp_list[i].name; i++)
+		seq_printf(seq, " %lu", buff[i]);
+
+	seq_putc(seq, '\n');	/* earlier lines each begin with "\n", so only the last needs closing */
+	return 0;
+}
+
+static int snmp_seq_show(struct seq_file *seq, void *v)
+{
+	snmp_seq_show_ipstats(seq, v);	/* "Ip:" lines */
+
+	icmp_put(seq);	/* RFC 2011 compatibility */
+	icmpmsg_put(seq);	/* "IcmpMsg:" lines for the non-zero counters */
+
+	snmp_seq_show_tcp_udp(seq, v);	/* "Tcp:", "Udp:" and "UdpLite:" lines */
+
+	return 0;	/* the helpers' return values are not propagated */
+}
+
+/*
+ *	Output /proc/net/netstat: "TcpExt:" and "IpExt:" lines, then mptcp_seq_show().
+ */
+static int netstat_seq_show(struct seq_file *seq, void *v)
+{
+	const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1;	/* entries without the sentinel */
+	const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1;	/* entries without the sentinel */
+	struct net *net = seq->private;
+	unsigned long *buff;
+	int i;
+
+	seq_puts(seq, "TcpExt:");
+	for (i = 0; i < tcp_cnt; i++)
+		seq_printf(seq, " %s", snmp4_net_list[i].name);
+
+	seq_puts(seq, "\nTcpExt:");
+	buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)),	/* one buffer large enough for either pass */
+		       GFP_KERNEL);
+	if (buff) {
+		snmp_get_cpu_field_batch(buff, snmp4_net_list,
+					 net->mib.net_statistics);
+		for (i = 0; i < tcp_cnt; i++)
+			seq_printf(seq, " %lu", buff[i]);
+	} else {	/* allocation failed: fold the fields one at a time instead */
+		for (i = 0; i < tcp_cnt; i++)
+			seq_printf(seq, " %lu",
+				   snmp_fold_field(net->mib.net_statistics,
+						   snmp4_net_list[i].entry));
+	}
+	seq_puts(seq, "\nIpExt:");
+	for (i = 0; i < ip_cnt; i++)
+		seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);
+
+	seq_puts(seq, "\nIpExt:");
+	if (buff) {
+		u64 *buff64 = (u64 *)buff;	/* same allocation, now viewed as 64-bit slots */
+
+		memset(buff64, 0, ip_cnt * sizeof(u64));	/* wipe the TcpExt values left from the first pass */
+		snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list,
+					   net->mib.ip_statistics,
+					   offsetof(struct ipstats_mib, syncp));
+		for (i = 0; i < ip_cnt; i++)
+			seq_printf(seq, " %llu", buff64[i]);
+	} else {	/* same per-field fallback as above */
+		for (i = 0; i < ip_cnt; i++)
+			seq_printf(seq, " %llu",
+				   snmp_fold_field64(net->mib.ip_statistics,
+						     snmp4_ipextstats_list[i].entry,
+						     offsetof(struct ipstats_mib, syncp)));
+	}
+	kfree(buff);	/* buff is NULL here when the allocation failed */
+	seq_putc(seq, '\n');
+	mptcp_seq_show(seq);
+	return 0;
+}
+
+static __net_init int ip_proc_init_net(struct net *net)
+{
+	if (!proc_create_net_single("sockstat", 0444, net->proc_net,
+			sockstat_seq_show, NULL))
+		goto out_sockstat;
+	if (!proc_create_net_single("netstat", 0444, net->proc_net,
+			netstat_seq_show, NULL))
+		goto out_netstat;
+	if (!proc_create_net_single("snmp", 0444, net->proc_net, snmp_seq_show,
+			NULL))
+		goto out_snmp;
+
+	return 0;
+
+out_snmp:	/* "snmp" failed: remove the two entries created before it */
+	remove_proc_entry("netstat", net->proc_net);
+out_netstat:	/* "netstat" failed: only "sockstat" exists */
+	remove_proc_entry("sockstat", net->proc_net);
+out_sockstat:	/* nothing was created */
+	return -ENOMEM;
+}
+
+static __net_exit void ip_proc_exit_net(struct net *net)
+{
+	remove_proc_entry("snmp", net->proc_net);	/* reverse of the creation order in ip_proc_init_net() */
+	remove_proc_entry("netstat", net->proc_net);
+	remove_proc_entry("sockstat", net->proc_net);
+}
+
+static __net_initdata struct pernet_operations ip_proc_ops = {
+	.init = ip_proc_init_net,	/* creates sockstat, netstat and snmp */
+	.exit = ip_proc_exit_net,	/* removes the same three entries */
+};
+
+int __init ip_misc_proc_init(void)
+{
+	return register_pernet_subsys(&ip_proc_ops);	/* result is handed straight back to the caller */
+}
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
new file mode 100644
index 0000000000..6913979948
--- /dev/null
+++ b/net/ipv4/protocol.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		INET protocol dispatch tables.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ *		Alan Cox	: Ahah! udp icmp errors don't work because
+ *				  udp_err is never called!
+ *		Alan Cox	: Added new fields for init and ready for
+ *				  proper fragmentation (_NO_ 4K limits!)
+ *		Richard Colella	: Hang on hash collision
+ *		Vince Laviano	: Modified inet_del_protocol() to correctly
+ *				  maintain copy bit.
+ */
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+
+struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;	/* one handler slot per protocol number */
+EXPORT_SYMBOL(inet_protos);
+const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;	/* one offload slot per protocol number */
+EXPORT_SYMBOL(inet_offloads);
+
+int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
+{
+	return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],	/* install @prot only if the slot is NULL */
+			NULL, prot) ? 0 : -1;	/* 0 when installed, -1 when the slot was already occupied */
+}
+EXPORT_SYMBOL(inet_add_protocol);
+
+int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
+{
+	return !cmpxchg((const struct net_offload **)&inet_offloads[protocol],	/* install @prot only if the slot is NULL */
+			NULL, prot) ? 0 : -1;	/* 0 when installed, -1 when the slot was already occupied */
+}
+EXPORT_SYMBOL(inet_add_offload);
+
+/* Clear inet_protos[@protocol] if it holds @prot; 0 on success, else -1. */
+int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
+{
+	const struct net_protocol **slot;
+	int ret;
+
+	slot = (const struct net_protocol **)&inet_protos[protocol];
+	ret = cmpxchg(slot, prot, NULL) == prot ? 0 : -1;
+	synchronize_net();	/* runs whether or not the slot matched */
+	return ret;
+}
+EXPORT_SYMBOL(inet_del_protocol);
+
+/* Clear inet_offloads[@protocol] if it holds @prot; 0 on success, else -1. */
+int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
+{
+	const struct net_offload **slot;
+	int ret;
+
+	slot = (const struct net_offload **)&inet_offloads[protocol];
+	ret = cmpxchg(slot, prot, NULL) == prot ? 0 : -1;
+	synchronize_net();	/* runs whether or not the slot matched */
+	return ret;
+}
+EXPORT_SYMBOL(inet_del_offload);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
new file mode 100644
index 0000000000..4b5db5d1ed
--- /dev/null
+++ b/net/ipv4/raw.c
@@ -0,0 +1,1110 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		RAW - implementation of IP "raw" sockets.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ *		Alan Cox	:	verify_area() fixed up
+ *		Alan Cox	:	ICMP error handling
+ *		Alan Cox	:	EMSGSIZE if you send too big a packet
+ *		Alan Cox	: 	Now uses generic datagrams and shared
+ *					skbuff library. No more peek crashes,
+ *					no more backlogs
+ *		Alan Cox	:	Checks sk->broadcast.
+ *		Alan Cox	:	Uses skb_free_datagram/skb_copy_datagram
+ *		Alan Cox	:	Raw passes ip options too
+ *		Alan Cox	:	Setsocketopt added
+ *		Alan Cox	:	Fixed error return for broadcasts
+ *		Alan Cox	:	Removed wake_up calls
+ *		Alan Cox	:	Use ttl/tos
+ *		Alan Cox	:	Cleaned up old debugging
+ *		Alan Cox	:	Use new kernel side addresses
+ *	Arnt Gulbrandsen	:	Fixed MSG_DONTROUTE in raw sockets.
+ *		Alan Cox	:	BSD style RAW socket demultiplexing.
+ *		Alan Cox	:	Beginnings of mrouted support.
+ *		Alan Cox	:	Added IP_HDRINCL option.
+ *		Alan Cox	:	Skip broadcast check if BSDism set.
+ *		David S. Miller	:	New socket lookup architecture.
+ */
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <asm/byteorder.h>
+#include <asm/current.h>
+#include <linux/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/sockios.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/mroute.h>
+#include <linux/netdevice.h>
+#include <linux/in_route.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <linux/igmp.h>
+#include <net/net_namespace.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <net/snmp.h>
+#include <net/tcp_states.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
+#include <linux/uio.h>
+
+struct raw_frag_vec {	/* NOTE(review): users lie outside this excerpt; roles below are inferred - confirm */
+	struct msghdr *msg;
+	union {
+		struct icmphdr icmph;
+		char c[1];	/* byte view sharing storage with icmph */
+	} hdr;
+	int hlen;	/* presumably the number of valid bytes in hdr - confirm */
+};
+
+struct raw_hashinfo raw_v4_hashinfo;	/* table whose ht[] chains raw_v4_input() and raw_icmp_error() walk */
+EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
+
+int raw_hash_sk(struct sock *sk)
+{
+	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
+	struct hlist_head *hlist;
+
+	hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)];	/* chain picked from netns and inet_num */
+
+	spin_lock(&h->lock);	/* h->lock is held across the insertion */
+	sk_add_node_rcu(sk, hlist);
+	sock_set_flag(sk, SOCK_RCU_FREE);
+	spin_unlock(&h->lock);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	/* matching -1 is in raw_unhash_sk() */
+
+	return 0;	/* there is no failure path */
+}
+EXPORT_SYMBOL_GPL(raw_hash_sk);
+
+void raw_unhash_sk(struct sock *sk)
+{
+	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
+
+	spin_lock(&h->lock);
+	if (sk_del_node_init_rcu(sk))	/* in-use count drops only when this returns non-zero */
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	spin_unlock(&h->lock);
+}
+EXPORT_SYMBOL_GPL(raw_unhash_sk);
+
+/* Match @sk: same netns, inet_num == @num, daddr/rcv_saddr unset or equal to
+ * @raddr/@laddr, and raw_sk_bound_dev_eq() accepts @dif/@sdif. */
+bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num,
+		  __be32 raddr, __be32 laddr, int dif, int sdif)
+{
+	const struct inet_sock *isk = inet_sk(sk);
+
+	return net_eq(sock_net(sk), net) && isk->inet_num == num &&
+	       (!isk->inet_daddr || isk->inet_daddr == raddr) &&
+	       (!isk->inet_rcv_saddr || isk->inet_rcv_saddr == laddr) &&
+	       raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif);
+}
+EXPORT_SYMBOL_GPL(raw_v4_match);
+
+/*
+ *	Per-socket ICMP type filter: returns 0 to deliver, 1 to block
+ *	(also 1 when the ICMP header cannot be obtained from the skb).
+ */
+static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
+{
+	struct icmphdr _hdr;	/* backing storage handed to skb_header_pointer() */
+	const struct icmphdr *hdr;
+
+	hdr = skb_header_pointer(skb, skb_transport_offset(skb),
+				 sizeof(_hdr), &_hdr);
+	if (!hdr)
+		return 1;
+
+	if (hdr->type < 32) {	/* the mask is 32 bits wide, one bit per type */
+		__u32 data = raw_sk(sk)->filter.data;
+
+		return ((1U << hdr->type) & data) != 0;	/* a set bit blocks that type */
+	}
+
+	/* Do not block unknown ICMP types */
+	return 0;
+}
+
+/* IP input processing comes here for RAW socket delivery.
+ * Caller owns SKB, so we must make clones.
+ *
+ * RFC 1122: SHOULD pass TOS value up to the transport layer.
+ * -> It does. And not only TOS, but all IP header.
+ */
+static int raw_v4_input(struct net *net, struct sk_buff *skb,
+			const struct iphdr *iph, int hash)
+{
+	int sdif = inet_sdif(skb);
+	struct hlist_head *hlist;
+	int dif = inet_iif(skb);
+	int delivered = 0;	/* becomes 1 as soon as any socket matches */
+	struct sock *sk;
+
+	hlist = &raw_v4_hashinfo.ht[hash];
+	rcu_read_lock();
+	sk_for_each_rcu(sk, hlist) {
+		if (!raw_v4_match(net, sk, iph->protocol,
+				  iph->saddr, iph->daddr, dif, sdif))
+			continue;
+		delivered = 1;	/* set even if the filters below reject the packet */
+		if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
+		    ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
+				   skb->dev->ifindex, sdif)) {
+			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+
+			/* Still inside the RCU read section; a failed clone is skipped silently. */
+			if (clone)
+				raw_rcv(sk, clone);
+		}
+	}
+	rcu_read_unlock();
+	return delivered;
+}
+
+int raw_local_deliver(struct sk_buff *skb, int protocol)
+{
+	struct net *net = dev_net(skb->dev);
+
+	return raw_v4_input(net, skb, ip_hdr(skb),	/* returns raw_v4_input()'s "delivered" flag */
+			    raw_hashfunc(net, protocol));
+}
+
+static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	int harderr = 0;	/* non-zero: set sk_err even without RECVERR */
+	bool recverr;
+	int err = 0;
+
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		ipv4_sk_update_pmtu(skb, sk, info);
+	else if (type == ICMP_REDIRECT) {
+		ipv4_sk_redirect(skb, sk);
+		return;	/* redirects are never reported to the socket */
+	}
+
+	/* Report error on raw socket, if:
+	   1. User requested ip_recverr.
+	   2. Socket is connected (otherwise the error indication
+	      is useless without ip_recverr and error is hard).
+	 */
+	recverr = inet_test_bit(RECVERR, sk);
+	if (!recverr && sk->sk_state != TCP_ESTABLISHED)
+		return;
+
+	switch (type) {
+	default:	/* unlisted types share the TIME_EXCEEDED handling */
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	case ICMP_SOURCE_QUENCH:
+		return;	/* ignored */
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		harderr = 1;
+		break;
+	case ICMP_DEST_UNREACH:
+		err = EHOSTUNREACH;
+		if (code > NR_ICMP_UNREACH)	/* code beyond the table: keep EHOSTUNREACH, soft */
+			break;
+		if (code == ICMP_FRAG_NEEDED) {
+			harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+			err = EMSGSIZE;
+		} else {
+			err = icmp_err_convert[code].errno;
+			harderr = icmp_err_convert[code].fatal;
+		}
+	}
+
+	if (recverr) {
+		const struct iphdr *iph = (const struct iphdr *)skb->data;
+		u8 *payload = skb->data + (iph->ihl << 2);	/* ihl counts 32-bit words: skip the IP header */
+
+		if (inet_test_bit(HDRINCL, sk))
+			payload = skb->data;	/* HDRINCL sockets get the IP header as part of the payload */
+		ip_icmp_error(sk, skb, err, 0, info, payload);
+	}
+
+	if (recverr || harderr) {
+		sk->sk_err = err;
+		sk_error_report(sk);
+	}
+}
+
+void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
+{
+	struct net *net = dev_net(skb->dev);
+	int dif = skb->dev->ifindex;
+	int sdif = inet_sdif(skb);
+	struct hlist_head *hlist;
+	const struct iphdr *iph;
+	struct sock *sk;
+	int hash;
+
+	hash = raw_hashfunc(net, protocol);
+	hlist = &raw_v4_hashinfo.ht[hash];
+
+	rcu_read_lock();
+	sk_for_each_rcu(sk, hlist) {	/* every matching socket gets raw_err() */
+		iph = (const struct iphdr *)skb->data;	/* NOTE(review): presumably the header quoted inside the ICMP error - confirm */
+		if (!raw_v4_match(net, sk, iph->protocol,
+				  iph->daddr, iph->saddr, dif, sdif))	/* daddr/saddr are swapped relative to raw_v4_input() */
+			continue;
+		raw_err(sk, skb, info);
+	}
+	rcu_read_unlock();
+}
+
+static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{	/* Queue @skb on @sk; returns NET_RX_SUCCESS, or NET_RX_DROP after freeing @skb. */
+	enum skb_drop_reason reason;
+
+	/* Charge it to the socket. */
+
+	ipv4_pktinfo_prepare(sk, skb);
+	if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
+		kfree_skb_reason(skb, reason);	/* free with the reason the queueing call filled in */
+		return NET_RX_DROP;
+	}
+
+	return NET_RX_SUCCESS;
+}
+
+int raw_rcv(struct sock *sk, struct sk_buff *skb)
+{	/* Receive @skb on @sk: XFRM policy check, restore the network header, then queue it. */
+	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+		atomic_inc(&sk->sk_drops);	/* policy drops are counted in sk_drops */
+		kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY);
+		return NET_RX_DROP;
+	}
+	nf_reset_ct(skb);
+
+	skb_push(skb, skb->data - skb_network_header(skb));	/* pushes by the data-to-network-header distance */
+
+	raw_rcv_skb(sk, skb);
+	return 0;	/* 0 is returned whatever raw_rcv_skb() reported */
+}
+
+static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
+			   struct msghdr *msg, size_t length,
+			   struct rtable **rtp, unsigned int flags,
+			   const struct sockcm_cookie *sockc)
+{	/* HDRINCL send path: copy @length bytes of @msg into a new skb and pass it to the LOCAL_OUT hook. */
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	struct iphdr *iph;
+	struct sk_buff *skb;
+	unsigned int iphlen;
+	int err;
+	struct rtable *rt = *rtp;
+	int hlen, tlen;
+
+	if (length > rt->dst.dev->mtu) {	/* this path rejects anything larger than the device MTU */
+		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+			       rt->dst.dev->mtu);
+		return -EMSGSIZE;
+	}
+	if (length < sizeof(struct iphdr))	/* must hold at least a minimal IP header */
+		return -EINVAL;
+
+	if (flags&MSG_PROBE)	/* probe only: return 0 without building a packet */
+		goto out;
+
+	hlen = LL_RESERVED_SPACE(rt->dst.dev);
+	tlen = rt->dst.dev->needed_tailroom;
+	skb = sock_alloc_send_skb(sk,
+				  length + hlen + tlen + 15,
+				  flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		goto error;	/* err was filled in by sock_alloc_send_skb() */
+	skb_reserve(skb, hlen);
+
+	skb->priority = READ_ONCE(sk->sk_priority);
+	skb->mark = sockc->mark;
+	skb->tstamp = sockc->transmit_time;
+	skb_dst_set(skb, &rt->dst);
+	*rtp = NULL;	/* route is now attached to the skb; clear the caller's pointer */
+
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	skb_put(skb, length);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	skb_setup_tx_timestamp(skb, sockc->tsflags);
+
+	if (flags & MSG_CONFIRM)
+		skb_set_dst_pending_confirm(skb, 1);
+
+	skb->transport_header = skb->network_header;	/* advanced by iphlen below once ihl is validated */
+	err = -EFAULT;
+	if (memcpy_from_msg(iph, msg, length))
+		goto error_free;
+
+	iphlen = iph->ihl * 4;	/* header length claimed by the user-supplied header */
+
+	/*
+	 * We don't want to modify the ip header, but we do need to
+	 * be sure that it won't cause problems later along the network
+	 * stack.  Specifically we want to make sure that iph->ihl is a
+	 * sane value.  If ihl points beyond the length of the buffer passed
+	 * in, reject the frame as invalid
+	 */
+	err = -EINVAL;
+	if (iphlen > length)
+		goto error_free;
+
+	if (iphlen >= sizeof(*iph)) {	/* fix-ups below apply only when ihl covers a full iphdr */
+		if (!iph->saddr)
+			iph->saddr = fl4->saddr;	/* zero source: use the flow's source address */
+		iph->check   = 0;
+		iph->tot_len = htons(length);
+		if (!iph->id)
+			ip_select_ident(net, skb, NULL);	/* zero id: let the stack pick one */
+
+		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+		skb->transport_header += iphlen;
+		if (iph->protocol == IPPROTO_ICMP &&
+		    length >= iphlen + sizeof(struct icmphdr))
+			icmp_out_count(net, ((struct icmphdr *)
+				skb_transport_header(skb))->type);
+	}
+
+	err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
+		      net, sk, skb, NULL, rt->dst.dev,
+		      dst_output);
+	if (err > 0)
+		err = net_xmit_errno(err);	/* positive results are converted by net_xmit_errno() */
+	if (err)
+		goto error;	/* skb is not freed here after NF_HOOK() */
+out:
+	return 0;
+
+error_free:
+	kfree_skb(skb);
+error:
+	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
+	if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk))	/* ENOBUFS is hidden unless RECVERR is set */
+		err = 0;
+	return err;
+}
+
+static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4)
+{	/* For ICMP flows, read the ICMP type and code from the message into @fl4. Returns 0 or a copy error. */
+	int err;
+
+	if (fl4->flowi4_proto != IPPROTO_ICMP)
+		return 0;	/* nothing to probe for other protocols; rfv->hlen is left as the caller set it */
+
+	/* We only need the first two bytes. */
+	rfv->hlen = 2;
+
+	err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen);
+	if (err)
+		return err;
+
+	fl4->fl4_icmp_type = rfv->hdr.icmph.type;
+	fl4->fl4_icmp_code = rfv->hdr.icmph.code;
+
+	return 0;
+}
+
+static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
+		       struct sk_buff *skb)
+{	/* getfrag callback: bytes below rfv->hlen come from rfv->hdr, the rest via ip_generic_getfrag(). */
+	struct raw_frag_vec *rfv = from;
+
+	if (offset < rfv->hlen) {
+		int copy = min(rfv->hlen - offset, len);	/* header bytes that fall inside this request */
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL)
+			memcpy(to, rfv->hdr.c + offset, copy);	/* plain copy; skb->csum is not touched in this mode */
+		else
+			skb->csum = csum_block_add(
+				skb->csum,
+				csum_partial_copy_nocheck(rfv->hdr.c + offset,
+							  to, copy),
+				odd);
+
+		odd = 0;
+		offset += copy;
+		to += copy;
+		len -= copy;
+
+		if (!len)
+			return 0;	/* request satisfied entirely from the saved header */
+	}
+
+	offset -= rfv->hlen;	/* rebase the offset past the bytes served from rfv->hdr */
+
+	return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb);
+}
+
+static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{	/* sendmsg handler for raw sockets; returns @len on success or a negative errno. */
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	struct ipcm_cookie ipc;
+	struct rtable *rt = NULL;
+	struct flowi4 fl4;
+	u8 tos, scope;
+	int free = 0;	/* set when ipc.opt must be kfree()d at done: */
+	__be32 daddr;
+	__be32 saddr;
+	int err;
+	struct ip_options_data opt_copy;
+	struct raw_frag_vec rfv;
+	int hdrincl;
+
+	err = -EMSGSIZE;
+	if (len > 0xFFFF)
+		goto out;
+
+	hdrincl = inet_test_bit(HDRINCL, sk);	/* sampled once and used for the whole call */
+
+	/*
+	 *	Check the flags.
+	 */
+
+	err = -EOPNOTSUPP;
+	if (msg->msg_flags & MSG_OOB)	/* Mirror BSD error message */
+		goto out;               /* compatibility */
+
+	/*
+	 *	Get and verify the address.
+	 */
+
+	if (msg->msg_namelen) {
+		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+		err = -EINVAL;
+		if (msg->msg_namelen < sizeof(*usin))
+			goto out;
+		if (usin->sin_family != AF_INET) {
+			pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n",
+				     __func__, current->comm);
+			err = -EAFNOSUPPORT;
+			if (usin->sin_family)	/* a zero family is tolerated; any other value fails */
+				goto out;
+		}
+		daddr = usin->sin_addr.s_addr;
+		/* ANK: I did not forget to get protocol from port field.
+		 * I just do not know, who uses this weirdness.
+		 * IP_HDRINCL is much more convenient.
+		 */
+	} else {
+		err = -EDESTADDRREQ;
+		if (sk->sk_state != TCP_ESTABLISHED)	/* no address given: the socket must be connected */
+			goto out;
+		daddr = inet->inet_daddr;
+	}
+
+	ipcm_init_sk(&ipc, inet);
+	/* Keep backward compat */
+	if (hdrincl)
+		ipc.protocol = IPPROTO_RAW;
+
+	if (msg->msg_controllen) {
+		err = ip_cmsg_send(sk, msg, &ipc, false);
+		if (unlikely(err)) {
+			kfree(ipc.opt);
+			goto out;
+		}
+		if (ipc.opt)
+			free = 1;	/* options came from ip_cmsg_send(); released at done: */
+	}
+
+	saddr = ipc.addr;
+	ipc.addr = daddr;
+
+	if (!ipc.opt) {	/* no per-call options: fall back to the socket's inet_opt */
+		struct ip_options_rcu *inet_opt;
+
+		rcu_read_lock();
+		inet_opt = rcu_dereference(inet->inet_opt);
+		if (inet_opt) {
+			memcpy(&opt_copy, inet_opt,
+			       sizeof(*inet_opt) + inet_opt->opt.optlen);
+			ipc.opt = &opt_copy.opt;	/* on-stack copy, so free stays 0 */
+		}
+		rcu_read_unlock();
+	}
+
+	if (ipc.opt) {
+		err = -EINVAL;
+		/* Linux does not mangle headers on raw sockets,
+		 * so that IP options + IP_HDRINCL is non-sense.
+		 */
+		if (hdrincl)
+			goto done;
+		if (ipc.opt->opt.srr) {
+			if (!daddr)
+				goto done;
+			daddr = ipc.opt->opt.faddr;	/* with srr set, route towards opt.faddr instead */
+		}
+	}
+	tos = get_rttos(&ipc, inet);
+	scope = ip_sendmsg_scope(inet, &ipc, msg);
+
+	if (ipv4_is_multicast(daddr)) {
+		if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
+			ipc.oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+	} else if (!ipc.oif) {
+		ipc.oif = inet->uc_index;
+	} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
+		/* oif is set, packet is to local broadcast
+		 * and uc_index is set. oif is most likely set
+		 * by sk_bound_dev_if. If uc_index != oif check if the
+		 * oif is an L3 master and uc_index is an L3 slave.
+		 * If so, we want to allow the send using the uc_index.
+		 */
+		if (ipc.oif != inet->uc_index &&
+		    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
+							      inet->uc_index)) {
+			ipc.oif = inet->uc_index;
+		}
+	}
+
+	flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope,
+			   hdrincl ? ipc.protocol : sk->sk_protocol,
+			   inet_sk_flowi_flags(sk) |
+			    (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
+			   daddr, saddr, 0, 0, sk->sk_uid);
+
+	if (!hdrincl) {
+		rfv.msg = msg;
+		rfv.hlen = 0;	/* raw_probe_proto_opt() raises this only for ICMP flows */
+
+		err = raw_probe_proto_opt(&rfv, &fl4);
+		if (err)
+			goto done;
+	}
+
+	security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4));
+	rt = ip_route_output_flow(net, &fl4, sk);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		rt = NULL;	/* done: calls ip_rt_put(rt); must not see the ERR_PTR */
+		goto done;
+	}
+
+	err = -EACCES;
+	if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
+		goto done;
+
+	if (msg->msg_flags & MSG_CONFIRM)
+		goto do_confirm;
+back_from_confirm:
+
+	if (hdrincl)
+		err = raw_send_hdrinc(sk, &fl4, msg, len,
+				      &rt, msg->msg_flags, &ipc.sockc);
+
+	 else {
+		if (!ipc.addr)
+			ipc.addr = fl4.daddr;
+		lock_sock(sk);
+		err = ip_append_data(sk, &fl4, raw_getfrag,
+				     &rfv, len, 0,
+				     &ipc, &rt, msg->msg_flags);
+		if (err)
+			ip_flush_pending_frames(sk);
+		else if (!(msg->msg_flags & MSG_MORE)) {
+			err = ip_push_pending_frames(sk, &fl4);
+			if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk))	/* ENOBUFS is hidden unless RECVERR is set */
+				err = 0;
+		}
+		release_sock(sk);
+	}
+done:
+	if (free)
+		kfree(ipc.opt);
+	ip_rt_put(rt);
+
+out:
+	if (err < 0)
+		return err;
+	return len;	/* any non-negative err is reported as the full length */
+
+do_confirm:
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(&rt->dst, &fl4.daddr);
+	if (!(msg->msg_flags & MSG_PROBE) || len)	/* only a zero-length MSG_PROBE stops here */
+		goto back_from_confirm;
+	err = 0;
+	goto done;
+}
+
+static void raw_close(struct sock *sk, long timeout)
+{	/* .close handler of raw_prot; @timeout is not used here. */
+	/*
+	 * Raw sockets may have direct kernel references. Kill them.
+	 */
+	ip_ra_control(sk, 0, NULL);
+
+	sk_common_release(sk);
+}
+
+static void raw_destroy(struct sock *sk)
+{	/* .destroy handler of raw_prot: flush pending frames while holding the socket lock. */
+	lock_sock(sk);
+	ip_flush_pending_frames(sk);
+	release_sock(sk);
+}
+
+/* This gets rid of all the nasties in af_inet. -DaveM */
+static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{	/* Bind @sk to the address in @uaddr; returns 0, -EINVAL or -EADDRNOTAVAIL. */
+	struct inet_sock *inet = inet_sk(sk);
+	struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+	struct net *net = sock_net(sk);
+	u32 tb_id = RT_TABLE_LOCAL;
+	int ret = -EINVAL;
+	int chk_addr_ret;
+
+	lock_sock(sk);
+	if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))	/* wrong state or short address: -EINVAL */
+		goto out;
+
+	if (sk->sk_bound_dev_if)
+		tb_id = l3mdev_fib_table_by_index(net,
+						  sk->sk_bound_dev_if) ? : tb_id;	/* zero result keeps RT_TABLE_LOCAL */
+
+	chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
+
+	ret = -EADDRNOTAVAIL;
+	if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
+					 chk_addr_ret))
+		goto out;
+
+	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
+	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+		inet->inet_saddr = 0;  /* Use device */
+	sk_dst_reset(sk);	/* reset the socket's dst now that the address changed */
+	ret = 0;
+out:
+	release_sock(sk);
+	return ret;
+}
+
+/*
+ *	This should be easy: if there is something there
+ *	we return it, otherwise we block.
+ */
+
+static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+		       int flags, int *addr_len)
+{	/* recvmsg handler for raw sockets; returns the byte count or an error code. */
+	struct inet_sock *inet = inet_sk(sk);
+	size_t copied = 0;
+	int err = -EOPNOTSUPP;
+	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+	struct sk_buff *skb;
+
+	if (flags & MSG_OOB)	/* fails with the initial -EOPNOTSUPP */
+		goto out;
+
+	if (flags & MSG_ERRQUEUE) {
+		err = ip_recv_error(sk, msg, len, addr_len);	/* error-queue reads are delegated entirely */
+		goto out;
+	}
+
+	skb = skb_recv_datagram(sk, flags, &err);
+	if (!skb)
+		goto out;
+
+	copied = skb->len;
+	if (len < copied) {
+		msg->msg_flags |= MSG_TRUNC;	/* tell the caller the datagram did not fit */
+		copied = len;
+	}
+
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
+	if (err)
+		goto done;
+
+	sock_recv_cmsgs(msg, sk, skb);
+
+	/* Copy the address. */
+	if (sin) {
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+		sin->sin_port = 0;	/* the port is always reported as 0 */
+		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+		*addr_len = sizeof(*sin);
+	}
+	if (inet_cmsg_flags(inet))
+		ip_cmsg_recv(msg, skb);
+	if (flags & MSG_TRUNC)
+		copied = skb->len;	/* caller passed MSG_TRUNC: report the full datagram length */
+done:
+	skb_free_datagram(sk, skb);
+out:
+	if (err)
+		return err;
+	return copied;
+}
+
+static int raw_sk_init(struct sock *sk)
+{	/* .init handler of raw_prot; always returns 0. */
+	struct raw_sock *rp = raw_sk(sk);
+
+	if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
+		memset(&rp->filter, 0, sizeof(rp->filter));	/* ICMP sockets start with an all-zero filter */
+	return 0;
+}
+
+static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen)
+{	/* Copy the caller's ICMP filter into the socket; returns 0 or -EFAULT. */
+	if (optlen > sizeof(struct icmp_filter))
+		optlen = sizeof(struct icmp_filter);	/* longer input is silently truncated */
+	if (copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen))
+		return -EFAULT;
+	return 0;
+}
+
+static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen)
+{	/* Copy the socket's ICMP filter to user space; returns 0, -EINVAL or -EFAULT. */
+	int len, ret = -EFAULT;
+
+	if (get_user(len, optlen))
+		goto out;
+	ret = -EINVAL;
+	if (len < 0)
+		goto out;
+	if (len > sizeof(struct icmp_filter))
+		len = sizeof(struct icmp_filter);	/* never copy out more than the filter itself */
+	ret = -EFAULT;
+	if (put_user(len, optlen) ||	/* the clamped length is written back to *optlen */
+	    copy_to_user(optval, &raw_sk(sk)->filter, len))
+		goto out;
+	ret = 0;
+out:	return ret;
+}
+
+static int do_raw_setsockopt(struct sock *sk, int level, int optname,
+			     sockptr_t optval, unsigned int optlen)
+{	/* Set a SOL_RAW option; ICMP_FILTER is the only one handled. */
+	if (optname == ICMP_FILTER) {
+		if (inet_sk(sk)->inet_num != IPPROTO_ICMP)	/* the filter exists for ICMP sockets only */
+			return -EOPNOTSUPP;
+		else
+			return raw_seticmpfilter(sk, optval, optlen);
+	}
+	return -ENOPROTOOPT;
+}
+
+static int raw_setsockopt(struct sock *sk, int level, int optname,
+			  sockptr_t optval, unsigned int optlen)
+{	/* .setsockopt handler: SOL_RAW is handled here, every other level goes to ip_setsockopt(). */
+	if (level != SOL_RAW)
+		return ip_setsockopt(sk, level, optname, optval, optlen);
+	return do_raw_setsockopt(sk, level, optname, optval, optlen);
+}
+
+static int do_raw_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{	/* Get a SOL_RAW option; ICMP_FILTER is the only one handled. */
+	if (optname == ICMP_FILTER) {
+		if (inet_sk(sk)->inet_num != IPPROTO_ICMP)	/* the filter exists for ICMP sockets only */
+			return -EOPNOTSUPP;
+		else
+			return raw_geticmpfilter(sk, optval, optlen);
+	}
+	return -ENOPROTOOPT;
+}
+
+static int raw_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{	/* .getsockopt handler: SOL_RAW is handled here, every other level goes to ip_getsockopt(). */
+	if (level != SOL_RAW)
+		return ip_getsockopt(sk, level, optname, optval, optlen);
+	return do_raw_getsockopt(sk, level, optname, optval, optlen);
+}
+
+static int raw_ioctl(struct sock *sk, int cmd, int *karg)
+{	/* .ioctl handler: answers SIOCOUTQ/SIOCINQ through @karg; other commands go to ipmr if built in. */
+	switch (cmd) {
+	case SIOCOUTQ: {
+		*karg = sk_wmem_alloc_get(sk);
+		return 0;
+	}
+	case SIOCINQ: {
+		struct sk_buff *skb;
+
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb)
+			*karg = skb->len;	/* length of the first queued skb only, not the whole queue */
+		else
+			*karg = 0;
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+		return 0;
+	}
+
+	default:
+#ifdef CONFIG_IP_MROUTE
+		return ipmr_ioctl(sk, cmd, karg);
+#else
+		return -ENOIOCTLCMD;
+#endif
+	}
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+{	/* .compat_ioctl handler: only forwards to ipmr_compat_ioctl() when CONFIG_IP_MROUTE is set. */
+	switch (cmd) {
+	case SIOCOUTQ:
+	case SIOCINQ:
+		return -ENOIOCTLCMD;	/* NOTE(review): presumably lets the caller fall back to raw_ioctl() - confirm */
+	default:
+#ifdef CONFIG_IP_MROUTE
+		return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));
+#else
+		return -ENOIOCTLCMD;
+#endif
+	}
+}
+#endif
+
+int raw_abort(struct sock *sk, int err)
+{	/* .diag_destroy handler of raw_prot: report @err on @sk and disconnect it. Always returns 0. */
+	lock_sock(sk);
+
+	sk->sk_err = err;
+	sk_error_report(sk);
+	__udp_disconnect(sk, 0);	/* same routine raw_prot installs as .disconnect */
+
+	release_sock(sk);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(raw_abort);
+
+struct proto raw_prot = {	/* struct proto operations table named "RAW", wired to the raw_* handlers of this file */
+	.name		   = "RAW",
+	.owner		   = THIS_MODULE,
+	.close		   = raw_close,
+	.destroy	   = raw_destroy,
+	.connect	   = ip4_datagram_connect,
+	.disconnect	   = __udp_disconnect,
+	.ioctl		   = raw_ioctl,
+	.init		   = raw_sk_init,
+	.setsockopt	   = raw_setsockopt,
+	.getsockopt	   = raw_getsockopt,
+	.sendmsg	   = raw_sendmsg,
+	.recvmsg	   = raw_recvmsg,
+	.bind		   = raw_bind,
+	.backlog_rcv	   = raw_rcv_skb,
+	.release_cb	   = ip4_datagram_release_cb,
+	.hash		   = raw_hash_sk,
+	.unhash		   = raw_unhash_sk,
+	.obj_size	   = sizeof(struct raw_sock),
+	.useroffset	   = offsetof(struct raw_sock, filter),	/* useroffset/usersize cover exactly the filter member */
+	.usersize	   = sizeof_field(struct raw_sock, filter),
+	.h.raw_hash	   = &raw_v4_hashinfo,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	   = compat_raw_ioctl,
+#endif
+	.diag_destroy	   = raw_abort,
+};
+
+#ifdef CONFIG_PROC_FS
+static struct sock *raw_get_first(struct seq_file *seq, int bucket)
+{	/* Return the first socket of this seq_file's netns at or after @bucket, or NULL. */
+	struct raw_hashinfo *h = pde_data(file_inode(seq->file));
+	struct raw_iter_state *state = raw_seq_private(seq);
+	struct hlist_head *hlist;
+	struct sock *sk;
+
+	for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE;
+			++state->bucket) {	/* state->bucket is left at the bucket of the returned socket */
+		hlist = &h->ht[state->bucket];
+		sk_for_each(sk, hlist) {
+			if (sock_net(sk) == seq_file_net(seq))
+				return sk;
+		}
+	}
+	return NULL;
+}
+
+static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
+{	/* Return the next socket of this netns after @sk, moving on to later buckets when needed. */
+	struct raw_iter_state *state = raw_seq_private(seq);
+
+	do {
+		sk = sk_next(sk);
+	} while (sk && sock_net(sk) != seq_file_net(seq));	/* skip sockets of other namespaces */
+
+	if (!sk)
+		return raw_get_first(seq, state->bucket + 1);	/* bucket exhausted: continue with the next one */
+	return sk;
+}
+
+static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
+{	/* Return the socket @pos steps after the first one, or NULL if the walk ends earlier. */
+	struct sock *sk = raw_get_first(seq, 0);
+
+	if (sk)
+		while (pos && (sk = raw_get_next(seq, sk)) != NULL)
+			--pos;
+	return pos ? NULL : sk;	/* pos left over means the table had fewer entries */
+}
+
+void *raw_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(&h->lock)
+{	/* seq_file .start: take h->lock and return SEQ_START_TOKEN or the socket at *pos - 1. */
+	struct raw_hashinfo *h = pde_data(file_inode(seq->file));
+
+	spin_lock(&h->lock);	/* dropped again in raw_seq_stop() */
+
+	return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+EXPORT_SYMBOL_GPL(raw_seq_start);
+
+void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{	/* seq_file .next: advance from @v to the following socket and increment *pos. */
+	struct sock *sk;
+
+	if (v == SEQ_START_TOKEN)
+		sk = raw_get_first(seq, 0);	/* the start token is followed by the first socket */
+	else
+		sk = raw_get_next(seq, v);
+	++*pos;
+	return sk;
+}
+EXPORT_SYMBOL_GPL(raw_seq_next);
+
+void raw_seq_stop(struct seq_file *seq, void *v)
+	__releases(&h->lock)
+{	/* seq_file .stop: release the h->lock taken in raw_seq_start(); @v is unused. */
+	struct raw_hashinfo *h = pde_data(file_inode(seq->file));
+
+	spin_unlock(&h->lock);
+}
+EXPORT_SYMBOL_GPL(raw_seq_stop);
+
+static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
+{	/* Print one table row for socket @sp; @i becomes the leading "%4d:" column. */
+	struct inet_sock *inet = inet_sk(sp);
+	__be32 dest = inet->inet_daddr,
+	       src = inet->inet_rcv_saddr;
+	__u16 destp = 0,	/* the remote port column is always 0 */
+	      srcp  = inet->inet_num;	/* the local port column carries inet_num */
+
+	seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n",
+		i, src, srcp, dest, destp, sp->sk_state,
+		sk_wmem_alloc_get(sp),
+		sk_rmem_alloc_get(sp),
+		0, 0L, 0,	/* constant zeros fill the %02X:%08lX %08X fields */
+		from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+		0, sock_i_ino(sp),
+		refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
+}
+
+static int raw_seq_show(struct seq_file *seq, void *v)
+{	/* seq_file .show: print the header line for the start token, otherwise one socket row. */
+	if (v == SEQ_START_TOKEN)
+		seq_printf(seq, "  sl  local_address rem_address   st tx_queue "
+				"rx_queue tr tm->when retrnsmt   uid  timeout "
+				"inode ref pointer drops\n");
+	else
+		raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket);	/* first column is the hash bucket, not a running index */
+	return 0;
+}
+
+static const struct seq_operations raw_seq_ops = {	/* iterator callbacks passed to proc_create_net_data() in raw_init_net() */
+	.start = raw_seq_start,
+	.next  = raw_seq_next,
+	.stop  = raw_seq_stop,
+	.show  = raw_seq_show,
+};
+
+static __net_init int raw_init_net(struct net *net)
+{	/* Create the read-only (0444) "raw" entry under net->proc_net; returns 0 or -ENOMEM. */
+	if (!proc_create_net_data("raw", 0444, net->proc_net, &raw_seq_ops,
+			sizeof(struct raw_iter_state), &raw_v4_hashinfo))	/* the hash table is the entry's private data */
+		return -ENOMEM;
+
+	return 0;
+}
+
+static __net_exit void raw_exit_net(struct net *net)
+{	/* Remove the "raw" proc entry that raw_init_net() created for @net. */
+	remove_proc_entry("raw", net->proc_net);
+}
+
+static __net_initdata struct pernet_operations raw_net_ops = {	/* per-netns setup/teardown of the "raw" proc entry */
+	.init = raw_init_net,
+	.exit = raw_exit_net,
+};
+
+int __init raw_proc_init(void)
+{	/* Register raw_net_ops; returns register_pernet_subsys()'s result. */
+
+	return register_pernet_subsys(&raw_net_ops);
+}
+
+void __init raw_proc_exit(void)
+{	/* Undo raw_proc_init() by unregistering raw_net_ops. */
+	unregister_pernet_subsys(&raw_net_ops);
+}
+#endif /* CONFIG_PROC_FS */
+
+static void raw_sysctl_init_net(struct net *net)
+{	/* Set @net's raw sysctl defaults; a no-op without CONFIG_NET_L3_MASTER_DEV. */
+#ifdef CONFIG_NET_L3_MASTER_DEV
+	net->ipv4.sysctl_raw_l3mdev_accept = 1;	/* default value is 1 */
+#endif
+}
+
+static int __net_init raw_sysctl_init(struct net *net)
+{	/* pernet .init wrapper around raw_sysctl_init_net(); always returns 0. */
+	raw_sysctl_init_net(net);
+	return 0;
+}
+
+static struct pernet_operations __net_initdata raw_sysctl_ops = {	/* only .init is set; no .exit handler */
+	.init	= raw_sysctl_init,
+};
+
+void __init raw_init(void)
+{	/* Boot-time setup of the raw sysctl defaults; panics if the pernet registration fails. */
+	raw_sysctl_init_net(&init_net);	/* init_net is set up directly before registering */
+	if (register_pernet_subsys(&raw_sysctl_ops))
+		panic("RAW: failed to init sysctl parameters.\n");
+}
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
new file mode 100644
index 0000000000..63a40e4b67
--- /dev/null
+++ b/net/ipv4/raw_diag.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+#include <net/inet_sock.h>
+#include <net/raw.h>
+#include <net/rawv6.h>
+
+#ifdef pr_fmt	/* drop any earlier definition before redefining it below */
+# undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt	/* prepends KBUILD_MODNAME and ": " to the format */
+
+static struct raw_hashinfo *
+raw_get_hashinfo(const struct inet_diag_req_v2 *r)
+{	/* Pick the raw hash table for the request's family, or ERR_PTR(-EINVAL). */
+	if (r->sdiag_family == AF_INET) {
+		return &raw_v4_hashinfo;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (r->sdiag_family == AF_INET6) {
+		return &raw_v6_hashinfo;
+#endif
+	} else {
+		return ERR_PTR(-EINVAL);	/* also reached for AF_INET6 when IPv6 is not enabled */
+	}
+}
+
+/*
+ * To avoid breaking the user API we cannot simply rename the
+ * @pad field in the inet_diag_req_v2 structure; instead, a
+ * helper is used to read the raw protocol from it.
+ */
+
+static bool raw_lookup(struct net *net, const struct sock *sk,
+		       const struct inet_diag_req_v2 *req)
+{	/* Return true when @sk matches the protocol, addresses and interface named in @req. */
+	struct inet_diag_req_raw *r = (void *)req;	/* same request, viewed through the raw-specific layout */
+
+	if (r->sdiag_family == AF_INET)
+		return raw_v4_match(net, sk, r->sdiag_raw_protocol,
+				    r->id.idiag_dst[0],
+				    r->id.idiag_src[0],
+				    r->id.idiag_if, 0);
+#if IS_ENABLED(CONFIG_IPV6)
+	else	/* every non-AF_INET family takes the IPv6 match when IPv6 is enabled */
+		return raw_v6_match(net, sk, r->sdiag_raw_protocol,
+				    (const struct in6_addr *)r->id.idiag_src,
+				    (const struct in6_addr *)r->id.idiag_dst,
+				    r->id.idiag_if, 0);
+#endif
+	return false;	/* only reachable when IPv6 is not enabled */
+}
+
+static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
+{	/* Find the first socket matching @r and return it with a reference held, or an ERR_PTR. */
+	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+	struct hlist_head *hlist;
+	struct sock *sk;
+	int slot;
+
+	if (IS_ERR(hashinfo))
+		return ERR_CAST(hashinfo);
+
+	rcu_read_lock();
+	for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {	/* every slot is scanned */
+		hlist = &hashinfo->ht[slot];
+		sk_for_each_rcu(sk, hlist) {
+			if (raw_lookup(net, sk, r)) {
+				/*
+				 * Grab it and keep until we fill
+				 * diag message to be reported, so
+				 * caller should call sock_put then.
+				 */
+				if (refcount_inc_not_zero(&sk->sk_refcnt))	/* a zero refcount is skipped; the scan goes on */
+					goto out_unlock;
+			}
+		}
+	}
+	sk = ERR_PTR(-ENOENT);
+out_unlock:
+	rcu_read_unlock();
+
+	return sk;
+}
+
+static int raw_diag_dump_one(struct netlink_callback *cb,
+			     const struct inet_diag_req_v2 *r)
+{	/* .dump_one handler: look up one socket and unicast its diag message to the requester. */
+	struct sk_buff *in_skb = cb->skb;
+	struct sk_buff *rep;
+	struct sock *sk;
+	struct net *net;
+	int err;
+
+	net = sock_net(in_skb->sk);
+	sk = raw_sock_get(net, r);
+	if (IS_ERR(sk))
+		return PTR_ERR(sk);
+
+	rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
+			inet_diag_msg_attrs_size() +
+			nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
+			GFP_KERNEL);
+	if (!rep) {
+		sock_put(sk);	/* drop the reference taken by raw_sock_get() */
+		return -ENOMEM;
+	}
+
+	err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0,
+				netlink_net_capable(in_skb, CAP_NET_ADMIN));
+	sock_put(sk);	/* the socket is not used after the fill */
+
+	if (err < 0) {
+		kfree_skb(rep);
+		return err;
+	}
+
+	err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
+	return err;
+}
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+			struct netlink_callback *cb,
+			const struct inet_diag_req_v2 *r,
+			struct nlattr *bc, bool net_admin)
+{	/* Append @sk's diag message to @skb unless inet_diag_bc_sk() rejects it. */
+	if (!inet_diag_bc_sk(bc, sk))
+		return 0;	/* a filtered-out socket is not an error */
+
+	return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin);
+}
+
+static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			  const struct inet_diag_req_v2 *r)
+{	/* .dump handler: walk the raw hash table from the position in cb->args and dump matching sockets. */
+	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+	struct net *net = sock_net(skb->sk);
+	struct inet_diag_dump_data *cb_data;
+	int num, s_num, slot, s_slot;
+	struct hlist_head *hlist;
+	struct sock *sk = NULL;
+	struct nlattr *bc;
+
+	if (IS_ERR(hashinfo))
+		return;	/* unsupported family: nothing is dumped */
+
+	cb_data = cb->data;
+	bc = cb_data->inet_diag_nla_bc;
+	s_slot = cb->args[0];	/* slot and entry index written back at the end of this function */
+	num = s_num = cb->args[1];
+
+	rcu_read_lock();
+	for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {	/* s_num only applies to the first slot */
+		num = 0;
+
+		hlist = &hashinfo->ht[slot];
+		sk_for_each_rcu(sk, hlist) {
+			struct inet_sock *inet = inet_sk(sk);
+
+			if (!net_eq(sock_net(sk), net))
+				continue;	/* foreign-netns sockets are not counted in num */
+			if (num < s_num)	/* entry lies before the resume index */
+				goto next;
+			if (sk->sk_family != r->sdiag_family)
+				goto next;
+			if (r->id.idiag_sport != inet->inet_sport &&
+			    r->id.idiag_sport)	/* a zero request port matches any socket */
+				goto next;
+			if (r->id.idiag_dport != inet->inet_dport &&
+			    r->id.idiag_dport)
+				goto next;
+			if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0)
+				goto out_unlock;	/* stop here; slot and num still point at this socket */
+next:
+			num++;
+		}
+	}
+
+out_unlock:
+	rcu_read_unlock();
+
+	cb->args[0] = slot;
+	cb->args[1] = num;
+}
+
+static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			      void *info)
+{	/* .idiag_get_info handler: fill the rqueue/wqueue fields from rmem/wmem usage; @info is unused. */
+	r->idiag_rqueue = sk_rmem_alloc_get(sk);
+	r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int raw_diag_destroy(struct sk_buff *in_skb,
+			    const struct inet_diag_req_v2 *r)
+{	/* .destroy handler: look up the socket named by @r and destroy it with ECONNABORTED. */
+	struct net *net = sock_net(in_skb->sk);
+	struct sock *sk;
+	int err;
+
+	sk = raw_sock_get(net, r);
+	if (IS_ERR(sk))
+		return PTR_ERR(sk);
+	err = sock_diag_destroy(sk, ECONNABORTED);
+	sock_put(sk);	/* drop the reference taken by raw_sock_get() */
+	return err;
+}
+#endif
+
+static const struct inet_diag_handler raw_diag_handler = {	/* inet_diag callbacks registered for IPPROTO_RAW */
+	.dump			= raw_diag_dump,
+	.dump_one		= raw_diag_dump_one,
+	.idiag_get_info		= raw_diag_get_info,
+	.idiag_type		= IPPROTO_RAW,
+	.idiag_info_size	= 0,
+#ifdef CONFIG_INET_DIAG_DESTROY
+	.destroy		= raw_diag_destroy,
+#endif
+};
+
+static void __always_unused __check_inet_diag_req_raw(void)
+{	/* Marked __always_unused: the body consists solely of compile-time BUILD_BUG_ON() checks. */
+	/*
+	 * Make sure the two structures are identical,
+	 * except the @pad field.
+	 */
+#define __offset_mismatch(m1, m2)			\
+	(offsetof(struct inet_diag_req_v2, m1) !=	\
+	 offsetof(struct inet_diag_req_raw, m2))
+
+	BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) !=
+		     sizeof(struct inet_diag_req_raw));
+	BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family));
+	BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol));
+	BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext));
+	BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol));	/* @pad and sdiag_raw_protocol must share an offset */
+	BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states));
+	BUILD_BUG_ON(__offset_mismatch(id, id));
+#undef __offset_mismatch
+}
+
+static int __init raw_diag_init(void)
+{	/* Module init: register raw_diag_handler with inet_diag. */
+	return inet_diag_register(&raw_diag_handler);
+}
+
+static void __exit raw_diag_exit(void)
+{	/* Module exit: unregister raw_diag_handler again. */
+	inet_diag_unregister(&raw_diag_handler);
+}
+
+module_init(raw_diag_init);	/* registers the diag handler */
+module_exit(raw_diag_exit);	/* unregisters the diag handler */
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
new file mode 100644
index 0000000000..3bad9aa066
--- /dev/null
+++ b/net/ipv4/route.c
@@ -0,0 +1,3789 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		ROUTE - implementation of the IP router.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
+ *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ *		Alan Cox	:	Verify area fixes.
+ *		Alan Cox	:	cli() protects routing changes
+ *		Rui Oliveira	:	ICMP routing table updates
+ *		(rco@di.uminho.pt)	Routing table insertion and update
+ *		Linus Torvalds	:	Rewrote bits to be sensible
+ *		Alan Cox	:	Added BSD route gw semantics
+ *		Alan Cox	:	Super /proc >4K
+ *		Alan Cox	:	MTU in route table
+ *		Alan Cox	:	MSS actually. Also added the window
+ *					clamper.
+ *		Sam Lantinga	:	Fixed route matching in rt_del()
+ *		Alan Cox	:	Routing cache support.
+ *		Alan Cox	:	Removed compatibility cruft.
+ *		Alan Cox	:	RTF_REJECT support.
+ *		Alan Cox	:	TCP irtt support.
+ *		Jonathan Naylor	:	Added Metric support.
+ *	Miquel van Smoorenburg	:	BSD API fixes.
+ *	Miquel van Smoorenburg	:	Metrics.
+ *		Alan Cox	:	Use __u32 properly
+ *		Alan Cox	:	Aligned routing errors more closely with BSD
+ *					our system is still very different.
+ *		Alan Cox	:	Faster /proc handling
+ *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
+ *					routing caches and better behaviour.
+ *
+ *		Olaf Erb	:	irtt wasn't being copied right.
+ *		Bjorn Ekwall	:	Kerneld route support.
+ *		Alan Cox	:	Multicast fixed (I hope)
+ *		Pavel Krauz	:	Limited broadcast fixed
+ *		Mike McLagan	:	Routing by source
+ *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
+ *					route.c and rewritten from scratch.
+ *		Andi Kleen	:	Load-limit warning messages.
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
+ *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
+ *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
+ *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
+ *		Marc Boucher	:	routing by fwmark
+ *	Robert Olsson		:	Added rt_cache statistics
+ *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
+ *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
+ *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
+ *	Ilia Sotnikov		:	Removed TOS from hash calculations
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/memblock.h>
+#include <linux/socket.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/pkt_sched.h>
+#include <linux/mroute.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <net/dst.h>
+#include <net/dst_metadata.h>
+#include <net/inet_dscp.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/nexthop.h>
+#include <net/tcp.h>
+#include <net/icmp.h>
+#include <net/xfrm.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/rtnetlink.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <net/secure_seq.h>
+#include <net/ip_tunnels.h>
+
+#include "fib_lookup.h"
+
+#define RT_FL_TOS(oldflp4) \
+	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
+
+#define RT_GC_TIMEOUT (300*HZ)	/* 300 seconds' worth of jiffies */
+
+#define DEFAULT_MIN_PMTU (512 + 20 + 20)	/* presumably 512 payload + two 20-byte headers */
+#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)	/* ten minutes' worth of jiffies */
+#define DEFAULT_MIN_ADVMSS 256
+static int ip_rt_max_size;
+static int ip_rt_redirect_number __read_mostly	= 9;	/* redirects sent to a peer before giving up */
+static int ip_rt_redirect_load __read_mostly	= HZ / 50;	/* base gap, shifted left per redirect sent */
+static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));	/* quiet time that resets the count */
+static int ip_rt_error_cost __read_mostly	= HZ;	/* tokens spent per ICMP error in ip_error() */
+static int ip_rt_error_burst __read_mostly	= 5 * HZ;	/* cap on accumulated tokens in ip_error() */
+
+static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;	/* lifetime given to redirect exceptions */
+
+/*
+ *	Interface to generic destination cache.
+ */
+
+INDIRECT_CALLABLE_SCOPE
+struct dst_entry	*ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
+INDIRECT_CALLABLE_SCOPE
+unsigned int		ipv4_mtu(const struct dst_entry *dst);
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
+static void		 ipv4_link_failure(struct sk_buff *skb);
+static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+					   struct sk_buff *skb, u32 mtu,
+					   bool confirm_neigh);
+static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
+					struct sk_buff *skb);
+static void		ipv4_dst_destroy(struct dst_entry *dst);
+
+static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
+{	/* dst_ops .cow_metrics: not supported here; both arguments are ignored. */
+	WARN_ON(1);	/* unconditional: reaching this callback is treated as a bug */
+	return NULL;
+}
+
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+					   struct sk_buff *skb,
+					   const void *daddr);
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
+
+static struct dst_ops ipv4_dst_ops = {	/* dst_ops callback table for AF_INET route entries */
+	.family =		AF_INET,
+	.check =		ipv4_dst_check,
+	.default_advmss =	ipv4_default_advmss,
+	.mtu =			ipv4_mtu,
+	.cow_metrics =		ipv4_cow_metrics,	/* stub that only warns */
+	.destroy =		ipv4_dst_destroy,
+	.negative_advice =	ipv4_negative_advice,
+	.link_failure =		ipv4_link_failure,
+	.update_pmtu =		ip_rt_update_pmtu,
+	.redirect =		ip_do_redirect,
+	.local_out =		__ip_local_out,
+	.neigh_lookup =		ipv4_neigh_lookup,
+	.confirm_neigh =	ipv4_confirm_neigh,
+};
+
+#define ECN_OR_COST(class)	TC_PRIO_##class	/* expands to the plain TC_PRIO_<class> value */
+
+const __u8 ip_tos2prio[16] = {	/* 16-entry priority table; each odd slot equals the even slot before it */
+	TC_PRIO_BESTEFFORT,
+	ECN_OR_COST(BESTEFFORT),
+	TC_PRIO_BESTEFFORT,
+	ECN_OR_COST(BESTEFFORT),
+	TC_PRIO_BULK,
+	ECN_OR_COST(BULK),
+	TC_PRIO_BULK,
+	ECN_OR_COST(BULK),
+	TC_PRIO_INTERACTIVE,
+	ECN_OR_COST(INTERACTIVE),
+	TC_PRIO_INTERACTIVE,
+	ECN_OR_COST(INTERACTIVE),
+	TC_PRIO_INTERACTIVE_BULK,
+	ECN_OR_COST(INTERACTIVE_BULK),
+	TC_PRIO_INTERACTIVE_BULK,
+	ECN_OR_COST(INTERACTIVE_BULK)
+};
+EXPORT_SYMBOL(ip_tos2prio);
+
+static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);	/* per-CPU route counters */
+#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)	/* bump @field on this CPU */
+
+#ifdef CONFIG_PROC_FS
+static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
+{	/* seq_file .start: yield the header token at position 0, nothing afterwards. */
+	if (*pos)
+		return NULL;
+	return SEQ_START_TOKEN;
+}
+
+static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{	/* seq_file .next: advance *pos and end the sequence; there are no data rows. */
+	++*pos;
+	return NULL;
+}
+
+static void rt_cache_seq_stop(struct seq_file *seq, void *v)
+{	/* seq_file .stop: nothing to release. */
+}
+
+static int rt_cache_seq_show(struct seq_file *seq, void *v)
+{	/* seq_file .show: print only the column header line; always returns 0. */
+	if (v == SEQ_START_TOKEN)
+		seq_printf(seq, "%-127s\n",	/* left-justified, padded to 127 characters */
+			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
+			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
+			   "HHUptod\tSpecDst");
+	return 0;
+}
+
+static const struct seq_operations rt_cache_seq_ops = {	/* iterator behind the "rt_cache" file in proc_net */
+	.start  = rt_cache_seq_start,
+	.next   = rt_cache_seq_next,
+	.stop   = rt_cache_seq_stop,
+	.show   = rt_cache_seq_show,
+};
+
+static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{	/* seq_file .start: position 0 is the header; position p > 0 starts at CPU p - 1. */
+	int cpu;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
+		if (!cpu_possible(cpu))
+			continue;	/* skip ids that are not possible CPUs */
+		*pos = cpu+1;		/* position that follows the CPU being returned */
+		return &per_cpu(rt_cache_stat, cpu);
+	}
+	return NULL;	/* no possible CPU at or after the requested position */
+}
+
+static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{	/* seq_file .next: return the counters of the next possible CPU, or NULL at the end. */
+	int cpu;
+
+	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
+		if (!cpu_possible(cpu))
+			continue;	/* skip ids that are not possible CPUs */
+		*pos = cpu+1;
+		return &per_cpu(rt_cache_stat, cpu);
+	}
+	(*pos)++;	/* still advance *pos when the iteration ends */
+	return NULL;
+
+}
+
+static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
+{	/* seq_file .stop: nothing to release. */
+
+}
+
+static int rt_cpu_seq_show(struct seq_file *seq, void *v)
+{	/* seq_file .show: header line for the start token, else one row of hex counters. */
+	struct rt_cache_stat *st = v;	/* only dereferenced when v is not the start token */
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "entries  in_hit   in_slow_tot in_slow_mc in_no_route in_brd   in_martian_dst in_martian_src out_hit  out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
+		return 0;
+	}
+
+	seq_printf(seq, "%08x %08x %08x    %08x   %08x    %08x %08x       "
+			"%08x       %08x %08x     %08x    %08x %08x   "
+			"%08x     %08x        %08x        %08x\n",
+		   dst_entries_get_slow(&ipv4_dst_ops),	/* "entries": same value on every row */
+		   0, /* st->in_hit */
+		   st->in_slow_tot,
+		   st->in_slow_mc,
+		   st->in_no_route,
+		   st->in_brd,
+		   st->in_martian_dst,
+		   st->in_martian_src,
+
+		   0, /* st->out_hit */
+		   st->out_slow_tot,
+		   st->out_slow_mc,
+
+		   0, /* st->gc_total */
+		   0, /* st->gc_ignored */
+		   0, /* st->gc_goal_miss */
+		   0, /* st->gc_dst_overflow */
+		   0, /* st->in_hlist_search */
+		   0  /* st->out_hlist_search */
+		);	/* literal 0 columns keep the row aligned with the header */
+	return 0;
+}
+
+static const struct seq_operations rt_cpu_seq_ops = {	/* iterator behind "rt_cache" in proc_net_stat */
+	.start  = rt_cpu_seq_start,
+	.next   = rt_cpu_seq_next,
+	.stop   = rt_cpu_seq_stop,
+	.show   = rt_cpu_seq_show,
+};
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+static int rt_acct_proc_show(struct seq_file *m, void *v)
+{	/* Show "rt_acct": sum the per-CPU accounting tables and emit the raw totals. */
+	struct ip_rt_acct *dst, *src;
+	unsigned int i, j;
+
+	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);	/* zeroed accumulator */
+	if (!dst)
+		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
+		for (j = 0; j < 256; j++) {	/* 256 entries per CPU table */
+			dst[j].o_bytes   += src[j].o_bytes;
+			dst[j].o_packets += src[j].o_packets;
+			dst[j].i_bytes   += src[j].i_bytes;
+			dst[j].i_packets += src[j].i_packets;
+		}
+	}
+
+	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));	/* binary structs, not text */
+	kfree(dst);
+	return 0;
+}
+#endif /* CONFIG_IP_ROUTE_CLASSID */
+
+static int __net_init ip_rt_do_proc_init(struct net *net)
+{	/* Per-netns init: create the route proc files; returns 0 or -ENOMEM. */
+	struct proc_dir_entry *pde;
+
+	pde = proc_create_seq("rt_cache", 0444, net->proc_net,
+			      &rt_cache_seq_ops);
+	if (!pde)
+		goto err1;
+
+	pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,	/* same name, stat directory */
+			      &rt_cpu_seq_ops);
+	if (!pde)
+		goto err2;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	pde = proc_create_single("rt_acct", 0, net->proc_net,
+			rt_acct_proc_show);
+	if (!pde)
+		goto err3;
+#endif
+	return 0;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+err3:	/* undo in reverse order of creation */
+	remove_proc_entry("rt_cache", net->proc_net_stat);
+#endif
+err2:
+	remove_proc_entry("rt_cache", net->proc_net);
+err1:
+	return -ENOMEM;
+}
+
+static void __net_exit ip_rt_do_proc_exit(struct net *net)
+{	/* Per-netns exit: remove every entry that ip_rt_do_proc_init() creates. */
+	remove_proc_entry("rt_cache", net->proc_net_stat);
+	remove_proc_entry("rt_cache", net->proc_net);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	remove_proc_entry("rt_acct", net->proc_net);
+#endif
+}
+
+static struct pernet_operations ip_rt_proc_ops __net_initdata =  {	/* per-netns proc hooks */
+	.init = ip_rt_do_proc_init,
+	.exit = ip_rt_do_proc_exit,
+};
+
+static int __init ip_rt_proc_init(void)
+{	/* Register the per-netns proc hooks; returns the registration result. */
+	return register_pernet_subsys(&ip_rt_proc_ops);
+}
+
+#else
+static inline int ip_rt_proc_init(void)
+{	/* Stub used when CONFIG_PROC_FS is off: nothing to register. */
+	return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+static inline bool rt_is_expired(const struct rtable *rth)
+{	/* True when the route's genid differs from the current IPv4 genid of its netns. */
+	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
+}
+
+void rt_cache_flush(struct net *net)
+{	/* Bump the IPv4 genid of @net, so rt_is_expired() reports older routes as expired. */
+	rt_genid_bump_ipv4(net);
+}
+
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+					   struct sk_buff *skb,
+					   const void *daddr)
+{	/* dst_ops .neigh_lookup: find the neighbour entry for this route's next hop. */
+	const struct rtable *rt = container_of(dst, struct rtable, dst);
+	struct net_device *dev = dst->dev;
+	struct neighbour *n;
+
+	rcu_read_lock();
+
+	if (likely(rt->rt_gw_family == AF_INET)) {
+		n = ip_neigh_gw4(dev, rt->rt_gw4);
+	} else if (rt->rt_gw_family == AF_INET6) {
+		n = ip_neigh_gw6(dev, &rt->rt_gw6);
+	} else {
+		__be32 pkey;	/* no gateway: key is the skb's daddr, or *@daddr without an skb */
+
+		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
+		n = ip_neigh_gw4(dev, pkey);
+	}
+
+	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
+		n = NULL;	/* refcount had already reached zero: report nothing */
+
+	rcu_read_unlock();
+
+	return n;	/* entry with a reference taken, NULL, or the helper's error pointer */
+}
+
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{	/* dst_ops .confirm_neigh: confirm the neighbour for the gateway, or for @daddr. */
+	const struct rtable *rt = container_of(dst, struct rtable, dst);
+	struct net_device *dev = dst->dev;
+	const __be32 *pkey = daddr;	/* default key; replaced by the IPv4 gateway below */
+
+	if (rt->rt_gw_family == AF_INET) {
+		pkey = (const __be32 *)&rt->rt_gw4;
+	} else if (rt->rt_gw_family == AF_INET6) {
+		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);	/* IPv6 gateway: IPv6 helper */
+	} else if (!daddr ||
+		 (rt->rt_flags &
+		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
+		return;	/* no key, or a multicast/broadcast/local route: nothing to confirm */
+	}
+	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
+}
+
+/* Hash tables of size 2048..262144 depending on RAM size.
+ * Each bucket uses 8 bytes (one ip_idents slot plus one ip_tstamps slot).
+ */
+static u32 ip_idents_mask __read_mostly;	/* ANDed with the hash to pick a bucket */
+static atomic_t *ip_idents __read_mostly;	/* per-bucket IP ID generators */
+static u32 *ip_tstamps __read_mostly;		/* per-bucket jiffies stamp of last use */
+
+/* In order to protect privacy, we add a perturbation to identifiers
+ * if one generator is seldom used. This makes it hard for an attacker
+ * to infer how many packets were sent between two points in time.
+ */
+static u32 ip_idents_reserve(u32 hash, int segs)
+{	/* Reserve @segs consecutive IDs in the bucket chosen by @hash; return the first. */
+	u32 bucket, old, now = (u32)jiffies;
+	atomic_t *p_id;
+	u32 *p_tstamp;
+	u32 delta = 0;
+
+	bucket = hash & ip_idents_mask;
+	p_tstamp = ip_tstamps + bucket;
+	p_id = ip_idents + bucket;
+	old = READ_ONCE(*p_tstamp);
+
+	if (old != now && cmpxchg(p_tstamp, old, now) == old)	/* only the cmpxchg winner perturbs */
+		delta = get_random_u32_below(now - old);	/* random skip below the idle jiffies */
+
+	/* If UBSAN reports an error here, first make sure your compiler
+	 * supports -fno-strict-overflow: such reports were traced to a
+	 * UBSAN bug that has been fixed in GCC-8.
+	 */
+	return atomic_add_return(segs + delta, p_id) - segs;
+}
+
+void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
+{	/* Set iph->id from a keyed hash of (daddr, saddr, protocol), reserving @segs IDs. */
+	u32 hash, id;
+
+	/* Note the following code is not safe, but this is okay. */
+	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))	/* key is filled in on first use */
+		get_random_bytes(&net->ipv4.ip_id_key,
+				 sizeof(net->ipv4.ip_id_key));
+
+	hash = siphash_3u32((__force u32)iph->daddr,
+			    (__force u32)iph->saddr,
+			    iph->protocol,
+			    &net->ipv4.ip_id_key);
+	id = ip_idents_reserve(hash, segs);
+	iph->id = htons(id);	/* htons() takes a 16-bit value: only the low 16 bits are stored */
+}
+EXPORT_SYMBOL(__ip_select_ident);
+
+static void ip_rt_fix_tos(struct flowi4 *fl4)
+{	/* Reduce flowi4_tos to its IPTOS_RT_MASK bits; RTO_ONLINK becomes link scope. */
+	__u8 tos = RT_FL_TOS(fl4);	/* keeps only the IPTOS_RT_MASK and RTO_ONLINK bits */
+
+	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
+	if (tos & RTO_ONLINK)
+		fl4->flowi4_scope = RT_SCOPE_LINK;
+}
+
+static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
+			     const struct sock *sk, const struct iphdr *iph,
+			     int oif, __u8 tos, u8 prot, u32 mark,
+			     int flow_flags)
+{	/* Fill @fl4 for an output lookup using the addresses in @iph; @sk may be NULL. */
+	__u8 scope = RT_SCOPE_UNIVERSE;
+
+	if (sk) {	/* socket settings override the oif/mark/tos/prot arguments */
+		oif = sk->sk_bound_dev_if;
+		mark = READ_ONCE(sk->sk_mark);
+		tos = ip_sock_rt_tos(sk);
+		scope = ip_sock_rt_scope(sk);
+		prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW :
+						    sk->sk_protocol;
+	}
+
+	flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope,
+			   prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
+			   sock_net_uid(net, sk));
+}
+
+static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
+			       const struct sock *sk)
+{	/* Build @fl4 from @skb's IP header and device; a non-NULL @sk overrides some fields. */
+	const struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph = ip_hdr(skb);
+	int oif = skb->dev->ifindex;
+	u8 prot = iph->protocol;
+	u32 mark = skb->mark;
+	__u8 tos = iph->tos;
+
+	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
+}
+
+static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
+{	/* Build @fl4 purely from the socket's own addresses and settings. */
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ip_options_rcu *inet_opt;
+	__be32 daddr = inet->inet_daddr;
+
+	rcu_read_lock();	/* inet_opt is RCU-protected */
+	inet_opt = rcu_dereference(inet->inet_opt);
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;	/* source routing: use the option's faddr instead */
+	flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
+			   ip_sock_rt_tos(sk) & IPTOS_RT_MASK,
+			   ip_sock_rt_scope(sk),
+			   inet_test_bit(HDRINCL, sk) ?
+				IPPROTO_RAW : sk->sk_protocol,
+			   inet_sk_flowi_flags(sk),
+			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
+	rcu_read_unlock();
+}
+
+static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+				 const struct sk_buff *skb)
+{	/* Build @fl4 from @skb when one is given, otherwise from @sk alone. */
+	if (skb)
+		build_skb_flow_key(fl4, skb, sk);
+	else
+		build_sk_flow_key(fl4, sk);	/* this path dereferences @sk unconditionally */
+}
+
+static DEFINE_SPINLOCK(fnhe_lock);	/* taken by update_or_create_fnhe() around exception updates */
+
+static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
+{	/* Detach and release the input and output routes cached on @fnhe, if any. */
+	struct rtable *rt;
+
+	rt = rcu_dereference(fnhe->fnhe_rth_input);
+	if (rt) {
+		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);	/* unhook before releasing */
+		dst_dev_put(&rt->dst);
+		dst_release(&rt->dst);
+	}
+	rt = rcu_dereference(fnhe->fnhe_rth_output);
+	if (rt) {
+		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);	/* unhook before releasing */
+		dst_dev_put(&rt->dst);
+		dst_release(&rt->dst);
+	}
+}
+
+static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
+{	/* Unlink and free the entry with the earliest fnhe_stamp in bucket @hash. */
+	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
+	struct fib_nh_exception *fnhe, *oldest = NULL;
+
+	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
+		fnhe = rcu_dereference_protected(*fnhe_p,
+						 lockdep_is_held(&fnhe_lock));	/* caller must hold fnhe_lock */
+		if (!fnhe)
+			break;
+		if (!oldest ||
+		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
+			oldest = fnhe;
+			oldest_p = fnhe_p;	/* the link that points at the oldest entry */
+		}
+	}
+	fnhe_flush_routes(oldest);	/* NOTE(review): oldest stays NULL on an empty chain; the caller shown only calls with entries present */
+	*oldest_p = oldest->fnhe_next;
+	kfree_rcu(oldest, rcu);	/* freed via RCU rather than immediately */
+}
+
+static u32 fnhe_hashfun(__be32 daddr)
+{	/* Map @daddr to an exception-table bucket index via a keyed siphash. */
+	static siphash_aligned_key_t fnhe_hash_key;
+	u64 hval;
+
+	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));	/* random key, set up once */
+	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
+	return hash_64(hval, FNHE_HASH_SHIFT);	/* reduced to FNHE_HASH_SHIFT bits */
+}
+
+static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
+{	/* Copy the exception's PMTU, lock flag, expiry and (if set) gateway into @rt. */
+	rt->rt_pmtu = fnhe->fnhe_pmtu;
+	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
+	rt->dst.expires = fnhe->fnhe_expires;
+
+	if (fnhe->fnhe_gw) {	/* a gateway recorded on the exception marks the route redirected */
+		rt->rt_flags |= RTCF_REDIRECTED;
+		rt->rt_uses_gateway = 1;
+		rt->rt_gw_family = AF_INET;
+		rt->rt_gw4 = fnhe->fnhe_gw;
+	}
+}
+
+static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
+				  __be32 gw, u32 pmtu, bool lock,
+				  unsigned long expires)
+{	/* Record a gateway (@gw) and/or PMTU (@pmtu, @lock) exception for @daddr on @nhc. */
+	struct fnhe_hash_bucket *hash;
+	struct fib_nh_exception *fnhe;
+	struct rtable *rt;
+	u32 genid, hval;
+	unsigned int i;
+	int depth;
+
+	genid = fnhe_genid(dev_net(nhc->nhc_dev));
+	hval = fnhe_hashfun(daddr);
+
+	spin_lock_bh(&fnhe_lock);
+
+	hash = rcu_dereference(nhc->nhc_exceptions);
+	if (!hash) {	/* first exception on this nexthop: allocate the bucket array */
+		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);	/* spinlock held */
+		if (!hash)
+			goto out_unlock;
+		rcu_assign_pointer(nhc->nhc_exceptions, hash);
+	}
+
+	hash += hval;	/* step to the bucket for this destination */
+
+	depth = 0;
+	for (fnhe = rcu_dereference(hash->chain); fnhe;
+	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
+		if (fnhe->fnhe_daddr == daddr)
+			break;
+		depth++;	/* ends as the chain length when no entry matches */
+	}
+
+	if (fnhe) {	/* existing entry: zero @gw / @pmtu leave the stored values untouched */
+		if (fnhe->fnhe_genid != genid)
+			fnhe->fnhe_genid = genid;
+		if (gw)
+			fnhe->fnhe_gw = gw;
+		if (pmtu) {
+			fnhe->fnhe_pmtu = pmtu;
+			fnhe->fnhe_mtu_locked = lock;
+		}
+		fnhe->fnhe_expires = max(1UL, expires);	/* never 0; presumably 0 means "no expiry" */
+		/* Update all cached dsts too */
+		rt = rcu_dereference(fnhe->fnhe_rth_input);
+		if (rt)
+			fill_route_from_fnhe(rt, fnhe);
+		rt = rcu_dereference(fnhe->fnhe_rth_output);
+		if (rt)
+			fill_route_from_fnhe(rt, fnhe);
+	} else {
+		/* Randomize max depth to avoid some side channels attacks. */
+		int max_depth = FNHE_RECLAIM_DEPTH +
+				get_random_u32_below(FNHE_RECLAIM_DEPTH);
+
+		while (depth > max_depth) {	/* evict oldest entries until short enough */
+			fnhe_remove_oldest(hash);
+			depth--;
+		}
+
+		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+		if (!fnhe)
+			goto out_unlock;
+
+		fnhe->fnhe_next = hash->chain;	/* new entry goes in front of the current head */
+
+		fnhe->fnhe_genid = genid;
+		fnhe->fnhe_daddr = daddr;
+		fnhe->fnhe_gw = gw;
+		fnhe->fnhe_pmtu = pmtu;
+		fnhe->fnhe_mtu_locked = lock;
+		fnhe->fnhe_expires = max(1UL, expires);
+
+		rcu_assign_pointer(hash->chain, fnhe);	/* published after the fields above are set */
+
+		/* Exception created; mark the cached routes for the nexthop
+		 * stale, so anyone caching it rechecks if this exception
+		 * applies to them.
+		 */
+		rt = rcu_dereference(nhc->nhc_rth_input);
+		if (rt)
+			rt->dst.obsolete = DST_OBSOLETE_KILL;
+
+		for_each_possible_cpu(i) {	/* output routes are cached per CPU */
+			struct rtable __rcu **prt;
+
+			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
+			rt = rcu_dereference(*prt);
+			if (rt)
+				rt->dst.obsolete = DST_OBSOLETE_KILL;
+		}
+	}
+
+	fnhe->fnhe_stamp = jiffies;	/* set on update and create; compared by fnhe_remove_oldest() */
+
+out_unlock:
+	spin_unlock_bh(&fnhe_lock);
+}
+
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
+			     bool kill_route)
+{	/* Validate the ICMP redirect in @skb and, if accepted, record the new gateway. */
+	__be32 new_gw = icmp_hdr(skb)->un.gateway;
+	__be32 old_gw = ip_hdr(skb)->saddr;	/* the sender of the redirect */
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
+	struct fib_result res;
+	struct neighbour *n;
+	struct net *net;
+
+	switch (icmp_hdr(skb)->code & 7) {	/* only the four codes below are handled */
+	case ICMP_REDIR_NET:
+	case ICMP_REDIR_NETTOS:
+	case ICMP_REDIR_HOST:
+	case ICMP_REDIR_HOSTTOS:
+		break;
+
+	default:
+		return;
+	}
+
+	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)	/* sender must be this route's gateway */
+		return;
+
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev)
+		return;
+
+	net = dev_net(dev);
+	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
+	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
+	    ipv4_is_zeronet(new_gw))
+		goto reject_redirect;
+
+	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
+		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
+			goto reject_redirect;
+		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
+			goto reject_redirect;
+	} else {
+		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
+			goto reject_redirect;
+	}
+
+	n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
+	if (!n)
+		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
+	if (!IS_ERR(n)) {
+		if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
+			neigh_event_send(n, NULL);	/* neighbour not valid yet: no exception recorded now */
+		} else {
+			if (fib_lookup(net, fl4, &res, 0) == 0) {
+				struct fib_nh_common *nhc;
+
+				fib_select_path(net, &res, fl4, skb);
+				nhc = FIB_RES_NHC(res);
+				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
+						0, false,
+						jiffies + ip_rt_gc_timeout);	/* gateway-only exception, pmtu 0 */
+			}
+			if (kill_route)
+				rt->dst.obsolete = DST_OBSOLETE_KILL;
+			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
+		}
+		neigh_release(n);
+	}
+	return;
+
+reject_redirect:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+	if (IN_DEV_LOG_MARTIANS(in_dev)) {
+		const struct iphdr *iph = (const struct iphdr *) skb->data;
+		__be32 daddr = iph->daddr;
+		__be32 saddr = iph->saddr;
+
+		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
+				     "  Advised path = %pI4 -> %pI4\n",
+				     &old_gw, dev->name, &new_gw,
+				     &saddr, &daddr);
+	}
+#endif
+	;	/* empty statement: the label still needs one when the #ifdef body is compiled out */
+}
+
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
+{	/* dst_ops .redirect: build the flow key and process the redirect, killing the route. */
+	struct rtable *rt;
+	struct flowi4 fl4;
+	const struct iphdr *iph = (const struct iphdr *) skb->data;	/* header at skb->data */
+	struct net *net = dev_net(skb->dev);
+	int oif = skb->dev->ifindex;
+	u8 prot = iph->protocol;
+	u32 mark = skb->mark;
+	__u8 tos = iph->tos;
+
+	rt = (struct rtable *) dst;
+
+	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
+	__ip_do_redirect(rt, skb, &fl4, true);	/* true: mark @rt obsolete if the redirect is accepted */
+}
+
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
+{	/* dst_ops .negative_advice: drop the route (return NULL) or keep it (return @dst). */
+	struct rtable *rt = (struct rtable *)dst;
+	struct dst_entry *ret = dst;
+
+	if (rt) {
+		if (dst->obsolete > 0) {
+			ip_rt_put(rt);	/* obsolete: release it and hand back NULL */
+			ret = NULL;
+		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
+			   rt->dst.expires) {
+			ip_rt_put(rt);	/* redirected or expiring route: same treatment */
+			ret = NULL;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Algorithm:
+ *	1. The first ip_rt_redirect_number redirects are sent
+ *	   with exponential backoff, then we stop sending them at all,
+ *	   assuming that the host ignores our redirects.
+ *	2. If we did not see packets requiring redirects
+ *	   during ip_rt_redirect_silence, we assume that the host
+ *	   forgot the redirected route and start sending redirects again.
+ *
+ * This algorithm is much cheaper and more intelligent than dumb load limiting
+ * in icmp.c.
+ *
+ * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
+ * and "frag. need" (breaks PMTU discovery) in icmp.c.
+ */
+
+void ip_rt_send_redirect(struct sk_buff *skb)
+{	/* Send an ICMP host redirect for @skb, rate limited per source address. */
+	struct rtable *rt = skb_rtable(skb);
+	struct in_device *in_dev;
+	struct inet_peer *peer;
+	struct net *net;
+	int log_martians;
+	int vif;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(rt->dst.dev);
+	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
+		rcu_read_unlock();
+		return;
+	}
+	log_martians = IN_DEV_LOG_MARTIANS(in_dev);	/* sampled under RCU for use below */
+	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
+	rcu_read_unlock();
+
+	net = dev_net(rt->dst.dev);
+	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
+	if (!peer) {	/* no peer entry to rate limit with: send unconditionally */
+		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
+			  rt_nexthop(rt, ip_hdr(skb)->daddr));
+		return;
+	}
+
+	/* No redirected packets during ip_rt_redirect_silence;
+	 * reset the algorithm.
+	 */
+	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
+		peer->rate_tokens = 0;
+		peer->n_redirects = 0;
+	}
+
+	/* Too many ignored redirects; do not send anything, just set
+	 * peer->rate_last to the time of the last seen redirected packet.
+	 */
+	if (peer->n_redirects >= ip_rt_redirect_number) {
+		peer->rate_last = jiffies;
+		goto out_put_peer;
+	}
+
+	/* Check for load limit; set rate_last to the latest sent
+	 * redirect.
+	 */
+	if (peer->n_redirects == 0 ||
+	    time_after(jiffies,
+		       (peer->rate_last +
+			(ip_rt_redirect_load << peer->n_redirects)))) {	/* NOTE(review): shift count is below ip_rt_redirect_number; confirm that stays under the int width */
+		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
+
+		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
+		peer->rate_last = jiffies;
+		++peer->n_redirects;
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+		if (log_martians &&
+		    peer->n_redirects == ip_rt_redirect_number)	/* warn once, on reaching the limit */
+			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
+					     &ip_hdr(skb)->saddr, inet_iif(skb),
+					     &ip_hdr(skb)->daddr, &gw);
+#endif
+	}
+out_put_peer:
+	inet_putpeer(peer);
+}
+
+static int ip_error(struct sk_buff *skb)
+{	/* Turn the route's dst.error into an ICMP unreachable (rate limited), then free @skb. */
+	struct rtable *rt = skb_rtable(skb);
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
+	struct inet_peer *peer;
+	unsigned long now;
+	struct net *net;
+	SKB_DR(reason);
+	bool send;
+	int code;
+
+	if (netif_is_l3_master(skb->dev)) {	/* L3 master device: use the device named by IPCB iif */
+		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
+		if (!dev)
+			goto out;
+	}
+
+	in_dev = __in_dev_get_rcu(dev);
+
+	/* IP on this device is disabled. */
+	if (!in_dev)
+		goto out;
+
+	net = dev_net(rt->dst.dev);
+	if (!IN_DEV_FORWARD(in_dev)) {	/* forwarding off: count and drop, no ICMP sent */
+		switch (rt->dst.error) {
+		case EHOSTUNREACH:
+			SKB_DR_SET(reason, IP_INADDRERRORS);
+			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
+			break;
+
+		case ENETUNREACH:
+			SKB_DR_SET(reason, IP_INNOROUTES);
+			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
+			break;
+		}
+		goto out;
+	}
+
+	switch (rt->dst.error) {	/* map the error to an ICMP unreachable code */
+	case EINVAL:
+	default:
+		goto out;
+	case EHOSTUNREACH:
+		code = ICMP_HOST_UNREACH;
+		break;
+	case ENETUNREACH:
+		code = ICMP_NET_UNREACH;
+		SKB_DR_SET(reason, IP_INNOROUTES);
+		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
+		break;
+	case EACCES:
+		code = ICMP_PKT_FILTERED;
+		break;
+	}
+
+	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
+			       l3mdev_master_ifindex(skb->dev), 1);
+
+	send = true;	/* without a peer entry the ICMP is sent unthrottled */
+	if (peer) {
+		now = jiffies;
+		peer->rate_tokens += now - peer->rate_last;	/* one token per elapsed jiffy */
+		if (peer->rate_tokens > ip_rt_error_burst)
+			peer->rate_tokens = ip_rt_error_burst;	/* cap the accumulated tokens */
+		peer->rate_last = now;
+		if (peer->rate_tokens >= ip_rt_error_cost)
+			peer->rate_tokens -= ip_rt_error_cost;	/* pay for this ICMP */
+		else
+			send = false;
+		inet_putpeer(peer);
+	}
+	if (send)
+		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+
+out:	kfree_skb_reason(skb, reason);	/* every path frees the skb here */
+	return 0;
+}
+
+static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+{	/* Record a learned path MTU of @mtu for fl4->daddr as a nexthop exception. */
+	struct dst_entry *dst = &rt->dst;
+	struct net *net = dev_net(dst->dev);
+	struct fib_result res;
+	bool lock = false;
+	u32 old_mtu;
+
+	if (ip_mtu_locked(dst))
+		return;
+
+	old_mtu = ipv4_mtu(dst);
+	if (old_mtu < mtu)	/* never raise the MTU from here */
+		return;
+
+	if (mtu < net->ipv4.ip_rt_min_pmtu) {	/* below the floor: clamp and lock the value */
+		lock = true;
+		mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
+	}
+
+	if (rt->rt_pmtu == mtu && !lock &&
+	    time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))	/* unchanged, over half the lifetime left */
+		return;
+
+	rcu_read_lock();
+	if (fib_lookup(net, fl4, &res, 0) == 0) {
+		struct fib_nh_common *nhc;
+
+		fib_select_path(net, &res, fl4, NULL);
+		nhc = FIB_RES_NHC(res);
+		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+				      jiffies + net->ipv4.ip_rt_mtu_expires);	/* gw 0: PMTU-only exception */
+	}
+	rcu_read_unlock();
+}
+
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			      struct sk_buff *skb, u32 mtu,
+			      bool confirm_neigh)
+{	/* dst_ops .update_pmtu: build the flow key and record @mtu; @confirm_neigh is unused. */
+	struct rtable *rt = (struct rtable *) dst;
+	struct flowi4 fl4;
+
+	ip_rt_build_flow_key(&fl4, sk, skb);
+
+	/* Don't make lookup fail for bridged encapsulations */
+	if (skb && netif_is_any_bridge_port(skb->dev))
+		fl4.flowi4_oif = 0;
+
+	__ip_rt_update_pmtu(rt, &fl4, mtu);
+}
+
+void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
+		      int oif, u8 protocol)
+{	/* Socket-less PMTU update: route on the IP header found at skb->data. */
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct flowi4 fl4;
+	struct rtable *rt;
+	u32 mark = IP4_REPLY_MARK(net, skb->mark);
+
+	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
+			 0);
+	rt = __ip_route_output_key(net, &fl4);
+	if (!IS_ERR(rt)) {	/* a failed route lookup is silently ignored */
+		__ip_rt_update_pmtu(rt, &fl4, mtu);
+		ip_rt_put(rt);
+	}
+}
+EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
+
+static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
+{	/* PMTU update via a fresh route lookup; the socket's cached dst is not touched. */
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct flowi4 fl4;
+	struct rtable *rt;
+
+	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);	/* sk is set, so the zeros are overridden */
+
+	if (!fl4.flowi4_mark)	/* socket has no mark: fall back to the reply mark */
+		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
+
+	rt = __ip_route_output_key(sock_net(sk), &fl4);
+	if (!IS_ERR(rt)) {
+		__ip_rt_update_pmtu(rt, &fl4, mtu);
+		ip_rt_put(rt);
+	}
+}
+
+void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
+{	/* PMTU update for @sk, refreshing the socket's cached dst when it is stale. */
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct flowi4 fl4;
+	struct rtable *rt;
+	struct dst_entry *odst = NULL;
+	bool new = false;	/* true once rt comes from a fresh lookup, not the socket cache */
+	struct net *net = sock_net(sk);
+
+	bh_lock_sock(sk);
+
+	if (!ip_sk_accept_pmtu(sk))
+		goto out;
+
+	odst = sk_dst_get(sk);
+
+	if (sock_owned_by_user(sk) || !odst) {	/* socket busy or nothing cached: lookup-only path */
+		__ipv4_sk_update_pmtu(skb, sk, mtu);
+		goto out;
+	}
+
+	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
+
+	rt = (struct rtable *)odst;
+	if (odst->obsolete && !odst->ops->check(odst, 0)) {	/* cached dst failed its check */
+		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+		if (IS_ERR(rt))
+			goto out;
+
+		new = true;
+	}
+
+	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
+
+	if (!dst_check(&rt->dst, 0)) {	/* route no longer passes dst_check(): look it up again */
+		if (new)
+			dst_release(&rt->dst);	/* drop the route obtained by the first lookup */
+
+		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+		if (IS_ERR(rt))
+			goto out;
+
+		new = true;
+	}
+
+	if (new)
+		sk_dst_set(sk, &rt->dst);	/* install the fresh route on the socket */
+
+out:
+	bh_unlock_sock(sk);
+	dst_release(odst);	/* odst is still NULL on the earliest exit path */
+}
+EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
+
+void ipv4_redirect(struct sk_buff *skb, struct net *net,
+		   int oif, u8 protocol)
+{	/* Socket-less redirect handling: route on the IP header found at skb->data. */
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct flowi4 fl4;
+	struct rtable *rt;
+
+	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
+	rt = __ip_route_output_key(net, &fl4);
+	if (!IS_ERR(rt)) {
+		__ip_do_redirect(rt, skb, &fl4, false);	/* false: @rt is not marked obsolete */
+		ip_rt_put(rt);
+	}
+}
+EXPORT_SYMBOL_GPL(ipv4_redirect);
+
+void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
+{	/* Redirect handling with the flow key taken from @sk's settings. */
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct flowi4 fl4;
+	struct rtable *rt;
+	struct net *net = sock_net(sk);
+
+	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);	/* sk is set, so the zeros are overridden */
+	rt = __ip_route_output_key(net, &fl4);
+	if (!IS_ERR(rt)) {
+		__ip_do_redirect(rt, skb, &fl4, false);	/* false: @rt is not marked obsolete */
+		ip_rt_put(rt);
+	}
+}
+EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
+
+INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
+							 u32 cookie)
+{	/* dst_ops .check: return @dst while it is still valid, else NULL; @cookie is unused. */
+	struct rtable *rt = (struct rtable *) dst;
+
+	/* All IPV4 dsts are created with ->obsolete set to the value
+	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
+	 * into this function always.
+	 *
+	 * When a PMTU/redirect information update invalidates a route,
+	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
+	 * DST_OBSOLETE_DEAD.
+	 */
+	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))	/* killed, or genid moved on */
+		return NULL;
+	return dst;
+}
+EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
+
+static void ipv4_send_dest_unreach(struct sk_buff *skb)
+{
+	struct net_device *dev;
+	struct ip_options opt;
+	int res;
+
+	/* Send an ICMP host-unreachable for @skb.  IPCB may not be valid
+	 * anymore, so recompile the IP options after a header sanity check.
+	 */
+	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
+	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
+		return;
+
+	memset(&opt, 0, sizeof(opt));
+	if (ip_hdr(skb)->ihl > 5) {
+		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
+			return;
+		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
+		/* Without skb->dev, use the device of the skb's route. */
+		rcu_read_lock();
+		dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
+		res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
+		rcu_read_unlock();
+		/* Non-zero result: the options are rejected, send nothing. */
+		if (res)
+			return;
+	}
+	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
+}
+
+/* On link failure: signal host-unreachable, then set the skb route's expiry. */
+static void ipv4_link_failure(struct sk_buff *skb)
+{
+	struct rtable *route;
+
+	ipv4_send_dest_unreach(skb);
+	route = skb_rtable(skb);
+	if (route)
+		dst_set_expires(&route->dst, 0);
+}
+/* Output handler set on input routes: drop the skb and warn if ever called. */
+static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	pr_debug("%s: %pI4 -> %pI4, %s\n",
+		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+		 skb->dev ? skb->dev->name : "?");
+	kfree_skb(skb);
+	WARN_ON(1);
+	return 0;
+}
+
+/*
+ * We do not cache the source address of the outgoing interface,
+ * because it is used only by the IP RR, TS and SRR options,
+ * so it is off the fast path.
+ *
+ * Remember: "addr" is allowed to be unaligned inside the IP
+ * options, hence the memcpy() rather than a direct store.
+ */
+
+void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
+{
+	__be32 src;
+
+	if (rt_is_output_route(rt))
+		src = ip_hdr(skb)->saddr;
+	else {
+		struct fib_result res;
+		struct iphdr *iph = ip_hdr(skb);
+		struct flowi4 fl4 = {
+			.daddr = iph->daddr,
+			.saddr = iph->saddr,
+			.flowi4_tos = RT_TOS(iph->tos),
+			.flowi4_oif = rt->dst.dev->ifindex,
+			.flowi4_iif = skb->dev->ifindex,
+			.flowi4_mark = skb->mark,
+		};
+		/* Prefer the FIB result's prefsrc; else select an address. */
+		rcu_read_lock();
+		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
+			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
+		else
+			src = inet_select_addr(rt->dst.dev,
+					       rt_nexthop(rt, iph->daddr),
+					       RT_SCOPE_UNIVERSE);
+		rcu_read_unlock();
+	}
+	memcpy(addr, &src, 4);
+}
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+/* Fill in from @tag each 16-bit half of rt->dst.tclassid that is still 0. */
+static void set_class_tag(struct rtable *rt, u32 tag)
+{
+	rt->dst.tclassid |= (rt->dst.tclassid & 0xFFFF) ? 0 : (tag & 0xFFFF);
+	rt->dst.tclassid |= (rt->dst.tclassid & 0xFFFF0000) ?
+			    0 : (tag & 0xFFFF0000);
+}
+#endif
+
+/* Default advertised MSS: route MTU minus TCP/IP headers, clamped to range. */
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
+{
+	unsigned int hdrs = sizeof(struct tcphdr) + sizeof(struct iphdr);
+	unsigned int mss = ipv4_mtu(dst) - hdrs;
+
+	mss = max_t(u32, mss, dev_net(dst->dev)->ipv4.ip_rt_min_advmss);
+	return min(mss, IPV4_MAX_PMTU - hdrs);
+}
+/* MTU of @dst from ip_dst_mtu_maybe_forward(), second argument false. */
+INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
+{
+	return ip_dst_mtu_maybe_forward(dst, false);
+}
+EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
+/* Unlink and free (via RCU) the nexthop exception for @daddr, if present. */
+static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
+{
+	struct fnhe_hash_bucket *hash;
+	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
+	u32 hval = fnhe_hashfun(daddr);
+
+	spin_lock_bh(&fnhe_lock);
+
+	hash = rcu_dereference_protected(nhc->nhc_exceptions,
+					 lockdep_is_held(&fnhe_lock));
+	hash += hval;
+	/* Walk the chain, keeping fnhe_p on the link that points at fnhe. */
+	fnhe_p = &hash->chain;
+	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
+	while (fnhe) {
+		if (fnhe->fnhe_daddr == daddr) {
+			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
+				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+			/* set fnhe_daddr to 0 to ensure it won't bind with
+			 * new dsts in rt_bind_exception().
+			 */
+			fnhe->fnhe_daddr = 0;
+			fnhe_flush_routes(fnhe);
+			kfree_rcu(fnhe, rcu);
+			break;
+		}
+		fnhe_p = &fnhe->fnhe_next;
+		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
+						 lockdep_is_held(&fnhe_lock));
+	}
+
+	spin_unlock_bh(&fnhe_lock);
+}
+/* Find the exception for @daddr; an expired match is deleted instead. */
+static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
+					       __be32 daddr)
+{
+	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
+	struct fib_nh_exception *fnhe;
+	u32 hval;
+
+	if (!hash)
+		return NULL;
+
+	hval = fnhe_hashfun(daddr);
+	/* rcu_dereference() walk: presumably called under rcu_read_lock(). */
+	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
+	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
+		if (fnhe->fnhe_daddr == daddr) {
+			if (fnhe->fnhe_expires &&
+			    time_after(jiffies, fnhe->fnhe_expires)) {
+				ip_del_fnhe(nhc, daddr);
+				break;
+			}
+			return fnhe;
+		}
+	}
+	return NULL;
+}
+
+/* MTU selection, in priority order:
+ * 1. the route's own MTU, if it is locked or ip_fwd_use_pmtu is set
+ * 2. the PMTU of an unexpired nexthop exception for @daddr
+ * 3. the egress device MTU, capped at IP_MAX_MTU
+ */
+
+u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
+{
+	struct fib_nh_common *nhc = res->nhc;
+	struct net_device *dev = nhc->nhc_dev;
+	struct fib_info *fi = res->fi;
+	u32 mtu = 0;
+
+	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
+	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
+		mtu = fi->fib_mtu;
+
+	if (likely(!mtu)) {
+		struct fib_nh_exception *fnhe;
+
+		fnhe = find_exception(nhc, daddr);
+		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
+			mtu = fnhe->fnhe_pmtu;
+	}
+
+	if (likely(!mtu))
+		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
+	/* Subtract the lwtunnel headroom needed by the nexthop's encap state. */
+	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
+}
+/* Bind @rt to @fnhe and optionally cache it there; true if it was cached. */
+static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
+			      __be32 daddr, const bool do_cache)
+{
+	bool ret = false;
+
+	spin_lock_bh(&fnhe_lock);
+	/* A deleted exception has fnhe_daddr zeroed and is skipped here. */
+	if (daddr == fnhe->fnhe_daddr) {
+		struct rtable __rcu **porig;
+		struct rtable *orig;
+		int genid = fnhe_genid(dev_net(rt->dst.dev));
+
+		if (rt_is_input_route(rt))
+			porig = &fnhe->fnhe_rth_input;
+		else
+			porig = &fnhe->fnhe_rth_output;
+		orig = rcu_dereference(*porig);
+		/* Stale generation: reset the exception and flush its routes. */
+		if (fnhe->fnhe_genid != genid) {
+			fnhe->fnhe_genid = genid;
+			fnhe->fnhe_gw = 0;
+			fnhe->fnhe_pmtu = 0;
+			fnhe->fnhe_expires = 0;
+			fnhe->fnhe_mtu_locked = false;
+			fnhe_flush_routes(fnhe);
+			orig = NULL;
+		}
+		fill_route_from_fnhe(rt, fnhe);
+		if (!rt->rt_gw4) {
+			rt->rt_gw4 = daddr;
+			rt->rt_gw_family = AF_INET;
+		}
+		/* Cache @rt, dropping the route it replaces (if any). */
+		if (do_cache) {
+			dst_hold(&rt->dst);
+			rcu_assign_pointer(*porig, rt);
+			if (orig) {
+				dst_dev_put(&orig->dst);
+				dst_release(&orig->dst);
+			}
+			ret = true;
+		}
+
+		fnhe->fnhe_stamp = jiffies;
+	}
+	spin_unlock_bh(&fnhe_lock);
+
+	return ret;
+}
+/* Try to install @rt in the nexthop's input or per-CPU output cache slot. */
+static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
+{
+	struct rtable *orig, *prev, **p;
+	bool ret = true;
+
+	if (rt_is_input_route(rt)) {
+		p = (struct rtable **)&nhc->nhc_rth_input;
+	} else {
+		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
+	}
+	orig = *p;
+
+	/* hold dst before doing cmpxchg() to avoid race condition
+	 * on this dst
+	 */
+	dst_hold(&rt->dst);
+	prev = cmpxchg(p, orig, rt);
+	if (prev == orig) {
+		if (orig) {
+			rt_add_uncached_list(orig);
+			dst_release(&orig->dst);
+		}
+	} else {
+		dst_release(&rt->dst);
+		ret = false;
+	}
+	/* false: the slot changed under us and @rt was not cached. */
+	return ret;
+}
+
+struct uncached_list {
+	spinlock_t		lock;		/* guards head and quarantine */
+	struct list_head	head;		/* see rt_add_uncached_list() */
+	struct list_head	quarantine;	/* filled by rt_flush_dev() */
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
+/* Track @rt on the current CPU's uncached list. */
+void rt_add_uncached_list(struct rtable *rt)
+{
+	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
+
+	rt->dst.rt_uncached_list = ul;
+	/* rt_del_uncached_list() relies on the back-pointer stored above. */
+	spin_lock_bh(&ul->lock);
+	list_add_tail(&rt->dst.rt_uncached, &ul->head);
+	spin_unlock_bh(&ul->lock);
+}
+/* Remove @rt from the uncached list it was added to, if it is on one. */
+void rt_del_uncached_list(struct rtable *rt)
+{
+	if (!list_empty(&rt->dst.rt_uncached)) {
+		struct uncached_list *ul = rt->dst.rt_uncached_list;
+
+		spin_lock_bh(&ul->lock);
+		list_del_init(&rt->dst.rt_uncached);
+		spin_unlock_bh(&ul->lock);
+	}
+}
+/* Tear down @dst: put its metrics and unlink it from any uncached list. */
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+	struct rtable *rt = (struct rtable *)dst;
+
+	ip_dst_metrics_put(dst);
+	rt_del_uncached_list(rt);
+}
+/* Detach every uncached route from @dev by pointing it at blackhole_netdev. */
+void rt_flush_dev(struct net_device *dev)
+{
+	struct rtable *rt, *safe;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+		/* Unlocked peek: skip CPUs whose list is empty. */
+		if (list_empty(&ul->head))
+			continue;
+		/* Move @dev's routes to blackhole_netdev and quarantine them. */
+		spin_lock_bh(&ul->lock);
+		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
+			if (rt->dst.dev != dev)
+				continue;
+			rt->dst.dev = blackhole_netdev;
+			netdev_ref_replace(dev, blackhole_netdev,
+					   &rt->dst.dev_tracker, GFP_ATOMIC);
+			list_move(&rt->dst.rt_uncached, &ul->quarantine);
+		}
+		spin_unlock_bh(&ul->lock);
+	}
+}
+
+static bool rt_cache_valid(const struct rtable *rt)
+{
+	if (!rt || rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK)
+		return false;	/* nothing cached, or the entry was invalidated */
+	return !rt_is_expired(rt);
+}
+/* Fill @rt from the FIB result and cache it, or track it as uncached. */
+static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
+			   const struct fib_result *res,
+			   struct fib_nh_exception *fnhe,
+			   struct fib_info *fi, u16 type, u32 itag,
+			   const bool do_cache)
+{
+	bool cached = false;
+
+	if (fi) {
+		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+		/* Only a link-scope nexthop with a gateway sets the gw. */
+		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
+			rt->rt_uses_gateway = 1;
+			rt->rt_gw_family = nhc->nhc_gw_family;
+			/* only INET and INET6 are supported */
+			if (likely(nhc->nhc_gw_family == AF_INET))
+				rt->rt_gw4 = nhc->nhc_gw.ipv4;
+			else
+				rt->rt_gw6 = nhc->nhc_gw.ipv6;
+		}
+		/* Take metrics, class id and lwtunnel state from the FIB. */
+		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		if (nhc->nhc_family == AF_INET) {
+			struct fib_nh *nh;
+
+			nh = container_of(nhc, struct fib_nh, nh_common);
+			rt->dst.tclassid = nh->nh_tclassid;
+		}
+#endif
+		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
+		if (unlikely(fnhe))
+			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
+		else if (do_cache)
+			cached = rt_cache_route(nhc, rt);
+		if (unlikely(!cached)) {
+			/* The route was not stored in a nexthop exception or
+			 * in the FIB nexthop cache (or caching was not
+			 * requested): give a gateway-less route @daddr as
+			 * its next hop and track it on the uncached list.
+			 */
+			if (!rt->rt_gw4) {
+				rt->rt_gw_family = AF_INET;
+				rt->rt_gw4 = daddr;
+			}
+			rt_add_uncached_list(rt);
+		}
+	} else
+		rt_add_uncached_list(rt);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	set_class_tag(rt, res->tclassid);
+#endif
+	set_class_tag(rt, itag);
+#endif
+}
+/* Allocate an rtable on @dev with default field values; NULL on failure. */
+struct rtable *rt_dst_alloc(struct net_device *dev,
+			    unsigned int flags, u16 type,
+			    bool noxfrm)
+{
+	struct rtable *rt;
+
+	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+		       (noxfrm ? DST_NOXFRM : 0));
+
+	if (rt) {
+		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
+		rt->rt_flags = flags;
+		rt->rt_type = type;
+		rt->rt_is_input = 0;
+		rt->rt_iif = 0;
+		rt->rt_pmtu = 0;
+		rt->rt_mtu_locked = 0;
+		rt->rt_uses_gateway = 0;
+		rt->rt_gw_family = 0;
+		rt->rt_gw4 = 0;
+		/* Output via ip_output(); local routes get local delivery. */
+		rt->dst.output = ip_output;
+		if (flags & RTCF_LOCAL)
+			rt->dst.input = ip_local_deliver;
+	}
+
+	return rt;
+}
+EXPORT_SYMBOL(rt_dst_alloc);
+/* Allocate a new rtable on @dev copying @rt's fields; NULL on failure. */
+struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
+{
+	struct rtable *new_rt;
+
+	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+			   rt->dst.flags);
+
+	if (new_rt) {
+		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
+		new_rt->rt_flags = rt->rt_flags;
+		new_rt->rt_type = rt->rt_type;
+		new_rt->rt_is_input = rt->rt_is_input;
+		new_rt->rt_iif = rt->rt_iif;
+		new_rt->rt_pmtu = rt->rt_pmtu;
+		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
+		new_rt->rt_gw_family = rt->rt_gw_family;
+		if (rt->rt_gw_family == AF_INET)
+			new_rt->rt_gw4 = rt->rt_gw4;
+		else if (rt->rt_gw_family == AF_INET6)
+			new_rt->rt_gw6 = rt->rt_gw6;
+		/* NOTE(review): rt_uses_gateway is not copied; confirm. */
+		new_rt->dst.input = rt->dst.input;
+		new_rt->dst.output = rt->dst.output;
+		new_rt->dst.error = rt->dst.error;
+		new_rt->dst.lastuse = jiffies;
+		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
+	}
+	return new_rt;
+}
+EXPORT_SYMBOL(rt_dst_clone);
+
+/* Validate the source address of a multicast packet received on @dev.
+ * Returns 0 if acceptable, else a negative errno.  Called under rcu_read_lock().
+ */
+int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			  u8 tos, struct net_device *dev,
+			  struct in_device *in_dev, u32 *itag)
+{
+	int rc;
+
+	/* Primary sanity checks. */
+	if (!in_dev || ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+	    skb->protocol != htons(ETH_P_IP))
+		return -EINVAL;
+
+	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
+		return -EINVAL;
+
+	if (!ipv4_is_zeronet(saddr)) {
+		rc = fib_validate_source(skb, saddr, 0, tos, 0, dev,
+					 in_dev, itag);
+		return rc < 0 ? rc : 0;
+	}
+
+	/* A zeronet source is tolerated only for local multicast or IGMP. */
+	if (!ipv4_is_local_multicast(daddr) &&
+	    ip_hdr(skb)->protocol != IPPROTO_IGMP)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* Route an incoming multicast packet; called in rcu_read_lock() section */
+static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			     u8 tos, struct net_device *dev, int our)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	unsigned int flags = RTCF_MULTICAST;
+	struct rtable *rth;
+	u32 itag = 0;
+	int err;
+
+	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
+	if (err)
+		return err;
+
+	if (our)
+		flags |= RTCF_LOCAL;
+
+	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
+		IPCB(skb)->flags |= IPSKB_NOPOLICY;
+	/* The route is allocated on the netns loopback device. */
+	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
+			   false);
+	if (!rth)
+		return -ENOBUFS;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	rth->dst.tclassid = itag;
+#endif
+	rth->dst.output = ip_rt_bug;
+	rth->rt_is_input= 1;
+
+#ifdef CONFIG_IP_MROUTE
+	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
+		rth->dst.input = ip_mr_input;
+#endif
+	RT_CACHE_STAT_INC(in_slow_mc);
+	/* Replace whatever dst the skb carried with the new route. */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rth->dst);
+	return 0;
+}
+
+/* Count a martian source and, if enabled, log it with the link-layer header. */
+static void ip_handle_martian_source(struct net_device *dev,
+				     struct in_device *in_dev,
+				     struct sk_buff *skb,
+				     __be32 daddr,
+				     __be32 saddr)
+{
+	RT_CACHE_STAT_INC(in_martian_src);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
+		/* RFC1812 recommendation: if the source is martian, the only
+		 * hint is the MAC header.  NOTE(review): daddr is printed as
+		 * the "source" below; confirm the argument order is intended.
+		 */
+		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
+			&daddr, &saddr, dev->name);
+		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
+			print_hex_dump(KERN_WARNING, "ll header: ",
+				       DUMP_PREFIX_OFFSET, 16, 1,
+				       skb_mac_header(skb),
+				       dev->hard_header_len, false);
+		}
+	}
+#endif
+}
+
+/* Set up the dst for a forwarded packet; called in rcu_read_lock() section */
+static int __mkroute_input(struct sk_buff *skb,
+			   const struct fib_result *res,
+			   struct in_device *in_dev,
+			   __be32 daddr, __be32 saddr, u32 tos)
+{
+	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+	struct net_device *dev = nhc->nhc_dev;
+	struct fib_nh_exception *fnhe;
+	struct rtable *rth;
+	int err;
+	struct in_device *out_dev;
+	bool do_cache;
+	u32 itag = 0;
+
+	/* get a working reference to the output device */
+	out_dev = __in_dev_get_rcu(dev);
+	if (!out_dev) {
+		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
+		return -EINVAL;
+	}
+
+	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
+				  in_dev->dev, in_dev, &itag);
+	if (err < 0) {
+		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
+					 saddr);
+
+		goto cleanup;
+	}
+	/* Cache only routes that have a fib_info and no validation tag. */
+	do_cache = res->fi && !itag;
+	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
+	    skb->protocol == htons(ETH_P_IP)) {
+		__be32 gw;
+
+		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
+		if (IN_DEV_SHARED_MEDIA(out_dev) ||
+		    inet_addr_onlink(out_dev, saddr, gw))
+			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
+	}
+
+	if (skb->protocol != htons(ETH_P_IP)) {
+		/* Not IP (i.e. ARP). Do not create route, if it is
+		 * invalid for proxy arp. DNAT routes are always valid.
+		 *
+		 * Proxy arp feature have been extended to allow, ARP
+		 * replies back to the same interface, to support
+		 * Private VLAN switch technologies. See arp.c.
+		 */
+		if (out_dev == in_dev &&
+		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
+			err = -EINVAL;
+			goto cleanup;
+		}
+	}
+
+	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
+		IPCB(skb)->flags |= IPSKB_NOPOLICY;
+	/* Reuse a still-valid cached input route (exception or nexthop). */
+	fnhe = find_exception(nhc, daddr);
+	if (do_cache) {
+		if (fnhe)
+			rth = rcu_dereference(fnhe->fnhe_rth_input);
+		else
+			rth = rcu_dereference(nhc->nhc_rth_input);
+		if (rt_cache_valid(rth)) {
+			skb_dst_set_noref(skb, &rth->dst);
+			goto out;
+		}
+	}
+	/* Nothing usable is cached: build a fresh forwarding route. */
+	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
+			   IN_DEV_ORCONF(out_dev, NOXFRM));
+	if (!rth) {
+		err = -ENOBUFS;
+		goto cleanup;
+	}
+
+	rth->rt_is_input = 1;
+	RT_CACHE_STAT_INC(in_slow_tot);
+
+	rth->dst.input = ip_forward;
+
+	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
+		       do_cache);
+	lwtunnel_set_redirect(&rth->dst);
+	skb_dst_set(skb, &rth->dst);
+out:
+	err = 0;
+ cleanup:
+	return err;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+/* To make ICMP error packets follow the flow they refer to, the multipath
+ * hash is calculated from the inner IP addresses when they can be read.
+ */
+static void ip_multipath_l3_keys(const struct sk_buff *skb,
+				 struct flow_keys *hash_keys)
+{
+	const struct iphdr *outer_iph = ip_hdr(skb);
+	const struct iphdr *key_iph = outer_iph;
+	const struct iphdr *inner_iph;
+	const struct icmphdr *icmph;
+	struct iphdr _inner_iph;
+	struct icmphdr _icmph;
+
+	if (likely(outer_iph->protocol != IPPROTO_ICMP))
+		goto out;
+	/* Non-first fragments (non-zero offset) keep the outer addresses. */
+	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
+		goto out;
+
+	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
+				   &_icmph);
+	if (!icmph)
+		goto out;
+
+	if (!icmp_is_err(icmph->type))
+		goto out;
+
+	inner_iph = skb_header_pointer(skb,
+				       outer_iph->ihl * 4 + sizeof(_icmph),
+				       sizeof(_inner_iph), &_inner_iph);
+	if (!inner_iph)
+		goto out;
+
+	key_iph = inner_iph;
+out:
+	hash_keys->addrs.v4addrs.src = key_iph->saddr;
+	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
+}
+/* Hash the outer fields selected by the hash_fields sysctl; 0 if none. */
+static u32 fib_multipath_custom_hash_outer(const struct net *net,
+					   const struct sk_buff *skb,
+					   bool *p_has_inner)
+{
+	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
+	struct flow_keys keys, hash_keys;
+
+	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
+		return 0;
+
+	memset(&hash_keys, 0, sizeof(hash_keys));
+	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
+
+	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
+		hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
+		hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+		hash_keys.basic.ip_proto = keys.basic.ip_proto;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
+		hash_keys.ports.src = keys.ports.src;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+		hash_keys.ports.dst = keys.ports.dst;
+	/* Tell the caller whether the dissector saw an encapsulation. */
+	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
+	return flow_hash_from_keys(&hash_keys);
+}
+/* Hash the inner (encapsulated) fields selected by hash_fields; 0 if none. */
+static u32 fib_multipath_custom_hash_inner(const struct net *net,
+					   const struct sk_buff *skb,
+					   bool has_inner)
+{
+	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
+	struct flow_keys keys, hash_keys;
+
+	/* We assume the packet carries an encapsulation, but if none was
+	 * encountered during dissection of the outer flow, then there is no
+	 * point in calling the flow dissector again.
+	 */
+	if (!has_inner)
+		return 0;
+
+	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
+		return 0;
+
+	memset(&hash_keys, 0, sizeof(hash_keys));
+	skb_flow_dissect_flow_keys(skb, &keys, 0);
+
+	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
+		return 0;
+
+	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
+			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
+			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
+			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
+			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
+			hash_keys.tags.flow_label = keys.tags.flow_label;
+	}
+
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
+		hash_keys.basic.ip_proto = keys.basic.ip_proto;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
+		hash_keys.ports.src = keys.ports.src;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
+		hash_keys.ports.dst = keys.ports.dst;
+
+	return flow_hash_from_keys(&hash_keys);
+}
+
+static u32 fib_multipath_custom_hash_skb(const struct net *net,
+					 const struct sk_buff *skb)
+{
+	bool encap = true;	/* stays true if the outer pass returns early */
+	u32 outer, inner;
+
+	/* Outer first: it reports whether an inner header was seen. */
+	outer = fib_multipath_custom_hash_outer(net, skb, &encap);
+	inner = fib_multipath_custom_hash_inner(net, skb, encap);
+	return jhash_2words(outer, inner, 0);
+}
+/* Hash the flowi4 fields selected by the hash_fields sysctl; 0 if none. */
+static u32 fib_multipath_custom_hash_fl4(const struct net *net,
+					 const struct flowi4 *fl4)
+{
+	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
+	struct flow_keys hash_keys;
+
+	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
+		return 0;
+
+	memset(&hash_keys, 0, sizeof(hash_keys));
+	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
+		hash_keys.addrs.v4addrs.src = fl4->saddr;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
+		hash_keys.addrs.v4addrs.dst = fl4->daddr;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+		hash_keys.basic.ip_proto = fl4->flowi4_proto;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
+		hash_keys.ports.src = fl4->fl4_sport;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+		hash_keys.ports.dst = fl4->fl4_dport;
+
+	return flow_hash_from_keys(&hash_keys);
+}
+
+/* Compute the multipath hash; if skb is set it is used and fl4 can be NULL. */
+int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
+		       const struct sk_buff *skb, struct flow_keys *flkeys)
+{
+	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
+	struct flow_keys hash_keys;
+	u32 mhash = 0;
+
+	switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
+	case 0:	/* source and destination address only */
+		memset(&hash_keys, 0, sizeof(hash_keys));
+		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+		if (skb) {
+			ip_multipath_l3_keys(skb, &hash_keys);
+		} else {
+			hash_keys.addrs.v4addrs.src = fl4->saddr;
+			hash_keys.addrs.v4addrs.dst = fl4->daddr;
+		}
+		mhash = flow_hash_from_keys(&hash_keys);
+		break;
+	case 1:	/* addresses, ports and IP protocol */
+		/* skb is currently provided only when forwarding */
+		if (skb) {
+			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+			struct flow_keys keys;
+
+			/* short-circuit if we already have L4 hash present */
+			if (skb->l4_hash)
+				return skb_get_hash_raw(skb) >> 1;
+
+			memset(&hash_keys, 0, sizeof(hash_keys));
+
+			if (!flkeys) {
+				skb_flow_dissect_flow_keys(skb, &keys, flag);
+				flkeys = &keys;
+			}
+
+			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
+			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
+			hash_keys.ports.src = flkeys->ports.src;
+			hash_keys.ports.dst = flkeys->ports.dst;
+			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
+		} else {
+			memset(&hash_keys, 0, sizeof(hash_keys));
+			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+			hash_keys.addrs.v4addrs.src = fl4->saddr;
+			hash_keys.addrs.v4addrs.dst = fl4->daddr;
+			hash_keys.ports.src = fl4->fl4_sport;
+			hash_keys.ports.dst = fl4->fl4_dport;
+			hash_keys.basic.ip_proto = fl4->flowi4_proto;
+		}
+		mhash = flow_hash_from_keys(&hash_keys);
+		break;
+	case 2:	/* addresses of the innermost dissected header, else as case 0 */
+		memset(&hash_keys, 0, sizeof(hash_keys));
+		/* skb is currently provided only when forwarding */
+		if (skb) {
+			struct flow_keys keys;
+
+			skb_flow_dissect_flow_keys(skb, &keys, 0);
+			/* Inner can be v4 or v6 */
+			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+				hash_keys.tags.flow_label = keys.tags.flow_label;
+				hash_keys.basic.ip_proto = keys.basic.ip_proto;
+			} else {
+				/* Same as case 0 */
+				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+				ip_multipath_l3_keys(skb, &hash_keys);
+			}
+		} else {
+			/* Same as case 0 */
+			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+			hash_keys.addrs.v4addrs.src = fl4->saddr;
+			hash_keys.addrs.v4addrs.dst = fl4->daddr;
+		}
+		mhash = flow_hash_from_keys(&hash_keys);
+		break;
+	case 3:	/* fields chosen by sysctl_fib_multipath_hash_fields */
+		if (skb)
+			mhash = fib_multipath_custom_hash_skb(net, skb);
+		else
+			mhash = fib_multipath_custom_hash_fl4(net, fl4);
+		break;
+	}
+
+	if (multipath_hash)
+		mhash = jhash_2words(mhash, multipath_hash, 0);
+
+	return mhash >> 1;	/* 31-bit result, non-negative as an int */
+}
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+/* Select a multipath nexthop if there are several, then build the route. */
+static int ip_mkroute_input(struct sk_buff *skb,
+			    struct fib_result *res,
+			    struct in_device *in_dev,
+			    __be32 daddr, __be32 saddr, u32 tos,
+			    struct flow_keys *hkeys)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (res->fi && fib_info_num_path(res->fi) > 1) {
+		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
+
+		fib_select_multipath(res, h);
+		IPCB(skb)->flags |= IPSKB_MULTIPATH;
+	}
+#endif
+
+	/* create a routing cache entry */
+	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
+}
+
+/* Implements the same saddr-related checks as ip_route_input_slow(), assuming
+ * a valid, non-local-broadcast daddr; uses @hint's dst, not a route lookup.
+ */
+int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+		      u8 tos, struct net_device *dev,
+		      const struct sk_buff *hint)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	struct rtable *rt = skb_rtable(hint);
+	struct net *net = dev_net(dev);
+	int err = -EINVAL;
+	u32 tag = 0;
+
+	if (!in_dev)		/* IP disabled on @dev; in_dev is used below */
+		return -EINVAL;
+
+	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+	    ipv4_is_zeronet(saddr))
+		goto martian_source;
+
+	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+		goto martian_source;
+
+	if (rt->rt_type != RTN_LOCAL)
+		goto skip_validate_source;
+
+	tos &= IPTOS_RT_MASK;
+	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
+	if (err < 0)
+		goto martian_source;
+
+skip_validate_source:
+	skb_dst_copy(skb, hint);
+	return 0;
+
+martian_source:
+	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+	return err;
+}
+
+/* get device for dst_alloc with local routes: L3 master, else loopback */
+static struct net_device *ip_rt_get_dev(struct net *net,
+					const struct fib_result *res)
+{
+	struct net_device *master;
+
+	if (!res->fi || !res->nhc)
+		return net->loopback_dev;
+
+	master = l3mdev_master_dev_rcu(res->nhc->nhc_dev);
+	return master ? master : net->loopback_dev;
+}
+
+/*
+ *	NOTE. We drop all packets that have local source
+ *	addresses, because every properly looped back packet
+ *	must have the correct destination already attached by the output
+ *	routine.  Changes in the enforced policies must also be applied to
+ *	ip_route_use_hint().
+ *
+ *	This approach solves two big problems:
+ *	1. Non-simplex devices are handled properly.
+ *	2. IP spoofing attempts are filtered with a 100% guarantee.
+ *	Called with rcu_read_lock() held.
+ */
+
+static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			       u8 tos, struct net_device *dev,
+			       struct fib_result *res)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	struct flow_keys *flkeys = NULL, _flkeys;
+	struct net    *net = dev_net(dev);
+	struct ip_tunnel_info *tun_info;
+	int		err = -EINVAL;
+	unsigned int	flags = 0;
+	u32		itag = 0;
+	struct rtable	*rth;
+	struct flowi4	fl4;
+	bool do_cache = true;
+
+	/* IP on this device is disabled. */
+
+	if (!in_dev)
+		goto out;
+
+	/* Check for the most weird martians, which can be not detected
+	 * by fib_lookup.
+	 */
+
+	tun_info = skb_tunnel_info(skb);
+	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
+		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
+	else
+		fl4.flowi4_tun_key.tun_id = 0;
+	skb_dst_drop(skb);
+
+	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
+		goto martian_source;
+
+	res->fi = NULL;
+	res->table = NULL;
+	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
+		goto brd_input;
+
+	/* Accept zero addresses only for limited broadcast;
+	 * whether this should be fixed is unclear. Waiting for complaints :-)
+	 */
+	if (ipv4_is_zeronet(saddr))
+		goto martian_source;
+
+	if (ipv4_is_zeronet(daddr))
+		goto martian_destination;
+
+	/* The code below avoids calling IN_DEV_NET_ROUTE_LOCALNET() unless
+	 * needed, and calls it once if daddr and/or saddr are loopback
+	 */
+	if (ipv4_is_loopback(daddr)) {
+		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+			goto martian_destination;
+	} else if (ipv4_is_loopback(saddr)) {
+		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+			goto martian_source;
+	}
+
+	/*
+	 *	Now we are ready to route packet.
+	 */
+	fl4.flowi4_l3mdev = 0;
+	fl4.flowi4_oif = 0;
+	fl4.flowi4_iif = dev->ifindex;
+	fl4.flowi4_mark = skb->mark;
+	fl4.flowi4_tos = tos;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_flags = 0;
+	fl4.daddr = daddr;
+	fl4.saddr = saddr;
+	fl4.flowi4_uid = sock_net_uid(net, NULL);
+	fl4.flowi4_multipath_hash = 0;
+
+	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
+		flkeys = &_flkeys;
+	} else {
+		fl4.flowi4_proto = 0;
+		fl4.fl4_sport = 0;
+		fl4.fl4_dport = 0;
+	}
+
+	err = fib_lookup(net, &fl4, res, 0);
+	if (err != 0) {
+		if (!IN_DEV_FORWARD(in_dev))
+			err = -EHOSTUNREACH;
+		goto no_route;
+	}
+
+	if (res->type == RTN_BROADCAST) {
+		if (IN_DEV_BFORWARD(in_dev))
+			goto make_route;
+		/* do not cache if bc_forwarding is enabled */
+		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
+			do_cache = false;
+		goto brd_input;
+	}
+
+	if (res->type == RTN_LOCAL) {
+		err = fib_validate_source(skb, saddr, daddr, tos,
+					  0, dev, in_dev, &itag);
+		if (err < 0)
+			goto martian_source;
+		goto local_input;
+	}
+
+	if (!IN_DEV_FORWARD(in_dev)) {
+		err = -EHOSTUNREACH;
+		goto no_route;
+	}
+	if (res->type != RTN_UNICAST)
+		goto martian_destination;
+
+make_route:
+	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
+out:	return err;
+
+brd_input:
+	if (skb->protocol != htons(ETH_P_IP))
+		goto e_inval;
+
+	if (!ipv4_is_zeronet(saddr)) {
+		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
+					  in_dev, &itag);
+		if (err < 0)
+			goto martian_source;
+	}
+	flags |= RTCF_BROADCAST;
+	res->type = RTN_BROADCAST;
+	RT_CACHE_STAT_INC(in_brd);
+
+local_input:
+	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
+		IPCB(skb)->flags |= IPSKB_NOPOLICY;
+	/* Reuse the nexthop's cached input route when caching is allowed. */
+	do_cache &= res->fi && !itag;
+	if (do_cache) {
+		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+
+		rth = rcu_dereference(nhc->nhc_rth_input);
+		if (rt_cache_valid(rth)) {
+			skb_dst_set_noref(skb, &rth->dst);
+			err = 0;
+			goto out;
+		}
+	}
+
+	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
+			   flags | RTCF_LOCAL, res->type, false);
+	if (!rth)
+		goto e_nobufs;
+
+	rth->dst.output= ip_rt_bug;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	rth->dst.tclassid = itag;
+#endif
+	rth->rt_is_input = 1;
+
+	RT_CACHE_STAT_INC(in_slow_tot);
+	if (res->type == RTN_UNREACHABLE) {
+		rth->dst.input= ip_error;
+		rth->dst.error= -err;
+		rth->rt_flags	&= ~RTCF_LOCAL;
+	}
+	/* Cacheable: attach lwtunnel state and publish in the nexthop. */
+	if (do_cache) {
+		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+
+		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
+		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+			WARN_ON(rth->dst.input == lwtunnel_input);
+			rth->dst.lwtstate->orig_input = rth->dst.input;
+			rth->dst.input = lwtunnel_input;
+		}
+
+		if (unlikely(!rt_cache_route(nhc, rth)))
+			rt_add_uncached_list(rth);
+	}
+	skb_dst_set(skb, &rth->dst);
+	err = 0;
+	goto out;
+
+no_route:
+	RT_CACHE_STAT_INC(in_no_route);
+	res->type = RTN_UNREACHABLE;
+	res->fi = NULL;
+	res->table = NULL;
+	goto local_input;
+
+	/*
+	 *	Do not cache martian addresses: they should be logged (RFC1812)
+	 */
+martian_destination:
+	RT_CACHE_STAT_INC(in_martian_dst);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+	if (IN_DEV_LOG_MARTIANS(in_dev))
+		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
+				     &daddr, &saddr, dev->name);
+#endif
+
+e_inval:
+	err = -EINVAL;
+	goto out;
+
+e_nobufs:
+	err = -ENOBUFS;
+	goto out;
+
+martian_source:
+	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+	goto out;
+}
+
+/* Input-route dispatch: multicast or slow path. Called under rcu_read_lock. */
+static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			      u8 tos, struct net_device *dev, struct fib_result *res)
+{
+	/* Multicast destinations are recognised here rather than in the
+	 * route cache. Too many Ethernet cards have broken or missing
+	 * hardware multicast filters, so a host on a multicast-heavy
+	 * network used to collect many useless route cache entries (e.g.
+	 * SDR announcements from all over the world). Filtering here
+	 * avoids that. Provided the software IP multicast filter is
+	 * organised reasonably (at least hashed), this is no slower than
+	 * hitting reject entries in the route cache.
+	 * Multicast routers are not affected, because a route cache
+	 * entry is eventually created for them.
+	 */
+	if (ipv4_is_multicast(daddr)) {
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+		int our = 0;
+		int err = -EINVAL;	/* result unless ip_route_input_mc() runs */
+
+		if (!in_dev)
+			return err;
+		our = ip_check_mc_rcu(in_dev, daddr, saddr,
+				      ip_hdr(skb)->protocol);
+
+		/* no match yet on an l3 slave: retry against skb->dev (l3 master) */
+		if (!our && netif_is_l3_slave(dev)) {
+			struct in_device *l3_in_dev;
+
+			l3_in_dev = __in_dev_get_rcu(skb->dev);
+			if (l3_in_dev)
+				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
+						      ip_hdr(skb)->protocol);
+		}
+
+		if (our
+#ifdef CONFIG_IP_MROUTE
+			||
+		    (!ipv4_is_local_multicast(daddr) &&
+		     IN_DEV_MFORWARD(in_dev))
+#endif
+		   ) {
+			err = ip_route_input_mc(skb, daddr, saddr,
+						tos, dev, our);
+		}
+		return err;
+	}
+
+	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
+}
+
+int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			 u8 tos, struct net_device *dev)
+{
+	/* Scratch lookup result; only the status code is returned. */
+	struct fib_result lookup;
+	int rc;
+
+	rcu_read_lock();
+	rc = ip_route_input_rcu(skb, daddr, saddr, tos & IPTOS_RT_MASK,
+				dev, &lookup);
+	rcu_read_unlock();
+	return rc;
+}
+EXPORT_SYMBOL(ip_route_input_noref);
+
+/* Build or reuse a cached output route. Called with rcu_read_lock() held. */
+static struct rtable *__mkroute_output(const struct fib_result *res,
+				       const struct flowi4 *fl4, int orig_oif,
+				       struct net_device *dev_out,
+				       unsigned int flags)
+{
+	struct fib_info *fi = res->fi;
+	struct fib_nh_exception *fnhe;
+	struct in_device *in_dev;
+	u16 type = res->type;
+	struct rtable *rth;
+	bool do_cache;
+
+	in_dev = __in_dev_get_rcu(dev_out);
+	if (!in_dev)
+		return ERR_PTR(-EINVAL);
+
+	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
+		if (ipv4_is_loopback(fl4->saddr) &&
+		    !(dev_out->flags & IFF_LOOPBACK) &&
+		    !netif_is_l3_master(dev_out))
+			return ERR_PTR(-EINVAL);
+
+	if (ipv4_is_lbcast(fl4->daddr))
+		type = RTN_BROADCAST;
+	else if (ipv4_is_multicast(fl4->daddr))
+		type = RTN_MULTICAST;
+	else if (ipv4_is_zeronet(fl4->daddr))
+		return ERR_PTR(-EINVAL);
+
+	if (dev_out->flags & IFF_LOOPBACK)
+		flags |= RTCF_LOCAL;
+
+	do_cache = true;
+	if (type == RTN_BROADCAST) {
+		flags |= RTCF_BROADCAST | RTCF_LOCAL;
+		fi = NULL;	/* no fib_info, so the result is never cached */
+	} else if (type == RTN_MULTICAST) {
+		flags |= RTCF_MULTICAST | RTCF_LOCAL;
+		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
+				     fl4->flowi4_proto))
+			flags &= ~RTCF_LOCAL;
+		else
+			do_cache = false;
+		/* If no multicast route exists, fall back to the default
+		 * route but do not use its gateway (fi is dropped).
+		 * Yes, it is a hack.
+		 */
+		if (fi && res->prefixlen < 4)
+			fi = NULL;
+	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
+		   (orig_oif != dev_out->ifindex)) {
+		/* For local routes that require a particular output interface
+		 * we do not want to cache the result.  Caching the result
+		 * causes incorrect behaviour when there are multiple source
+		 * addresses on the interface: a recipient waiting on that
+		 * interface would never see the packet, because it would be
+		 * delivered on the loopback interface, and the IP_PKTINFO
+		 * ipi_ifindex would be set to the loopback interface as
+		 * well.
+		 */
+		do_cache = false;
+	}
+
+	fnhe = NULL;
+	do_cache &= fi != NULL;
+	if (fi) {
+		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+		struct rtable __rcu **prth;
+
+		fnhe = find_exception(nhc, fl4->daddr);
+		if (!do_cache)
+			goto add;
+		if (fnhe) {
+			prth = &fnhe->fnhe_rth_output;
+		} else {
+			if (unlikely(fl4->flowi4_flags &
+				     FLOWI_FLAG_KNOWN_NH &&
+				     !(nhc->nhc_gw_family &&
+				       nhc->nhc_scope == RT_SCOPE_LINK))) {
+				do_cache = false;
+				goto add;
+			}
+			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
+		}
+		rth = rcu_dereference(*prth);
+		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
+			return rth;	/* cache hit */
+	}
+
+add:
+	rth = rt_dst_alloc(dev_out, flags, type,
+			   IN_DEV_ORCONF(in_dev, NOXFRM));
+	if (!rth)
+		return ERR_PTR(-ENOBUFS);
+
+	rth->rt_iif = orig_oif;
+
+	RT_CACHE_STAT_INC(out_slow_tot);
+
+	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+		if (flags & RTCF_LOCAL &&
+		    !(dev_out->flags & IFF_LOOPBACK)) {
+			rth->dst.output = ip_mc_output;
+			RT_CACHE_STAT_INC(out_slow_mc);
+		}
+#ifdef CONFIG_IP_MROUTE
+		if (type == RTN_MULTICAST) {
+			if (IN_DEV_MFORWARD(in_dev) &&
+			    !ipv4_is_local_multicast(fl4->daddr)) {
+				rth->dst.input = ip_mr_input;
+				rth->dst.output = ip_mc_output;
+			}
+		}
+#endif
+	}
+
+	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
+	lwtunnel_set_redirect(&rth->dst);
+
+	return rth;
+}
+
+/*
+ * Output route resolver entry point: takes the RCU read lock itself
+ * and delegates the lookup to ip_route_output_key_hash_rcu().
+ */
+struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
+					const struct sk_buff *skb)
+{
+	struct fib_result lookup = {
+		.type		= RTN_UNSPEC,
+		.fi		= NULL,
+		.table		= NULL,
+		.tclassid	= 0,
+	};
+	struct rtable *route;
+
+	fl4->flowi4_iif = LOOPBACK_IFINDEX;
+	ip_rt_fix_tos(fl4);
+
+	rcu_read_lock();
+	route = ip_route_output_key_hash_rcu(net, fl4, &lookup, skb);
+	rcu_read_unlock();
+
+	return route;
+}
+EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
+
+struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
+					    struct fib_result *res,
+					    const struct sk_buff *skb)
+{
+	struct net_device *dev_out = NULL;
+	int orig_oif = fl4->flowi4_oif;	/* oif as requested, before overrides */
+	unsigned int flags = 0;
+	struct rtable *rth;
+	int err;
+
+	if (fl4->saddr) {
+		if (ipv4_is_multicast(fl4->saddr) ||
+		    ipv4_is_lbcast(fl4->saddr) ||
+		    ipv4_is_zeronet(fl4->saddr)) {
+			rth = ERR_PTR(-EINVAL);
+			goto out;
+		}
+
+		rth = ERR_PTR(-ENETUNREACH);
+
+		/* The check for oif == dev_out->oif was removed here.
+		 * It was wrong for two reasons:
+		 * 1. ip_dev_find(net, saddr) can return the wrong iface if
+		 *    saddr is assigned to multiple interfaces.
+		 * 2. Moreover, we are allowed to send packets with the saddr
+		 *    of another iface. --ANK
+		 */
+
+		if (fl4->flowi4_oif == 0 &&
+		    (ipv4_is_multicast(fl4->daddr) ||
+		     ipv4_is_lbcast(fl4->daddr))) {
+			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+			dev_out = __ip_dev_find(net, fl4->saddr, false);
+			if (!dev_out)
+				goto out;
+
+			/* Special hack: the user can direct multicasts
+			 * and limited broadcasts via the desired interface
+			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
+			 * This hack is not just for fun: it allows
+			 * vic, vat and friends to work.
+			 * They bind the socket to loopback, set ttl to zero
+			 * and expect that it will work.
+			 * From the viewpoint of the routing cache they are
+			 * broken, because we are not allowed to build a multicast
+			 * path with a loopback source addr (the routing cache
+			 * cannot know that ttl is zero, so that the packet
+			 * will not leave this host and the route is valid).
+			 * Luckily, this hack is a good workaround.
+			 */
+
+			fl4->flowi4_oif = dev_out->ifindex;
+			goto make_route;
+		}
+
+		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
+			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+			if (!__ip_dev_find(net, fl4->saddr, false))
+				goto out;
+		}
+	}
+
+
+	if (fl4->flowi4_oif) {
+		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
+		rth = ERR_PTR(-ENODEV);
+		if (!dev_out)
+			goto out;
+
+		/* RACE: Check return value of inet_select_addr instead. */
+		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
+			rth = ERR_PTR(-ENETUNREACH);
+			goto out;
+		}
+		if (ipv4_is_local_multicast(fl4->daddr) ||
+		    ipv4_is_lbcast(fl4->daddr) ||
+		    fl4->flowi4_proto == IPPROTO_IGMP) {
+			if (!fl4->saddr)
+				fl4->saddr = inet_select_addr(dev_out, 0,
+							      RT_SCOPE_LINK);
+			goto make_route;
+		}
+		if (!fl4->saddr) {
+			if (ipv4_is_multicast(fl4->daddr))
+				fl4->saddr = inet_select_addr(dev_out, 0,
+							      fl4->flowi4_scope);
+			else if (!fl4->daddr)
+				fl4->saddr = inet_select_addr(dev_out, 0,
+							      RT_SCOPE_HOST);
+		}
+	}
+
+	if (!fl4->daddr) {	/* no destination: route to ourselves via loopback */
+		fl4->daddr = fl4->saddr;
+		if (!fl4->daddr)
+			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
+		dev_out = net->loopback_dev;
+		fl4->flowi4_oif = LOOPBACK_IFINDEX;
+		res->type = RTN_LOCAL;
+		flags |= RTCF_LOCAL;
+		goto make_route;
+	}
+
+	err = fib_lookup(net, fl4, res, 0);
+	if (err) {
+		res->fi = NULL;
+		res->table = NULL;
+		if (fl4->flowi4_oif &&
+		    (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
+			/* Apparently, the routing tables are wrong. Assume
+			 * that the destination is on link.
+			 *
+			 * WHY? DW.
+			 * Because we are allowed to send to an iface
+			 * even if it has NO routes and NO assigned
+			 * addresses. When oif is specified, the routing
+			 * tables are looked up for one purpose only:
+			 * to catch a destination that is gatewayed rather
+			 * than direct. Moreover, if MSG_DONTROUTE is set,
+			 * we send the packet, ignoring both routing tables
+			 * and ifaddr state. --ANK
+			 *
+			 *
+			 * We could do this even if oif is unknown,
+			 * like IPv6, but we do not.
+			 */
+
+			if (fl4->saddr == 0)
+				fl4->saddr = inet_select_addr(dev_out, 0,
+							      RT_SCOPE_LINK);
+			res->type = RTN_UNICAST;
+			goto make_route;
+		}
+		rth = ERR_PTR(err);
+		goto out;
+	}
+
+	if (res->type == RTN_LOCAL) {
+		if (!fl4->saddr) {
+			if (res->fi->fib_prefsrc)
+				fl4->saddr = res->fi->fib_prefsrc;
+			else
+				fl4->saddr = fl4->daddr;
+		}
+
+		/* The L3 master device is the loopback for that domain */
+		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
+			net->loopback_dev;
+
+		/* make sure orig_oif points to the fib result device even
+		 * though packet rx/tx happens over loopback or l3mdev
+		 */
+		orig_oif = FIB_RES_OIF(*res);
+
+		fl4->flowi4_oif = dev_out->ifindex;
+		flags |= RTCF_LOCAL;
+		goto make_route;
+	}
+
+	fib_select_path(net, res, fl4, skb);
+
+	dev_out = FIB_RES_DEV(*res);
+
+make_route:
+	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
+
+out:
+	return rth;
+}
+
+static struct dst_ops ipv4_dst_blackhole_ops = { /* see ipv4_blackhole_route() */
+	.family			= AF_INET,
+	.default_advmss		= ipv4_default_advmss,
+	.neigh_lookup		= ipv4_neigh_lookup,
+	.check			= dst_blackhole_check,
+	.cow_metrics		= dst_blackhole_cow_metrics,
+	.update_pmtu		= dst_blackhole_update_pmtu,
+	.redirect		= dst_blackhole_redirect,
+	.mtu			= dst_blackhole_mtu,
+};
+
+struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
+{
+	struct rtable *ort = (struct rtable *) dst_orig;
+	struct rtable *rt;
+
+	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
+	if (!rt) {
+		dst_release(dst_orig);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Blackhole dst: dst_discard handlers, bound to the loopback device. */
+	rt->dst.__use = 1;
+	rt->dst.input = dst_discard;
+	rt->dst.output = dst_discard_out;
+	rt->dst.dev = net->loopback_dev;
+	netdev_hold(rt->dst.dev, &rt->dst.dev_tracker, GFP_ATOMIC);
+
+	/* Mirror the original route; only rt_genid is taken from the netns. */
+	rt->rt_is_input = ort->rt_is_input;
+	rt->rt_iif = ort->rt_iif;
+	rt->rt_pmtu = ort->rt_pmtu;
+	rt->rt_mtu_locked = ort->rt_mtu_locked;
+	rt->rt_genid = rt_genid_ipv4(net);
+	rt->rt_flags = ort->rt_flags;
+	rt->rt_type = ort->rt_type;
+	rt->rt_uses_gateway = ort->rt_uses_gateway;
+	rt->rt_gw_family = ort->rt_gw_family;
+	if (rt->rt_gw_family == AF_INET)
+		rt->rt_gw4 = ort->rt_gw4;
+	else if (rt->rt_gw_family == AF_INET6)
+		rt->rt_gw6 = ort->rt_gw6;
+
+	dst_release(dst_orig);
+	return &rt->dst;
+}
+
+struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
+				    const struct sock *sk)
+{
+	struct rtable *route = __ip_route_output_key(net, flp4);
+
+	/* Lookup errors and flows without a protocol skip the xfrm lookup. */
+	if (IS_ERR(route) || !flp4->flowi4_proto)
+		return route;
+
+	/* Record the device picked by the plain lookup as the flow's oif. */
+	flp4->flowi4_oif = route->dst.dev->ifindex;
+	route = (struct rtable *)xfrm_lookup_route(net, &route->dst,
+						   flowi4_to_flowi(flp4),
+						   sk, 0);
+
+	return route;
+}
+EXPORT_SYMBOL_GPL(ip_route_output_flow);
+
+struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
+				      struct net_device *dev,
+				      struct net *net, __be32 *saddr,
+				      const struct ip_tunnel_info *info,
+				      u8 protocol, bool use_cache)
+{
+#ifdef CONFIG_DST_CACHE
+	struct dst_cache *dst_cache;
+#endif
+	struct rtable *rt = NULL;
+	struct flowi4 fl4;
+	__u8 tos;
+
+#ifdef CONFIG_DST_CACHE
+	dst_cache = (struct dst_cache *)&info->dst_cache;	/* drops const */
+	if (use_cache) {
+		rt = dst_cache_get_ip4(dst_cache, saddr);
+		if (rt)
+			return rt;	/* cache hit: no lookup needed */
+	}
+#endif
+	memset(&fl4, 0, sizeof(fl4));	/* flow key is built from info and skb */
+	fl4.flowi4_mark = skb->mark;
+	fl4.flowi4_proto = protocol;
+	fl4.daddr = info->key.u.ipv4.dst;
+	fl4.saddr = info->key.u.ipv4.src;
+	tos = info->key.tos;
+	fl4.flowi4_tos = RT_TOS(tos);
+
+	rt = ip_route_output_key(net, &fl4);
+	if (IS_ERR(rt)) {
+		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
+		return ERR_PTR(-ENETUNREACH);	/* lookup's own error is not propagated */
+	}
+	if (rt->dst.dev == dev) { /* is this necessary? */
+		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
+		ip_rt_put(rt);
+		return ERR_PTR(-ELOOP);
+	}
+#ifdef CONFIG_DST_CACHE
+	if (use_cache)
+		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
+#endif
+	*saddr = fl4.saddr;	/* hand the flow's source address back to the caller */
+	return rt;
+}
+EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
+
+/* Append an RTM_NEWROUTE message for rt to skb; called with rcu_read_lock held */
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
+			struct sk_buff *skb, u32 portid, u32 seq,
+			unsigned int flags)
+{
+	struct rtmsg *r;
+	struct nlmsghdr *nlh;
+	unsigned long expires = 0;
+	u32 error;
+	u32 metrics[RTAX_MAX];
+
+	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	r = nlmsg_data(nlh);
+	r->rtm_family	 = AF_INET;
+	r->rtm_dst_len	= 32;
+	r->rtm_src_len	= 0;
+	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;	/* fl4 may be NULL */
+	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
+	if (nla_put_u32(skb, RTA_TABLE, table_id))	/* always carries the full id */
+		goto nla_put_failure;
+	r->rtm_type	= rt->rt_type;
+	r->rtm_scope	= RT_SCOPE_UNIVERSE;
+	r->rtm_protocol = RTPROT_UNSPEC;
+	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
+	if (rt->rt_flags & RTCF_NOTIFY)
+		r->rtm_flags |= RTM_F_NOTIFY;
+	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
+		r->rtm_flags |= RTCF_DOREDIRECT;
+
+	if (nla_put_in_addr(skb, RTA_DST, dst))
+		goto nla_put_failure;
+	if (src) {
+		r->rtm_src_len = 32;
+		if (nla_put_in_addr(skb, RTA_SRC, src))
+			goto nla_put_failure;
+	}
+	if (rt->dst.dev &&
+	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+		goto nla_put_failure;
+	if (rt->dst.lwtstate &&
+	    lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
+		goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (rt->dst.tclassid &&
+	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
+		goto nla_put_failure;
+#endif
+	if (fl4 && !rt_is_input_route(rt) &&
+	    fl4->saddr != src) {
+		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
+			goto nla_put_failure;
+	}
+	if (rt->rt_uses_gateway) {
+		if (rt->rt_gw_family == AF_INET &&
+		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
+			goto nla_put_failure;
+		} else if (rt->rt_gw_family == AF_INET6) {
+			int alen = sizeof(struct in6_addr);
+			struct nlattr *nla;
+			struct rtvia *via;
+
+			/* NOTE(review): "+ 2" presumably covers rtvia_family - confirm */
+			nla = nla_reserve(skb, RTA_VIA, alen + 2);
+			if (!nla)
+				goto nla_put_failure;
+
+			via = nla_data(nla);
+			via->rtvia_family = AF_INET6;
+			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
+		}
+	}
+
+	expires = rt->dst.expires;
+	if (expires) {
+		unsigned long now = jiffies;
+
+		if (time_before(now, expires))
+			expires -= now;	/* absolute jiffies -> time remaining */
+		else
+			expires = 0;
+	}
+
+	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
+	if (rt->rt_pmtu && expires)	/* unexpired rt_pmtu overrides the MTU metric */
+		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
+	if (rt->rt_mtu_locked && expires)
+		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
+	if (rtnetlink_put_metrics(skb, metrics) < 0)
+		goto nla_put_failure;
+
+	if (fl4) {
+		if (fl4->flowi4_mark &&
+		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
+			goto nla_put_failure;
+
+		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
+		    nla_put_u32(skb, RTA_UID,
+				from_kuid_munged(current_user_ns(),
+						 fl4->flowi4_uid)))
+			goto nla_put_failure;
+
+		if (rt_is_input_route(rt)) {
+#ifdef CONFIG_IP_MROUTE
+			if (ipv4_is_multicast(dst) &&
+			    !ipv4_is_local_multicast(dst) &&
+			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+				int err = ipmr_get_route(net, skb,
+							 fl4->saddr, fl4->daddr,
+							 r, portid);
+
+				if (err <= 0) {
+					if (err == 0)
+						return 0;
+					goto nla_put_failure;
+				}
+			} else
+#endif
+				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
+					goto nla_put_failure;
+		}
+	}
+
+	error = rt->dst.error;
+
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);	/* every failure here is reported as -EMSGSIZE */
+	return -EMSGSIZE;
+}
+
+static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
+			    struct netlink_callback *cb, u32 table_id,
+			    struct fnhe_hash_bucket *bucket, int genid,
+			    int *fa_index, int fa_start, unsigned int flags)
+{
+	int i;
+
+	for (i = 0; i < FNHE_HASH_SIZE; i++) {	/* walk every hash chain */
+		struct fib_nh_exception *fnhe;
+
+		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
+		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
+			struct rtable *rt;
+			int err;
+
+			if (*fa_index < fa_start)	/* entry precedes fa_start */
+				goto next;
+
+			if (fnhe->fnhe_genid != genid)	/* generation mismatch */
+				goto next;
+
+			if (fnhe->fnhe_expires &&
+			    time_after(jiffies, fnhe->fnhe_expires))	/* expired */
+				goto next;
+
+			rt = rcu_dereference(fnhe->fnhe_rth_input);
+			if (!rt)	/* fall back to the output route */
+				rt = rcu_dereference(fnhe->fnhe_rth_output);
+			if (!rt)
+				goto next;
+
+			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
+					   table_id, NULL, skb,
+					   NETLINK_CB(cb->skb).portid,
+					   cb->nlh->nlmsg_seq, flags);
+			if (err)
+				return err;	/* *fa_index is not advanced on failure */
+next:
+			(*fa_index)++;	/* counts skipped entries as well */
+		}
+	}
+
+	return 0;
+}
+
+int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
+		       u32 table_id, struct fib_info *fi,
+		       int *fa_index, int fa_start, unsigned int flags)
+{
+	struct net *net = sock_net(cb->skb->sk);
+	int genid = fnhe_genid(net);
+	int path;
+
+	/* Dump the exception table of every nexthop not flagged dead. */
+	for (path = 0; path < fib_info_num_path(fi); path++) {
+		struct fib_nh_common *nhc = fib_info_nhc(fi, path);
+		struct fnhe_hash_bucket *table;
+		int rc = 0;
+
+		if (nhc->nhc_flags & RTNH_F_DEAD)
+			continue;
+
+		rcu_read_lock();
+		table = rcu_dereference(nhc->nhc_exceptions);
+		if (table)
+			rc = fnhe_dump_bucket(net, skb, cb, table_id, table,
+					      genid, fa_index, fa_start, flags);
+		rcu_read_unlock();
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
+						   u8 ip_proto, __be16 sport,
+						   __be16 dport)
+{
+	struct sk_buff *skb;
+	struct iphdr *iph;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	/* Build dummy IP and transport headers: this skb can pass
+	 * through a good chunk of the routing engine.
+	 */
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb->protocol = htons(ETH_P_IP);
+	iph = skb_put(skb, sizeof(struct iphdr));	/* other iphdr fields are not set */
+	iph->protocol = ip_proto;
+	iph->saddr = src;
+	iph->daddr = dst;
+	iph->version = 0x4;
+	iph->frag_off = 0;
+	iph->ihl = 0x5;
+	skb_set_transport_header(skb, skb->len);
+
+	switch (iph->protocol) {	/* other protocols get no transport header */
+	case IPPROTO_UDP: {
+		struct udphdr *udph;
+
+		udph = skb_put_zero(skb, sizeof(struct udphdr));
+		udph->source = sport;
+		udph->dest = dport;
+		udph->len = htons(sizeof(struct udphdr));
+		udph->check = 0;
+		break;
+	}
+	case IPPROTO_TCP: {
+		struct tcphdr *tcph;
+
+		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
+		tcph->source	= sport;
+		tcph->dest	= dport;
+		tcph->doff	= sizeof(struct tcphdr) / 4;
+		tcph->rst = 1;
+		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
+					    src, dst, 0);
+		break;
+	}
+	case IPPROTO_ICMP: {
+		struct icmphdr *icmph;
+
+		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
+		icmph->type = ICMP_ECHO;
+		icmph->code = 0;
+	}	/* last case: falls out of the switch without a break */
+	}
+
+	return skb;
+}
+
+static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
+				       const struct nlmsghdr *nlh,
+				       struct nlattr **tb,
+				       struct netlink_ext_ack *extack)
+{
+	struct rtmsg *rtm;
+	int i, err;
+
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+		NL_SET_ERR_MSG(extack,
+			       "ipv4: Invalid header for route get request");
+		return -EINVAL;
+	}
+
+	if (!netlink_strict_get_check(skb))	/* strict check off: lenient parse only */
+		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+					      rtm_ipv4_policy, extack);
+
+	rtm = nlmsg_data(nlh);
+	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
+	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
+	    rtm->rtm_table || rtm->rtm_protocol ||
+	    rtm->rtm_scope || rtm->rtm_type) {
+		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
+		return -EINVAL;
+	}
+
+	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
+			       RTM_F_LOOKUP_TABLE |
+			       RTM_F_FIB_MATCH)) {
+		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
+		return -EINVAL;
+	}
+
+	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+					    rtm_ipv4_policy, extack);
+	if (err)
+		return err;
+
+	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||	/* address given with length 0 */
+	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
+		return -EINVAL;
+	}
+
+	for (i = 0; i <= RTA_MAX; i++) {	/* whitelist of accepted attributes */
+		if (!tb[i])
+			continue;
+
+		switch (i) {
+		case RTA_IIF:
+		case RTA_OIF:
+		case RTA_SRC:
+		case RTA_DST:
+		case RTA_IP_PROTO:
+		case RTA_SPORT:
+		case RTA_DPORT:
+		case RTA_MARK:
+		case RTA_UID:
+			break;
+		default:
+			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+			     struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct nlattr *tb[RTA_MAX+1];
+	u32 table_id = RT_TABLE_MAIN;
+	__be16 sport = 0, dport = 0;
+	struct fib_result res = {};
+	u8 ip_proto = IPPROTO_UDP;	/* default when RTA_IP_PROTO is absent */
+	struct rtable *rt = NULL;
+	struct sk_buff *skb;
+	struct rtmsg *rtm;
+	struct flowi4 fl4 = {};
+	__be32 dst = 0;
+	__be32 src = 0;
+	kuid_t uid;
+	u32 iif;
+	int err;
+	int mark;
+
+	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
+	if (err < 0)
+		return err;
+
+	rtm = nlmsg_data(nlh);
+	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
+	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
+	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
+	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+	if (tb[RTA_UID])
+		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
+	else
+		uid = (iif ? INVALID_UID : current_uid());	/* no default uid with iif */
+
+	if (tb[RTA_IP_PROTO]) {
+		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+						  &ip_proto, AF_INET, extack);
+		if (err)
+			return err;
+	}
+
+	if (tb[RTA_SPORT])
+		sport = nla_get_be16(tb[RTA_SPORT]);
+
+	if (tb[RTA_DPORT])
+		dport = nla_get_be16(tb[RTA_DPORT]);
+
+	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
+	if (!skb)
+		return -ENOBUFS;
+
+	fl4.daddr = dst;
+	fl4.saddr = src;
+	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
+	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
+	fl4.flowi4_mark = mark;
+	fl4.flowi4_uid = uid;
+	if (sport)
+		fl4.fl4_sport = sport;
+	if (dport)
+		fl4.fl4_dport = dport;
+	fl4.flowi4_proto = ip_proto;
+
+	rcu_read_lock();
+
+	if (iif) {	/* input lookup: the dummy skb is routed as received on iif */
+		struct net_device *dev;
+
+		dev = dev_get_by_index_rcu(net, iif);
+		if (!dev) {
+			err = -ENODEV;
+			goto errout_rcu;
+		}
+
+		fl4.flowi4_iif = iif; /* for rt_fill_info */
+		skb->dev	= dev;
+		skb->mark	= mark;
+		err = ip_route_input_rcu(skb, dst, src,
+					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
+					 &res);
+
+		rt = skb_rtable(skb);
+		if (err == 0 && rt->dst.error)	/* surface the error stored in the dst */
+			err = -rt->dst.error;
+	} else {	/* output lookup */
+		fl4.flowi4_iif = LOOPBACK_IFINDEX;
+		skb->dev = net->loopback_dev;
+		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
+		err = 0;
+		if (IS_ERR(rt))
+			err = PTR_ERR(rt);
+		else
+			skb_dst_set(skb, &rt->dst);
+	}
+
+	if (err)
+		goto errout_rcu;
+
+	if (rtm->rtm_flags & RTM_F_NOTIFY)
+		rt->rt_flags |= RTCF_NOTIFY;
+
+	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
+		table_id = res.table ? res.table->tb_id : 0;
+
+	/* the same skb is reused to carry the netlink reply */
+	skb_trim(skb, 0);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb_reset_mac_header(skb);
+
+	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {	/* reply with the matched FIB entry */
+		struct fib_rt_info fri;
+
+		if (!res.fi) {
+			err = fib_props[res.type].error;
+			if (!err)
+				err = -EHOSTUNREACH;
+			goto errout_rcu;
+		}
+		fri.fi = res.fi;
+		fri.tb_id = table_id;
+		fri.dst = res.prefix;
+		fri.dst_len = res.prefixlen;
+		fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos);
+		fri.type = rt->rt_type;
+		fri.offload = 0;
+		fri.trap = 0;
+		fri.offload_failed = 0;
+		if (res.fa_head) {	/* copy offload flags from the matching alias */
+			struct fib_alias *fa;
+
+			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
+				u8 slen = 32 - fri.dst_len;
+
+				if (fa->fa_slen == slen &&
+				    fa->tb_id == fri.tb_id &&
+				    fa->fa_dscp == fri.dscp &&
+				    fa->fa_info == res.fi &&
+				    fa->fa_type == fri.type) {
+					fri.offload = READ_ONCE(fa->offload);
+					fri.trap = READ_ONCE(fa->trap);
+					fri.offload_failed =
+						READ_ONCE(fa->offload_failed);
+					break;
+				}
+			}
+		}
+		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
+				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
+	} else {
+		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
+				   NETLINK_CB(in_skb).portid,
+				   nlh->nlmsg_seq, 0);
+	}
+	if (err < 0)
+		goto errout_rcu;
+
+	rcu_read_unlock();
+
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+
+errout_free:
+	return err;
+errout_rcu:
+	rcu_read_unlock();
+	kfree_skb(skb);	/* only the error paths free skb here */
+	goto errout_free;
+}
+
+void ip_rt_multicast_event(struct in_device *in_dev)
+{
+	rt_cache_flush(dev_net(in_dev->dev));	/* flush for the device's netns */
+}
+
+#ifdef CONFIG_SYSCTL
+static int ip_rt_gc_interval __read_mostly  = 60 * HZ;	/* "gc_interval" */
+static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;	/* "gc_min_interval[_ms]" */
+static int ip_rt_gc_elasticity __read_mostly	= 8;	/* "gc_elasticity" */
+static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;	/* floor for "min_pmtu" */
+
+static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
+		void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = (struct net *)__ctl->extra1;
+
+	/* Write-only knob: reads are rejected, the written value is unused. */
+	if (!write)
+		return -EINVAL;
+
+	rt_cache_flush(net);
+	fnhe_genid_bump(net);
+	return 0;
+}
+
+static struct ctl_table ipv4_route_table[] = {
+	{
+		.procname	= "gc_thresh",
+		.data		= &ipv4_dst_ops.gc_thresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "max_size",
+		.data		= &ip_rt_max_size,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		/* Deprecated: use gc_min_interval_ms instead. */
+
+		.procname	= "gc_min_interval",
+		.data		= &ip_rt_gc_min_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "gc_min_interval_ms",
+		.data		= &ip_rt_gc_min_interval, /* shared with gc_min_interval */
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
+	{
+		.procname	= "gc_timeout",
+		.data		= &ip_rt_gc_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "gc_interval",
+		.data		= &ip_rt_gc_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "redirect_load",
+		.data		= &ip_rt_redirect_load,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "redirect_number",
+		.data		= &ip_rt_redirect_number,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "redirect_silence",
+		.data		= &ip_rt_redirect_silence,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "error_cost",
+		.data		= &ip_rt_error_cost,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "error_burst",
+		.data		= &ip_rt_error_burst,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "gc_elasticity",
+		.data		= &ip_rt_gc_elasticity,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }	/* empty terminating entry */
+};
+
+static const char ipv4_route_flush_procname[] = "flush"; /* matched by address, not strcmp */
+
+static struct ctl_table ipv4_route_netns_table[] = {
+	{
+		.procname	= ipv4_route_flush_procname,
+		.maxlen		= sizeof(int),
+		.mode		= 0200,	/* write-only */
+		.proc_handler	= ipv4_sysctl_rtcache_flush,
+	},
+	{
+		.procname       = "min_pmtu",
+		.data           = &init_net.ipv4.ip_rt_min_pmtu,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = &ip_min_valid_pmtu,
+	},
+	{
+		.procname       = "mtu_expires",
+		.data           = &init_net.ipv4.ip_rt_mtu_expires,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_jiffies,
+	},
+	{
+		.procname   = "min_adv_mss",
+		.data       = &init_net.ipv4.ip_rt_min_advmss,
+		.maxlen     = sizeof(int),
+		.mode       = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{ },	/* terminator; .data pointers above are rebased per netns at init */
+};
+
+static __net_init int sysctl_route_net_init(struct net *net)
+{
+	struct ctl_table *tbl;
+	size_t table_size = ARRAY_SIZE(ipv4_route_netns_table);
+
+	tbl = ipv4_route_netns_table;	/* init_net registers the static table as is */
+	if (!net_eq(net, &init_net)) {
+		int i;
+
+		tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
+		if (!tbl)
+			goto err_dup;
+
+		/* Don't export non-whitelisted sysctls to unprivileged users */
+		if (net->user_ns != &init_user_ns) {
+			if (tbl[0].procname != ipv4_route_flush_procname) {
+				tbl[0].procname = NULL;
+				table_size = 0;
+			}
+		}
+
+		/* Rebase each .data pointer from init_net onto this netns.
+		 * Skip entry 0 ("flush", no .data) and the empty terminator.
+		 */
+		for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++)
+			tbl[i].data += (void *)net - (void *)&init_net;
+	}
+	tbl[0].extra1 = net;	/* the flush entry carries the netns in .extra1 */
+
+	net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route",
+						     tbl, table_size);
+	if (!net->ipv4.route_hdr)
+		goto err_reg;
+	return 0;
+
+err_reg:
+	if (tbl != ipv4_route_netns_table)	/* only the kmemdup()ed copy is ours to free */
+		kfree(tbl);
+err_dup:
+	return -ENOMEM;
+}
+
+static __net_exit void sysctl_route_net_exit(struct net *net)
+{
+	struct ctl_table *tbl;
+
+	tbl = net->ipv4.route_hdr->ctl_table_arg;	/* read before the header is unregistered */
+	unregister_net_sysctl_table(net->ipv4.route_hdr);
+	BUG_ON(tbl == ipv4_route_netns_table);	/* the static table must never be kfree()d */
+	kfree(tbl);
+}
+
+static __net_initdata struct pernet_operations sysctl_route_ops = {
+	.init = sysctl_route_net_init,	/* registers net/ipv4/route for the netns */
+	.exit = sysctl_route_net_exit,	/* unregisters it and frees the table copy */
+};
+#endif
+
+static __net_init int netns_ip_rt_init(struct net *net)
+{
+	/* Seed the per-netns min_pmtu, mtu_expires and min_adv_mss sysctls */
+	net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
+	net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
+	net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
+	return 0;
+}
+
+static struct pernet_operations __net_initdata ip_rt_ops = {
+	.init = netns_ip_rt_init,	/* only assigns defaults, so no .exit is set */
+};
+
+static __net_init int rt_genid_init(struct net *net)
+{
+	atomic_set(&net->ipv4.rt_genid, 0);	/* both generation ids start at 0 ... */
+	atomic_set(&net->fnhe_genid, 0);
+	atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());	/* ... this one at a random value */
+	return 0;
+}
+
+static __net_initdata struct pernet_operations rt_genid_ops = {
+	.init = rt_genid_init,	/* counters only, so no .exit is set */
+};
+
+static int __net_init ipv4_inetpeer_init(struct net *net)
+{
+	struct inet_peer_base *base = kmalloc(sizeof(*base), GFP_KERNEL);
+
+	if (base) {	/* publish the peer base only once it is initialised */
+		inet_peer_base_init(base);
+		net->ipv4.peers = base;
+	}
+	return base ? 0 : -ENOMEM;
+}
+
+static void __net_exit ipv4_inetpeer_exit(struct net *net)
+{
+	struct inet_peer_base *bp = net->ipv4.peers;
+
+	net->ipv4.peers = NULL;	/* clear the netns pointer before tearing down */
+	inetpeer_invalidate_tree(bp);
+	kfree(bp);	/* frees what ipv4_inetpeer_init() kmalloc()ed */
+}
+
+static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
+	.init	=	ipv4_inetpeer_init,	/* allocates net->ipv4.peers */
+	.exit	=	ipv4_inetpeer_exit,	/* invalidates and frees it */
+};
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;	/* allocated in ip_rt_init() */
+#endif /* CONFIG_IP_ROUTE_CLASSID */
+
+int __init ip_rt_init(void)
+{
+	void *idents_hash;
+	int cpu;
+
+	/* One block for ip_idents + ip_tstamps; about 2 MB on modern hosts */
+	idents_hash = alloc_large_system_hash("IP idents",
+					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
+					      0,
+					      16, /* one bucket per 64 KB */
+					      HASH_ZERO,
+					      NULL,
+					      &ip_idents_mask,
+					      2048,
+					      256*1024);
+
+	ip_idents = idents_hash;	/* ids come first, timestamps follow them */
+
+	get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
+
+	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
+
+	for_each_possible_cpu(cpu) {
+		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+
+		INIT_LIST_HEAD(&ul->head);
+		INIT_LIST_HEAD(&ul->quarantine);
+		spin_lock_init(&ul->lock);
+	}
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
+	if (!ip_rt_acct)
+		panic("IP: failed to allocate ip_rt_acct\n");
+#endif
+
+	ipv4_dst_ops.kmem_cachep =
+		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
+				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+
+	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;	/* shared slab cache */
+
+	if (dst_entries_init(&ipv4_dst_ops) < 0)
+		panic("IP: failed to allocate ipv4_dst_ops counter\n");
+
+	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
+
+	ipv4_dst_ops.gc_thresh = ~0;
+	ip_rt_max_size = INT_MAX;
+
+	devinet_init();
+	ip_fib_init();
+
+	if (ip_rt_proc_init())
+		pr_err("Unable to create route proc files\n");	/* not fatal: init goes on */
+#ifdef CONFIG_XFRM
+	xfrm_init();
+	xfrm4_init();
+#endif
+	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
+		      RTNL_FLAG_DOIT_UNLOCKED);
+
+#ifdef CONFIG_SYSCTL
+	register_pernet_subsys(&sysctl_route_ops);
+#endif
+	register_pernet_subsys(&ip_rt_ops);
+	register_pernet_subsys(&rt_genid_ops);
+	register_pernet_subsys(&ipv4_inetpeer_ops);
+	return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Registers the static net/ipv4/route table for init_net.  It is a separate
+ * hook only because of the ipv4 init order; sanitizing that would remove it.
+ */
+void __init ip_static_sysctl_init(void)
+{
+	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
+}
+#endif
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
new file mode 100644
index 0000000000..3b4dafefb4
--- /dev/null
+++ b/net/ipv4/syncookies.c
@@ -0,0 +1,449 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *  Syncookies implementation for the Linux kernel
+ *
+ *  Copyright (C) 1997 Andi Kleen
+ *  Based on ideas by D.J.Bernstein and Eric Schenk.
+ */
+
+#include <linux/tcp.h>
+#include <linux/siphash.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <net/secure_seq.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+static siphash_aligned_key_t syncookie_secret[2];
+
+#define COOKIEBITS 24	/* Upper bits store count */
+#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
+
+/* TCP Timestamp: the 6 lowest bits of the timestamp sent in the cookie
+ * SYN-ACK store TCP options:
+ *
+ * MSB                               LSB
+ * | 31 ...   6 |  5  |  4   | 3 2 1 0 |
+ * |  Timestamp | ECN | SACK | WScale  |
+ *
+ * When we receive a valid cookie-ACK, we look at the echoed tsval (if
+ * any) to figure out which TCP options we should use for the rebuilt
+ * connection.
+ *
+ * A WScale setting of '0xf' (which is an invalid scaling value)
+ * means that original syn did not include the TCP window scaling option.
+ */
+#define TS_OPT_WSCALE_MASK	0xf
+#define TS_OPT_SACK		BIT(4)
+#define TS_OPT_ECN		BIT(5)
+/* There is no TS_OPT_TIMESTAMP:
+ * if ACK contains timestamp option, we already know it was
+ * requested/supported by the syn/synack exchange.
+ */
+#define TSBITS	6
+
+static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
+		       u32 count, int c)
+{
+	net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
+	return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+			    (__force u32)sport << 16 | (__force u32)dport,	/* ports in one word */
+			    count, &syncookie_secret[c]);	/* c picks one of the two keys */
+}
+
+
+/*
+ * when syncookies are in effect and tcp timestamps are enabled we encode
+ * tcp options in the lower bits of the timestamp value that will be
+ * sent in the syn-ack.
+ * Since subsequent timestamps use the normal tcp_time_stamp value, we
+ * must make sure that the resulting initial timestamp is <= tcp_time_stamp.
+ */
+u64 cookie_init_timestamp(struct request_sock *req, u64 now)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	u64 ts_now = tcp_ns_to_ts(now);
+	u64 ts = ts_now & ~((1ULL << TSBITS) - 1);	/* low TSBITS cleared */
+	u32 opts;
+
+	opts = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK;
+	if (ireq->sack_ok)
+		opts |= TS_OPT_SACK;
+	if (ireq->ecn_ok)
+		opts |= TS_OPT_ECN;
+
+	ts |= opts;
+	if (ts > ts_now)	/* stay <= the clock: step back one 2^TSBITS slot */
+		ts -= (1UL << TSBITS);
+
+	return ts * (NSEC_PER_SEC / TCP_TS_HZ);
+}
+
+
+static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
+				   __be16 dport, __u32 sseq, __u32 data)
+{
+	/*
+	 * Compute the secure sequence number:
+	 *   HASH(key0, saddr, daddr, sport, dport, 0) + sseq + (count * 2^24)
+	 *      + ((HASH(key1, saddr, daddr, sport, dport, count) + data) % 2^24)
+	 * where sseq is the peer's sequence number and count increases every
+	 * minute by 1.
+	 * "data" is a small value (the MSS table index) folded into the
+	 * second hash before it is truncated, so the checker can recover it
+	 * by subtracting that hash again.
+	 */
+	u32 count = tcp_cookie_time();
+	u32 base = cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
+	u32 low = cookie_hash(saddr, daddr, sport, dport, count, 1) + data;
+
+	return base + (count << COOKIEBITS) + (low & COOKIEMASK);
+}
+
+/*
+ * This retrieves the small "data" value from the syncookie.
+ * If the syncookie is bad, the data returned will be out of
+ * range.  This must be checked by the caller.
+ *
+ * The count value used to generate the cookie must be less than
+ * MAX_SYNCOOKIE_AGE minutes in the past.
+ * The return value is (__u32)-1 if this test fails.
+ */
+static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
+				  __be16 sport, __be16 dport, __u32 sseq)
+{
+	u32 diff, count = tcp_cookie_time();
+
+	/* Strip away the layers from the cookie */
+	cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
+
+	/* Cookie is now reduced to (count * 2^24) + ((hash + data) % 2^24) */
+	diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
+	if (diff >= MAX_SYNCOOKIE_AGE)
+		return (__u32)-1;
+
+	return (cookie -
+		cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
+		& COOKIEMASK;	/* Leaving the data behind */
+}
+
+/*
+ * MSS Values are chosen based on the 2011 paper
+ * 'An Analysis of TCP Maximum Segment Sizes' by S. Alcock and R. Nelson.
+ * Values ..
+ *  .. lower than 536 are rare (< 0.2%)
+ *  .. between 537 and 1299 account for less than 1.5% of observed values
+ *  .. in the 1300-1349 range account for about 15 to 20% of observed mss values
+ *  .. exceeding 1460 are very rare (< 0.04%)
+ *
+ *  1460 is the single most frequently announced mss value (30 to 46% depending
+ *  on monitor location).  Table must be sorted.
+ */
+static __u16 const msstab[] = {
+	536,
+	1300,
+	1440,	/* 1440, 1452: PPPoE */
+	1460,
+};
+
+/*
+ * Generate a syncookie.  mssp points to the mss, which is returned
+ * rounded down to the value encoded in the cookie.
+ */
+u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
+			      u16 *mssp)
+{
+	const __u16 peer_mss = *mssp;
+	int idx = ARRAY_SIZE(msstab) - 1;
+
+	/* Largest table entry not above the peer's MSS; entry 0 is the floor. */
+	while (idx && peer_mss < msstab[idx])
+		idx--;
+	*mssp = msstab[idx];
+
+	return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
+				     th->source, th->dest, ntohl(th->seq),
+				     idx);
+}
+EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
+
+__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
+{
+	/* skb-based wrapper: hand the IPv4 and TCP headers of skb to
+	 * __cookie_v4_init_sequence(), which also rounds *mssp down.
+	 */
+	return __cookie_v4_init_sequence(ip_hdr(skb), tcp_hdr(skb), mssp);
+}
+
+/*
+ * Check if an ACK's acknowledged sequence number is a valid syncookie.
+ * Return the decoded mss if it is, or 0 if not.
+ */
+int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
+		      u32 cookie)
+{
+	__u32 seq = ntohl(th->seq) - 1;	/* the value the cookie was built from */
+	__u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
+					    th->source, th->dest, seq);
+
+	return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;	/* out of range: bad cookie */
+}
+EXPORT_SYMBOL_GPL(__cookie_v4_check);
+
+struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+				 struct request_sock *req,
+				 struct dst_entry *dst, u32 tsoff)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sock *child;
+	bool own_req;
+
+	child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
+						 NULL, &own_req);
+	if (child) {
+		refcount_set(&req->rsk_refcnt, 1);
+		tcp_sk(child)->tsoffset = tsoff;
+		sock_rps_save_rxhash(child, skb);
+
+		if (rsk_drop_req(req)) {
+			reqsk_put(req);	/* child is returned, the request is not queued */
+			return child;
+		}
+
+		if (inet_csk_reqsk_queue_add(sk, req, child))
+			return child;
+
+		bh_unlock_sock(child);	/* queueing failed: let go of the child */
+		sock_put(child);
+	}
+	__reqsk_free(req);	/* reached when no child was created or queued */
+
+	return NULL;
+}
+EXPORT_SYMBOL(tcp_get_cookie_sock);
+
+/*
+ * When syncookies are in effect and TCP timestamps are enabled, additional
+ * TCP options were stored in the low bits of the timestamp.
+ * This extracts these options from the timestamp echo.
+ *
+ * Returns false if we decode a TCP option that is disabled by the
+ * corresponding sysctl of @net, true otherwise.
+ */
+bool cookie_timestamp_decode(const struct net *net,
+			     struct tcp_options_received *tcp_opt)
+{
+	/* echoed timestamp, lowest bits contain options */
+	u32 options = tcp_opt->rcv_tsecr;
+
+	if (!tcp_opt->saw_tstamp)  {
+		tcp_clear_options(tcp_opt);	/* no timestamp, so no encoded options */
+		return true;
+	}
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps))
+		return false;
+
+	tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0;
+
+	if (tcp_opt->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack))
+		return false;
+
+	if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK)
+		return true; /* no window scaling */
+
+	tcp_opt->wscale_ok = 1;
+	tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK;
+
+	return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0;
+}
+EXPORT_SYMBOL(cookie_timestamp_decode);
+
+bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
+		   const struct net *net, const struct dst_entry *dst)
+{
+	/* ECN is only considered if the echoed timestamp has the ECN bit. */
+	if (!(tcp_opt->rcv_tsecr & TS_OPT_ECN))
+		return false;
+
+	/* Then either the netns sysctl or the route feature may allow it. */
+	if (READ_ONCE(net->ipv4.sysctl_tcp_ecn))
+		return true;
+
+	return dst_feature(dst, RTAX_FEATURE_ECN);
+}
+EXPORT_SYMBOL(cookie_ecn_ok);
+
+struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
+					    const struct tcp_request_sock_ops *af_ops,
+					    struct sock *sk,
+					    struct sk_buff *skb)
+{
+	struct tcp_request_sock *treq;
+	struct request_sock *req;
+
+	if (sk_is_mptcp(sk))	/* MPTCP listeners use their own allocator */
+		req = mptcp_subflow_reqsk_alloc(ops, sk, false);
+	else
+		req = inet_reqsk_alloc(ops, sk, false);
+
+	if (!req)
+		return NULL;
+
+	treq = tcp_rsk(req);
+
+	/* treq->af_specific might be used to perform TCP_MD5 lookup */
+	treq->af_specific = af_ops;
+
+	treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
+#if IS_ENABLED(CONFIG_MPTCP)
+	treq->is_mptcp = sk_is_mptcp(sk);
+	if (treq->is_mptcp) {
+		int err = mptcp_subflow_init_cookie_req(req, sk, skb);
+
+		if (err) {
+			reqsk_free(req);	/* MPTCP cookie setup failed: release the request */
+			return NULL;
+		}
+	}
+#endif
+
+	return req;
+}
+EXPORT_SYMBOL_GPL(cookie_tcp_reqsk_alloc);
+
+/* On input, sk is a listener.
+ * Output is the listener if the packet carries no usable cookie, the new
+ * child on success, or NULL if request, route or child setup failed.
+ */
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
+	struct tcp_options_received tcp_opt;
+	struct inet_request_sock *ireq;
+	struct tcp_request_sock *treq;
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcphdr *th = tcp_hdr(skb);
+	__u32 cookie = ntohl(th->ack_seq) - 1;
+	struct sock *ret = sk;
+	struct request_sock *req;
+	int full_space, mss;
+	struct rtable *rt;
+	__u8 rcv_wscale;
+	struct flowi4 fl4;
+	u32 tsoff = 0;
+
+	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) ||
+	    !th->ack || th->rst)
+		goto out;
+
+	if (tcp_synq_no_recent_overflow(sk))
+		goto out;
+
+	mss = __cookie_v4_check(ip_hdr(skb), th, cookie);
+	if (mss == 0) {
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+		goto out;
+	}
+
+	__NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
+
+	/* check for timestamp cookie support */
+	memset(&tcp_opt, 0, sizeof(tcp_opt));
+	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
+
+	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
+		tsoff = secure_tcp_ts_off(sock_net(sk),
+					  ip_hdr(skb)->daddr,
+					  ip_hdr(skb)->saddr);
+		tcp_opt.rcv_tsecr -= tsoff;	/* strip the offset before decoding the bits */
+	}
+
+	if (!cookie_timestamp_decode(sock_net(sk), &tcp_opt))
+		goto out;
+
+	ret = NULL;	/* past this point every failure returns NULL */
+	req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops,
+				     &tcp_request_sock_ipv4_ops, sk, skb);
+	if (!req)
+		goto out;
+
+	ireq = inet_rsk(req);
+	treq = tcp_rsk(req);
+	treq->rcv_isn		= ntohl(th->seq) - 1;
+	treq->snt_isn		= cookie;
+	treq->ts_off		= 0;
+	treq->txhash		= net_tx_rndhash();
+	req->mss		= mss;
+	ireq->ir_num		= ntohs(th->dest);
+	ireq->ir_rmt_port	= th->source;
+	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+	ireq->ir_mark		= inet_request_mark(sk, skb);
+	ireq->snd_wscale	= tcp_opt.snd_wscale;
+	ireq->sack_ok		= tcp_opt.sack_ok;
+	ireq->wscale_ok		= tcp_opt.wscale_ok;
+	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
+	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+	treq->snt_synack	= 0;
+	treq->tfo_listener	= false;
+
+	if (IS_ENABLED(CONFIG_SMC))
+		ireq->smc_ok = 0;
+
+	ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
+
+	/* We threw the options of the initial SYN away, so we hope
+	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
+	 */
+	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(sock_net(sk), skb));
+
+	if (security_inet_conn_request(sk, skb, req)) {
+		reqsk_free(req);
+		goto out;
+	}
+
+	req->num_retrans = 0;
+
+	/*
+	 * We need to lookup the route here to get at the correct
+	 * window size. We should better make sure that the window size
+	 * hasn't changed since we received the original syn, but I see
+	 * no easy way to do this.
+	 */
+	flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark,
+			   ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
+			   IPPROTO_TCP, inet_sk_flowi_flags(sk),
+			   opt->srr ? opt->faddr : ireq->ir_rmt_addr,
+			   ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
+	security_req_classify_flow(req, flowi4_to_flowi_common(&fl4));
+	rt = ip_route_output_key(sock_net(sk), &fl4);
+	if (IS_ERR(rt)) {
+		reqsk_free(req);
+		goto out;
+	}
+
+	/* Try to redo what tcp_v4_send_synack did. */
+	req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+	/* limit the window selection if the user enforces a smaller rx buffer */
+	full_space = tcp_full_space(sk);
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+	    (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+		req->rsk_window_clamp = full_space;
+
+	tcp_select_initial_window(sk, full_space, req->mss,
+				  &req->rsk_rcv_wnd, &req->rsk_window_clamp,
+				  ireq->wscale_ok, &rcv_wscale,
+				  dst_metric(&rt->dst, RTAX_INITRWND));
+
+	ireq->rcv_wscale  = rcv_wscale;
+	ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
+
+	ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst, tsoff);
+	/* ip_queue_xmit() depends on our flow being setup
+	 * Normal sockets get it right from inet_csk_route_child_sock()
+	 */
+	if (ret)
+		inet_sk(ret)->cork.fl.u.ip4 = fl4;
+out:	return ret;
+}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
new file mode 100644
index 0000000000..6ac890b407
--- /dev/null
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -0,0 +1,1573 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
+ */
+
+#include <linux/sysctl.h>
+#include <linux/seqlock.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ip_fib.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/cipso_ipv4.h>
+#include <net/ping.h>
+#include <net/protocol.h>
+#include <net/netevent.h>
+
+static int tcp_retr1_max = 255;
+static int ip_local_port_range_min[] = { 1, 1 };
+static int ip_local_port_range_max[] = { 65535, 65535 };
+static int tcp_adv_win_scale_min = -31;
+static int tcp_adv_win_scale_max = 31;
+static int tcp_app_win_max = 31;
+static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS;
+static int tcp_min_snd_mss_max = 65535;
+static int ip_privileged_port_min;
+static int ip_privileged_port_max = 65535;
+static int ip_ttl_min = 1;
+static int ip_ttl_max = 255;
+static int tcp_syn_retries_min = 1;
+static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
+static int tcp_syn_linear_timeouts_max = MAX_TCP_SYNCNT;
+static unsigned long ip_ping_group_range_min[] = { 0, 0 };
+static unsigned long ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+static u32 u32_max_div_HZ = UINT_MAX / HZ;
+static int one_day_secs = 24 * 3600;
+static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
+	FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
+static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
+static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
+static int tcp_plb_max_rounds = 31;
+static int tcp_plb_max_cong_thresh = 256;
+
+/* obsolete: only backs the "tcp_low_latency" entry of ipv4_table */
+static int sysctl_tcp_low_latency __read_mostly;
+
+/* Update system visible IP port range */
+static void set_local_port_range(struct net *net, int range[2])
+{
+	bool same_parity = !((range[0] ^ range[1]) & 1);	/* both odd or both even */
+
+	write_seqlock_bh(&net->ipv4.ip_local_ports.lock);
+	if (same_parity && !net->ipv4.ip_local_ports.warned) {
+		net->ipv4.ip_local_ports.warned = true;	/* complain once per netns */
+		pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n");
+	}
+	net->ipv4.ip_local_ports.range[0] = range[0];
+	net->ipv4.ip_local_ports.range[1] = range[1];
+	write_sequnlock_bh(&net->ipv4.ip_local_ports.lock);
+}
+
+/* Validate changes from /proc interface. */
+static int ipv4_local_port_range(struct ctl_table *table, int write,
+				 void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net =
+		container_of(table->data, struct net, ipv4.ip_local_ports.range);
+	int ret;
+	int range[2];
+	struct ctl_table tmp = {	/* scratch table: parse into range[] first */
+		.data = &range,
+		.maxlen = sizeof(range),
+		.mode = table->mode,
+		.extra1 = &ip_local_port_range_min,
+		.extra2 = &ip_local_port_range_max,
+	};
+
+	inet_get_local_port_range(net, &range[0], &range[1]);	/* current values, for reads */
+
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+	if (write && ret == 0) {
+		/* Ensure that the upper limit is not smaller than the lower,
+		 * and that the lower does not encroach upon the privileged
+		 * port limit.
+		 */
+		if ((range[1] < range[0]) ||
+		    (range[0] < READ_ONCE(net->ipv4.sysctl_ip_prot_sock)))
+			ret = -EINVAL;
+		else
+			set_local_port_range(net, range);
+	}
+
+	return ret;
+}
+
+/* Validate changes from /proc interface. */
+static int ipv4_privileged_ports(struct ctl_table *table, int write,
+				void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+	    ipv4.sysctl_ip_prot_sock);
+	int ret;
+	int pports;
+	int range[2];
+	struct ctl_table tmp = {	/* scratch table: parse into pports first */
+		.data = &pports,
+		.maxlen = sizeof(pports),
+		.mode = table->mode,
+		.extra1 = &ip_privileged_port_min,
+		.extra2 = &ip_privileged_port_max,
+	};
+
+	pports = READ_ONCE(net->ipv4.sysctl_ip_prot_sock);	/* current value, for reads */
+
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+	if (write && ret == 0) {
+		inet_get_local_port_range(net, &range[0], &range[1]);
+		/* Ensure that the local port range doesn't overlap with the
+		 * privileged port range.
+		 */
+		if (range[0] < pports)
+			ret = -EINVAL;
+		else
+			WRITE_ONCE(net->ipv4.sysctl_ip_prot_sock, pports);
+	}
+
+	return ret;
+}
+
+static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
+{
+	kgid_t *data = table->data;	/* the netns' two-element ping group range */
+	struct net *net =
+		container_of(table->data, struct net, ipv4.ping_group_range.range);
+	unsigned int seq;
+	do {	/* retry until both ends were read without a concurrent writer */
+		seq = read_seqbegin(&net->ipv4.ping_group_range.lock);
+
+		*low = data[0];
+		*high = data[1];
+	} while (read_seqretry(&net->ipv4.ping_group_range.lock, seq));
+}
+
+/* Update system visible ping group range */
+static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
+{
+	kgid_t *data = table->data;
+	struct net *net =
+		container_of(table->data, struct net, ipv4.ping_group_range.range);
+	write_seqlock(&net->ipv4.ping_group_range.lock);
+	data[0] = low;	/* both ends change under the same write section */
+	data[1] = high;
+	write_sequnlock(&net->ipv4.ping_group_range.lock);
+}
+
+/* Validate changes from /proc interface. */
+static int ipv4_ping_group_range(struct ctl_table *table, int write,
+				 void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	int ret;
+	unsigned long urange[2];
+	kgid_t low, high;
+	struct ctl_table tmp = {	/* scratch table: gids as seen from the caller's userns */
+		.data = &urange,
+		.maxlen = sizeof(urange),
+		.mode = table->mode,
+		.extra1 = &ip_ping_group_range_min,
+		.extra2 = &ip_ping_group_range_max,
+	};
+
+	inet_get_ping_group_range_table(table, &low, &high);
+	urange[0] = from_kgid_munged(user_ns, low);
+	urange[1] = from_kgid_munged(user_ns, high);
+	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+	if (write && ret == 0) {
+		low = make_kgid(user_ns, urange[0]);
+		high = make_kgid(user_ns, urange[1]);
+		if (!gid_valid(low) || !gid_valid(high))
+			return -EINVAL;
+		if (urange[1] < urange[0] || gid_lt(high, low)) {	/* reversed input */
+			low = make_kgid(&init_user_ns, 1);	/* store the fixed inverted pair (1, 0) */
+			high = make_kgid(&init_user_ns, 0);
+		}
+		set_ping_group_range(table, low, high);
+	}
+
+	return ret;
+}
+
+static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
+				    void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int err = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
+
+	/* Reads and failed writes change nothing, so there is no event. */
+	if (!write || err)
+		return err;
+
+	/* table->data lives inside struct net: recover the owning netns. */
+	call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE,
+				container_of(table->data, struct net,
+					     ipv4.sysctl_ip_fwd_update_priority));
+	return 0;
+}
+
+static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
+				       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(ctl->data, struct net,
+				       ipv4.tcp_congestion_control);
+	char val[TCP_CA_NAME_MAX];
+	struct ctl_table tbl = {	/* scratch table over the on-stack name buffer */
+		.data = val,
+		.maxlen = TCP_CA_NAME_MAX,
+	};
+	int ret;
+
+	tcp_get_default_congestion_control(net, val);	/* current name, for reads */
+
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+	if (write && ret == 0)
+		ret = tcp_set_default_congestion_control(net, val);
+	return ret;
+}
+
+static int proc_tcp_available_congestion_control(struct ctl_table *ctl,
+						 int write, void *buffer,
+						 size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
+	int ret;
+
+	tbl.data = kmalloc(tbl.maxlen, GFP_USER);	/* temporary buffer for the list */
+	if (!tbl.data)
+		return -ENOMEM;
+	tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+	kfree(tbl.data);	/* nothing is kept: a write only lands in this buffer */
+	return ret;
+}
+
+static int proc_allowed_congestion_control(struct ctl_table *ctl,
+					   int write, void *buffer,
+					   size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
+	int ret;
+
+	tbl.data = kmalloc(tbl.maxlen, GFP_USER);	/* temporary buffer for the list */
+	if (!tbl.data)
+		return -ENOMEM;
+
+	tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+	if (write && ret == 0)	/* apply the list that was just written */
+		ret = tcp_set_allowed_congestion_control(tbl.data);
+	kfree(tbl.data);
+	return ret;
+}
+
+static int sscanf_key(char *buf, __le32 *key)
+{
+	u32 user_key[4] = { 0 };	/* zeroed: the debug print runs even on failure */
+	int i, ret = 0;
+
+	/* Parse "x-x-x-x" (four hex words); key[] is written only on success. */
+	if (sscanf(buf, "%x-%x-%x-%x", user_key, user_key + 1,
+		   user_key + 2, user_key + 3) != 4)
+		ret = -EINVAL;
+	for (i = 0; !ret && i < ARRAY_SIZE(user_key); i++)
+		key[i] = cpu_to_le32(user_key[i]);
+
+	pr_debug("proc TFO key set 0x%x-%x-%x-%x <- 0x%s: %d\n",
+		 user_key[0], user_key[1], user_key[2], user_key[3], buf, ret);
+
+	return ret;
+}
+
+static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
+				 void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+	    ipv4.sysctl_tcp_fastopen);
+	/* maxlen to print the list of keys in hex (*2), with dashes
+	 * separating doublewords and a comma in between keys.
+	 */
+	struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH *
+					    2 * TCP_FASTOPEN_KEY_MAX) +
+					    (TCP_FASTOPEN_KEY_MAX * 5)) };
+	u32 user_key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u32)];
+	__le32 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(__le32)];
+	char *backup_data;
+	int ret, i = 0, off = 0, n_keys;
+
+	tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
+	if (!tbl.data)
+		return -ENOMEM;
+
+	n_keys = tcp_fastopen_get_cipher(net, NULL, (u64 *)key);
+	if (!n_keys) {	/* no key configured: show a single all-zero key */
+		memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH);
+		n_keys = 1;
+	}
+
+	for (i = 0; i < n_keys * 4; i++)	/* four 32-bit words per key */
+		user_key[i] = le32_to_cpu(key[i]);
+
+	for (i = 0; i < n_keys; i++) {
+		off += snprintf(tbl.data + off, tbl.maxlen - off,
+				"%08x-%08x-%08x-%08x",
+				user_key[i * 4],
+				user_key[i * 4 + 1],
+				user_key[i * 4 + 2],
+				user_key[i * 4 + 3]);
+
+		if (WARN_ON_ONCE(off >= tbl.maxlen - 1))
+			break;
+
+		if (i + 1 < n_keys)
+			off += snprintf(tbl.data + off, tbl.maxlen - off, ",");
+	}
+
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+
+	if (write && ret == 0) {
+		backup_data = strchr(tbl.data, ',');	/* optional second key after a comma */
+		if (backup_data) {
+			*backup_data = '\0';
+			backup_data++;
+		}
+		if (sscanf_key(tbl.data, key)) {
+			ret = -EINVAL;
+			goto bad_key;
+		}
+		if (backup_data) {
+			if (sscanf_key(backup_data, key + 4)) {
+				ret = -EINVAL;
+				goto bad_key;
+			}
+		}
+		tcp_fastopen_reset_cipher(net, NULL, key,
+					  backup_data ? key + 4 : NULL);
+	}
+
+bad_key:	/* also the normal exit: the buffer is freed on every path */
+	kfree(tbl.data);
+	return ret;
+}
+
+static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
+					     int write, void *buffer,
+					     size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+	    ipv4.sysctl_tcp_fastopen_blackhole_timeout);
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (write && ret == 0)	/* a successful write also resets the disable counter */
+		atomic_set(&net->ipv4.tfo_active_disable_times, 0);
+
+	return ret;
+}
+
+static int proc_tcp_available_ulp(struct ctl_table *ctl,
+				  int write, void *buffer, size_t *lenp,
+				  loff_t *ppos)
+{
+	struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, };
+	int ret;
+
+	tbl.data = kmalloc(tbl.maxlen, GFP_USER);	/* temporary buffer for the list */
+	if (!tbl.data)
+		return -ENOMEM;
+	tcp_get_available_ulp(tbl.data, TCP_ULP_BUF_MAX);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+	kfree(tbl.data);	/* nothing is kept: a write only lands in this buffer */
+
+	return ret;
+}
+
+static int proc_tcp_ehash_entries(struct ctl_table *table, int write,
+				  void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+				       ipv4.sysctl_tcp_child_ehash_entries);
+	struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
+	int entries = hinfo->ehash_mask + 1;
+	/* Throw-away table: proc_dointvec() only ever touches the local
+	 * copy of the size, never the hash table itself.
+	 */
+	struct ctl_table tbl = {
+		.data	= &entries,
+		.maxlen	= sizeof(int),
+	};
+
+	/* The size is reported negated when a child netns is
+	 * sharing the global ehash.
+	 */
+	if (!net_eq(net, &init_net) && !hinfo->pernet)
+		entries = -entries;
+
+	return proc_dointvec(&tbl, write, buffer, lenp, ppos);
+}
+
+static int proc_udp_hash_entries(struct ctl_table *table, int write,
+				 void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+				       ipv4.sysctl_udp_child_hash_entries);
+	int entries = net->ipv4.udp_table->mask + 1;
+	/* Throw-away table: proc_dointvec() only ever touches the local
+	 * copy of the size, never the UDP table itself.
+	 */
+	struct ctl_table tbl = {
+		.data	= &entries,
+		.maxlen	= sizeof(int),
+	};
+
+	/* The size is reported negated when a child netns is still
+	 * using the global udp_table.
+	 */
+	if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table)
+		entries = -entries;
+
+	return proc_dointvec(&tbl, write, buffer, lenp, ppos);
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
+					  void *buffer, size_t *lenp,
+					  loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+	    ipv4.sysctl_fib_multipath_hash_policy);
+	int ret;
+
+	ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
+	if (write && ret == 0)	/* notify listeners only after a successful write */
+		call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net);
+
+	return ret;
+}
+
+static int proc_fib_multipath_hash_fields(struct ctl_table *table, int write,
+					  void *buffer, size_t *lenp,
+					  loff_t *ppos)
+{
+	struct net *net;
+	int ret;
+
+	net = container_of(table->data, struct net,
+			   ipv4.sysctl_fib_multipath_hash_fields);
+	ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos);
+	if (write && ret == 0)	/* same event as a hash policy change */
+		call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net);
+
+	return ret;
+}
+#endif
+
+static struct ctl_table ipv4_table[] = {	/* registered for init_net only */
+	{
+		.procname	= "tcp_max_orphans",
+		.data		= &sysctl_tcp_max_orphans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "inet_peer_threshold",
+		.data		= &inet_peer_threshold,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "inet_peer_minttl",
+		.data		= &inet_peer_minttl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "inet_peer_maxttl",
+		.data		= &inet_peer_maxttl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "tcp_mem",
+		.maxlen		= sizeof(sysctl_tcp_mem),
+		.data		= &sysctl_tcp_mem,
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "tcp_low_latency",
+		.data		= &sysctl_tcp_low_latency,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#ifdef CONFIG_NETLABEL
+	{
+		.procname	= "cipso_cache_enable",
+		.data		= &cipso_v4_cache_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "cipso_cache_bucket_size",
+		.data		= &cipso_v4_cache_bucketsize,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "cipso_rbm_optfmt",
+		.data		= &cipso_v4_rbm_optfmt,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "cipso_rbm_strictvalid",
+		.data		= &cipso_v4_rbm_strictvalid,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif /* CONFIG_NETLABEL */
+	{
+		.procname	= "tcp_available_ulp",
+		.maxlen		= TCP_ULP_BUF_MAX,
+		.mode		= 0444,	/* read-only; no backing .data */
+		.proc_handler   = proc_tcp_available_ulp,
+	},
+	{
+		.procname	= "icmp_msgs_per_sec",
+		.data		= &sysctl_icmp_msgs_per_sec,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
+	{
+		.procname	= "icmp_msgs_burst",
+		.data		= &sysctl_icmp_msgs_burst,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
+	{
+		.procname	= "udp_mem",
+		.data		= &sysctl_udp_mem,
+		.maxlen		= sizeof(sysctl_udp_mem),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "fib_sync_mem",
+		.data		= &sysctl_fib_sync_mem,
+		.maxlen		= sizeof(sysctl_fib_sync_mem),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= &sysctl_fib_sync_mem_min,
+		.extra2		= &sysctl_fib_sync_mem_max,
+	},
+	{ }	/* empty terminating entry */
+};
+
+static struct ctl_table ipv4_net_table[] = {	/* per-netns; see ipv4_sysctl_init_net() */
+	{
+		.procname	= "tcp_max_tw_buckets",
+		.data		= &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "icmp_echo_ignore_all",
+		.data		= &init_net.ipv4.sysctl_icmp_echo_ignore_all,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
+	{
+		.procname	= "icmp_echo_enable_probe",
+		.data		= &init_net.ipv4.sysctl_icmp_echo_enable_probe,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
+	{
+		.procname	= "icmp_echo_ignore_broadcasts",
+		.data		= &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
+	{
+		.procname	= "icmp_ignore_bogus_error_responses",
+		.data		= &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
+	{
+		.procname	= "icmp_errors_use_inbound_ifaddr",
+		.data		= &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
+	{
+		.procname	= "icmp_ratelimit",
+		.data		= &init_net.ipv4.sysctl_icmp_ratelimit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
+	{
+		.procname	= "icmp_ratemask",
+		.data		= &init_net.ipv4.sysctl_icmp_ratemask,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "ping_group_range",
+		.data		= &init_net.ipv4.ping_group_range.range,
+		.maxlen		= sizeof(gid_t)*2,
+		.mode		= 0644,
+		.proc_handler	= ipv4_ping_group_range,
+	},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+	{
+		.procname	= "raw_l3mdev_accept",
+		.data		= &init_net.ipv4.sysctl_raw_l3mdev_accept,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif
+	{
+		.procname	= "tcp_ecn",
+		.data		= &init_net.ipv4.sysctl_tcp_ecn,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_TWO,
+	},
+	{
+		.procname	= "tcp_ecn_fallback",
+		.data		= &init_net.ipv4.sysctl_tcp_ecn_fallback,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "ip_dynaddr",
+		.data		= &init_net.ipv4.sysctl_ip_dynaddr,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "ip_early_demux",
+		.data		= &init_net.ipv4.sysctl_ip_early_demux,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname       = "udp_early_demux",
+		.data           = &init_net.ipv4.sysctl_udp_early_demux,
+		.maxlen         = sizeof(u8),
+		.mode           = 0644,
+		.proc_handler   = proc_dou8vec_minmax,
+	},
+	{
+		.procname       = "tcp_early_demux",
+		.data           = &init_net.ipv4.sysctl_tcp_early_demux,
+		.maxlen         = sizeof(u8),
+		.mode           = 0644,
+		.proc_handler   = proc_dou8vec_minmax,
+	},
+	{
+		.procname       = "nexthop_compat_mode",
+		.data           = &init_net.ipv4.sysctl_nexthop_compat_mode,
+		.maxlen         = sizeof(u8),
+		.mode           = 0644,
+		.proc_handler   = proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "ip_default_ttl",
+		.data		= &init_net.ipv4.sysctl_ip_default_ttl,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= &ip_ttl_min,
+		.extra2		= &ip_ttl_max,
+	},
+	{
+		.procname	= "ip_local_port_range",
+		.maxlen		= sizeof(init_net.ipv4.ip_local_ports.range),
+		.data		= &init_net.ipv4.ip_local_ports.range,
+		.mode		= 0644,
+		.proc_handler	= ipv4_local_port_range,
+	},
+	{
+		.procname	= "ip_local_reserved_ports",
+		.data		= &init_net.ipv4.sysctl_local_reserved_ports,
+		.maxlen		= 65536,	/* NOTE(review): looks like a bit count; 65536 / 8 bytes are allocated */
+		.mode		= 0644,
+		.proc_handler	= proc_do_large_bitmap,
+	},
+	{
+		.procname	= "ip_no_pmtu_disc",
+		.data		= &init_net.ipv4.sysctl_ip_no_pmtu_disc,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "ip_forward_use_pmtu",
+		.data		= &init_net.ipv4.sysctl_ip_fwd_use_pmtu,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "ip_forward_update_priority",
+		.data		= &init_net.ipv4.sysctl_ip_fwd_update_priority,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler   = ipv4_fwd_update_priority,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "ip_nonlocal_bind",
+		.data		= &init_net.ipv4.sysctl_ip_nonlocal_bind,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "ip_autobind_reuse",
+		.data		= &init_net.ipv4.sysctl_ip_autobind_reuse,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
+	},
+	{
+		.procname	= "fwmark_reflect",
+		.data		= &init_net.ipv4.sysctl_fwmark_reflect,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_fwmark_accept",
+		.data		= &init_net.ipv4.sysctl_tcp_fwmark_accept,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+	{
+		.procname	= "tcp_l3mdev_accept",
+		.data		= &init_net.ipv4.sysctl_tcp_l3mdev_accept,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif
+	{
+		.procname	= "tcp_mtu_probing",
+		.data		= &init_net.ipv4.sysctl_tcp_mtu_probing,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_base_mss",
+		.data		= &init_net.ipv4.sysctl_tcp_base_mss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_min_snd_mss",
+		.data		= &init_net.ipv4.sysctl_tcp_min_snd_mss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &tcp_min_snd_mss_min,
+		.extra2		= &tcp_min_snd_mss_max,
+	},
+	{
+		.procname	= "tcp_mtu_probe_floor",
+		.data		= &init_net.ipv4.sysctl_tcp_mtu_probe_floor,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &tcp_min_snd_mss_min,
+		.extra2		= &tcp_min_snd_mss_max,
+	},
+	{
+		.procname	= "tcp_probe_threshold",
+		.data		= &init_net.ipv4.sysctl_tcp_probe_threshold,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_probe_interval",
+		.data		= &init_net.ipv4.sysctl_tcp_probe_interval,
+		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra2		= &u32_max_div_HZ,
+	},
+	{
+		.procname	= "igmp_link_local_mcast_reports",
+		.data		= &init_net.ipv4.sysctl_igmp_llm_reports,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "igmp_max_memberships",
+		.data		= &init_net.ipv4.sysctl_igmp_max_memberships,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "igmp_max_msf",
+		.data		= &init_net.ipv4.sysctl_igmp_max_msf,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#ifdef CONFIG_IP_MULTICAST
+	{
+		.procname	= "igmp_qrv",
+		.data		= &init_net.ipv4.sysctl_igmp_qrv,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE
+	},
+#endif
+	{
+		.procname	= "tcp_congestion_control",
+		.data		= &init_net.ipv4.tcp_congestion_control,
+		.mode		= 0644,
+		.maxlen		= TCP_CA_NAME_MAX,
+		.proc_handler	= proc_tcp_congestion_control,
+	},
+	{
+		.procname	= "tcp_available_congestion_control",
+		.maxlen		= TCP_CA_BUF_MAX,
+		.mode		= 0444,
+		.proc_handler   = proc_tcp_available_congestion_control,
+	},
+	{
+		.procname	= "tcp_allowed_congestion_control",
+		.maxlen		= TCP_CA_BUF_MAX,
+		.mode		= 0644,	/* no .data: write bits are cleared in child netns */
+		.proc_handler   = proc_allowed_congestion_control,
+	},
+	{
+		.procname	= "tcp_keepalive_time",
+		.data		= &init_net.ipv4.sysctl_tcp_keepalive_time,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "tcp_keepalive_probes",
+		.data		= &init_net.ipv4.sysctl_tcp_keepalive_probes,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_keepalive_intvl",
+		.data		= &init_net.ipv4.sysctl_tcp_keepalive_intvl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "tcp_syn_retries",
+		.data		= &init_net.ipv4.sysctl_tcp_syn_retries,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= &tcp_syn_retries_min,
+		.extra2		= &tcp_syn_retries_max
+	},
+	{
+		.procname	= "tcp_synack_retries",
+		.data		= &init_net.ipv4.sysctl_tcp_synack_retries,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+#ifdef CONFIG_SYN_COOKIES
+	{
+		.procname	= "tcp_syncookies",
+		.data		= &init_net.ipv4.sysctl_tcp_syncookies,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+#endif
+	{
+		.procname	= "tcp_migrate_req",
+		.data		= &init_net.ipv4.sysctl_tcp_migrate_req,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
+	{
+		.procname	= "tcp_reordering",
+		.data		= &init_net.ipv4.sysctl_tcp_reordering,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_retries1",
+		.data		= &init_net.ipv4.sysctl_tcp_retries1,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra2		= &tcp_retr1_max
+	},
+	{
+		.procname	= "tcp_retries2",
+		.data		= &init_net.ipv4.sysctl_tcp_retries2,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_orphan_retries",
+		.data		= &init_net.ipv4.sysctl_tcp_orphan_retries,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_fin_timeout",
+		.data		= &init_net.ipv4.sysctl_tcp_fin_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "tcp_notsent_lowat",
+		.data		= &init_net.ipv4.sysctl_tcp_notsent_lowat,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec,
+	},
+	{
+		.procname	= "tcp_tw_reuse",
+		.data		= &init_net.ipv4.sysctl_tcp_tw_reuse,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_TWO,
+	},
+	{
+		.procname	= "tcp_max_syn_backlog",
+		.data		= &init_net.ipv4.sysctl_max_syn_backlog,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_fastopen",
+		.data		= &init_net.ipv4.sysctl_tcp_fastopen,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_fastopen_key",
+		.mode		= 0600,
+		.data		= &init_net.ipv4.sysctl_tcp_fastopen,	/* NOTE(review): same .data as tcp_fastopen; confirm intent */
+		/* maxlen to print the list of keys in hex (*2), with dashes
+		 * separating doublewords and a comma in between keys.
+		 */
+		.maxlen		= ((TCP_FASTOPEN_KEY_LENGTH *
+				   2 * TCP_FASTOPEN_KEY_MAX) +
+				   (TCP_FASTOPEN_KEY_MAX * 5)),
+		.proc_handler	= proc_tcp_fastopen_key,
+	},
+	{
+		.procname	= "tcp_fastopen_blackhole_timeout_sec",
+		.data		= &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_tfo_blackhole_detect_timeout,
+		.extra1		= SYSCTL_ZERO,
+	},
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	{
+		.procname	= "fib_multipath_use_neigh",
+		.data		= &init_net.ipv4.sysctl_fib_multipath_use_neigh,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "fib_multipath_hash_policy",
+		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_policy,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_fib_multipath_hash_policy,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_THREE,
+	},
+	{
+		.procname	= "fib_multipath_hash_fields",
+		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_fields,
+		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= proc_fib_multipath_hash_fields,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= &fib_multipath_hash_fields_all_mask,
+	},
+#endif
+	{
+		.procname	= "ip_unprivileged_port_start",
+		.maxlen		= sizeof(int),
+		.data		= &init_net.ipv4.sysctl_ip_prot_sock,
+		.mode		= 0644,
+		.proc_handler	= ipv4_privileged_ports,
+	},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+	{
+		.procname	= "udp_l3mdev_accept",
+		.data		= &init_net.ipv4.sysctl_udp_l3mdev_accept,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif
+	{
+		.procname	= "tcp_sack",
+		.data		= &init_net.ipv4.sysctl_tcp_sack,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_window_scaling",
+		.data		= &init_net.ipv4.sysctl_tcp_window_scaling,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_timestamps",
+		.data		= &init_net.ipv4.sysctl_tcp_timestamps,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_early_retrans",
+		.data		= &init_net.ipv4.sysctl_tcp_early_retrans,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_FOUR,
+	},
+	{
+		.procname	= "tcp_recovery",
+		.data		= &init_net.ipv4.sysctl_tcp_recovery,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname       = "tcp_thin_linear_timeouts",
+		.data           = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts,
+		.maxlen         = sizeof(u8),
+		.mode           = 0644,
+		.proc_handler   = proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_slow_start_after_idle",
+		.data		= &init_net.ipv4.sysctl_tcp_slow_start_after_idle,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_retrans_collapse",
+		.data		= &init_net.ipv4.sysctl_tcp_retrans_collapse,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_stdurg",
+		.data		= &init_net.ipv4.sysctl_tcp_stdurg,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_rfc1337",
+		.data		= &init_net.ipv4.sysctl_tcp_rfc1337,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_abort_on_overflow",
+		.data		= &init_net.ipv4.sysctl_tcp_abort_on_overflow,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_fack",
+		.data		= &init_net.ipv4.sysctl_tcp_fack,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_max_reordering",
+		.data		= &init_net.ipv4.sysctl_tcp_max_reordering,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_dsack",
+		.data		= &init_net.ipv4.sysctl_tcp_dsack,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_app_win",
+		.data		= &init_net.ipv4.sysctl_tcp_app_win,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &tcp_app_win_max,
+	},
+	{
+		.procname	= "tcp_adv_win_scale",
+		.data		= &init_net.ipv4.sysctl_tcp_adv_win_scale,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &tcp_adv_win_scale_min,
+		.extra2		= &tcp_adv_win_scale_max,
+	},
+	{
+		.procname	= "tcp_frto",
+		.data		= &init_net.ipv4.sysctl_tcp_frto,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_no_metrics_save",
+		.data		= &init_net.ipv4.sysctl_tcp_nometrics_save,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_no_ssthresh_metrics_save",
+		.data		= &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "tcp_moderate_rcvbuf",
+		.data		= &init_net.ipv4.sysctl_tcp_moderate_rcvbuf,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_tso_win_divisor",
+		.data		= &init_net.ipv4.sysctl_tcp_tso_win_divisor,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_workaround_signed_windows",
+		.data		= &init_net.ipv4.sysctl_tcp_workaround_signed_windows,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_limit_output_bytes",
+		.data		= &init_net.ipv4.sysctl_tcp_limit_output_bytes,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_challenge_ack_limit",
+		.data		= &init_net.ipv4.sysctl_tcp_challenge_ack_limit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_min_tso_segs",
+		.data		= &init_net.ipv4.sysctl_tcp_min_tso_segs,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "tcp_tso_rtt_log",
+		.data		= &init_net.ipv4.sysctl_tcp_tso_rtt_log,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+	},
+	{
+		.procname	= "tcp_min_rtt_wlen",
+		.data		= &init_net.ipv4.sysctl_tcp_min_rtt_wlen,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &one_day_secs
+	},
+	{
+		.procname	= "tcp_autocorking",
+		.data		= &init_net.ipv4.sysctl_tcp_autocorking,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "tcp_invalid_ratelimit",
+		.data		= &init_net.ipv4.sysctl_tcp_invalid_ratelimit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
+	{
+		.procname	= "tcp_pacing_ss_ratio",
+		.data		= &init_net.ipv4.sysctl_tcp_pacing_ss_ratio,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_THOUSAND,
+	},
+	{
+		.procname	= "tcp_pacing_ca_ratio",
+		.data		= &init_net.ipv4.sysctl_tcp_pacing_ca_ratio,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_THOUSAND,
+	},
+	{
+		.procname	= "tcp_wmem",
+		.data		= &init_net.ipv4.sysctl_tcp_wmem,
+		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_wmem),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "tcp_rmem",
+		.data		= &init_net.ipv4.sysctl_tcp_rmem,
+		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_rmem),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "tcp_comp_sack_delay_ns",
+		.data		= &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "tcp_comp_sack_slack_ns",
+		.data		= &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "tcp_comp_sack_nr",
+		.data		= &init_net.ipv4.sysctl_tcp_comp_sack_nr,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
+	{
+		.procname       = "tcp_reflect_tos",
+		.data           = &init_net.ipv4.sysctl_tcp_reflect_tos,
+		.maxlen         = sizeof(u8),
+		.mode           = 0644,
+		.proc_handler   = proc_dou8vec_minmax,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
+	},
+	{
+		.procname	= "tcp_ehash_entries",
+		.data		= &init_net.ipv4.sysctl_tcp_child_ehash_entries,
+		.mode		= 0444,	/* read-only; handler uses .data only to find struct net */
+		.proc_handler	= proc_tcp_ehash_entries,
+	},
+	{
+		.procname	= "tcp_child_ehash_entries",
+		.data		= &init_net.ipv4.sysctl_tcp_child_ehash_entries,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &tcp_child_ehash_entries_max,
+	},
+	{
+		.procname	= "udp_hash_entries",
+		.data		= &init_net.ipv4.sysctl_udp_child_hash_entries,
+		.mode		= 0444,	/* read-only; handler uses .data only to find struct net */
+		.proc_handler	= proc_udp_hash_entries,
+	},
+	{
+		.procname	= "udp_child_hash_entries",
+		.data		= &init_net.ipv4.sysctl_udp_child_hash_entries,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &udp_child_hash_entries_max,
+	},
+	{
+		.procname	= "udp_rmem_min",
+		.data		= &init_net.ipv4.sysctl_udp_rmem_min,
+		.maxlen		= sizeof(init_net.ipv4.sysctl_udp_rmem_min),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE
+	},
+	{
+		.procname	= "udp_wmem_min",
+		.data		= &init_net.ipv4.sysctl_udp_wmem_min,
+		.maxlen		= sizeof(init_net.ipv4.sysctl_udp_wmem_min),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE
+	},
+	{
+		.procname	= "fib_notify_on_flag_change",
+		.data		= &init_net.ipv4.sysctl_fib_notify_on_flag_change,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_TWO,
+	},
+	{
+		.procname       = "tcp_plb_enabled",
+		.data           = &init_net.ipv4.sysctl_tcp_plb_enabled,
+		.maxlen         = sizeof(u8),
+		.mode           = 0644,
+		.proc_handler   = proc_dou8vec_minmax,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
+	},
+	{
+		.procname       = "tcp_plb_idle_rehash_rounds",
+		.data           = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds,
+		.maxlen         = sizeof(u8),
+		.mode           = 0644,
+		.proc_handler   = proc_dou8vec_minmax,
+		.extra2		= &tcp_plb_max_rounds,
+	},
+	{
+		.procname       = "tcp_plb_rehash_rounds",
+		.data           = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds,
+		.maxlen         = sizeof(u8),
+		.mode           = 0644,
+		.proc_handler   = proc_dou8vec_minmax,
+		.extra2         = &tcp_plb_max_rounds,
+	},
+	{
+		.procname       = "tcp_plb_suspend_rto_sec",
+		.data           = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec,
+		.maxlen         = sizeof(u8),
+		.mode           = 0644,
+		.proc_handler   = proc_dou8vec_minmax,
+	},
+	{
+		.procname       = "tcp_plb_cong_thresh",
+		.data           = &init_net.ipv4.sysctl_tcp_plb_cong_thresh,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = &tcp_plb_max_cong_thresh,
+	},
+	{
+		.procname	= "tcp_syn_linear_timeouts",
+		.data		= &init_net.ipv4.sysctl_tcp_syn_linear_timeouts,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &tcp_syn_linear_timeouts_max,
+	},
+	{
+		.procname	= "tcp_shrink_window",
+		.data		= &init_net.ipv4.sysctl_tcp_shrink_window,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{ }	/* empty terminating entry; skipped by the fix-up loop in ipv4_sysctl_init_net() */
+};
+
+static __net_init int ipv4_sysctl_init_net(struct net *net)
+{
+	struct ctl_table *table = ipv4_net_table;
+
+	/* Child netns get a private, retargeted copy of the table. */
+	if (!net_eq(net, &init_net)) {
+		struct ctl_table *entry;
+
+		table = kmemdup(ipv4_net_table, sizeof(ipv4_net_table), GFP_KERNEL);
+		if (!table)
+			goto err_alloc;
+
+		for (entry = table;
+		     entry < table + ARRAY_SIZE(ipv4_net_table) - 1; entry++) {
+			/* No data pointer means the knob is global:
+			 * strip write permission outside init_net.
+			 */
+			if (!entry->data) {
+				entry->mode &= ~0222;
+				continue;
+			}
+
+			/* Rebase the init_net address onto this netns. */
+			entry->data += (void *)net - (void *)&init_net;
+		}
+	}
+
+	net->ipv4.ipv4_hdr = register_net_sysctl_sz(net, "net/ipv4", table,
+						    ARRAY_SIZE(ipv4_net_table));
+	if (!net->ipv4.ipv4_hdr)
+		goto err_reg;
+
+	net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+	if (!net->ipv4.sysctl_local_reserved_ports)
+		goto err_ports;
+
+	return 0;
+
+err_ports:
+	unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
+err_reg:
+	if (!net_eq(net, &init_net))
+		kfree(table);
+err_alloc:
+	return -ENOMEM;
+}
+
+static __net_exit void ipv4_sysctl_exit_net(struct net *net)
+{
+	/* Read the table pointer first; the header is unregistered below. */
+	struct ctl_table *tbl = net->ipv4.ipv4_hdr->ctl_table_arg;
+
+	kfree(net->ipv4.sysctl_local_reserved_ports);
+	unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
+	kfree(tbl);
+}
+
+static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
+	.init = ipv4_sysctl_init_net,	/* registers the netns' net/ipv4 table */
+	.exit = ipv4_sysctl_exit_net,	/* unregisters it and frees the table */
+};
+
+static __init int sysctl_ipv4_init(void)
+{
+	struct ctl_table_header *global_hdr;
+
+	/* Global knobs first, then the per-netns tables. */
+	global_hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
+	if (!global_hdr)
+		return -ENOMEM;
+
+	if (!register_pernet_subsys(&ipv4_sysctl_ops))
+		return 0;
+
+	unregister_net_sysctl_table(global_hdr);
+	return -ENOMEM;
+}
+
+__initcall(sysctl_ipv4_init);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
new file mode 100644
index 0000000000..fb417aee86
--- /dev/null
+++ b/net/ipv4/tcp.c
@@ -0,0 +1,4729 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ *
+ * Fixes:
+ *		Alan Cox	:	Numerous verify_area() calls
+ *		Alan Cox	:	Set the ACK bit on a reset
+ *		Alan Cox	:	Stopped it crashing if it closed while
+ *					sk->inuse=1 and was trying to connect
+ *					(tcp_err()).
+ *		Alan Cox	:	All icmp error handling was broken
+ *					pointers passed where wrong and the
+ *					socket was looked up backwards. Nobody
+ *					tested any icmp error code obviously.
+ *		Alan Cox	:	tcp_err() now handled properly. It
+ *					wakes people on errors. poll
+ *					behaves and the icmp error race
+ *					has gone by moving it into sock.c
+ *		Alan Cox	:	tcp_send_reset() fixed to work for
+ *					everything not just packets for
+ *					unknown sockets.
+ *		Alan Cox	:	tcp option processing.
+ *		Alan Cox	:	Reset tweaked (still not 100%) [Had
+ *					syn rule wrong]
+ *		Herp Rosmanith  :	More reset fixes
+ *		Alan Cox	:	No longer acks invalid rst frames.
+ *					Acking any kind of RST is right out.
+ *		Alan Cox	:	Sets an ignore me flag on an rst
+ *					receive otherwise odd bits of prattle
+ *					escape still
+ *		Alan Cox	:	Fixed another acking RST frame bug.
+ *					Should stop LAN workplace lockups.
+ *		Alan Cox	: 	Some tidyups using the new skb list
+ *					facilities
+ *		Alan Cox	:	sk->keepopen now seems to work
+ *		Alan Cox	:	Pulls options out correctly on accepts
+ *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
+ *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
+ *					bit to skb ops.
+ *		Alan Cox	:	Tidied tcp_data to avoid a potential
+ *					nasty.
+ *		Alan Cox	:	Added some better commenting, as the
+ *					tcp is hard to follow
+ *		Alan Cox	:	Removed incorrect check for 20 * psh
+ *	Michael O'Reilly	:	ack < copied bug fix.
+ *	Johannes Stille		:	Misc tcp fixes (not all in yet).
+ *		Alan Cox	:	FIN with no memory -> CRASH
+ *		Alan Cox	:	Added socket option proto entries.
+ *					Also added awareness of them to accept.
+ *		Alan Cox	:	Added TCP options (SOL_TCP)
+ *		Alan Cox	:	Switched wakeup calls to callbacks,
+ *					so the kernel can layer network
+ *					sockets.
+ *		Alan Cox	:	Use ip_tos/ip_ttl settings.
+ *		Alan Cox	:	Handle FIN (more) properly (we hope).
+ *		Alan Cox	:	RST frames sent on unsynchronised
+ *					state ack error.
+ *		Alan Cox	:	Put in missing check for SYN bit.
+ *		Alan Cox	:	Added tcp_select_window() aka NET2E
+ *					window non shrink trick.
+ *		Alan Cox	:	Added a couple of small NET2E timer
+ *					fixes
+ *		Charles Hedrick :	TCP fixes
+ *		Toomas Tamm	:	TCP window fixes
+ *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
+ *		Charles Hedrick	:	Rewrote most of it to actually work
+ *		Linus		:	Rewrote tcp_read() and URG handling
+ *					completely
+ *		Gerhard Koerting:	Fixed some missing timer handling
+ *		Matthew Dillon  :	Reworked TCP machine states as per RFC
+ *		Gerhard Koerting:	PC/TCP workarounds
+ *		Adam Caldwell	:	Assorted timer/timing errors
+ *		Matthew Dillon	:	Fixed another RST bug
+ *		Alan Cox	:	Move to kernel side addressing changes.
+ *		Alan Cox	:	Beginning work on TCP fastpathing
+ *					(not yet usable)
+ *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
+ *		Alan Cox	:	TCP fast path debugging
+ *		Alan Cox	:	Window clamping
+ *		Michael Riepe	:	Bug in tcp_check()
+ *		Matt Dillon	:	More TCP improvements and RST bug fixes
+ *		Matt Dillon	:	Yet more small nasties remove from the
+ *					TCP code (Be very nice to this man if
+ *					tcp finally works 100%) 8)
+ *		Alan Cox	:	BSD accept semantics.
+ *		Alan Cox	:	Reset on closedown bug.
+ *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
+ *		Michael Pall	:	Handle poll() after URG properly in
+ *					all cases.
+ *		Michael Pall	:	Undo the last fix in tcp_read_urg()
+ *					(multi URG PUSH broke rlogin).
+ *		Michael Pall	:	Fix the multi URG PUSH problem in
+ *					tcp_readable(), poll() after URG
+ *					works now.
+ *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
+ *					BSD api.
+ *		Alan Cox	:	Changed the semantics of sk->socket to
+ *					fix a race and a signal problem with
+ *					accept() and async I/O.
+ *		Alan Cox	:	Relaxed the rules on tcp_sendto().
+ *		Yury Shevchuk	:	Really fixed accept() blocking problem.
+ *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
+ *					clients/servers which listen in on
+ *					fixed ports.
+ *		Alan Cox	:	Cleaned the above up and shrank it to
+ *					a sensible code size.
+ *		Alan Cox	:	Self connect lockup fix.
+ *		Alan Cox	:	No connect to multicast.
+ *		Ross Biro	:	Close unaccepted children on master
+ *					socket close.
+ *		Alan Cox	:	Reset tracing code.
+ *		Alan Cox	:	Spurious resets on shutdown.
+ *		Alan Cox	:	Giant 15 minute/60 second timer error
+ *		Alan Cox	:	Small whoops in polling before an
+ *					accept.
+ *		Alan Cox	:	Kept the state trace facility since
+ *					it's handy for debugging.
+ *		Alan Cox	:	More reset handler fixes.
+ *		Alan Cox	:	Started rewriting the code based on
+ *					the RFC's for other useful protocol
+ *					references see: Comer, KA9Q NOS, and
+ *					for a reference on the difference
+ *					between specifications and how BSD
+ *					works see the 4.4lite source.
+ *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
+ *					close.
+ *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
+ *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
+ *		Alan Cox	:	Reimplemented timers as per the RFC
+ *					and using multiple timers for sanity.
+ *		Alan Cox	:	Small bug fixes, and a lot of new
+ *					comments.
+ *		Alan Cox	:	Fixed dual reader crash by locking
+ *					the buffers (much like datagram.c)
+ *		Alan Cox	:	Fixed stuck sockets in probe. A probe
+ *					now gets fed up of retrying without
+ *					(even a no space) answer.
+ *		Alan Cox	:	Extracted closing code better
+ *		Alan Cox	:	Fixed the closing state machine to
+ *					resemble the RFC.
+ *		Alan Cox	:	More 'per spec' fixes.
+ *		Jorge Cwik	:	Even faster checksumming.
+ *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
+ *					only frames. At least one pc tcp stack
+ *					generates them.
+ *		Alan Cox	:	Cache last socket.
+ *		Alan Cox	:	Per route irtt.
+ *		Matt Day	:	poll()->select() match BSD precisely on error
+ *		Alan Cox	:	New buffers
+ *		Marc Tamsky	:	Various sk->prot->retransmits and
+ *					sk->retransmits misupdating fixed.
+ *					Fixed tcp_write_timeout: stuck close,
+ *					and TCP syn retries gets used now.
+ *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
+ *					ack if state is TCP_CLOSED.
+ *		Alan Cox	:	Look up device on a retransmit - routes may
+ *					change. Doesn't yet cope with MSS shrink right
+ *					but it's a start!
+ *		Marc Tamsky	:	Closing in closing fixes.
+ *		Mike Shaver	:	RFC1122 verifications.
+ *		Alan Cox	:	rcv_saddr errors.
+ *		Alan Cox	:	Block double connect().
+ *		Alan Cox	:	Small hooks for enSKIP.
+ *		Alexey Kuznetsov:	Path MTU discovery.
+ *		Alan Cox	:	Support soft errors.
+ *		Alan Cox	:	Fix MTU discovery pathological case
+ *					when the remote claims no mtu!
+ *		Marc Tamsky	:	TCP_CLOSE fix.
+ *		Colin (G3TNE)	:	Send a reset on syn ack replies in
+ *					window but wrong (fixes NT lpd problems)
+ *		Pedro Roque	:	Better TCP window handling, delayed ack.
+ *		Joerg Reuter	:	No modification of locked buffers in
+ *					tcp_do_retransmit()
+ *		Eric Schenk	:	Changed receiver side silly window
+ *					avoidance algorithm to BSD style
+ *					algorithm. This doubles throughput
+ *					against machines running Solaris,
+ *					and seems to result in general
+ *					improvement.
+ *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
+ *	Willy Konynenberg	:	Transparent proxying support.
+ *	Mike McLagan		:	Routing by source
+ *		Keith Owens	:	Do proper merging with partial SKB's in
+ *					tcp_do_sendmsg to avoid burstiness.
+ *		Eric Schenk	:	Fix fast close down bug with
+ *					shutdown() followed by close().
+ *		Andi Kleen 	:	Make poll agree with SIGIO
+ *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
+ *					lingertime == 0 (RFC 793 ABORT Call)
+ *	Hirokazu Takahashi	:	Use copy_from_user() instead of
+ *					csum_and_copy_from_user() if possible.
+ *
+ * Description of States:
+ *
+ *	TCP_SYN_SENT		sent a connection request, waiting for ack
+ *
+ *	TCP_SYN_RECV		received a connection request, sent ack,
+ *				waiting for final ack in three-way handshake.
+ *
+ *	TCP_ESTABLISHED		connection established
+ *
+ *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
+ *				transmission of remaining buffered data
+ *
+ *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
+ *				to shutdown
+ *
+ *	TCP_CLOSING		both sides have shutdown but we still have
+ *				data we have to finish sending
+ *
+ *	TCP_TIME_WAIT		timeout to catch resent junk before entering
+ *				closed, can only be entered from FIN_WAIT2
+ *				or CLOSING.  Required because the other end
+ *				may not have gotten our last ACK causing it
+ *				to retransmit the data packet (which we ignore)
+ *
+ *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
+ *				us to finish writing our data and to shutdown
+ *				(we have to close() to move on to LAST_ACK)
+ *
+ *	TCP_LAST_ACK		our side has shutdown after remote has
+ *				shutdown.  There may still be data in our
+ *				buffer that we have to finish sending
+ *
+ *	TCP_CLOSE		socket is finished
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <crypto/hash.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/poll.h>
+#include <linux/inet_diag.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/skbuff.h>
+#include <linux/scatterlist.h>
+#include <linux/splice.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/random.h>
+#include <linux/memblock.h>
+#include <linux/highmem.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/errqueue.h>
+#include <linux/static_key.h>
+#include <linux/btf.h>
+
+#include <net/icmp.h>
+#include <net/inet_common.h>
+#include <net/tcp.h>
+#include <net/mptcp.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <linux/uaccess.h>
+#include <asm/ioctls.h>
+#include <net/busy_poll.h>
+
+/* Track pending CMSGs. */
+enum {
+	TCP_CMSG_INQ = 1,
+	TCP_CMSG_TS = 2
+};
+
+DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
+
+long sysctl_tcp_mem[3] __read_mostly;
+EXPORT_SYMBOL(sysctl_tcp_mem);
+
+atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;	/* Current allocated memory. */
+EXPORT_SYMBOL(tcp_memory_allocated);
+DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
+
+#if IS_ENABLED(CONFIG_SMC)
+DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
+EXPORT_SYMBOL(tcp_have_smc);
+#endif
+
+/*
+ * Current number of TCP sockets.
+ */
+struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
+EXPORT_SYMBOL(tcp_sockets_allocated);
+
+/*
+ * TCP splice context
+ */
+struct tcp_splice_state {
+	struct pipe_inode_info *pipe;
+	size_t len;
+	unsigned int flags;
+};
+
+/*
+ * Pressure flag: try to collapse.
+ * Technical note: it is used by multiple contexts non atomically.
+ * All the __sk_mem_schedule() is of this nature: accounting
+ * is strict, actions are advisory and have some latency.
+ */
+unsigned long tcp_memory_pressure __read_mostly;
+EXPORT_SYMBOL_GPL(tcp_memory_pressure);
+
+void tcp_enter_memory_pressure(struct sock *sk)
+{
+	unsigned long stamp;
+
+	if (READ_ONCE(tcp_memory_pressure))
+		return;
+
+	stamp = jiffies;
+	if (!stamp)
+		stamp = ~0UL;	/* 0 is reserved for "no pressure" */
+	if (!cmpxchg(&tcp_memory_pressure, 0, stamp))
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
+}
+EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
+
+void tcp_leave_memory_pressure(struct sock *sk)
+{
+	unsigned long since;
+
+	if (!READ_ONCE(tcp_memory_pressure))
+		return;
+	since = xchg(&tcp_memory_pressure, 0);
+	if (since)	/* account only if this call cleared the stamp */
+		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
+			      jiffies_to_msecs(jiffies - since));
+}
+EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
+
+/* Convert seconds to retransmits based on initial and max timeout:
+ * count the backed-off timeouts needed to cover @seconds (at most 255).
+ */
+static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
+{
+	int covered = timeout;
+	u8 count = 1;
+
+	if (seconds <= 0)
+		return 0;
+	while (seconds > covered && count < 255) {
+		count++;
+		timeout <<= 1;
+		if (timeout > rto_max)
+			timeout = rto_max;
+		covered += timeout;
+	}
+	return count;
+}
+
+/* Sum the timeouts of @retrans retransmits, doubling up to rto_max */
+static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
+{
+	int total = timeout;
+	u8 left;
+
+	if (!retrans)
+		return 0;
+	for (left = retrans - 1; left; left--) {
+		timeout <<= 1;
+		if (timeout > rto_max)
+			timeout = rto_max;
+		total += timeout;
+	}
+	return total;
+}
+
+static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
+{
+	u32 delivered = READ_ONCE(tp->rate_delivered);
+	u32 interval_us = READ_ONCE(tp->rate_interval_us);
+	u64 rate64;
+
+	if (!delivered || !interval_us)
+		return 0;	/* no usable sample: avoid dividing by zero */
+	rate64 = (u64)delivered * tp->mss_cache * USEC_PER_SEC;
+	do_div(rate64, interval_us);
+	return rate64;
+}
+
+/* Address-family independent initialization for a tcp_sock.
+ *
+ * NOTE: A lot of things are set to zero explicitly by the call to
+ *       sk_alloc(), so they need not be done here.
+ */
+void tcp_init_sock(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->out_of_order_queue = RB_ROOT;
+	sk->tcp_rtx_queue = RB_ROOT;
+	tcp_init_xmit_timers(sk);
+	INIT_LIST_HEAD(&tp->tsq_node);
+	INIT_LIST_HEAD(&tp->tsorted_sent_queue);
+
+	icsk->icsk_rto = TCP_TIMEOUT_INIT;
+	icsk->icsk_rto_min = TCP_RTO_MIN;
+	icsk->icsk_delack_max = TCP_DELACK_MAX;
+	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
+
+	/* So many TCP implementations out there (incorrectly) count the
+	 * initial SYN frame in their delayed-ACK and congestion control
+	 * algorithms that we must have the following bandaid to talk
+	 * efficiently to them.  -DaveM
+	 */
+	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
+
+	/* There's a bubble in the pipe until at least the first ACK. */
+	tp->app_limited = ~0U;
+	tp->rate_app_limited = 1;
+
+	/* See draft-stevens-tcpca-spec-01 for discussion of the
+	 * initialization of these values.
+	 */
+	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	tp->snd_cwnd_clamp = ~0;
+	tp->mss_cache = TCP_MSS_DEFAULT;
+
+	tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
+	tcp_assign_congestion_control(sk);
+
+	tp->tsoffset = 0;
+	tp->rack.reo_wnd_steps = 1;
+
+	sk->sk_write_space = sk_stream_write_space;
+	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+	icsk->icsk_sync_mss = tcp_sync_mss;
+
+	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
+	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
+	tcp_scaling_ratio_init(sk);
+
+	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+	sk_sockets_allocated_inc(sk);
+}
+EXPORT_SYMBOL(tcp_init_sock);
+
+/* Apply tsflags to the skb at the tail of the write queue, if any. */
+static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
+{
+	struct sk_buff *tail = tcp_write_queue_tail(sk);
+	struct skb_shared_info *shinfo;
+
+	if (!tsflags || !tail)
+		return;
+	shinfo = skb_shinfo(tail);
+	sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
+	if (tsflags & SOF_TIMESTAMPING_TX_ACK)
+		TCP_SKB_CB(tail)->txstamp_ack = 1;
+	if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
+		shinfo->tskey = TCP_SKB_CB(tail)->seq + tail->len - 1;
+}
+
+static bool tcp_stream_is_readable(struct sock *sk, int target)
+{
+	/* sk_is_readable() is only consulted when the epollin test fails. */
+	return tcp_epollin_ready(sk, target) ||
+	       sk_is_readable(sk);
+}
+
+/*
+ *	Wait for a TCP event.
+ *
+ *	Note that we don't need to lock the socket, as the upper poll layers
+ *	take care of normal races (between the test and the event) and we don't
+ *	go look at any of the socket buffers directly.
+ */
+__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+	__poll_t mask;
+	struct sock *sk = sock->sk;
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u8 shutdown;
+	int state;
+
+	sock_poll_wait(file, sock, wait);
+
+	state = inet_sk_state_load(sk);
+	if (state == TCP_LISTEN)
+		return inet_csk_listen_poll(sk);
+
+	/* Socket is not locked. We are protected from async events
+	 * by poll logic and correct handling of state changes
+	 * made by other threads is impossible in any case.
+	 */
+
+	mask = 0;
+
+	/*
+	 * EPOLLHUP is certainly not done right. But poll() doesn't
+	 * have a notion of HUP in just one direction, and for a
+	 * socket the read side is more interesting.
+	 *
+	 * Some poll() documentation says that EPOLLHUP is incompatible
+	 * with the EPOLLOUT/POLLWR flags, so somebody should check this
+	 * all. But careful, it tends to be safer to return too many
+	 * bits than too few, and you can easily break real applications
+	 * if you don't tell them that something has hung up!
+	 *
+	 * Check-me.
+	 *
+	 * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
+	 * our fs/select.c). It means that after we received EOF,
+	 * poll always returns immediately, making impossible poll() on write()
+	 * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
+	 * if and only if shutdown has been made in both directions.
+	 * Actually, it is interesting to look how Solaris and DUX
+	 * solve this dilemma. I would prefer, if EPOLLHUP were maskable,
+	 * then we could set it on SND_SHUTDOWN. BTW examples given
+	 * in Stevens' books assume exactly this behaviour, it explains
+	 * why EPOLLHUP is incompatible with EPOLLOUT.	--ANK
+	 *
+	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
+	 * blocking on fresh not-connected or disconnected socket. --ANK
+	 */
+	shutdown = READ_ONCE(sk->sk_shutdown);
+	if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
+		mask |= EPOLLHUP;
+	if (shutdown & RCV_SHUTDOWN)
+		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+
+	/* Connected or passive Fast Open socket? */
+	if (state != TCP_SYN_SENT &&
+	    (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
+		int target = sock_rcvlowat(sk, 0, INT_MAX);
+		u16 urg_data = READ_ONCE(tp->urg_data);
+
+		if (unlikely(urg_data) &&
+		    READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
+		    !sock_flag(sk, SOCK_URGINLINE))
+			target++;
+
+		if (tcp_stream_is_readable(sk, target))
+			mask |= EPOLLIN | EPOLLRDNORM;
+
+		if (!(shutdown & SEND_SHUTDOWN)) {
+			if (__sk_stream_is_writeable(sk, 1)) {
+				mask |= EPOLLOUT | EPOLLWRNORM;
+			} else {  /* send SIGIO later */
+				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+				/* Race breaker. If space is freed after
+				 * wspace test but before the flags are set,
+				 * IO signal will be lost. Memory barrier
+				 * pairs with the input side.
+				 */
+				smp_mb__after_atomic();
+				if (__sk_stream_is_writeable(sk, 1))
+					mask |= EPOLLOUT | EPOLLWRNORM;
+			}
+		} else
+			mask |= EPOLLOUT | EPOLLWRNORM;
+
+		if (urg_data & TCP_URG_VALID)
+			mask |= EPOLLPRI;
+	} else if (state == TCP_SYN_SENT &&
+		   inet_test_bit(DEFER_CONNECT, sk)) {
+		/* Active TCP fastopen socket with defer_connect
+		 * Return EPOLLOUT so application can call write()
+		 * in order for kernel to generate SYN+data
+		 */
+		mask |= EPOLLOUT | EPOLLWRNORM;
+	}
+	/* This barrier is coupled with smp_wmb() in tcp_reset() */
+	smp_rmb();
+	if (READ_ONCE(sk->sk_err) ||
+	    !skb_queue_empty_lockless(&sk->sk_error_queue))
+		mask |= EPOLLERR;
+
+	return mask;
+}
+EXPORT_SYMBOL(tcp_poll);
+
+int tcp_ioctl(struct sock *sk, int cmd, int *karg)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int answ;
+	bool slow;
+
+	switch (cmd) {
+	case SIOCINQ:
+		if (sk->sk_state == TCP_LISTEN)
+			return -EINVAL;
+
+		slow = lock_sock_fast(sk);
+		answ = tcp_inq(sk);
+		unlock_sock_fast(sk, slow);
+		break;
+	case SIOCATMARK:
+		answ = READ_ONCE(tp->urg_data) &&
+		       READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
+		break;
+	case SIOCOUTQ:
+		if (sk->sk_state == TCP_LISTEN)
+			return -EINVAL;
+
+		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+			answ = 0;
+		else
+			answ = READ_ONCE(tp->write_seq) - tp->snd_una;
+		break;
+	case SIOCOUTQNSD:
+		if (sk->sk_state == TCP_LISTEN)
+			return -EINVAL;
+
+		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+			answ = 0;
+		else
+			answ = READ_ONCE(tp->write_seq) -
+			       READ_ONCE(tp->snd_nxt);
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	*karg = answ;
+	return 0;
+}
+EXPORT_SYMBOL(tcp_ioctl);
+
+void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+	tp->pushed_seq = tp->write_seq;
+}
+
+static inline bool forced_push(const struct tcp_sock *tp)
+{
+	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
+}
+
+void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_skb_cb *cb = TCP_SKB_CB(skb);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* The skb starts out empty: seq == end_seq == write_seq. */
+	cb->seq = cb->end_seq = tp->write_seq;
+	cb->tcp_flags = TCPHDR_ACK;
+	__skb_header_release(skb);
+	tcp_add_write_queue_tail(sk, skb);
+	sk_wmem_queued_add(sk, skb->truesize);
+	sk_mem_charge(sk, skb->truesize);
+	if (tp->nonagle & TCP_NAGLE_PUSH)
+		tp->nonagle &= ~TCP_NAGLE_PUSH;
+	tcp_slow_start_after_idle_check(sk);
+}
+
+static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
+{
+	if (flags & MSG_OOB)
+		tp->snd_up = tp->write_seq;
+}
+
+/* If a not yet filled skb is pushed, do not send it if
+ * we have data packets in Qdisc or NIC queues :
+ * Because TX completion will happen shortly, it gives a chance
+ * to coalesce future sendmsg() payload into this skb, without
+ * need for a timer, and with no latency trade off.
+ * As packets containing data payload have a bigger truesize
+ * than pure acks (dataless) packets, the last checks prevent
+ * autocorking if we only have an ACK in Qdisc/NIC queues,
+ * or if TX completion was delayed after we processed ACK packet.
+ */
+static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
+				int size_goal)
+{
+	return skb->len < size_goal &&
+	       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
+	       !tcp_rtx_queue_empty(sk) &&
+	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
+	       tcp_skb_can_collapse_to(skb);
+}
+
+void tcp_push(struct sock *sk, int flags, int mss_now,
+	      int nonagle, int size_goal)
+{
+	struct sk_buff *tail = tcp_write_queue_tail(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Nothing queued, nothing to push. */
+	if (!tail)
+		return;
+	if (!(flags & MSG_MORE) || forced_push(tp))
+		tcp_mark_push(tp, tail);
+
+	tcp_mark_urg(tp, flags);
+
+	if (tcp_should_autocork(sk, tail, size_goal)) {
+		/* avoid atomic op if TSQ_THROTTLED bit is already set */
+		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
+			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+			smp_mb__after_atomic();
+		}
+		/* It is possible TX completion already happened
+		 * before we set TSQ_THROTTLED: recheck, and only
+		 * defer the push if the condition still holds.
+		 */
+		if (refcount_read(&sk->sk_wmem_alloc) > tail->truesize)
+			return;
+	}
+
+	if (flags & MSG_MORE)
+		nonagle = TCP_NAGLE_CORK;
+
+	__tcp_push_pending_frames(sk, mss_now, nonagle);
+}
+
+static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+				unsigned int offset, size_t len)
+{
+	struct tcp_splice_state *tss = rd_desc->arg.data;
+	size_t chunk = min(rd_desc->count, len);
+	int n;
+
+	n = skb_splice_bits(skb, skb->sk, offset, tss->pipe, chunk, tss->flags);
+	if (n > 0)
+		rd_desc->count -= n;
+	return n;
+}
+
+static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
+{
+	/* Store TCP splice context information in read_descriptor_t. */
+	read_descriptor_t rd_desc = {
+		.arg.data = tss,
+		.count	  = tss->len,
+	};
+
+	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
+}
+
+/**
+ *  tcp_splice_read - splice data from TCP socket to a pipe
+ * @sock:	socket to splice from
+ * @ppos:	position (not valid)
+ * @pipe:	pipe to splice to
+ * @len:	number of bytes to splice
+ * @flags:	splice modifier flags
+ *
+ * Description:
+ *    Will read pages from given socket and fill them into a pipe.
+ *
+ **/
+ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
+			struct pipe_inode_info *pipe, size_t len,
+			unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	struct tcp_splice_state tss = {
+		.pipe = pipe,
+		.len = len,
+		.flags = flags,
+	};
+	long timeo;
+	ssize_t spliced;
+	int ret;
+
+	sock_rps_record_flow(sk);
+	/*
+	 * We can't seek on a socket input
+	 */
+	if (unlikely(*ppos))
+		return -ESPIPE;
+
+	ret = spliced = 0;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
+	while (tss.len) {
+		ret = __tcp_splice_read(sk, &tss);
+		if (ret < 0)
+			break;
+		else if (!ret) {
+			if (spliced)
+				break;
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				ret = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+			if (sk->sk_state == TCP_CLOSE) {
+				/*
+				 * This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				ret = -ENOTCONN;
+				break;
+			}
+			if (!timeo) {
+				ret = -EAGAIN;
+				break;
+			}
+			/* if __tcp_splice_read() got nothing while we have
+			 * an skb in receive queue, we do not want to loop.
+			 * This might happen with URG data.
+			 */
+			if (!skb_queue_empty(&sk->sk_receive_queue))
+				break;
+			ret = sk_wait_data(sk, &timeo, NULL);
+			if (ret < 0)
+				break;
+			if (signal_pending(current)) {
+				ret = sock_intr_errno(timeo);
+				break;
+			}
+			continue;
+		}
+		tss.len -= ret;
+		spliced += ret;
+
+		if (!tss.len || !timeo)
+			break;
+		release_sock(sk);
+		lock_sock(sk);
+
+		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    signal_pending(current))
+			break;
+	}
+
+	release_sock(sk);
+
+	if (spliced)
+		return spliced;
+
+	return ret;
+}
+EXPORT_SYMBOL(tcp_splice_read);
+
+struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
+				     bool force_schedule)
+{
+	struct sk_buff *skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
+	bool charged = true;
+
+	if (unlikely(!skb)) {
+		/* Allocation failed: enter memory pressure, moderate sndbuf. */
+		sk->sk_prot->enter_memory_pressure(sk);
+		sk_stream_moderate_sndbuf(sk);
+		return NULL;
+	}
+
+	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+	if (force_schedule)
+		sk_forced_mem_schedule(sk, skb->truesize);
+	else
+		charged = sk_wmem_schedule(sk, skb->truesize);
+
+	if (unlikely(!charged)) {
+		__kfree_skb(skb);
+		return NULL;
+	}
+
+	skb_reserve(skb, MAX_TCP_HEADER);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
+	return skb;
+}
+
+static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
+				       int large_allowed)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 new_size_goal, size_goal;
+
+	if (!large_allowed)
+		return mss_now;
+
+	/* Note: tcp_tso_autosize() will eventually split this later */
+	new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);
+
+	/* Avoid divides: reuse cached tp->gso_segs unless it went stale */
+	size_goal = tp->gso_segs * mss_now;
+	if (unlikely(new_size_goal < size_goal ||
+		     new_size_goal >= size_goal + mss_now)) {
+		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
+				     sk->sk_gso_max_segs);
+		size_goal = tp->gso_segs * mss_now;
+	}
+
+	return max(size_goal, mss_now);
+}
+
+int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+{
+	/* With MSG_OOB the size goal is just the MSS (no large goal). */
+	int cur_mss = tcp_current_mss(sk);
+	int large_ok = !(flags & MSG_OOB);
+
+	*size_goal = tcp_xmit_size_goal(sk, cur_mss, large_ok);
+	return cur_mss;
+}
+
+/* In some cases, sendmsg() could have added an skb to the write queue,
+ * but failed adding payload on it. We need to remove it to consume less
+ * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger
+ * epoll() users. Another reason is that tcp_write_xmit() does not like
+ * finding an empty skb in the write queue.
+ */
+void tcp_remove_empty_skb(struct sock *sk)
+{
+	struct sk_buff *tail = tcp_write_queue_tail(sk);
+
+	if (!tail || TCP_SKB_CB(tail)->seq != TCP_SKB_CB(tail)->end_seq)
+		return;
+	tcp_unlink_write_queue(tail, sk);
+	if (tcp_write_queue_empty(sk))
+		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+	tcp_wmem_free_skb(sk, tail);
+}
+
+/* An skb going from pure zerocopy to mixed must have its zc part charged. */
+static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
+{
+	u32 extra;
+
+	if (likely(!skb_zcopy_pure(skb)))
+		return 0;
+
+	extra = skb->truesize - SKB_TRUESIZE(skb_end_offset(skb));
+	if (!sk_wmem_schedule(sk, extra))
+		return -ENOMEM;
+	sk_mem_charge(sk, extra);
+	skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
+	return 0;
+}
+
+
+int tcp_wmem_schedule(struct sock *sk, int copy)
+{
+	int left;
+
+	if (likely(sk_wmem_schedule(sk, copy)))
+		return copy;
+
+	/* We could be in trouble if we have nothing queued.
+	 * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
+	 * (a sysctl, hence READ_ONCE()) to guarantee some progress.
+	 */
+	left = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]) - sk->sk_wmem_queued;
+	if (left > 0)
+		sk_forced_mem_schedule(sk, min(left, copy));
+	return min(copy, sk->sk_forward_alloc);
+}
+
+void tcp_free_fastopen_req(struct tcp_sock *tp)
+{
+	if (!tp->fastopen_req)
+		return;
+	kfree(tp->fastopen_req);
+	tp->fastopen_req = NULL;
+}
+
+int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
+			 size_t size, struct ubuf_info *uarg)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	struct sockaddr *uaddr = msg->msg_name;
+	int err, flags;
+
+	if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
+	      TFO_CLIENT_ENABLE) ||
+	    (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
+	     uaddr->sa_family == AF_UNSPEC))
+		return -EOPNOTSUPP;
+	if (tp->fastopen_req)
+		return -EALREADY; /* Another Fast Open is in progress */
+
+	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
+				   sk->sk_allocation);
+	if (unlikely(!tp->fastopen_req))
+		return -ENOBUFS;
+	tp->fastopen_req->data = msg;
+	tp->fastopen_req->size = size;
+	tp->fastopen_req->uarg = uarg;
+
+	if (inet_test_bit(DEFER_CONNECT, sk)) {
+		err = tcp_connect(sk);
+		/* Same failure procedure as in tcp_v4/6_connect */
+		if (err) {
+			tcp_set_state(sk, TCP_CLOSE);
+			inet->inet_dport = 0;
+			sk->sk_route_caps = 0;
+		}
+	}
+	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
+	err = __inet_stream_connect(sk->sk_socket, uaddr,
+				    msg->msg_namelen, flags, 1);
+	/* fastopen_req could already be freed in __inet_stream_connect
+	 * if the connection times out or gets rst
+	 */
+	if (tp->fastopen_req) {
+		*copied = tp->fastopen_req->copied;
+		tcp_free_fastopen_req(tp);
+		inet_clear_bit(DEFER_CONNECT, sk);
+	}
+	return err;
+}
+
+int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
+{ /* Core of sendmsg(): append @msg's data to the write queue; returns bytes accepted or -errno. */
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct ubuf_info *uarg = NULL;
+	struct sk_buff *skb;
+	struct sockcm_cookie sockc;
+	int flags, err, copied = 0;
+	int mss_now = 0, size_goal, copied_syn = 0;
+	int process_backlog = 0; /* skbs allocated since the last backlog flush */
+	int zc = 0; /* 0 (plain copy), MSG_ZEROCOPY or MSG_SPLICE_PAGES */
+	long timeo;
+
+	flags = msg->msg_flags;
+
+	if ((flags & MSG_ZEROCOPY) && size) {
+		if (msg->msg_ubuf) {
+			uarg = msg->msg_ubuf;
+			if (sk->sk_route_caps & NETIF_F_SG)
+				zc = MSG_ZEROCOPY;
+		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+			skb = tcp_write_queue_tail(sk);
+			uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
+			if (!uarg) {
+				err = -ENOBUFS;
+				goto out_err;
+			}
+			if (sk->sk_route_caps & NETIF_F_SG)
+				zc = MSG_ZEROCOPY;
+			else
+				uarg_to_msgzc(uarg)->zerocopy = 0;
+		}
+	} else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
+		if (sk->sk_route_caps & NETIF_F_SG)
+			zc = MSG_SPLICE_PAGES;
+	}
+
+	if (unlikely(flags & MSG_FASTOPEN ||
+		     inet_test_bit(DEFER_CONNECT, sk)) &&
+	    !tp->repair) {
+		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
+		if (err == -EINPROGRESS && copied_syn > 0) /* connect pending, but Fast Open took data */
+			goto out;
+		else if (err)
+			goto out_err;
+	}
+
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */
+
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
+		err = sk_stream_wait_connect(sk, &timeo);
+		if (err != 0)
+			goto do_error;
+	}
+
+	if (unlikely(tp->repair)) {
+		if (tp->repair_queue == TCP_RECV_QUEUE) {
+			copied = tcp_send_rcvq(sk, msg, size);
+			goto out_nopush;
+		}
+
+		err = -EINVAL;
+		if (tp->repair_queue == TCP_NO_QUEUE)
+			goto out_err;
+
+		/* 'common' sending to sendq */
+	}
+
+	sockcm_init(&sockc, sk);
+	if (msg->msg_controllen) {
+		err = sock_cmsg_send(sk, msg, &sockc);
+		if (unlikely(err)) {
+			err = -EINVAL; /* any cmsg parse failure is reported as -EINVAL */
+			goto out_err;
+		}
+	}
+
+	/* This should be in poll */
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+	/* Ok commence sending. */
+	copied = 0;
+
+restart:
+	mss_now = tcp_send_mss(sk, &size_goal, flags);
+
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+		goto do_error;
+
+	while (msg_data_left(msg)) {
+		ssize_t copy = 0; /* room left in the tail skb, then bytes taken this round */
+
+		skb = tcp_write_queue_tail(sk);
+		if (skb)
+			copy = size_goal - skb->len;
+
+		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
+			bool first_skb;
+
+new_segment:
+			if (!sk_stream_memory_free(sk))
+				goto wait_for_space;
+
+			if (unlikely(process_backlog >= 16)) {
+				process_backlog = 0;
+				if (sk_flush_backlog(sk))
+					goto restart;
+			}
+			first_skb = tcp_rtx_and_write_queues_empty(sk);
+			skb = tcp_stream_alloc_skb(sk, sk->sk_allocation,
+						   first_skb);
+			if (!skb)
+				goto wait_for_space;
+
+			process_backlog++;
+
+			tcp_skb_entail(sk, skb);
+			copy = size_goal;
+
+			/* All packets are restored as if they have
+			 * already been sent. skb_mstamp_ns isn't set to
+			 * avoid wrong rtt estimation.
+			 */
+			if (tp->repair)
+				TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
+		}
+
+		/* Try to append data to the end of skb. */
+		if (copy > msg_data_left(msg))
+			copy = msg_data_left(msg);
+
+		if (zc == 0) {
+			bool merge = true;
+			int i = skb_shinfo(skb)->nr_frags;
+			struct page_frag *pfrag = sk_page_frag(sk);
+
+			if (!sk_page_frag_refill(sk, pfrag))
+				goto wait_for_space;
+
+			if (!skb_can_coalesce(skb, i, pfrag->page,
+					      pfrag->offset)) {
+				if (i >= READ_ONCE(sysctl_max_skb_frags)) {
+					tcp_mark_push(tp, skb);
+					goto new_segment;
+				}
+				merge = false;
+			}
+
+			copy = min_t(int, copy, pfrag->size - pfrag->offset);
+
+			if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) {
+				if (tcp_downgrade_zcopy_pure(sk, skb))
+					goto wait_for_space;
+				skb_zcopy_downgrade_managed(skb);
+			}
+
+			copy = tcp_wmem_schedule(sk, copy);
+			if (!copy)
+				goto wait_for_space;
+
+			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+						       pfrag->page,
+						       pfrag->offset,
+						       copy);
+			if (err)
+				goto do_error;
+
+			/* Update the skb. */
+			if (merge) {
+				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+			} else {
+				skb_fill_page_desc(skb, i, pfrag->page,
+						   pfrag->offset, copy);
+				page_ref_inc(pfrag->page);
+			}
+			pfrag->offset += copy;
+		} else if (zc == MSG_ZEROCOPY)  {
+			/* First append to a fragless skb builds initial
+			 * pure zerocopy skb
+			 */
+			if (!skb->len)
+				skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
+
+			if (!skb_zcopy_pure(skb)) {
+				copy = tcp_wmem_schedule(sk, copy);
+				if (!copy)
+					goto wait_for_space;
+			}
+
+			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
+			if (err == -EMSGSIZE || err == -EEXIST) {
+				tcp_mark_push(tp, skb);
+				goto new_segment;
+			}
+			if (err < 0)
+				goto do_error;
+			copy = err; /* non-negative return is the byte count added */
+		} else if (zc == MSG_SPLICE_PAGES) {
+			/* Splice in data if we can; copy if we can't. */
+			if (tcp_downgrade_zcopy_pure(sk, skb))
+				goto wait_for_space;
+			copy = tcp_wmem_schedule(sk, copy);
+			if (!copy)
+				goto wait_for_space;
+
+			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
+						   sk->sk_allocation);
+			if (err < 0) {
+				if (err == -EMSGSIZE) {
+					tcp_mark_push(tp, skb);
+					goto new_segment;
+				}
+				goto do_error;
+			}
+			copy = err; /* non-negative return is the byte count added */
+
+			if (!(flags & MSG_NO_SHARED_FRAGS))
+				skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
+
+			sk_wmem_queued_add(sk, copy);
+			sk_mem_charge(sk, copy);
+		}
+
+		if (!copied) /* first chunk accepted by this call */
+			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+
+		WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
+		TCP_SKB_CB(skb)->end_seq += copy;
+		tcp_skb_pcount_set(skb, 0);
+
+		copied += copy;
+		if (!msg_data_left(msg)) {
+			if (unlikely(flags & MSG_EOR))
+				TCP_SKB_CB(skb)->eor = 1;
+			goto out;
+		}
+
+		if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
+			continue;
+
+		if (forced_push(tp)) {
+			tcp_mark_push(tp, skb);
+			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+		} else if (skb == tcp_send_head(sk))
+			tcp_push_one(sk, mss_now);
+		continue;
+
+wait_for_space:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		tcp_remove_empty_skb(sk);
+		if (copied)
+			tcp_push(sk, flags & ~MSG_MORE, mss_now,
+				 TCP_NAGLE_PUSH, size_goal);
+
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err != 0)
+			goto do_error;
+
+		mss_now = tcp_send_mss(sk, &size_goal, flags); /* recomputed after the wait */
+	}
+
+out:
+	if (copied) {
+		tcp_tx_timestamp(sk, sockc.tsflags);
+		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+	}
+out_nopush:
+	/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
+	if (uarg && !msg->msg_ubuf)
+		net_zcopy_put(uarg);
+	return copied + copied_syn;
+
+do_error:
+	tcp_remove_empty_skb(sk);
+
+	if (copied + copied_syn) /* partial success: report the bytes instead of the error */
+		goto out;
+out_err:
+	/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
+	if (uarg && !msg->msg_ubuf)
+		net_zcopy_put_abort(uarg, true);
+	err = sk_stream_error(sk, flags, err);
+	/* make sure we wake any epoll edge trigger waiter */
+	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
+		sk->sk_write_space(sk);
+		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+	}
+	return err;
+}
+EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
+
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{ /* Take the socket lock and hand off to tcp_sendmsg_locked(). */
+	int ret; /* whatever tcp_sendmsg_locked() returned */
+
+	lock_sock(sk);
+	ret = tcp_sendmsg_locked(sk, msg, size);
+	release_sock(sk);
+
+	return ret;
+}
+EXPORT_SYMBOL(tcp_sendmsg);
+
+void tcp_splice_eof(struct socket *sock)
+{ /* If the write queue is non-empty, lock the socket and tcp_push() with no MSG flags. */
+	struct sock *sk = sock->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now, size_goal;
+
+	if (!tcp_write_queue_tail(sk)) /* checked before the socket lock is taken */
+		return;
+
+	lock_sock(sk);
+	mss_now = tcp_send_mss(sk, &size_goal, 0);
+	tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
+	release_sock(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_splice_eof);
+
+/*
+ *	Handle reading urgent data. BSD has very simple semantics for
+ *	this: no blocking, and very strange errors 8)
+ */
+
+static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
+{ /* Returns 1 (byte delivered), 0, or a negative errno; never sleeps. */
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* No URG data to read. */
+	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
+	    tp->urg_data == TCP_URG_READ)
+		return -EINVAL;	/* Yes this is right ! */
+
+	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
+		return -ENOTCONN;
+
+	if (tp->urg_data & TCP_URG_VALID) {
+		int err = 0;
+		char c = tp->urg_data; /* truncation to char keeps the low byte */
+
+		if (!(flags & MSG_PEEK)) /* a peek leaves the byte pending */
+			WRITE_ONCE(tp->urg_data, TCP_URG_READ);
+
+		/* Read urgent data. */
+		msg->msg_flags |= MSG_OOB;
+
+		if (len > 0) {
+			if (!(flags & MSG_TRUNC)) /* with MSG_TRUNC nothing is copied, 1 is still returned */
+				err = memcpy_to_msg(msg, &c, 1);
+			len = 1;
+		} else
+			msg->msg_flags |= MSG_TRUNC;
+
+		return err ? -EFAULT : len;
+	}
+
+	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
+		return 0;
+
+	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
+	 * the available implementations agree in this case:
+	 * this call should never block, independent of the
+	 * blocking state of the socket.
+	 * Mike <pall@rz.uni-karlsruhe.de>
+	 */
+	return -EAGAIN;
+}
+
+static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
+{ /* Copy every skb of the rtx queue, then of the write queue, into @msg. */
+	struct sk_buff *skb;
+	int total = 0;
+	int rc;
+
+	/* XXX -- need to support SO_PEEK_OFF */
+	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+		rc = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+		if (rc)
+			return rc;
+		total += skb->len;
+	}
+
+	skb_queue_walk(&sk->sk_write_queue, skb) {
+		rc = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+		if (rc)
+			return rc;
+		total += skb->len;
+	}
+
+	/* @len is not consulted: each skb is copied in full. */
+	return total;
+}
+
+/* Clean up the receive buffer for full frames taken by the user,
+ * then send an ACK if necessary.  COPIED is the number of bytes
+ * tcp_recvmsg has given to the user so far, it speeds up the
+ * calculation of whether or not we must ACK for the sake of
+ * a window update.
+ */
+void __tcp_cleanup_rbuf(struct sock *sk, int copied)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	bool time_to_ack = false; /* set by either check below; one tcp_send_ack() at the end */
+
+	if (inet_csk_ack_scheduled(sk)) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+
+		if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
+		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
+		    /*
+		     * If this read emptied read buffer, we send ACK, if
+		     * connection is not bidirectional, user drained
+		     * receive buffer and there was a small segment
+		     * in queue.
+		     */
+		    (copied > 0 &&
+		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
+		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+		       !inet_csk_in_pingpong_mode(sk))) &&
+		      !atomic_read(&sk->sk_rmem_alloc)))
+			time_to_ack = true;
+	}
+
+	/* We send an ACK if we can now advertise a non-zero window
+	 * which has been raised "significantly".
+	 *
+	 * Even if window raised up to infinity, do not send window open ACK
+	 * in states, where we will not receive more. It is useless.
+	 */
+	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+		__u32 rcv_window_now = tcp_receive_window(tp);
+
+		/* Optimize, __tcp_select_window() is not cheap. */
+		if (2*rcv_window_now <= tp->window_clamp) {
+			__u32 new_window = __tcp_select_window(sk);
+
+			/* Send ACK now, if this read freed lots of space
+			 * in our buffer. Certainly, new_window is new window.
+			 * We can advertise it now, if it is not less than current one.
+			 * "Lots" means "at least twice" here.
+			 */
+			if (new_window && new_window >= 2 * rcv_window_now)
+				time_to_ack = true;
+		}
+	}
+	if (time_to_ack)
+		tcp_send_ack(sk);
+}
+
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
+{ /* Sanity-check copied_seq against the head skb, then run __tcp_cleanup_rbuf(). */
+	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), /* head skb already fully read */
+	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
+	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
+	__tcp_cleanup_rbuf(sk, copied);
+}
+
+static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{ /* Unlink @skb from sk's receive queue and free it. */
+	__skb_unlink(skb, &sk->sk_receive_queue);
+	if (unlikely(skb->destructor != sock_rfree))
+		return __kfree_skb(skb);
+
+	sock_rfree(skb);	/* run the destructor by hand, then detach it */
+	skb->destructor = NULL;
+	skb->sk = NULL;
+	skb_attempt_defer_free(skb);
+}
+
+struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+{ /* Return the head skb that covers @seq (offset in *off) or carries FIN; NULL if the queue runs out. */
+	struct sk_buff *skb;
+	u32 offset;
+
+	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+		offset = seq - TCP_SKB_CB(skb)->seq;
+		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+			pr_err_once("%s: found a SYN, please report !\n", __func__);
+			offset--;
+		}
+		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
+			*off = offset; /* for a FIN skb this may be >= skb->len */
+			return skb;
+		}
+		/* This looks weird, but this can happen if TCP collapsing
+		 * split a fat GRO packet while we released the socket lock
+		 * in skb_splice_bits()
+		 */
+		tcp_eat_recv_skb(sk, skb);
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(tcp_recv_skb);
+
+/*
+ * This routine provides an alternative to tcp_recvmsg() for routines
+ * that would like to handle copying from skbuffs directly in 'sendfile'
+ * fashion.
+ * Note:
+ *	- It is assumed that the socket was locked by the caller.
+ *	- The routine does not block.
+ *	- At present, there is no support for reading OOB data
+ *	  or for 'peeking' the socket using this routine
+ *	  (although both would be easy to implement).
+ */
+int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+		  sk_read_actor_t recv_actor)
+{ /* Returns bytes consumed by @recv_actor, or its first non-positive result if nothing was consumed. */
+	struct sk_buff *skb;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 seq = tp->copied_seq; /* local cursor; written back to tp->copied_seq below */
+	u32 offset;
+	int copied = 0;
+
+	if (sk->sk_state == TCP_LISTEN)
+		return -ENOTCONN;
+	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+		if (offset < skb->len) {
+			int used;
+			size_t len;
+
+			len = skb->len - offset;
+			/* Stop reading if we hit a patch of urgent data */
+			if (unlikely(tp->urg_data)) {
+				u32 urg_offset = tp->urg_seq - seq;
+				if (urg_offset < len)
+					len = urg_offset;
+				if (!len)
+					break;
+			}
+			used = recv_actor(desc, skb, offset, len);
+			if (used <= 0) {
+				if (!copied)
+					copied = used;
+				break;
+			}
+			if (WARN_ON_ONCE(used > len)) /* actor claimed more than it was offered */
+				used = len;
+			seq += used;
+			copied += used;
+			offset += used;
+
+			/* If recv_actor drops the lock (e.g. TCP splice
+			 * receive) the skb pointer might be invalid when
+			 * getting here: tcp_collapse might have deleted it
+			 * while aggregating skbs from the socket queue.
+			 */
+			skb = tcp_recv_skb(sk, seq - 1, &offset); /* look up the last consumed byte */
+			if (!skb)
+				break;
+			/* TCP coalescing might have appended data to the skb.
+			 * Try to splice more frags
+			 */
+			if (offset + 1 != skb->len)
+				continue;
+		}
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+			tcp_eat_recv_skb(sk, skb);
+			++seq; /* the FIN advances the sequence by one */
+			break;
+		}
+		tcp_eat_recv_skb(sk, skb);
+		if (!desc->count)
+			break;
+		WRITE_ONCE(tp->copied_seq, seq);
+	}
+	WRITE_ONCE(tp->copied_seq, seq);
+
+	tcp_rcv_space_adjust(sk);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	if (copied > 0) {
+		tcp_recv_skb(sk, seq, &offset); /* result unused; the call frees fully-read skbs */
+		tcp_cleanup_rbuf(sk, copied);
+	}
+	return copied;
+}
+EXPORT_SYMBOL(tcp_read_sock);
+
+int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
+{ /* Dequeue skbs one at a time into @recv_actor; returns the summed results, or the first error if nothing was read. */
+	struct sk_buff *skb;
+	int copied = 0;
+
+	if (sk->sk_state == TCP_LISTEN)
+		return -ENOTCONN;
+
+	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+		u8 tcp_flags;
+		int used;
+
+		__skb_unlink(skb, &sk->sk_receive_queue);
+		WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
+		tcp_flags = TCP_SKB_CB(skb)->tcp_flags; /* saved before the actor gets the skb */
+		used = recv_actor(sk, skb);
+		if (used < 0) {
+			if (!copied)
+				copied = used;
+			break;
+		}
+		copied += used;
+
+		if (tcp_flags & TCPHDR_FIN)
+			break;
+	}
+	return copied;
+}
+EXPORT_SYMBOL(tcp_read_skb);
+
+void tcp_read_done(struct sock *sk, size_t len)
+{ /* Advance copied_seq by up to @len bytes, freeing skbs that become fully consumed. */
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 seq = tp->copied_seq;
+	struct sk_buff *skb;
+	size_t left; /* bytes of @len not yet accounted for */
+	u32 offset;
+
+	if (sk->sk_state == TCP_LISTEN)
+		return;
+
+	left = len;
+	while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+		int used;
+
+		used = min_t(size_t, skb->len - offset, left);
+		seq += used;
+		left -= used;
+
+		if (skb->len > offset + used) /* skb only partly consumed: keep it queued */
+			break;
+
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+			tcp_eat_recv_skb(sk, skb);
+			++seq; /* the FIN advances the sequence by one */
+			break;
+		}
+		tcp_eat_recv_skb(sk, skb);
+	}
+	WRITE_ONCE(tp->copied_seq, seq);
+
+	tcp_rcv_space_adjust(sk);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	if (left != len)
+		tcp_cleanup_rbuf(sk, len - left);
+}
+EXPORT_SYMBOL(tcp_read_done);
+
+int tcp_peek_len(struct socket *sock)
+{ /* Thin wrapper: returns tcp_inq() of the socket's sk. */
+	return tcp_inq(sock->sk);
+}
+EXPORT_SYMBOL(tcp_peek_len);
+
+/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
+int tcp_set_rcvlowat(struct sock *sk, int val)
+{ /* Always returns 0; @val is capped to half the applicable receive-buffer limit. */
+	int space, cap;
+
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
+		cap = sk->sk_rcvbuf >> 1;
+	else
+		cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
+	val = min(val, cap);
+	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); /* never store 0 */
+
+	/* Check if we need to signal EPOLLIN right now */
+	tcp_data_ready(sk);
+
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) /* buffer size is locked: leave it alone */
+		return 0;
+
+	space = tcp_space_from_win(sk, val);
+	if (space > sk->sk_rcvbuf) {
+		WRITE_ONCE(sk->sk_rcvbuf, space);
+		WRITE_ONCE(tcp_sk(sk)->window_clamp, val); /* annotated like sk_rcvbuf above */
+	}
+	return 0;
+}
+EXPORT_SYMBOL(tcp_set_rcvlowat);
+
+void tcp_update_recv_tstamps(struct sk_buff *skb,
+			     struct scm_timestamping_internal *tss)
+{
+	struct timespec64 unset = {0};
+
+	/* ts[0] mirrors skb->tstamp; it is zeroed when the skb has none. */
+	tss->ts[0] = skb->tstamp ? ktime_to_timespec64(skb->tstamp) : unset;
+
+	/* ts[2] mirrors the skb's hwtstamp; it is zeroed when absent. */
+	tss->ts[2] = skb_hwtstamps(skb)->hwtstamp ?
+		     ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp) :
+		     unset;
+}
+
+#ifdef CONFIG_MMU
+static const struct vm_operations_struct tcp_vm_ops = { /* no callbacks set */
+};
+
+int tcp_mmap(struct file *file, struct socket *sock,
+	     struct vm_area_struct *vma)
+{ /* Accept only non-writable, non-executable mappings and tag the VMA with tcp_vm_ops. */
+	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+		return -EPERM;
+	vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC); /* and drop the MAY* bits too */
+
+	/* Instruct vm_insert_page() to not mmap_read_lock(mm) */
+	vm_flags_set(vma, VM_MIXEDMAP);
+
+	vma->vm_ops = &tcp_vm_ops;
+	return 0;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
+static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
+				       u32 *offset_frag)
+{ /* Find the page frag holding byte @offset_skb; *offset_frag gets the offset inside it. */
+	skb_frag_t *frag;
+
+	if (unlikely(offset_skb >= skb->len)) /* past the end of the skb */
+		return NULL;
+
+	offset_skb -= skb_headlen(skb);
+	if ((int)offset_skb < 0 || skb_has_frag_list(skb)) /* offset is in the linear head, or skb has a frag list */
+		return NULL;
+
+	frag = skb_shinfo(skb)->frags;
+	while (offset_skb) {
+		if (skb_frag_size(frag) > offset_skb) {
+			*offset_frag = offset_skb; /* lands part-way into this frag */
+			return frag;
+		}
+		offset_skb -= skb_frag_size(frag);
+		++frag;
+	}
+	*offset_frag = 0; /* lands exactly on a frag boundary */
+	return frag;
+}
+
+static bool can_map_frag(const skb_frag_t *frag)
+{ /* Only a whole, offset-0, non-compound page with no ->mapping is mappable. */
+	return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag) &&
+	       !PageCompound(skb_frag_page(frag)) && !skb_frag_page(frag)->mapping;
+}
+static int find_next_mappable_frag(const skb_frag_t *frag,
+				   int remaining_in_skb)
+{ /* Bytes to skip from @frag before a mappable frag, or until @remaining_in_skb is covered. */
+	int skipped;
+
+	if (likely(can_map_frag(frag)))
+		return 0;
+	for (skipped = 0; skipped < remaining_in_skb; frag++) {
+		if (can_map_frag(frag))
+			break;
+		skipped += skb_frag_size(frag);
+	}
+	return skipped;
+}
+
+static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
+					  struct tcp_zerocopy_receive *zc,
+					  struct sk_buff *skb, u32 offset)
+{ /* Set zc->recv_skip_hint for @skb starting at byte @offset. */
+	u32 frag_offset, partial_frag_remainder = 0;
+	int mappable_offset;
+	skb_frag_t *frag;
+
+	/* worst case: skip to next skb. try to improve on this case below */
+	zc->recv_skip_hint = skb->len - offset;
+
+	/* Find the frag containing this offset (and how far into that frag) */
+	frag = skb_advance_to_frag(skb, offset, &frag_offset);
+	if (!frag) /* keep the worst-case hint */
+		return;
+
+	if (frag_offset) {
+		struct skb_shared_info *info = skb_shinfo(skb);
+
+		/* We read part of the last frag, must recvmsg() rest of skb. */
+		if (frag == &info->frags[info->nr_frags - 1])
+			return;
+
+		/* Else, we must at least read the remainder in this frag. */
+		partial_frag_remainder = skb_frag_size(frag) - frag_offset;
+		zc->recv_skip_hint -= partial_frag_remainder;
+		++frag;
+	}
+
+	/* partial_frag_remainder: If part way through a frag, must read rest.
+	 * mappable_offset: Bytes till next mappable frag, *not* counting bytes
+	 * in partial_frag_remainder.
+	 */
+	mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
+	zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
+}
+
+static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
+			      int flags, struct scm_timestamping_internal *tss,
+			      int *cmsg_flags);
+static int receive_fallback_to_copy(struct sock *sk,
+				    struct tcp_zerocopy_receive *zc, int inq,
+				    struct scm_timestamping_internal *tss)
+{ /* Copy @inq bytes to zc->copybuf_address through tcp_recvmsg_locked(); 0 or -errno. */
+	unsigned long copy_address = (unsigned long)zc->copybuf_address;
+	struct msghdr msg = {};
+	struct iovec iov;
+	int err;
+
+	zc->length = 0;
+	zc->recv_skip_hint = 0;
+
+	if (copy_address != zc->copybuf_address) /* value changed by the cast to unsigned long */
+		return -EINVAL;
+
+	err = import_single_range(ITER_DEST, (void __user *)copy_address,
+				  inq, &iov, &msg.msg_iter);
+	if (err)
+		return err;
+
+	err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT,
+				 tss, &zc->msg_flags);
+	if (err < 0)
+		return err;
+
+	zc->copybuf_len = err; /* non-negative return is the byte count copied */
+	if (likely(zc->copybuf_len)) {
+		struct sk_buff *skb;
+		u32 offset;
+
+		skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
+		if (skb)
+			tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
+	}
+	return 0;
+}
+
+static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
+				   struct sk_buff *skb, u32 copylen,
+				   u32 *offset, u32 *seq)
+{ /* Copy @copylen bytes of @skb at *offset to zc->copybuf_address; returns @copylen or -errno. */
+	unsigned long copy_address = (unsigned long)zc->copybuf_address;
+	struct msghdr msg = {};
+	struct iovec iov;
+	int err;
+
+	if (copy_address != zc->copybuf_address) /* value changed by the cast to unsigned long */
+		return -EINVAL;
+
+	err = import_single_range(ITER_DEST, (void __user *)copy_address,
+				  copylen, &iov, &msg.msg_iter);
+	if (err)
+		return err;
+	err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
+	if (err)
+		return err;
+	zc->recv_skip_hint -= copylen; /* bookkeeping is touched only after a successful copy */
+	*offset += copylen;
+	*seq += copylen;
+	return (__s32)copylen;
+}
+
+static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
+				  struct sock *sk,
+				  struct sk_buff *skb,
+				  u32 *seq,
+				  s32 copybuf_len,
+				  struct scm_timestamping_internal *tss)
+{ /* Copy min(copybuf_len, zc->recv_skip_hint) bytes; returns bytes copied, 0 on failure. */
+	u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
+
+	if (!copylen)
+		return 0;
+	/* skb is null if inq < PAGE_SIZE. */
+	if (skb) {
+		offset = *seq - TCP_SKB_CB(skb)->seq;
+	} else {
+		skb = tcp_recv_skb(sk, *seq, &offset); /* NOTE(review): result is used unchecked; confirm it cannot be NULL here */
+		if (TCP_SKB_CB(skb)->has_rxtstamp) {
+			tcp_update_recv_tstamps(skb, tss);
+			zc->msg_flags |= TCP_CMSG_TS;
+		}
+	}
+
+	zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
+						  seq);
+	return zc->copybuf_len < 0 ? 0 : copylen; /* a negative errno stays in zc->copybuf_len */
+}
+
+static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
+					      struct page **pending_pages,
+					      unsigned long pages_remaining,
+					      unsigned long *address,
+					      u32 *length,
+					      u32 *seq,
+					      struct tcp_zerocopy_receive *zc,
+					      u32 total_bytes_to_map,
+					      int err)
+{ /* Recover from a failed vm_insert_pages(): maybe zap and retry, else roll back *length and the hint. */
+	/* At least one page did not map. Try zapping if we skipped earlier. */
+	if (err == -EBUSY &&
+	    zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
+		u32 maybe_zap_len;
+
+		maybe_zap_len = total_bytes_to_map -  /* All bytes to map */
+				*length + /* Mapped or pending */
+				(pages_remaining * PAGE_SIZE); /* Failed map. */
+		zap_page_range_single(vma, *address, maybe_zap_len, NULL);
+		err = 0; /* cleared so the retry below runs */
+	}
+
+	if (!err) {
+		unsigned long leftover_pages = pages_remaining;
+		int bytes_mapped;
+
+		/* We called zap_page_range_single, try to reinsert. */
+		err = vm_insert_pages(vma, *address,
+				      pending_pages,
+				      &pages_remaining);
+		bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
+		*seq += bytes_mapped;
+		*address += bytes_mapped;
+	}
+	if (err) {
+		/* Either we were unable to zap, OR we zapped, retried an
+		 * insert, and still had an issue. Either way, pages_remaining
+		 * is the number of pages we were unable to map, and we unroll
+		 * some state we speculatively touched before.
+		 */
+		const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+
+		*length -= bytes_not_mapped;
+		zc->recv_skip_hint += bytes_not_mapped;
+	}
+	return err;
+}
+
+static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
+					struct page **pages,
+					unsigned int pages_to_map,
+					unsigned long *address,
+					u32 *length,
+					u32 *seq,
+					struct tcp_zerocopy_receive *zc,
+					u32 total_bytes_to_map)
+{ /* Insert @pages at *address, advancing *seq and *address by what was mapped; 0 or -errno. */
+	unsigned long pages_remaining = pages_to_map; /* updated by vm_insert_pages() */
+	unsigned int pages_mapped;
+	unsigned int bytes_mapped;
+	int err;
+
+	err = vm_insert_pages(vma, *address, pages, &pages_remaining);
+	pages_mapped = pages_to_map - (unsigned int)pages_remaining;
+	bytes_mapped = PAGE_SIZE * pages_mapped;
+	/* Even if vm_insert_pages fails, it may have partially succeeded in
+	 * mapping (some but not all of the pages).
+	 */
+	*seq += bytes_mapped;
+	*address += bytes_mapped;
+
+	if (likely(!err))
+		return 0;
+
+	/* Error: maybe zap and retry + rollback state for failed inserts. */
+	return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
+		pages_remaining, address, length, seq, zc, total_bytes_to_map,
+		err);
+}
+
+#define TCP_VALID_ZC_MSG_FLAGS   (TCP_CMSG_TS)
+static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
+				      struct tcp_zerocopy_receive *zc,
+				      struct scm_timestamping_internal *tss)
+{ /* Emit @tss as control messages into the buffer described by zc->msg_control*. */
+	unsigned long msg_control_addr;
+	struct msghdr cmsg_dummy; /* only the control-message fields below are set */
+
+	msg_control_addr = (unsigned long)zc->msg_control;
+	cmsg_dummy.msg_control_user = (void __user *)msg_control_addr;
+	cmsg_dummy.msg_controllen =
+		(__kernel_size_t)zc->msg_controllen;
+	cmsg_dummy.msg_flags = in_compat_syscall()
+		? MSG_CMSG_COMPAT : 0;
+	cmsg_dummy.msg_control_is_user = true;
+	zc->msg_flags = 0;
+	if (zc->msg_control == msg_control_addr && /* both casts kept the original values */
+	    zc->msg_controllen == cmsg_dummy.msg_controllen) {
+		tcp_recv_timestamp(&cmsg_dummy, sk, tss);
+		zc->msg_control = (__u64)
+			((uintptr_t)cmsg_dummy.msg_control_user);
+		zc->msg_controllen =
+			(__u64)cmsg_dummy.msg_controllen;
+		zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
+	}
+}
+
+static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
+					   unsigned long address,
+					   bool *mmap_locked)
+{ /* Look up the tcp_vm_ops VMA at @address; *mmap_locked tells the caller which lock to drop. */
+	struct vm_area_struct *vma = lock_vma_under_rcu(mm, address);
+
+	if (vma) {
+		if (vma->vm_ops != &tcp_vm_ops) { /* not a VMA set up by tcp_mmap() */
+			vma_end_read(vma);
+			return NULL;
+		}
+		*mmap_locked = false; /* caller must vma_end_read() */
+		return vma;
+	}
+
+	mmap_read_lock(mm); /* per-VMA lookup failed: fall back to the mmap lock */
+	vma = vma_lookup(mm, address);
+	if (!vma || vma->vm_ops != &tcp_vm_ops) {
+		mmap_read_unlock(mm);
+		return NULL;
+	}
+	*mmap_locked = true; /* caller must mmap_read_unlock() */
+	return vma;
+}
+
+#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
+static int tcp_zerocopy_receive(struct sock *sk,
+				struct tcp_zerocopy_receive *zc,
+				struct scm_timestamping_internal *tss)
+{ /* Map whole received pages at zc->address and copy leftovers; results are written back into @zc. */
+	u32 length = 0, offset, vma_len, avail_len, copylen = 0;
+	unsigned long address = (unsigned long)zc->address;
+	struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
+	s32 copybuf_len = zc->copybuf_len; /* saved: zc->copybuf_len is reset just below */
+	struct tcp_sock *tp = tcp_sk(sk);
+	const skb_frag_t *frags = NULL;
+	unsigned int pages_to_map = 0; /* pages collected in pages[] but not yet inserted */
+	struct vm_area_struct *vma;
+	struct sk_buff *skb = NULL;
+	u32 seq = tp->copied_seq;
+	u32 total_bytes_to_map;
+	int inq = tcp_inq(sk);
+	bool mmap_locked;
+	int ret;
+
+	zc->copybuf_len = 0;
+	zc->msg_flags = 0;
+
+	if (address & (PAGE_SIZE - 1) || address != zc->address) /* unaligned, or changed by the cast */
+		return -EINVAL;
+
+	if (sk->sk_state == TCP_LISTEN)
+		return -ENOTCONN;
+
+	sock_rps_record_flow(sk);
+
+	if (inq && inq <= copybuf_len) /* everything fits in the copy buffer: just copy */
+		return receive_fallback_to_copy(sk, zc, inq, tss);
+
+	if (inq < PAGE_SIZE) {
+		zc->length = 0;
+		zc->recv_skip_hint = inq;
+		if (!inq && sock_flag(sk, SOCK_DONE))
+			return -EIO;
+		return 0;
+	}
+
+	vma = find_tcp_vma(current->mm, address, &mmap_locked);
+	if (!vma)
+		return -EINVAL;
+
+	vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
+	avail_len = min_t(u32, vma_len, inq);
+	total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); /* rounded down to whole pages */
+	if (total_bytes_to_map) {
+		if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
+			zap_page_range_single(vma, address, total_bytes_to_map,
+					      NULL);
+		zc->length = total_bytes_to_map;
+		zc->recv_skip_hint = 0;
+	} else {
+		zc->length = avail_len;
+		zc->recv_skip_hint = avail_len;
+	}
+	ret = 0;
+	while (length + PAGE_SIZE <= zc->length) {
+		int mappable_offset;
+		struct page *page;
+
+		if (zc->recv_skip_hint < PAGE_SIZE) {
+			u32 offset_frag;
+
+			if (skb) {
+				if (zc->recv_skip_hint > 0)
+					break;
+				skb = skb->next; /* NOTE(review): assumes a next skb exists; confirm */
+				offset = seq - TCP_SKB_CB(skb)->seq;
+			} else {
+				skb = tcp_recv_skb(sk, seq, &offset);
+			}
+
+			if (TCP_SKB_CB(skb)->has_rxtstamp) {
+				tcp_update_recv_tstamps(skb, tss);
+				zc->msg_flags |= TCP_CMSG_TS;
+			}
+			zc->recv_skip_hint = skb->len - offset;
+			frags = skb_advance_to_frag(skb, offset, &offset_frag);
+			if (!frags || offset_frag) /* no frag found, or offset is mid-frag */
+				break;
+		}
+
+		mappable_offset = find_next_mappable_frag(frags,
+							  zc->recv_skip_hint);
+		if (mappable_offset) {
+			zc->recv_skip_hint = mappable_offset;
+			break;
+		}
+		page = skb_frag_page(frags);
+		prefetchw(page);
+		pages[pages_to_map++] = page;
+		length += PAGE_SIZE;
+		zc->recv_skip_hint -= PAGE_SIZE;
+		frags++;
+		if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
+		    zc->recv_skip_hint < PAGE_SIZE) {
+			/* Either full batch, or we're about to go to next skb
+			 * (and we cannot unroll failed ops across skbs).
+			 */
+			ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+							   pages_to_map,
+							   &address, &length,
+							   &seq, zc,
+							   total_bytes_to_map);
+			if (ret)
+				goto out;
+			pages_to_map = 0;
+		}
+	}
+	if (pages_to_map) { /* flush the final partial batch */
+		ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
+						   &address, &length, &seq,
+						   zc, total_bytes_to_map);
+	}
+out:
+	if (mmap_locked)
+		mmap_read_unlock(current->mm);
+	else
+		vma_end_read(vma);
+	/* Try to copy straggler data. */
+	if (!ret)
+		copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
+
+	if (length + copylen) {
+		WRITE_ONCE(tp->copied_seq, seq);
+		tcp_rcv_space_adjust(sk);
+
+		/* Clean up data we have read: This will do ACK frames. */
+		tcp_recv_skb(sk, seq, &offset); /* result unused; the call frees fully-read skbs */
+		tcp_cleanup_rbuf(sk, length + copylen);
+		ret = 0; /* some data was delivered: report success */
+		if (length == zc->length)
+			zc->recv_skip_hint = 0;
+	} else {
+		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
+			ret = -EIO;
+	}
+	zc->length = length; /* bytes actually mapped */
+	return ret;
+}
+#endif
+
+/* Similar to __sock_recv_timestamp, but does not require an skb */
+void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+			struct scm_timestamping_internal *tss)
+{ /* Emit @tss to @msg as cmsgs, in the formats selected by the socket's flags. */
+	int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
+	bool has_timestamping = false;
+
+	if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
+		if (sock_flag(sk, SOCK_RCVTSTAMP)) {
+			if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { /* nanosecond resolution */
+				if (new_tstamp) {
+					struct __kernel_timespec kts = {
+						.tv_sec = tss->ts[0].tv_sec,
+						.tv_nsec = tss->ts[0].tv_nsec,
+					};
+					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
+						 sizeof(kts), &kts);
+				} else {
+					struct __kernel_old_timespec ts_old = {
+						.tv_sec = tss->ts[0].tv_sec,
+						.tv_nsec = tss->ts[0].tv_nsec,
+					};
+					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
+						 sizeof(ts_old), &ts_old);
+				}
+			} else { /* microsecond resolution */
+				if (new_tstamp) {
+					struct __kernel_sock_timeval stv = {
+						.tv_sec = tss->ts[0].tv_sec,
+						.tv_usec = tss->ts[0].tv_nsec / 1000,
+					};
+					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
+						 sizeof(stv), &stv);
+				} else {
+					struct __kernel_old_timeval tv = {
+						.tv_sec = tss->ts[0].tv_sec,
+						.tv_usec = tss->ts[0].tv_nsec / 1000,
+					};
+					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
+						 sizeof(tv), &tv);
+				}
+			}
+		}
+
+		if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE)
+			has_timestamping = true;
+		else
+			tss->ts[0] = (struct timespec64) {0}; /* not requested: clear it in @tss */
+	}
+
+	if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
+		if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE)
+			has_timestamping = true;
+		else
+			tss->ts[2] = (struct timespec64) {0}; /* not requested: clear it in @tss */
+	}
+
+	if (has_timestamping) {
+		tss->ts[1] = (struct timespec64) {0}; /* the middle slot is always sent as zero */
+		if (sock_flag(sk, SOCK_TSTAMP_NEW))
+			put_cmsg_scm_timestamping64(msg, tss);
+		else
+			put_cmsg_scm_timestamping(msg, tss);
+	}
+}
+
+static int tcp_inq_hint(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 copied_seq = READ_ONCE(tp->copied_seq);	/* lockless snapshot */
+	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
+	int inq;	/* rcv_nxt - copied_seq */
+
+	inq = rcv_nxt - copied_seq;
+	if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
+		lock_sock(sk);	/* snapshot was inconsistent: recompute under the socket lock */
+		inq = tp->rcv_nxt - tp->copied_seq;
+		release_sock(sk);
+	}
+	/* After receiving a FIN, tell user space to continue reading
+	 * by returning a non-zero inq.
+	 */
+	if (inq == 0 && sock_flag(sk, SOCK_DONE))
+		inq = 1;
+	return inq;
+}
+
+/*
+ *	Copy data from the receive queue of @sk into the user buffer in @msg.
+ *
+ *	Runs on a locked socket (tcp_recvmsg() takes lock_sock() around it),
+ *	so tricks with *seq access order and skb->users are not required.
+ *	Returns bytes copied or a negative errno; may set bits in *cmsg_flags.
+ */
+
+static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
+			      int flags, struct scm_timestamping_internal *tss,
+			      int *cmsg_flags)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int copied = 0;
+	u32 peek_seq;
+	u32 *seq;
+	unsigned long used;
+	int err;
+	int target;		/* Read at least this many bytes */
+	long timeo;
+	struct sk_buff *skb, *last;
+	u32 urg_hole = 0;
+
+	err = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	if (tp->recvmsg_inq) {
+		*cmsg_flags = TCP_CMSG_INQ;
+		msg->msg_get_inq = 1;
+	}
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	/* Urgent data needs to be handled specially. */
+	if (flags & MSG_OOB)
+		goto recv_urg;
+
+	if (unlikely(tp->repair)) {
+		err = -EPERM;
+		if (!(flags & MSG_PEEK))
+			goto out;
+
+		if (tp->repair_queue == TCP_SEND_QUEUE)
+			goto recv_sndq;
+
+		err = -EINVAL;
+		if (tp->repair_queue == TCP_NO_QUEUE)
+			goto out;
+
+		/* 'common' recv queue MSG_PEEK-ing */
+	}
+
+	seq = &tp->copied_seq;
+	if (flags & MSG_PEEK) {
+		peek_seq = tp->copied_seq;
+		seq = &peek_seq;	/* MSG_PEEK advances a private copy, not tp->copied_seq */
+	}
+
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+	do {
+		u32 offset;
+
+		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
+		if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
+			if (copied)
+				break;
+			if (signal_pending(current)) {
+				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+				break;
+			}
+		}
+
+		/* Next get a buffer. */
+
+		last = skb_peek_tail(&sk->sk_receive_queue);
+		skb_queue_walk(&sk->sk_receive_queue, skb) {
+			last = skb;
+			/* Now that we have two receive queues this
+			 * shouldn't happen.
+			 */
+			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
+				 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
+				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
+				 flags))
+				break;
+
+			offset = *seq - TCP_SKB_CB(skb)->seq;
+			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+				pr_err_once("%s: found a SYN, please report !\n", __func__);
+				offset--;
+			}
+			if (offset < skb->len)
+				goto found_ok_skb;
+			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+				goto found_fin_ok;
+			WARN(!(flags & MSG_PEEK),
+			     "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
+			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
+		}
+
+		/* Done if the target is met, unless there is backlog we can still process. */
+
+		if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
+			break;
+
+		if (copied) {
+			if (!timeo ||
+			    sk->sk_err ||
+			    sk->sk_state == TCP_CLOSE ||
+			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+			    signal_pending(current))
+				break;
+		} else {
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+
+			if (sk->sk_err) {
+				copied = sock_error(sk);
+				break;
+			}
+
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+
+			if (sk->sk_state == TCP_CLOSE) {
+				/* This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				copied = -ENOTCONN;
+				break;
+			}
+
+			if (!timeo) {
+				copied = -EAGAIN;
+				break;
+			}
+
+			if (signal_pending(current)) {
+				copied = sock_intr_errno(timeo);
+				break;
+			}
+		}
+
+		if (copied >= target) {
+			/* Do not sleep, just process backlog. */
+			__sk_flush_backlog(sk);
+		} else {
+			tcp_cleanup_rbuf(sk, copied);
+			err = sk_wait_data(sk, &timeo, last);
+			if (err < 0) {
+				err = copied ? : err;	/* prefer bytes already copied over the error */
+				goto out;
+			}
+		}
+
+		if ((flags & MSG_PEEK) &&
+		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
+			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
+					    current->comm,
+					    task_pid_nr(current));
+			peek_seq = tp->copied_seq;
+		}
+		continue;
+
+found_ok_skb:
+		/* Ok so how much can we use? */
+		used = skb->len - offset;
+		if (len < used)
+			used = len;
+
+		/* Do we have urgent data here? */
+		if (unlikely(tp->urg_data)) {
+			u32 urg_offset = tp->urg_seq - *seq;
+			if (urg_offset < used) {
+				if (!urg_offset) {
+					if (!sock_flag(sk, SOCK_URGINLINE)) {
+						WRITE_ONCE(*seq, *seq + 1);
+						urg_hole++;
+						offset++;
+						used--;
+						if (!used)
+							goto skip_copy;
+					}
+				} else
+					used = urg_offset;
+			}
+		}
+
+		if (!(flags & MSG_TRUNC)) {
+			err = skb_copy_datagram_msg(skb, offset, msg, used);
+			if (err) {
+				/* Exception. Bailout! */
+				if (!copied)
+					copied = -EFAULT;
+				break;
+			}
+		}
+
+		WRITE_ONCE(*seq, *seq + used);
+		copied += used;
+		len -= used;
+
+		tcp_rcv_space_adjust(sk);
+
+skip_copy:
+		if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
+			WRITE_ONCE(tp->urg_data, 0);
+			tcp_fast_path_check(sk);
+		}
+
+		if (TCP_SKB_CB(skb)->has_rxtstamp) {
+			tcp_update_recv_tstamps(skb, tss);
+			*cmsg_flags |= TCP_CMSG_TS;
+		}
+
+		if (used + offset < skb->len)
+			continue;
+
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+			goto found_fin_ok;
+		if (!(flags & MSG_PEEK))
+			tcp_eat_recv_skb(sk, skb);
+		continue;
+
+found_fin_ok:
+		/* Process the FIN. */
+		WRITE_ONCE(*seq, *seq + 1);
+		if (!(flags & MSG_PEEK))
+			tcp_eat_recv_skb(sk, skb);
+		break;
+	} while (len > 0);
+
+	/* According to UNIX98, msg_name/msg_namelen are ignored
+	 * on connected socket. I was just happy when found this 8) --ANK
+	 */
+
+	/* Clean up data we have read: This will do ACK frames. */
+	tcp_cleanup_rbuf(sk, copied);
+	return copied;
+
+out:
+	return err;
+
+recv_urg:
+	err = tcp_recv_urg(sk, msg, len, flags);
+	goto out;
+
+recv_sndq:
+	err = tcp_peek_sndq(sk, msg, len);
+	goto out;
+}
+
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+		int *addr_len)
+{
+	int cmsg_flags = 0, ret;
+	struct scm_timestamping_internal tss;	/* only read when TCP_CMSG_TS is set */
+
+	if (unlikely(flags & MSG_ERRQUEUE))	/* error-queue reads are handled separately */
+		return inet_recv_error(sk, msg, len, addr_len);
+
+	if (sk_can_busy_loop(sk) &&
+	    skb_queue_empty_lockless(&sk->sk_receive_queue) &&
+	    sk->sk_state == TCP_ESTABLISHED)
+		sk_busy_loop(sk, flags & MSG_DONTWAIT);
+
+	lock_sock(sk);
+	ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
+	release_sock(sk);
+
+	if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {	/* cmsgs only on success */
+		if (cmsg_flags & TCP_CMSG_TS)
+			tcp_recv_timestamp(msg, sk, &tss);
+		if (msg->msg_get_inq) {
+			msg->msg_inq = tcp_inq_hint(sk);
+			if (cmsg_flags & TCP_CMSG_INQ)
+				put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
+					 sizeof(msg->msg_inq), &msg->msg_inq);
+		}
+	}
+	return ret;
+}
+EXPORT_SYMBOL(tcp_recvmsg);
+
+void tcp_set_state(struct sock *sk, int state)
+{
+	int oldstate = sk->sk_state;	/* state being left, used for the MIB accounting below */
+
+	/* We defined a new enum for TCP states that are exported in BPF
+	 * so as not to force the internal TCP states to be frozen. The
+	 * following checks will detect if an internal state value ever
+	 * differs from the BPF value. If this ever happens, then we will
+	 * need to remap the internal value to the BPF value before calling
+	 * tcp_call_bpf_2arg.
+	 */
+	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
+	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
+	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
+	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
+
+	/* bpf uapi header bpf.h defines an anonymous enum with values
+	 * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux
+	 * is able to emit this enum in DWARF due to the above BUILD_BUG_ON.
+	 * But clang built vmlinux does not have this enum in DWARF
+	 * since clang removes the above code before generating IR/debuginfo.
+	 * Let us explicitly emit the type debuginfo to ensure the
+	 * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF
+	 * regardless of which compiler is used.
+	 */
+	BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
+
+	switch (state) {
+	case TCP_ESTABLISHED:
+		if (oldstate != TCP_ESTABLISHED)
+			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+		break;
+
+	case TCP_CLOSE:
+		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
+			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
+
+		sk->sk_prot->unhash(sk);
+		if (inet_csk(sk)->icsk_bind_hash &&
+		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+			inet_put_port(sk);
+		fallthrough;
+	default:	/* also reached from TCP_CLOSE above */
+		if (oldstate == TCP_ESTABLISHED)
+			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+	}
+
+	/* Change state AFTER socket is unhashed to avoid closed
+	 * socket sitting in hash tables.
+	 */
+	inet_sk_state_store(sk, state);
+}
+EXPORT_SYMBOL_GPL(tcp_set_state);
+
+/*
+ *	State processing on a close. This implements the state shift for
+ *	sending our FIN frame. Each entry is the next state, OR-ed with
+ *	TCP_ACTION_FIN when a FIN must be sent. We only send a FIN for some
+ *	states: a shutdown() may have already sent the FIN, or we may be closed.
+ */
+
+static const unsigned char new_state[16] = {
+  /* current state:        new state:      action:	*/
+  [0 /* (Invalid) */]	= TCP_CLOSE,
+  [TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+  [TCP_SYN_SENT]	= TCP_CLOSE,
+  [TCP_SYN_RECV]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+  [TCP_FIN_WAIT1]	= TCP_FIN_WAIT1,
+  [TCP_FIN_WAIT2]	= TCP_FIN_WAIT2,
+  [TCP_TIME_WAIT]	= TCP_CLOSE,
+  [TCP_CLOSE]		= TCP_CLOSE,
+  [TCP_CLOSE_WAIT]	= TCP_LAST_ACK  | TCP_ACTION_FIN,
+  [TCP_LAST_ACK]	= TCP_LAST_ACK,
+  [TCP_LISTEN]		= TCP_CLOSE,
+  [TCP_CLOSING]		= TCP_CLOSING,
+  [TCP_NEW_SYN_RECV]	= TCP_CLOSE,	/* should not happen ! */
+};
+
+static int tcp_close_state(struct sock *sk)
+{
+	/* Table entry: next state (under TCP_STATE_MASK) plus optional TCP_ACTION_FIN. */
+	const int entry = new_state[sk->sk_state];
+
+	tcp_set_state(sk, entry & TCP_STATE_MASK);
+
+	return entry & TCP_ACTION_FIN;	/* non-zero: the caller sends a FIN */
+}
+
+/*
+ *	Shutdown the sending side of a connection. Much like close, except
+ *	that we neither shut down the receive side nor mark the socket SOCK_DEAD.
+ */
+
+void tcp_shutdown(struct sock *sk, int how)
+{
+	/*	We need to grab some memory, and put together a FIN,
+	 *	and then put it into the queue to be sent.
+	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
+	 */
+	if (!(how & SEND_SHUTDOWN))	/* nothing to do unless the send side is shut down */
+		return;
+
+	/* If we've already sent a FIN, or it's a closed state, skip this. */
+	if ((1 << sk->sk_state) &
+	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
+	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
+		/* Clear out any half completed packets.  FIN if needed. */
+		if (tcp_close_state(sk))
+			tcp_send_fin(sk);
+	}
+}
+EXPORT_SYMBOL(tcp_shutdown);
+
+int tcp_orphan_count_sum(void)
+{
+	int cpu, sum = 0;
+
+	for_each_possible_cpu(cpu)	/* fold every per-CPU counter */
+		sum += per_cpu(tcp_orphan_count, cpu);
+
+	return sum < 0 ? 0 : sum;	/* never report a negative total */
+}
+
+static int tcp_orphan_cache;	/* last tcp_orphan_count_sum(), see tcp_orphan_update() */
+static struct timer_list tcp_orphan_timer;
+#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)	/* refresh period: 100 ms */
+
+static void tcp_orphan_update(struct timer_list *unused)
+{
+	WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
+	mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);	/* re-arm */
+}
+
+static bool tcp_too_many_orphans(int shift)
+{
+	/* Compare the cached orphan count, scaled by 2^shift, to the sysctl limit. */
+	return (READ_ONCE(tcp_orphan_cache) << shift) > READ_ONCE(sysctl_tcp_max_orphans);
+}
+
+bool tcp_check_oom(struct sock *sk, int shift)
+{
+	/* Evaluate both conditions up front so each one gets its own log line. */
+	const bool orphans_over = tcp_too_many_orphans(shift);
+	const bool mem_exhausted = tcp_out_of_memory(sk);
+
+	if (orphans_over)
+		net_info_ratelimited("too many orphaned sockets\n");
+	if (mem_exhausted)
+		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
+
+	return orphans_over || mem_exhausted;
+}
+
+void __tcp_close(struct sock *sk, long timeout)
+{
+	struct sk_buff *skb;
+	int data_was_unread = 0;	/* bytes purged from the receive queue, FIN excluded */
+	int state;
+
+	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
+
+	if (sk->sk_state == TCP_LISTEN) {
+		tcp_set_state(sk, TCP_CLOSE);
+
+		/* Special case. */
+		inet_csk_listen_stop(sk);
+
+		goto adjudge_to_death;
+	}
+
+	/*  We need to flush the recv. buffs.  We do this only on the
+	 *  descriptor close, not protocol-sourced closes, because the
+	 *  reader process may not have drained the data yet!
+	 */
+	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+			len--;
+		data_was_unread += len;
+		__kfree_skb(skb);
+	}
+
+	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
+	if (sk->sk_state == TCP_CLOSE)
+		goto adjudge_to_death;
+
+	/* As outlined in RFC 2525, section 2.17, we send a RST here because
+	 * data was lost. To witness the awful effects of the old behavior of
+	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
+	 * GET in an FTP client, suspend the process, wait for the client to
+	 * advertise a zero window, then kill -9 the FTP client, wheee...
+	 * Note: timeout is always zero in such a case.
+	 */
+	if (unlikely(tcp_sk(sk)->repair)) {
+		sk->sk_prot->disconnect(sk, 0);
+	} else if (data_was_unread) {
+		/* Unread data was tossed, zap the connection. */
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
+		tcp_set_state(sk, TCP_CLOSE);
+		tcp_send_active_reset(sk, sk->sk_allocation);
+	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+		/* Check zero linger _after_ checking for unread data. */
+		sk->sk_prot->disconnect(sk, 0);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+	} else if (tcp_close_state(sk)) {
+		/* We FIN if the application ate all the data before
+		 * zapping the connection.
+		 */
+
+		/* RED-PEN. Formally speaking, we have broken TCP state
+		 * machine. State transitions:
+		 *
+		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
+		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
+		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
+		 *
+		 * are legal only when FIN has been sent (i.e. in window),
+		 * rather than queued out of window. Purists blame.
+		 *
+		 * F.e. "RFC state" is ESTABLISHED,
+		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
+		 *
+		 * The visible deviations are that sometimes
+		 * we enter time-wait state, when it is not required really
+		 * (harmless), do not send active resets, when they are
+		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
+		 * they look as CLOSING or LAST_ACK for Linux)
+		 * Probably, I missed some more holelets.
+		 * 						--ANK
+		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+		 * in a single packet! (May consider it later but will
+		 * probably need API support or TCP_CORK SYN-ACK until
+		 * data is written and socket is closed.)
+		 */
+		tcp_send_fin(sk);
+	}
+
+	sk_stream_wait_close(sk, timeout);
+
+adjudge_to_death:
+	state = sk->sk_state;	/* remembered to detect a close done by softirq/backlog below */
+	sock_hold(sk);
+	sock_orphan(sk);
+
+	local_bh_disable();
+	bh_lock_sock(sk);
+	/* remove backlog if any, without releasing ownership. */
+	__release_sock(sk);
+
+	this_cpu_inc(tcp_orphan_count);
+
+	/* Have we already been destroyed by a softirq or backlog? */
+	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	/*	This is a (useful) BSD violation of the RFC. There is a
+	 *	problem with TCP as specified in that the other end could
+	 *	keep a socket open forever with no application left this end.
+	 *	We use a 1 minute timeout (about the same as BSD) then kill
+	 *	our end. If they send after that then tough - BUT: long enough
+	 *	that we won't make the old 4*rto = almost no time - whoops
+	 *	reset mistake.
+	 *
+	 *	Nope, it was not a mistake. It is really desired behaviour
+	 *	f.e. on http servers, when such sockets are useless, but
+	 *	consume significant resources. Let's do it with special
+	 *	linger2	option.					--ANK
+	 */
+
+	if (sk->sk_state == TCP_FIN_WAIT2) {
+		struct tcp_sock *tp = tcp_sk(sk);
+		if (READ_ONCE(tp->linger2) < 0) {
+			tcp_set_state(sk, TCP_CLOSE);
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+			__NET_INC_STATS(sock_net(sk),
+					LINUX_MIB_TCPABORTONLINGER);
+		} else {
+			const int tmo = tcp_fin_time(sk);
+
+			if (tmo > TCP_TIMEWAIT_LEN) {
+				inet_csk_reset_keepalive_timer(sk,
+						tmo - TCP_TIMEWAIT_LEN);
+			} else {
+				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+				goto out;
+			}
+		}
+	}
+	if (sk->sk_state != TCP_CLOSE) {
+		if (tcp_check_oom(sk, 0)) {
+			tcp_set_state(sk, TCP_CLOSE);
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+			__NET_INC_STATS(sock_net(sk),
+					LINUX_MIB_TCPABORTONMEMORY);
+		} else if (!check_net(sock_net(sk))) {
+			/* Not possible to send reset; just close */
+			tcp_set_state(sk, TCP_CLOSE);
+		}
+	}
+
+	if (sk->sk_state == TCP_CLOSE) {
+		struct request_sock *req;
+
+		req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
+						lockdep_sock_is_held(sk));
+		/* We could get here with a non-NULL req if the socket is
+		 * aborted (e.g., closed with unread data) before 3WHS
+		 * finishes.
+		 */
+		if (req)
+			reqsk_fastopen_remove(sk, req, false);
+		inet_csk_destroy_sock(sk);
+	}
+	/* Otherwise, socket is reprieved until protocol close. */
+
+out:
+	bh_unlock_sock(sk);
+	local_bh_enable();
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+	lock_sock(sk);
+	__tcp_close(sk, timeout);
+	release_sock(sk);
+	sock_put(sk);	/* balances the sock_hold() taken in __tcp_close() */
+}
+EXPORT_SYMBOL(tcp_close);
+
+/* True if @state is one of the states that need a RST on ABORT per RFC 793. */
+
+static inline bool tcp_need_reset(int state)
+{
+	return ((TCPF_ESTABLISHED | TCPF_SYN_RECV |
+		 TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 |
+		 TCPF_CLOSE_WAIT) & (1 << state)) != 0;
+}
+
+static void tcp_rtx_queue_purge(struct sock *sk)
+{
+	struct rb_node *node, *next;
+
+	tcp_sk(sk)->highest_sack = NULL;
+	for (node = rb_first(&sk->tcp_rtx_queue); node; node = next) {
+		struct sk_buff *skb = rb_to_skb(node);
+
+		/* Grab the successor before unlinking. As the whole queue is
+		 * deleted, no list_del(&skb->tcp_tsorted_anchor) is needed.
+		 */
+		next = rb_next(node);
+		tcp_rtx_queue_unlink(skb, sk);
+		tcp_wmem_free_skb(sk, skb);
+	}
+}
+
+void tcp_write_queue_purge(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {	/* drain the write queue */
+		tcp_skb_tsorted_anchor_cleanup(skb);
+		tcp_wmem_free_skb(sk, skb);
+	}
+	tcp_rtx_queue_purge(sk);	/* then the retransmit rb-tree */
+	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
+	tcp_clear_all_retrans_hints(tcp_sk(sk));
+	tcp_sk(sk)->packets_out = 0;
+	inet_csk(sk)->icsk_backoff = 0;
+}
+
+int tcp_disconnect(struct sock *sk, int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int old_state = sk->sk_state;	/* state on entry; decides the abort action below */
+	u32 seq;
+
+	if (old_state != TCP_CLOSE)
+		tcp_set_state(sk, TCP_CLOSE);
+
+	/* ABORT function of RFC793 */
+	if (old_state == TCP_LISTEN) {
+		inet_csk_listen_stop(sk);
+	} else if (unlikely(tp->repair)) {
+		WRITE_ONCE(sk->sk_err, ECONNABORTED);
+	} else if (tcp_need_reset(old_state) ||
+		   (tp->snd_nxt != tp->write_seq &&
+		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
+		/* The last check adjusts for discrepancy of Linux wrt. RFC
+		 * states
+		 */
+		tcp_send_active_reset(sk, gfp_any());
+		WRITE_ONCE(sk->sk_err, ECONNRESET);
+	} else if (old_state == TCP_SYN_SENT)
+		WRITE_ONCE(sk->sk_err, ECONNRESET);
+
+	tcp_clear_xmit_timers(sk);
+	__skb_queue_purge(&sk->sk_receive_queue);
+	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
+	WRITE_ONCE(tp->urg_data, 0);
+	tcp_write_queue_purge(sk);
+	tcp_fastopen_active_disable_ofo_check(sk);
+	skb_rbtree_purge(&tp->out_of_order_queue);
+
+	inet->inet_dport = 0;
+
+	inet_bhash2_reset_saddr(sk);
+
+	WRITE_ONCE(sk->sk_shutdown, 0);
+	sock_reset_flag(sk, SOCK_DONE);
+	tp->srtt_us = 0;
+	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+	tp->rcv_rtt_last_tsecr = 0;
+
+	seq = tp->write_seq + tp->max_window + 2;
+	if (!seq)	/* never leave write_seq at 0 */
+		seq = 1;
+	WRITE_ONCE(tp->write_seq, seq);
+
+	icsk->icsk_backoff = 0;
+	icsk->icsk_probes_out = 0;
+	icsk->icsk_probes_tstamp = 0;
+	icsk->icsk_rto = TCP_TIMEOUT_INIT;
+	icsk->icsk_rto_min = TCP_RTO_MIN;
+	icsk->icsk_delack_max = TCP_DELACK_MAX;
+	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
+	tp->snd_cwnd_cnt = 0;
+	tp->is_cwnd_limited = 0;
+	tp->max_packets_out = 0;
+	tp->window_clamp = 0;
+	tp->delivered = 0;
+	tp->delivered_ce = 0;
+	if (icsk->icsk_ca_ops->release)	/* the release hook is optional */
+		icsk->icsk_ca_ops->release(sk);
+	memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+	icsk->icsk_ca_initialized = 0;
+	tcp_set_ca_state(sk, TCP_CA_Open);
+	tp->is_sack_reneg = 0;
+	tcp_clear_retrans(tp);
+	tp->total_retrans = 0;
+	inet_csk_delack_init(sk);
+	/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
+	 * issue in __tcp_select_window()
+	 */
+	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
+	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
+	__sk_dst_reset(sk);
+	dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
+	tcp_saved_syn_free(tp);
+	tp->compressed_ack = 0;
+	tp->segs_in = 0;
+	tp->segs_out = 0;
+	tp->bytes_sent = 0;
+	tp->bytes_acked = 0;
+	tp->bytes_received = 0;
+	tp->bytes_retrans = 0;
+	tp->data_segs_in = 0;
+	tp->data_segs_out = 0;
+	tp->duplicate_sack[0].start_seq = 0;
+	tp->duplicate_sack[0].end_seq = 0;
+	tp->dsack_dups = 0;
+	tp->reord_seen = 0;
+	tp->retrans_out = 0;
+	tp->sacked_out = 0;
+	tp->tlp_high_seq = 0;
+	tp->last_oow_ack_time = 0;
+	tp->plb_rehash = 0;
+	/* There's a bubble in the pipe until at least the first ACK. */
+	tp->app_limited = ~0U;
+	tp->rate_app_limited = 1;
+	tp->rack.mstamp = 0;
+	tp->rack.advanced = 0;
+	tp->rack.reo_wnd_steps = 1;
+	tp->rack.last_delivered = 0;
+	tp->rack.reo_wnd_persist = 0;
+	tp->rack.dsack_seen = 0;
+	tp->syn_data_acked = 0;
+	tp->rx_opt.saw_tstamp = 0;	/* NOTE(review): rx_opt was memset to 0 above; */
+	tp->rx_opt.dsack = 0;		/* these three stores look redundant - confirm */
+	tp->rx_opt.num_sacks = 0;
+	tp->rcv_ooopack = 0;
+
+
+	/* Clean up fastopen related fields */
+	tcp_free_fastopen_req(tp);
+	inet_clear_bit(DEFER_CONNECT, sk);
+	tp->fastopen_client_fail = 0;
+
+	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
+
+	if (sk->sk_frag.page) {	/* drop the page reference held in sk_frag */
+		put_page(sk->sk_frag.page);
+		sk->sk_frag.page = NULL;
+		sk->sk_frag.offset = 0;
+	}
+	sk_error_report(sk);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_disconnect);
+
+static inline bool tcp_can_repair_sock(const struct sock *sk)
+{
+	return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&	/* privileged caller */
+		(sk->sk_state != TCP_LISTEN);	/* and not a listening socket */
+}
+
+static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
+{
+	struct tcp_repair_window wnd;
+
+	/* Only a socket in repair mode may have its window state rewritten. */
+	if (!tp->repair)
+		return -EPERM;
+
+	if (len != sizeof(wnd))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&wnd, optbuf, sizeof(wnd)))
+		return -EFAULT;
+
+	/* Reject window values that contradict each other or rcv_nxt. */
+	if (wnd.max_window < wnd.snd_wnd ||
+	    after(wnd.snd_wl1, tp->rcv_nxt + wnd.rcv_wnd) ||
+	    after(wnd.rcv_wup, tp->rcv_nxt))
+		return -EINVAL;
+
+	/* Send-side window state. */
+	tp->snd_wl1	= wnd.snd_wl1;
+	tp->snd_wnd	= wnd.snd_wnd;
+	tp->max_window	= wnd.max_window;
+
+	/* Receive-side window state. */
+	tp->rcv_wnd	= wnd.rcv_wnd;
+	tp->rcv_wup	= wnd.rcv_wup;
+
+	return 0;
+}
+
+static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
+		unsigned int len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_repair_opt opt;
+	size_t offset = 0;
+
+	while (len >= sizeof(opt)) {	/* consume whole records; a short tail is ignored */
+		if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
+			return -EFAULT;
+
+		offset += sizeof(opt);
+		len -= sizeof(opt);
+
+		switch (opt.opt_code) {	/* unknown option codes are skipped */
+		case TCPOPT_MSS:
+			tp->rx_opt.mss_clamp = opt.opt_val;
+			tcp_mtup_init(sk);
+			break;
+		case TCPOPT_WINDOW:
+			{
+				u16 snd_wscale = opt.opt_val & 0xFFFF;	/* low 16 bits */
+				u16 rcv_wscale = opt.opt_val >> 16;	/* bits 16 and up */
+
+				if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
+					return -EFBIG;
+
+				tp->rx_opt.snd_wscale = snd_wscale;
+				tp->rx_opt.rcv_wscale = rcv_wscale;
+				tp->rx_opt.wscale_ok = 1;
+			}
+			break;
+		case TCPOPT_SACK_PERM:
+			if (opt.opt_val != 0)	/* this option carries no value */
+				return -EINVAL;
+
+			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
+			break;
+		case TCPOPT_TIMESTAMP:
+			if (opt.opt_val != 0)	/* this option carries no value */
+				return -EINVAL;
+
+			tp->rx_opt.tstamp_ok = 1;
+			break;
+		}
+	}
+
+	return 0;
+}
+
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+EXPORT_SYMBOL(tcp_tx_delay_enabled);
+
+static void tcp_enable_tx_delay(void)
+{
+	if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+		static int __tcp_tx_delay_enabled = 0;
+
+		if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {	/* only the 0 -> 1 winner gets here */
+			static_branch_enable(&tcp_tx_delay_enabled);
+			pr_info("TCP_TX_DELAY enabled\n");
+		}
+	}
+}
+
+/* When set, always queue non-full frames. When the user later clears the
+ * option, any pending partial frames in the queue are transmitted. This is
+ * meant to be used alongside sendfile() to get properly filled frames when the
+ * user (for example) must write out headers with a write() call first and then
+ * use sendfile() to send out the data parts.
+ *
+ * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
+ * TCP_NODELAY.
+ */
+void __tcp_sock_set_cork(struct sock *sk, bool on)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (on) {
+		tp->nonagle |= TCP_NAGLE_CORK;
+		return;
+	}
+	tp->nonagle &= ~TCP_NAGLE_CORK;
+	if (tp->nonagle & TCP_NAGLE_OFF)	/* TCP_NODELAY is set as well */
+		tp->nonagle |= TCP_NAGLE_PUSH;
+	tcp_push_pending_frames(sk);	/* uncorking pushes what was held back */
+}
+
+void tcp_sock_set_cork(struct sock *sk, bool on)
+{
+	lock_sock(sk);	/* locking wrapper around __tcp_sock_set_cork() */
+	__tcp_sock_set_cork(sk, on);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_cork);
+
+/* TCP_NODELAY is weaker than TCP_CORK: setting it on a corked socket is
+ * remembered, but it does not take effect until the cork is cleared.
+ *
+ * However, when TCP_NODELAY is set we make an explicit push, which overrides
+ * even TCP_CORK for currently queued segments.
+ */
+void __tcp_sock_set_nodelay(struct sock *sk, bool on)
+{
+	if (!on) {
+		tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
+		return;
+	}
+	tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF | TCP_NAGLE_PUSH;
+	tcp_push_pending_frames(sk);	/* explicit push of already-queued segments */
+}
+
+void tcp_sock_set_nodelay(struct sock *sk)
+{
+	lock_sock(sk);
+	__tcp_sock_set_nodelay(sk, true);	/* this wrapper can only turn TCP_NODELAY on */
+	release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_nodelay);
+
+static void __tcp_sock_set_quickack(struct sock *sk, int val)
+{
+	if (!val) {	/* 0: enter pingpong mode and stop */
+		inet_csk_enter_pingpong_mode(sk);
+		return;
+	}
+
+	inet_csk_exit_pingpong_mode(sk);
+	if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+	    inet_csk_ack_scheduled(sk)) {
+		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
+		tcp_cleanup_rbuf(sk, 1);
+		if (!(val & 1))	/* even value: back to pingpong mode after this push */
+			inet_csk_enter_pingpong_mode(sk);
+	}
+}
+
+void tcp_sock_set_quickack(struct sock *sk, int val)
+{
+	lock_sock(sk);	/* locking wrapper around __tcp_sock_set_quickack() */
+	__tcp_sock_set_quickack(sk, val);
+	release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_quickack);
+
+int tcp_sock_set_syncnt(struct sock *sk, int val)
+{
+	/* Only values in 1..MAX_TCP_SYNCNT are accepted. */
+	if (val <= 0 || val > MAX_TCP_SYNCNT)
+		return -EINVAL;
+	WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_syncnt);
+
+int tcp_sock_set_user_timeout(struct sock *sk, int val)
+{
+	/* Upper bound, in ms, on how long TCP keeps retrying or probing the
+	 * window before it gives up and aborts the connection (ETIMEDOUT).
+	 */
+	if (val < 0)	/* negative timeouts are rejected */
+		return -EINVAL;
+
+	WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_user_timeout);
+
+int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (val < 1 || val > MAX_TCP_KEEPIDLE)
+		return -EINVAL;
+
+	/* Paired with READ_ONCE() in keepalive_time_when() */
+	WRITE_ONCE(tp->keepalive_time, val * HZ);
+	if (sock_flag(sk, SOCK_KEEPOPEN) &&
+	    !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
+		u32 elapsed = keepalive_time_elapsed(tp);
+
+		if (tp->keepalive_time > elapsed)
+			elapsed = tp->keepalive_time - elapsed;	/* time left until the new deadline */
+		else
+			elapsed = 0;	/* new deadline already passed: fire right away */
+		inet_csk_reset_keepalive_timer(sk, elapsed);
+	}
+
+	return 0;
+}
+
+int tcp_sock_set_keepidle(struct sock *sk, int val)
+{
+	int ret;
+
+	lock_sock(sk);	/* the _locked variant expects the socket lock to be held */
+	ret = tcp_sock_set_keepidle_locked(sk, val);
+	release_sock(sk);
+	return ret;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepidle);
+
+int tcp_sock_set_keepintvl(struct sock *sk, int val)
+{
+	/* @val must be in 1..MAX_TCP_KEEPINTVL; it is stored as val * HZ. */
+	if (val <= 0 || val > MAX_TCP_KEEPINTVL)
+		return -EINVAL;
+	WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepintvl);
+
+int tcp_sock_set_keepcnt(struct sock *sk, int val)
+{
+	if (val <= 0 || val > MAX_TCP_KEEPCNT)
+		return -EINVAL;
+
+	/* The matching READ_ONCE() is in keepalive_probes(). */
+	WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepcnt);
+
+int tcp_set_window_clamp(struct sock *sk, int val)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 prev_clamp, clamp, floor_ssthresh;
+
+	/* Zero removes the clamp, which is only allowed on a closed socket. */
+	if (!val) {
+		if (sk->sk_state != TCP_CLOSE)
+			return -EINVAL;
+		tp->window_clamp = 0;
+		return 0;
+	}
+
+	/* Non-zero requests are raised to at least SOCK_MIN_RCVBUF / 2. */
+	prev_clamp = tp->window_clamp;
+	clamp = val < SOCK_MIN_RCVBUF / 2 ? SOCK_MIN_RCVBUF / 2 : val;
+	if (clamp == prev_clamp)
+		return 0;
+
+	tp->window_clamp = clamp;
+	if (clamp < prev_clamp) {
+		/* Reserved-memory provisioning applies only when shrinking. */
+		__tcp_adjust_rcv_ssthresh(sk, tp->window_clamp);
+		return 0;
+	}
+
+	/* Growing: lift rcv_ssthresh to min(rcv_wnd, clamp), never lowering it. */
+	floor_ssthresh = min(tp->rcv_wnd, tp->window_clamp);
+	tp->rcv_ssthresh = max(floor_ssthresh, tp->rcv_ssthresh);
+	return 0;
+}
+
+/* Set TCP-level socket option @optname from the user buffer @optval/@optlen.
+ * Returns 0 on success or a negative errno (e.g. -EINVAL, -EFAULT, -EPERM).
+ */
+int do_tcp_setsockopt(struct sock *sk, int level, int optname,
+		      sockptr_t optval, unsigned int optlen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct net *net = sock_net(sk);
+	int val;
+	int err = 0;
+
+	/* These are data/string values, all the others are ints */
+	switch (optname) {
+	case TCP_CONGESTION: {
+		char name[TCP_CA_NAME_MAX];
+
+		if (optlen < 1)
+			return -EINVAL;
+
+		val = strncpy_from_sockptr(name, optval,
+					min_t(long, TCP_CA_NAME_MAX-1, optlen));
+		if (val < 0)
+			return -EFAULT;
+		name[val] = 0;	/* terminate at the copied length */
+
+		sockopt_lock_sock(sk);
+		err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(),
+						 sockopt_ns_capable(sock_net(sk)->user_ns,
+								    CAP_NET_ADMIN));
+		sockopt_release_sock(sk);
+		return err;
+	}
+	case TCP_ULP: {
+		char name[TCP_ULP_NAME_MAX];
+
+		if (optlen < 1)
+			return -EINVAL;
+
+		val = strncpy_from_sockptr(name, optval,
+					min_t(long, TCP_ULP_NAME_MAX - 1,
+					      optlen));
+		if (val < 0)
+			return -EFAULT;
+		name[val] = 0;	/* terminate at the copied length */
+
+		sockopt_lock_sock(sk);
+		err = tcp_set_ulp(sk, name);
+		sockopt_release_sock(sk);
+		return err;
+	}
+	case TCP_FASTOPEN_KEY: {
+		__u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
+		__u8 *backup_key = NULL;
+
+		/* Allow a backup key as well to facilitate key rotation
+		 * First key is the active one.
+		 */
+		if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
+		    optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
+			return -EINVAL;
+
+		if (copy_from_sockptr(key, optval, optlen))
+			return -EFAULT;
+
+		if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
+			backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
+
+		return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
+	}
+	default:
+		/* not a data/string option: parsed as an int below */
+		break;
+	}
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&val, optval, sizeof(val)))
+		return -EFAULT;
+
+	/* Handle options that can be set without locking the socket. */
+	switch (optname) {
+	case TCP_SYNCNT:
+		return tcp_sock_set_syncnt(sk, val);
+	case TCP_USER_TIMEOUT:
+		return tcp_sock_set_user_timeout(sk, val);
+	case TCP_KEEPINTVL:
+		return tcp_sock_set_keepintvl(sk, val);
+	case TCP_KEEPCNT:
+		return tcp_sock_set_keepcnt(sk, val);
+	case TCP_LINGER2:	/* val < 0 stores -1; else val * HZ, capped at TCP_FIN_TIMEOUT_MAX */
+		if (val < 0)
+			WRITE_ONCE(tp->linger2, -1);
+		else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
+			WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
+		else
+			WRITE_ONCE(tp->linger2, val * HZ);
+		return 0;
+	case TCP_DEFER_ACCEPT:
+		/* Translate value in seconds to number of retransmits */
+		WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
+			   secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+					   TCP_RTO_MAX / HZ));
+		return 0;
+	}
+
+	sockopt_lock_sock(sk);	/* the remaining options are applied under the socket lock */
+
+	switch (optname) {
+	case TCP_MAXSEG:
+		/* Values greater than interface MTU won't take effect. However
+		 * at the point when this call is done we typically don't yet
+		 * know which interface is going to be used
+		 */
+		if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
+			err = -EINVAL;
+			break;
+		}
+		tp->rx_opt.user_mss = val;
+		break;
+
+	case TCP_NODELAY:
+		__tcp_sock_set_nodelay(sk, val);
+		break;
+
+	case TCP_THIN_LINEAR_TIMEOUTS:
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		else
+			tp->thin_lto = val;
+		break;
+
+	case TCP_THIN_DUPACK:	/* the value is range-checked but not stored */
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		break;
+
+	case TCP_REPAIR:
+		if (!tcp_can_repair_sock(sk))
+			err = -EPERM;
+		else if (val == TCP_REPAIR_ON) {
+			tp->repair = 1;
+			sk->sk_reuse = SK_FORCE_REUSE;
+			tp->repair_queue = TCP_NO_QUEUE;
+		} else if (val == TCP_REPAIR_OFF) {
+			tp->repair = 0;
+			sk->sk_reuse = SK_NO_REUSE;
+			tcp_send_window_probe(sk);
+		} else if (val == TCP_REPAIR_OFF_NO_WP) {
+			tp->repair = 0;
+			sk->sk_reuse = SK_NO_REUSE;
+		} else
+			err = -EINVAL;
+
+		break;
+
+	case TCP_REPAIR_QUEUE:
+		if (!tp->repair)
+			err = -EPERM;
+		else if ((unsigned int)val < TCP_QUEUES_NR)
+			tp->repair_queue = val;
+		else
+			err = -EINVAL;
+		break;
+
+	case TCP_QUEUE_SEQ:
+		if (sk->sk_state != TCP_CLOSE) {
+			err = -EPERM;
+		} else if (tp->repair_queue == TCP_SEND_QUEUE) {
+			if (!tcp_rtx_queue_empty(sk))
+				err = -EPERM;
+			else
+				WRITE_ONCE(tp->write_seq, val);
+		} else if (tp->repair_queue == TCP_RECV_QUEUE) {
+			if (tp->rcv_nxt != tp->copied_seq) {
+				err = -EPERM;
+			} else {
+				WRITE_ONCE(tp->rcv_nxt, val);
+				WRITE_ONCE(tp->copied_seq, val);
+			}
+		} else {
+			err = -EINVAL;
+		}
+		break;
+
+	case TCP_REPAIR_OPTIONS:	/* consumes the raw optval/optlen, not val */
+		if (!tp->repair)
+			err = -EINVAL;
+		else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
+			err = tcp_repair_options_est(sk, optval, optlen);
+		else
+			err = -EPERM;
+		break;
+
+	case TCP_CORK:
+		__tcp_sock_set_cork(sk, val);
+		break;
+
+	case TCP_KEEPIDLE:
+		err = tcp_sock_set_keepidle_locked(sk, val);
+		break;
+	case TCP_SAVE_SYN:
+		/* 0: disable, 1: enable, 2: start from ether_header */
+		if (val < 0 || val > 2)
+			err = -EINVAL;
+		else
+			tp->save_syn = val;
+		break;
+
+	case TCP_WINDOW_CLAMP:
+		err = tcp_set_window_clamp(sk, val);
+		break;
+
+	case TCP_QUICKACK:
+		__tcp_sock_set_quickack(sk, val);
+		break;
+
+#ifdef CONFIG_TCP_MD5SIG
+	case TCP_MD5SIG:
+	case TCP_MD5SIG_EXT:
+		err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
+		break;
+#endif
+	case TCP_FASTOPEN:
+		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+		    TCPF_LISTEN))) {
+			tcp_fastopen_init_key_once(net);
+
+			fastopen_queue_tune(sk, val);
+		} else {
+			err = -EINVAL;
+		}
+		break;
+	case TCP_FASTOPEN_CONNECT:
+		if (val > 1 || val < 0) {
+			err = -EINVAL;
+		} else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
+			   TFO_CLIENT_ENABLE) {
+			if (sk->sk_state == TCP_CLOSE)
+				tp->fastopen_connect = val;
+			else
+				err = -EINVAL;
+		} else {
+			err = -EOPNOTSUPP;
+		}
+		break;
+	case TCP_FASTOPEN_NO_COOKIE:
+		if (val > 1 || val < 0)
+			err = -EINVAL;
+		else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+			err = -EINVAL;
+		else
+			tp->fastopen_no_cookie = val;
+		break;
+	case TCP_TIMESTAMP:
+		if (!tp->repair)
+			err = -EPERM;
+		else
+			WRITE_ONCE(tp->tsoffset, val - tcp_time_stamp_raw());
+		break;
+	case TCP_REPAIR_WINDOW:	/* consumes the raw optval/optlen, not val */
+		err = tcp_repair_set_window(tp, optval, optlen);
+		break;
+	case TCP_NOTSENT_LOWAT:
+		WRITE_ONCE(tp->notsent_lowat, val);
+		sk->sk_write_space(sk);
+		break;
+	case TCP_INQ:
+		if (val > 1 || val < 0)
+			err = -EINVAL;
+		else
+			tp->recvmsg_inq = val;
+		break;
+	case TCP_TX_DELAY:
+		if (val)
+			tcp_enable_tx_delay();
+		WRITE_ONCE(tp->tcp_tx_delay, val);
+		break;
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	sockopt_release_sock(sk);
+	return err;
+}
+
+/* setsockopt() entry point: SOL_TCP is handled here, other levels by af_ops. */
+int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+		   unsigned int optlen)
+{
+	if (level == SOL_TCP)
+		return do_tcp_setsockopt(sk, level, optname, optval, optlen);
+
+	/* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
+	return READ_ONCE(inet_csk(sk)->icsk_af_ops)->setsockopt(sk, level, optname,
+								 optval, optlen);
+}
+EXPORT_SYMBOL(tcp_setsockopt);
+
+/* Fill the three chrono-based time fields of @info from @tp's chrono state. */
+static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
+				      struct tcp_info *info)
+{
+	u64 usecs[__TCP_CHRONO_MAX], ticks, sum = 0;
+	enum tcp_chrono type;
+
+	for (type = TCP_CHRONO_BUSY; type < __TCP_CHRONO_MAX; type++) {
+		ticks = tp->chrono_stat[type - 1];
+		if (type == tp->chrono_type)
+			ticks += tcp_jiffies32 - tp->chrono_start;
+		usecs[type] = ticks * (USEC_PER_SEC / HZ);
+		sum += usecs[type];
+	}
+	info->tcpi_busy_time = sum;
+	info->tcpi_rwnd_limited = usecs[TCP_CHRONO_RWND_LIMITED];
+	info->tcpi_sndbuf_limited = usecs[TCP_CHRONO_SNDBUF_LIMITED];
+}
+
+/* Fill @info with the state of the TCP endpoint @sk, in API format.
+ * @info is left zeroed unless sk_type is SOCK_STREAM. A TCP_LISTEN socket gets
+ * the pacing rates, reordering and cwnd, plus its accept backlog figures.
+ */
+void tcp_get_info(struct sock *sk, struct tcp_info *info)
+{
+	const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	unsigned long pacing;
+	u64 delivery_rate;
+	u32 stamp;
+	bool slow;
+
+	memset(info, 0, sizeof(*info));
+	if (sk->sk_type != SOCK_STREAM)
+		return;
+
+	info->tcpi_state = inet_sk_state_load(sk);
+
+	/* Report meaningful fields for all TCP states, including listeners */
+	pacing = READ_ONCE(sk->sk_pacing_rate);
+	info->tcpi_pacing_rate = (pacing == ~0UL) ? ~0ULL : pacing;
+
+	pacing = READ_ONCE(sk->sk_max_pacing_rate);
+	info->tcpi_max_pacing_rate = (pacing == ~0UL) ? ~0ULL : pacing;
+
+	info->tcpi_reordering = tp->reordering;
+	info->tcpi_snd_cwnd = tcp_snd_cwnd(tp);
+
+	if (info->tcpi_state == TCP_LISTEN) {
+		/* listeners aliased fields :
+		 * tcpi_unacked -> Number of children ready for accept()
+		 * tcpi_sacked  -> max backlog
+		 */
+		info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
+		info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
+		return;
+	}
+
+	slow = lock_sock_fast(sk);
+
+	info->tcpi_ca_state = icsk->icsk_ca_state;
+	info->tcpi_retransmits = icsk->icsk_retransmits;
+	info->tcpi_probes = icsk->icsk_probes_out;
+	info->tcpi_backoff = icsk->icsk_backoff;
+
+	if (tp->rx_opt.tstamp_ok)
+		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+	if (tcp_is_sack(tp))
+		info->tcpi_options |= TCPI_OPT_SACK;
+	if (tp->rx_opt.wscale_ok) {
+		info->tcpi_options |= TCPI_OPT_WSCALE;
+		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
+		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
+	}
+
+	if (tp->ecn_flags & TCP_ECN_OK)
+		info->tcpi_options |= TCPI_OPT_ECN;
+	if (tp->ecn_flags & TCP_ECN_SEEN)
+		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
+	if (tp->syn_data_acked)
+		info->tcpi_options |= TCPI_OPT_SYN_DATA;
+
+	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
+	info->tcpi_ato = jiffies_to_usecs(min(icsk->icsk_ack.ato,
+					      tcp_delack_max(sk)));
+	info->tcpi_snd_mss = tp->mss_cache;
+	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
+
+	info->tcpi_unacked = tp->packets_out;
+	info->tcpi_sacked = tp->sacked_out;
+	info->tcpi_lost = tp->lost_out;
+	info->tcpi_retrans = tp->retrans_out;
+
+	stamp = tcp_jiffies32;
+	info->tcpi_last_data_sent = jiffies_to_msecs(stamp - tp->lsndtime);
+	info->tcpi_last_data_recv = jiffies_to_msecs(stamp - icsk->icsk_ack.lrcvtime);
+	info->tcpi_last_ack_recv = jiffies_to_msecs(stamp - tp->rcv_tstamp);
+
+	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
+	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
+	info->tcpi_rtt = tp->srtt_us >> 3;
+	info->tcpi_rttvar = tp->mdev_us >> 2;
+	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
+	info->tcpi_advmss = tp->advmss;
+
+	info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
+	info->tcpi_rcv_space = tp->rcvq_space.space;
+
+	info->tcpi_total_retrans = tp->total_retrans;
+
+	info->tcpi_bytes_acked = tp->bytes_acked;
+	info->tcpi_bytes_received = tp->bytes_received;
+	info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
+	tcp_get_info_chrono_stats(tp, info);
+
+	info->tcpi_segs_out = tp->segs_out;
+
+	/* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */
+	info->tcpi_segs_in = READ_ONCE(tp->segs_in);
+	info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
+
+	info->tcpi_min_rtt = tcp_min_rtt(tp);
+	info->tcpi_data_segs_out = tp->data_segs_out;
+
+	info->tcpi_delivery_rate_app_limited = !!tp->rate_app_limited;
+	delivery_rate = tcp_compute_delivery_rate(tp);
+	if (delivery_rate)
+		info->tcpi_delivery_rate = delivery_rate;
+	info->tcpi_delivered = tp->delivered;
+	info->tcpi_delivered_ce = tp->delivered_ce;
+	info->tcpi_bytes_sent = tp->bytes_sent;
+	info->tcpi_bytes_retrans = tp->bytes_retrans;
+	info->tcpi_dsack_dups = tp->dsack_dups;
+	info->tcpi_reord_seen = tp->reord_seen;
+	info->tcpi_rcv_ooopack = tp->rcv_ooopack;
+	info->tcpi_snd_wnd = tp->snd_wnd;
+	info->tcpi_rcv_wnd = tp->rcv_wnd;
+	info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
+	info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
+	unlock_sock_fast(sk, slow);
+}
+EXPORT_SYMBOL_GPL(tcp_get_info);
+
+static size_t tcp_opt_stats_get_size(void)
+{
+	return	/* one term per attribute put by tcp_get_timestamping_opt_stats() */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
+		nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
+		nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
+		nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
+		nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
+		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
+		nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL (only put when ack_skb is set) */
+		nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */
+		0;	/* lets every attribute line above end in '+' */
+}
+
+/* TTL (IPv4) or hop limit (IPv6) of @skb's IP header; 0 for other protocols. */
+static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IPV6))
+		return ipv6_hdr(skb)->hop_limit;
+	if (skb->protocol != htons(ETH_P_IP))
+		return 0;
+
+	return ip_hdr(skb)->ttl;
+}
+
+/* Build the netlink attributes (TCP_NLA_*) carrying TCP statistics of @sk.
+ * @orig_skb supplies TCP_NLA_EDT; @ack_skb, when not NULL, supplies TCP_NLA_TTL.
+ * Returns a newly allocated skb, or NULL if the allocation failed.
+ */
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
+					       const struct sk_buff *orig_skb,
+					       const struct sk_buff *ack_skb)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_info info;
+	unsigned long rate;
+	struct sk_buff *msg;
+
+	msg = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
+	if (!msg)
+		return NULL;
+
+	tcp_get_info_chrono_stats(tp, &info);
+	nla_put_u64_64bit(msg, TCP_NLA_BUSY, info.tcpi_busy_time, TCP_NLA_PAD);
+	nla_put_u64_64bit(msg, TCP_NLA_RWND_LIMITED,
+			  info.tcpi_rwnd_limited, TCP_NLA_PAD);
+	nla_put_u64_64bit(msg, TCP_NLA_SNDBUF_LIMITED,
+			  info.tcpi_sndbuf_limited, TCP_NLA_PAD);
+	nla_put_u64_64bit(msg, TCP_NLA_DATA_SEGS_OUT,
+			  tp->data_segs_out, TCP_NLA_PAD);
+	nla_put_u64_64bit(msg, TCP_NLA_TOTAL_RETRANS,
+			  tp->total_retrans, TCP_NLA_PAD);
+
+	rate = READ_ONCE(sk->sk_pacing_rate);
+	nla_put_u64_64bit(msg, TCP_NLA_PACING_RATE,
+			  rate == ~0UL ? ~0ULL : rate, TCP_NLA_PAD);
+	nla_put_u64_64bit(msg, TCP_NLA_DELIVERY_RATE,
+			  tcp_compute_delivery_rate(tp), TCP_NLA_PAD);
+
+	nla_put_u32(msg, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
+	nla_put_u32(msg, TCP_NLA_REORDERING, tp->reordering);
+	nla_put_u32(msg, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
+
+	nla_put_u8(msg, TCP_NLA_RECUR_RETRANS, icsk->icsk_retransmits);
+	nla_put_u8(msg, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
+	nla_put_u32(msg, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
+	nla_put_u32(msg, TCP_NLA_DELIVERED, tp->delivered);
+	nla_put_u32(msg, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
+
+	nla_put_u32(msg, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
+	nla_put_u8(msg, TCP_NLA_CA_STATE, icsk->icsk_ca_state);
+
+	nla_put_u64_64bit(msg, TCP_NLA_BYTES_SENT, tp->bytes_sent, TCP_NLA_PAD);
+	nla_put_u64_64bit(msg, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
+			  TCP_NLA_PAD);
+	nla_put_u32(msg, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
+	nla_put_u32(msg, TCP_NLA_REORD_SEEN, tp->reord_seen);
+	nla_put_u32(msg, TCP_NLA_SRTT, tp->srtt_us >> 3);
+	nla_put_u16(msg, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
+	nla_put_u32(msg, TCP_NLA_BYTES_NOTSENT,
+		    max_t(int, 0, tp->write_seq - tp->snd_nxt));
+	nla_put_u64_64bit(msg, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
+			  TCP_NLA_PAD);
+	if (ack_skb)
+		nla_put_u8(msg, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb));
+
+	nla_put_u32(msg, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash);
+	return msg;
+}
+
+/* Read TCP-level option @optname into @optval; the int at @optlen is in/out.
+ * Returns 0 or a negative errno; a negative user length is rejected (-EINVAL).
+ */
+int do_tcp_getsockopt(struct sock *sk, int level,
+		      int optname, sockptr_t optval, sockptr_t optlen)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	int val, len;
+
+	if (copy_from_sockptr(&len, optlen, sizeof(int)))
+		return -EFAULT;
+	if (len < 0)	/* must run before min_t() turns len unsigned */
+		return -EINVAL;
+	len = min_t(unsigned int, len, sizeof(int));
+	switch (optname) {
+	case TCP_MAXSEG:
+		val = tp->mss_cache;
+		if (tp->rx_opt.user_mss &&
+		    ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+			val = tp->rx_opt.user_mss;
+		if (tp->repair)
+			val = tp->rx_opt.mss_clamp;
+		break;
+	case TCP_NODELAY:
+		val = !!(tp->nonagle&TCP_NAGLE_OFF);
+		break;
+	case TCP_CORK:
+		val = !!(tp->nonagle&TCP_NAGLE_CORK);
+		break;
+	case TCP_KEEPIDLE:
+		val = keepalive_time_when(tp) / HZ;
+		break;
+	case TCP_KEEPINTVL:
+		val = keepalive_intvl_when(tp) / HZ;
+		break;
+	case TCP_KEEPCNT:
+		val = keepalive_probes(tp);
+		break;
+	case TCP_SYNCNT:
+		val = READ_ONCE(icsk->icsk_syn_retries) ? :
+			READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
+		break;
+	case TCP_LINGER2:
+		val = READ_ONCE(tp->linger2);
+		if (val >= 0)
+			val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
+		break;
+	case TCP_DEFER_ACCEPT:
+		val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept);
+		val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ,
+				      TCP_RTO_MAX / HZ);
+		break;
+	case TCP_WINDOW_CLAMP:
+		val = tp->window_clamp;
+		break;
+	case TCP_INFO: {	/* struct reply: re-reads the unclamped user length */
+		struct tcp_info info;
+
+		if (copy_from_sockptr(&len, optlen, sizeof(int)))
+			return -EFAULT;
+
+		tcp_get_info(sk, &info);
+
+		len = min_t(unsigned int, len, sizeof(info));
+		if (copy_to_sockptr(optlen, &len, sizeof(int)))
+			return -EFAULT;
+		if (copy_to_sockptr(optval, &info, len))
+			return -EFAULT;
+		return 0;
+	}
+	case TCP_CC_INFO: {
+		const struct tcp_congestion_ops *ca_ops;
+		union tcp_cc_info info;
+		size_t sz = 0;
+		int attr;
+
+		if (copy_from_sockptr(&len, optlen, sizeof(int)))
+			return -EFAULT;
+
+		ca_ops = icsk->icsk_ca_ops;
+		if (ca_ops && ca_ops->get_info)
+			sz = ca_ops->get_info(sk, ~0U, &attr, &info);
+
+		len = min_t(unsigned int, len, sz);
+		if (copy_to_sockptr(optlen, &len, sizeof(int)))
+			return -EFAULT;
+		if (copy_to_sockptr(optval, &info, len))
+			return -EFAULT;
+		return 0;
+	}
+	case TCP_QUICKACK:
+		val = !inet_csk_in_pingpong_mode(sk);
+		break;
+
+	case TCP_CONGESTION:
+		if (copy_from_sockptr(&len, optlen, sizeof(int)))
+			return -EFAULT;
+		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
+		if (copy_to_sockptr(optlen, &len, sizeof(int)))
+			return -EFAULT;
+		if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len))
+			return -EFAULT;
+		return 0;
+
+	case TCP_ULP:
+		if (copy_from_sockptr(&len, optlen, sizeof(int)))
+			return -EFAULT;
+		len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
+		if (!icsk->icsk_ulp_ops) {
+			len = 0;
+			if (copy_to_sockptr(optlen, &len, sizeof(int)))
+				return -EFAULT;
+			return 0;
+		}
+		if (copy_to_sockptr(optlen, &len, sizeof(int)))
+			return -EFAULT;
+		if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len))
+			return -EFAULT;
+		return 0;
+
+	case TCP_FASTOPEN_KEY: {
+		u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
+		unsigned int key_len;
+
+		if (copy_from_sockptr(&len, optlen, sizeof(int)))
+			return -EFAULT;
+
+		key_len = tcp_fastopen_get_cipher(net, icsk, key) *
+				TCP_FASTOPEN_KEY_LENGTH;
+		len = min_t(unsigned int, len, key_len);
+		if (copy_to_sockptr(optlen, &len, sizeof(int)))
+			return -EFAULT;
+		if (copy_to_sockptr(optval, key, len))
+			return -EFAULT;
+		return 0;
+	}
+	case TCP_THIN_LINEAR_TIMEOUTS:
+		val = tp->thin_lto;
+		break;
+
+	case TCP_THIN_DUPACK:
+		val = 0;	/* always reported as 0 */
+		break;
+
+	case TCP_REPAIR:
+		val = tp->repair;
+		break;
+
+	case TCP_REPAIR_QUEUE:
+		if (tp->repair)
+			val = tp->repair_queue;
+		else
+			return -EINVAL;
+		break;
+
+	case TCP_REPAIR_WINDOW: {
+		struct tcp_repair_window opt;
+
+		if (copy_from_sockptr(&len, optlen, sizeof(int)))
+			return -EFAULT;
+
+		if (len != sizeof(opt))
+			return -EINVAL;
+
+		if (!tp->repair)
+			return -EPERM;
+
+		opt.snd_wl1	= tp->snd_wl1;
+		opt.snd_wnd	= tp->snd_wnd;
+		opt.max_window	= tp->max_window;
+		opt.rcv_wnd	= tp->rcv_wnd;
+		opt.rcv_wup	= tp->rcv_wup;
+
+		if (copy_to_sockptr(optval, &opt, len))
+			return -EFAULT;
+		return 0;
+	}
+	case TCP_QUEUE_SEQ:
+		if (tp->repair_queue == TCP_SEND_QUEUE)
+			val = tp->write_seq;
+		else if (tp->repair_queue == TCP_RECV_QUEUE)
+			val = tp->rcv_nxt;
+		else
+			return -EINVAL;
+		break;
+
+	case TCP_USER_TIMEOUT:
+		val = READ_ONCE(icsk->icsk_user_timeout);
+		break;
+
+	case TCP_FASTOPEN:
+		val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen);
+		break;
+
+	case TCP_FASTOPEN_CONNECT:
+		val = tp->fastopen_connect;
+		break;
+
+	case TCP_FASTOPEN_NO_COOKIE:
+		val = tp->fastopen_no_cookie;
+		break;
+
+	case TCP_TX_DELAY:
+		val = READ_ONCE(tp->tcp_tx_delay);
+		break;
+
+	case TCP_TIMESTAMP:
+		val = tcp_time_stamp_raw() + READ_ONCE(tp->tsoffset);
+		break;
+	case TCP_NOTSENT_LOWAT:
+		val = READ_ONCE(tp->notsent_lowat);
+		break;
+	case TCP_INQ:
+		val = tp->recvmsg_inq;
+		break;
+	case TCP_SAVE_SYN:
+		val = tp->save_syn;
+		break;
+	case TCP_SAVED_SYN: {
+		if (copy_from_sockptr(&len, optlen, sizeof(int)))
+			return -EFAULT;
+
+		sockopt_lock_sock(sk);
+		if (tp->saved_syn) {
+			if (len < tcp_saved_syn_len(tp->saved_syn)) {
+				len = tcp_saved_syn_len(tp->saved_syn);
+				if (copy_to_sockptr(optlen, &len, sizeof(int))) {
+					sockopt_release_sock(sk);
+					return -EFAULT;
+				}
+				sockopt_release_sock(sk);
+				return -EINVAL;
+			}
+			len = tcp_saved_syn_len(tp->saved_syn);
+			if (copy_to_sockptr(optlen, &len, sizeof(int))) {
+				sockopt_release_sock(sk);
+				return -EFAULT;
+			}
+			if (copy_to_sockptr(optval, tp->saved_syn->data, len)) {
+				sockopt_release_sock(sk);
+				return -EFAULT;
+			}
+			tcp_saved_syn_free(tp);	/* freed once it has been copied out */
+			sockopt_release_sock(sk);
+		} else {
+			sockopt_release_sock(sk);
+			len = 0;
+			if (copy_to_sockptr(optlen, &len, sizeof(int)))
+				return -EFAULT;
+		}
+		return 0;
+	}
+#ifdef CONFIG_MMU
+	case TCP_ZEROCOPY_RECEIVE: {
+		struct scm_timestamping_internal tss;
+		struct tcp_zerocopy_receive zc = {};
+		int err;
+
+		if (copy_from_sockptr(&len, optlen, sizeof(int)))
+			return -EFAULT;
+		if (len < 0 ||
+		    len < offsetofend(struct tcp_zerocopy_receive, length))
+			return -EINVAL;
+		if (unlikely(len > sizeof(zc))) {
+			err = check_zeroed_sockptr(optval, sizeof(zc),
+						   len - sizeof(zc));
+			if (err < 1)
+				return err == 0 ? -EINVAL : err;
+			len = sizeof(zc);
+			if (copy_to_sockptr(optlen, &len, sizeof(int)))
+				return -EFAULT;
+		}
+		if (copy_from_sockptr(&zc, optval, len))
+			return -EFAULT;
+		if (zc.reserved)
+			return -EINVAL;
+		if (zc.msg_flags &  ~(TCP_VALID_ZC_MSG_FLAGS))
+			return -EINVAL;
+		sockopt_lock_sock(sk);
+		err = tcp_zerocopy_receive(sk, &zc, &tss);
+		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
+							  &zc, &len, err);
+		sockopt_release_sock(sk);
+		if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
+			goto zerocopy_rcv_cmsg;
+		switch (len) {
+		case offsetofend(struct tcp_zerocopy_receive, msg_flags):
+			goto zerocopy_rcv_cmsg;
+		case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
+		case offsetofend(struct tcp_zerocopy_receive, msg_control):
+		case offsetofend(struct tcp_zerocopy_receive, flags):
+		case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
+		case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
+		case offsetofend(struct tcp_zerocopy_receive, err):
+			goto zerocopy_rcv_sk_err;
+		case offsetofend(struct tcp_zerocopy_receive, inq):
+			goto zerocopy_rcv_inq;
+		case offsetofend(struct tcp_zerocopy_receive, length):
+		default:
+			goto zerocopy_rcv_out;
+		}
+zerocopy_rcv_cmsg:
+		if (zc.msg_flags & TCP_CMSG_TS)
+			tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
+		else
+			zc.msg_flags = 0;
+zerocopy_rcv_sk_err:
+		if (!err)
+			zc.err = sock_error(sk);
+zerocopy_rcv_inq:
+		zc.inq = tcp_inq_hint(sk);
+zerocopy_rcv_out:
+		if (!err && copy_to_sockptr(optval, &zc, len))
+			err = -EFAULT;
+		return err;
+	}
+#endif
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (copy_to_sockptr(optlen, &len, sizeof(int)))	/* int options: len <= sizeof(int) */
+		return -EFAULT;
+	if (copy_to_sockptr(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+
+/* Returns true only for SOL_TCP/TCP_ZEROCOPY_RECEIVE: do_tcp_getsockopt()
+ * runs BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN itself for that option, under the
+ * socket lock it already holds.
+ */
+bool tcp_bpf_bypass_getsockopt(int level, int optname)
+{
+	if (level != SOL_TCP)
+		return false;
+	return optname == TCP_ZEROCOPY_RECEIVE;
+}
+EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
+
+/* getsockopt() entry point: SOL_TCP is handled here, other levels by af_ops. */
+int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
+		   int __user *optlen)
+{
+	if (level == SOL_TCP)
+		return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval),
+					 USER_SOCKPTR(optlen));
+
+	/* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
+	return READ_ONCE(inet_csk(sk)->icsk_af_ops)->getsockopt(sk, level, optname,
+								 optval, optlen);
+}
+EXPORT_SYMBOL(tcp_getsockopt);
+
+#ifdef CONFIG_TCP_MD5SIG
+static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);	/* per-CPU scratch buffer + md5_req */
+static DEFINE_MUTEX(tcp_md5sig_mutex);	/* held around __tcp_alloc_md5sig_pool() */
+static bool tcp_md5sig_pool_populated = false;	/* set once every possible CPU is filled in */
+
+/* Allocate the "md5" ahash and give every possible CPU a scratch buffer and
+ * a hash request. tcp_md5sig_pool_populated is set only when all of that
+ * succeeded; on failure, whatever was allocated so far is kept.
+ */
+static void __tcp_alloc_md5sig_pool(void)
+{
+	const size_t scratch_sz = sizeof(union tcp_md5sum_block) +
+				  sizeof(struct tcphdr);
+	struct crypto_ahash *tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
+	int cpu;
+
+	if (IS_ERR(tfm))
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct tcp_md5sig_pool *pool = per_cpu_ptr(&tcp_md5sig_pool, cpu);
+		struct ahash_request *req;
+
+		if (!pool->scratch) {
+			pool->scratch = kmalloc_node(scratch_sz, GFP_KERNEL,
+						     cpu_to_node(cpu));
+			if (!pool->scratch)
+				return;
+		}
+		if (pool->md5_req)
+			continue;
+
+		req = ahash_request_alloc(tfm, GFP_KERNEL);
+		if (!req)
+			return;
+		ahash_request_set_callback(req, 0, NULL, NULL);
+		pool->md5_req = req;
+	}
+	/* Commit the per-CPU writes to memory before the flag is set;
+	 * tcp_get_md5sig_pool() issues the matching smp_rmb().
+	 */
+	smp_wmb();
+	/* Paired with READ_ONCE() in tcp_alloc_md5sig_pool() and
+	 * tcp_get_md5sig_pool().
+	 */
+	WRITE_ONCE(tcp_md5sig_pool_populated, true);
+}
+
+/* Populate the MD5 pool on first use; returns whether it is populated. */
+bool tcp_alloc_md5sig_pool(void)
+{
+	/* Lockless reads pair with WRITE_ONCE() in __tcp_alloc_md5sig_pool() */
+	bool populated = READ_ONCE(tcp_md5sig_pool_populated);
+
+	if (unlikely(!populated)) {
+		mutex_lock(&tcp_md5sig_mutex);
+		if (!tcp_md5sig_pool_populated)	/* re-check under the mutex */
+			__tcp_alloc_md5sig_pool();
+		mutex_unlock(&tcp_md5sig_mutex);
+	}
+	return READ_ONCE(tcp_md5sig_pool_populated);
+}
+EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
+
+
+/**
+ * tcp_get_md5sig_pool - get this CPU's md5sig pool
+ *
+ * Return: the per-CPU pool with BH left disabled, so that no other thread
+ * or softirq handling will try to get the same context, or NULL (with BH
+ * enabled again) when the pool has not been populated.
+ */
+struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
+{
+	local_bh_disable();
+
+	/* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
+	if (!READ_ONCE(tcp_md5sig_pool_populated)) {
+		local_bh_enable();
+		return NULL;
+	}
+	/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
+	smp_rmb();
+	return this_cpu_ptr(&tcp_md5sig_pool);
+}
+EXPORT_SYMBOL(tcp_get_md5sig_pool);
+
+/* Hash the data of @skb that follows @header_len bytes of TCP header: linear
+ * part, page frags, then frag list. Returns 0, or 1 if a hash update failed.
+ */
+int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
+			  const struct sk_buff *skb, unsigned int header_len)
+{
+	const struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct ahash_request *req = hp->md5_req;
+	unsigned int linear_len = 0, n;
+	struct sk_buff *child;
+	struct scatterlist sg;
+
+	if (skb_headlen(skb) > header_len)
+		linear_len = skb_headlen(skb) - header_len;
+	sg_init_table(&sg, 1);
+	sg_set_buf(&sg, (u8 *)tcp_hdr(skb) + header_len, linear_len);
+	ahash_request_set_crypt(req, &sg, NULL, linear_len);
+	if (crypto_ahash_update(req))
+		return 1;
+
+	for (n = 0; n < shinfo->nr_frags; n++) {
+		const skb_frag_t *frag = &shinfo->frags[n];
+		unsigned int off = skb_frag_off(frag);
+		unsigned int len = skb_frag_size(frag);
+
+		sg_set_page(&sg, skb_frag_page(frag) + (off >> PAGE_SHIFT), len,
+			    offset_in_page(off));
+		ahash_request_set_crypt(req, &sg, NULL, len);
+		if (crypto_ahash_update(req))
+			return 1;
+	}
+
+	skb_walk_frags(skb, child)
+		if (tcp_md5_hash_skb_data(hp, child, 0))
+			return 1;
+	return 0;
+}
+EXPORT_SYMBOL(tcp_md5_hash_skb_data);
+
+/* Feed @key's bytes into @hp's hash request; returns crypto_ahash_update()'s result. */
+int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
+{
+	struct ahash_request *req = hp->md5_req;
+	const u8 len = READ_ONCE(key->keylen);	/* paired with WRITE_ONCE() in tcp_md5_do_add */
+	struct scatterlist sg;
+
+	sg_init_one(&sg, key->key, len);
+	ahash_request_set_crypt(req, &sg, NULL, len);
+	return data_race(crypto_ahash_update(req));	/* tcp_md5_do_add() might change key->key under us */
+}
+EXPORT_SYMBOL(tcp_md5_hash_key);
+
+/* Check the TCP MD5 signature option of an incoming segment against the key
+ * found for it. This gets called for each TCP segment that arrives, so we
+ * want to be efficient. Called with rcu_read_lock().
+ *
+ * @saddr is used for the key lookup; @saddr and @daddr appear in the log
+ * message. @family selects the IPv4 or the af_specific hash routine.
+ *
+ * We have 3 drop cases:
+ * o No MD5 hash and one expected.
+ * o MD5 hash and we're not expecting one.
+ * o MD5 hash and its wrong.
+ * Anything else returns SKB_NOT_DROPPED_YET.
+ */
+enum skb_drop_reason
+tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
+		     const void *saddr, const void *daddr,
+		     int family, int dif, int sdif)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	/* sdif set, means packet ingressed via a device
+	 * in an L3 domain and dif is set to the l3mdev
+	 */
+	const int l3index = sdif ? dif : 0;
+	struct tcp_md5sig_key *key;
+	const __u8 *sig;
+	u8 computed[16];
+	int failed;
+
+	key = tcp_md5_do_lookup(sk, l3index, saddr, family);
+	sig = tcp_parse_md5sig_option(th);
+
+	/* We've parsed the options - do we have a hash? */
+	if (!sig) {
+		if (!key)
+			return SKB_NOT_DROPPED_YET;
+
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+		return SKB_DROP_REASON_TCP_MD5NOTFOUND;
+	}
+
+	if (!key) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+		return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
+	}
+
+	/* Check the signature.
+	 * To support dual stack listeners, we need to handle
+	 * IPv4-mapped case.
+	 */
+	if (family != AF_INET)
+		failed = tcp_sk(sk)->af_specific->calc_md5_hash(computed, key,
+								 NULL, skb);
+	else
+		failed = tcp_v4_md5_hash_skb(computed, key, NULL, skb);
+
+	if (!failed && memcmp(sig, computed, 16) == 0)
+		return SKB_NOT_DROPPED_YET;
+
+	/* the hash could not be computed, or differs from the one received */
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
+	if (family == AF_INET) {
+		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
+				     saddr, ntohs(th->source),
+				     daddr, ntohs(th->dest),
+				     failed ? " tcp_v4_calc_md5_hash failed"
+				     : "", l3index);
+	} else {
+		net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n",
+				     failed ? "failed" : "mismatch",
+				     saddr, ntohs(th->source),
+				     daddr, ntohs(th->dest), l3index);
+	}
+
+	return SKB_DROP_REASON_TCP_MD5FAILURE;
+}
+EXPORT_SYMBOL(tcp_inbound_md5_hash);
+
+#endif
+
+/* Put @sk into TCP_CLOSE, then notify its owner or, if SOCK_DEAD, destroy it. */
+void tcp_done(struct sock *sk)
+{
+	struct request_sock *fastopen_req;
+
+	/* lockdep_sock_is_held(sk) can not be used here: we might be
+	 * called with a new socket, after inet_csk_prepare_forced_close()
+	 * has been called.
+	 */
+	fastopen_req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
+
+	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+
+	tcp_set_state(sk, TCP_CLOSE);
+	tcp_clear_xmit_timers(sk);
+	if (fastopen_req)
+		reqsk_fastopen_remove(sk, fastopen_req, false);
+	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
+
+	if (sock_flag(sk, SOCK_DEAD))
+		inet_csk_destroy_sock(sk);
+	else
+		sk->sk_state_change(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_done);
+
+int tcp_abort(struct sock *sk, int err)
+{
+	int state = inet_sk_state_load(sk);
+
+	if (state == TCP_NEW_SYN_RECV) {
+		struct request_sock *req = inet_reqsk(sk);
+
+		local_bh_disable();
+		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+		local_bh_enable();
+		return 0;
+	}
+	if (state == TCP_TIME_WAIT) {
+		struct inet_timewait_sock *tw = inet_twsk(sk);
+
+		refcount_inc(&tw->tw_refcnt);
+		local_bh_disable();
+		inet_twsk_deschedule_put(tw);
+		local_bh_enable();
+		return 0;
+	}
+
+	/* BPF context ensures sock locking. */
+	if (!has_current_bpf_ctx())
+		/* Don't race with userspace socket closes such as tcp_close. */
+		lock_sock(sk);
+
+	if (sk->sk_state == TCP_LISTEN) {
+		tcp_set_state(sk, TCP_CLOSE);
+		inet_csk_listen_stop(sk);
+	}
+
+	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
+	local_bh_disable();
+	bh_lock_sock(sk);
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		WRITE_ONCE(sk->sk_err, err);
+		/* This barrier is coupled with smp_rmb() in tcp_poll() */
+		smp_wmb();
+		sk_error_report(sk);
+		if (tcp_need_reset(sk->sk_state))
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+		tcp_done(sk);
+	}
+
+	bh_unlock_sock(sk);
+	local_bh_enable();
+	tcp_write_queue_purge(sk);
+	if (!has_current_bpf_ctx())
+		release_sock(sk);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_abort);
+
+extern struct tcp_congestion_ops tcp_reno;
+
+static __initdata unsigned long thash_entries;
+static int __init set_thash_entries(char *str)
+{
+	ssize_t ret;
+
+	if (!str)
+		return 0;
+
+	ret = kstrtoul(str, 0, &thash_entries);
+	if (ret)
+		return 0;
+
+	return 1;
+}
+__setup("thash_entries=", set_thash_entries);
+
+static void __init tcp_init_mem(void)
+{
+	unsigned long limit = nr_free_buffer_pages() / 16;
+
+	limit = max(limit, 128UL);
+	sysctl_tcp_mem[0] = limit / 4 * 3;		/* 4.68 % */
+	sysctl_tcp_mem[1] = limit;			/* 6.25 % */
+	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;	/* 9.37 % */
+}
+
+void __init tcp_init(void)
+{
+	int max_rshare, max_wshare, cnt;
+	unsigned long limit;
+	unsigned int i;
+
+	BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
+	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
+		     sizeof_field(struct sk_buff, cb));
+
+	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
+
+	timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
+	mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+
+	inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+			    thash_entries, 21,  /* one slot per 2 MB*/
+			    0, 64 * 1024);
+	tcp_hashinfo.bind_bucket_cachep =
+		kmem_cache_create("tcp_bind_bucket",
+				  sizeof(struct inet_bind_bucket), 0,
+				  SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+				  SLAB_ACCOUNT,
+				  NULL);
+	tcp_hashinfo.bind2_bucket_cachep =
+		kmem_cache_create("tcp_bind2_bucket",
+				  sizeof(struct inet_bind2_bucket), 0,
+				  SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+				  SLAB_ACCOUNT,
+				  NULL);
+
+	/* Size and allocate the main established and bind bucket
+	 * hash tables.
+	 *
+	 * The methodology is similar to that of the buffer cache.
+	 */
+	tcp_hashinfo.ehash =
+		alloc_large_system_hash("TCP established",
+					sizeof(struct inet_ehash_bucket),
+					thash_entries,
+					17, /* one slot per 128 KB of memory */
+					0,
+					NULL,
+					&tcp_hashinfo.ehash_mask,
+					0,
+					thash_entries ? 0 : 512 * 1024);
+	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
+		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
+
+	if (inet_ehash_locks_alloc(&tcp_hashinfo))
+		panic("TCP: failed to alloc ehash_locks");
+	tcp_hashinfo.bhash =
+		alloc_large_system_hash("TCP bind",
+					2 * sizeof(struct inet_bind_hashbucket),
+					tcp_hashinfo.ehash_mask + 1,
+					17, /* one slot per 128 KB of memory */
+					0,
+					&tcp_hashinfo.bhash_size,
+					NULL,
+					0,
+					64 * 1024);
+	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
+	tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size;
+	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
+		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
+		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
+		spin_lock_init(&tcp_hashinfo.bhash2[i].lock);
+		INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
+	}
+
+	tcp_hashinfo.pernet = false;
+
+	cnt = tcp_hashinfo.ehash_mask + 1;
+	sysctl_tcp_max_orphans = cnt / 2;
+
+	tcp_init_mem();
+	/* Set per-socket limits to no more than 1/128 the pressure threshold */
+	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
+	max_wshare = min(4UL*1024*1024, limit);
+	max_rshare = min(6UL*1024*1024, limit);
+
+	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
+	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
+	init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
+
+	init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
+	init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
+	init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
+
+	pr_info("Hash tables configured (established %u bind %u)\n",
+		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+
+	tcp_v4_init();
+	tcp_metrics_init();
+	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
+	tcp_tasklet_init();
+	mptcp_init();
+}
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
new file mode 100644
index 0000000000..146792cd26
--- /dev/null
+++ b/net/ipv4/tcp_bbr.c
@@ -0,0 +1,1202 @@
+/* Bottleneck Bandwidth and RTT (BBR) congestion control
+ *
+ * BBR congestion control computes the sending rate based on the delivery
+ * rate (throughput) estimated from ACKs. In a nutshell:
+ *
+ *   On each ACK, update our model of the network path:
+ *      bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
+ *      min_rtt = windowed_min(rtt, 10 seconds)
+ *   pacing_rate = pacing_gain * bottleneck_bandwidth
+ *   cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
+ *
+ * The core algorithm does not react directly to packet losses or delays,
+ * although BBR may adjust the size of next send per ACK when loss is
+ * observed, or adjust the sending rate if it estimates there is a
+ * traffic policer, in order to keep the drop rate reasonable.
+ *
+ * Here is a state transition diagram for BBR:
+ *
+ *             |
+ *             V
+ *    +---> STARTUP  ----+
+ *    |        |         |
+ *    |        V         |
+ *    |      DRAIN   ----+
+ *    |        |         |
+ *    |        V         |
+ *    +---> PROBE_BW ----+
+ *    |      ^    |      |
+ *    |      |    |      |
+ *    |      +----+      |
+ *    |                  |
+ *    +---- PROBE_RTT <--+
+ *
+ * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
+ * When it estimates the pipe is full, it enters DRAIN to drain the queue.
+ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
+ * A long-lived BBR flow spends the vast majority of its time remaining
+ * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
+ * in a fair manner, with a small, bounded queue. *If* a flow has been
+ * continuously sending for the entire min_rtt window, and hasn't seen an RTT
+ * sample that matches or decreases its min_rtt estimate for 10 seconds, then
+ * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
+ * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
+ * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
+ * otherwise we enter STARTUP to try to fill the pipe.
+ *
+ * BBR is described in detail in:
+ *   "BBR: Congestion-Based Congestion Control",
+ *   Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
+ *   Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
+ *
+ * There is a public e-mail list for discussing BBR development and testing:
+ *   https://groups.google.com/forum/#!forum/bbr-dev
+ *
+ * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * otherwise TCP stack falls back to an internal pacing using one high
+ * resolution timer per TCP socket and may use more resources.
+ */
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include <linux/inet.h>
+#include <linux/random.h>
+#include <linux/win_minmax.h>
+
+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
+ * Since the minimum window is >=4 packets, the lower bound isn't
+ * an issue. The upper bound isn't an issue with existing technologies.
+ */
+#define BW_SCALE 24
+#define BW_UNIT (1 << BW_SCALE)
+
+#define BBR_SCALE 8	/* scaling factor for fractions in BBR (e.g. gains) */
+#define BBR_UNIT (1 << BBR_SCALE)
+
+/* BBR has the following modes for deciding how fast to send: */
+enum bbr_mode {
+	BBR_STARTUP,	/* ramp up sending rate rapidly to fill pipe */
+	BBR_DRAIN,	/* drain any queue created during startup */
+	BBR_PROBE_BW,	/* discover, share bw: pace around estimated bw */
+	BBR_PROBE_RTT,	/* cut inflight to min to probe min_rtt */
+};
+
+/* BBR congestion control block */
+struct bbr {
+	u32	min_rtt_us;	        /* min RTT in min_rtt_win_sec window */
+	u32	min_rtt_stamp;	        /* timestamp of min_rtt_us */
+	u32	probe_rtt_done_stamp;   /* end time for BBR_PROBE_RTT mode */
+	struct minmax bw;	/* Max recent delivery rate in pkts/uS << 24 */
+	u32	rtt_cnt;	    /* count of packet-timed rounds elapsed */
+	u32     next_rtt_delivered; /* scb->tx.delivered at end of round */
+	u64	cycle_mstamp;	     /* time of this cycle phase start */
+	u32     mode:3,		     /* current bbr_mode in state machine */
+		prev_ca_state:3,     /* CA state on previous ACK */
+		packet_conservation:1,  /* use packet conservation? */
+		round_start:1,	     /* start of packet-timed tx->ack round? */
+		idle_restart:1,	     /* restarting after idle? */
+		probe_rtt_round_done:1,  /* a BBR_PROBE_RTT round at 4 pkts? */
+		unused:13,
+		lt_is_sampling:1,    /* taking long-term ("LT") samples now? */
+		lt_rtt_cnt:7,	     /* round trips in long-term interval */
+		lt_use_bw:1;	     /* use lt_bw as our bw estimate? */
+	u32	lt_bw;		     /* LT est delivery rate in pkts/uS << 24 */
+	u32	lt_last_delivered;   /* LT intvl start: tp->delivered */
+	u32	lt_last_stamp;	     /* LT intvl start: tp->delivered_mstamp, in ms */
+	u32	lt_last_lost;	     /* LT intvl start: tp->lost */
+	u32	pacing_gain:10,	/* current gain for setting pacing rate */
+		cwnd_gain:10,	/* current gain for setting cwnd */
+		full_bw_reached:1,   /* reached full bw in Startup? */
+		full_bw_cnt:2,	/* number of rounds without large bw gains */
+		cycle_idx:3,	/* current index in pacing_gain cycle array */
+		has_seen_rtt:1, /* have we seen an RTT sample yet? */
+		unused_b:5;
+	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
+	u32	full_bw;	/* recent bw, to estimate if pipe is full */
+
+	/* For tracking ACK aggregation: */
+	u64	ack_epoch_mstamp;	/* start of ACK sampling epoch */
+	u16	extra_acked[2];		/* max excess data ACKed in epoch */
+	u32	ack_epoch_acked:20,	/* packets (S)ACKed in sampling epoch */
+		extra_acked_win_rtts:5,	/* age of extra_acked, in round trips */
+		extra_acked_win_idx:1,	/* current index in extra_acked array */
+		unused_c:6;
+};
+
+#define CYCLE_LEN	8	/* number of phases in a pacing gain cycle */
+
+/* Window length of bw filter (in rounds): */
+static const int bbr_bw_rtts = CYCLE_LEN + 2;
+/* Window length of min_rtt filter (in sec): */
+static const u32 bbr_min_rtt_win_sec = 10;
+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
+static const u32 bbr_probe_rtt_mode_ms = 200;
+/* Skip TSO below the following bandwidth (bits/sec): */
+static const int bbr_min_tso_rate = 1200000;
+
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
+ * In order to help drive the network toward lower queues and low latency while
+ * maintaining high utilization, the average pacing rate aims to be slightly
+ * lower than the estimated bandwidth. This is an important aspect of the
+ * design.
+ */
+static const int bbr_pacing_margin_percent = 1;
+
+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+ * that will allow a smoothly increasing pacing rate that will double each RTT
+ * and send the same number of packets per RTT that an un-paced, slow-starting
+ * Reno or CUBIC flow would:
+ */
+static const int bbr_high_gain  = BBR_UNIT * 2885 / 1000 + 1;
+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+ * the queue created in BBR_STARTUP in a single round:
+ */
+static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */
+static const int bbr_cwnd_gain  = BBR_UNIT * 2;
+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
+static const int bbr_pacing_gain[] = {
+	BBR_UNIT * 5 / 4,	/* probe for more available bw */
+	BBR_UNIT * 3 / 4,	/* drain queue and/or yield bw to other flows */
+	BBR_UNIT, BBR_UNIT, BBR_UNIT,	/* cruise at 1.0*bw to utilize pipe, */
+	BBR_UNIT, BBR_UNIT, BBR_UNIT	/* without creating excess queue... */
+};
+/* Randomize the starting gain cycling phase over N phases: */
+static const u32 bbr_cycle_rand = 7;
+
+/* Try to keep at least this many packets in flight, if things go smoothly. For
+ * smooth functioning, a sliding window protocol ACKing every other packet
+ * needs at least 4 packets in flight:
+ */
+static const u32 bbr_cwnd_min_target = 4;
+
+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
+/* If bw has increased significantly (1.25x), there may be more bw available: */
+static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
+/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
+static const u32 bbr_full_bw_cnt = 3;
+
+/* "long-term" ("LT") bandwidth estimator parameters... */
+/* The minimum number of rounds in an LT bw sampling interval: */
+static const u32 bbr_lt_intvl_min_rtts = 4;
+/* If lost/delivered >= 50/256 (~20%), interval is "lossy"; may be policed: */
+static const u32 bbr_lt_loss_thresh = 50;
+/* If 2 intervals' bw differ by <= 1/8 of the prior bw, bw is "consistent": */
+static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
+/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
+static const u32 bbr_lt_bw_diff = 4000 / 8;
+/* If we estimate we're policed, use lt_bw for this many round trips: */
+static const u32 bbr_lt_bw_max_rtts = 48;
+
+/* Gain factor for adding extra_acked to target cwnd: */
+static const int bbr_extra_acked_gain = BBR_UNIT;
+/* Length of the extra_acked window (in round trips): */
+static const u32 bbr_extra_acked_win_rtts = 5;
+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
+static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
+/* Time period (in usec) for clamping cwnd increment due to ack aggregation */
+static const u32 bbr_extra_acked_max_us = 100 * 1000;
+
+static void bbr_check_probe_rtt_done(struct sock *sk);
+
+/* Do we estimate that STARTUP filled the pipe? */
+static bool bbr_full_bw_reached(const struct sock *sk)
+{
+	const struct bbr *bbr = inet_csk_ca(sk);
+
+	return bbr->full_bw_reached;
+}
+
+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
+static u32 bbr_max_bw(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return minmax_get(&bbr->bw);
+}
+
+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
+static u32 bbr_bw(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
+}
+
+/* Return maximum extra acked in past k-2k round trips,
+ * where k = bbr_extra_acked_win_rtts.
+ */
+static u16 bbr_extra_acked(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return max(bbr->extra_acked[0], bbr->extra_acked[1]);
+}
+
+/* Return rate in bytes per second, optionally with a gain.
+ * The order here is chosen carefully to avoid overflow of u64. This should
+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
+ */
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
+{
+	unsigned int mss = tcp_sk(sk)->mss_cache;
+
+	rate *= mss;
+	rate *= gain;
+	rate >>= BBR_SCALE;
+	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
+	return rate >> BW_SCALE;
+}
+
+/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+	u64 rate = bw;
+
+	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
+	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+	return rate;
+}
+
+/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 bw;
+	u32 rtt_us;
+
+	if (tp->srtt_us) {		/* any RTT sample yet? */
+		rtt_us = max(tp->srtt_us >> 3, 1U);
+		bbr->has_seen_rtt = 1;
+	} else {			 /* no RTT sample yet */
+		rtt_us = USEC_PER_MSEC;	 /* use nominal default RTT */
+	}
+	bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
+	do_div(bw, rtt_us);
+	sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
+}
+
+/* Pace using current bw estimate and a gain factor. */
+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
+
+	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
+		bbr_init_pacing_rate_from_rtt(sk);
+	if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
+		sk->sk_pacing_rate = rate;
+}
+
+/* override sysctl_tcp_min_tso_segs */
+__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
+{
+	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+}
+
+static u32 bbr_tso_segs_goal(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 segs, bytes;
+
+	/* Sort of tcp_tso_autosize() but ignoring
+	 * driver provided sk_gso_max_size.
+	 */
+	bytes = min_t(unsigned long,
+		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
+		      GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
+	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
+
+	return min(segs, 0x7FU);
+}
+
+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
+static void bbr_save_cwnd(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
+		bbr->prior_cwnd = tcp_snd_cwnd(tp);  /* this cwnd is good enough */
+	else  /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
+		bbr->prior_cwnd = max(bbr->prior_cwnd, tcp_snd_cwnd(tp));
+}
+
+__bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (event == CA_EVENT_TX_START && tp->app_limited) {
+		bbr->idle_restart = 1;
+		bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+		bbr->ack_epoch_acked = 0;
+		/* Avoid pointless buffer overflows: pace at est. bw if we don't
+		 * need more speed (we're restarting from idle and app-limited).
+		 */
+		if (bbr->mode == BBR_PROBE_BW)
+			bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
+		else if (bbr->mode == BBR_PROBE_RTT)
+			bbr_check_probe_rtt_done(sk);
+	}
+}
+
+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
+ *
+ * bdp = ceil(bw * min_rtt * gain)
+ *
+ * The key factor, gain, controls the amount of queue. While a small gain
+ * builds a smaller queue, it becomes more vulnerable to noise in RTT
+ * measurements (e.g., delayed ACKs or other ACK compression effects). This
+ * noise may cause BBR to under-estimate the rate.
+ */
+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 bdp;
+	u64 w;
+
+	/* If we've never had a valid RTT sample, cap cwnd at the initial
+	 * default. This should only happen when the connection is not using TCP
+	 * timestamps and has retransmitted all of the SYN/SYNACK/data packets
+	 * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
+	 * case we need to slow-start up toward something safe: TCP_INIT_CWND.
+	 */
+	if (unlikely(bbr->min_rtt_us == ~0U))	 /* no valid RTT samples yet? */
+		return TCP_INIT_CWND;  /* be safe: cap at default initial cwnd */
+
+	w = (u64)bw * bbr->min_rtt_us;
+
+	/* Apply a gain to the given value, remove the BW_SCALE shift, and
+	 * round the value up to avoid a negative feedback loop.
+	 */
+	bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+	return bdp;
+}
+
+/* To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ *   - one skb in sending host Qdisc,
+ *   - one skb in sending host TSO/GSO engine
+ *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	/* Allow enough full-sized skbs in flight to utilize end systems. */
+	cwnd += 3 * bbr_tso_segs_goal(sk);
+
+	/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
+	cwnd = (cwnd + 1) & ~1U;
+
+	/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
+	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
+		cwnd += 2;
+
+	return cwnd;
+}
+
+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */
+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
+{
+	u32 inflight;
+
+	inflight = bbr_bdp(sk, bw, gain);
+	inflight = bbr_quantization_budget(sk, inflight);
+
+	return inflight;
+}
+
+/* With pacing at lower layers, there's often less data "in the network" than
+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
+ * we often have several skbs queued in the pacing layer with a pre-scheduled
+ * earliest departure time (EDT). BBR adapts its pacing rate based on the
+ * inflight level that it estimates has already been "baked in" by previous
+ * departure time decisions. We calculate a rough estimate of the number of our
+ * packets that might be in the network at the earliest departure time for the
+ * next skb scheduled:
+ *   in_network_at_edt = inflight_at_edt - (EDT - now) * bw
+ * If we're increasing inflight, then we want to know if the transmit of the
+ * EDT skb will push inflight above the target, so inflight_at_edt includes
+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
+ * then estimate if inflight will sink too low just before the EDT transmit.
+ */
+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 now_ns, edt_ns, interval_us;
+	u32 interval_delivered, inflight_at_edt;
+
+	now_ns = tp->tcp_clock_cache;
+	edt_ns = max(tp->tcp_wstamp_ns, now_ns);
+	interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
+	interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
+	inflight_at_edt = inflight_now;
+	if (bbr->pacing_gain > BBR_UNIT)              /* increasing inflight */
+		inflight_at_edt += bbr_tso_segs_goal(sk);  /* include EDT skb */
+	if (interval_delivered >= inflight_at_edt)
+		return 0;
+	return inflight_at_edt - interval_delivered;
+}
+
+/* Find the cwnd increment based on estimate of ack aggregation */
+static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
+{
+	u32 max_aggr_cwnd, aggr_cwnd = 0;
+
+	if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
+		max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
+				/ BW_UNIT;
+		aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
+			     >> BBR_SCALE;
+		aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
+	}
+
+	return aggr_cwnd;
+}
+
+/* An optimization in BBR to reduce losses: On the first round of recovery, we
+ * follow the packet conservation principle: send P packets per P packets acked.
+ * After that, we slow-start and send at most 2*P packets per P packets acked.
+ * After recovery finishes, or upon undo, we restore the cwnd we had when
+ * recovery started (capped by the target cwnd based on estimated BDP).
+ *
+ * TODO(ycheng/ncardwell): implement a rate-based approach.
+ */
+static bool bbr_set_cwnd_to_recover_or_restore(
+	struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
+	u32 cwnd = tcp_snd_cwnd(tp);
+
+	/* An ACK for P pkts should release at most 2*P packets. We do this
+	 * in two steps. First, here we deduct the number of lost packets.
+	 * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
+	 */
+	if (rs->losses > 0)
+		cwnd = max_t(s32, cwnd - rs->losses, 1);
+
+	if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
+		/* Starting 1st round of Recovery, so do packet conservation. */
+		bbr->packet_conservation = 1;
+		bbr->next_rtt_delivered = tp->delivered;  /* start round now */
+		/* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
+		cwnd = tcp_packets_in_flight(tp) + acked;
+	} else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
+		/* Exiting loss recovery; restore cwnd saved before recovery. */
+		cwnd = max(cwnd, bbr->prior_cwnd);
+		bbr->packet_conservation = 0;
+	}
+	bbr->prev_ca_state = state;
+
+	if (bbr->packet_conservation) {
+		*new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
+		return true;	/* yes, using packet conservation */
+	}
+	*new_cwnd = cwnd;
+	return false;
+}
+
+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
+ * has drawn us down below target), or snap down to target if we're above it.
+ */
+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
+			 u32 acked, u32 bw, int gain)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0;
+
+	if (!acked)
+		goto done;  /* no packet fully ACKed; just apply caps */
+
+	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
+		goto done;
+
+	target_cwnd = bbr_bdp(sk, bw, gain);
+
+	/* Increment the cwnd to account for excess ACKed data that seems
+	 * due to aggregation (of data and/or ACKs) visible in the ACK stream.
+	 */
+	target_cwnd += bbr_ack_aggregation_cwnd(sk);
+	target_cwnd = bbr_quantization_budget(sk, target_cwnd);
+
+	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
+	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
+		cwnd = min(cwnd + acked, target_cwnd);
+	else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
+		cwnd = cwnd + acked;
+	cwnd = max(cwnd, bbr_cwnd_min_target);
+
+done:
+	tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));	/* apply global cap */
+	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
+		tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target));
+}
+
+/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
+static bool bbr_is_next_cycle_phase(struct sock *sk,
+				    const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	bool is_full_length =
+		tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
+		bbr->min_rtt_us;
+	u32 inflight, bw;
+
+	/* The pacing_gain of 1.0 paces at the estimated bw to try to fully
+	 * use the pipe without increasing the queue.
+	 */
+	if (bbr->pacing_gain == BBR_UNIT)
+		return is_full_length;		/* just use wall clock time */
+
+	inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
+	bw = bbr_max_bw(sk);
+
+	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
+	 * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
+	 * small (e.g. on a LAN). We do not persist if packets are lost, since
+	 * a path with small buffers may not hold that much.
+	 */
+	if (bbr->pacing_gain > BBR_UNIT)
+		return is_full_length &&
+			(rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
+			 inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));
+
+	/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
+	 * probing didn't find more bw. If inflight falls to match BDP then we
+	 * estimate queue is drained; persisting would underutilize the pipe.
+	 */
+	return is_full_length ||
+		inflight <= bbr_inflight(sk, bw, BBR_UNIT);
+}
+
+static void bbr_advance_cycle_phase(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
+	bbr->cycle_mstamp = tp->delivered_mstamp;
+}
+
+/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
+static void bbr_update_cycle_phase(struct sock *sk,
+				   const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
+		bbr_advance_cycle_phase(sk);
+}
+
+static void bbr_reset_startup_mode(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->mode = BBR_STARTUP;
+}
+
+static void bbr_reset_probe_bw_mode(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->mode = BBR_PROBE_BW;
+	bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand);
+	bbr_advance_cycle_phase(sk);	/* flip to next phase of gain cycle */
+}
+
+static void bbr_reset_mode(struct sock *sk)
+{
+	if (!bbr_full_bw_reached(sk))
+		bbr_reset_startup_mode(sk);
+	else
+		bbr_reset_probe_bw_mode(sk);
+}
+
+/* Start a new long-term sampling interval. */
+static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC);
+	bbr->lt_last_delivered = tp->delivered;
+	bbr->lt_last_lost = tp->lost;
+	bbr->lt_rtt_cnt = 0;
+}
+
+/* Completely reset long-term bandwidth sampling. */
+static void bbr_reset_lt_bw_sampling(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->lt_bw = 0;
+	bbr->lt_use_bw = 0;
+	bbr->lt_is_sampling = false;
+	bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Long-term bw sampling interval is done. Estimate whether we're policed. */
+static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 diff;
+
+	if (bbr->lt_bw) {  /* do we have bw from a previous interval? */
+		/* Is new bw close to the lt_bw from the previous interval? */
+		diff = abs(bw - bbr->lt_bw);
+		if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
+		    (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
+		     bbr_lt_bw_diff)) {
+			/* All criteria are met; estimate we're policed. */
+			bbr->lt_bw = (bw + bbr->lt_bw) >> 1;  /* avg 2 intvls */
+			bbr->lt_use_bw = 1;
+			bbr->pacing_gain = BBR_UNIT;  /* try to avoid drops */
+			bbr->lt_rtt_cnt = 0;
+			return;
+		}
+	}
+	bbr->lt_bw = bw;
+	bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
+ * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
+ * explicitly models their policed rate, to reduce unnecessary losses. We
+ * estimate that we're policed if we see 2 consecutive sampling intervals with
+ * consistent throughput and high packet loss. If we think we're being policed,
+ * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
+ */
+static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 lost, delivered;	/* packet counts over this sampling interval */
+	u64 bw;			/* interval delivery rate, scaled by BW_UNIT */
+	u32 t;			/* interval length: first in ms, then in us */
+
+	if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */
+		if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
+		    ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
+			bbr_reset_lt_bw_sampling(sk);    /* stop using lt_bw */
+			bbr_reset_probe_bw_mode(sk);  /* restart gain cycling */
+		}
+		return;
+	}
+
+	/* Wait for the first loss before sampling, to let the policer exhaust
+	 * its tokens and estimate the steady-state rate allowed by the policer.
+	 * Starting samples earlier includes bursts that over-estimate the bw.
+	 */
+	if (!bbr->lt_is_sampling) {
+		if (!rs->losses)
+			return;
+		bbr_reset_lt_bw_sampling_interval(sk);
+		bbr->lt_is_sampling = true;
+	}
+
+	/* To avoid underestimates, reset sampling if we run out of data. */
+	if (rs->is_app_limited) {
+		bbr_reset_lt_bw_sampling(sk);
+		return;
+	}
+
+	if (bbr->round_start)
+		bbr->lt_rtt_cnt++;	/* count round trips in this interval */
+	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
+		return;		/* sampling interval needs to be longer */
+	if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
+		bbr_reset_lt_bw_sampling(sk);  /* interval is too long */
+		return;
+	}
+
+	/* End sampling interval when a packet is lost, so we estimate the
+	 * policer tokens were exhausted. Stopping the sampling before the
+	 * tokens are exhausted under-estimates the policed rate.
+	 */
+	if (!rs->losses)
+		return;
+
+	/* Calculate packets lost and delivered in sampling interval. */
+	lost = tp->lost - bbr->lt_last_lost;
+	delivered = tp->delivered - bbr->lt_last_delivered;
+	/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
+	if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
+		return;
+
+	/* Find average delivery rate in this sampling interval. */
+	t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;
+	if ((s32)t < 1)
+		return;		/* interval is less than one ms, so wait */
+	/* Check if can multiply without overflow */
+	if (t >= ~0U / USEC_PER_MSEC) {
+		bbr_reset_lt_bw_sampling(sk);  /* interval too long; reset */
+		return;
+	}
+	t *= USEC_PER_MSEC;	/* ms -> us */
+	bw = (u64)delivered * BW_UNIT;	/* scale up before dividing by time */
+	do_div(bw, t);
+	bbr_lt_bw_interval_done(sk, bw);  /* compare with previous interval */
+}
+
+/* Fold the newest delivery rate sample into the windowed max bw estimate. */
+static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
+{
+	const struct tcp_sock *tcp = tcp_sk(sk);
+	struct bbr *ca = inet_csk_ca(sk);
+	u64 sample;
+
+	ca->round_start = 0;
+	if (rs->delivered < 0 || rs->interval_us <= 0)
+		return;		/* sample carries no valid observation */
+
+	/* Has delivery progressed past the mark that ends this round? If so,
+	 * open the next round at the current delivered count.
+	 */
+	if (!before(rs->prior_delivered, ca->next_rtt_delivered)) {
+		ca->packet_conservation = 0;
+		ca->round_start = 1;
+		ca->rtt_cnt++;
+		ca->next_rtt_delivered = tcp->delivered;
+	}
+
+	/* Long-term (policer) sampling reads the round_start set above. */
+	bbr_lt_bw_sampling(sk, rs);
+
+	/* delivered counts packets and interval_us is in usec, so the plain
+	 * ratio would be far below 1 for most connections; scale delivered by
+	 * BW_UNIT first. The result is a lower bound on bottleneck bandwidth.
+	 */
+	sample = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us);
+
+	/* An app-limited sample usually reflects how little the application
+	 * sent rather than what the network could carry, so letting it into
+	 * the filter could drag the estimate down and slow us needlessly. Skip
+	 * such a sample unless it is at least as high as the current max bw.
+	 * That way we keep sending at the last measured network rate for as
+	 * long as the app-limited phase lasts, and leave the phase by itself
+	 * once the app writes faster than the network can deliver.
+	 */
+	if (rs->is_app_limited && sample < bbr_max_bw(sk))
+		return;
+
+	minmax_running_max(&ca->bw, bbr_bw_rtts, ca->rtt_cnt, sample);
+}
+
+/* Estimates the windowed max degree of ack aggregation.
+ * This is used to provision extra in-flight data to keep sending during
+ * inter-ACK silences.
+ *
+ * Degree of ack aggregation is estimated as extra data acked beyond expected.
+ *
+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
+ * cwnd += max_extra_acked
+ *
+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
+ * Max filter is an approximate sliding window of 5-10 (packet timed) round
+ * trips.
+ */
+static void bbr_update_ack_aggregation(struct sock *sk,
+				       const struct rate_sample *rs)
+{
+	u32 epoch_us, expected_acked, extra_acked;
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 ||
+	    rs->delivered < 0 || rs->interval_us <= 0)
+		return;		/* feature off, or nothing usable in rs */
+
+	if (bbr->round_start) {
+		bbr->extra_acked_win_rtts = min(0x1F,
+						bbr->extra_acked_win_rtts + 1);
+		if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
+			bbr->extra_acked_win_rtts = 0;
+			bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
+						   0 : 1;  /* flip to other slot */
+			bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
+		}
+	}
+
+	/* Compute how many packets we expected to be delivered over epoch. */
+	epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
+				      bbr->ack_epoch_mstamp);
+	expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;
+
+	/* Reset the aggregation epoch if ACK rate is below expected rate or
+	 * significantly large no. of ack received since epoch (potentially
+	 * quite old epoch).
+	 */
+	if (bbr->ack_epoch_acked <= expected_acked ||
+	    (bbr->ack_epoch_acked + rs->acked_sacked >=
+	     bbr_ack_epoch_acked_reset_thresh)) {
+		bbr->ack_epoch_acked = 0;
+		bbr->ack_epoch_mstamp = tp->delivered_mstamp;
+		expected_acked = 0;	/* new epoch starts now */
+	}
+
+	/* Compute excess data delivered, beyond what was expected. */
+	bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
+				     bbr->ack_epoch_acked + rs->acked_sacked);
+	extra_acked = bbr->ack_epoch_acked - expected_acked;
+	extra_acked = min(extra_acked, tcp_snd_cwnd(tp));  /* clamp by cwnd */
+	if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
+		bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
+}
+
+/* Detect a full pipe from the trend in delivery rate. On each round start that
+ * is not app-limited, compare the max bw against the best seen so far
+ * (full_bw): growth of at least bbr_full_bw_thresh (25%) makes it the new
+ * baseline; otherwise count a flat round, and after bbr_full_bw_cnt (3) of
+ * them estimate that STARTUP has filled the pipe. Three rounds allow for rwin
+ * autotuning, for filling the larger rwin, and for faster samples to arrive.
+ */
+static void bbr_check_full_bw_reached(struct sock *sk,
+				      const struct rate_sample *rs)
+{
+	struct bbr *ca = inet_csk_ca(sk);
+	u32 max_bw, growth_target;
+
+	if (bbr_full_bw_reached(sk) || !ca->round_start || rs->is_app_limited)
+		return;
+
+	max_bw = bbr_max_bw(sk);
+	growth_target = (u64)ca->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
+	if (max_bw < growth_target) {	/* bw did not grow enough */
+		ca->full_bw_cnt++;
+		ca->full_bw_reached = ca->full_bw_cnt >= bbr_full_bw_cnt;
+	} else {			/* still growing: new baseline */
+		ca->full_bw = max_bw;
+		ca->full_bw_cnt = 0;
+	}
+}
+
+/* Once the pipe looks full, drain the queue STARTUP built, then cruise. */
+static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
+{
+	struct tcp_sock *tcp = tcp_sk(sk);
+	struct bbr *ca = inet_csk_ca(sk);
+
+	if (ca->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
+		ca->mode = BBR_DRAIN;	/* drain queue we created */
+		tcp->snd_ssthresh = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+	}	/* no else: in-flight may already be small enough to move on */
+	if (ca->mode == BBR_DRAIN &&
+	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp)) <=
+	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
+		bbr_reset_probe_bw_mode(sk);	/* queue estimated as drained */
+}
+
+static void bbr_check_probe_rtt_done(struct sock *sk)
+{
+	struct tcp_sock *tcp = tcp_sk(sk);
+	struct bbr *ca = inet_csk_ca(sk);
+
+	if (!ca->probe_rtt_done_stamp ||
+	    !after(tcp_jiffies32, ca->probe_rtt_done_stamp))
+		return;		/* done stamp not armed, or not passed yet */
+
+	ca->min_rtt_stamp = tcp_jiffies32;  /* wait a while until PROBE_RTT */
+	tcp_snd_cwnd_set(tcp, max(tcp_snd_cwnd(tcp), ca->prior_cwnd));
+	bbr_reset_mode(sk);
+}
+
+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
+ * periodically drain the bottleneck queue, to converge to measure the true
+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues
+ * small (reducing queuing delay and packet loss) and achieve fairness among
+ * BBR flows.
+ *
+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
+ * re-enter the previous mode. BBR uses 200ms to approximately bound the
+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
+ *
+ * Note that flows need only pay 2% if they are busy sending over the last 10
+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
+ * natural silences or low-rate periods within 10 seconds where the rate is low
+ * enough for long enough to drain its queue in the bottleneck. We pick up
+ * these min RTT measurements opportunistically with our min_rtt filter. :-)
+ */
+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	bool filter_expired;	/* min_rtt_stamp older than the filter window */
+
+	/* Track min RTT seen in the min_rtt_win_sec filter window: */
+	filter_expired = after(tcp_jiffies32,
+			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
+	if (rs->rtt_us >= 0 &&
+	    (rs->rtt_us < bbr->min_rtt_us ||
+	     (filter_expired && !rs->is_ack_delayed))) {
+		bbr->min_rtt_us = rs->rtt_us;
+		bbr->min_rtt_stamp = tcp_jiffies32;
+	}
+
+	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
+	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
+		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
+		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
+		bbr->probe_rtt_done_stamp = 0;	/* armed once flight is small */
+	}
+
+	if (bbr->mode == BBR_PROBE_RTT) {
+		/* Ignore low rate samples during this mode. */
+		tp->app_limited =
+			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+		/* Maintain min packets in flight for max(200 ms, 1 round). */
+		if (!bbr->probe_rtt_done_stamp &&
+		    tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
+			bbr->probe_rtt_done_stamp = tcp_jiffies32 +
+				msecs_to_jiffies(bbr_probe_rtt_mode_ms);
+			bbr->probe_rtt_round_done = 0;
+			bbr->next_rtt_delivered = tp->delivered;  /* new round */
+		} else if (bbr->probe_rtt_done_stamp) {
+			if (bbr->round_start)
+				bbr->probe_rtt_round_done = 1;
+			if (bbr->probe_rtt_round_done)
+				bbr_check_probe_rtt_done(sk);
+		}
+	}
+	/* Restart after idle ends only once we process a new S/ACK for data */
+	if (rs->delivered > 0)
+		bbr->idle_restart = 0;
+}
+
+static void bbr_update_gains(struct sock *sk)
+{
+	int pacing = BBR_UNIT, cwnd = BBR_UNIT;	/* unity unless a mode says so */
+	struct bbr *ca = inet_csk_ca(sk);
+
+	switch (ca->mode) {
+	case BBR_STARTUP:
+		pacing = bbr_high_gain;
+		cwnd = bbr_high_gain;
+		break;
+	case BBR_DRAIN:
+		pacing = bbr_drain_gain;	/* slow, to drain */
+		cwnd = bbr_high_gain;		/* keep cwnd */
+		break;
+	case BBR_PROBE_BW:
+		if (!ca->lt_use_bw)	/* lt_bw in use pins pacing at unity */
+			pacing = bbr_pacing_gain[ca->cycle_idx];
+		cwnd = bbr_cwnd_gain;
+		break;
+	case BBR_PROBE_RTT:
+		break;
+	default:
+		WARN_ONCE(1, "BBR bad mode: %u\n", ca->mode);
+		return;		/* leave both gains untouched */
+	}
+	ca->pacing_gain = pacing;
+	ca->cwnd_gain = cwnd;
+}
+
+static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
+{
+	bbr_update_bw(sk, rs);			/* also sets round_start */
+	bbr_update_ack_aggregation(sk, rs);
+	bbr_update_cycle_phase(sk, rs);
+	bbr_check_full_bw_reached(sk, rs);
+	bbr_check_drain(sk, rs);		/* may change mode */
+	bbr_update_min_rtt(sk, rs);		/* may change mode */
+	bbr_update_gains(sk);			/* gains for the resulting mode */
+}
+
+__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+{
+	struct bbr *ca = inet_csk_ca(sk);
+	u32 est_bw;
+
+	bbr_update_model(sk, rs);
+	est_bw = bbr_bw(sk);
+	/* Apply the updated model to the pacing rate and to the cwnd. */
+	bbr_set_pacing_rate(sk, est_bw, ca->pacing_gain);
+	bbr_set_cwnd(sk, rs, rs->acked_sacked, est_bw, ca->cwnd_gain);
+}
+
+__bpf_kfunc static void bbr_init(struct sock *sk)
+{
+	struct bbr *ca = inet_csk_ca(sk);
+	struct tcp_sock *tcp = tcp_sk(sk);
+
+	tcp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	ca->prior_cwnd = 0;
+	ca->prev_ca_state = TCP_CA_Open;
+	ca->packet_conservation = 0;
+	ca->rtt_cnt = 0;	/* round counting starts at the current mark */
+	ca->next_rtt_delivered = tcp->delivered;
+
+	ca->min_rtt_us = tcp_min_rtt(tcp);	/* seed the min_rtt filter */
+	ca->min_rtt_stamp = tcp_jiffies32;
+	ca->probe_rtt_done_stamp = 0;
+	ca->probe_rtt_round_done = 0;
+
+	minmax_reset(&ca->bw, ca->rtt_cnt, 0);  /* init max bw to 0 */
+
+	ca->has_seen_rtt = 0;
+	bbr_init_pacing_rate_from_rtt(sk);
+
+	ca->round_start = 0;
+	ca->idle_restart = 0;
+	ca->full_bw_reached = 0;	/* full pipe detection starts over */
+	ca->full_bw = 0;
+	ca->full_bw_cnt = 0;
+	ca->cycle_mstamp = 0;
+	ca->cycle_idx = 0;
+	bbr_reset_lt_bw_sampling(sk);
+	bbr_reset_startup_mode(sk);
+
+	ca->extra_acked[0] = 0;		/* ack aggregation: both window slots */
+	ca->extra_acked[1] = 0;
+	ca->extra_acked_win_idx = 0;
+	ca->extra_acked_win_rtts = 0;
+	ca->ack_epoch_acked = 0;
+	ca->ack_epoch_mstamp = tcp->tcp_mstamp;
+
+	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
+}
+
+__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk)
+{
+	/* Factor 3: provision 3 * cwnd, as BBR may slow-start even in recovery. */
+	return 3;
+}
+
+/* In theory there is nothing to undo, because BBR does not always cut cwnd
+ * on losses (see bbr_main()): cwnd is returned as is. Kept for now.
+ */
+__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk)
+{
+	struct bbr *ca = inet_csk_ca(sk);
+
+	bbr_reset_lt_bw_sampling(sk);
+	ca->full_bw_cnt = 0;	/* spurious slow-down: restart full pipe */
+	ca->full_bw = 0;	/* detection from scratch */
+	return tcp_snd_cwnd(tcp_sk(sk));
+}
+
+/* Entering loss recovery: save cwnd for exit/undo; ssthresh is returned as is. */
+__bpf_kfunc static u32 bbr_ssthresh(struct sock *sk)
+{
+	bbr_save_cwnd(sk);
+	return tcp_sk(sk)->snd_ssthresh;
+}
+
+static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
+			   union tcp_cc_info *info)
+{
+	const struct tcp_sock *tcp = tcp_sk(sk);
+	const struct bbr *ca = inet_csk_ca(sk);
+	u64 rate;
+
+	if (!(ext & (1 << (INET_DIAG_BBRINFO - 1))) &&
+	    !(ext & (1 << (INET_DIAG_VEGASINFO - 1))))
+		return 0;	/* neither BBR nor Vegas info was asked for */
+
+	rate = (u64)bbr_bw(sk) * tcp->mss_cache * USEC_PER_SEC >> BW_SCALE;
+	memset(&info->bbr, 0, sizeof(info->bbr));
+	info->bbr.bbr_bw_lo = (u32)rate;		/* low 32 bits */
+	info->bbr.bbr_bw_hi = (u32)(rate >> 32);	/* high 32 bits */
+	info->bbr.bbr_min_rtt = ca->min_rtt_us;
+	info->bbr.bbr_pacing_gain = ca->pacing_gain;
+	info->bbr.bbr_cwnd_gain = ca->cwnd_gain;
+	*attr = INET_DIAG_BBRINFO;
+	return sizeof(info->bbr);
+}
+
+__bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state)
+{
+	struct rate_sample loss_rs = { .losses = 1 };
+	struct bbr *ca = inet_csk_ca(sk);
+
+	if (new_state != TCP_CA_Loss)
+		return;		/* only entering Loss is handled here */
+
+	ca->prev_ca_state = TCP_CA_Loss;
+	ca->full_bw = 0;
+	ca->round_start = 1;	/* treat RTO like end of a round */
+	bbr_lt_bw_sampling(sk, &loss_rs);	/* sample that reports a loss */
+}
+
+static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
+	.flags		= TCP_CONG_NON_RESTRICTED,
+	.name		= "bbr",
+	.owner		= THIS_MODULE,
+	.init		= bbr_init,
+	.cong_control	= bbr_main,		/* updates model, rate and cwnd */
+	.sndbuf_expand	= bbr_sndbuf_expand,	/* constant factor of 3 */
+	.undo_cwnd	= bbr_undo_cwnd,	/* cwnd kept, estimators reset */
+	.cwnd_event	= bbr_cwnd_event,
+	.ssthresh	= bbr_ssthresh,		/* saves cwnd, ssthresh kept */
+	.min_tso_segs	= bbr_min_tso_segs,
+	.get_info	= bbr_get_info,		/* fills INET_DIAG_BBRINFO */
+	.set_state	= bbr_set_state,	/* acts on TCP_CA_Loss only */
+};
+
+BTF_SET8_START(tcp_bbr_check_kfunc_ids)	/* ids need both options below */
+#ifdef CONFIG_X86
+#ifdef CONFIG_DYNAMIC_FTRACE
+BTF_ID_FLAGS(func, bbr_init)
+BTF_ID_FLAGS(func, bbr_main)
+BTF_ID_FLAGS(func, bbr_sndbuf_expand)
+BTF_ID_FLAGS(func, bbr_undo_cwnd)
+BTF_ID_FLAGS(func, bbr_cwnd_event)
+BTF_ID_FLAGS(func, bbr_ssthresh)
+BTF_ID_FLAGS(func, bbr_min_tso_segs)
+BTF_ID_FLAGS(func, bbr_set_state)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_X86 */
+BTF_SET8_END(tcp_bbr_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &tcp_bbr_check_kfunc_ids,	/* the id set built above */
+};
+
+static int __init bbr_register(void)
+{
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);  /* must fit */
+	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+					&tcp_bbr_kfunc_set);
+	if (err >= 0)	/* kfunc ids are in place: now register the CA ops */
+		err = tcp_register_congestion_control(&tcp_bbr_cong_ops);
+	return err;
+}
+
+static void __exit bbr_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_bbr_cong_ops);	/* CA ops only */
+}
+
+module_init(bbr_register);	/* kfunc id set first, then the CA ops */
+module_exit(bbr_unregister);
+
+MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 0000000000..58358bf92e
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Binary Increase Congestion control for TCP
+ * Home page:
+ *      http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
+ * This is from the implementation of BICTCP in
+ * Lisong Xu, Khaled Harfoush, and Injong Rhee.
+ *  "Binary Increase Congestion Control for Fast, Long Distance
+ *  Networks" in IEEE INFOCOM 2004
+ * Available from:
+ *  http://netsrv.csc.ncsu.edu/export/bitcp.pdf
+ *
+ * Unless BIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define BICTCP_BETA_SCALE    1024	/* Scale factor for beta calculation:
+					 * max_cwnd = snd_cwnd * beta
+					 */
+#define BICTCP_B		4	 /*
+					  * In binary search,
+					  * go to point (max+min)/N, N = BICTCP_B
+					  */
+
+static int fast_convergence = 1;	/* nonzero: lower Wmax if cwnd < Wmax */
+static int max_increment = 16;	/* divisor of cwnd for linear increase */
+static int low_window = 14;	/* at or below this cwnd, act like Reno */
+static int beta = 819;		/* = 819/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh;	/* 0: snd_ssthresh is left alone at init */
+static int smooth_part = 20;	/* factor in cnt when close to Wmax */
+
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(max_increment, int, 0644);
+MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
+module_param(low_window, int, 0644);
+MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(smooth_part, int, 0644);
+MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
+
+/* BIC TCP per-connection state */
+struct bictcp {
+	u32	cnt;		/* increase cwnd by 1 after this many ACKs */
+	u32	last_max_cwnd;	/* last maximum snd_cwnd (Wmax); 0 if none */
+	u32	last_cwnd;	/* the last snd_cwnd seen by bictcp_update() */
+	u32	last_time;	/* tcp_jiffies32 when last_cwnd was updated */
+	u32	epoch_start;	/* beginning of an epoch; 0 if not started */
+#define ACK_RATIO_SHIFT	4
+	u32	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */
+};
+
+static inline void bictcp_reset(struct bictcp *ca)
+{
+	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;	/* start at 2 packets per ACK */
+	ca->epoch_start = 0;
+	ca->last_time = 0;
+	ca->last_cwnd = 0;
+	ca->last_max_cwnd = 0;
+	ca->cnt = 0;
+}
+
+static void bictcp_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	bictcp_reset(inet_csk_ca(sk));
+
+	if (initial_ssthresh != 0)	/* 0 leaves snd_ssthresh untouched */
+		tp->snd_ssthresh = initial_ssthresh;
+}
+
+/*
+ * Compute ca->cnt, the number of ACKs per cwnd increment, for this cwnd.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+{
+	if (ca->last_cwnd == cwnd &&
+	    (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
+		return;		/* same cwnd, computed recently: keep cnt */
+
+	ca->last_cwnd = cwnd;
+	ca->last_time = tcp_jiffies32;
+
+	if (ca->epoch_start == 0) /* record the beginning of an epoch */
+		ca->epoch_start = tcp_jiffies32;
+
+	/* start off normal */
+	if (cwnd <= low_window) {
+		ca->cnt = cwnd;
+		return;
+	}
+
+	/* binary increase */
+	if (cwnd < ca->last_max_cwnd) {
+		__u32	dist = (ca->last_max_cwnd - cwnd)
+			/ BICTCP_B;
+
+		if (dist > max_increment)
+			/* linear increase */
+			ca->cnt = cwnd / max_increment;
+		else if (dist <= 1U)
+			/* binary search increase */
+			ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+		else
+			/* binary search increase */
+			ca->cnt = cwnd / dist;
+	} else {
+		/* slow start and linear increase */
+		if (cwnd < ca->last_max_cwnd + BICTCP_B)
+			/* slow start */
+			ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+		else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
+			/* slow start */
+			ca->cnt = (cwnd * (BICTCP_B-1))
+				/ (cwnd - ca->last_max_cwnd);
+		else
+			/* linear increase */
+			ca->cnt = cwnd / max_increment;
+	}
+
+	/* if in slow start or link utilization is very low */
+	if (ca->last_max_cwnd == 0) {
+		if (ca->cnt > 20) /* increase cwnd 5% per RTT */
+			ca->cnt = 20;
+	}
+
+	ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
+	if (ca->cnt == 0)			/* cannot be zero */
+		ca->cnt = 1;
+}
+
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct bictcp *bic = inet_csk_ca(sk);
+	struct tcp_sock *tcp = tcp_sk(sk);
+	u32 left = acked;	/* ACKed packets still available for growth */
+
+	if (!tcp_is_cwnd_limited(sk))
+		return;		/* cwnd is not the limit: do not grow it */
+	if (tcp_in_slow_start(tcp)) {
+		left = tcp_slow_start(tcp, left);
+		if (left == 0)
+			return;	/* nothing left over after slow start */
+	}
+	bictcp_update(bic, tcp_snd_cwnd(tcp));
+	tcp_cong_avoid_ai(tcp, bic->cnt, left);
+}
+
+/*
+ *	Compute the new ssthresh: half of cwnd (as Reno) while cwnd is at or
+ *	below low_window, else cwnd * beta / BICTCP_BETA_SCALE; never below 2.
+ */
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
+{
+	u32 cwnd = tcp_snd_cwnd(tcp_sk(sk));
+	struct bictcp *ca = inet_csk_ca(sk);
+	u32 wmax = cwnd;
+
+	/* Fast convergence: cwnd is below the previous Wmax, so remember
+	 * cwnd scaled by (BICTCP_BETA_SCALE + beta) / (2 * BICTCP_BETA_SCALE).
+	 */
+	if (cwnd < ca->last_max_cwnd && fast_convergence)
+		wmax = (cwnd * (BICTCP_BETA_SCALE + beta)) /
+		       (2 * BICTCP_BETA_SCALE);
+	ca->last_max_cwnd = wmax;
+	ca->epoch_start = 0;	/* end of epoch */
+
+	if (cwnd <= low_window)
+		return max(cwnd >> 1U, 2U);
+	return max((cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+static void bictcp_state(struct sock *sk, u8 new_state)
+{
+	if (new_state == TCP_CA_Loss)	/* entering Loss: drop all BIC state */
+		bictcp_reset(inet_csk_ca(sk));
+}
+
+/* Keep a moving average of packets per ACK, stored << ACK_RATIO_SHIFT:
+ * ratio = (15*ratio + sample) / 16. Only updated in the Open state.
+ */
+static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
+{
+	struct bictcp *ca = inet_csk_ca(sk);
+	u32 decay;
+
+	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+		return;
+
+	decay = ca->delayed_ack >> ACK_RATIO_SHIFT;
+	ca->delayed_ack += sample->pkts_acked - decay;
+}
+
+static struct tcp_congestion_ops bictcp __read_mostly = {
+	.init		= bictcp_init,
+	.ssthresh	= bictcp_recalc_ssthresh,	/* also records Wmax */
+	.cong_avoid	= bictcp_cong_avoid,
+	.set_state	= bictcp_state,		/* resets state on TCP_CA_Loss */
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+	.pkts_acked     = bictcp_acked,		/* tracks packets per ACK */
+	.owner		= THIS_MODULE,
+	.name		= "bic",
+};
+
+static int __init bictcp_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); /* must fit */
+	return tcp_register_congestion_control(&bictcp);
+}
+
+static void __exit bictcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&bictcp);	/* undo bictcp_register() */
+}
+
+module_init(bictcp_register);
+module_exit(bictcp_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("BIC TCP");
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
new file mode 100644
index 0000000000..53b0d62fd2
--- /dev/null
+++ b/net/ipv4/tcp_bpf.c
@@ -0,0 +1,700 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/util_macros.h>
+
+#include <net/inet_common.h>
+#include <net/tls.h>
+
+void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp;
+
+	/* Nothing to consume: no data, or sk is not a TCP socket. */
+	if (!skb || !skb->len || !sk_is_tcp(sk))
+		return;
+	/* skbs for which skb_bpf_strparser() is set are left alone here. */
+	if (skb_bpf_strparser(skb))
+		return;
+
+	/* Advance copied_seq over the skb, then refresh receive accounting. */
+	tp = tcp_sk(sk);
+	WRITE_ONCE(tp->copied_seq, tp->copied_seq + skb->len);
+	tcp_rcv_space_adjust(sk);
+	__tcp_cleanup_rbuf(sk, skb->len);
+}
+
+static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
+			   struct sk_msg *msg, u32 apply_bytes, int flags)
+{	/* NOTE(review): flags is not referenced in this function */
+	bool apply = apply_bytes;	/* nonzero apply_bytes caps the move */
+	struct scatterlist *sge;
+	u32 size, copied = 0;
+	struct sk_msg *tmp;	/* receives the elements moved out of msg */
+	int i, ret = 0;
+
+	tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL);
+	if (unlikely(!tmp))
+		return -ENOMEM;
+
+	lock_sock(sk);
+	tmp->sg.start = msg->sg.start;
+	i = msg->sg.start;
+	do {
+		sge = sk_msg_elem(msg, i);
+		size = (apply && apply_bytes < sge->length) ?
+			apply_bytes : sge->length;
+		if (!sk_wmem_schedule(sk, size)) {
+			if (!copied)
+				ret = -ENOMEM;	/* error only if nothing moved */
+			break;
+		}
+
+		sk_mem_charge(sk, size);
+		sk_msg_xfer(tmp, msg, i, size);
+		copied += size;
+		if (sge->length)	/* msg still holds data in this element */
+			get_page(sk_msg_page(tmp, i));
+		sk_msg_iter_var_next(i);
+		tmp->sg.end = i;
+		if (apply) {
+			apply_bytes -= size;
+			if (!apply_bytes) {
+				if (sge->length)  /* not used up: stay on it */
+					sk_msg_iter_var_prev(i);
+				break;
+			}
+		}
+	} while (i != msg->sg.end);
+
+	if (!ret) {
+		msg->sg.start = i;	/* msg resumes where the move stopped */
+		sk_psock_queue_msg(psock, tmp);
+		sk_psock_data_ready(sk, psock);
+	} else {
+		sk_msg_free(sk, tmp);
+		kfree(tmp);
+	}
+
+	release_sock(sk);
+	return ret;
+}
+
+static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
+			int flags, bool uncharge)
+{
+	struct msghdr msghdr = {};
+	bool apply = apply_bytes;	/* nonzero apply_bytes caps the send */
+	struct scatterlist *sge;
+	struct page *page;
+	int size, ret = 0;
+	u32 off;
+
+	while (1) {	/* one pass per element, starting at msg->sg.start */
+		struct bio_vec bvec;
+		bool has_tx_ulp;
+
+		sge = sk_msg_elem(msg, msg->sg.start);
+		size = (apply && apply_bytes < sge->length) ?
+			apply_bytes : sge->length;
+		off  = sge->offset;
+		page = sg_page(sge);
+
+		tcp_rate_check_app_limited(sk);
+retry:
+		msghdr.msg_flags = flags | MSG_SPLICE_PAGES;
+		has_tx_ulp = tls_sw_has_ctx_tx(sk);
+		if (has_tx_ulp)
+			msghdr.msg_flags |= MSG_SENDPAGE_NOPOLICY;
+
+		if (size < sge->length && msg->sg.start != msg->sg.end)
+			msghdr.msg_flags |= MSG_MORE;
+
+		bvec_set_page(&bvec, page, size, off);
+		iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, size);
+		ret = tcp_sendmsg_locked(sk, &msghdr, size);
+		if (ret <= 0)
+			return ret;	/* error, or nothing was sent */
+
+		if (apply)
+			apply_bytes -= ret;
+		msg->sg.size -= ret;
+		sge->offset += ret;
+		sge->length -= ret;
+		if (uncharge)
+			sk_mem_uncharge(sk, ret);
+		if (ret != size) {	/* short send: retry the remainder */
+			size -= ret;
+			off  += ret;
+			goto retry;
+		}
+		if (!sge->length) {	/* element fully sent: release it */
+			put_page(page);
+			sk_msg_iter_next(msg, start);
+			sg_init_table(sge, 1);
+			if (msg->sg.start == msg->sg.end)
+				break;
+		}
+		if (apply && !apply_bytes)
+			break;
+	}
+
+	return 0;
+}
+
+static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
+			       u32 apply_bytes, int flags, bool uncharge)
+{
+	int rc;
+
+	lock_sock(sk);	/* tcp_bpf_push() sends with tcp_sendmsg_locked() */
+	rc = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
+	release_sock(sk);
+	return rc;
+}
+
+int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress,
+			  struct sk_msg *msg, u32 bytes, int flags)
+{
+	struct sk_psock *psock = sk_psock_get(sk);
+	int err = -EPIPE;	/* result when sk has no psock */
+
+	if (likely(psock)) {
+		/* Queue on sk's ingress side, or send out through sk. */
+		err = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) :
+				tcp_bpf_push_locked(sk, msg, bytes, flags, false);
+		sk_psock_put(sk, psock);
+	}
+	return err;
+}
+EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
+
+#ifdef CONFIG_BPF_SYSCALL
+static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
+			     long timeo)
+{
+	DEFINE_WAIT_FUNC(waiter, woken_wake_function);
+	int rc;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return 1;	/* receive side is shut down: do not sleep */
+	if (!timeo)
+		return 0;	/* no time allowed for waiting */
+
+	add_wait_queue(sk_sleep(sk), &waiter);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	rc = sk_wait_event(sk, &timeo,
+			   !list_empty(&psock->ingress_msg) ||
+			   !skb_queue_empty_lockless(&sk->sk_receive_queue),
+			   &waiter);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &waiter);
+	return rc;
+}
+
+/* Report whether the message returned by sk_psock_peek_msg() is one whose
+ * first element is empty and whose skb has the FIN flag set.
+ *
+ * The peeked message is dereferenced without a NULL check, so the caller
+ * must only ask while a message is queued.
+ */
+static bool is_next_msg_fin(struct sk_psock *psock)
+{
+	struct sk_msg *head = sk_psock_peek_msg(psock);
+	struct scatterlist *first = sk_msg_elem(head, head->sg.start);
+	struct sk_buff *skb;
+
+	if (first->length)
+		return false;	/* first element carries data */
+	skb = head->skb;
+	return skb && (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN);
+}
+
+static int tcp_bpf_recvmsg_parser(struct sock *sk,
+				  struct msghdr *msg,
+				  size_t len,
+				  int flags,
+				  int *addr_len)
+{
+	struct tcp_sock *tcp = tcp_sk(sk);
+	int peek = flags & MSG_PEEK;
+	u32 seq = tcp->copied_seq;	/* stored back at out: unless peeking */
+	struct sk_psock *psock;
+	int copied = 0;		/* bytes copied, or a negative errno */
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return inet_recv_error(sk, msg, len, addr_len);
+
+	if (!len)
+		return 0;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return tcp_recvmsg(sk, msg, len, flags, addr_len);
+
+	lock_sock(sk);
+
+	/* We may have received data on the sk_receive_queue pre-accept and
+	 * then we can not use read_skb in this context because we haven't
+	 * assigned a sk_socket yet so have no link to the ops. The work-around
+	 * is to check the sk_receive_queue and in these cases read skbs off
+	 * queue again. The read_skb hook is not running at this point because
+	 * of lock_sock so we avoid having multiple runners in read_skb.
+	 */
+	if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) {
+		tcp_data_ready(sk);
+		/* This handles the ENOMEM errors if we both receive data
+		 * pre accept and are already under memory pressure. At least
+		 * let user know to retry.
+		 */
+		if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) {
+			copied = -EAGAIN;
+			goto out;
+		}
+	}
+
+msg_bytes_ready:
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	/* The typical case for EFAULT is the socket was gracefully
+	 * shutdown with a FIN pkt. So check here the other case is
+	 * some error on copy_page_to_iter which would be unexpected.
+	 * On fin return correct return code to zero.
+	 */
+	if (copied == -EFAULT) {
+		bool is_fin = is_next_msg_fin(psock);
+
+		if (is_fin) {
+			copied = 0;
+			seq++;	/* advance copied_seq by one for the FIN */
+			goto out;
+		}
+	}
+	seq += copied;
+	if (!copied) {
+		long timeo;
+		int data;
+
+		if (sock_flag(sk, SOCK_DONE))
+			goto out;
+
+		if (sk->sk_err) {
+			copied = sock_error(sk);
+			goto out;
+		}
+
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			goto out;
+
+		if (sk->sk_state == TCP_CLOSE) {
+			copied = -ENOTCONN;
+			goto out;
+		}
+
+		timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+		if (!timeo) {
+			copied = -EAGAIN;
+			goto out;
+		}
+
+		if (signal_pending(current)) {
+			copied = sock_intr_errno(timeo);
+			goto out;
+		}
+
+		data = tcp_msg_wait_data(sk, psock, timeo);
+		if (data < 0) {
+			copied = data;
+			goto unlock;	/* bypasses the bookkeeping at out: */
+		}
+		if (data && !sk_psock_queue_empty(psock))
+			goto msg_bytes_ready;
+		copied = -EAGAIN;
+	}
+out:
+	if (!peek)
+		WRITE_ONCE(tcp->copied_seq, seq);
+	tcp_rcv_space_adjust(sk);
+	if (copied > 0)
+		__tcp_cleanup_rbuf(sk, copied);
+
+unlock:
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+	return copied;
+}
+
+static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+			   int flags, int *addr_len)
+{
+	struct sk_psock *psock;
+	int copied, ret;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return inet_recv_error(sk, msg, len, addr_len);
+
+	if (!len)
+		return 0;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))	/* no psock: plain TCP receive */
+		return tcp_recvmsg(sk, msg, len, flags, addr_len);
+	if (!skb_queue_empty(&sk->sk_receive_queue) &&
+	    sk_psock_queue_empty(psock)) {	/* data only on the TCP queue */
+		sk_psock_put(sk, psock);
+		return tcp_recvmsg(sk, msg, len, flags, addr_len);
+	}
+	lock_sock(sk);
+msg_bytes_ready:
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	if (!copied) {
+		long timeo;
+		int data;
+
+		timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+		data = tcp_msg_wait_data(sk, psock, timeo);
+		if (data < 0) {
+			ret = data;
+			goto unlock;
+		}
+		if (data) {
+			if (!sk_psock_queue_empty(psock))
+				goto msg_bytes_ready;
+			release_sock(sk);	/* psock queue is still empty: */
+			sk_psock_put(sk, psock);	/* use plain TCP receive */
+			return tcp_recvmsg(sk, msg, len, flags, addr_len);
+		}
+		copied = -EAGAIN;
+	}
+	ret = copied;
+
+unlock:
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+	return ret;
+}
+
+static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
+				struct sk_msg *msg, int *copied, int flags)
+{
+	bool cork = false, enospc = sk_msg_full(msg), redir_ingress;
+	struct sock *sk_redir;
+	u32 tosend, origsize, sent, delta = 0;
+	u32 eval;	/* remembers a REDIRECT verdict that was cleared from psock */
+	int ret;
+
+more_data:
+	if (psock->eval == __SK_NONE) {	/* no cached verdict: run the program */
+		/* Track delta in msg size to add/subtract it on SK_DROP from
+		 * returned to user copied size. This ensures user doesn't
+		 * get a positive return code with msg_cut_data and SK_DROP
+		 * verdict.
+		 */
+		delta = msg->sg.size;
+		psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+		delta -= msg->sg.size;
+	}
+
+	if (msg->cork_bytes &&
+	    msg->cork_bytes > msg->sg.size && !enospc) {
+		/* More bytes were requested than are queued: stash msg in psock->cork. */
+		psock->cork_bytes = msg->cork_bytes - msg->sg.size;
+		if (!psock->cork) {
+			psock->cork = kzalloc(sizeof(*psock->cork),
+					      GFP_ATOMIC | __GFP_NOWARN);
+			if (!psock->cork)
+				return -ENOMEM;
+		}
+		memcpy(psock->cork, msg, sizeof(*msg));
+		return 0;
+	}
+
+	tosend = msg->sg.size;	/* whole message, unless apply_bytes is smaller */
+	if (psock->apply_bytes && psock->apply_bytes < tosend)
+		tosend = psock->apply_bytes;
+	eval = __SK_NONE;
+
+	switch (psock->eval) {
+	case __SK_PASS:
+		ret = tcp_bpf_push(sk, msg, tosend, flags, true);
+		if (unlikely(ret)) {
+			*copied -= sk_msg_free(sk, msg);	/* freed bytes were not sent */
+			break;
+		}
+		sk_msg_apply_bytes(psock, tosend);
+		break;
+	case __SK_REDIRECT:
+		/* Snapshot the redirect target before psock state is reset below. */
+		redir_ingress = psock->redir_ingress;
+		sk_redir = psock->sk_redir;
+		sk_msg_apply_bytes(psock, tosend);
+		if (!psock->apply_bytes) {
+			/* Clean up before releasing the sock lock. */
+			eval = psock->eval;
+			psock->eval = __SK_NONE;
+			psock->sk_redir = NULL;
+		}
+		if (psock->cork) {	/* msg is the cork buffer: detach it from psock */
+			cork = true;
+			psock->cork = NULL;
+		}
+		sk_msg_return(sk, msg, tosend);
+		release_sock(sk);	/* the redirect runs without sk's lock held */
+
+		origsize = msg->sg.size;
+		ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress,
+					    msg, tosend, flags);
+		sent = origsize - msg->sg.size;	/* bytes the redirect consumed */
+
+		if (eval == __SK_REDIRECT)	/* psock->sk_redir was cleared above */
+			sock_put(sk_redir);
+
+		lock_sock(sk);
+		if (unlikely(ret < 0)) {
+			int free = sk_msg_free_nocharge(sk, msg);
+
+			if (!cork)
+				*copied -= free;
+		}
+		if (cork) {	/* the detached cork buffer is freed here */
+			sk_msg_free(sk, msg);
+			kfree(msg);
+			msg = NULL;
+			ret = 0;
+		}
+		break;
+	case __SK_DROP:
+	default:
+		sk_msg_free_partial(sk, msg, tosend);
+		sk_msg_apply_bytes(psock, tosend);
+		*copied -= (tosend + delta);	/* dropped bytes are not reported as sent */
+		return -EACCES;
+	}
+
+	if (likely(!ret)) {
+		if (!psock->apply_bytes) {	/* verdict fully applied: forget it */
+			psock->eval =  __SK_NONE;
+			if (psock->sk_redir) {
+				sock_put(psock->sk_redir);
+				psock->sk_redir = NULL;
+			}
+		}
+		if (msg &&
+		    msg->sg.data[msg->sg.start].page_link &&
+		    msg->sg.data[msg->sg.start].length) {
+			/* Data remains in msg: go round again for the rest. */
+			if (eval == __SK_REDIRECT)
+				sk_mem_charge(sk, tosend - sent);
+			goto more_data;
+		}
+	}
+	return ret;
+}
+
+static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	struct sk_msg tmp, *msg_tx = NULL;
+	int copied = 0, err = 0;
+	struct sk_psock *psock;
+	long timeo;
+	int flags;
+
+	/* Don't let internal flags through */
+	flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED);
+	flags |= MSG_NO_SHARED_FRAGS;
+
+	psock = sk_psock_get(sk);	/* paired with sk_psock_put() at out_err */
+	if (unlikely(!psock))		/* no psock attached: behave as plain TCP */
+		return tcp_sendmsg(sk, msg, size);
+
+	lock_sock(sk);
+	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+	while (msg_data_left(msg)) {
+		bool enospc = false;
+		u32 copy, osize;
+
+		if (sk->sk_err) {
+			err = -sk->sk_err;
+			goto out_err;
+		}
+
+		copy = msg_data_left(msg);
+		if (!sk_stream_memory_free(sk))
+			goto wait_for_sndbuf;
+		if (psock->cork) {	/* keep appending to a previously corked msg */
+			msg_tx = psock->cork;
+		} else {
+			msg_tx = &tmp;
+			sk_msg_init(msg_tx);
+		}
+
+		osize = msg_tx->sg.size;	/* size before this round, for rollback */
+		err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1);
+		if (err) {
+			if (err != -ENOSPC)
+				goto wait_for_memory;
+			enospc = true;
+			copy = msg_tx->sg.size - osize;	/* only what was allocated */
+		}
+
+		err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
+					       copy);
+		if (err < 0) {
+			sk_msg_trim(sk, msg_tx, osize);	/* undo this round's growth */
+			goto out_err;
+		}
+
+		copied += copy;
+		if (psock->cork_bytes) {
+			if (size > psock->cork_bytes)
+				psock->cork_bytes = 0;
+			else
+				psock->cork_bytes -= size;
+			if (psock->cork_bytes && !enospc)	/* still corking */
+				goto out_err;
+			/* All cork bytes are accounted, rerun the prog. */
+			psock->eval = __SK_NONE;
+			psock->cork_bytes = 0;
+		}
+
+		err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
+		if (unlikely(err < 0))
+			goto out_err;
+		continue;
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err) {
+			if (msg_tx && msg_tx != psock->cork)	/* cork stays with psock */
+				sk_msg_free(sk, msg_tx);
+			goto out_err;
+		}
+	}
+out_err:
+	if (err < 0)
+		err = sk_stream_error(sk, msg->msg_flags, err);
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+	return copied ? copied : err;	/* bytes accepted take precedence over err */
+}
+
+enum {	/* first index of tcp_bpf_prots[][]: address family */
+	TCP_BPF_IPV4,
+	TCP_BPF_IPV6,
+	TCP_BPF_NUM_PROTS,
+};
+
+enum {	/* second index of tcp_bpf_prots[][]: which handlers are overridden */
+	TCP_BPF_BASE,
+	TCP_BPF_TX,
+	TCP_BPF_RX,
+	TCP_BPF_TXRX,
+	TCP_BPF_NUM_CFGS,
+};
+
+static struct proto *tcpv6_prot_saved __read_mostly;	/* proto the IPv6 row was built from */
+static DEFINE_SPINLOCK(tcpv6_prot_lock);	/* serializes rebuilds of the IPv6 row */
+static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
+
+static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
+				   struct proto *base)
+{
+	struct proto *plain = &prot[TCP_BPF_BASE], *tx = &prot[TCP_BPF_TX];
+	struct proto *rx = &prot[TCP_BPF_RX], *txrx = &prot[TCP_BPF_TXRX];
+
+	*plain = *base;		/* BASE: copy of @base with the overrides below */
+	plain->destroy = sock_map_destroy;
+	plain->close = sock_map_close;
+	plain->recvmsg = tcp_bpf_recvmsg;
+	plain->sock_is_readable = sk_msg_is_readable;
+	*tx = *plain;		/* TX: BASE plus tcp_bpf_sendmsg */
+	tx->sendmsg = tcp_bpf_sendmsg;
+	*rx = *plain;		/* RX: BASE with the parser recvmsg */
+	rx->recvmsg = tcp_bpf_recvmsg_parser;
+	*txrx = *tx;		/* TXRX: TX with the parser recvmsg */
+	txrx->recvmsg = tcp_bpf_recvmsg_parser;
+}
+
+static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops)
+{
+	/* Lockless fast path: nothing to do if the row was built from @ops. */
+	if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
+		spin_lock_bh(&tcpv6_prot_lock);
+		if (likely(ops != tcpv6_prot_saved)) {	/* re-check under the lock */
+			tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
+			smp_store_release(&tcpv6_prot_saved, ops);	/* publish after the rebuild */
+		}
+		spin_unlock_bh(&tcpv6_prot_lock);
+	}
+
+static int __init tcp_bpf_v4_build_proto(void)
+{
+	tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot);	/* IPv4 row, built once */
+	return 0;
+}
+late_initcall(tcp_bpf_v4_build_proto);
+
+static int tcp_bpf_assert_proto_ops(struct proto *ops)
+{
+	/* The tcp_bpf handlers call tcp_recvmsg()/tcp_sendmsg() directly rather
+	 * than through ops (avoiding a retpoline); accept only matching protos.
+	 */
+	if (ops->recvmsg != tcp_recvmsg || ops->sendmsg != tcp_sendmsg)
+		return -ENOTSUPP;
+	return 0;
+}
+
+int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+	int config = psock->progs.msg_parser   ? TCP_BPF_TX   : TCP_BPF_BASE;
+
+	if (psock->progs.stream_verdict || psock->progs.skb_verdict) {
+		config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX;
+	}
+
+	if (restore) {	/* undo: put back what was saved in the psock */
+		if (inet_csk_has_ulp(sk)) {
+			/* TLS does not have an unhash proto in SW cases,
+			 * but we need to ensure we stop using the sock_map
+			 * unhash routine because the associated psock is being
+			 * removed. So use the original unhash handler.
+			 */
+			WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash);
+			tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space);
+		} else {
+			sk->sk_write_space = psock->saved_write_space;
+			/* Pairs with lockless read in sk_clone_lock() */
+			sock_replace_proto(sk, psock->sk_proto);
+		}
+		return 0;
+	}
+
+	if (sk->sk_family == AF_INET6) {	/* IPv6 row is rebuilt on demand */
+		if (tcp_bpf_assert_proto_ops(psock->sk_proto))
+			return -EINVAL;
+
+		tcp_bpf_check_v6_needs_rebuild(psock->sk_proto);
+	}
+
+	/* Pairs with lockless read in sk_clone_lock() */
+	sock_replace_proto(sk, &tcp_bpf_prots[family][config]);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_bpf_update_proto);
+
+/* A child cloned from a listening socket that had the tcp_bpf callbacks
+ * installed does not inherit the psock state those callbacks expect.
+ * If the child's proto is one of the tcp_bpf protos, replace it with the
+ * listener's sk_prot_creator.
+ */
+void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
+{
+	/* Only touch sockets whose proto lives inside tcp_bpf_prots[][]. */
+	if (!is_insidevar(newsk->sk_prot, tcp_bpf_prots))
+		return;
+	newsk->sk_prot = sk->sk_prot_creator;
+}
+#endif /* CONFIG_BPF_SYSCALL */
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
new file mode 100644
index 0000000000..ba4d98e510
--- /dev/null
+++ b/net/ipv4/tcp_cdg.c
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CAIA Delay-Gradient (CDG) congestion control
+ *
+ * This implementation is based on the paper:
+ *   D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ *   delay gradients." In IFIP Networking, pages 328-341. Springer, 2011.
+ *
+ * Scavenger traffic (Less-than-Best-Effort) should disable coexistence
+ * heuristics using parameters use_shadow=0 and use_ineff=0.
+ *
+ * Parameters window, backoff_beta, and backoff_factor are crucial for
+ * throughput and delay. Future work is needed to determine better defaults,
+ * and to provide guidelines for use in different environments/contexts.
+ *
+ * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/.
+ * Parameter window is only configurable when loading tcp_cdg as a module.
+ *
+ * Notable differences from paper/FreeBSD:
+ *   o Using Hybrid Slow start and Proportional Rate Reduction.
+ *   o Add toggle for shadow window mechanism. Suggested by David Hayes.
+ *   o Add toggle for non-congestion loss tolerance.
+ *   o Scaling parameter G is changed to a backoff factor;
+ *     conversion is given by: backoff_factor = 1000/(G * window).
+ *   o Limit shadow window to 2 * cwnd, or to cwnd when application limited.
+ *   o More accurate e^-x.
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/sched/clock.h>
+
+#include <net/tcp.h>
+
+#define HYSTART_ACK_TRAIN	1	/* bit in hystart_detect */
+#define HYSTART_DELAY		2	/* bit in hystart_detect */
+
+static int window __read_mostly = 8;	/* validated in tcp_cdg_register() */
+static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */
+static unsigned int backoff_factor __read_mostly = 42;
+static unsigned int hystart_detect __read_mostly = 3;	/* both HYSTART_* bits */
+static unsigned int use_ineff __read_mostly = 5;	/* 0 disables the check */
+static bool use_shadow __read_mostly = true;
+static bool use_tolerance __read_mostly;
+
+module_param(window, int, 0444);	/* read-only after load */
+MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)");
+module_param(backoff_beta, uint, 0644);
+MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)");
+module_param(backoff_factor, uint, 0644);
+MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor");
+module_param(hystart_detect, uint, 0644);
+MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start "
+		 "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)");
+module_param(use_ineff, uint, 0644);
+MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)");
+module_param(use_shadow, bool, 0644);
+MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
+module_param(use_tolerance, bool, 0644);
+MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
+
+struct cdg_minmax {	/* a min/max pair that can also be handled as one word */
+	union {
+		struct {
+			s32 min;
+			s32 max;
+		};
+		u64 v64;	/* overlays min and max: test or clear both at once */
+	};
+};
+
+enum cdg_state {	/* stored in cdg.state */
+	CDG_UNKNOWN = 0,	/* set when CWR completes */
+	CDG_NONFULL = 1,	/* set by tcp_cdg_grad() when use_tolerance is on */
+	CDG_FULL    = 2,	/* set by tcp_cdg_grad() when use_tolerance is on */
+	CDG_BACKOFF = 3,	/* set by tcp_cdg_backoff() before entering CWR */
+};
+
+struct cdg {	/* per-socket state; must fit in ICSK_CA_PRIV_SIZE */
+	struct cdg_minmax rtt;		/* min/max RTT of the current round */
+	struct cdg_minmax rtt_prev;	/* min/max RTT of the previous round */
+	struct cdg_minmax *gradients;	/* ring of `window` entries, or NULL */
+	struct cdg_minmax gsum;		/* running sums over the ring */
+	bool gfilled;			/* ring index has wrapped at least once */
+	u8  tail;			/* next ring slot to overwrite */
+	u8  state;			/* enum cdg_state */
+	u8  delack;			/* delayed-ACK filter counter, at most 5 */
+	u32 rtt_seq;			/* snd_nxt when the current round began */
+	u32 shadow_wnd;
+	u16 backoff_cnt;		/* backoffs since the last effectual one */
+	u16 sample_cnt;			/* HyStart delay samples this round */
+	s32 delay_min;			/* smallest non-zero rtt.min seen by HyStart */
+	u32 last_ack;			/* HyStart ACK train: time of last ACK */
+	u32 round_start;		/* HyStart ACK train: start of the train */
+};
+
+/**
+ * nexp_u32 - negative base-e exponential
+ * @ux: x in units of micro
+ *
+ * Returns exp(ux * -1e-6) * U32_MAX.
+ */
+static u32 __pure nexp_u32(u32 ux)
+{
+	static const u16 v[] = {
+		/* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */
+		65535,
+		65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422,
+		61378, 57484, 50423, 38795, 22965, 8047,  987,   14,
+	};
+	u32 msb = ux >> 8;	/* bits above the low eight; one table entry per bit */
+	u32 res;
+	int i;
+
+	/* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */
+	if (msb > U16_MAX)
+		return 0;
+
+	/* Scale first eight bits linearly: */
+	res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000);
+
+	/* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */
+	for (i = 1; msb; i++, msb >>= 1) {
+		u32 y = v[i & -(msb & 1)] + U32_C(1);	/* bit clear: v[0]+1 = 65536, i.e. 1.0 */
+
+		res = ((u64)res * y) >> 16;	/* multiply by y / 65536 */
+	}
+
+	return res;
+}
+
+/* Based on the HyStart algorithm (by Ha et al.) that is implemented in
+ * tcp_cubic. Differences/experimental changes:
+ *   o Using Hayes' delayed ACK filter.
+ *   o Using a usec clock for the ACK train.
+ *   o Reset ACK train when application limited.
+ *   o Invoked at any cwnd (i.e. also when cwnd < 16).
+ *   o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh).
+ */
+static void tcp_cdg_hystart_update(struct sock *sk)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min);
+	if (ca->delay_min == 0)	/* no RTT sample yet */
+		return;
+
+	if (hystart_detect & HYSTART_ACK_TRAIN) {
+		u32 now_us = tp->tcp_mstamp;	/* keeps only the low 32 bits */
+
+		if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
+			ca->last_ack = now_us;	/* (re)start the ACK train */
+			ca->round_start = now_us;
+		} else if (before(now_us, ca->last_ack + 3000)) {
+			u32 base_owd = max(ca->delay_min / 2U, 125U);
+
+			ca->last_ack = now_us;
+			if (after(now_us, ca->round_start + base_owd)) {
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
+				NET_ADD_STATS(sock_net(sk),
+					      LINUX_MIB_TCPHYSTARTTRAINCWND,
+					      tcp_snd_cwnd(tp));
+				tp->snd_ssthresh = tcp_snd_cwnd(tp);	/* ssthresh = cwnd */
+				return;
+			}
+		}
+	}
+
+	if (hystart_detect & HYSTART_DELAY) {
+		if (ca->sample_cnt < 8) {	/* first eight calls of a round only count */
+			ca->sample_cnt++;
+		} else {
+			s32 thresh = max(ca->delay_min + ca->delay_min / 8U,
+					 125U);
+
+			if (ca->rtt.min > thresh) {	/* round minimum above delay_min * 9/8 */
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
+				NET_ADD_STATS(sock_net(sk),
+					      LINUX_MIB_TCPHYSTARTDELAYCWND,
+					      tcp_snd_cwnd(tp));
+				tp->snd_ssthresh = tcp_snd_cwnd(tp);
+			}
+		}
+	}
+}
+
+static s32 tcp_cdg_grad(struct cdg *ca)
+{
+	s32 gmin = ca->rtt.min - ca->rtt_prev.min;	/* change in minimum RTT */
+	s32 gmax = ca->rtt.max - ca->rtt_prev.max;	/* change in maximum RTT */
+	s32 grad;
+
+	if (ca->gradients) {	/* replace the oldest ring entry, keep sums current */
+		ca->gsum.min += gmin - ca->gradients[ca->tail].min;
+		ca->gsum.max += gmax - ca->gradients[ca->tail].max;
+		ca->gradients[ca->tail].min = gmin;
+		ca->gradients[ca->tail].max = gmax;
+		ca->tail = (ca->tail + 1) & (window - 1);	/* needs power-of-two window */
+		gmin = ca->gsum.min;
+		gmax = ca->gsum.max;
+	}
+
+	/* We keep sums to ignore gradients during cwnd reductions;
+	 * the paper's smoothed gradients otherwise simplify to:
+	 * (rtt_latest - rtt_oldest) / window.
+	 *
+	 * We also drop division by window here.
+	 */
+	grad = gmin > 0 ? gmin : gmax;
+
+	/* Extrapolate missing values in gradient window: */
+	if (!ca->gfilled) {
+		if (!ca->gradients && window > 1)
+			grad *= window; /* Memory allocation failed. */
+		else if (ca->tail == 0)	/* index wrapped: the ring is full now */
+			ca->gfilled = true;
+		else
+			grad = (grad * window) / (int)ca->tail;
+	}
+
+	/* Backoff was effectual: */
+	if (gmin <= -32 || gmax <= -32)
+		ca->backoff_cnt = 0;
+
+	if (use_tolerance) {
+		/* Reduce small variations to zero: */
+		gmin = DIV_ROUND_CLOSEST(gmin, 64);
+		gmax = DIV_ROUND_CLOSEST(gmax, 64);
+
+		if (gmin > 0 && gmax <= 0)
+			ca->state = CDG_FULL;
+		else if ((gmin > 0 && gmax > 0) || gmax < 0)
+			ca->state = CDG_NONFULL;
+	}
+	return grad;	/* window-scaled gradient, see the comment above */
+}
+
+static bool tcp_cdg_backoff(struct sock *sk, u32 grad)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct cdg *ca = inet_csk_ca(sk);
+	u32 keep = nexp_u32(grad * backoff_factor);	/* exp(-x) scaled to u32 */
+
+	/* A random draw at or below exp(-x) means no backoff this time. */
+	if (get_random_u32() <= keep)
+		return false;
+
+	/* Give up after more than use_ineff backoffs in a row had no effect. */
+	if (use_ineff && ++ca->backoff_cnt > use_ineff)
+		return false;
+
+	ca->state = CDG_BACKOFF;
+	ca->shadow_wnd = max(ca->shadow_wnd, tcp_snd_cwnd(tp));
+	tcp_enter_cwr(sk);
+	return true;
+}
+
+/* Not called in CWR or Recovery state. */
+static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 prior_snd_cwnd;
+	u32 incr;
+
+	if (tcp_in_slow_start(tp) && hystart_detect)
+		tcp_cdg_hystart_update(sk);
+
+	if (after(ack, ca->rtt_seq) && ca->rtt.v64) {	/* round over, samples present */
+		s32 grad = 0;
+
+		if (ca->rtt_prev.v64)	/* a gradient needs a previous round */
+			grad = tcp_cdg_grad(ca);
+		ca->rtt_seq = tp->snd_nxt;	/* start the next round */
+		ca->rtt_prev = ca->rtt;
+		ca->rtt.v64 = 0;
+		ca->last_ack = 0;
+		ca->sample_cnt = 0;
+
+		if (grad > 0 && tcp_cdg_backoff(sk, grad))
+			return;
+	}
+
+	if (!tcp_is_cwnd_limited(sk)) {	/* shadow window may not exceed cwnd here */
+		ca->shadow_wnd = min(ca->shadow_wnd, tcp_snd_cwnd(tp));
+		return;
+	}
+
+	prior_snd_cwnd = tcp_snd_cwnd(tp);
+	tcp_reno_cong_avoid(sk, ack, acked);
+
+	incr = tcp_snd_cwnd(tp) - prior_snd_cwnd;	/* growth applied by Reno */
+	ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);	/* skip on u32 wrap */
+}
+
+static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (sample->rtt_us <= 0)	/* ignore non-positive RTT samples */
+		return;
+
+	/* A heuristic for filtering delayed ACKs, adapted from:
+	 * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support
+	 * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
+	 */
+	if (tp->sacked_out == 0) {
+		if (sample->pkts_acked == 1 && ca->delack) {
+			/* A delayed ACK is only used for the minimum if it is
+			 * provably lower than an existing non-zero minimum.
+			 */
+			ca->rtt.min = min(ca->rtt.min, sample->rtt_us);
+			ca->delack--;
+			return;
+		} else if (sample->pkts_acked > 1 && ca->delack < 5) {
+			ca->delack++;	/* counter saturates at 5 */
+		}
+	}
+
+	ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us);	/* zero means unset */
+	ca->rtt.max = max(ca->rtt.max, sample->rtt_us);
+}
+
+static u32 tcp_cdg_ssthresh(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct cdg *ca = inet_csk_ca(sk);
+
+	/* In CDG_BACKOFF state, scale cwnd by backoff_beta / 1024 (floor 2). */
+	if (ca->state == CDG_BACKOFF)
+		return max(2U, (tcp_snd_cwnd(tp) * min(1024U, backoff_beta)) >> 10);
+	if (use_tolerance && ca->state == CDG_NONFULL)
+		return tcp_snd_cwnd(tp);	/* loss tolerance: keep cwnd */
+
+	ca->shadow_wnd = min(ca->shadow_wnd >> 1, tcp_snd_cwnd(tp));
+	if (!use_shadow)
+		return max(2U, tcp_snd_cwnd(tp) >> 1);
+	return max3(2U, ca->shadow_wnd, tcp_snd_cwnd(tp) >> 1);
+}
+
+static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct cdg_minmax *gradients;
+
+	switch (ev) {
+	case CA_EVENT_CWND_RESTART:	/* reset all state, keep the ring allocation */
+		gradients = ca->gradients;
+		if (gradients)
+			memset(gradients, 0, window * sizeof(gradients[0]));
+		memset(ca, 0, sizeof(*ca));	/* also clears ca->gradients ... */
+
+		ca->gradients = gradients;	/* ... so restore the saved pointer */
+		ca->rtt_seq = tp->snd_nxt;
+		ca->shadow_wnd = tcp_snd_cwnd(tp);
+		break;
+	case CA_EVENT_COMPLETE_CWR:	/* CWR finished: begin a fresh round */
+		ca->state = CDG_UNKNOWN;
+		ca->rtt_seq = tp->snd_nxt;
+		ca->rtt_prev = ca->rtt;
+		ca->rtt.v64 = 0;
+		break;
+	default:
+		break;
+	}
+}
+
+static void tcp_cdg_init(struct sock *sk)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	ca->gradients = NULL;	/* tcp_cdg_grad() treats NULL as "no ring" */
+	/* We silently fall back to window = 1 if allocation fails. */
+	if (window > 1)
+		ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
+					GFP_NOWAIT | __GFP_NOWARN);
+	ca->rtt_seq = tp->snd_nxt;
+	ca->shadow_wnd = tcp_snd_cwnd(tp);
+}
+
+static void tcp_cdg_release(struct sock *sk)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+
+	kfree(ca->gradients);	/* frees the ring allocated in tcp_cdg_init() */
+	ca->gradients = NULL;	/* a repeated call then frees nothing */
+}
+
+static struct tcp_congestion_ops tcp_cdg __read_mostly = {
+	.cong_avoid = tcp_cdg_cong_avoid,
+	.cwnd_event = tcp_cdg_cwnd_event,
+	.pkts_acked = tcp_cdg_acked,
+	.undo_cwnd = tcp_reno_undo_cwnd,	/* shared with Reno */
+	.ssthresh = tcp_cdg_ssthresh,
+	.release = tcp_cdg_release,
+	.init = tcp_cdg_init,
+	.owner = THIS_MODULE,
+	.name = "cdg",
+};
+
+static int __init tcp_cdg_register(void)
+{
+	if (backoff_beta > 1024 || window < 1 || window > 256)
+		return -ERANGE;
+	if (!is_power_of_2(window))	/* the ring index is masked with window - 1 */
+		return -EINVAL;
+
+	BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE);
+	/* Fail the load if registration fails, so exit never unlinks a stray entry. */
+	return tcp_register_congestion_control(&tcp_cdg);
+}
+
+static void __exit tcp_cdg_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_cdg);	/* unlinks the entry, then waits for RCU readers */
+}
+
+module_init(tcp_cdg_register);
+module_exit(tcp_cdg_unregister);
+MODULE_AUTHOR("Kenneth Klette Jonassen");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP CDG");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 0000000000..1b34050a75
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Pluggable TCP congestion control support and newReno
+ * congestion control.
+ * Based on ideas from I/O scheduler support and Web100.
+ *
+ * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <linux/jhash.h>
+#include <net/tcp.h>
+#include <trace/events/tcp.h>
+
+static DEFINE_SPINLOCK(tcp_cong_list_lock);	/* taken around updates of tcp_cong_list */
+static LIST_HEAD(tcp_cong_list);		/* registered algorithms; walked under RCU */
+
+/* Look up a registered algorithm by exact name; the list is short, so scan. */
+struct tcp_congestion_ops *tcp_ca_find(const char *name)
+{
+	struct tcp_congestion_ops *ca;
+
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+		if (strcmp(ca->name, name))
+			continue;
+		return ca;
+	}
+	return NULL;
+}
+
+void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	trace_tcp_cong_state_set(sk, ca_state);
+
+	if (icsk->icsk_ca_ops->set_state)	/* optional hook */
+		icsk->icsk_ca_ops->set_state(sk, ca_state);
+	icsk->icsk_ca_state = ca_state;	/* stored last: the hook still sees the old state */
+}
+
+/* Caller holds rcu_read_lock(); it is dropped and re-taken around module load. */
+static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
+						       const char *name)
+{
+	struct tcp_congestion_ops *ca = tcp_ca_find(name);
+
+#ifdef CONFIG_MODULES
+	if (!ca && capable(CAP_NET_ADMIN)) {	/* only privileged callers trigger a load */
+		rcu_read_unlock();
+		request_module("tcp_%s", name);
+		rcu_read_lock();
+		ca = tcp_ca_find(name);	/* look again now that the module may be loaded */
+	}
+#endif
+	return ca;
+}
+
+/* Look up a registered algorithm by its key; the list is short, so scan. */
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
+{
+	struct tcp_congestion_ops *ca;
+
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+		if (ca->key != key)
+			continue;
+		return ca;
+	}
+	return NULL;
+}
+
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
+{
+	bool has_cong_hook = ca->cong_avoid || ca->cong_control;
+
+	/* ssthresh and undo_cwnd are mandatory, plus one of the two hooks */
+	if (ca->ssthresh && ca->undo_cwnd && has_cong_hook)
+		return 0;
+
+	pr_err("%s does not implement required ops\n", ca->name);
+	return -EINVAL;
+}
+
+/* Attach new congestion control algorithm to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+	int ret;
+
+	ret = tcp_validate_congestion_control(ca);
+	if (ret)
+		return ret;
+
+	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));	/* hashes the whole name array */
+
+	spin_lock(&tcp_cong_list_lock);
+	if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {	/* reserved or duplicate key */
+		pr_notice("%s already registered or non-unique key\n",
+			  ca->name);
+		ret = -EEXIST;
+	} else {
+		list_add_tail_rcu(&ca->list, &tcp_cong_list);
+		pr_debug("%s registered\n", ca->name);
+	}
+	spin_unlock(&tcp_cong_list_lock);
+
+	return ret;	/* 0, -EINVAL from validation, or -EEXIST */
+}
+EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
+
+/*
+ * Remove congestion control algorithm, called from
+ * the module's remove function.  Module ref counts are used
+ * to ensure that this can't be done till all sockets using
+ * that method are closed.
+ */
+void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
+{
+	spin_lock(&tcp_cong_list_lock);
+	list_del_rcu(&ca->list);	/* @ca must currently be on tcp_cong_list */
+	spin_unlock(&tcp_cong_list_lock);
+
+	/* Wait for outstanding readers to complete before the
+	 * module gets removed entirely.
+	 *
+	 * A try_module_get() should fail by now as our module is
+	 * in "going" state since no refs are held anymore and
+	 * module_exit() handler being called.
+	 */
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+
+/* Replace a registered old ca with a new one.
+ *
+ * The new ca must have the same name as the old one, which must already
+ * be registered. Returns 0 on success or -EINVAL.
+ */
+int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
+{
+	struct tcp_congestion_ops *existing;
+	int ret;
+
+	ret = tcp_validate_congestion_control(ca);
+	if (ret)
+		return ret;
+
+	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
+	spin_lock(&tcp_cong_list_lock);
+	existing = tcp_ca_find_key(old_ca->key);	/* what is registered under the old key */
+	if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
+		pr_notice("%s not registered or non-unique key\n",
+			  ca->name);
+		ret = -EINVAL;
+	} else if (existing != old_ca) {	/* same key, but not the caller's object */
+		pr_notice("invalid old congestion control algorithm to replace\n");
+		ret = -EINVAL;
+	} else {
+		/* Add the new one before removing the old one to keep
+		 * one implementation available all the time.
+		 */
+		list_add_tail_rcu(&ca->list, &tcp_cong_list);
+		list_del_rcu(&existing->list);
+		pr_debug("%s updated\n", ca->name);
+	}
+	spin_unlock(&tcp_cong_list_lock);
+
+	/* Wait for outstanding readers to complete before the
+	 * module or struct_ops gets removed entirely.
+	 */
+	if (!ret)
+		synchronize_rcu();
+
+	return ret;
+}
+
+u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
+{
+	const struct tcp_congestion_ops *ca;
+	u32 key = TCP_CA_UNSPEC;	/* returned when @name is not found */
+
+	might_sleep();	/* the lookup below may call request_module() */
+
+	rcu_read_lock();
+	ca = tcp_ca_find_autoload(net, name);
+	if (ca) {
+		key = ca->key;
+		*ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;	/* written only on success */
+	}
+	rcu_read_unlock();
+
+	return key;
+}
+
+char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+	const struct tcp_congestion_ops *ca;
+	char *ret = NULL;	/* NULL: nothing is registered under @key */
+
+	rcu_read_lock();
+	ca = tcp_ca_find_key(key);
+	if (ca) {
+		strscpy_pad(buffer, ca->name, TCP_CA_NAME_MAX);	/* always NUL-terminated */
+		ret = buffer;	/* @buffer must hold TCP_CA_NAME_MAX bytes */
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+/* Assign choice of congestion control. */
+void tcp_assign_congestion_control(struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_congestion_ops *ca;
+
+	rcu_read_lock();
+	ca = rcu_dereference(net->ipv4.tcp_congestion_control);	/* the netns default */
+	if (unlikely(!bpf_try_module_get(ca, ca->owner)))
+		ca = &tcp_reno;	/* could not take a reference: fall back to Reno */
+	icsk->icsk_ca_ops = ca;
+	rcu_read_unlock();
+
+	memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));	/* private state starts zeroed */
+	if (ca->flags & TCP_CONG_NEEDS_ECN)
+		INET_ECN_xmit(sk);
+	else
+		INET_ECN_dontxmit(sk);
+}
+
+void tcp_init_congestion_control(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	tcp_sk(sk)->prior_ssthresh = 0;
+	if (icsk->icsk_ca_ops->init)	/* optional hook */
+		icsk->icsk_ca_ops->init(sk);
+	if (tcp_ca_needs_ecn(sk))
+		INET_ECN_xmit(sk);
+	else
+		INET_ECN_dontxmit(sk);
+	icsk->icsk_ca_initialized = 1;	/* set only after the init hook has run */
+}
+
+static void tcp_reinit_congestion_control(struct sock *sk,
+					  const struct tcp_congestion_ops *ca)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	tcp_cleanup_congestion_control(sk);	/* release the old algorithm first */
+	icsk->icsk_ca_ops = ca;
+	icsk->icsk_ca_setsockopt = 1;
+	memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+
+	if (ca->flags & TCP_CONG_NEEDS_ECN)
+		INET_ECN_xmit(sk);
+	else
+		INET_ECN_dontxmit(sk);
+
+	if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))	/* skipped in CLOSE and LISTEN */
+		tcp_init_congestion_control(sk);
+}
+
+/* Run the algorithm's release hook, if any, and drop its module reference. */
+void tcp_cleanup_congestion_control(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->release)	/* optional hook */
+		icsk->icsk_ca_ops->release(sk);
+	bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
+}
+
+/* Used by sysctl to change default congestion control */
+int tcp_set_default_congestion_control(struct net *net, const char *name)
+{
+	struct tcp_congestion_ops *ca;
+	const struct tcp_congestion_ops *prev;
+	int ret;
+
+	rcu_read_lock();
+	ca = tcp_ca_find_autoload(net, name);
+	if (!ca) {
+		ret = -ENOENT;
+	} else if (!bpf_try_module_get(ca, ca->owner)) {
+		ret = -EBUSY;
+	} else if (!net_eq(net, &init_net) &&
+			!(ca->flags & TCP_CONG_NON_RESTRICTED)) {
+		/* Only init netns can set default to a restricted algorithm */
+		ret = -EPERM;
+	} else {
+		prev = xchg(&net->ipv4.tcp_congestion_control, ca);	/* swap in the new default */
+		if (prev)
+			bpf_module_put(prev, prev->owner);	/* drop the old default's reference */
+
+		ca->flags |= TCP_CONG_NON_RESTRICTED;	/* the default becomes non-restricted */
+		ret = 0;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/* Set default value from kernel configuration at bootup */
+static int __init tcp_congestion_default(void)
+{
+	return tcp_set_default_congestion_control(&init_net,
+						  CONFIG_DEFAULT_TCP_CONG);	/* name from Kconfig */
+}
+late_initcall(tcp_congestion_default);
+
+/* Build a space-separated list of all registered algorithms in @buf. */
+void tcp_get_available_congestion_control(char *buf, size_t maxlen)
+{
+	struct tcp_congestion_ops *ca;
+	size_t offs = 0;
+
+	*buf = '\0';	/* defined (empty) result even if nothing is registered */
+	rcu_read_lock();
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+		offs += snprintf(buf + offs, maxlen - offs, "%s%s",
+				 offs == 0 ? "" : " ", ca->name);
+
+		if (WARN_ON_ONCE(offs >= maxlen))	/* output truncated: stop */
+			break;
+	}
+	rcu_read_unlock();
+}
+
+/* Copy the netns default algorithm's name into @name (TCP_CA_NAME_MAX bytes). */
+void tcp_get_default_congestion_control(struct net *net, char *name)
+{
+	const struct tcp_congestion_ops *ca;
+
+	rcu_read_lock();
+	ca = rcu_dereference(net->ipv4.tcp_congestion_control);
+	strscpy_pad(name, ca->name, TCP_CA_NAME_MAX);	/* always NUL-terminated */
+	rcu_read_unlock();
+}
+
+/* Build list of non-restricted congestion control values */
+void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
+{
+	struct tcp_congestion_ops *ca;
+	size_t offs = 0;
+
+	*buf = '\0';	/* empty result if no algorithm is non-restricted */
+	rcu_read_lock();
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+		if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
+			continue;
+		offs += snprintf(buf + offs, maxlen - offs,
+				 "%s%s",
+				 offs == 0 ? "" : " ", ca->name);	/* space-separated */
+
+		if (WARN_ON_ONCE(offs >= maxlen))	/* output truncated: stop */
+			break;
+	}
+	rcu_read_unlock();
+}
+
+/* Change list of non-restricted congestion control */
+int tcp_set_allowed_congestion_control(char *val)
+{
+	struct tcp_congestion_ops *ca;
+	char *saved_clone, *clone, *name;
+	int ret = 0;
+
+	saved_clone = clone = kstrdup(val, GFP_USER);	/* scratch copy for pass 1 */
+	if (!clone)
+		return -ENOMEM;
+
+	spin_lock(&tcp_cong_list_lock);
+	/* pass 1 check for bad entries */
+	while ((name = strsep(&clone, " ")) && *name) {
+		ca = tcp_ca_find(name);
+		if (!ca) {	/* unknown name: reject before changing any flag */
+			ret = -ENOENT;
+			goto out;
+		}
+	}
+
+	/* pass 2 clear old values */
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list)
+		ca->flags &= ~TCP_CONG_NON_RESTRICTED;
+
+	/* pass 3 mark as allowed */
+	while ((name = strsep(&val, " ")) && *name) {	/* splits the caller's @val in place */
+		ca = tcp_ca_find(name);
+		WARN_ON(!ca);	/* pass 1 already found every name */
+		if (ca)
+			ca->flags |= TCP_CONG_NON_RESTRICTED;
+	}
+out:
+	spin_unlock(&tcp_cong_list_lock);
+	kfree(saved_clone);	/* strsep() advanced clone; free the original pointer */
+
+	return ret;
+}
+
+/* Change congestion control for socket. If load is false, then it is the
+ * responsibility of the caller to call tcp_init_congestion_control or
+ * tcp_reinit_congestion_control (if the current congestion control was
+ * already initialized).
+ */
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
+			       bool cap_net_admin)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_congestion_ops *ca;
+	int err = 0;
+
+	if (icsk->icsk_ca_dst_locked)
+		return -EPERM;
+
+	rcu_read_lock();
+	if (!load)
+		ca = tcp_ca_find(name);	/* registered algorithms only */
+	else
+		ca = tcp_ca_find_autoload(sock_net(sk), name);	/* may load a module */
+
+	/* No change asking for existing value */
+	if (ca == icsk->icsk_ca_ops) {
+		icsk->icsk_ca_setsockopt = 1;
+		goto out;
+	}
+
+	if (!ca)
+		err = -ENOENT;
+	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin))
+		err = -EPERM;	/* restricted algorithm and caller lacks the privilege */
+	else if (!bpf_try_module_get(ca, ca->owner))
+		err = -EBUSY;
+	else
+		tcp_reinit_congestion_control(sk, ca);	/* uses the reference just taken */
+ out:
+	rcu_read_unlock();
+	return err;
+}
+
+/* Slow start: grow cwnd by one segment per ACKed packet while it is below
+ * ssthresh (RFC 2581). A stretch ACK covering N packets is treated like N
+ * back-to-back single ACKs; unlike RFC 3465 ABC, N is not capped at 2.
+ * Instead a packet only counts once it is (s)acked in its entirety, which
+ * defends against the ACK attacks described in that RFC.
+ *
+ * cwnd is raised to at most snd_ssthresh and then limited to snd_cwnd_clamp.
+ * Returns the ACKs not consumed here, for use in congestion avoidance.
+ */
+__bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
+{
+	u32 old_cwnd = tcp_snd_cwnd(tp);
+	u32 new_cwnd = min(old_cwnd + acked, tp->snd_ssthresh);
+	u32 consumed = new_cwnd - old_cwnd;	/* ACKs spent reaching new_cwnd */
+
+	tcp_snd_cwnd_set(tp, min(new_cwnd, tp->snd_cwnd_clamp));
+	return acked - consumed;
+}
+EXPORT_SYMBOL_GPL(tcp_slow_start);
+
+/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
+ * for every packet that was ACKed.
+ */
+__bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
+{
+	/* If credits accumulated at a higher w, apply them gently now. */
+	if (tp->snd_cwnd_cnt >= w) {
+		tp->snd_cwnd_cnt = 0;	/* all leftover credit buys a single segment */
+		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
+	}
+
+	tp->snd_cwnd_cnt += acked;
+	if (tp->snd_cwnd_cnt >= w) {
+		u32 delta = tp->snd_cwnd_cnt / w;	/* whole segments earned */
+
+		tp->snd_cwnd_cnt -= delta * w;	/* keep the remainder as credit */
+		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta);
+	}
+	tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
+}
+EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
+
+/*
+ * TCP Reno congestion control
+ * This is special case used for fallback as well.
+ */
+/* This is Jacobson's slow start and congestion avoidance.
+ * SIGCOMM '88, p. 328.
+ */
+__bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tcp_is_cwnd_limited(sk))	/* cwnd is not the limit: do not grow it */
+		return;
+
+	/* In "safe" area, increase. */
+	if (tcp_in_slow_start(tp)) {
+		acked = tcp_slow_start(tp, acked);	/* returns the leftover ACKs */
+		if (!acked)
+			return;
+	}
+	/* In dangerous area, increase slowly. */
+	tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);	/* also reached with acked == 0 outside slow start */
+}
+EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
+
+/* Reno ssthresh: half of the current congestion window, but never below 2. */
+__bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk)
+{
+	u32 half = tcp_snd_cwnd(tcp_sk(sk)) >> 1U;
+
+	return max(half, 2U);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
+
+__bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	return max(tp->prior_cwnd, tcp_snd_cwnd(tp));	/* never below the current cwnd */
+}
+EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
+
+struct tcp_congestion_ops tcp_reno = {	/* also the fallback in tcp_assign_congestion_control() */
+	.flags		= TCP_CONG_NON_RESTRICTED,
+	.name		= "reno",
+	.owner		= THIS_MODULE,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+};
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
new file mode 100644
index 0000000000..0fd78ecb67
--- /dev/null
+++ b/net/ipv4/tcp_cubic.c
@@ -0,0 +1,557 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
+ * Home page:
+ *      http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
+ * This is from the implementation of CUBIC TCP in
+ * Sangtae Ha, Injong Rhee and Lisong Xu,
+ *  "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
+ *  in ACM SIGOPS Operating System Review, July 2008.
+ * Available from:
+ *  http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf
+ *
+ * CUBIC integrates a new slow start algorithm, called HyStart.
+ * The details of HyStart are presented in
+ *  Sangtae Ha and Injong Rhee,
+ *  "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008.
+ * Available from:
+ *  http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf
+ *
+ * All testing results are available from:
+ * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing
+ *
+ * Unless CUBIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/mm.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/module.h>
+#include <linux/math64.h>
+#include <net/tcp.h>
+
+#define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
+					 * max_cwnd = snd_cwnd * beta
+					 */
+#define	BICTCP_HZ		10	/* BIC HZ 2^10 = 1024 */
+
+/* Two methods of hybrid slow start */
+#define HYSTART_ACK_TRAIN	0x1
+#define HYSTART_DELAY		0x2
+
+/* Number of delay samples for detecting the increase of delay */
+#define HYSTART_MIN_SAMPLES	8
+#define HYSTART_DELAY_MIN	(4000U)	/* 4 ms */
+#define HYSTART_DELAY_MAX	(16000U)	/* 16 ms */
+#define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
+
+static int fast_convergence __read_mostly = 1;
+static int beta __read_mostly = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh __read_mostly;
+static int bic_scale __read_mostly = 41;
+static int tcp_friendliness __read_mostly = 1;
+
+static int hystart __read_mostly = 1;
+static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
+static int hystart_low_window __read_mostly = 16;
+static int hystart_ack_delta_us __read_mostly = 2000;
+
+static u32 cube_rtt_scale __read_mostly;
+static u32 beta_scale __read_mostly;
+static u64 cube_factor __read_mostly;
+
+/* Note parameters that are used for precomputing scale factors are read-only */
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(bic_scale, int, 0444);
+MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
+module_param(tcp_friendliness, int, 0644);
+MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
+module_param(hystart, int, 0644);
+MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
+module_param(hystart_detect, int, 0644);
+MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
+		 " 1: packet-train 2: delay 3: both packet-train and delay");
+module_param(hystart_low_window, int, 0644);
+MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
+module_param(hystart_ack_delta_us, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");
+
+/* BIC TCP Parameters */
+struct bictcp {
+	u32	cnt;		/* increase cwnd by 1 after this many ACKs */
+	u32	last_max_cwnd;	/* last maximum snd_cwnd */
+	u32	last_cwnd;	/* the last snd_cwnd */
+	u32	last_time;	/* time when updated last_cwnd */
+	u32	bic_origin_point;/* origin point of bic function */
+	u32	bic_K;		/* time to origin point
+				   from the beginning of the current epoch */
+	u32	delay_min;	/* min delay (usec) */
+	u32	epoch_start;	/* beginning of an epoch */
+	u32	ack_cnt;	/* number of acks */
+	u32	tcp_cwnd;	/* estimated tcp cwnd */
+	u16	unused;		/* bictcp_reset() zeroes every field before this one */
+	u8	sample_cnt;	/* number of samples to decide curr_rtt */
+	u8	found;		/* the exit point is found? */
+	u32	round_start;	/* beginning of each round */
+	u32	end_seq;	/* end_seq of the round */
+	u32	last_ack;	/* last time when the ACK spacing is close */
+	u32	curr_rtt;	/* the minimum rtt of current round */
+};
+
+static inline void bictcp_reset(struct bictcp *ca)
+{
+	ca->found = 0;	/* lies past 'unused', so the memset below misses it */
+	memset(ca, 0, offsetof(struct bictcp, unused));	/* cnt .. tcp_cwnd */
+}
+
+static inline u32 bictcp_clock_us(const struct sock *sk)
+{
+	return tcp_sk(sk)->tcp_mstamp;	/* converted to the u32 return type */
+}
+
+static inline void bictcp_hystart_reset(struct sock *sk)
+{
+	struct bictcp *ca = inet_csk_ca(sk);
+
+	ca->last_ack = bictcp_clock_us(sk);
+	ca->round_start = ca->last_ack;		/* both start at "now" */
+	ca->end_seq = tcp_sk(sk)->snd_nxt;
+	ca->curr_rtt = ~0U;			/* no sample in this round yet */
+	ca->sample_cnt = 0;
+}
+
+__bpf_kfunc static void cubictcp_init(struct sock *sk)
+{
+	bictcp_reset(inet_csk_ca(sk));
+
+	/* With HyStart, seed its per-round state; without it, a non-zero
+	 * initial_ssthresh module parameter presets snd_ssthresh.
+	 */
+	if (hystart)
+		bictcp_hystart_reset(sk);
+	if (!hystart && initial_ssthresh)
+		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+}
+
+__bpf_kfunc static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	struct bictcp *ca = inet_csk_ca(sk);
+	u32 now = tcp_jiffies32;
+	s32 idle;
+
+	if (event != CA_EVENT_TX_START)
+		return;
+
+	/* Sending resumes after an application-limited (idle) period:
+	 * move epoch_start forward by the idle time, but never past
+	 * 'now', so that cwnd growth stays on the cubic curve.
+	 */
+	idle = now - tcp_sk(sk)->lsndtime;
+	if (ca->epoch_start && idle > 0) {
+		ca->epoch_start += idle;
+		if (after(ca->epoch_start, now))
+			ca->epoch_start = now;
+	}
+}
+
+/* calculate the cubic root of x using a table lookup followed by one
+ * Newton-Raphson iteration.
+ * Avg err ~= 0.195%
+ */
+static u32 cubic_root(u64 a)
+{
+	u32 x, b, shift;
+	/*
+	 * cbrt(x) MSB values for x MSB values in [0..63].
+	 * Precomputed then refined by hand - Willy Tarreau
+	 *
+	 * For x in [0..63],
+	 *   v = cbrt(x << 18) - 1
+	 *   cbrt(x) = (v[x] + 10) >> 6
+	 */
+	static const u8 v[] = {
+		/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
+		/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
+		/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
+		/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
+		/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
+		/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
+		/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
+		/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
+	};
+
+	b = fls64(a);	/* bit length of a; b < 7 means a indexes v[] directly */
+	if (b < 7) {
+		/* a in [0..63] */
+		return ((u32)v[(u32)a] + 35) >> 6;
+	}
+
+	b = ((b * 84) >> 8) - 1;	/* 84/256 ~= 1/3, so roughly b / 3 - 1 */
+	shift = (a >> (b * 3));		/* leading bits of a, used as index into v[] */
+
+	x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;	/* table-based first estimate */
+
+	/*
+	 * Newton-Raphson iteration
+	 *                         2
+	 * x    = ( 2 * x  +  a / x  ) / 3
+	 *  k+1          k         k
+	 */
+	x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));	/* note: divides by x * (x - 1) */
+	x = ((x * 341) >> 10);	/* 341/1024 ~= 1/3 */
+	return x;
+}
+
+/*
+ * Compute congestion window to use.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
+{
+	u32 delta, bic_target, max_cnt;
+	u64 offs, t;
+
+	ca->ack_cnt += acked;	/* count the number of ACKed packets */
+
+	if (ca->last_cwnd == cwnd &&
+	    (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
+		return;		/* cwnd unchanged and ca->cnt recent enough: keep it */
+
+	/* The CUBIC function can update ca->cnt at most once per jiffy.
+	 * On all cwnd reduction events, ca->epoch_start is set to 0,
+	 * which will force a recalculation of ca->cnt.
+	 */
+	if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
+		goto tcp_friendliness;
+
+	ca->last_cwnd = cwnd;
+	ca->last_time = tcp_jiffies32;
+
+	if (ca->epoch_start == 0) {
+		ca->epoch_start = tcp_jiffies32;	/* record beginning */
+		ca->ack_cnt = acked;			/* start counting */
+		ca->tcp_cwnd = cwnd;			/* sync with cubic */
+
+		if (ca->last_max_cwnd <= cwnd) {
+			ca->bic_K = 0;
+			ca->bic_origin_point = cwnd;
+		} else {
+			/* Compute new K based on
+			 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
+			 */
+			ca->bic_K = cubic_root(cube_factor
+					       * (ca->last_max_cwnd - cwnd));
+			ca->bic_origin_point = ca->last_max_cwnd;
+		}
+	}
+
+	/* cubic function - calc*/
+	/* calculate c * time^3 / rtt,
+	 *  while considering overflow in calculation of time^3
+	 * (so time^3 is done by using 64 bit)
+	 * and without the support of division of 64bit numbers
+	 * (so all divisions are done by using 32 bit)
+	 *  also NOTE the unit of those variables
+	 *	  time  = (t - K) / 2^bictcp_HZ
+	 *	  c = bic_scale >> 10
+	 * rtt  = (srtt >> 3) / HZ
+	 * !!! The following code does not have overflow problems,
+	 * if the cwnd < 1 million packets !!!
+	 */
+
+	t = (s32)(tcp_jiffies32 - ca->epoch_start);
+	t += usecs_to_jiffies(ca->delay_min);
+	/* change the unit from HZ to bictcp_HZ */
+	t <<= BICTCP_HZ;
+	do_div(t, HZ);
+
+	if (t < ca->bic_K)		/* t - K */
+		offs = ca->bic_K - t;
+	else
+		offs = t - ca->bic_K;
+
+	/* c/rtt * (t-K)^3 */
+	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
+	if (t < ca->bic_K)                            /* below origin*/
+		bic_target = ca->bic_origin_point - delta;
+	else                                          /* above origin*/
+		bic_target = ca->bic_origin_point + delta;
+
+	/* cubic function - calc bictcp_cnt*/
+	if (bic_target > cwnd) {
+		ca->cnt = cwnd / (bic_target - cwnd);
+	} else {
+		ca->cnt = 100 * cwnd;              /* very small increment*/
+	}
+
+	/*
+	 * The initial growth of cubic function may be too conservative
+	 * when the available bandwidth is still unknown.
+	 */
+	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
+		ca->cnt = 20;	/* increase cwnd 5% per RTT */
+
+tcp_friendliness:
+	/* TCP Friendly */
+	if (tcp_friendliness) {
+		u32 scale = beta_scale;
+
+		delta = (cwnd * scale) >> 3;	/* ACKs per one-segment step of tcp_cwnd */
+		while (ca->ack_cnt > delta) {		/* update tcp cwnd */
+			ca->ack_cnt -= delta;
+			ca->tcp_cwnd++;
+		}
+
+		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
+			delta = ca->tcp_cwnd - cwnd;
+			max_cnt = cwnd / delta;
+			if (ca->cnt > max_cnt)
+				ca->cnt = max_cnt;
+		}
+	}
+
+	/* The maximum rate of cwnd increase CUBIC allows is 1 packet per
+	 * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
+	 */
+	ca->cnt = max(ca->cnt, 2U);
+}
+
+__bpf_kfunc static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct bictcp *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tcp_is_cwnd_limited(sk))
+		return;
+	if (tcp_in_slow_start(tp)) {
+		acked = tcp_slow_start(tp, acked);
+		if (acked == 0)
+			return;
+	}
+	/* Refresh ca->cnt, then use it as w for the additive increase. */
+	bictcp_update(ca, tcp_snd_cwnd(tp), acked);
+	tcp_cong_avoid_ai(tp, ca->cnt, acked);
+}
+
+__bpf_kfunc static u32 cubictcp_recalc_ssthresh(struct sock *sk)
+{
+	struct bictcp *ca = inet_csk_ca(sk);
+	u32 cwnd = tcp_snd_cwnd(tcp_sk(sk));
+
+	ca->epoch_start = 0;	/* the current epoch ends here */
+
+	/* Record Wmax, shrunk further when fast convergence applies */
+	if (cwnd < ca->last_max_cwnd && fast_convergence)
+		ca->last_max_cwnd = (cwnd * (BICTCP_BETA_SCALE + beta))
+			/ (2 * BICTCP_BETA_SCALE);
+	else
+		ca->last_max_cwnd = cwnd;
+
+	return max((cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+__bpf_kfunc static void cubictcp_state(struct sock *sk, u8 new_state)
+{
+	if (new_state != TCP_CA_Loss)
+		return;
+	/* Entering Loss: reset the CUBIC and HyStart state. */
+	bictcp_reset(inet_csk_ca(sk));
+	bictcp_hystart_reset(sk);
+}
+
+/* Account for TSO/GRO delays.
+ * Otherwise short RTT flows could get too small ssthresh, since during
+ * slow start we begin with small TSO packets and ca->delay_min would
+ * not account for long aggregation delay when TSO packets get bigger.
+ * Ideally even with a very small RTT we would like to have at least one
+ * TSO packet being sent and received by GRO, and another one in qdisc layer.
+ * We apply another 100% factor because @rate is doubled at this point.
+ * We cap the cushion to 1ms.
+ */
+static u32 hystart_ack_delay(const struct sock *sk)
+{
+	unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
+	u64 cushion_us;
+
+	if (rate == 0)
+		return 0;
+	cushion_us = div64_ul((u64)sk->sk_gso_max_size * 4 * USEC_PER_SEC, rate);
+	return min_t(u64, USEC_PER_MSEC, cushion_us);
+}
+
+static void hystart_update(struct sock *sk, u32 delay)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
+	u32 threshold;
+
+	if (after(tp->snd_una, ca->end_seq))
+		bictcp_hystart_reset(sk);	/* data up to end_seq is acked: new round */
+
+	if (hystart_detect & HYSTART_ACK_TRAIN) {
+		u32 now = bictcp_clock_us(sk);
+
+		/* first detection parameter - ack-train detection */
+		if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
+			ca->last_ack = now;
+
+			threshold = ca->delay_min + hystart_ack_delay(sk);
+
+			/* Hystart ack train triggers if we get ack past
+			 * ca->delay_min/2.
+			 * Pacing might have delayed packets up to RTT/2
+			 * during slow start.
+			 */
+			if (sk->sk_pacing_status == SK_PACING_NONE)
+				threshold >>= 1;
+
+			if ((s32)(now - ca->round_start) > threshold) {
+				ca->found = 1;
+				pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
+					 now - ca->round_start, threshold,
+					 ca->delay_min, hystart_ack_delay(sk), tcp_snd_cwnd(tp));
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
+				NET_ADD_STATS(sock_net(sk),
+					      LINUX_MIB_TCPHYSTARTTRAINCWND,
+					      tcp_snd_cwnd(tp));
+				tp->snd_ssthresh = tcp_snd_cwnd(tp);	/* ssthresh := current cwnd */
+			}
+		}
+	}
+
+	if (hystart_detect & HYSTART_DELAY) {
+		/* obtain the minimum delay of more than sampling packets */
+		if (ca->curr_rtt > delay)
+			ca->curr_rtt = delay;
+		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
+			ca->sample_cnt++;	/* still collecting samples for this round */
+		} else {
+			if (ca->curr_rtt > ca->delay_min +
+			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
+				ca->found = 1;
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
+				NET_ADD_STATS(sock_net(sk),
+					      LINUX_MIB_TCPHYSTARTDELAYCWND,
+					      tcp_snd_cwnd(tp));
+				tp->snd_ssthresh = tcp_snd_cwnd(tp);	/* ssthresh := current cwnd */
+			}
+		}
+	}
+}
+
+__bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
+	u32 delay;
+
+	/* Some calls are for duplicates without timestamps */
+	if (sample->rtt_us < 0)
+		return;
+
+	/* Discard delay samples right after fast recovery */
+	if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
+		return;
+
+	delay = sample->rtt_us ?: 1;	/* a 0 usec sample is bumped to 1 */
+
+	/* first sample (delay_min still 0), or the link delay decreased */
+	if (ca->delay_min == 0 || delay < ca->delay_min)
+		ca->delay_min = delay;
+
+	/* HyStart runs only while no exit point has been found, in slow
+	 * start, and once cwnd has reached hystart_low_window.
+	 */
+	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
+	    tcp_snd_cwnd(tp) >= hystart_low_window)
+		hystart_update(sk, delay);
+}
+
+static struct tcp_congestion_ops cubictcp __read_mostly = {
+	.init		= cubictcp_init,
+	.ssthresh	= cubictcp_recalc_ssthresh,
+	.cong_avoid	= cubictcp_cong_avoid,
+	.set_state	= cubictcp_state,
+	.undo_cwnd	= tcp_reno_undo_cwnd,	/* shared with Reno: max(cwnd, prior_cwnd) */
+	.cwnd_event	= cubictcp_cwnd_event,
+	.pkts_acked     = cubictcp_acked,	/* feeds delay_min and HyStart */
+	.owner		= THIS_MODULE,
+	.name		= "cubic",
+};
+
+BTF_SET8_START(tcp_cubic_check_kfunc_ids)	/* empty unless both configs below are set */
+#ifdef CONFIG_X86
+#ifdef CONFIG_DYNAMIC_FTRACE
+BTF_ID_FLAGS(func, cubictcp_init)
+BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh)
+BTF_ID_FLAGS(func, cubictcp_cong_avoid)
+BTF_ID_FLAGS(func, cubictcp_state)
+BTF_ID_FLAGS(func, cubictcp_cwnd_event)
+BTF_ID_FLAGS(func, cubictcp_acked)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_X86 */
+BTF_SET8_END(tcp_cubic_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &tcp_cubic_check_kfunc_ids,	/* registered in cubictcp_register() */
+};
+
+static int __init cubictcp_register(void)
+{
+	int ret;
+
+	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
+
+	/* Precompute a bunch of the scaling factors that are used per-packet
+	 * based on SRTT of 100ms
+	 */
+
+	beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
+		/ (BICTCP_BETA_SCALE - beta);	/* 15 with the default beta = 717 */
+
+	cube_rtt_scale = (bic_scale * 10);	/* 1024*c/rtt */
+
+	/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
+	 *  so K = cubic_root( (wmax-cwnd)*rtt/c )
+	 * the unit of K is bictcp_HZ=2^10, not HZ
+	 *
+	 *  c = bic_scale >> 10
+	 *  rtt = 100ms
+	 *
+	 * the following code has been designed and tested for
+	 * cwnd < 1 million packets
+	 * RTT < 100 seconds
+	 * HZ < 1,000,00  (corresponding to 10 nano-second)
+	 */
+
+	/* 1/c * 2^2*bictcp_HZ * srtt */
+	cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */
+
+	/* divide by bic_scale and by constant Srtt (100ms) */
+	do_div(cube_factor, bic_scale * 10);	/* 2^40 / 410 with the default bic_scale = 41 */
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set);
+	if (ret < 0)
+		return ret;	/* the congestion control is not registered in this case */
+	return tcp_register_congestion_control(&cubictcp);
+}
+
+static void __exit cubictcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&cubictcp);	/* only the ops table is unregistered here */
+}
+
+module_init(cubictcp_register);
+module_exit(cubictcp_unregister);
+
+MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CUBIC TCP");
+MODULE_VERSION("2.3");
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
new file mode 100644
index 0000000000..bb23bb5b38
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* DataCenter TCP (DCTCP) congestion control.
+ *
+ * http://simula.stanford.edu/~alizade/Site/DCTCP.html
+ *
+ * This is an implementation of DCTCP over Reno, an enhancement to the
+ * TCP congestion control algorithm designed for data centers. DCTCP
+ * leverages Explicit Congestion Notification (ECN) in the network to
+ * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
+ * the following three data center transport requirements:
+ *
+ *  - High burst tolerance (incast due to partition/aggregate)
+ *  - Low latency (short flows, queries)
+ *  - High throughput (continuous data updates, large file transfers)
+ *    with commodity shallow buffered switches
+ *
+ * The algorithm is described in detail in the following two papers:
+ *
+ * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
+ *    Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
+ *      "Data Center TCP (DCTCP)", Data Center Networks session
+ *      Proc. ACM SIGCOMM, New Delhi, 2010.
+ *   http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+ *
+ * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
+ *      "Analysis of DCTCP: Stability, Convergence, and Fairness"
+ *      Proc. ACM SIGMETRICS, San Jose, 2011.
+ *   http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
+ *
+ * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
+ *
+ * Authors:
+ *
+ *	Daniel Borkmann <dborkman@redhat.com>
+ *	Florian Westphal <fw@strlen.de>
+ *	Glenn Judd <glenn.judd@morganstanley.com>
+ */
+
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include "tcp_dctcp.h"
+
+#define DCTCP_MAX_ALPHA	1024U
+
+struct dctcp {
+	u32 old_delivered;	/* tp->delivered at the last dctcp_reset() */
+	u32 old_delivered_ce;	/* tp->delivered_ce at the last dctcp_reset() */
+	u32 prior_rcv_nxt;	/* rcv_nxt saved by dctcp_ece_ack_update() */
+	u32 dctcp_alpha;	/* EWMA of the CE fraction, at most DCTCP_MAX_ALPHA */
+	u32 next_seq;		/* snd_nxt at the last dctcp_reset() */
+	u32 ce_state;		/* 1 if the last packet was CE, else 0 */
+	u32 loss_cwnd;		/* cwnd saved at the last reduction, used by undo */
+	struct tcp_plb_state plb;	/* state handed to the tcp_plb_*() helpers */
+};
+
+static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
+module_param(dctcp_shift_g, uint, 0644);
+MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
+
+static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
+module_param(dctcp_alpha_on_init, uint, 0644);
+MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
+
+static struct tcp_congestion_ops dctcp_reno;
+
+static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
+{
+	/* Snapshot the delivery counters and the current snd_nxt. */
+	ca->old_delivered = tp->delivered;
+	ca->old_delivered_ce = tp->delivered_ce;
+	ca->next_seq = tp->snd_nxt;
+}
+
+__bpf_kfunc static void dctcp_init(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct dctcp *ca = inet_csk_ca(sk);
+
+	if (!(tp->ecn_flags & TCP_ECN_OK) &&
+	    sk->sk_state != TCP_LISTEN &&
+	    sk->sk_state != TCP_CLOSE) {
+		/* No ECN support? Fall back to Reno. Also need to clear
+		 * ECT from sk since it is set during 3WHS for DCTCP.
+		 */
+		inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
+		INET_ECN_dontxmit(sk);
+		return;
+	}
+
+	/* TCP_ECN_OK is set, or the socket is still in LISTEN/CLOSE:
+	 * set up the DCTCP private state.
+	 */
+	ca->prior_rcv_nxt = tp->rcv_nxt;
+	ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
+
+	ca->loss_cwnd = 0;
+	ca->ce_state = 0;
+
+	dctcp_reset(tp, ca);
+	tcp_plb_init(sk, &ca->plb);
+}
+
+__bpf_kfunc static u32 dctcp_ssthresh(struct sock *sk)
+{
+	struct dctcp *ca = inet_csk_ca(sk);
+	u32 cwnd = tcp_snd_cwnd(tcp_sk(sk));
+
+	ca->loss_cwnd = cwnd;	/* remembered for dctcp_cwnd_undo() */
+	return max(cwnd - ((cwnd * ca->dctcp_alpha) >> 11U), 2U);
+}
+
+__bpf_kfunc static void dctcp_update_alpha(struct sock *sk, u32 flags)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct dctcp *ca = inet_csk_ca(sk);
+
+	/* Expired RTT */
+	if (!before(tp->snd_una, ca->next_seq)) {
+		u32 delivered = tp->delivered - ca->old_delivered;
+		u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
+		/* Runtime-writable parameter: read once, keep shifts below in range. */
+		u32 shift_g = min(READ_ONCE(dctcp_shift_g), 10U);
+		u32 alpha = ca->dctcp_alpha;
+		u32 ce_ratio = 0;
+
+		if (delivered > 0) {
+			/* dctcp_alpha keeps EWMA of fraction of ECN marked
+			 * packets. Because of EWMA smoothing, PLB reaction can
+			 * be slow so we use ce_ratio which is an instantaneous
+			 * measure of congestion. ce_ratio is the fraction of
+			 * ECN marked packets in the previous RTT.
+			 */
+			if (delivered_ce > 0)
+				ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered;
+			tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio);
+			tcp_plb_check_rehash(sk, &ca->plb);
+		}
+
+		/* alpha = (1 - g) * alpha + g * F, with g = 1 / 2^shift_g */
+		alpha -= min_not_zero(alpha, alpha >> shift_g);
+		if (delivered_ce) {
+			/* If shift_g == 1, a 32bit value would overflow
+			 * after 8 M packets.
+			 */
+			delivered_ce <<= (10 - shift_g);
+			delivered_ce /= max(1U, delivered);
+
+			alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA);
+		}
+		/* dctcp_alpha can be read from dctcp_get_info() without
+		 * synchro, so we ask compiler to not use dctcp_alpha
+		 * as a temporary variable in prior operations.
+		 */
+		WRITE_ONCE(ca->dctcp_alpha, alpha);
+		dctcp_reset(tp, ca);
+	}
+}
+
+static void dctcp_react_to_loss(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dctcp *ca = inet_csk_ca(sk);
+
+	tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U);	/* half of cwnd, at least 2 */
+	ca->loss_cwnd = tcp_snd_cwnd(tp);	/* kept for dctcp_cwnd_undo() */
+}
+
+__bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state)
+{
+	/* React only on a transition into Recovery. RTO is handled in
+	 * dctcp_cwnd_event to ensure that we perform only one
+	 * loss-adjustment per RTT.
+	 */
+	if (new_state == TCP_CA_Recovery && inet_csk(sk)->icsk_ca_state != new_state)
+		dctcp_react_to_loss(sk);
+}
+
+__bpf_kfunc static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+	struct dctcp *ca = inet_csk_ca(sk);
+
+	switch (ev) {
+	case CA_EVENT_TX_START:
+		/* Maybe rehash when inflight is 0 */
+		tcp_plb_check_rehash(sk, &ca->plb);
+		break;
+	case CA_EVENT_LOSS:
+		tcp_plb_update_state_upon_rto(sk, &ca->plb);
+		dctcp_react_to_loss(sk);
+		break;
+	case CA_EVENT_ECN_IS_CE:
+	case CA_EVENT_ECN_NO_CE:
+		dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
+		break;
+	default:
+		break;	/* Don't care for the rest. */
+	}
+}
+
+static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
+			     union tcp_cc_info *info)
+{
+	const struct dctcp *ca = inet_csk_ca(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Fill it also in case of VEGASINFO due to req struct limits.
+	 * We can still correctly retrieve it later.
+	 */
+	if (!(ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
+	      ext & (1 << (INET_DIAG_VEGASINFO - 1))))
+		return 0;
+
+	memset(&info->dctcp, 0, sizeof(info->dctcp));
+	if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
+		info->dctcp.dctcp_enabled = 1;
+		info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
+		info->dctcp.dctcp_alpha = ca->dctcp_alpha;
+		info->dctcp.dctcp_ab_ecn = tp->mss_cache *
+					   (tp->delivered_ce - ca->old_delivered_ce);
+		info->dctcp.dctcp_ab_tot = tp->mss_cache *
+					   (tp->delivered - ca->old_delivered);
+	}
+
+	*attr = INET_DIAG_DCTCPINFO;
+	return sizeof(info->dctcp);
+}
+
+__bpf_kfunc static u32 dctcp_cwnd_undo(struct sock *sk)
+{
+	const struct dctcp *ca = inet_csk_ca(sk);
+	u32 cwnd = tcp_snd_cwnd(tcp_sk(sk));
+
+	return max(cwnd, ca->loss_cwnd);	/* larger of current and saved cwnd */
+}
+
+static struct tcp_congestion_ops dctcp __read_mostly = {
+	.init		= dctcp_init,		/* may switch the socket to dctcp_reno */
+	.in_ack_event   = dctcp_update_alpha,
+	.cwnd_event	= dctcp_cwnd_event,
+	.ssthresh	= dctcp_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,	/* window growth is plain Reno */
+	.undo_cwnd	= dctcp_cwnd_undo,
+	.set_state	= dctcp_state,
+	.get_info	= dctcp_get_info,
+	.flags		= TCP_CONG_NEEDS_ECN,
+	.owner		= THIS_MODULE,
+	.name		= "dctcp",
+};
+
+static struct tcp_congestion_ops dctcp_reno __read_mostly = {	/* installed by dctcp_init() as the no-ECN fallback */
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+	.get_info	= dctcp_get_info,	/* reports a zeroed dctcp info block for this ops table */
+	.owner		= THIS_MODULE,
+	.name		= "dctcp-reno",
+};
+
+BTF_SET8_START(tcp_dctcp_check_kfunc_ids)	/* empty unless both configs below are set */
+#ifdef CONFIG_X86
+#ifdef CONFIG_DYNAMIC_FTRACE
+BTF_ID_FLAGS(func, dctcp_init)
+BTF_ID_FLAGS(func, dctcp_update_alpha)
+BTF_ID_FLAGS(func, dctcp_cwnd_event)
+BTF_ID_FLAGS(func, dctcp_ssthresh)
+BTF_ID_FLAGS(func, dctcp_cwnd_undo)
+BTF_ID_FLAGS(func, dctcp_state)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_X86 */
+BTF_SET8_END(tcp_dctcp_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &tcp_dctcp_check_kfunc_ids,	/* registered in dctcp_register() */
+};
+
+static int __init dctcp_register(void)
+{
+	int err;
+
+	/* struct dctcp must fit within ICSK_CA_PRIV_SIZE bytes. */
+	BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
+	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set);
+	if (err < 0)
+		return err;
+	return tcp_register_congestion_control(&dctcp);
+}
+
+static void __exit dctcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&dctcp);	/* dctcp_reno is never registered, so only dctcp is removed */
+}
+
+module_init(dctcp_register);
+module_exit(dctcp_unregister);
+
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h
new file mode 100644
index 0000000000..d69a77cbd0
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.h
@@ -0,0 +1,40 @@
+#ifndef _TCP_DCTCP_H
+#define _TCP_DCTCP_H
+
+static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (ce_state != 1)
+		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;	/* last packet was not CE */
+	else
+		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;	/* last packet was CE */
+}
+
+/* Minimal DCTCP CE state machine:
+ *
+ * S:	0 <- last pkt was non-CE
+ *	1 <- last pkt was CE
+ */
+static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
+					u32 *prior_rcv_nxt, u32 *ce_state)
+{
+	u32 ce_now = evt == CA_EVENT_ECN_IS_CE;
+
+	if (ce_now != *ce_state) {
+		/* CE state has changed, force an immediate ACK to
+		 * reflect the new CE state. If an ACK was delayed,
+		 * send that first to reflect the prior CE state.
+		 */
+		if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+			dctcp_ece_ack_cwr(sk, *ce_state);
+			__tcp_send_ack(sk, *prior_rcv_nxt);
+		}
+		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+	}
+	*prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
+	*ce_state = ce_now;
+	dctcp_ece_ack_cwr(sk, ce_now);
+}
+
+#endif
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
new file mode 100644
index 0000000000..01b50fa791
--- /dev/null
+++ b/net/ipv4/tcp_diag.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * tcp_diag.c	Module for monitoring TCP transport protocols sockets.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/sock_diag.h>
+#include <linux/inet_diag.h>
+
+#include <linux/tcp.h>
+
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			      void *_info)
+{
+	struct tcp_info *info = _info;	/* may be NULL: then only the queue fields are filled */
+
+	if (inet_sk_state_load(sk) == TCP_LISTEN) {
+		r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog);	/* current accept backlog */
+		r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog);	/* backlog limit */
+	} else if (sk->sk_type == SOCK_STREAM) {
+		const struct tcp_sock *tp = tcp_sk(sk);
+
+		r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+					     READ_ONCE(tp->copied_seq), 0);	/* negative differences clamp to 0 */
+		r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una;
+	}
+	if (info)
+		tcp_get_info(sk, info);
+}
+
+#ifdef CONFIG_TCP_MD5SIG
+static void tcp_diag_md5sig_fill(struct tcp_diag_md5sig *info,
+				 const struct tcp_md5sig_key *key)
+{
+	info->tcpm_family = key->family;
+	info->tcpm_prefixlen = key->prefixlen;
+	info->tcpm_keylen = key->keylen;
+	memcpy(info->tcpm_key, key->key, key->keylen);	/* only keylen bytes are copied */
+
+	if (key->family == AF_INET)
+		info->tcpm_addr[0] = key->addr.a4.s_addr;	/* IPv4 uses the first element only */
+	#if IS_ENABLED(CONFIG_IPV6)
+	else if (key->family == AF_INET6)
+		memcpy(&info->tcpm_addr, &key->addr.a6,
+		       sizeof(info->tcpm_addr));
+	#endif
+}
+
+static int tcp_diag_put_md5sig(struct sk_buff *skb,
+			       const struct tcp_md5sig_info *md5sig)
+{
+	const struct tcp_md5sig_key *key;
+	struct tcp_diag_md5sig *info;
+	struct nlattr *attr;
+	int md5sig_count = 0;
+
+	hlist_for_each_entry_rcu(key, &md5sig->head, node)
+		md5sig_count++;		/* first pass: count keys to size the attribute */
+	if (md5sig_count == 0)
+		return 0;		/* no keys: no INET_DIAG_MD5SIG attribute at all */
+
+	attr = nla_reserve(skb, INET_DIAG_MD5SIG,
+			   md5sig_count * sizeof(struct tcp_diag_md5sig));
+	if (!attr)
+		return -EMSGSIZE;
+
+	info = nla_data(attr);
+	memset(info, 0, md5sig_count * sizeof(struct tcp_diag_md5sig));	/* slots not filled below stay zero */
+	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+		tcp_diag_md5sig_fill(info++, key);
+		if (--md5sig_count == 0)
+			break;	/* never write more entries than were reserved */
+	}
+
+	return 0;
+}
+#endif
+
+static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk,
+			    const struct tcp_ulp_ops *ulp_ops)
+{
+	struct nlattr *nest;
+	int err;
+
+	nest = nla_nest_start_noflag(skb, INET_DIAG_ULP_INFO);
+	if (!nest)
+		return -EMSGSIZE;
+
+	/* The nest holds the ULP name, then whatever the ULP's optional
+	 * get_info() hook adds.
+	 */
+	err = nla_put_string(skb, INET_ULP_INFO_NAME, ulp_ops->name);
+	if (!err && ulp_ops->get_info)
+		err = ulp_ops->get_info(sk, skb);
+	if (err)
+		goto cancel_nest;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel_nest:
+	nla_nest_cancel(skb, nest);
+	return err;
+}
+
+static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
+			    struct sk_buff *skb)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int err = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (net_admin) {
+		struct tcp_md5sig_info *md5sig;
+
+		/* md5sig_info is dereferenced and used inside the RCU read section. */
+		rcu_read_lock();
+		md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
+		if (md5sig)
+			err = tcp_diag_put_md5sig(skb, md5sig);
+		rcu_read_unlock();
+		if (err < 0)
+			return err;
+	}
+#endif
+
+	if (net_admin) {
+		const struct tcp_ulp_ops *ulp_ops = icsk->icsk_ulp_ops;
+
+		/* ULP attributes are emitted only when icsk_ulp_ops is set. */
+		if (ulp_ops)
+			err = tcp_diag_put_ulp(skb, sk, ulp_ops);
+	}
+	/* err is still 0 here unless tcp_diag_put_ulp() failed. */
+	return err;
+}
+
+static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	size_t size = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (net_admin && sk_fullsock(sk)) {
+		const struct tcp_md5sig_info *md5sig;
+		const struct tcp_md5sig_key *key;
+		size_t md5sig_count = 0;
+
+		rcu_read_lock();
+		md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
+		if (md5sig) {
+			hlist_for_each_entry_rcu(key, &md5sig->head, node)
+				md5sig_count++;
+		}
+		rcu_read_unlock();
+		size += nla_total_size(md5sig_count *
+				       sizeof(struct tcp_diag_md5sig));	/* added even when the count is 0 */
+	}
+#endif
+
+	if (net_admin && sk_fullsock(sk)) {
+		const struct tcp_ulp_ops *ulp_ops;
+
+		ulp_ops = icsk->icsk_ulp_ops;
+		if (ulp_ops) {
+			size += nla_total_size(0) +	/* the INET_DIAG_ULP_INFO nest header */
+				nla_total_size(TCP_ULP_NAME_MAX);	/* worst-case name attribute */
+			if (ulp_ops->get_info_size)
+				size += ulp_ops->get_info_size(sk);
+		}
+	}
+	return size;
+}
+
+static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			  const struct inet_diag_req_v2 *r)
+{
+	struct net *net = sock_net(cb->skb->sk);
+	struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
+
+	/* Dump from the TCP hash table of the netns of cb->skb->sk. */
+	inet_diag_dump_icsk(hinfo, skb, cb, r);
+}
+
+static int tcp_diag_dump_one(struct netlink_callback *cb,
+			     const struct inet_diag_req_v2 *req)
+{
+	struct net *net = sock_net(cb->skb->sk);
+	struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
+
+	/* Same hash table as tcp_diag_dump(), single-socket variant. */
+	return inet_diag_dump_one_icsk(hinfo, cb, req);
+}
+
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int tcp_diag_destroy(struct sk_buff *in_skb,
+			    const struct inet_diag_req_v2 *req)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
+	struct sock *sk;
+	int err;
+
+	sk = inet_diag_find_one_icsk(net, hinfo, req);
+	if (IS_ERR(sk))
+		return PTR_ERR(sk);
+
+	/* Abort the socket with ECONNABORTED; sock_gen_put() is called
+	 * on it afterwards whatever sock_diag_destroy() returned.
+	 */
+	err = sock_diag_destroy(sk, ECONNABORTED);
+	sock_gen_put(sk);
+
+	return err;
+}
+#endif
+
+static const struct inet_diag_handler tcp_diag_handler = {
+	.dump			= tcp_diag_dump,
+	.dump_one		= tcp_diag_dump_one,
+	.idiag_get_info		= tcp_diag_get_info,
+	.idiag_get_aux		= tcp_diag_get_aux,
+	.idiag_get_aux_size	= tcp_diag_get_aux_size,	/* size estimate for what idiag_get_aux emits */
+	.idiag_type		= IPPROTO_TCP,
+	.idiag_info_size	= sizeof(struct tcp_info),	/* tcp_diag_get_info() fills a struct tcp_info */
+#ifdef CONFIG_INET_DIAG_DESTROY
+	.destroy		= tcp_diag_destroy,
+#endif /* CONFIG_INET_DIAG_DESTROY */
+};
+
+static int __init tcp_diag_init(void)
+{
+	return inet_diag_register(&tcp_diag_handler);	/* handler for idiag_type IPPROTO_TCP */
+}
+
+static void __exit tcp_diag_exit(void)
+{
+	inet_diag_unregister(&tcp_diag_handler);	/* undoes tcp_diag_init() */
+}
+
+module_init(tcp_diag_init);
+module_exit(tcp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
new file mode 100644
index 0000000000..8ed54e7334
--- /dev/null
+++ b/net/ipv4/tcp_fastopen.c
@@ -0,0 +1,595 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/tcp.h>
+#include <linux/rcupdate.h>
+#include <net/tcp.h>
+
+/* Install a random netns-wide fastopen key, unless @net already has a
+ * context.  Only the primary key is set (no backup key).
+ */
+void tcp_fastopen_init_key_once(struct net *net)
+{
+	u8 key[TCP_FASTOPEN_KEY_LENGTH];
+	bool have_ctx;
+
+	rcu_read_lock();
+	have_ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx) != NULL;
+	rcu_read_unlock();
+	if (have_ctx)
+		return;
+
+	/* Racing with another installer is tolerated: the new context is
+	 * published atomically by tcp_fastopen_reset_cipher(), and all
+	 * call sites of tcp_fastopen_cookie_gen also check for a valid
+	 * cookie.
+	 */
+	get_random_bytes(key, sizeof(key));
+	tcp_fastopen_reset_cipher(net, NULL, key, NULL);
+}
+
+/* call_rcu() callback: free a retired fastopen context via kfree_sensitive().
+ * @head is the ->rcu member embedded in that context.
+ */
+static void tcp_fastopen_ctx_free(struct rcu_head *head)
+{
+	kfree_sensitive(container_of(head, struct tcp_fastopen_context, rcu));
+}
+
+/* Hand the socket's own fastopen context, if set, to call_rcu() for freeing. */
+void tcp_fastopen_destroy_cipher(struct sock *sk)
+{
+	struct fastopen_queue *q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
+	struct tcp_fastopen_context *ctx = rcu_dereference_protected(q->ctx, 1);
+
+	if (ctx)
+		call_rcu(&ctx->rcu, tcp_fastopen_ctx_free);
+}
+
+/* Atomically detach the netns-wide fastopen context and free it via RCU. */
+void tcp_fastopen_ctx_destroy(struct net *net)
+{
+	struct tcp_fastopen_context *old;
+
+	old = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, NULL);
+	if (old)
+		call_rcu(&old->rcu, tcp_fastopen_ctx_free);
+}
+
+/* Build a fastopen context from @primary_key (and @backup_key, if given)
+ * and swap it in on @sk's accept queue, or on @net when @sk is NULL.
+ * The context that was replaced, if any, is freed through call_rcu().
+ * Returns 0, or -ENOMEM if the new context cannot be allocated.
+ */
+int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
+			      void *primary_key, void *backup_key)
+{
+	struct tcp_fastopen_context __rcu **slot;
+	struct tcp_fastopen_context *ctx, *octx;
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	/* Each key is read as two little-endian 64-bit words (16 bytes). */
+	ctx->num = backup_key ? 2 : 1;
+	ctx->key[0].key[0] = get_unaligned_le64(primary_key);
+	ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8);
+	if (backup_key) {
+		ctx->key[1].key[0] = get_unaligned_le64(backup_key);
+		ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8);
+	}
+
+	if (sk)
+		slot = &inet_csk(sk)->icsk_accept_queue.fastopenq.ctx;
+	else
+		slot = &net->ipv4.tcp_fastopen_ctx;
+
+	octx = xchg((__force struct tcp_fastopen_context **)slot, ctx);
+	if (octx)
+		call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+
+	return 0;
+}
+
+/* Copy the keys of @icsk's context (or of @net's when @icsk is NULL) into
+ * @key as little-endian 64-bit words, two per key.  Returns the key count.
+ */
+int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk,
+			    u64 *key)
+{
+	struct tcp_fastopen_context *ctx;
+	int n_keys = 0, i;
+
+	rcu_read_lock();
+	ctx = icsk ? rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx) :
+		     rcu_dereference(net->ipv4.tcp_fastopen_ctx);
+	if (ctx)
+		n_keys = tcp_fastopen_context_len(ctx);
+	for (i = 0; i < n_keys; i++) {
+		put_unaligned_le64(ctx->key[i].key[0], &key[2 * i]);
+		put_unaligned_le64(ctx->key[i].key[1], &key[2 * i + 1]);
+	}
+	rcu_read_unlock();
+
+	return n_keys;
+}
+
+static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
+					     struct sk_buff *syn,
+					     const siphash_key_t *key,
+					     struct tcp_fastopen_cookie *foc)
+{
+	BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64)); /* only val[0] is filled below */
+
+	if (req->rsk_ops->family == AF_INET) {
+		const struct iphdr *iph = ip_hdr(syn);
+
+		foc->val[0] = cpu_to_le64(siphash(&iph->saddr, /* hash starts at saddr... */
+					  sizeof(iph->saddr) +
+					  sizeof(iph->daddr), /* ...sized for saddr + daddr; assumes daddr follows saddr */
+					  key));
+		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+		return true;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	if (req->rsk_ops->family == AF_INET6) {
+		const struct ipv6hdr *ip6h = ipv6_hdr(syn);
+
+		foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr, /* same scheme as the IPv4 branch */
+					  sizeof(ip6h->saddr) +
+					  sizeof(ip6h->daddr),
+					  key));
+		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+		return true;
+	}
+#endif
+	return false; /* address family not handled: @foc is left untouched */
+}
+
+/* Generate the fastopen cookie by applying SipHash, keyed with key[0] of the
+ * context, to the SYN's source and destination addresses (no-op without ctx).
+ */
+static void tcp_fastopen_cookie_gen(struct sock *sk,
+				    struct request_sock *req,
+				    struct sk_buff *syn,
+				    struct tcp_fastopen_cookie *foc)
+{
+	struct tcp_fastopen_context *ctx;
+
+	rcu_read_lock();
+	ctx = tcp_fastopen_get_ctx(sk);
+	if (ctx)
+		__tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc);
+	rcu_read_unlock();
+}
+
+/* If an incoming SYN or SYNACK frame contains a payload and/or FIN, queue
+ * a clone of it on @sk's receive queue (the original skb is not queued).
+ */
+void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt) /* nothing past rcv_nxt to queue */
+		return;
+
+	skb = skb_clone(skb, GFP_ATOMIC); /* from here on, skb is the clone */
+	if (!skb)
+		return;
+
+	skb_dst_drop(skb);
+	/* segs_in has been initialized to 1 in tcp_create_openreq_child().
+	 * Hence, reset segs_in to 0 before calling tcp_segs_in()
+	 * to avoid double counting.  Also, tcp_segs_in() expects
+	 * skb->len to include the tcp_hdrlen.  Hence, it should
+	 * be called before __skb_pull().
+	 */
+	tp->segs_in = 0;
+	tcp_segs_in(tp, skb);
+	__skb_pull(skb, tcp_hdrlen(skb));
+	sk_forced_mem_schedule(sk, skb->truesize);
+	skb_set_owner_r(skb, sk);
+
+	TCP_SKB_CB(skb)->seq++; /* step past the sequence number of the SYN flag cleared below */
+	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
+
+	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+	tp->syn_data_acked = 1;
+
+	/* u64_stats_update_begin(&tp->syncp) not needed here,
+	 * as we certainly are not changing upper 32bit value (0)
+	 */
+	tp->bytes_received = skb->len;
+
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+		tcp_fin(sk);
+}
+
+/* returns 0 - no key match (or no context), 1 for primary, 2 for backup */
+static int tcp_fastopen_cookie_gen_check(struct sock *sk,
+					 struct request_sock *req,
+					 struct sk_buff *syn,
+					 struct tcp_fastopen_cookie *orig,
+					 struct tcp_fastopen_cookie *valid_foc)
+{
+	struct tcp_fastopen_cookie search_foc = { .len = -1 }; /* scratch for keys after key[0] */
+	struct tcp_fastopen_cookie *foc = valid_foc;
+	struct tcp_fastopen_context *ctx;
+	int i, ret = 0;
+
+	rcu_read_lock();
+	ctx = tcp_fastopen_get_ctx(sk);
+	if (!ctx)
+		goto out;
+	for (i = 0; i < tcp_fastopen_context_len(ctx); i++) {
+		__tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc);
+		if (tcp_fastopen_cookie_match(foc, orig)) {
+			ret = i + 1; /* 1-based index of the matching key */
+			goto out;
+		}
+		foc = &search_foc; /* @valid_foc only ever receives the key[0] cookie */
+	}
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+static struct sock *tcp_fastopen_create_child(struct sock *sk,
+					      struct sk_buff *skb,
+					      struct request_sock *req)
+{
+	struct tcp_sock *tp;
+	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+	struct sock *child;
+	bool own_req; /* filled in by syn_recv_sock(); not looked at here */
+
+	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+							 NULL, &own_req);
+	if (!child)
+		return NULL; /* caller falls back to a regular handshake */
+
+	spin_lock(&queue->fastopenq.lock);
+	queue->fastopenq.qlen++; /* one more pending TFO request on this listener */
+	spin_unlock(&queue->fastopenq.lock);
+
+	/* Initialize the child socket. Have to fix some values to take
+	 * into account the child is a Fast Open socket and is created
+	 * only out of the bits carried in the SYN packet.
+	 */
+	tp = tcp_sk(child);
+
+	rcu_assign_pointer(tp->fastopen_rsk, req);
+	tcp_rsk(req)->tfo_listener = true;
+
+	/* RFC1323: The window in SYN & SYN/ACK segments is never
+	 * scaled. So correct it appropriately.
+	 */
+	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+	tp->max_window = tp->snd_wnd;
+
+	/* Activate the retrans timer so that SYNACK can be retransmitted.
+	 * The request socket is not added to the ehash
+	 * because it's been added to the accept queue directly.
+	 */
+	req->timeout = tcp_timeout_init(child);
+	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+				  req->timeout, TCP_RTO_MAX);
+
+	refcount_set(&req->rsk_refcnt, 2); /* NOTE(review): presumably child + listener queue refs - confirm */
+
+	/* Now finish processing the fastopen child socket. */
+	tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb);
+
+	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; /* just past the SYN; add_skb may advance it */
+
+	tcp_fastopen_add_skb(child, skb);
+
+	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
+	tp->rcv_wup = tp->rcv_nxt;
+	/* tcp_conn_request() is sending the SYNACK,
+	 * and queues the child into listener accept queue.
+	 */
+	return child;
+}
+
+static bool tcp_fastopen_queue_check(struct sock *sk)
+{
+	struct fastopen_queue *fastopenq;
+	int max_qlen;
+
+	/* Make sure the listener has enabled fastopen, and we don't
+	 * exceed the max # of pending TFO requests allowed before trying
+	 * to validate the cookie in order to avoid burning CPU cycles
+	 * unnecessarily.
+	 *
+	 * XXX (TFO) - The implication of checking the max_qlen before
+	 * processing a cookie request is that clients can't differentiate
+	 * between qlen overflow causing Fast Open to be disabled
+	 * temporarily vs a server not supporting Fast Open at all.
+	 */
+	fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
+	max_qlen = READ_ONCE(fastopenq->max_qlen);
+	if (max_qlen == 0) /* fastopen not enabled on this listener */
+		return false;
+
+	if (fastopenq->qlen >= max_qlen) { /* full: try to recycle the head of the rst list */
+		struct request_sock *req1;
+		spin_lock(&fastopenq->lock);
+		req1 = fastopenq->rskq_rst_head;
+		if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) { /* none, or not yet expired */
+			__NET_INC_STATS(sock_net(sk),
+					LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+			spin_unlock(&fastopenq->lock);
+			return false;
+		}
+		fastopenq->rskq_rst_head = req1->dl_next; /* unlink the expired entry */
+		fastopenq->qlen--;
+		spin_unlock(&fastopenq->lock);
+		reqsk_put(req1);
+	}
+	return true;
+}
+
+static bool tcp_fastopen_no_cookie(const struct sock *sk,
+				   const struct dst_entry *dst,
+				   int flag) /* @flag: bit(s) to test in the sysctl value */
+{
+	return (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & flag) || /* netns sysctl */
+	       tcp_sk(sk)->fastopen_no_cookie || /* per-socket setting */
+	       (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); /* route metric; @dst may be NULL */
+}
+
+/* Returns the child socket if Fast Open is performed on the SYN, else NULL.
+ * The cookie (foc) may be updated and returned to the client in the SYN-ACK
+ * later, e.g. on a Fast Open cookie request (foc->len == 0).
+ */
+struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
+			      struct request_sock *req,
+			      struct tcp_fastopen_cookie *foc,
+			      const struct dst_entry *dst)
+{
+	bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+	int tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen);
+	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
+	struct sock *child;
+	int ret = 0; /* stays 0 when the cookie check is bypassed via "goto fastopen" */
+
+	if (foc->len == 0) /* Client requests a cookie */
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+
+	if (!((tcp_fastopen & TFO_SERVER_ENABLE) &&
+	      (syn_data || foc->len >= 0) &&
+	      tcp_fastopen_queue_check(sk))) {
+		foc->len = -1; /* no Fast Open: no cookie goes back in the SYN-ACK */
+		return NULL;
+	}
+
+	if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
+		goto fastopen;
+
+	if (foc->len == 0) {
+		/* Client requests a cookie. */
+		tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc);
+	} else if (foc->len > 0) {
+		ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc,
+						    &valid_foc);
+		if (!ret) {
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+		} else {
+			/* Cookie is valid. Create a (full) child socket to
+			 * accept the data in SYN before returning a SYN-ACK to
+			 * ack the data. If we fail to create the socket, fall
+			 * back and ack the ISN only but include the same
+			 * cookie.
+			 *
+			 * Note: Data-less SYN with valid cookie is allowed to
+			 * send data in SYN_RECV state.
+			 */
+fastopen:
+			child = tcp_fastopen_create_child(sk, skb, req);
+			if (child) {
+				if (ret == 2) { /* matched the backup key: return the primary-key cookie */
+					valid_foc.exp = foc->exp;
+					*foc = valid_foc;
+					NET_INC_STATS(sock_net(sk),
+						      LINUX_MIB_TCPFASTOPENPASSIVEALTKEY);
+				} else {
+					foc->len = -1;
+				}
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPFASTOPENPASSIVE);
+				return child;
+			}
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+		}
+	}
+	valid_foc.exp = foc->exp;
+	*foc = valid_foc; /* len stays -1 unless a cookie was generated above */
+	return NULL;
+}
+
+/* Fetch cookie and MSS through tcp_fastopen_cache_get() and tell whether
+ * active Fast Open may go ahead: with a cookie, or when none is needed.
+ */
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+			       struct tcp_fastopen_cookie *cookie)
+{
+	tcp_fastopen_cache_get(sk, mss, cookie);
+
+	/* Firewall blackhole issue check */
+	if (tcp_fastopen_active_should_disable(sk)) {
+		cookie->len = -1;
+		return false;
+	}
+
+	if (tcp_fastopen_no_cookie(sk, __sk_dst_get(sk), TFO_CLIENT_NO_COOKIE)) {
+		cookie->len = -1;
+		return true;
+	}
+	if (cookie->len <= 0) {
+		tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE;
+		return false;
+	}
+	return true;
+}
+
+/* Decide whether connect() should hold back the SYN until the first write().
+ * This only applies when the fastopen_connect sockopt is set and no
+ * fastopen_req exists yet; otherwise false is returned right away.
+ *
+ * Returns true to defer (tcp_fastopen_cookie_check() succeeded), false to
+ * send the SYN now.  When deferral is refused, a fastopen_req is allocated
+ * so the FO option is included in the SYN; *err is -ENOBUFS if that fails.
+ */
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
+{
+	struct tcp_fastopen_cookie cookie = { .len = 0 };
+	struct tcp_sock *tp = tcp_sk(sk);
+	u16 mss;
+
+	if (!tp->fastopen_connect || tp->fastopen_req)
+		return false;
+
+	if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
+		inet_set_bit(DEFER_CONNECT, sk);
+		return true;
+	}
+
+	tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req), sk->sk_allocation);
+	if (!tp->fastopen_req) {
+		*err = -ENOBUFS;
+		return false;
+	}
+
+	tp->fastopen_req->cookie = cookie;
+	return false;
+}
+EXPORT_SYMBOL(tcp_fastopen_defer_connect);
+
+/*
+ * The following code block is to deal with middle box issues with TFO:
+ * Middlebox firewall issues can potentially cause server's data being
+ * blackholed after a successful 3WHS using TFO.
+ * The proposed solution is to disable active TFO globally under the
+ * following circumstances:
+ *   1. client side TFO socket receives out of order FIN
+ *   2. client side TFO socket receives out of order RST
+ *   3. client side TFO socket has timed out three times consecutively during
+ *      or after handshake
+ * We disable active side TFO globally for 1hr at first. Then if it
+ * happens again, we disable it for 2h, then 4h, 8h, ...
+ * And we reset the timeout back to 1hr when we see a successful active
+ * TFO connection with data exchanges.
+ */
+
+/* Start (or extend) the active TFO disable period of @sk's netns: stamp the
+ * current jiffies and bump tfo_active_disable_times.  No-op if sysctl is 0.
+ */
+void tcp_fastopen_active_disable(struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_fastopen_blackhole_timeout))
+		return;
+
+	/* Paired with READ_ONCE() in tcp_fastopen_active_should_disable() */
+	WRITE_ONCE(net->ipv4.tfo_active_disable_stamp, jiffies);
+
+	/* The stamp has to be updated before the counter is bumped; the
+	 * smp_rmb() in tcp_fastopen_active_should_disable() pairs with this.
+	 */
+	smp_mb__before_atomic();
+	atomic_inc(&net->ipv4.tfo_active_disable_times);
+
+	NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE);
+}
+
+/* Calculate timeout for tfo active disable
+ * Return true if we are still in the active TFO disable period
+ * Return false if timeout already expired and we should use active TFO
+ */
+bool tcp_fastopen_active_should_disable(struct sock *sk)
+{
+	unsigned int tfo_bh_timeout =
+		READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout);
+	unsigned long timeout;
+	int tfo_da_times, multiplier;
+
+	if (!tfo_bh_timeout)
+		return false;
+
+	tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times);
+	if (!tfo_da_times)
+		return false;
+
+	/* Paired with smp_mb__before_atomic() in tcp_fastopen_active_disable() */
+	smp_rmb();
+
+	/* Limit timeout to max: 2^6 * initial timeout */
+	multiplier = 1 << min(tfo_da_times - 1, 6);
+
+	/* Paired with the WRITE_ONCE() in tcp_fastopen_active_disable().
+	 * The delta is computed in unsigned long (like jiffies), not in int. */
+	timeout = READ_ONCE(sock_net(sk)->ipv4.tfo_active_disable_stamp) +
+		  (unsigned long)multiplier * tfo_bh_timeout * HZ;
+	if (time_before(jiffies, timeout))
+		return true;
+
+	/* Mark check bit so we can check for successful active TFO
+	 * condition and reset tfo_active_disable_times
+	 */
+	tcp_sk(sk)->syn_fastopen_ch = 1;
+	return false;
+}
+
+/* For active TFO sockets only.  Before any data segment has arrived, disable
+ * active TFO if the ofo queue holds a single skb and it carries FIN.  Once
+ * data has arrived on a socket marked syn_fastopen_ch, clear
+ * tfo_active_disable_times unless its route's device is loopback.
+ */
+void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst;
+	struct sk_buff *skb;
+
+	if (!tp->syn_fastopen)
+		return;
+
+	if (!tp->data_segs_in) {
+		skb = skb_rb_first(&tp->out_of_order_queue);
+		if (skb && !skb_rb_next(skb) &&
+		    (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+			tcp_fastopen_active_disable(sk);
+		return;
+	}
+
+	if (!tp->syn_fastopen_ch ||
+	    !atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times))
+		return;
+
+	dst = sk_dst_get(sk);
+	if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
+		atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
+	dst_release(dst);
+}
+
+void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired)
+{
+	u32 timeouts = inet_csk(sk)->icsk_retransmits;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->syn_fastopen && !tp->syn_data && !tp->syn_data_acked)
+		return;
+
+	/* Broken middle-boxes may black-hole Fast Open connections.  Be very
+	 * conservative: pause it on the third timeout, or if @expired earlier.
+	 */
+	if (timeouts == 2 || (timeouts < 2 && expired)) {
+		tcp_fastopen_active_disable(sk);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+	}
+}
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 0000000000..c6de5ce79a
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Sally Floyd's High Speed TCP (RFC 3649) congestion control
+ *
+ * See https://www.icir.org/floyd/hstcp.html
+ *
+ * John Heffner <jheffner@psc.edu>
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* AIMD parameters taken from the tables in RFC 3649 appendix B:
+ * cwnd threshold and fixed-point MD factor (scaled << 8) for each row.
+ */
+static const struct hstcp_aimd_val {
+	unsigned int cwnd;
+	unsigned int md;
+} hstcp_aimd_vals[] = {
+	{     38,  128, /*  0.50 */ },
+	{    118,  112, /*  0.44 */ },
+	{    221,  104, /*  0.41 */ },
+	{    347,   98, /*  0.38 */ },
+	{    495,   93, /*  0.37 */ },
+	{    663,   89, /*  0.35 */ },
+	{    851,   86, /*  0.34 */ },
+	{   1058,   83, /*  0.33 */ },
+	{   1284,   81, /*  0.32 */ },
+	{   1529,   78, /*  0.31 */ },
+	{   1793,   76, /*  0.30 */ },
+	{   2076,   74, /*  0.29 */ },
+	{   2378,   72, /*  0.28 */ },
+	{   2699,   71, /*  0.28 */ },
+	{   3039,   69, /*  0.27 */ },
+	{   3399,   68, /*  0.27 */ },
+	{   3778,   66, /*  0.26 */ },
+	{   4177,   65, /*  0.26 */ },
+	{   4596,   64, /*  0.25 */ },
+	{   5036,   62, /*  0.25 */ },
+	{   5497,   61, /*  0.24 */ },
+	{   5979,   60, /*  0.24 */ },
+	{   6483,   59, /*  0.23 */ },
+	{   7009,   58, /*  0.23 */ },
+	{   7558,   57, /*  0.22 */ },
+	{   8130,   56, /*  0.22 */ },
+	{   8726,   55, /*  0.22 */ },
+	{   9346,   54, /*  0.21 */ },
+	{   9991,   53, /*  0.21 */ },
+	{  10661,   52, /*  0.21 */ },
+	{  11358,   52, /*  0.20 */ },
+	{  12082,   51, /*  0.20 */ },
+	{  12834,   50, /*  0.20 */ },
+	{  13614,   49, /*  0.19 */ },
+	{  14424,   48, /*  0.19 */ },
+	{  15265,   48, /*  0.19 */ },
+	{  16137,   47, /*  0.19 */ },
+	{  17042,   46, /*  0.18 */ },
+	{  17981,   45, /*  0.18 */ },
+	{  18955,   45, /*  0.18 */ },
+	{  19965,   44, /*  0.17 */ },
+	{  21013,   43, /*  0.17 */ },
+	{  22101,   43, /*  0.17 */ },
+	{  23230,   42, /*  0.17 */ },
+	{  24402,   41, /*  0.16 */ },
+	{  25618,   41, /*  0.16 */ },
+	{  26881,   40, /*  0.16 */ },
+	{  28193,   39, /*  0.16 */ },
+	{  29557,   39, /*  0.15 */ },
+	{  30975,   38, /*  0.15 */ },
+	{  32450,   38, /*  0.15 */ },
+	{  33986,   37, /*  0.15 */ },
+	{  35586,   36, /*  0.14 */ },
+	{  37253,   36, /*  0.14 */ },
+	{  38992,   35, /*  0.14 */ },
+	{  40808,   35, /*  0.14 */ },
+	{  42707,   34, /*  0.13 */ },
+	{  44694,   33, /*  0.13 */ },
+	{  46776,   33, /*  0.13 */ },
+	{  48961,   32, /*  0.13 */ },
+	{  51258,   32, /*  0.13 */ },
+	{  53677,   31, /*  0.12 */ },
+	{  56230,   30, /*  0.12 */ },
+	{  58932,   30, /*  0.12 */ },
+	{  61799,   29, /*  0.12 */ },
+	{  64851,   28, /*  0.11 */ },
+	{  68113,   28, /*  0.11 */ },
+	{  71617,   27, /*  0.11 */ },
+	{  75401,   26, /*  0.10 */ },
+	{  79517,   26, /*  0.10 */ },
+	{  84035,   25, /*  0.10 */ },
+	{  89053,   24, /*  0.10 */ },
+};
+
+#define HSTCP_AIMD_MAX	ARRAY_SIZE(hstcp_aimd_vals)
+
+struct hstcp {
+	u32	ai;
+};
+
+static void hstcp_init(struct sock *sk)
+{
+	struct hstcp *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* hstcp_ssthresh() multiplies cwnd by an md of at most 128, so cap
+	 * the clamp to keep that product within 32 bits (pedantic). */
+	if (tp->snd_cwnd_clamp > 0xffffffff / 128)
+		tp->snd_cwnd_clamp = 0xffffffff / 128;
+	ca->ai = 0;
+}
+
+static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hstcp *ca = inet_csk_ca(sk);
+	u32 cwnd;
+
+	if (!tcp_is_cwnd_limited(sk))
+		return;
+
+	if (tcp_in_slow_start(tp)) {
+		tcp_slow_start(tp, acked);
+		return;
+	}
+
+	/* Keep ca->ai where hstcp_aimd_vals[ca->ai-1].cwnd < cwnd <=
+	 * hstcp_aimd_vals[ca->ai].cwnd, saturating at the table ends.
+	 */
+	cwnd = tcp_snd_cwnd(tp);
+	if (cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
+		while (cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+		       ca->ai < HSTCP_AIMD_MAX - 1)
+			ca->ai++;
+	} else {
+		while (ca->ai && cwnd <= hstcp_aimd_vals[ca->ai - 1].cwnd)
+			ca->ai--;
+	}
+
+	/* Additive increase, cwnd += a(w) / cwnd with a(w) = ca->ai + 1:
+	 * snd_cwnd_cnt banks the credit until it is worth one segment.
+	 */
+	if (cwnd >= tp->snd_cwnd_clamp)
+		return;
+	tp->snd_cwnd_cnt += ca->ai + 1;
+	if (tp->snd_cwnd_cnt >= cwnd) {
+		tp->snd_cwnd_cnt -= cwnd;
+		tcp_snd_cwnd_set(tp, cwnd + 1);
+	}
+}
+
+static u32 hstcp_ssthresh(struct sock *sk)
+{
+	const struct hstcp *ca = inet_csk_ca(sk);
+	u32 cwnd = tcp_snd_cwnd(tcp_sk(sk));
+
+	/* Multiplicative decrease: take md/256 off cwnd, never below 2. */
+	return max(cwnd - ((cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
+}
+
+static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
+	.init		= hstcp_init,
+	.ssthresh	= hstcp_ssthresh,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+	.cong_avoid	= hstcp_cong_avoid,
+
+	.owner		= THIS_MODULE,
+	.name		= "highspeed"
+};
+
+static int __init hstcp_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE); /* build-time size check of the private state */
+	return tcp_register_congestion_control(&tcp_highspeed);
+}
+
+static void __exit hstcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_highspeed);
+}
+
+module_init(hstcp_register);
+module_exit(hstcp_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("High Speed TCP");
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 0000000000..52b1f2665d
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * H-TCP congestion control. The algorithm is detailed in:
+ * R.N.Shorten, D.J.Leith:
+ *   "H-TCP: TCP for high-speed and long-distance networks"
+ *   Proc. PFLDnet, Argonne, 2004.
+ * https://www.hamilton.ie/net/htcp3.pdf
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define ALPHA_BASE	(1<<7)	/* 1.0 with shift << 7 */
+#define BETA_MIN	(1<<6)	/* 0.5 with shift << 7 */
+#define BETA_MAX	102	/* 0.8 with shift << 7 */
+
+static int use_rtt_scaling __read_mostly = 1;
+module_param(use_rtt_scaling, int, 0644);
+MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
+
+static int use_bandwidth_switch __read_mostly = 1;
+module_param(use_bandwidth_switch, int, 0644);
+MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
+
+struct htcp {
+	u32	alpha;		/* Fixed point arith, << 7 */
+	u8	beta;           /* Fixed point arith, << 7 */
+	u8	modeswitch;	/* Delay modeswitch
+				   until we had at least one congestion event */
+	u16	pkts_acked;	/* last sample->pkts_acked seen in Open state */
+	u32	packetcount;	/* packets acked since lasttime */
+	u32	minRTT;		/* in jiffies; 0 until first sample */
+	u32	maxRTT;		/* in jiffies */
+	u32	last_cong;	/* jiffies stamp taken at the last congestion event */
+	u32	undo_last_cong;	/* saved last_cong for htcp_cwnd_undo(); 0 = none */
+
+	u32	undo_maxRTT;	/* saved maxRTT for htcp_cwnd_undo() */
+	u32	undo_old_maxB;	/* saved old_maxB for htcp_cwnd_undo() */
+
+	/* Bandwidth estimation */
+	u32	minB;
+	u32	maxB;
+	u32	old_maxB;	/* maxB as of the previous beta update */
+	u32	Bi;		/* smoothed estimate, packets per second */
+	u32	lasttime;	/* tcp_jiffies32 stamp of the last estimate */
+};
+
+static inline u32 htcp_cong_time(const struct htcp *ca)
+{
+	return jiffies - ca->last_cong;	/* jiffies elapsed since last_cong was stamped */
+}
+
+static inline u32 htcp_ccount(const struct htcp *ca)
+{
+	return htcp_cong_time(ca) / ca->minRTT;	/* in units of minRTT; caller must ensure minRTT != 0 */
+}
+
+static inline void htcp_reset(struct htcp *ca)
+{
+	ca->undo_last_cong = ca->last_cong;	/* snapshot, restored by htcp_cwnd_undo() */
+	ca->undo_maxRTT = ca->maxRTT;
+	ca->undo_old_maxB = ca->old_maxB;
+
+	ca->last_cong = jiffies;	/* restart the time-since-congestion clock */
+}
+
+static u32 htcp_cwnd_undo(struct sock *sk)
+{
+	struct htcp *ca = inet_csk_ca(sk);
+
+	if (ca->undo_last_cong) {	/* non-zero: a snapshot from htcp_reset() is pending */
+		ca->last_cong = ca->undo_last_cong;
+		ca->maxRTT = ca->undo_maxRTT;
+		ca->old_maxB = ca->undo_old_maxB;
+		ca->undo_last_cong = 0;	/* snapshot consumed */
+	}
+
+	return tcp_reno_undo_cwnd(sk);	/* cwnd itself is restored the Reno way */
+}
+
+static inline void measure_rtt(struct sock *sk, u32 srtt)	/* srtt: RTT sample in jiffies */
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
+
+	/* keep track of minimum RTT seen so far, minRTT is zero at first */
+	if (ca->minRTT > srtt || !ca->minRTT)
+		ca->minRTT = srtt;
+
+	/* max RTT: only updated in Open state, never below minRTT */
+	if (icsk->icsk_ca_state == TCP_CA_Open) {
+		if (ca->maxRTT < ca->minRTT)
+			ca->maxRTT = ca->minRTT;
+		if (ca->maxRTT < srtt &&
+		    srtt <= ca->maxRTT + msecs_to_jiffies(20))	/* ignore jumps of more than 20 ms */
+			ca->maxRTT = srtt;
+	}
+}
+
+static void measure_achieved_throughput(struct sock *sk,
+					const struct ack_sample *sample)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
+	u32 now = tcp_jiffies32;
+
+	if (icsk->icsk_ca_state == TCP_CA_Open)
+		ca->pkts_acked = sample->pkts_acked;	/* consumed by htcp_cong_avoid() */
+
+	if (sample->rtt_us > 0)
+		measure_rtt(sk, usecs_to_jiffies(sample->rtt_us));
+
+	if (!use_bandwidth_switch)
+		return;
+
+	/* achieved throughput calculations */
+	if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) {
+		ca->packetcount = 0;	/* restart the sample outside Open/Disorder */
+		ca->lasttime = now;
+		return;
+	}
+
+	ca->packetcount += sample->pkts_acked;
+
+	if (ca->packetcount >= tcp_snd_cwnd(tp) - (ca->alpha >> 7 ? : 1) &&
+	    now - ca->lasttime >= ca->minRTT &&
+	    ca->minRTT > 0) {	/* together these keep both divisors below non-zero */
+		__u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);	/* packets per second */
+
+		if (htcp_ccount(ca) <= 3) {
+			/* just after backoff */
+			ca->minB = ca->maxB = ca->Bi = cur_Bi;
+		} else {
+			ca->Bi = (3 * ca->Bi + cur_Bi) / 4;	/* 3/4 old + 1/4 new */
+			if (ca->Bi > ca->maxB)
+				ca->maxB = ca->Bi;
+			if (ca->minB > ca->maxB)
+				ca->minB = ca->maxB;
+		}
+		ca->packetcount = 0;
+		ca->lasttime = now;
+	}
+}
+
+static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
+{
+	if (use_bandwidth_switch) {
+		u32 maxB = ca->maxB;
+		u32 old_maxB = ca->old_maxB;
+
+		ca->old_maxB = ca->maxB;
+		if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) {	/* i.e. maxB outside [0.8, 1.2] * old_maxB */
+			ca->beta = BETA_MIN;
+			ca->modeswitch = 0;
+			return;
+		}
+	}
+
+	if (ca->modeswitch && minRTT > msecs_to_jiffies(10) && maxRTT) {
+		ca->beta = (minRTT << 7) / maxRTT;	/* minRTT / maxRTT in << 7 fixed point */
+		if (ca->beta < BETA_MIN)
+			ca->beta = BETA_MIN;
+		else if (ca->beta > BETA_MAX)
+			ca->beta = BETA_MAX;
+	} else {
+		ca->beta = BETA_MIN;
+		ca->modeswitch = 1;	/* the RTT-based beta may be used on the next update */
+	}
+}
+
+static inline void htcp_alpha_update(struct htcp *ca)
+{
+	u32 minRTT = ca->minRTT;
+	u32 factor = 1;
+	u32 diff = htcp_cong_time(ca);	/* jiffies since the last congestion event */
+
+	if (diff > HZ) {	/* more than one second: grow factor with the excess time */
+		diff -= HZ;
+		factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / HZ)) / HZ;
+	}
+
+	if (use_rtt_scaling && minRTT) {
+		u32 scale = (HZ << 3) / (10 * minRTT);	/* << 3 fixed point */
+
+		/* clamping ratio to interval [0.5,10]<<3 */
+		scale = min(max(scale, 1U << 2), 10U << 3);
+		factor = (factor << 3) / scale;
+		if (!factor)
+			factor = 1;
+	}
+
+	ca->alpha = 2 * factor * ((1 << 7) - ca->beta);	/* 2 * factor * (1 - beta), << 7 */
+	if (!ca->alpha)
+		ca->alpha = ALPHA_BASE;
+}
+
+/*
+ * After we have the rtt data to calculate beta, we'd still prefer to wait one
+ * rtt before we adjust our beta to ensure we are working from a consistent
+ * data.
+ *
+ * This function should be called when we hit a congestion event since only at
+ * that point do we really have a real sense of maxRTT (the queues en route
+ * were getting just too full now).
+ */
+static void htcp_param_update(struct sock *sk)
+{
+	struct htcp *ca = inet_csk_ca(sk);
+	u32 minRTT = ca->minRTT;
+	u32 maxRTT = ca->maxRTT;
+
+	htcp_beta_update(ca, minRTT, maxRTT);
+	htcp_alpha_update(ca);
+
+	/* add slowly fading memory for maxRTT to accommodate routing changes */
+	if (minRTT > 0 && maxRTT > minRTT)
+		ca->maxRTT = minRTT + ((maxRTT - minRTT) * 95) / 100;
+}
+
+static u32 htcp_recalc_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct htcp *ca = inet_csk_ca(sk);
+
+	htcp_param_update(sk);	/* refresh beta and alpha before beta is applied */
+	return max((tcp_snd_cwnd(tp) * ca->beta) >> 7, 2U);	/* cwnd * beta, at least 2 */
+}
+
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
+
+	if (!tcp_is_cwnd_limited(sk))
+		return;
+
+	if (tcp_in_slow_start(tp)) {
+		tcp_slow_start(tp, acked);
+		return;
+	}
+
+	/* In theory cwnd += alpha / cwnd; snd_cwnd_cnt collects ACKed
+	 * packets until alpha (<< 7) times that count covers one window. */
+	if (((tp->snd_cwnd_cnt * ca->alpha) >> 7) < tcp_snd_cwnd(tp)) {
+		tp->snd_cwnd_cnt += ca->pkts_acked;
+	} else {
+		if (tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp)
+			tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
+		tp->snd_cwnd_cnt = 0;
+		htcp_alpha_update(ca);
+	}
+	ca->pkts_acked = 1;
+}
+
+static void htcp_init(struct sock *sk)
+{
+	struct htcp *ca = inet_csk_ca(sk);
+
+	memset(ca, 0, sizeof(*ca));	/* all other fields start at zero */
+	ca->last_cong = jiffies;
+	ca->pkts_acked = 1;
+	ca->beta = BETA_MIN;		/* 0.5 */
+	ca->alpha = ALPHA_BASE;		/* 1.0 */
+}
+
+/* set_state hook: track congestion events for the last_cong clock. */
+static void htcp_state(struct sock *sk, u8 new_state)
+{
+	struct htcp *ca = inet_csk_ca(sk);
+
+	switch (new_state) {
+	case TCP_CA_Open:
+		/* Back in Open with a pending snapshot: restart the clock. */
+		if (ca->undo_last_cong) {
+			ca->last_cong = jiffies;
+			ca->undo_last_cong = 0;
+		}
+		break;
+	case TCP_CA_CWR:
+	case TCP_CA_Recovery:
+	case TCP_CA_Loss:
+		htcp_reset(ca);
+		break;
+	}
+}
+
+static struct tcp_congestion_ops htcp __read_mostly = {
+	.init		= htcp_init,
+	.ssthresh	= htcp_recalc_ssthresh,
+	.cong_avoid	= htcp_cong_avoid,
+	.set_state	= htcp_state,
+	.undo_cwnd	= htcp_cwnd_undo,
+	.pkts_acked	= measure_achieved_throughput,
+	.owner		= THIS_MODULE,
+	.name		= "htcp",
+};
+
+static int __init htcp_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE);	/* build-time size check of the private state */
+	BUILD_BUG_ON(BETA_MIN >= BETA_MAX);	/* beta clamp range must be non-empty */
+	return tcp_register_congestion_control(&htcp);
+}
+
+static void __exit htcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&htcp);
+}
+
+module_init(htcp_register);
+module_exit(htcp_unregister);
+
+MODULE_AUTHOR("Baruch Even");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("H-TCP");
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 0000000000..abd7d91807
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * TCP HYBLA
+ *
+ * TCP-HYBLA Congestion control algorithm, based on:
+ *   C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
+ *   for Heterogeneous Networks",
+ *   International Journal on satellite Communications,
+ *				       September 2004
+ *    Daniele Lacamera
+ *    root at danielinux.net
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* Per-connection TCP Hybla state, kept in the socket's CA private area. */
+struct hybla {
+	bool  hybla_en;	      /* true only while the CA state is TCP_CA_Open */
+	u32   snd_cwnd_cents; /* Fractional cwnd increment carried over, <<7 */
+	u32   rho;	      /* Rho parameter, integer part  */
+	u32   rho2;	      /* Rho * Rho, integer part */
+	u32   rho_3ls;	      /* Rho parameter, <<3 */
+	u32   rho2_7ls;	      /* Rho^2, <<7	*/
+	u32   minrtt_us;      /* Minimum smoothed round trip time value seen */
+};
+
+/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
+static int rtt0 = 25;
+module_param(rtt0, int, 0644);
+MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
+
+/* Recompute rho and its fixed-point forms from the current smoothed RTT. */
+static inline void hybla_recalc_param(struct sock *sk)
+{
+	struct hybla *ca = inet_csk_ca(sk);
+	/* rtt0 is writable at runtime (0644): never divide by zero or less */
+	long rtt0_us = max_t(int, rtt0, 1) * USEC_PER_MSEC;
+
+	ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt_us / rtt0_us, 8U);
+	ca->rho = ca->rho_3ls >> 3;
+	ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; /* (<<3)^2 is <<6 */
+	ca->rho2 = ca->rho2_7ls >> 7;
+}
+
+static void hybla_init(struct sock *sk)
+{
+	struct hybla *hy = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_snd_cwnd_set(tp, 2);
+	tp->snd_cwnd_clamp = 65535;
+	hy->hybla_en = true;
+	hy->snd_cwnd_cents = 0;
+	hy->rho2_7ls = 0;
+	hy->rho_3ls = 0;
+	hy->rho2 = 0;
+	hy->rho = 0;
+
+	/* First rho estimate comes from the initial srtt, which is also
+	 * the first, and so far minimum, RTT seen.
+	 */
+	hybla_recalc_param(sk);
+	hy->minrtt_us = tp->srtt_us;
+	tcp_snd_cwnd_set(tp, hy->rho);
+}
+
+static void hybla_state(struct sock *sk, u8 ca_state)
+{
+	struct hybla *ca = inet_csk_ca(sk);
+
+	ca->hybla_en = (ca_state == TCP_CA_Open); /* else cong_avoid uses Reno */
+}
+
+static inline u32 hybla_fraction(u32 odds)
+{
+	/* 2^(odds/8) scaled by 128 (truncated), for odds = 0..7 */
+	static const u32 pow2_eighths_7ls[] = { 128, 139, 152, 165, 181, 197, 215, 234 };
+
+	if (odds >= ARRAY_SIZE(pow2_eighths_7ls))
+		return 128;
+	return pow2_eighths_7ls[odds];
+}
+
+/* TCP Hybla main routine.
+ * This is the algorithm behavior:
+ *     o Recalc Hybla parameters when srtt drops below the minimum seen
+ *     o Grow cwnd by the rho-scaled increment of the Hybla model
+ *     o Remember increments <1 in snd_cwnd_cents (units of 1/128)
+ */
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hybla *ca = inet_csk_ca(sk);
+	u32 increment, odd, rho_fractions;
+	int is_slowstart = 0;
+
+	/*  Recalculate rho only if this srtt is the lowest */
+	if (tp->srtt_us < ca->minrtt_us) {
+		hybla_recalc_param(sk);
+		ca->minrtt_us = tp->srtt_us;
+	}
+
+	if (!tcp_is_cwnd_limited(sk))
+		return;
+
+	if (!ca->hybla_en) {
+		tcp_reno_cong_avoid(sk, ack, acked);
+		return;
+	}
+
+	if (ca->rho == 0)
+		hybla_recalc_param(sk);
+
+	rho_fractions = ca->rho_3ls - (ca->rho << 3);
+
+	if (tcp_in_slow_start(tp)) {
+		/*
+		 * slow start
+		 *      INC = 2^RHO - 1
+		 * This is done by splitting the rho parameter
+		 * into 2 parts: an integer part and a fraction part.
+		 * Increment<<7 is estimated by doing:
+		 *	       [2^(int+fract)]<<7
+		 * that is equal to:
+		 *	       (2^int)	*  [(2^fract) <<7]
+		 * 2^int is straightly computed as 1<<int (int capped at 16),
+		 * while we will use hybla_fraction() to
+		 * calculate 2^fract in a <<7 value.
+		 */
+		is_slowstart = 1;
+		increment = ((1 << min(ca->rho, 16U)) *
+			hybla_fraction(rho_fractions)) - 128;
+	} else {
+		/*
+		 * congestion avoidance
+		 * INC = RHO^2 / W
+		 * as long as increment is estimated as (rho^2 <<7)/window
+		 * it already is <<7 and we can easily count its fractions.
+		 */
+		increment = ca->rho2_7ls / tcp_snd_cwnd(tp);
+		if (increment < 128)
+			tp->snd_cwnd_cnt++;
+	}
+
+	odd = increment % 128;
+	tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + (increment >> 7));
+	ca->snd_cwnd_cents += odd;
+
+	/* check when fractions goes >=128 and increase cwnd by 1. */
+	while (ca->snd_cwnd_cents >= 128) {
+		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
+		ca->snd_cwnd_cents -= 128;
+		tp->snd_cwnd_cnt = 0;
+	}
+	/* check when cwnd has not been incremented for a while */
+	if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) {
+		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
+		tp->snd_cwnd_cnt = 0;
+	}
+	/* clamp down slowstart cwnd to ssthresh value. */
+	if (is_slowstart)
+		tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_ssthresh));
+
+	tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
+}
+
+static struct tcp_congestion_ops tcp_hybla __read_mostly = {
+	.name		= "hybla",
+	.owner		= THIS_MODULE,
+
+	.init		= hybla_init,
+	.set_state	= hybla_state,
+	.cong_avoid	= hybla_cong_avoid,
+	.ssthresh	= tcp_reno_ssthresh,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+};
+
+static int __init hybla_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);	/* must fit CA area */
+	return tcp_register_congestion_control(&tcp_hybla);
+}
+
+static void __exit hybla_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_hybla);	/* undoes hybla_register() */
+}
+
+module_init(hybla_register);
+module_exit(hybla_unregister);
+
+MODULE_AUTHOR("Daniele Lacamera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Hybla");
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
new file mode 100644
index 0000000000..c0c81a2c77
--- /dev/null
+++ b/net/ipv4/tcp_illinois.c
@@ -0,0 +1,360 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * TCP Illinois congestion control.
+ * Home page:
+ *	http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+ *
+ * The algorithm is described in:
+ * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
+ *  for High-Speed Networks"
+ * http://tamerbasar.csl.illinois.edu/LiuBasarSrikantPerfEvalArtJun2008.pdf
+ *
+ * Implemented from description in paper and ns-2 simulation.
+ * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <asm/div64.h>
+#include <net/tcp.h>
+
+#define ALPHA_SHIFT	7
+#define ALPHA_SCALE	(1u<<ALPHA_SHIFT)
+#define ALPHA_MIN	((3*ALPHA_SCALE)/10)	/* ~0.3 */
+#define ALPHA_MAX	(10*ALPHA_SCALE)	/* 10.0 */
+#define ALPHA_BASE	ALPHA_SCALE		/* 1.0 */
+#define RTT_MAX		(U32_MAX / ALPHA_MAX)	/* 3.3 secs */
+
+#define BETA_SHIFT	6
+#define BETA_SCALE	(1u<<BETA_SHIFT)
+#define BETA_MIN	(BETA_SCALE/8)		/* 0.125 */
+#define BETA_MAX	(BETA_SCALE/2)		/* 0.5 */
+#define BETA_BASE	BETA_MAX
+
+static int win_thresh __read_mostly = 15;
+module_param(win_thresh, int, 0);
+MODULE_PARM_DESC(win_thresh, "Window threshold for starting adaptive sizing");
+
+static int theta __read_mostly = 5;
+module_param(theta, int, 0);
+MODULE_PARM_DESC(theta, "# of fast RTT's before full growth");
+
+/* TCP Illinois per-connection state, kept in the socket's CA private area */
+struct illinois {
+	u64	sum_rtt;	/* sum of rtt's measured within last rtt */
+	u16	cnt_rtt;	/* # of rtts measured within last rtt */
+	u32	base_rtt;	/* min of all rtt in usec */
+	u32	max_rtt;	/* max of all rtt in usec */
+	u32	end_seq;	/* right edge of current RTT */
+	u32	alpha;		/* Additive increase, scaled by ALPHA_SCALE */
+	u32	beta;		/* Multiplicative decrease, scaled by BETA_SCALE */
+	u16	acked;		/* # packets acked by current ACK */
+	u8	rtt_above;	/* average rtt has gone above threshold */
+	u8	rtt_low;	/* # of rtts measurements below threshold */
+};
+
+static void rtt_reset(struct sock *sk)
+{
+	struct illinois *ill = inet_csk_ca(sk);
+
+	/* Drop the samples gathered so far; max_rtt is left untouched. */
+	ill->sum_rtt = 0;
+	ill->cnt_rtt = 0;
+	/* The next measurement round ends at the current snd_nxt. */
+	ill->end_seq = tcp_sk(sk)->snd_nxt;
+	/* TODO: age max_rtt? */
+}
+
+static void tcp_illinois_init(struct sock *sk)
+{
+	struct illinois *ill = inet_csk_ca(sk);
+
+	/* RTT extremes start inverted so the first sample replaces them. */
+	ill->max_rtt = 0;
+	ill->base_rtt = 0x7fffffff;
+	ill->rtt_above = 0;
+	ill->rtt_low = 0;
+	ill->acked = 0;
+	/* Start with the largest additive increase and beta = BETA_BASE. */
+	ill->beta = BETA_BASE;
+	ill->alpha = ALPHA_MAX;
+	rtt_reset(sk);
+}
+
+/* Per-ACK hook: remember how many packets this ACK covered and, when the
+ * ACK carries an RTT sample, fold it into the running RTT statistics.
+ */
+static void tcp_illinois_acked(struct sock *sk, const struct ack_sample *sample)
+{
+	struct illinois *ill = inet_csk_ca(sk);
+	u32 rtt;
+
+	ill->acked = sample->pkts_acked;
+
+	/* A negative rtt_us means this ACK carries no RTT sample. */
+	if (sample->rtt_us < 0)
+		return;
+
+	/* Cap the sample at RTT_MAX so the alpha math cannot wrap. */
+	rtt = min_t(u32, sample->rtt_us, RTT_MAX);
+
+	/* Widen the [base_rtt, max_rtt] envelope seen on this connection. */
+	if (rtt < ill->base_rtt)
+		ill->base_rtt = rtt;
+	if (rtt > ill->max_rtt)
+		ill->max_rtt = rtt;
+
+	/* Accumulate for the per-RTT average. */
+	ill->sum_rtt += rtt;
+	++ill->cnt_rtt;
+}
+
+/* Maximum queuing delay: largest RTT seen minus the smallest (base) RTT */
+static inline u32 max_delay(const struct illinois *ca)
+{
+	return ca->max_rtt - ca->base_rtt;
+}
+
+/* Average queuing delay: mean sampled RTT minus base_rtt; needs cnt_rtt != 0 */
+static inline u32 avg_delay(const struct illinois *ca)
+{
+	u64 t = ca->sum_rtt;
+
+	do_div(t, ca->cnt_rtt);	/* t now holds sum_rtt / cnt_rtt */
+	return t - ca->base_rtt;
+}
+
+/*
+ * Compute value of alpha used for additive increase, given the average
+ * queuing delay da and the maximum queuing delay dm (small windows never
+ * get here: update_params() uses 1.0, equivalent to Reno, for them).
+ *
+ * A. If average delay is at minimum (we are uncongested),
+ *    then use large alpha (10.0) to increase faster.
+ * B. If average delay is at maximum (getting congested)
+ *    then use small alpha (0.3)
+ *
+ * The result is a convex window growth curve.
+ */
+static u32 alpha(struct illinois *ca, u32 da, u32 dm)
+{
+	u32 d1 = dm / 100;	/* Low threshold: 1% of the maximum delay */
+
+	if (da <= d1) {
+		/* If never got out of low delay zone, then use max */
+		if (!ca->rtt_above)
+			return ALPHA_MAX;
+
+		/* Wait for theta (default 5) good RTT's before allowing alpha
+		 * to go alpha max; until then keep the current alpha.
+		 * This prevents one good RTT from causing sudden window increase.
+		 */
+		if (++ca->rtt_low < theta)
+			return ca->alpha;
+
+		ca->rtt_low = 0;
+		ca->rtt_above = 0;
+		return ALPHA_MAX;
+	}
+
+	ca->rtt_above = 1;
+
+	/*
+	 * Based on:
+	 *
+	 *      (dm - d1) amin amax
+	 * k1 = -------------------
+	 *         amax - amin
+	 *
+	 *       (dm - d1) amin
+	 * k2 = ----------------  - d1
+	 *        amax - amin
+	 *
+	 *             k1
+	 * alpha = ----------
+	 *          k2 + da
+	 */
+
+	dm -= d1;
+	da -= d1;
+	return (dm * ALPHA_MAX) /
+		(dm + (da  * (ALPHA_MAX - ALPHA_MIN)) / ALPHA_MIN);
+}
+
+/*
+ * Multiplicative-decrease factor, scaled by BETA_SCALE, chosen from the
+ * average queuing delay da relative to the maximum queuing delay dm:
+ *
+ *   da <= 10% of dm            -> BETA_MIN (1/8)
+ *   da >= 80% of dm            -> BETA_MAX (1/2)
+ *   otherwise                  -> linear interpolation between the two
+ */
+static u32 beta(u32 da, u32 dm)
+{
+	const u32 lo = dm / 10;		/* d2: 10% of the maximum delay */
+	const u32 hi = (8 * dm) / 10;	/* d3: 80% of the maximum delay */
+	u32 num;
+
+	if (da <= lo)
+		return BETA_MIN;
+
+	if (da >= hi || hi <= lo)
+		return BETA_MAX;
+
+	/*
+	 * Straight line through (lo, BETA_MIN) and (hi, BETA_MAX):
+	 *
+	 *       bmin hi - bmax lo
+	 * k3 = -------------------
+	 *          hi - lo
+	 *
+	 *       bmax - bmin
+	 * k4 = -------------
+	 *         hi - lo
+	 *
+	 * b = k3 + k4 da
+	 */
+	num = BETA_MIN * hi - BETA_MAX * lo + (BETA_MAX - BETA_MIN) * da;
+	return num / (hi - lo);
+}
+
+/* Once per RTT: recompute alpha and beta, then start a new sample round. */
+static void update_params(struct sock *sk)
+{
+	struct illinois *ill = inet_csk_ca(sk);
+
+	if (tcp_snd_cwnd(tcp_sk(sk)) < win_thresh) {
+		/* Small window: fixed base values, no delay adaptation. */
+		ill->beta = BETA_BASE;
+		ill->alpha = ALPHA_BASE;
+	} else if (ill->cnt_rtt > 0) {
+		/* Only adapt when this round produced RTT samples. */
+		const u32 delay_max = max_delay(ill);
+		const u32 delay_avg = avg_delay(ill);
+
+		ill->alpha = alpha(ill, delay_avg, delay_max);
+		ill->beta = beta(delay_avg, delay_max);
+	}
+	rtt_reset(sk);
+}
+
+/* State-change hook: on entering TCP_CA_Loss, return to the base alpha and
+ * beta, forget the low/high delay history and restart RTT sampling.
+ */
+static void tcp_illinois_state(struct sock *sk, u8 new_state)
+{
+	struct illinois *ill = inet_csk_ca(sk);
+
+	if (new_state != TCP_CA_Loss)
+		return;
+	ill->rtt_above = 0;
+	ill->rtt_low = 0;
+	ill->beta = BETA_BASE;
+	ill->alpha = ALPHA_BASE;
+	rtt_reset(sk);
+}
+
+/*
+ * Grow the congestion window in response to a successful acknowledgment.
+ */
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct illinois *ill = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 cwnd, delta;
+
+	/* ACK beyond end_seq: a measurement round is over, refresh alpha/beta */
+	if (after(ack, ill->end_seq))
+		update_params(sk);
+
+	/* RFC2861 only increase cwnd if fully utilized */
+	if (!tcp_is_cwnd_limited(sk))
+		return;
+
+	if (tcp_in_slow_start(tp)) {
+		tcp_slow_start(tp, acked);
+		return;
+	}
+
+	/* snd_cwnd_cnt is # of packets since last cwnd increment */
+	tp->snd_cwnd_cnt += ill->acked;
+	ill->acked = 1;
+
+	/* This is close approximation of:
+	 * tp->snd_cwnd += alpha/tp->snd_cwnd
+	 */
+	cwnd = tcp_snd_cwnd(tp);
+	delta = (tp->snd_cwnd_cnt * ill->alpha) >> ALPHA_SHIFT;
+	if (delta < cwnd)
+		return;
+
+	tcp_snd_cwnd_set(tp, min(cwnd + delta / cwnd, (u32)tp->snd_cwnd_clamp));
+	tp->snd_cwnd_cnt = 0;
+}
+
+static u32 tcp_illinois_ssthresh(struct sock *sk)
+{
+	const struct illinois *ill = inet_csk_ca(sk);
+	const u32 cwnd = tcp_snd_cwnd(tcp_sk(sk));
+	/* Multiplicative decrease: shed beta/BETA_SCALE of the window */
+	const u32 cut = (cwnd * ill->beta) >> BETA_SHIFT;
+
+	/* never report less than two segments */
+	return max(cwnd - cut, 2U);
+}
+
+/* Report Illinois RTT statistics in Vegas-info format for inet_diag/netlink. */
+static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
+				union tcp_cc_info *info)
+{
+	const struct illinois *ill = inet_csk_ca(sk);
+	struct tcpvegas_info *vegas = &info->vegas;
+
+	if (!(ext & (1 << (INET_DIAG_VEGASINFO - 1))))
+		return 0;
+
+	vegas->tcpv_enabled = 1;
+	vegas->tcpv_rttcnt = ill->cnt_rtt;
+	vegas->tcpv_minrtt = ill->base_rtt;
+	vegas->tcpv_rtt = 0;
+	if (vegas->tcpv_rttcnt > 0) {
+		u64 avg = ill->sum_rtt;
+
+		do_div(avg, vegas->tcpv_rttcnt);
+		vegas->tcpv_rtt = avg;	/* mean RTT of the current round */
+	}
+	*attr = INET_DIAG_VEGASINFO;
+	return sizeof(struct tcpvegas_info);
+}
+
+static struct tcp_congestion_ops tcp_illinois __read_mostly = {
+	.name		= "illinois",
+	.owner		= THIS_MODULE,
+
+	.init		= tcp_illinois_init,
+	.set_state	= tcp_illinois_state,
+	.pkts_acked	= tcp_illinois_acked,
+	.cong_avoid	= tcp_illinois_cong_avoid,
+	.ssthresh	= tcp_illinois_ssthresh,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+	.get_info	= tcp_illinois_info,
+};
+
+static int __init tcp_illinois_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct illinois) > ICSK_CA_PRIV_SIZE); /* must fit CA area */
+	return tcp_register_congestion_control(&tcp_illinois);
+}
+
+static void __exit tcp_illinois_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_illinois); /* undoes tcp_illinois_register() */
+}
+
+module_init(tcp_illinois_register);
+module_exit(tcp_illinois_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger, Shao Liu");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Illinois");
+MODULE_VERSION("1.0");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
new file mode 100644
index 0000000000..e6c4929549
--- /dev/null
+++ b/net/ipv4/tcp_input.c
@@ -0,0 +1,7113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes:
+ *		Pedro Roque	:	Fast Retransmit/Recovery.
+ *					Two receive queues.
+ *					Retransmit queue handled by TCP.
+ *					Better retransmit timer handling.
+ *					New congestion avoidance.
+ *					Header prediction.
+ *					Variable renaming.
+ *
+ *		Eric		:	Fast Retransmit.
+ *		Randy Scott	:	MSS option defines.
+ *		Eric Schenk	:	Fixes to slow start algorithm.
+ *		Eric Schenk	:	Yet another double ACK bug.
+ *		Eric Schenk	:	Delayed ACK bug fixes.
+ *		Eric Schenk	:	Floyd style fast retrans war avoidance.
+ *		David S. Miller	:	Don't allow zero congestion window.
+ *		Eric Schenk	:	Fix retransmitter so that it sends
+ *					next packet on ack of previous packet.
+ *		Andi Kleen	:	Moved open_request checking here
+ *					and process RSTs for open_requests.
+ *		Andi Kleen	:	Better prune_queue, and other fixes.
+ *		Andrey Savochkin:	Fix RTT measurements in the presence of
+ *					timestamps.
+ *		Andrey Savochkin:	Check sequence numbers correctly when
+ *					removing SACKs due to in sequence incoming
+ *					data segments.
+ *		Andi Kleen:		Make sure we never ack data there is not
+ *					enough room for. Also make this condition
+ *					a fatal error if it might still happen.
+ *		Andi Kleen:		Add tcp_measure_rcv_mss to make
+ *					connections with MSS<min(MTU,ann. MSS)
+ *					work without delayed acks.
+ *		Andi Kleen:		Process packets with PSH set in the
+ *					fast path.
+ *		J Hadi Salim:		ECN support
+ *	 	Andrei Gurtov,
+ *		Pasi Sarolahti,
+ *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
+ *					engine. Lots of bugs are found.
+ *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/kernel.h>
+#include <linux/prefetch.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <linux/ipsec.h>
+#include <asm/unaligned.h>
+#include <linux/errqueue.h>
+#include <trace/events/tcp.h>
+#include <linux/jump_label_ratelimit.h>
+#include <net/busy_poll.h>
+#include <net/mptcp.h>
+
+int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
+
+#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
+#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
+#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
+#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
+#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
+#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
+#define FLAG_ECE		0x40 /* ECE in this ACK				*/
+#define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
+#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
+#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
+#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
+#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
+#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
+#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
+#define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack()	*/
+#define FLAG_ACK_MAYBE_DELAYED	0x10000 /* Likely a delayed ACK */
+#define FLAG_DSACK_TLP		0x20000 /* DSACK for tail loss probe */
+
+#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
+#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
+#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
+#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
+
+#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
+#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
+
+#define REXMIT_NONE	0 /* no loss recovery to do */
+#define REXMIT_LOST	1 /* retransmit packets marked lost */
+#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */
+
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
+
+void clean_acked_data_enable(struct inet_connection_sock *icsk,
+			     void (*cad)(struct sock *sk, u32 ack_seq))
+{
+	icsk->icsk_clean_acked = cad;	/* install the per-socket callback */
+	static_branch_deferred_inc(&clean_acked_data_enabled); /* one more user of the key */
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_enable);
+
+void clean_acked_data_disable(struct inet_connection_sock *icsk)
+{
+	static_branch_slow_dec_deferred(&clean_acked_data_enabled); /* drop this user */
+	icsk->icsk_clean_acked = NULL;	/* socket no longer has a callback */
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_disable);
+
+void clean_acked_data_flush(void)
+{
+	static_key_deferred_flush(&clean_acked_data_enabled); /* NOTE(review): presumably completes any pending deferred decrement - confirm */
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_flush);
+#endif
+
+#ifdef CONFIG_CGROUP_BPF
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+	bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
+		BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+				       BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
+	bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+						    BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
+	struct bpf_sock_ops_kern sock_ops;
+
+	if (likely(!unknown_opt && !parse_all_opt))	/* neither callback flag applies */
+		return;
+
+	/* In these states the skb will be handled in
+	 * bpf_skops_established() or bpf_skops_write_hdr_opt()
+	 * instead, so the parse callback is not run from here.
+	 */
+	switch (sk->sk_state) {
+	case TCP_SYN_RECV:
+	case TCP_SYN_SENT:
+	case TCP_LISTEN:
+		return;
+	}
+
+	sock_owned_by_me(sk);
+
+	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); /* zero up to 'temp' only */
+	sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
+	sock_ops.is_fullsock = 1;
+	sock_ops.sk = sk;
+	bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+	BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+				  struct sk_buff *skb)
+{
+	struct bpf_sock_ops_kern sock_ops;
+
+	sock_owned_by_me(sk);
+
+	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); /* zero up to 'temp' only */
+	sock_ops.op = bpf_op;	/* caller picks which sock_ops callback runs */
+	sock_ops.is_fullsock = 1;
+	sock_ops.sk = sk;
+	/* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
+	if (skb)
+		bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+	BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+#else /* !CONFIG_CGROUP_BPF: both sock_ops hooks compile to no-ops */
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+				  struct sk_buff *skb)
+{
+}
+#endif /* CONFIG_CGROUP_BPF */
+
+static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
+			     unsigned int len)
+{
+	static bool __once __read_mostly;
+	struct net_device *dev;
+
+	if (__once)	/* one-shot: later calls return immediately */
+		return;
+	__once = true;
+
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
+	/* Warn when the ingress device is gone or len reaches its MTU. */
+	if (!dev || len >= dev->mtu)
+		pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
+			dev ? dev->name : "Unknown driver");
+	rcu_read_unlock();
+}
+
+/* Adapt icsk_ack.rcv_mss, the MSS estimate used for delayed-ACK
+ * decisions, to the segment sizes actually received from the peer.
+ */
+static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const unsigned int lss = icsk->icsk_ack.last_seg_size;
+	unsigned int len;
+
+	icsk->icsk_ack.last_seg_size = 0;
+
+	/* skb->len may jitter because of SACKs, even if peer
+	 * sends good full-sized frames.
+	 */
+	len = skb_shinfo(skb)->gso_size ? : skb->len;
+	if (len >= icsk->icsk_ack.rcv_mss) {
+		/* Note: divides are still a bit expensive.
+		 * For the moment, only adjust scaling_ratio
+		 * when we update icsk_ack.rcv_mss.
+		 */
+		if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
+			u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
+
+			do_div(val, skb->truesize);
+			tcp_sk(sk)->scaling_ratio = val ? val : 1;
+		}
+		icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
+					       tcp_sk(sk)->advmss);
+		/* Account for possibly-removed options */
+		if (unlikely(len > icsk->icsk_ack.rcv_mss +
+				   MAX_TCP_OPTION_SPACE))
+			tcp_gro_dev_warn(sk, skb, len);
+		/* If the skb has a len of exactly 1*MSS and has the PSH bit
+		 * set then it is likely the end of an application write. So
+		 * more data may not be arriving soon, and yet the data sender
+		 * may be waiting for an ACK if cwnd-bound or using TX zero
+		 * copy. So we set ICSK_ACK_PUSHED here so that
+		 * tcp_cleanup_rbuf() will send an ACK immediately if the app
+		 * reads all of the data and is not ping-pong. If len > MSS
+		 * then this logic does not matter (and does not hurt) because
+		 * tcp_cleanup_rbuf() will always ACK immediately if the app
+		 * reads data and there is more than an MSS of unACKed data.
+		 */
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
+			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
+	} else {
+		/* Otherwise, we make a more careful check, taking into
+		 * account that the SACK block length is variable.
+		 *
+		 * "len" is invariant segment length, including TCP header.
+		 */
+		len += skb->data - skb_transport_header(skb);
+		if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
+		    /* If PSH is not set, packet should be
+		     * full sized, provided peer TCP is not badly broken.
+		     * This observation (if it is correct 8)) allows
+		     * to handle super-low mtu links fairly.
+		     */
+		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
+		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
+			/* Subtract also invariant (if peer is RFC compliant),
+			 * tcp header plus fixed timestamp option length.
+			 * Resulting "len" is MSS free of SACK jitter.
+			 */
+			len -= tcp_sk(sk)->tcp_header_len;
+			icsk->icsk_ack.last_seg_size = len;
+			if (len == lss) {
+				icsk->icsk_ack.rcv_mss = len;
+				return;
+			}
+		}
+		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
+			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
+		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
+	}
+}
+
+static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	/* one quick ACK per two rcv_mss-sized segments of receive window */
+	unsigned int budget = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
+
+	budget = min(budget ? : 2U, max_quickacks);
+	/* only ever raise the remaining quick-ACK count */
+	if (icsk->icsk_ack.quick < budget)
+		icsk->icsk_ack.quick = budget;
+}
+
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
+{
+	/* Top up the quick-ACK budget and leave ping-pong mode. */
+	tcp_incr_quickack(sk, max_quickacks);
+	inet_csk_exit_pingpong_mode(sk);
+	/* Drop icsk_ack.ato back to TCP_ATO_MIN. */
+	inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+}
+
+/* Quick-ACK mode is on when the route carries a non-zero RTAX_QUICKACK
+ * metric, or when the "quick" count is not exhausted and the session is
+ * not in ping-pong (interactive) mode.
+ */
+static bool tcp_in_quickack_mode(struct sock *sk)
+{
+	const struct dst_entry *dst = __sk_dst_get(sk);
+
+	if (dst && dst_metric(dst, RTAX_QUICKACK))
+		return true;
+	return inet_csk(sk)->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk);
+}
+
+static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
+{
+	if (tp->ecn_flags & TCP_ECN_OK)	/* no-op unless ECN is enabled here */
+		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
+}
+
+static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
+{
+	if (!tcp_hdr(skb)->cwr)
+		return;
+
+	/* CWR seen: stop demanding it from the peer. */
+	tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+	/* If the sender is telling us it has entered CWR, then its cwnd may
+	 * be very low (even just 1 packet), so ACK at once when seq != end_seq.
+	 */
+	if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
+		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+}
+
+static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
+{
+	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;	/* reverses tcp_ecn_queue_cwr() */
+}
+
+static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
+	case INET_ECN_NOT_ECT:
+		/* Funny extension: if ECT is not set on a segment,
+		 * and we have already seen ECT on a previous segment,
+		 * it is probably a retransmit.
+		 */
+		if (tp->ecn_flags & TCP_ECN_SEEN)
+			tcp_enter_quickack_mode(sk, 2);
+		break;
+	case INET_ECN_CE:
+		if (tcp_ca_needs_ecn(sk))
+			tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
+
+		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+			/* Better not delay acks, sender can have a very low cwnd */
+			tcp_enter_quickack_mode(sk, 2);
+			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+		}
+		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
+	default:	/* remaining codepoints: ECT set, no CE mark */
+		if (tcp_ca_needs_ecn(sk))
+			tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
+		tp->ecn_flags |= TCP_ECN_SEEN;
+		break;
+	}
+}
+
+static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
+{
+	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)	/* skip all ECN work otherwise */
+		__tcp_ecn_check_ce(sk, skb);
+}
+
+static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
+{
+	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
+		tp->ecn_flags &= ~TCP_ECN_OK;	/* SYN-ACK must carry ECE without CWR */
+}
+
+static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
+{
+	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
+		tp->ecn_flags &= ~TCP_ECN_OK;	/* SYN must carry both ECE and CWR */
+}
+
+static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
+{
+	/* ECE counts only on a non-SYN segment while TCP_ECN_OK is set */
+	bool echo = th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK);
+
+	return echo;
+}
+
+/* Buffer size and advertised window tuning.
+ *
+ * 1. Tuning sk->sk_sndbuf, when connection enters established state.
+ */
+
+static void tcp_sndbuf_expand(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+	int sndmem, per_mss;
+	u32 nr_segs;
+
+	/* Worst case is non GSO/TSO : each frame consumes one skb
+	 * and skb->head is kmalloced using power of two area of memory
+	 */
+	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+		  MAX_TCP_HEADER +
+		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	per_mss = roundup_pow_of_two(per_mss) +
+		  SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+	nr_segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp)); /* at least the initial cwnd */
+	nr_segs = max_t(u32, nr_segs, tp->reordering + 1); /* and cover the reordering degree */
+
+	/* Fast Recovery (RFC 5681 3.2) :
+	 * Cubic needs 1.7 factor, rounded to 2 to include
+	 * extra cushion (application might react slowly to EPOLLOUT)
+	 */
+	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2; /* CA hook may override 2 */
+	sndmem *= nr_segs * per_mss;
+
+	if (sk->sk_sndbuf < sndmem)	/* only ever grow, capped at tcp_wmem[2] */
+		WRITE_ONCE(sk->sk_sndbuf,
+			   min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2])));
+}
+
+/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
+ *
+ * All tcp_full_space() is split to two parts: "network" buffer, allocated
+ * forward and advertised in receiver window (tp->rcv_wnd) and
+ * "application buffer", required to isolate scheduling/application
+ * latencies from network.
+ * window_clamp is maximal advertised window. It can be less than
+ * tcp_full_space(), in this case tcp_full_space() - window_clamp
+ * is reserved for "application" buffer. The less window_clamp is
+ * the smoother our behaviour from viewpoint of network, but the lower
+ * throughput and the higher sensitivity of the connection to losses. 8)
+ *
+ * rcv_ssthresh is more strict window_clamp used at "slow start"
+ * phase to predict further behaviour of this connection.
+ * It is used for two goals:
+ * - to enforce header prediction at sender, even when application
+ *   requires some significant "application buffer". It is check #1.
+ * - to prevent pruning of receive queue because of misprediction
+ *   of receiver window. Check #2.
+ *
+ * The scheme does not work when sender sends good segments opening
+ * window and then starts to feed us spaghetti. But it should work
+ * in common situations. Otherwise, we have to rely on queue collapsing.
+ */
+
+/* Slow part of check#2: returns the rcv_ssthresh increment, or 0 for none. */
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
+			     unsigned int skbtruesize)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	/* Optimize this! */
+	int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
+	int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;
+
+	while (tp->rcv_ssthresh <= window) {	/* halve both until window < rcv_ssthresh */
+		if (truesize <= skb->len)
+			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
+
+		truesize >>= 1;
+		window >>= 1;
+	}
+	return 0;
+}
+
+/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
+ * can play nice with us, as sk_buff and skb->head might be either
+ * freed or shared with up to MAX_SKB_FRAGS segments.
+ * So when @adjust is set and skb->head holds no payload (frame lives in
+ * page frags), report truesize minus SKB_TRUESIZE(skb_end_offset(skb)).
+ */
+static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
+{
+	u32 trimmed;
+
+	if (!adjust || skb_headlen(skb))
+		return skb->truesize;
+	trimmed = skb->truesize - SKB_TRUESIZE(skb_end_offset(skb));
+	/* paranoid check, some drivers might be buggy */
+	if (unlikely((int)trimmed < (int)skb->len))
+		return skb->truesize;
+	return trimmed;
+}
+
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
+			    bool adjust)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int room;	/* headroom between rcv_ssthresh and its upper bound */
+
+	room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
+
+	if (room <= 0)	/* rcv_ssthresh already at or above the bound */
+		return;
+
+	/* Check #1 */
+	if (!tcp_under_memory_pressure(sk)) {
+		unsigned int truesize = truesize_adjust(adjust, skb);
+		int incr;	/* proposed rcv_ssthresh increment; 0 means none */
+
+		/* Check #2. Increase window, if skb with such overhead
+		 * will fit to rcvbuf in future.
+		 */
+		if (tcp_win_from_space(sk, truesize) <= skb->len)
+			incr = 2 * tp->advmss;
+		else
+			incr = __tcp_grow_window(sk, skb, truesize);
+
+		if (incr) {
+			incr = max_t(int, incr, 2 * skb->len);
+			tp->rcv_ssthresh += min(room, incr);	/* capped by room */
+			inet_csk(sk)->icsk_ack.quick |= 1;
+		}
+	} else {
+		/* Under pressure:
+		 * Adjust rcv_ssthresh according to reserved mem
+		 */
+		tcp_adjust_rcv_ssthresh(sk);
+	}
+}
+
+/* 3. Try to fix up everything. This is done immediately after the
+ *    connection enters established state.
+ */
+static void tcp_init_buffer_space(struct sock *sk)
+{
+	int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int maxwin;	/* tcp_full_space(sk), the ceiling for window_clamp */
+
+	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))	/* skipped when the lock bit is set */
+		tcp_sndbuf_expand(sk);
+
+	tcp_mstamp_refresh(tp);
+	tp->rcvq_space.time = tp->tcp_mstamp;	/* start of the rcvq_space measurement */
+	tp->rcvq_space.seq = tp->copied_seq;
+
+	maxwin = tcp_full_space(sk);
+
+	if (tp->window_clamp >= maxwin) {
+		tp->window_clamp = maxwin;
+
+		if (tcp_app_win && maxwin > 4 * tp->advmss)	/* hold back maxwin >> tcp_app_win */
+			tp->window_clamp = max(maxwin -
+					       (maxwin >> tcp_app_win),
+					       4 * tp->advmss);
+	}
+
+	/* Force reservation of one segment. */
+	if (tcp_app_win &&
+	    tp->window_clamp > 2 * tp->advmss &&
+	    tp->window_clamp + tp->advmss > maxwin)
+		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
+
+	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
+	tp->snd_cwnd_stamp = tcp_jiffies32;
+	tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
+				    (u32)TCP_INIT_CWND * tp->advmss);
+}
+
+/* 4. Recalculate window clamp after socket hit its memory bounds. */
+static void tcp_clamp_window(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct net *net = sock_net(sk);
+	int rmem2;	/* snapshot of sysctl_tcp_rmem[2] */
+
+	icsk->icsk_ack.quick = 0;
+	rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+	if (sk->sk_rcvbuf < rmem2 &&
+	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+	    !tcp_under_memory_pressure(sk) &&
+	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
+		WRITE_ONCE(sk->sk_rcvbuf,	/* new rcvbuf: rmem_alloc, capped at rmem2 */
+			   min(atomic_read(&sk->sk_rmem_alloc), rmem2));
+	}
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)	/* still above rcvbuf */
+		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
+}
+
+/* Initialize RCV_MSS value.
+ * RCV_MSS is our guess about the MSS used by the peer.
+ * We have no direct information about the MSS.
+ * It's better to underestimate the RCV_MSS rather than overestimate.
+ * Overestimations make us ACK less frequently than needed.
+ * Underestimations are easier to detect and fix by tcp_measure_rcv_mss().
+ */
+void tcp_initialize_rcv_mss(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
+
+	hint = min(hint, tp->rcv_wnd / 2);	/* at most half of rcv_wnd */
+	hint = min(hint, TCP_MSS_DEFAULT);	/* upper bound */
+	hint = max(hint, TCP_MIN_MSS);	/* lower bound, applied last so it wins */
+
+	inet_csk(sk)->icsk_ack.rcv_mss = hint;
+}
+EXPORT_SYMBOL(tcp_initialize_rcv_mss);
+
+/* Receiver "autotuning" code.
+ *
+ * The algorithm for RTT estimation w/o timestamps is based on
+ * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
+ * <https://public.lanl.gov/radiant/pubs.html#DRS>
+ *
+ * More detail on this code can be found at
+ * <http://staff.psc.edu/jheffner/>,
+ * though this reference is out of date.  A new paper
+ * is pending.
+ */
+static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
+{
+	u32 new_sample = tp->rcv_rtt_est.rtt_us;	/* kept scaled by 8 (see "<< 3") */
+	long m = sample;
+
+	if (new_sample != 0) {
+		/* If we sample in larger samples in the non-timestamp
+		 * case, we could grossly overestimate the RTT especially
+		 * with chatty applications or bulk transfer apps which
+		 * are stalled on filesystem I/O.
+		 *
+		 * Also, since we are only going for a minimum in the
+		 * non-timestamp case, we do not smooth things out
+		 * else with timestamps disabled convergence takes too
+		 * long.
+		 */
+		if (!win_dep) {
+			m -= (new_sample >> 3);	/* m becomes sample minus current estimate */
+			new_sample += m;	/* scaled: new = old - old/8 + sample */
+		} else {
+			m <<= 3;	/* bring the sample to the stored scale */
+			if (m < new_sample)	/* win_dep: keep only the minimum */
+				new_sample = m;
+		}
+	} else {
+		/* No previous measure. */
+		new_sample = m << 3;
+	}
+
+	tp->rcv_rtt_est.rtt_us = new_sample;
+}
+
+static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
+{
+	u32 delta_us;
+
+	if (tp->rcv_rtt_est.time == 0)	/* no measurement started yet */
+		goto new_measure;
+	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))	/* target sequence not reached */
+		return;
+	delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
+	if (!delta_us)
+		delta_us = 1;	/* never feed a zero sample */
+	tcp_rcv_rtt_update(tp, delta_us, 1);	/* win_dep = 1: minimum-tracking mode */
+
+new_measure:
+	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;	/* next target: rcv_wnd ahead */
+	tp->rcv_rtt_est.time = tp->tcp_mstamp;
+}
+
+static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
+					  const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)	/* this tsecr was already handled */
+		return;
+	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
+
+	if (TCP_SKB_CB(skb)->end_seq -
+	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
+		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
+		u32 delta_us;
+
+		if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {	/* bounds the multiply below */
+			if (!delta)
+				delta = 1;	/* never feed a zero sample */
+			delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+			tcp_rcv_rtt_update(tp, delta_us, 0);	/* win_dep = 0: smoothed mode */
+		}
+	}
+}
+
+/*
+ * This function should be called every time data is copied to user space.
+ * It calculates the appropriate TCP receive buffer space.
+ */
+void tcp_rcv_space_adjust(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 copied;
+	int time;	/* elapsed since rcvq_space.time, from tcp_stamp_us_delta() */
+
+	trace_tcp_rcv_space_adjust(sk);
+
+	tcp_mstamp_refresh(tp);
+	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
+	if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
+		return;	/* less than one estimated RTT elapsed, or no estimate yet */
+
+	/* Number of bytes copied to user in last RTT */
+	copied = tp->copied_seq - tp->rcvq_space.seq;
+	if (copied <= tp->rcvq_space.space)	/* no growth: just restart the measurement */
+		goto new_measure;
+
+	/* A bit of theory :
+	 * copied = bytes received in previous RTT, our base window
+	 * To cope with packet losses, we need a 2x factor
+	 * To cope with slow start, and sender growing its cwin by 100 %
+	 * every RTT, we need a 4x factor, because the ACK we are sending
+	 * now is for the next RTT, not the current one :
+	 * <prev RTT . ><current RTT .. ><next RTT .... >
+	 */
+
+	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
+	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+		u64 rcvwin, grow;
+		int rcvbuf;
+
+		/* minimal window to cope with packet losses, assuming
+		 * steady state. Add some cushion because of small variations.
+		 */
+		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
+
+		/* Accommodate for sender rate increase (eg. slow start) */
+		grow = rcvwin * (copied - tp->rcvq_space.space);
+		do_div(grow, tp->rcvq_space.space);	/* NOTE(review): divisor assumed non-zero - confirm */
+		rcvwin += (grow << 1);
+
+		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
+			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
+		if (rcvbuf > sk->sk_rcvbuf) {	/* sk_rcvbuf is only ever raised here */
+			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+
+			/* Make the window clamp follow along.  */
+			tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
+		}
+	}
+	tp->rcvq_space.space = copied;
+
+new_measure:
+	tp->rcvq_space.seq = tp->copied_seq;
+	tp->rcvq_space.time = tp->tcp_mstamp;
+}
+
+/* There is something which you must keep in mind when you analyze the
+ * behavior of the tp->ato delayed ack timeout interval.  When a
+ * connection starts up, we want to ack as quickly as possible.  The
+ * problem is that "good" TCP's do slow start at the beginning of data
+ * transmission.  This means that until we send the first few ACK's the
+ * sender will sit on his end and only queue most of his data, because
+ * he can only send snd_cwnd unacked packets at any given time.  For
+ * each ACK we send, he increments snd_cwnd and transmits more of his
+ * queue.  -DaveM
+ */
+static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	u32 now;
+
+	inet_csk_schedule_ack(sk);
+
+	tcp_measure_rcv_mss(sk, skb);
+
+	tcp_rcv_rtt_measure(tp);
+
+	now = tcp_jiffies32;
+
+	if (!icsk->icsk_ack.ato) {
+		/* The _first_ data packet received, initialize
+		 * delayed ACK engine.
+		 */
+		tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
+		icsk->icsk_ack.ato = TCP_ATO_MIN;
+	} else {
+		int m = now - icsk->icsk_ack.lrcvtime;	/* gap since the previous call stored lrcvtime */
+
+		if (m <= TCP_ATO_MIN / 2) {
+			/* The fastest case is the first. */
+			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
+		} else if (m < icsk->icsk_ack.ato) {
+			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
+			if (icsk->icsk_ack.ato > icsk->icsk_rto)	/* ato is capped at rto */
+				icsk->icsk_ack.ato = icsk->icsk_rto;
+		} else if (m > icsk->icsk_rto) {
+			/* Too long gap. Apparently sender failed to
+			 * restart window, so that we send ACKs quickly.
+			 */
+			tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
+		}
+	}
+	icsk->icsk_ack.lrcvtime = now;
+
+	tcp_ecn_check_ce(sk, skb);
+
+	if (skb->len >= 128)	/* shorter skbs never grow the window here */
+		tcp_grow_window(sk, skb, true);
+}
+
+/* Called to compute a smoothed rtt estimate. The data fed to this
+ * routine either comes from timestamps, or from segments that were
+ * known _not_ to have been retransmitted [see Karn/Partridge
+ * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
+ * piece by Van Jacobson.
+ * NOTE: the next three routines used to be one big routine.
+ * To save cycles in the RFC 1323 implementation it was better to break
+ * it up into three procedures. -- erics
+ */
+static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	long m = mrtt_us; /* RTT */
+	u32 srtt = tp->srtt_us;	/* kept scaled by 8, see "m << 3" below */
+
+	/*	The following amusing code comes from Jacobson's
+	 *	article in SIGCOMM '88.  Note that rtt and mdev
+	 *	are scaled versions of rtt and mean deviation.
+	 *	This is designed to be as fast as possible
+	 *	m stands for "measurement".
+	 *
+	 *	On a 1990 paper the rto value is changed to:
+	 *	RTO = rtt + 4 * mdev
+	 *
+	 * Funny. This algorithm seems to be very broken.
+	 * These formulae increase RTO, when it should be decreased, increase
+	 * too slowly, when it should be increased quickly, decrease too quickly
+	 * etc. I guess in BSD RTO takes ONE value, so that it absolutely
+	 * does not matter how to _calculate_ it. Seems it was a trap
+	 * that VJ failed to avoid. 8)
+	 */
+	if (srtt != 0) {
+		m -= (srtt >> 3);	/* m is now error in rtt est */
+		srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
+		if (m < 0) {
+			m = -m;		/* m is now abs(error) */
+			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
+			/* This is similar to one of Eifel findings.
+			 * Eifel blocks mdev updates when rtt decreases.
+			 * This solution is a bit different: we use finer gain
+			 * for mdev in this case (alpha*beta).
+			 * Like Eifel it also prevents growth of rto,
+			 * but also it limits too fast rto decreases,
+			 * happening in pure Eifel.
+			 */
+			if (m > 0)
+				m >>= 3;
+		} else {
+			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
+		}
+		tp->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */
+		if (tp->mdev_us > tp->mdev_max_us) {
+			tp->mdev_max_us = tp->mdev_us;
+			if (tp->mdev_max_us > tp->rttvar_us)	/* rttvar follows a rising maximum at once */
+				tp->rttvar_us = tp->mdev_max_us;
+		}
+		if (after(tp->snd_una, tp->rtt_seq)) {	/* snd_una passed the recorded rtt_seq */
+			if (tp->mdev_max_us < tp->rttvar_us)
+				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
+			tp->rtt_seq = tp->snd_nxt;
+			tp->mdev_max_us = tcp_rto_min_us(sk);
+
+			tcp_bpf_rtt(sk);
+		}
+	} else {
+		/* no previous measure. */
+		srtt = m << 3;		/* take the measured time to be rtt */
+		tp->mdev_us = m << 1;	/* make sure rto = 3*rtt */
+		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
+		tp->mdev_max_us = tp->rttvar_us;
+		tp->rtt_seq = tp->snd_nxt;
+
+		tcp_bpf_rtt(sk);
+	}
+	tp->srtt_us = max(1U, srtt);	/* never store 0: 0 means "no previous measure" above */
+}
+
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u64 rate;
+
+	/* Base term; presumably "/ 100" pairs with percent ratios, "<< 3" with 8x srtt_us */
+	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
+
+	/* current rate is (cwnd * mss) / srtt
+	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
+	 * In Congestion Avoidance phase, set it to 120 % the current rate.
+	 *
+	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
+	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
+	 *	 end of slow start and should slow down.
+	 */
+	if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2)
+		rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio);
+	else
+		rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio);
+
+	rate *= max(tcp_snd_cwnd(tp), tp->packets_out);
+
+	if (likely(tp->srtt_us))	/* skip the division while srtt_us is still 0 */
+		do_div(rate, tp->srtt_us);
+
+	/* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
+	 * without any lock. We want to make sure compiler wont store
+	 * intermediate values in this location.
+	 */
+	WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
+					     sk->sk_max_pacing_rate));
+}
+
+/* Calculate rto without backoff.  This is the second half of Van Jacobson's
+ * routine referred to above.
+ */
+static void tcp_set_rto(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	/* Old crap is replaced with new one. 8)
+	 *
+	 * More seriously:
+	 * 1. If rtt variance happened to be less than 50msec, it is hallucination.
+	 *    It cannot be less due to utterly erratic ACK generation made
+	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
+	 *    to do with delayed acks, because at cwnd>2 true delack timeout
+	 *    is invisible. Actually, Linux-2.4 also generates erratic
+	 *    ACKs in some circumstances.
+	 */
+	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);	/* raw value; bounded by tcp_bound_rto() below */
+
+	/* 2. Fixups made earlier cannot be right.
+	 *    If we do not estimate RTO correctly without them,
+	 *    all the algo is pure shit and should be replaced
+	 *    with correct one. It is exactly, which we pretend to do.
+	 */
+
+	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
+	 * guarantees that rto is higher.
+	 */
+	tcp_bound_rto(sk);
+}
+
+__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
+{
+	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);	/* 0 when dst is NULL */
+
+	if (!cwnd)	/* no dst, or the metric is zero: fall back to the default */
+		cwnd = TCP_INIT_CWND;
+	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);	/* never above snd_cwnd_clamp */
+}
+
+struct tcp_sacktag_state {
+	/* Timestamps for earliest and latest never-retransmitted segment
+	 * that was SACKed. RTO needs the earliest RTT to stay conservative,
+	 * but congestion control should still get an accurate delay signal.
+	 */
+	u64	first_sackt;
+	u64	last_sackt;
+	u32	reord;	/* lowered to start_seq by tcp_sacktag_one() on reordering */
+	u32	sack_delivered;	/* segments counted as delivered via SACK/DSACK */
+	int	flag;	/* FLAG_* bits accumulated while processing */
+	unsigned int mss_now;
+	struct rate_sample *rate;	/* handed to tcp_rate_skb_delivered() */
+};
+
+/* Take a notice that peer is sending D-SACKs. Skip update of data delivery
+ * and spurious retransmission information if this DSACK is unlikely caused by
+ * sender's action:
+ * - DSACKed sequence range is larger than maximum receiver's window.
+ * - Total no. of DSACKed segments exceed the total no. of retransmitted segs.
+ */
+static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
+			  u32 end_seq, struct tcp_sacktag_state *state)
+{
+	u32 seq_len, dup_segs = 1;	/* a range of at most one mss counts as one segment */
+
+	if (!before(start_seq, end_seq))	/* empty or reversed range */
+		return 0;
+
+	seq_len = end_seq - start_seq;
+	/* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
+	if (seq_len > tp->max_window)
+		return 0;
+	if (seq_len > tp->mss_cache)
+		dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
+	else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq)
+		state->flag |= FLAG_DSACK_TLP;
+
+	tp->dsack_dups += dup_segs;	/* accumulated even when the check below rejects */
+	/* Skip the DSACK if dup segs weren't retransmitted by sender */
+	if (tp->dsack_dups > tp->total_retrans)
+		return 0;
+
+	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
+	/* We increase the RACK ordering window in rounds where we receive
+	 * DSACKs that may have been due to reordering causing RACK to trigger
+	 * a spurious fast recovery. Thus RACK ignores DSACKs that happen
+	 * without having seen reordering, or that match TLP probes (TLP
+	 * is timer-driven, not triggered by RACK).
+	 */
+	if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP))
+		tp->rack.dsack_seen = 1;
+
+	state->flag |= FLAG_DSACKING_ACK;
+	/* A spurious retransmission is delivered */
+	state->sack_delivered += dup_segs;
+
+	return dup_segs;	/* non-zero: number of duplicate segments accounted */
+}
+
+/* It's reordering when higher sequence was delivered (i.e. sacked) before
+ * some lower never-retransmitted sequence ("low_seq"). The maximum reordering
+ * distance is approximated in full-mss packet distance ("reordering").
+ */
+static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
+				      const int ts)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const u32 mss = tp->mss_cache;
+	u32 fack, metric;	/* highest SACKed seq; its distance from low_seq */
+
+	fack = tcp_highest_sack_seq(tp);
+	if (!before(low_seq, fack))	/* low_seq is not below the highest SACK: nothing to do */
+		return;
+
+	metric = fack - low_seq;
+	if ((metric > tp->reordering * mss) && mss) {	/* "&& mss" guards the division below */
+#if FASTRETRANS_DEBUG > 1
+		pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
+			 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
+			 tp->reordering,
+			 0,
+			 tp->sacked_out,
+			 tp->undo_marker ? tp->undo_retrans : 0);
+#endif
+		tp->reordering = min_t(u32, (metric + mss - 1) / mss,
+				       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
+	}
+
+	/* This exciting event is worth to be remembered. 8) */
+	tp->reord_seen++;
+	NET_INC_STATS(sock_net(sk),
+		      ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
+}
+
+ /* This must be called before lost_out or retrans_out are updated
+  * on a new loss, because we want to know if all skbs previously
+  * known to be lost have already been retransmitted, indicating
+  * that this newly lost skb is our next skb to retransmit.
+  */
+static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||	/* no hint, nothing pending */
+	    (tp->retransmit_skb_hint &&
+	     before(TCP_SKB_CB(skb)->seq,	/* or skb starts before the current hint */
+		    TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
+		tp->retransmit_skb_hint = skb;
+}
+
+/* Sum the number of packets on the wire we have marked as lost.
+ * NOTE(review): only tp->lost is updated here; no congestion control hook.
+ */
+static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+	tp->lost += tcp_skb_pcount(skb);	/* counted in packets (pcount) */
+}
+
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+{
+	__u8 sacked = TCP_SKB_CB(skb)->sacked;	/* snapshot of the tag bits on entry */
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (sacked & TCPCB_SACKED_ACKED)	/* SACKed skbs are never marked lost */
+		return;
+
+	tcp_verify_retransmit_hint(tp, skb);	/* before lost_out/retrans_out change below */
+	if (sacked & TCPCB_LOST) {
+		if (sacked & TCPCB_SACKED_RETRANS) {
+			/* Account for retransmits that are lost again */
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+			tp->retrans_out -= tcp_skb_pcount(skb);
+			NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
+				      tcp_skb_pcount(skb));
+			tcp_notify_skb_loss_event(tp, skb);
+		}
+	} else {
+		tp->lost_out += tcp_skb_pcount(skb);	/* first time this skb is marked lost */
+		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+		tcp_notify_skb_loss_event(tp, skb);
+	}
+}
+
+/* Updates the delivered and delivered_ce counts */
+static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
+				bool ece_ack)
+{
+	tp->delivered += delivered;
+	if (ece_ack)	/* delivered_ce only grows when ece_ack is set */
+		tp->delivered_ce += delivered;
+}
+
+/* This procedure tags the retransmission queue when SACKs arrive.
+ *
+ * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
+ * Packets in queue with these bits set are counted in variables
+ * sacked_out, retrans_out and lost_out, correspondingly.
+ *
+ * Valid combinations are:
+ * Tag  InFlight	Description
+ * 0	1		- orig segment is in flight.
+ * S	0		- nothing flies, orig reached receiver.
+ * L	0		- nothing flies, orig lost by net.
+ * R	2		- both orig and retransmit are in flight.
+ * L|R	1		- orig is lost, retransmit is in flight.
+ * S|R  1		- orig reached receiver, retrans is still in flight.
+ * (L|S|R is logically valid, it could occur when L|R is sacked,
+ *  but it is equivalent to plain S and code short-circuits it to S.
+ *  L|S is logically invalid, it would mean -1 packet in flight 8))
+ *
+ * These 6 states form finite state machine, controlled by the following events:
+ * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
+ * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
+ * 3. Loss detection event of two flavors:
+ *	A. Scoreboard estimator decided the packet is lost.
+ *	   A'. Reno "three dupacks" marks head of queue lost.
+ *	B. SACK arrives sacking SND.NXT at the moment, when the
+ *	   segment was retransmitted.
+ * 4. D-SACK added new rule: D-SACK changes any tag to S.
+ *
+ * It is pleasant to note, that state diagram turns out to be commutative,
+ * so that we are allowed not to be bothered by order of our actions,
+ * when multiple events arrive simultaneously. (see the function below).
+ *
+ * Reordering detection.
+ * --------------------
+ * Reordering metric is maximal distance, which a packet can be displaced
+ * in packet stream. With SACKs we can estimate it:
+ *
+ * 1. SACK fills old hole and the corresponding segment was not
+ *    ever retransmitted -> reordering. Alas, we cannot use it
+ *    when segment was retransmitted.
+ * 2. The last flaw is solved with D-SACK. D-SACK arrives
+ *    for retransmitted and already SACKed segment -> reordering..
+ * Both of these heuristics are not used in Loss state, when we cannot
+ * account for retransmits accurately.
+ *
+ * SACK block validation.
+ * ----------------------
+ *
+ * SACK block range validation checks that the received SACK block fits to
+ * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
+ * Note that SND.UNA is not included to the range though being valid because
+ * it means that the receiver is rather inconsistent with itself reporting
+ * SACK reneging when it should advance SND.UNA. Such a SACK block is
+ * perfectly valid, however, in light of RFC2018 which explicitly states
+ * that "SACK block MUST reflect the newest segment.  Even if the newest
+ * segment is going to be discarded ...", not that it looks very clever
+ * in case of head skb. Due to potential receiver driven attacks, we
+ * choose to avoid immediate execution of a walk in write queue due to
+ * reneging and defer head skb's loss recovery to standard loss recovery
+ * procedure that will eventually trigger (nothing forbids us doing this).
+ *
+ * Implements also blockage to start_seq wrap-around. Problem lies in the
+ * fact that though start_seq (s) is before end_seq (i.e., not reversed),
+ * there's no guarantee that it will be before snd_nxt (n). The problem
+ * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
+ * wrap (s_w):
+ *
+ *         <- outs wnd ->                          <- wrapzone ->
+ *         u     e      n                         u_w   e_w  s n_w
+ *         |     |      |                          |     |   |  |
+ * |<------------+------+----- TCP seqno space --------------+---------->|
+ * ...-- <2^31 ->|                                           |<--------...
+ * ...---- >2^31 ------>|                                    |<--------...
+ *
+ * Current code wouldn't be vulnerable but it's better still to discard such
+ * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
+ * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
+ * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
+ * equal to the ideal case (infinite seqno space without wrap caused issues).
+ *
+ * With D-SACK the lower bound is extended to cover sequence space below
+ * SND.UNA down to undo_marker, which is the last point of interest. Yet
+ * again, D-SACK block must not go across snd_una (for the same reason as
+ * for the normal SACK blocks, explained above). But there all simplicity
+ * ends, TCP might receive valid D-SACKs below that. As long as they reside
+ * fully below undo_marker they do not affect behavior in any way and can
+ * therefore be safely ignored. In rare cases (which are more or less
+ * theoretical ones), the D-SACK will nicely cross that boundary due to skb
+ * fragmentation and packet reordering past skb's retransmission. To consider
+ * them correctly, the acceptable range must be extended even more though
+ * the exact amount is rather hard to quantify. However, tp->max_window can
+ * be used as an exaggerated estimate.
+ */
+static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
+				   u32 start_seq, u32 end_seq)
+{
+	/* Too far in future, or reversed (interpretation is ambiguous) */
+	if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
+		return false;
+
+	/* Nasty start_seq wrap-around check (see comments above) */
+	if (!before(start_seq, tp->snd_nxt))
+		return false;
+
+	/* In outstanding window? ...This is valid exit for D-SACKs too.
+	 * start_seq == snd_una is non-sensical (see comments above)
+	 */
+	if (after(start_seq, tp->snd_una))
+		return true;
+
+	if (!is_dsack || !tp->undo_marker)	/* at or below snd_una: D-SACK with undo_marker only */
+		return false;
+
+	/* ...Then it's D-SACK, and must reside below snd_una completely */
+	if (after(end_seq, tp->snd_una))
+		return false;
+
+	if (!before(start_seq, tp->undo_marker))	/* entirely at or above undo_marker */
+		return true;
+
+	/* Too old */
+	if (!after(end_seq, tp->undo_marker))
+		return false;
+
+	/* Undo_marker boundary crossing (overestimates a lot). Known already:
+	 *   start_seq < undo_marker and end_seq >= undo_marker.
+	 */
+	return !before(start_seq, end_seq - tp->max_window);	/* span of at most max_window */
+}
+
+static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
+			    struct tcp_sack_block_wire *sp, int num_sacks,
+			    u32 prior_snd_una, struct tcp_sacktag_state *state)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);	/* first block, big-endian on the wire */
+	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
+	u32 dup_segs;
+
+	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {	/* block 0 starts below ack_seq */
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
+	} else if (num_sacks > 1) {
+		u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
+		u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
+
+		if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
+			return false;	/* block 0 is not contained in block 1 */
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
+	} else {
+		return false;	/* neither D-SACK form matched */
+	}
+
+	dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
+	if (!dup_segs) {	/* Skip dubious DSACK */
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
+		return false;
+	}
+
+	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
+
+	/* D-SACK for already forgotten data... Do dumb counting. */
+	if (tp->undo_marker && tp->undo_retrans > 0 &&
+	    !after(end_seq_0, prior_snd_una) &&
+	    after(end_seq_0, tp->undo_marker))
+		tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);	/* floor at 0 */
+
+	return true;
+}
+
+/* Check if skb is fully within the SACK block. In presence of GSO skbs,
+ * the incoming SACK may not exactly match but we can find smaller MSS
+ * aligned portion of it that matches. Therefore we might need to fragment
+ * which may fail and creates some hassle (caller must handle error case
+ * returns).
+ *
+ * FIXME: this could be merged to shift decision code
+ */
+static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
+				  u32 start_seq, u32 end_seq)
+{
+	int err;
+	bool in_sack;	/* skb, or its head after a split, is covered by the block */
+	unsigned int pkt_len;	/* split offset handed to tcp_fragment() */
+	unsigned int mss;
+
+	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+	if (tcp_skb_pcount(skb) > 1 && !in_sack &&
+	    after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
+		mss = tcp_skb_mss(skb);
+		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);	/* true: the block covers the skb head */
+
+		if (!in_sack) {
+			pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
+			if (pkt_len < mss)
+				pkt_len = mss;
+		} else {
+			pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
+			if (pkt_len < mss)
+				return -EINVAL;	/* covered head is shorter than one mss */
+		}
+
+		/* Round if necessary so that SACKs cover only full MSSes
+		 * and/or the remaining small portion (if present)
+		 */
+		if (pkt_len > mss) {
+			unsigned int new_len = (pkt_len / mss) * mss;
+			if (!in_sack && new_len < pkt_len)
+				new_len += mss;
+			pkt_len = new_len;
+		}
+
+		if (pkt_len >= skb->len && !in_sack)	/* rounded split point is past the skb */
+			return 0;
+
+		err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+				   pkt_len, mss, GFP_ATOMIC);
+		if (err < 0)
+			return err;
+	}
+
+	return in_sack;	/* 0 or 1 here; negative errors were returned above */
+}
+
+/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
+static u8 tcp_sacktag_one(struct sock *sk,
+			  struct tcp_sacktag_state *state, u8 sacked,
+			  u32 start_seq, u32 end_seq,
+			  int dup_sack, int pcount,
+			  u64 xmit_time)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Account D-SACK for retransmitted packet. */
+	if (dup_sack && (sacked & TCPCB_RETRANS)) {
+		if (tp->undo_marker && tp->undo_retrans > 0 &&
+		    after(end_seq, tp->undo_marker))
+			tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount);	/* floor at 0 */
+		if ((sacked & TCPCB_SACKED_ACKED) &&
+		    before(start_seq, state->reord))
+				state->reord = start_seq;
+	}
+
+	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
+	if (!after(end_seq, tp->snd_una))
+		return sacked;
+
+	if (!(sacked & TCPCB_SACKED_ACKED)) {	/* range was not tagged SACKed before */
+		tcp_rack_advance(tp, sacked, end_seq, xmit_time);
+
+		if (sacked & TCPCB_SACKED_RETRANS) {
+			/* If the segment is not tagged as lost,
+			 * we do not clear RETRANS, believing
+			 * that retransmission is still in flight.
+			 */
+			if (sacked & TCPCB_LOST) {
+				sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+				tp->lost_out -= pcount;
+				tp->retrans_out -= pcount;
+			}
+		} else {
+			if (!(sacked & TCPCB_RETRANS)) {
+				/* New sack for not retransmitted frame,
+				 * which was in hole. It is reordering.
+				 */
+				if (before(start_seq,
+					   tcp_highest_sack_seq(tp)) &&
+				    before(start_seq, state->reord))
+					state->reord = start_seq;
+
+				if (!after(end_seq, tp->high_seq))
+					state->flag |= FLAG_ORIG_SACK_ACKED;
+				if (state->first_sackt == 0)	/* only the first one is recorded */
+					state->first_sackt = xmit_time;
+				state->last_sackt = xmit_time;	/* overwritten on every pass */
+			}
+
+			if (sacked & TCPCB_LOST) {
+				sacked &= ~TCPCB_LOST;
+				tp->lost_out -= pcount;
+			}
+		}
+
+		sacked |= TCPCB_SACKED_ACKED;
+		state->flag |= FLAG_DATA_SACKED;
+		tp->sacked_out += pcount;
+		/* Out-of-order packets delivered */
+		state->sack_delivered += pcount;
+
+		/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
+		if (tp->lost_skb_hint &&
+		    before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
+			tp->lost_cnt_hint += pcount;
+	}
+
+	/* D-SACK. We can detect redundant retransmission in S|R and plain R
+	 * frames and clear it. undo_retrans is decreased above, L|R frames
+	 * are accounted above as well.
+	 */
+	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
+		sacked &= ~TCPCB_SACKED_RETRANS;
+		tp->retrans_out -= pcount;
+	}
+
+	return sacked;	/* tag bits after this update */
+}
+
+/* Account for @shifted bytes (@pcount segs) the caller has already moved from
+ * the head of @skb into SACKed @prev. Returns true iff @skb was fully eaten.
+ */
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+			    struct sk_buff *skb,
+			    struct tcp_sacktag_state *state,
+			    unsigned int pcount, int shifted, int mss,
+			    bool dup_sack)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 start_seq = TCP_SKB_CB(skb)->seq;	/* start of newly-SACKed */
+	u32 end_seq = start_seq + shifted;	/* end of newly-SACKed */
+
+	BUG_ON(!pcount);
+
+	/* Adjust counters and hints for the newly sacked sequence
+	 * range but discard the return value since prev is already
+	 * marked. We must tag the range first because the seq
+	 * advancement below implicitly advances
+	 * tcp_highest_sack_seq() when skb is highest_sack.
+	 */
+	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
+			start_seq, end_seq, dup_sack, pcount,
+			tcp_skb_timestamp_us(skb));
+	tcp_rate_skb_delivered(sk, skb, state->rate);
+
+	if (skb == tp->lost_skb_hint)
+		tp->lost_cnt_hint += pcount;
+	/* Slide the prev/skb boundary forward in sequence space */
+	TCP_SKB_CB(prev)->end_seq += shifted;
+	TCP_SKB_CB(skb)->seq += shifted;
+
+	tcp_skb_pcount_add(prev, pcount);
+	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
+	tcp_skb_pcount_add(skb, -pcount);
+
+	/* When we're adding to gso_segs == 1, gso_size will be zero,
+	 * in theory this shouldn't be necessary but as long as DSACK
+	 * code can come after this skb later on it's better to keep
+	 * setting gso_size to something.
+	 */
+	if (!TCP_SKB_CB(prev)->tcp_gso_size)
+		TCP_SKB_CB(prev)->tcp_gso_size = mss;
+
+	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
+	if (tcp_skb_pcount(skb) <= 1)
+		TCP_SKB_CB(skb)->tcp_gso_size = 0;
+
+	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
+	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+	if (skb->len > 0) {
+		BUG_ON(!tcp_skb_pcount(skb));
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
+		return false;	/* skb still holds data, it stays queued */
+	}
+
+	/* Whole SKB was eaten :-) */
+	/* prev takes over the hints, flags and eor bit that belonged to skb */
+	if (skb == tp->retransmit_skb_hint)
+		tp->retransmit_skb_hint = prev;
+	if (skb == tp->lost_skb_hint) {
+		tp->lost_skb_hint = prev;
+		tp->lost_cnt_hint -= tcp_skb_pcount(prev);
+	}
+
+	TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+	TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+		TCP_SKB_CB(prev)->end_seq++;
+
+	if (skb == tcp_highest_sack(sk))
+		tcp_advance_highest_sack(sk, skb);
+
+	tcp_skb_collapse_tstamp(prev, skb);
+	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
+		TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
+
+	tcp_rtx_queue_unlink_and_free(skb, sk);	/* skb must not be used past here */
+
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
+
+	return true;
+}
+
+/* Per-segment length of @skb: the full skb length when it carries exactly
+ * one segment (gso_size is something-or-zero then), otherwise its MSS.
+ */
+static int tcp_skb_seglen(const struct sk_buff *skb)
+{
+	return tcp_skb_pcount(skb) != 1 ? tcp_skb_mss(skb) : skb->len;
+}
+
+/* Only a nonlinear skb with an empty head area can have data shifted out */
+static int skb_can_shift(const struct sk_buff *skb)
+{
+	return skb_is_nonlinear(skb) && !skb_headlen(skb);
+}
+
+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
+		  int pcount, int shiftlen)
+{
+	/* tcp_gso_segs is 16 bits wide and the smallest TCP gso_size is
+	 * TCP_MIN_GSO_SIZE (8) bytes: refuse, by returning 0, any shift that
+	 * would bring @to to 65535 * 8 bytes or beyond 65535 segments, even
+	 * when the current MSS is bigger.
+	 */
+	if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE) ||
+	    unlikely(tcp_skb_pcount(to) + pcount > 65535))
+		return 0;
+
+	return skb_shift(to, from, shiftlen);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single skb.
+ * Returns prev after a shift, @skb itself if left untouched, NULL to fall back.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+					  struct tcp_sacktag_state *state,
+					  u32 start_seq, u32 end_seq,
+					  bool dup_sack)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *prev;
+	int mss;
+	int pcount = 0;		/* segments to shift into prev */
+	int len;		/* bytes to shift into prev */
+	int in_sack;
+
+	/* Normally R but no L won't result in plain S */
+	if (!dup_sack &&
+	    (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
+		goto fallback;
+	if (!skb_can_shift(skb))
+		goto fallback;
+	/* This frame is about to be dropped (was ACKed). */
+	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+		goto fallback;
+
+	/* Can only happen with delayed DSACK + discard craziness */
+	prev = skb_rb_prev(skb);
+	if (!prev)
+		goto fallback;
+
+	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+		goto fallback;
+
+	if (!tcp_skb_can_collapse(prev, skb))
+		goto fallback;
+
+	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+	if (in_sack) {
+		len = skb->len;
+		pcount = tcp_skb_pcount(skb);
+		mss = tcp_skb_seglen(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_skb_seglen(prev))
+			goto fallback;
+	} else {
+		if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+			goto noop;
+		/* CHECKME: This is non-MSS split case only?, this will
+		 * cause skipped skbs due to advancing loop btw, original
+		 * has that feature too
+		 */
+		if (tcp_skb_pcount(skb) <= 1)
+			goto noop;
+
+		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+		if (!in_sack) {
+			/* TODO: head merge to next could be attempted here
+			 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+			 * though it might not be worth of the additional hassle
+			 *
+			 * ...we can probably just fallback to what was done
+			 * previously. We could try merging non-SACKed ones
+			 * as well but it probably isn't going to buy off
+			 * because later SACKs might again split them, and
+			 * it would make skb timestamp tracking considerably
+			 * harder problem.
+			 */
+			goto fallback;
+		}
+
+		len = end_seq - TCP_SKB_CB(skb)->seq;
+		BUG_ON(len < 0);
+		BUG_ON(len > skb->len);
+
+		/* MSS boundaries should be honoured or else pcount will
+		 * severely break even though it makes things bit trickier.
+		 * Optimize common case to avoid most of the divides
+		 */
+		mss = tcp_skb_mss(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_skb_seglen(prev))
+			goto fallback;
+
+		if (len == mss) {
+			pcount = 1;
+		} else if (len < mss) {
+			goto noop;
+		} else {
+			pcount = len / mss;
+			len = pcount * mss;	/* round down to whole segments */
+		}
+	}
+
+	/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
+	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
+		goto fallback;
+
+	if (!tcp_skb_shift(prev, skb, pcount, len))
+		goto fallback;
+	if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
+		goto out;
+
+	/* Hole filled allows collapsing with the next as well, this is very
+	 * useful when hole on every nth skb pattern happens
+	 */
+	skb = skb_rb_next(prev);
+	if (!skb)
+		goto out;
+
+	if (!skb_can_shift(skb) ||
+	    ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
+	    (mss != tcp_skb_seglen(skb)))
+		goto out;
+
+	if (!tcp_skb_can_collapse(prev, skb))
+		goto out;
+	len = skb->len;
+	pcount = tcp_skb_pcount(skb);
+	if (tcp_skb_shift(prev, skb, pcount, len))
+		tcp_shifted_skb(sk, prev, skb, state, pcount,
+				len, mss, 0);
+
+out:
+	return prev;	/* data was shifted: the caller continues from prev */
+
+noop:
+	return skb;	/* nothing shifted and nothing to tag in this skb */
+
+fallback:
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
+	return NULL;	/* the caller handles this skb without shifting */
+}
+
+static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
+					struct tcp_sack_block *next_dup,
+					struct tcp_sacktag_state *state,
+					u32 start_seq, u32 end_seq,
+					bool dup_sack_in)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *tmp;
+	/* Tag skbs from @skb up to end_seq; returns where the walk stopped. */
+	skb_rbtree_walk_from(skb) {
+		int in_sack = 0;	/* >0 tag this skb, 0 skip it, <0 stop */
+		bool dup_sack = dup_sack_in;
+
+		/* queue is in-order => we can short-circuit the walk early */
+		if (!before(TCP_SKB_CB(skb)->seq, end_seq))
+			break;
+
+		if (next_dup  &&
+		    before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
+			in_sack = tcp_match_skb_to_sack(sk, skb,
+							next_dup->start_seq,
+							next_dup->end_seq);
+			if (in_sack > 0)
+				dup_sack = true;
+		}
+
+		/* skb reference here is a bit tricky to get right, since
+		 * shifting can eat and free both this skb and the next,
+		 * so not even _safe variant of the loop is enough.
+		 */
+		if (in_sack <= 0) {
+			tmp = tcp_shift_skb_data(sk, skb, state,
+						 start_seq, end_seq, dup_sack);
+			if (tmp) {
+				if (tmp != skb) {
+					/* shifted: resume from the returned skb */
+					skb = tmp;
+					continue;
+				}
+
+				in_sack = 0;
+			} else {
+				/* no shift possible: match this skb directly */
+				in_sack = tcp_match_skb_to_sack(sk, skb,
+								start_seq,
+								end_seq);
+			}
+		}
+
+		if (unlikely(in_sack < 0))
+			break;
+
+		if (in_sack) {
+			TCP_SKB_CB(skb)->sacked =
+				tcp_sacktag_one(sk,
+						state,
+						TCP_SKB_CB(skb)->sacked,
+						TCP_SKB_CB(skb)->seq,
+						TCP_SKB_CB(skb)->end_seq,
+						dup_sack,
+						tcp_skb_pcount(skb),
+						tcp_skb_timestamp_us(skb));
+			tcp_rate_skb_delivered(sk, skb, state->rate);
+			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+				list_del_init(&skb->tcp_tsorted_anchor);
+
+			if (!before(TCP_SKB_CB(skb)->seq,
+				    tcp_highest_sack_seq(tp)))
+				tcp_advance_highest_sack(sk, skb);
+		}
+	}
+	return skb;
+}
+
+/* Find the rtx-queue skb whose [seq, end_seq) range holds @seq, else NULL. */
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
+{
+	struct rb_node *node = sk->tcp_rtx_queue.rb_node;
+
+	/* Plain binary descent: go left when @seq precedes the skb, go
+	 * right when it lies at or beyond the skb's end.
+	 */
+	while (node) {
+		struct sk_buff *cur = rb_to_skb(node);
+
+		if (before(seq, TCP_SKB_CB(cur)->seq))
+			node = node->rb_left;
+		else if (!before(seq, TCP_SKB_CB(cur)->end_seq))
+			node = node->rb_right;
+		else
+			return cur;
+	}
+	return NULL;
+}
+
+static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
+					u32 skip_to_seq)
+{
+	/* Keep @skb if it already starts past the target, else search anew */
+	if (!skb || !after(TCP_SKB_CB(skb)->seq, skip_to_seq))
+		return tcp_sacktag_bsearch(sk, skip_to_seq);
+	return skb;
+}
+
+static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
+						struct sock *sk,
+						struct tcp_sack_block *next_dup,
+						struct tcp_sacktag_state *state,
+						u32 skip_to_seq)
+{
+	/* Nothing to do unless a pending D-SACK block starts before the
+	 * point the caller is about to skip to; in that case tag it now,
+	 * as a duplicate, so the skip does not jump over it.
+	 */
+	if (!next_dup || !before(next_dup->start_seq, skip_to_seq))
+		return skb;
+
+	skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
+
+	return tcp_sacktag_walk(skb, sk, NULL, state, next_dup->start_seq,
+				next_dup->end_seq, 1);
+}
+
+static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
+{
+	return cache < &tp->recv_sack_cache[ARRAY_SIZE(tp->recv_sack_cache)]; /* in bounds? */
+}
+
+static int
+tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
+			u32 prior_snd_una, struct tcp_sacktag_state *state)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const unsigned char *ptr = (skb_transport_header(ack_skb) +
+				    TCP_SKB_CB(ack_skb)->sacked); /* used as byte offset */
+	struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
+	struct tcp_sack_block sp[TCP_NUM_SACKS];
+	struct tcp_sack_block *cache;
+	struct sk_buff *skb;
+	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
+	int used_sacks;		/* number of usable blocks kept in sp[] */
+	bool found_dup_sack = false;
+	int i, j;
+	int first_sack_index;	/* where wire block 0 sits in sp[]; -1 if dropped */
+	/* Apply @ack_skb's SACK blocks to the rtx queue; returns state->flag (or 0). */
+	state->flag = 0;
+	state->reord = tp->snd_nxt;
+
+	if (!tp->sacked_out)
+		tcp_highest_sack_reset(sk);
+
+	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
+					 num_sacks, prior_snd_una, state);
+
+	/* Eliminate too old ACKs, but take into
+	 * account more or less fresh ones, they can
+	 * contain valid SACK info.
+	 */
+	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
+		return 0;
+
+	if (!tp->packets_out)
+		goto out;
+
+	used_sacks = 0;
+	first_sack_index = 0;
+	for (i = 0; i < num_sacks; i++) {
+		bool dup_sack = !i && found_dup_sack;
+
+		sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
+		sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
+
+		if (!tcp_is_sackblock_valid(tp, dup_sack,
+					    sp[used_sacks].start_seq,
+					    sp[used_sacks].end_seq)) {
+			int mib_idx;
+
+			if (dup_sack) {
+				if (!tp->undo_marker)
+					mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
+				else
+					mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
+			} else {
+				/* Don't count olds caused by ACK reordering */
+				if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
+				    !after(sp[used_sacks].end_seq, tp->snd_una))
+					continue;
+				mib_idx = LINUX_MIB_TCPSACKDISCARD;
+			}
+
+			NET_INC_STATS(sock_net(sk), mib_idx);
+			if (i == 0)
+				first_sack_index = -1;
+			continue;
+		}
+
+		/* Ignore very old stuff early */
+		if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
+			if (i == 0)
+				first_sack_index = -1;
+			continue;
+		}
+
+		used_sacks++;	/* keep this block; skipped ones get overwritten */
+	}
+
+	/* order SACK blocks to allow in order walk of the retrans queue */
+	for (i = used_sacks - 1; i > 0; i--) {
+		for (j = 0; j < i; j++) {
+			if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
+				swap(sp[j], sp[j + 1]);
+
+				/* Track where the first SACK block goes to */
+				if (j == first_sack_index)
+					first_sack_index = j + 1;
+			}
+		}
+	}
+
+	state->mss_now = tcp_current_mss(sk);
+	skb = NULL;
+	i = 0;
+
+	if (!tp->sacked_out) {
+		/* It's already past, so skip checking against it */
+		cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+	} else {
+		cache = tp->recv_sack_cache;
+		/* Skip empty blocks at the head of the cache */
+		while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
+		       !cache->end_seq)
+			cache++;
+	}
+
+	while (i < used_sacks) {
+		u32 start_seq = sp[i].start_seq;
+		u32 end_seq = sp[i].end_seq;
+		bool dup_sack = (found_dup_sack && (i == first_sack_index));
+		struct tcp_sack_block *next_dup = NULL;
+
+		if (found_dup_sack && ((i + 1) == first_sack_index))
+			next_dup = &sp[i + 1];
+
+		/* Skip too early cached blocks */
+		while (tcp_sack_cache_ok(tp, cache) &&
+		       !before(start_seq, cache->end_seq))
+			cache++;
+
+		/* Can skip some work by looking recv_sack_cache? */
+		if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
+		    after(end_seq, cache->start_seq)) {
+
+			/* Head todo? */
+			if (before(start_seq, cache->start_seq)) {
+				skb = tcp_sacktag_skip(skb, sk, start_seq);
+				skb = tcp_sacktag_walk(skb, sk, next_dup,
+						       state,
+						       start_seq,
+						       cache->start_seq,
+						       dup_sack);
+			}
+
+			/* Rest of the block already fully processed? */
+			if (!after(end_seq, cache->end_seq))
+				goto advance_sp;
+
+			skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
+						       state,
+						       cache->end_seq);
+
+			/* ...tail remains todo... */
+			if (tcp_highest_sack_seq(tp) == cache->end_seq) {
+				/* ...but better entrypoint exists! */
+				skb = tcp_highest_sack(sk);
+				if (!skb)
+					break;
+				cache++;
+				goto walk;
+			}
+
+			skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
+			/* Check overlap against next cached too (past this one already) */
+			cache++;
+			continue;	/* same sp[i] again, now against the next cache entry */
+		}
+
+		if (!before(start_seq, tcp_highest_sack_seq(tp))) {
+			skb = tcp_highest_sack(sk);
+			if (!skb)
+				break;
+		}
+		skb = tcp_sacktag_skip(skb, sk, start_seq);
+
+walk:
+		skb = tcp_sacktag_walk(skb, sk, next_dup, state,
+				       start_seq, end_seq, dup_sack);
+
+advance_sp:
+		i++;
+	}
+
+	/* Clear the head of the cache sack blocks so we can skip it next time */
+	for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
+		tp->recv_sack_cache[i].start_seq = 0;
+		tp->recv_sack_cache[i].end_seq = 0;
+	}
+	for (j = 0; j < used_sacks; j++)
+		tp->recv_sack_cache[i++] = sp[j];	/* used blocks fill the tail */
+
+	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker)
+		tcp_check_sack_reordering(sk, state->reord, 0);
+
+	tcp_verify_left_out(tp);
+out:
+
+#if FASTRETRANS_DEBUG > 0
+	WARN_ON((int)tp->sacked_out < 0);
+	WARN_ON((int)tp->lost_out < 0);
+	WARN_ON((int)tp->retrans_out < 0);
+	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
+#endif
+	return state->flag;
+}
+
+/* Clamp sacked_out so that sacked_out plus the assumed holes never exceeds
+ * packets_out. Returns true only when sacked_out actually had to be lowered.
+ */
+static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
+{
+	u32 holes = max(tp->lost_out, 1U);	/* assume at least one hole */
+
+	holes = min(holes, tp->packets_out);	/* ...but at most packets_out */
+
+	if (tp->sacked_out + holes <= tp->packets_out)
+		return false;
+
+	/* holes <= packets_out here, so the subtraction cannot wrap */
+	tp->sacked_out = tp->packets_out - holes;
+	return true;
+}
+
+/* Seeing more dupacks than the segments in flight can explain (assuming no
+ * reordering) is taken as evidence of reordering; the only other explanation
+ * would be a bug in the receiver's TCP.
+ */
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+
+	if (tcp_limit_reno_sacked(tp)) {
+		tp->reordering = min_t(u32, tp->packets_out + addend,
+				       READ_ONCE(net->ipv4.sysctl_tcp_max_reordering));
+		tp->reord_seen++;
+		NET_INC_STATS(net, LINUX_MIB_TCPRENOREORDER);
+	}
+}
+
+/* Emulate SACKs for a SACK-less connection: account for new dupack(s). */
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 sacked_before = tp->sacked_out;
+	s32 newly_delivered;
+
+	if (!num_dupack)
+		return;
+	tp->sacked_out += num_dupack;
+	tcp_check_reno_reordering(sk, 0);	/* may clamp sacked_out */
+	/* Only what survived the clamp counts as newly delivered */
+	newly_delivered = tp->sacked_out - sacked_before;
+	if (newly_delivered > 0)
+		tcp_count_delivered(tp, newly_delivered, ece_ack);
+	tcp_verify_left_out(tp);
+}
+
+/* Reno recovery: account for an ACK that newly acknowledged some data. */
+static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (acked > 0) {
+		/* The first ACKed one fills the hole; each further one
+		 * cancels a previously counted duplicate ACK.
+		 */
+		u32 eaten = acked - 1;
+
+		tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
+				    ece_ack);
+		tp->sacked_out -= min(eaten, tp->sacked_out);
+	}
+	tcp_check_reno_reordering(sk, acked);
+	tcp_verify_left_out(tp);
+}
+
+static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
+{
+	tp->sacked_out = 0;	/* without SACK this is the dupack count: forget it */
+}
+
+void tcp_clear_retrans(struct tcp_sock *tp)
+{
+	tp->sacked_out = 0;
+	tp->lost_out = 0;
+	tp->retrans_out = 0;
+	tp->undo_retrans = -1;	/* same "none" value tcp_init_undo() stores */
+	tp->undo_marker = 0;	/* tcp_may_undo() is false while this is zero */
+}
+
+static inline void tcp_init_undo(struct tcp_sock *tp)
+{
+	/* Retransmits still in flight may trigger DSACKs later; -1 if none. */
+	tp->undo_retrans = tp->retrans_out ? tp->retrans_out : -1;
+	tp->undo_marker = tp->snd_una;
+}
+
+static bool tcp_is_rack(const struct sock *sk)
+{
+	return !!(TCP_RACK_LOSS_DETECTION &	/* bit of the per-netns knob */
+		  READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery));
+}
+
+/* Timeout: mark the rtx queue lost. A head skb that is still SACKed means the
+ * receiver reneged: then forget all SACK information and reset the tags,
+ * otherwise preserve SACKs (with RACK, recently sent skbs are spared for now).
+ */
+static void tcp_timeout_mark_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb, *head;
+	bool is_reneg;			/* is receiver reneging on SACKs? */
+
+	head = tcp_rtx_queue_head(sk);
+	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
+	if (is_reneg) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+		tp->sacked_out = 0;
+		/* Mark SACK reneging until we recover from this loss event. */
+		tp->is_sack_reneg = 1;
+	} else if (tcp_is_reno(tp)) {
+		tcp_reset_reno_sack(tp);
+	}
+	/* The head skb is never skipped; see the RACK exception below */
+	skb = head;
+	skb_rbtree_walk_from(skb) {
+		if (is_reneg)
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+		else if (tcp_is_rack(sk) && skb != head &&
+			 tcp_rack_skb_timeout(tp, skb, 0) > 0)
+			continue; /* Don't mark recently sent ones lost yet */
+		tcp_mark_skb_lost(sk, skb);
+	}
+	tcp_verify_left_out(tp);
+	tcp_clear_all_retrans_hints(tp);
+}
+
+/* Enter Loss state: mark the queue lost, cut cwnd/ssthresh, decide on F-RTO. */
+void tcp_enter_loss(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
+	u8 reordering;
+
+	tcp_timeout_mark_lost(sk);	/* also handles apparent SACK reneging */
+
+	/* Reduce ssthresh if it has not yet been made inside this window. */
+	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
+	    !after(tp->high_seq, tp->snd_una) ||
+	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+		/* Save the pre-reduction values so an undo can restore them */
+		tp->prior_ssthresh = tcp_current_ssthresh(sk);
+		tp->prior_cwnd = tcp_snd_cwnd(tp);
+		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		tcp_ca_event(sk, CA_EVENT_LOSS);
+		tcp_init_undo(tp);
+	}
+	tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + 1);
+	tp->snd_cwnd_cnt   = 0;
+	tp->snd_cwnd_stamp = tcp_jiffies32;
+
+	/* Timeout in disordered state after receiving substantial DUPACKs
+	 * suggests that the degree of reordering is over-estimated.
+	 */
+	reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
+	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
+	    tp->sacked_out >= reordering)
+		tp->reordering = min_t(unsigned int, tp->reordering,
+				       reordering);
+
+	tcp_set_ca_state(sk, TCP_CA_Loss);
+	tp->high_seq = tp->snd_nxt;
+	tcp_ecn_queue_cwr(tp);
+
+	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+	 * loss recovery is underway except recurring timeout(s) on
+	 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+	 */
+	tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
+		   (new_recovery || icsk->icsk_retransmits) &&
+		   !inet_csk(sk)->icsk_mtup.probe_size;
+}
+
+/* An ACK that advances snd_una onto data we remember as SACKed means our
+ * SACK scoreboard no longer matches the receiver's real state, i.e. the
+ * receiver _host_ is heavily congested (or buggy).
+ *
+ * Such transient oddities look like reneging, so rather than retransmit a
+ * big burst right away, re-arm the retransmit timer with a short delay of
+ * max(RTT/2, 10ms) and let further ACKs restore sanity. Should the
+ * apparent reneging persist until that RTO, the SACK scoreboard is cleared
+ * then. Returns true when the timer was re-armed this way.
+ */
+static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long delay;
+
+	if (!(*ack_flag & FLAG_SACK_RENEGING) ||
+	    !(*ack_flag & FLAG_SND_UNA_ADVANCED))
+		return false;
+
+	delay = max(usecs_to_jiffies(tp->srtt_us >> 4), msecs_to_jiffies(10));
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
+	*ack_flag &= ~FLAG_SET_XMIT_TIMER;
+
+	return true;
+}
+
+/* Heuristics to calculate number of duplicate ACKs. There's no dupACKs
+ * counter when SACK is enabled (without SACK, sacked_out is used for
+ * that purpose).
+ *
+ * With reordering, holes may still be in flight, so RFC3517 recovery
+ * uses pure sacked_out (total number of SACKed segments) even though
+ * it violates the RFC that uses duplicate ACKs, often these are equal
+ * but when e.g. out-of-window ACKs or packet duplication occurs,
+ * they differ. Since neither occurs due to loss, TCP should really
+ * ignore them.
+ */
+static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
+{
+	return tp->sacked_out + 1;
+}
+
+/* Linux NewReno/SACK/ECN state machine.
+ * --------------------------------------
+ *
+ * "Open"	Normal state, no dubious events, fast path.
+ * "Disorder"   In all the respects it is "Open",
+ *		but requires a bit more attention. It is entered when
+ *		we see some SACKs or dupacks. It is split off from "Open"
+ *		mainly to move some processing from fast path to slow one.
+ * "CWR"	CWND was reduced due to some Congestion Notification event.
+ *		It can be ECN, ICMP source quench, local device congestion.
+ * "Recovery"	CWND was reduced, we are fast-retransmitting.
+ * "Loss"	CWND was reduced due to RTO timeout or SACK reneging.
+ *
+ * tcp_fastretrans_alert() is entered:
+ * - each incoming ACK, if state is not "Open"
+ * - when arrived ACK is unusual, namely:
+ *	* SACK
+ *	* Duplicate ACK.
+ *	* ECN ECE.
+ *
+ * Counting packets in flight is pretty simple.
+ *
+ *	in_flight = packets_out - left_out + retrans_out
+ *
+ *	packets_out is SND.NXT-SND.UNA counted in packets.
+ *
+ *	retrans_out is number of retransmitted segments.
+ *
+ *	left_out is number of segments left network, but not ACKed yet.
+ *
+ *		left_out = sacked_out + lost_out
+ *
+ *     sacked_out: Packets, which arrived to receiver out of order
+ *		   and hence not ACKed. With SACKs this number is simply
+ *		   amount of SACKed data. Even without SACKs
+ *		   it is easy to give pretty reliable estimate of this number,
+ *		   counting duplicate ACKs.
+ *
+ *       lost_out: Packets lost by network. TCP has no explicit
+ *		   "loss notification" feedback from network (for now).
+ *		   It means that this number can be only _guessed_.
+ *		   Actually, it is the heuristics to predict lossage that
+ *		   distinguishes different algorithms.
+ *
+ *	F.e. after RTO, when all the queue is considered as lost,
+ *	lost_out = packets_out and in_flight = retrans_out.
+ *
+ *		Essentially, we have now a few algorithms detecting
+ *		lost packets.
+ *
+ *		If the receiver supports SACK:
+ *
+ *		RFC6675/3517: It is the conventional algorithm. A packet is
+ *		considered lost if the number of higher sequence packets
+ *		SACKed is greater than or equal to the DUPACK threshold
+ *		(reordering). This is implemented in tcp_mark_head_lost and
+ *		tcp_update_scoreboard.
+ *
+ *		RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ *		(2017-) that checks timing instead of counting DUPACKs.
+ *		Essentially a packet is considered lost if it's not S/ACKed
+ *		after RTT + reordering_window, where both metrics are
+ *		dynamically measured and adjusted. This is implemented in
+ *		tcp_rack_mark_lost.
+ *
+ *		If the receiver does not support SACK:
+ *
+ *		NewReno (RFC6582): in Recovery we assume that one segment
+ *		is lost (classic Reno). While we are in Recovery and
+ *		a partial ACK arrives, we assume that one more packet
+ *		is lost (NewReno). These heuristics are the same in NewReno
+ *		and SACK.
+ *
+ * Really tricky (and requiring careful tuning) part of algorithm
+ * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
+ * The first determines the moment _when_ we should reduce CWND and,
+ * hence, slow down forward transmission. In fact, it determines the moment
+ * when we decide that hole is caused by loss, rather than by a reorder.
+ *
+ * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
+ * holes, caused by lost packets.
+ *
+ * And the most logically complicated part of algorithm is undo
+ * heuristics. We detect false retransmits due to both too early
+ * fast retransmit (reordering) and underestimated RTO, analyzing
+ * timestamps and D-SACKs. When we detect that some segments were
+ * retransmitted by mistake and CWND reduction was wrong, we undo
+ * window reduction and abort recovery phase. This logic is hidden
+ * inside several functions named tcp_try_undo_<something>.
+ */
+
+/* Decide whether it is time to leave the Disorder state and enter the
+ * Recovery phase, which reduces the congestion window.
+ *
+ * The question being answered: can forward transmission safely go on
+ * with the same cwnd? (@flag is currently unused here.)
+ */
+static bool tcp_time_to_recover(struct sock *sk, int flag)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Trick #1: the loss is already proven. */
+	if (tp->lost_out)
+		return true;
+
+	/* Classic dupack-threshold rule, used only when RACK is off. */
+	if (tcp_is_rack(sk))
+		return false;
+
+	return tcp_dupack_heuristics(tp) > tp->reordering;
+}
+
+/* Detect loss in event "A" above by marking head of queue up as lost. For
+ * RFC3517 SACK, a segment is considered lost if it has at least tp->reordering
+ * SACKed segments above it; "packets" is the maximum SACKed segments to pass
+ * before reaching this limit. With @mark_head, stop after the first skb.
+ */
+static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	int cnt;
+	/* Use SACK to deduce losses of new sequences sent during recovery */
+	const u32 loss_high = tp->snd_nxt;
+
+	WARN_ON(packets > tp->packets_out);
+	skb = tp->lost_skb_hint;
+	if (skb) {
+		/* Head already handled? */
+		if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
+			return;
+		cnt = tp->lost_cnt_hint;	/* resume counting from the hint */
+	} else {
+		skb = tcp_rtx_queue_head(sk);
+		cnt = 0;
+	}
+
+	skb_rbtree_walk_from(skb) {
+		/* TODO: do this better */
+		/* this is not the most efficient way to do this... */
+		tp->lost_skb_hint = skb;
+		tp->lost_cnt_hint = cnt;
+
+		if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
+			break;
+
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+			cnt += tcp_skb_pcount(skb);
+
+		if (cnt > packets)
+			break;
+
+		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
+			tcp_mark_skb_lost(sk, skb);
+
+		if (mark_head)
+			break;
+	}
+	tcp_verify_left_out(tp);
+}
+
+/* Account newly detected lost packet(s); does nothing without SACK. */
+static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int sacked_upto;
+
+	if (!tcp_is_sack(tp))
+		return;
+	sacked_upto = tp->sacked_out - tp->reordering;
+	if (sacked_upto >= 0)
+		tcp_mark_head_lost(sk, sacked_upto, 0);
+	else if (fast_rexmit)
+		tcp_mark_head_lost(sk, 1, 1);
+}
+
+static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
+{
+	u32 ecr = tp->rx_opt.saw_tstamp ? tp->rx_opt.rcv_tsecr : 0;
+	return ecr && before(ecr, when);	/* 0 means no usable echo */
+}
+
+/* A retransmitted skb counts as spuriously retransmitted when the echoed
+ * timestamp predates the transmission time recorded in that skb.
+ */
+static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
+				     const struct sk_buff *skb)
+{
+	bool rexmitted = TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS;
+	return rexmitted && tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+}
+
+/* True when a first-retransmission stamp is recorded (retrans_stamp != 0)
+ * and the echoed timestamp predates it; always false while it is zero.
+ */
+static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
+{
+	return tp->retrans_stamp ?
+	       tcp_tsopt_ecr_before(tp, tp->retrans_stamp) : false;
+}
+
+/* Undo procedures. */
+
+/* retrans_stamp may be cleared once no retransmission is left in the
+ * window. tp->retrans_out looks like the obvious thing to test, but it
+ * does not cover the case where sending a retransmission for the second
+ * time failed: such a segment can be left with only TCPCB_EVER_RETRANS
+ * set. Checking the head skb as well appears to be enough, apart from
+ * some reneging corner cases that are not worth the effort.
+ *
+ * All this care exists because the time at which a connection is
+ * declared dead depends on retrans_stamp being valid; in particular,
+ * successive retransmissions of a segment must never advance
+ * retrans_stamp.
+ *
+ * Returns true if any retransmission is known to have been done.
+ */
+static bool tcp_any_retrans_done(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct sk_buff *head;
+
+	/* Fast answer: retransmissions are currently counted as out. */
+	if (tp->retrans_out)
+		return true;
+
+	/* Otherwise look for an ever-retransmitted skb at the queue head. */
+	head = tcp_rtx_queue_head(sk);
+
+	return unlikely(head && (TCP_SKB_CB(head)->sacked & TCPCB_EVER_RETRANS));
+}
+
+static void DBGUNDO(struct sock *sk, const char *msg)
+{
+#if FASTRETRANS_DEBUG > 1
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	/* Fields: c=cwnd l=left_out ss=ssthresh/prior_ssthresh p=packets_out */
+	if (sk->sk_family == AF_INET) {
+		pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
+			 msg,
+			 &inet->inet_daddr, ntohs(inet->inet_dport),
+			 tcp_snd_cwnd(tp), tcp_left_out(tp),
+			 tp->snd_ssthresh, tp->prior_ssthresh,
+			 tp->packets_out);
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (sk->sk_family == AF_INET6) {
+		pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
+			 msg,
+			 &sk->sk_v6_daddr, ntohs(inet->inet_dport),
+			 tcp_snd_cwnd(tp), tcp_left_out(tp),
+			 tp->snd_ssthresh, tp->prior_ssthresh,
+			 tp->packets_out);
+	}
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+#endif /* FASTRETRANS_DEBUG > 1; otherwise this function is empty */
+}
+
+static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	/* Undo a cwnd reduction; with @unmark_loss also clear every LOST tag. */
+	if (unmark_loss) {
+		struct sk_buff *skb;
+
+		skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+		}
+		tp->lost_out = 0;
+		tcp_clear_all_retrans_hints(tp);
+	}
+
+	if (tp->prior_ssthresh) {	/* a pre-reduction value was saved */
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+
+		tcp_snd_cwnd_set(tp, icsk->icsk_ca_ops->undo_cwnd(sk));
+
+		if (tp->prior_ssthresh > tp->snd_ssthresh) {
+			tp->snd_ssthresh = tp->prior_ssthresh;
+			tcp_ecn_withdraw_cwr(tp);
+		}
+	}
+	tp->snd_cwnd_stamp = tcp_jiffies32;
+	tp->undo_marker = 0;	/* tcp_may_undo() stays false until re-armed */
+	tp->rack.advanced = 1; /* Force RACK to re-exam losses */
+}
+
+static inline bool tcp_may_undo(const struct tcp_sock *tp)
+{
+	return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
+}
+
+static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
+		/* Hold old state until something *above* high_seq
+		 * is ACKed. For Reno it is MUST to prevent false
+		 * fast retransmits (RFC2582). SACK TCP is safe. */
+		if (!tcp_any_retrans_done(sk))
+			tp->retrans_stamp = 0;
+		return true;
+	}
+	return false;
+}
+
+/* People celebrate: "We love our President!" */
+static bool tcp_try_undo_recovery(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tcp_may_undo(tp)) {
+		int mib_idx;
+
+		/* Happy end! We did not retransmit anything
+		 * or our original transmission succeeded.
+		 */
+		DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
+		tcp_undo_cwnd_reduction(sk, false);
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
+			mib_idx = LINUX_MIB_TCPLOSSUNDO;
+		else
+			mib_idx = LINUX_MIB_TCPFULLUNDO;
+
+		NET_INC_STATS(sock_net(sk), mib_idx);
+	} else if (tp->rack.reo_wnd_persist) {
+		tp->rack.reo_wnd_persist--;
+	}
+	if (tcp_is_non_sack_preventing_reopen(sk))
+		return true;
+	tcp_set_ca_state(sk, TCP_CA_Open);
+	tp->is_sack_reneg = 0;
+	return false;
+}
+
+/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
+static bool tcp_try_undo_dsack(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tp->undo_marker && !tp->undo_retrans) {
+		tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
+					       tp->rack.reo_wnd_persist + 1);
+		DBGUNDO(sk, "D-SACK");
+		tcp_undo_cwnd_reduction(sk, false);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
+		return true;
+	}
+	return false;
+}
+
+/* Undo during loss recovery after partial ACK or using F-RTO. */
+static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (frto_undo || tcp_may_undo(tp)) {
+		tcp_undo_cwnd_reduction(sk, true);
+
+		DBGUNDO(sk, "partial loss");
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+		if (frto_undo)
+			NET_INC_STATS(sock_net(sk),
+					LINUX_MIB_TCPSPURIOUSRTOS);
+		inet_csk(sk)->icsk_retransmits = 0;
+		if (tcp_is_non_sack_preventing_reopen(sk))
+			return true;
+		if (frto_undo || tcp_is_sack(tp)) {
+			tcp_set_ca_state(sk, TCP_CA_Open);
+			tp->is_sack_reneg = 0;
+		}
+		return true;
+	}
+	return false;
+}
+
+/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
+ * It computes the number of packets to send (sndcnt) based on packets newly
+ * delivered:
+ *   1) If the packets in flight is larger than ssthresh, PRR spreads the
+ *	cwnd reductions across a full RTT.
+ *   2) Otherwise PRR uses packet conservation to send as much as delivered.
+ *      But when SND_UNA is acked without further losses,
+ *      slow starts cwnd up to ssthresh to speed up the recovery.
+ */
+static void tcp_init_cwnd_reduction(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->high_seq = tp->snd_nxt;
+	tp->tlp_high_seq = 0;
+	tp->snd_cwnd_cnt = 0;
+	tp->prior_cwnd = tcp_snd_cwnd(tp);
+	tp->prr_delivered = 0;
+	tp->prr_out = 0;
+	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+	tcp_ecn_queue_cwr(tp);
+}
+
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int sndcnt = 0;
+	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
+
+	if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
+		return;
+
+	tp->prr_delivered += newly_acked_sacked;
+	if (delta < 0) {
+		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
+			       tp->prior_cwnd - 1;
+		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
+	} else {
+		sndcnt = max_t(int, tp->prr_delivered - tp->prr_out,
+			       newly_acked_sacked);
+		if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
+			sndcnt++;
+		sndcnt = min(delta, sndcnt);
+	}
+	/* Force a fast retransmit upon entering fast recovery */
+	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
+	tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + sndcnt);
+}
+
+static inline void tcp_end_cwnd_reduction(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (inet_csk(sk)->icsk_ca_ops->cong_control)
+		return;
+
+	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
+	if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
+	    (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) {
+		tcp_snd_cwnd_set(tp, tp->snd_ssthresh);
+		tp->snd_cwnd_stamp = tcp_jiffies32;
+	}
+	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
+}
+
+/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
+void tcp_enter_cwr(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->prior_ssthresh = 0;
+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+		tp->undo_marker = 0;
+		tcp_init_cwnd_reduction(sk);
+		tcp_set_ca_state(sk, TCP_CA_CWR);
+	}
+}
+EXPORT_SYMBOL(tcp_enter_cwr);
+
+static void tcp_try_keep_open(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int state = TCP_CA_Open;
+
+	if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
+		state = TCP_CA_Disorder;
+
+	if (inet_csk(sk)->icsk_ca_state != state) {
+		tcp_set_ca_state(sk, state);
+		tp->high_seq = tp->snd_nxt;
+	}
+}
+
+static void tcp_try_to_open(struct sock *sk, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_verify_left_out(tp);
+
+	if (!tcp_any_retrans_done(sk))
+		tp->retrans_stamp = 0;
+
+	if (flag & FLAG_ECE)
+		tcp_enter_cwr(sk);
+
+	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
+		tcp_try_keep_open(sk);
+	}
+}
+
+static void tcp_mtup_probe_failed(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
+	icsk->icsk_mtup.probe_size = 0;
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
+}
+
+static void tcp_mtup_probe_success(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	u64 val;
+
+	tp->prior_ssthresh = tcp_current_ssthresh(sk);
+
+	val = (u64)tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, tp->mss_cache);
+	do_div(val, icsk->icsk_mtup.probe_size);
+	DEBUG_NET_WARN_ON_ONCE((u32)val != val);
+	tcp_snd_cwnd_set(tp, max_t(u32, 1U, val));
+
+	tp->snd_cwnd_cnt = 0;
+	tp->snd_cwnd_stamp = tcp_jiffies32;
+	tp->snd_ssthresh = tcp_current_ssthresh(sk);
+
+	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
+	icsk->icsk_mtup.probe_size = 0;
+	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
+}
+
+/* Do a simple retransmit without using the backoff mechanisms in
+ * tcp_timer. This is used for path mtu discovery.
+ * The socket is already locked here.
+ */
+void tcp_simple_retransmit(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	int mss;
+
+	/* A fastopen SYN request is stored as two separate packets within
+	 * the retransmit queue, this is done by tcp_send_syn_data().
+	 * As a result simply checking the MSS of the frames in the queue
+	 * will not work for the SYN packet.
+	 *
+	 * Us being here is an indication of a path MTU issue so we can
+	 * assume that the fastopen SYN was lost and just mark all the
+	 * frames in the retransmit queue as lost. We will use an MSS of
+	 * -1 to mark all frames as lost, otherwise compute the current MSS.
+	 */
+	if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
+		mss = -1;
+	else
+		mss = tcp_current_mss(sk);
+
+	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+		if (tcp_skb_seglen(skb) > mss)
+			tcp_mark_skb_lost(sk, skb);
+	}
+
+	tcp_clear_retrans_hints_partial(tp);
+
+	if (!tp->lost_out)
+		return;
+
+	if (tcp_is_reno(tp))
+		tcp_limit_reno_sacked(tp);
+
+	tcp_verify_left_out(tp);
+
+	/* Don't muck with the congestion window here.
+	 * Reason is that we do not increase amount of _data_
+	 * in network, but units changed and effective
+	 * cwnd/ssthresh really reduced now.
+	 */
+	if (icsk->icsk_ca_state != TCP_CA_Loss) {
+		tp->high_seq = tp->snd_nxt;
+		tp->snd_ssthresh = tcp_current_ssthresh(sk);
+		tp->prior_ssthresh = 0;
+		tp->undo_marker = 0;
+		tcp_set_ca_state(sk, TCP_CA_Loss);
+	}
+	tcp_xmit_retransmit_queue(sk);
+}
+EXPORT_SYMBOL(tcp_simple_retransmit);
+
+void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mib_idx;
+
+	if (tcp_is_reno(tp))
+		mib_idx = LINUX_MIB_TCPRENORECOVERY;
+	else
+		mib_idx = LINUX_MIB_TCPSACKRECOVERY;
+
+	NET_INC_STATS(sock_net(sk), mib_idx);
+
+	tp->prior_ssthresh = 0;
+	tcp_init_undo(tp);
+
+	if (!tcp_in_cwnd_reduction(sk)) {
+		if (!ece_ack)
+			tp->prior_ssthresh = tcp_current_ssthresh(sk);
+		tcp_init_cwnd_reduction(sk);
+	}
+	tcp_set_ca_state(sk, TCP_CA_Recovery);
+}
+
+/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
+ * recovered or spurious. Otherwise retransmits more on partial ACKs.
+ */
+static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
+			     int *rexmit)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	bool recovered = !before(tp->snd_una, tp->high_seq);
+
+	if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
+	    tcp_try_undo_loss(sk, false))
+		return;
+
+	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
+		/* Step 3.b. A timeout is spurious if not all data are
+		 * lost, i.e., never-retransmitted data are (s)acked.
+		 */
+		if ((flag & FLAG_ORIG_SACK_ACKED) &&
+		    tcp_try_undo_loss(sk, true))
+			return;
+
+		if (after(tp->snd_nxt, tp->high_seq)) {
+			if (flag & FLAG_DATA_SACKED || num_dupack)
+				tp->frto = 0; /* Step 3.a. loss was real */
+		} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
+			tp->high_seq = tp->snd_nxt;
+			/* Step 2.b. Try send new data (but deferred until cwnd
+			 * is updated in tcp_ack()). Otherwise fall back to
+			 * the conventional recovery.
+			 */
+			if (!tcp_write_queue_empty(sk) &&
+			    after(tcp_wnd_end(tp), tp->snd_nxt)) {
+				*rexmit = REXMIT_NEW;
+				return;
+			}
+			tp->frto = 0;
+		}
+	}
+
+	if (recovered) {
+		/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
+		tcp_try_undo_recovery(sk);
+		return;
+	}
+	if (tcp_is_reno(tp)) {
+		/* A Reno DUPACK means new data in F-RTO step 2.b above are
+		 * delivered. Lower inflight to clock out (re)transmissions.
+		 */
+		if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
+			tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
+		else if (flag & FLAG_SND_UNA_ADVANCED)
+			tcp_reset_reno_sack(tp);
+	}
+	*rexmit = REXMIT_LOST;
+}
+
+static bool tcp_force_fast_retransmit(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return after(tcp_highest_sack_seq(tp),
+		     tp->snd_una + tp->reordering * tp->mss_cache);
+}
+
+/* Undo during fast recovery after partial ACK. */
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
+				 bool *do_lost)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tp->undo_marker && tcp_packet_delayed(tp)) {
+		/* Plain luck! Hole is filled with a delayed
+		 * packet, rather than with a retransmit. Check reordering.
+		 */
+		tcp_check_sack_reordering(sk, prior_snd_una, 1);
+
+		/* We are getting evidence that the reordering degree is higher
+		 * than we realized. If there are no retransmits out then we
+		 * can undo. Otherwise we clock out new packets but do not
+		 * mark more packets lost or retransmit more.
+		 */
+		if (tp->retrans_out)
+			return true;
+
+		if (!tcp_any_retrans_done(sk))
+			tp->retrans_stamp = 0;
+
+		DBGUNDO(sk, "partial recovery");
+		tcp_undo_cwnd_reduction(sk, true);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+		tcp_try_keep_open(sk);
+	} else {
+		/* Partial ACK arrived. Force fast retransmit. */
+		*do_lost = tcp_force_fast_retransmit(sk);
+	}
+	return false;
+}
+
+static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tcp_rtx_queue_empty(sk))
+		return;
+
+	if (unlikely(tcp_is_reno(tp))) {
+		tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
+	} else if (tcp_is_rack(sk)) {
+		u32 prior_retrans = tp->retrans_out;
+
+		if (tcp_rack_mark_lost(sk))
+			*ack_flag &= ~FLAG_SET_XMIT_TIMER;
+		if (prior_retrans > tp->retrans_out)
+			*ack_flag |= FLAG_LOST_RETRANS;
+	}
+}
+
+/* Process an event, which can update packets-in-flight not trivially.
+ * Main goal of this function is to calculate new estimate for left_out,
+ * taking into account both packets sitting in receiver's buffer and
+ * packets lost by network.
+ *
+ * Besides that it updates the congestion state when packet loss or ECN
+ * is detected. But it does not reduce the cwnd, it is done by the
+ * congestion control later.
+ *
+ * It does _not_ decide what to send, it is made in function
+ * tcp_xmit_retransmit_queue().
+ */
+static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+				  int num_dupack, int *ack_flag, int *rexmit)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int fast_rexmit = 0, flag = *ack_flag;
+	bool ece_ack = flag & FLAG_ECE;
+	bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
+				      tcp_force_fast_retransmit(sk));
+
+	if (!tp->packets_out && tp->sacked_out)
+		tp->sacked_out = 0;
+
+	/* Now state machine starts.
+	 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
+	if (ece_ack)
+		tp->prior_ssthresh = 0;
+
+	/* B. In all the states check for reneging SACKs. */
+	if (tcp_check_sack_reneging(sk, ack_flag))
+		return;
+
+	/* C. Check consistency of the current state. */
+	tcp_verify_left_out(tp);
+
+	/* D. Check state exit conditions. State can be terminated
+	 *    when high_seq is ACKed. */
+	if (icsk->icsk_ca_state == TCP_CA_Open) {
+		WARN_ON(tp->retrans_out != 0 && !tp->syn_data);
+		tp->retrans_stamp = 0;
+	} else if (!before(tp->snd_una, tp->high_seq)) {
+		switch (icsk->icsk_ca_state) {
+		case TCP_CA_CWR:
+			/* CWR is to be held until something *above* high_seq
+			 * is ACKed for CWR bit to reach receiver. */
+			if (tp->snd_una != tp->high_seq) {
+				tcp_end_cwnd_reduction(sk);
+				tcp_set_ca_state(sk, TCP_CA_Open);
+			}
+			break;
+
+		case TCP_CA_Recovery:
+			if (tcp_is_reno(tp))
+				tcp_reset_reno_sack(tp);
+			if (tcp_try_undo_recovery(sk))
+				return;
+			tcp_end_cwnd_reduction(sk);
+			break;
+		}
+	}
+
+	/* E. Process state. */
+	switch (icsk->icsk_ca_state) {
+	case TCP_CA_Recovery:
+		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
+			if (tcp_is_reno(tp))
+				tcp_add_reno_sack(sk, num_dupack, ece_ack);
+		} else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
+			return;
+
+		if (tcp_try_undo_dsack(sk))
+			tcp_try_to_open(sk, flag); /* may clear retrans_stamp */
+
+		tcp_identify_packet_loss(sk, ack_flag);
+		if (icsk->icsk_ca_state != TCP_CA_Recovery) {
+			if (!tcp_time_to_recover(sk, flag))
+				return;
+			/* Undo reverts the recovery state. If loss is evident,
+			 * starts a new recovery (e.g. reordering then loss);
+			 */
+			tcp_enter_recovery(sk, ece_ack);
+		}
+		break;
+	case TCP_CA_Loss:
+		tcp_process_loss(sk, flag, num_dupack, rexmit);
+		tcp_identify_packet_loss(sk, ack_flag);
+		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
+		      (*ack_flag & FLAG_LOST_RETRANS)))
+			return;
+		/* Change state if cwnd is undone or retransmits are lost */
+		fallthrough;
+	default:
+		if (tcp_is_reno(tp)) {
+			if (flag & FLAG_SND_UNA_ADVANCED)
+				tcp_reset_reno_sack(tp);
+			tcp_add_reno_sack(sk, num_dupack, ece_ack);
+		}
+
+		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
+			tcp_try_undo_dsack(sk);
+
+		tcp_identify_packet_loss(sk, ack_flag);
+		if (!tcp_time_to_recover(sk, flag)) {
+			tcp_try_to_open(sk, flag);
+			return;
+		}
+
+		/* MTU probe failure: don't reduce cwnd */
+		if (icsk->icsk_ca_state < TCP_CA_CWR &&
+		    icsk->icsk_mtup.probe_size &&
+		    tp->snd_una == tp->mtu_probe.probe_seq_start) {
+			tcp_mtup_probe_failed(sk);
+			/* Restores the reduction we did in tcp_mtup_probe() */
+			tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
+			tcp_simple_retransmit(sk);
+			return;
+		}
+
+		/* Otherwise enter Recovery state */
+		tcp_enter_recovery(sk, ece_ack);
+		fast_rexmit = 1;
+	}
+
+	if (!tcp_is_rack(sk) && do_lost)
+		tcp_update_scoreboard(sk, fast_rexmit);
+	*rexmit = REXMIT_LOST;
+}
+
+static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
+{
+	u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
+		/* If the remote keeps returning delayed ACKs, eventually
+		 * the min filter would pick it up and overestimate the
+		 * prop. delay when it expires. Skip suspected delayed ACKs.
+		 */
+		return;
+	}
+	minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
+			   rtt_us ? : jiffies_to_usecs(1));
+}
+
+static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+			       long seq_rtt_us, long sack_rtt_us,
+			       long ca_rtt_us, struct rate_sample *rs)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
+	 * broken middle-boxes or peers may corrupt TS-ECR fields. But
+	 * Karn's algorithm forbids taking RTT if some retransmitted data
+	 * is acked (RFC6298).
+	 */
+	if (seq_rtt_us < 0)
+		seq_rtt_us = sack_rtt_us;
+
+	/* RTTM Rule: A TSecr value received in a segment is used to
+	 * update the averaged RTT measurement only if the segment
+	 * acknowledges some new data, i.e., only if it advances the
+	 * left edge of the send window.
+	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
+	 */
+	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+	    flag & FLAG_ACKED) {
+		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
+
+		if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
+			if (!delta)
+				delta = 1;
+			seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+			ca_rtt_us = seq_rtt_us;
+		}
+	}
+	rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
+	if (seq_rtt_us < 0)
+		return false;
+
+	/* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
+	 * always taken together with ACK, SACK, or TS-opts. Any negative
+	 * values will be skipped with the seq_rtt_us < 0 check above.
+	 */
+	tcp_update_rtt_min(sk, ca_rtt_us, flag);
+	tcp_rtt_estimator(sk, seq_rtt_us);
+	tcp_set_rto(sk);
+
+	/* RFC6298: only reset backoff on valid RTT measurement. */
+	inet_csk(sk)->icsk_backoff = 0;
+	return true;
+}
+
+/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
+void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
+{
+	struct rate_sample rs;
+	long rtt_us = -1L;
+
+	if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
+		rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);
+
+	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs);
+}
+
+
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
+	tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
+}
+
+/* Restart timer after forward progress on connection.
+ * RFC2988 recommends to restart timer to now+rto.
+ */
+void tcp_rearm_rto(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* If the retrans timer is currently being used by Fast Open
+	 * for SYN-ACK retrans purpose, stay put.
+	 */
+	if (rcu_access_pointer(tp->fastopen_rsk))
+		return;
+
+	if (!tp->packets_out) {
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+	} else {
+		u32 rto = inet_csk(sk)->icsk_rto;
+		/* Offset the time elapsed after installing regular RTO */
+		if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+		    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+			s64 delta_us = tcp_rto_delta_us(sk);
+			/* delta_us may not be positive if the socket is locked
+			 * when the retrans timer fires and is rescheduled.
+			 */
+			rto = usecs_to_jiffies(max_t(int, delta_us, 1));
+		}
+		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+				     TCP_RTO_MAX);
+	}
+}
+
+/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
+static void tcp_set_xmit_timer(struct sock *sk)
+{
+	if (!tcp_schedule_loss_probe(sk, true))
+		tcp_rearm_rto(sk);
+}
+
+/* If we get here, the whole TSO packet has not been acked. */
+static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 packets_acked;
+
+	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
+
+	packets_acked = tcp_skb_pcount(skb);
+	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
+		return 0;
+	packets_acked -= tcp_skb_pcount(skb);
+
+	if (packets_acked) {
+		BUG_ON(tcp_skb_pcount(skb) == 0);
+		BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
+	}
+
+	return packets_acked;
+}
+
+static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
+			   const struct sk_buff *ack_skb, u32 prior_snd_una)
+{
+	const struct skb_shared_info *shinfo;
+
+	/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
+	if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
+		return;
+
+	shinfo = skb_shinfo(skb);
+	if (!before(shinfo->tskey, prior_snd_una) &&
+	    before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
+		tcp_skb_tsorted_save(skb) {
+			__skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK);
+		} tcp_skb_tsorted_restore(skb);
+	}
+}
+
+/* Remove acknowledged frames from the retransmission queue. If our packet
+ * is before the ack sequence we can discard it as it's confirmed to have
+ * arrived at the other end.
+ */
+static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
+			       u32 prior_fack, u32 prior_snd_una,
+			       struct tcp_sacktag_state *sack, bool ece_ack)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	u64 first_ackt, last_ackt;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 prior_sacked = tp->sacked_out;
+	u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
+	struct sk_buff *skb, *next;
+	bool fully_acked = true;
+	long sack_rtt_us = -1L;
+	long seq_rtt_us = -1L;
+	long ca_rtt_us = -1L;
+	u32 pkts_acked = 0;
+	bool rtt_update;
+	int flag = 0;
+
+	first_ackt = 0;
+
+	for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
+		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+		const u32 start_seq = scb->seq;
+		u8 sacked = scb->sacked;
+		u32 acked_pcount;
+
+		/* Determine how many packets and what bytes were acked, tso and else */
+		if (after(scb->end_seq, tp->snd_una)) {
+			if (tcp_skb_pcount(skb) == 1 ||
+			    !after(tp->snd_una, scb->seq))
+				break;
+
+			acked_pcount = tcp_tso_acked(sk, skb);
+			if (!acked_pcount)
+				break;
+			fully_acked = false;
+		} else {
+			acked_pcount = tcp_skb_pcount(skb);
+		}
+
+		if (unlikely(sacked & TCPCB_RETRANS)) {
+			if (sacked & TCPCB_SACKED_RETRANS)
+				tp->retrans_out -= acked_pcount;
+			flag |= FLAG_RETRANS_DATA_ACKED;
+		} else if (!(sacked & TCPCB_SACKED_ACKED)) {
+			last_ackt = tcp_skb_timestamp_us(skb);
+			WARN_ON_ONCE(last_ackt == 0);
+			if (!first_ackt)
+				first_ackt = last_ackt;
+
+			if (before(start_seq, reord))
+				reord = start_seq;
+			if (!after(scb->end_seq, tp->high_seq))
+				flag |= FLAG_ORIG_SACK_ACKED;
+		}
+
+		if (sacked & TCPCB_SACKED_ACKED) {
+			tp->sacked_out -= acked_pcount;
+		} else if (tcp_is_sack(tp)) {
+			tcp_count_delivered(tp, acked_pcount, ece_ack);
+			if (!tcp_skb_spurious_retrans(tp, skb))
+				tcp_rack_advance(tp, sacked, scb->end_seq,
+						 tcp_skb_timestamp_us(skb));
+		}
+		if (sacked & TCPCB_LOST)
+			tp->lost_out -= acked_pcount;
+
+		tp->packets_out -= acked_pcount;
+		pkts_acked += acked_pcount;
+		tcp_rate_skb_delivered(sk, skb, sack->rate);
+
+		/* Initial outgoing SYN's get put onto the write_queue
+		 * just like anything else we transmit.  It is not
+		 * true data, and if we misinform our callers that
+		 * this ACK acks real data, we will erroneously exit
+		 * connection startup slow start one packet too
+		 * quickly.  This is severely frowned upon behavior.
+		 */
+		if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
+			flag |= FLAG_DATA_ACKED;
+		} else {
+			flag |= FLAG_SYN_ACKED;
+			tp->retrans_stamp = 0;
+		}
+
+		if (!fully_acked)
+			break;
+
+		tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
+
+		next = skb_rb_next(skb);
+		if (unlikely(skb == tp->retransmit_skb_hint))
+			tp->retransmit_skb_hint = NULL;
+		if (unlikely(skb == tp->lost_skb_hint))
+			tp->lost_skb_hint = NULL;
+		tcp_highest_sack_replace(sk, skb, next);
+		tcp_rtx_queue_unlink_and_free(skb, sk);
+	}
+
+	if (!skb)
+		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+
+	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
+		tp->snd_up = tp->snd_una;
+
+	if (skb) {
+		tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+			flag |= FLAG_SACK_RENEGING;
+	}
+
+	if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
+		seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
+		ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
+
+		if (pkts_acked == 1 && fully_acked && !prior_sacked &&
+		    (tp->snd_una - prior_snd_una) < tp->mss_cache &&
+		    sack->rate->prior_delivered + 1 == tp->delivered &&
+		    !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
+			/* Conservatively mark a delayed ACK. It's typically
+			 * from a lone runt packet over the round trip to
+			 * a receiver w/o out-of-order or CE events.
+			 */
+			flag |= FLAG_ACK_MAYBE_DELAYED;
+		}
+	}
+	if (sack->first_sackt) {
+		sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
+		ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
+	}
+	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
+					ca_rtt_us, sack->rate);
+
+	if (flag & FLAG_ACKED) {
+		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
+		if (unlikely(icsk->icsk_mtup.probe_size &&
+			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
+			tcp_mtup_probe_success(sk);
+		}
+
+		if (tcp_is_reno(tp)) {
+			tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
+
+			/* If any of the cumulatively ACKed segments was
+			 * retransmitted, non-SACK case cannot confirm that
+			 * progress was due to original transmission due to
+			 * lack of TCPCB_SACKED_ACKED bits even if some of
+			 * the packets may have been never retransmitted.
+			 */
+			if (flag & FLAG_RETRANS_DATA_ACKED)
+				flag &= ~FLAG_ORIG_SACK_ACKED;
+		} else {
+			int delta;
+
+			/* Non-retransmitted hole got filled? That's reordering */
+			if (before(reord, prior_fack))
+				tcp_check_sack_reordering(sk, reord, 0);
+
+			delta = prior_sacked - tp->sacked_out;
+			tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
+		}
+	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
+		   sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
+						    tcp_skb_timestamp_us(skb))) {
+		/* Do not re-arm RTO if the sack RTT is measured from data sent
+		 * after when the head was last (re)transmitted. Otherwise the
+		 * timeout may continue to extend in loss recovery.
+		 */
+		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
+	}
+
+	if (icsk->icsk_ca_ops->pkts_acked) {
+		struct ack_sample sample = { .pkts_acked = pkts_acked,
+					     .rtt_us = sack->rate->rtt_us };
+
+		sample.in_flight = tp->mss_cache *
+			(tp->delivered - sack->rate->prior_delivered);
+		icsk->icsk_ca_ops->pkts_acked(sk, &sample);
+	}
+
+#if FASTRETRANS_DEBUG > 0
+	WARN_ON((int)tp->sacked_out < 0);
+	WARN_ON((int)tp->lost_out < 0);
+	WARN_ON((int)tp->retrans_out < 0);
+	if (!tp->packets_out && tcp_is_sack(tp)) {
+		icsk = inet_csk(sk);
+		if (tp->lost_out) {
+			pr_debug("Leak l=%u %d\n",
+				 tp->lost_out, icsk->icsk_ca_state);
+			tp->lost_out = 0;
+		}
+		if (tp->sacked_out) {
+			pr_debug("Leak s=%u %d\n",
+				 tp->sacked_out, icsk->icsk_ca_state);
+			tp->sacked_out = 0;
+		}
+		if (tp->retrans_out) {
+			pr_debug("Leak r=%u %d\n",
+				 tp->retrans_out, icsk->icsk_ca_state);
+			tp->retrans_out = 0;
+		}
+	}
+#endif
+	return flag;
+}
+
+static void tcp_ack_probe(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *head = tcp_send_head(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Did the window open enough to be usable? */
+	if (!head)
+		return;
+	if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
+		icsk->icsk_backoff = 0;
+		icsk->icsk_probes_tstamp = 0;
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
+		/* Socket must be woken up by a subsequent tcp_data_snd_check().
+		 * This function is not for general use!
+		 */
+	} else {
+		unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
+
+		when = tcp_clamp_probe0_to_user_timeout(sk, when);
+		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
+	}
+}
+
+static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
+{
+	return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+		inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
+}
+
+/* Decide whether to run the increase function of congestion control. */
+static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+{
+	/* If reordering is high then always grow cwnd whenever data is
+	 * delivered regardless of its ordering. Otherwise stay conservative
+	 * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
+	 * new SACK or ECE mark may first advance cwnd here and later reduce
+	 * cwnd in tcp_fastretrans_alert() based on more states.
+	 */
+	if (tcp_sk(sk)->reordering >
+	    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering))
+		return flag & FLAG_FORWARD_PROGRESS;
+
+	return flag & FLAG_DATA_ACKED;
+}
+
+/* The "ultimate" congestion control function that aims to replace the rigid
+ * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
+ * It's called toward the end of processing an ACK with precise rate
+ * information. All transmission or retransmission are delayed afterwards.
+ */
+static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
+			     int flag, const struct rate_sample *rs)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->cong_control) {
+		icsk->icsk_ca_ops->cong_control(sk, rs);
+		return;
+	}
+
+	if (tcp_in_cwnd_reduction(sk)) {
+		/* Reduce cwnd if state mandates */
+		tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag);
+	} else if (tcp_may_raise_cwnd(sk, flag)) {
+		/* Advance cwnd if state allows */
+		tcp_cong_avoid(sk, ack, acked_sacked);
+	}
+	tcp_update_pacing_rate(sk);
+}
+
+/* Check that window update is acceptable.
+ * The function assumes that snd_una<=ack<=snd_next.
+ */
+static inline bool tcp_may_update_window(const struct tcp_sock *tp,
+					const u32 ack, const u32 ack_seq,
+					const u32 nwin)
+{
+	return	after(ack, tp->snd_una) ||
+		after(ack_seq, tp->snd_wl1) ||
+		(ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin));
+}
+
+/* If we update tp->snd_una, also update tp->bytes_acked */
+static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
+{
+	u32 delta = ack - tp->snd_una;
+
+	sock_owned_by_me((struct sock *)tp);
+	tp->bytes_acked += delta;
+	tp->snd_una = ack;
+}
+
+/* If we update tp->rcv_nxt, also update tp->bytes_received */
+static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
+{
+	u32 delta = seq - tp->rcv_nxt;
+
+	sock_owned_by_me((struct sock *)tp);
+	tp->bytes_received += delta;
+	WRITE_ONCE(tp->rcv_nxt, seq);
+}
+
+/* Update our send window.
+ *
+ * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
+ * and in FreeBSD. NetBSD's one is even worse.) is wrong.
+ *
+ * @sk:      socket the ACK was received on
+ * @skb:     incoming segment; its TCP header supplies the advertised window
+ * @ack:     acknowledgment number carried by the segment
+ * @ack_seq: sequence number of the segment carrying the ACK
+ *
+ * Returns FLAG_WIN_UPDATE if tcp_may_update_window() accepted the update,
+ * 0 otherwise. snd_una is advanced to @ack in both cases.
+ */
+static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
+				 u32 ack_seq)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int flag = 0;
+	u32 nwin = ntohs(tcp_hdr(skb)->window);
+
+	/* The advertised window is scaled, except on segments with SYN set */
+	if (likely(!tcp_hdr(skb)->syn))
+		nwin <<= tp->rx_opt.snd_wscale;
+
+	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
+		flag |= FLAG_WIN_UPDATE;
+		tcp_update_wl(tp, ack_seq);
+
+		if (tp->snd_wnd != nwin) {
+			tp->snd_wnd = nwin;
+
+			/* Note, it is the only place, where
+			 * fast path is recovered for sending TCP.
+			 */
+			tp->pred_flags = 0;
+			tcp_fast_path_check(sk);
+
+			if (!tcp_write_queue_empty(sk))
+				tcp_slow_start_after_idle_check(sk);
+
+			/* Track the largest window seen so far and resync
+			 * the MSS when a new maximum is recorded.
+			 */
+			if (nwin > tp->max_window) {
+				tp->max_window = nwin;
+				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
+			}
+		}
+	}
+
+	tcp_snd_una_update(tp, ack);
+
+	return flag;
+}
+
+/* Rate limiter backing the out-of-window ACK checks.
+ *
+ * *last_oow_ack_time holds the tcp_jiffies32 value stored the last time
+ * this function allowed a response; a value of 0 is treated as "no stamp".
+ * Returns true (and bumps @mib_idx) when the stored stamp is younger than
+ * sysctl_tcp_invalid_ratelimit. Otherwise the stamp is refreshed and
+ * false is returned.
+ */
+static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
+				   u32 *last_oow_ack_time)
+{
+	/* Paired with the WRITE_ONCE() in this function. */
+	u32 stamp = READ_ONCE(*last_oow_ack_time);
+
+	if (stamp) {
+		s32 age = (s32)(tcp_jiffies32 - stamp);
+
+		if (age >= 0 &&
+		    age < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
+			NET_INC_STATS(net, mib_idx);
+			/* rate-limited: don't send yet! */
+			return true;
+		}
+	}
+
+	/* Paired with the prior READ_ONCE() and with itself,
+	 * as we might be lockless.
+	 */
+	WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32);
+
+	/* not rate-limited: go ahead, send dupack now! */
+	return false;
+}
+
+/* Return true if we're currently rate-limiting out-of-window ACKs and
+ * thus shouldn't send a dupack right now. We rate-limit dupacks in
+ * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
+ * attacks that send repeated SYNs or ACKs for the same connection. To
+ * do this, we do not send a duplicate SYNACK or ACK if the remote
+ * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
+ *
+ * Segments that carry data and have no SYN are never rate-limited here.
+ */
+bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
+			  int mib_idx, u32 *last_oow_ack_time)
+{
+	bool has_data = TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq;
+
+	/* Data packets without SYNs are not likely part of an ACK loop. */
+	if (has_data && !tcp_hdr(skb)->syn)
+		return false;
+
+	return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
+}
+
+/* RFC 5961 7 [ACK Throttling]
+ *
+ * Send a challenge ACK unless it is suppressed by the per-socket
+ * out-of-window rate limit or by the per-netns challenge ACK budget.
+ * The budget is re-armed once per second with a randomized value derived
+ * from sysctl_tcp_challenge_ack_limit; a limit of INT_MAX disables the
+ * budget check.
+ */
+static void tcp_send_challenge_ack(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	u32 count, now, ack_limit;
+
+	/* First check our per-socket dupack rate limit. */
+	if (__tcp_oow_rate_limited(net,
+				   LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
+				   &tp->last_oow_ack_time))
+		return;
+
+	ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
+	if (ack_limit != INT_MAX) {
+		/* Then check host-wide RFC 5961 rate limit. */
+		now = jiffies / HZ;
+		if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) {
+			u32 half = (ack_limit + 1) >> 1;
+
+			WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now);
+			WRITE_ONCE(net->ipv4.tcp_challenge_count,
+				   get_random_u32_inclusive(half, ack_limit + half - 1));
+		}
+
+		count = READ_ONCE(net->ipv4.tcp_challenge_count);
+		if (!count)
+			return;
+
+		WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1);
+	}
+
+	NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
+	tcp_send_ack(sk);
+}
+
+/* Record the most recently received timestamp value as ts_recent and
+ * note, in seconds, when it was stored.
+ */
+static void tcp_store_ts_recent(struct tcp_sock *tp)
+{
+	struct tcp_options_received *rx = &tp->rx_opt;
+
+	rx->ts_recent = rx->rcv_tsval;
+	rx->ts_recent_stamp = ktime_get_seconds();
+}
+
+/* Refresh ts_recent from the current segment, provided a timestamp was
+ * seen, @seq is not after rcv_wup, and tcp_paws_check() accepts it.
+ */
+static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+{
+	if (!tp->rx_opt.saw_tstamp)
+		return;
+
+	if (after(seq, tp->rcv_wup))
+		return;
+
+	/* PAWS bug workaround wrt. ACK frames, the PAWS discard
+	 * extra check below makes sure this can only happen
+	 * for pure ACK frames.  -DaveM
+	 *
+	 * Not only, also it occurs for expired timestamps.
+	 */
+	if (tcp_paws_check(&tp->rx_opt, 0))
+		tcp_store_ts_recent(tp);
+}
+
+/* This routine deals with acks during a TLP episode and ends an episode by
+ * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
+ *
+ * ACKs below tlp_high_seq are ignored here.
+ */
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (before(ack, tp->tlp_high_seq))
+		return;
+
+	/* Either the TLP carried new data that is now acknowledged, or a
+	 * DSACK shows that both the original and the TLP probe arrived:
+	 * no loss in both cases.
+	 */
+	if (!tp->tlp_retrans || (flag & FLAG_DSACK_TLP)) {
+		tp->tlp_high_seq = 0;
+		return;
+	}
+
+	if (after(ack, tp->tlp_high_seq)) {
+		/* ACK advances: there was a loss, so reduce cwnd. Reset
+		 * tlp_high_seq in tcp_init_cwnd_reduction()
+		 */
+		tcp_init_cwnd_reduction(sk);
+		tcp_set_ca_state(sk, TCP_CA_CWR);
+		tcp_end_cwnd_reduction(sk);
+		tcp_try_keep_open(sk);
+		NET_INC_STATS(sock_net(sk),
+				LINUX_MIB_TCPLOSSPROBERECOVERY);
+		return;
+	}
+
+	/* Pure dupack: original and TLP probe arrived; no loss */
+	if (!(flag & (FLAG_SND_UNA_ADVANCED |
+		      FLAG_NOT_DUP | FLAG_DATA_SACKED)))
+		tp->tlp_high_seq = 0;
+}
+
+/* Forward an incoming-ACK event to the congestion control module, if it
+ * implements the optional ->in_ack_event() hook.
+ */
+static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (!icsk->icsk_ca_ops->in_ack_event)
+		return;
+
+	icsk->icsk_ca_ops->in_ack_event(sk, flags);
+}
+
+/* Congestion control has updated the cwnd already. So if we're in
+ * loss recovery then now we do any new sends (for FRTO) or
+ * retransmits (for CA_Loss or CA_recovery) that make sense.
+ *
+ * Nothing is done for REXMIT_NONE or while the socket is in SYN_SENT.
+ */
+static void tcp_xmit_recovery(struct sock *sk, int rexmit)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (rexmit == REXMIT_NONE)
+		return;
+
+	if (sk->sk_state == TCP_SYN_SENT)
+		return;
+
+	if (unlikely(rexmit == REXMIT_NEW)) {
+		__tcp_push_pending_frames(sk, tcp_current_mss(sk),
+					  TCP_NAGLE_OFF);
+		/* New data went out beyond high_seq: no retransmit now */
+		if (after(tp->snd_nxt, tp->high_seq))
+			return;
+		tp->frto = 0;
+	}
+
+	tcp_xmit_retransmit_queue(sk);
+}
+
+/* Returns the number of packets newly acked or sacked by the current ACK,
+ * i.e. the growth of tp->delivered since @prior_delivered, and adds that
+ * amount to the TCPDELIVERED (and, with FLAG_ECE, TCPDELIVEREDCE) counters.
+ */
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
+{
+	const struct net *net = sock_net(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 newly = tp->delivered - prior_delivered;
+
+	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, newly);
+
+	if (flag & FLAG_ECE)
+		NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, newly);
+
+	return newly;
+}
+
+/* This routine deals with incoming acks, but not outgoing ones.
+ *
+ * @sk:   socket the segment was received on
+ * @skb:  incoming segment carrying the ACK
+ * @flag: FLAG_* bits supplied by the caller; more bits are accumulated
+ *        while the ACK is processed
+ *
+ * Returns 1 when the ACK was processed (with or without packets in
+ * flight), 0 when it was older than snd_una but still acceptable, and a
+ * negated SKB_DROP_REASON_* value when it is too old or acknowledges data
+ * that was never sent.
+ */
+static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_sacktag_state sack_state;
+	struct rate_sample rs = { .prior_delivered = 0 };
+	u32 prior_snd_una = tp->snd_una;
+	bool is_sack_reneg = tp->is_sack_reneg;
+	u32 ack_seq = TCP_SKB_CB(skb)->seq;
+	u32 ack = TCP_SKB_CB(skb)->ack_seq;
+	int num_dupack = 0;
+	int prior_packets = tp->packets_out;
+	u32 delivered = tp->delivered;
+	u32 lost = tp->lost;
+	int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+	u32 prior_fack;
+
+	sack_state.first_sackt = 0;
+	sack_state.rate = &rs;
+	sack_state.sack_delivered = 0;
+
+	/* We very likely will need to access rtx queue. */
+	prefetch(sk->tcp_rtx_queue.rb_node);
+
+	/* If the ack is older than previous acks
+	 * then we can probably ignore it.
+	 */
+	if (before(ack, prior_snd_una)) {
+		u32 max_window;
+
+		/* do not accept ACK for bytes we never sent. */
+		max_window = min_t(u64, tp->max_window, tp->bytes_acked);
+		/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
+		if (before(ack, prior_snd_una - max_window)) {
+			if (!(flag & FLAG_NO_CHALLENGE_ACK))
+				tcp_send_challenge_ack(sk);
+			return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
+		}
+		goto old_ack;
+	}
+
+	/* If the ack includes data we haven't sent yet, discard
+	 * this segment (RFC793 Section 3.9).
+	 */
+	if (after(ack, tp->snd_nxt))
+		return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA;
+
+	if (after(ack, prior_snd_una)) {
+		flag |= FLAG_SND_UNA_ADVANCED;
+		icsk->icsk_retransmits = 0;
+
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+		if (static_branch_unlikely(&clean_acked_data_enabled.key))
+			if (icsk->icsk_clean_acked)
+				icsk->icsk_clean_acked(sk, ack);
+#endif
+	}
+
+	/* Snapshot state before this ACK modifies it */
+	prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
+	rs.prior_in_flight = tcp_packets_in_flight(tp);
+
+	/* ts_recent update must be made after we are sure that the packet
+	 * is in window.
+	 */
+	if (flag & FLAG_UPDATE_TS_RECENT)
+		tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
+	/* Fast path: snd_una advanced and FLAG_SLOWPATH is not set */
+	if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
+	    FLAG_SND_UNA_ADVANCED) {
+		/* Window is constant, pure forward advance.
+		 * No more checks are required.
+		 * Note, we use the fact that SND.UNA>=SND.WL2.
+		 */
+		tcp_update_wl(tp, ack_seq);
+		tcp_snd_una_update(tp, ack);
+		flag |= FLAG_WIN_UPDATE;
+
+		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
+
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
+	} else {
+		u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
+		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
+			flag |= FLAG_DATA;
+		else
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
+
+		flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
+
+		if (TCP_SKB_CB(skb)->sacked)
+			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+							&sack_state);
+
+		if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
+			flag |= FLAG_ECE;
+			ack_ev_flags |= CA_ACK_ECE;
+		}
+
+		if (sack_state.sack_delivered)
+			tcp_count_delivered(tp, sack_state.sack_delivered,
+					    flag & FLAG_ECE);
+
+		if (flag & FLAG_WIN_UPDATE)
+			ack_ev_flags |= CA_ACK_WIN_UPDATE;
+
+		tcp_in_ack_event(sk, ack_ev_flags);
+	}
+
+	/* This is a deviation from RFC3168 since it states that:
+	 * "When the TCP data sender is ready to set the CWR bit after reducing
+	 * the congestion window, it SHOULD set the CWR bit only on the first
+	 * new data packet that it transmits."
+	 * We accept CWR on pure ACKs to be more robust
+	 * with widely-deployed TCP implementations that do this.
+	 */
+	tcp_ecn_accept_cwr(sk, skb);
+
+	/* We passed data and got it acked, remove any soft error
+	 * log. Something worked...
+	 */
+	WRITE_ONCE(sk->sk_err_soft, 0);
+	icsk->icsk_probes_out = 0;
+	tp->rcv_tstamp = tcp_jiffies32;
+	/* Nothing was outstanding when this ACK arrived */
+	if (!prior_packets)
+		goto no_queue;
+
+	/* See if we can take anything off of the retransmit queue. */
+	flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una,
+				    &sack_state, flag & FLAG_ECE);
+
+	tcp_rack_update_reo_wnd(sk, &rs);
+
+	if (tp->tlp_high_seq)
+		tcp_process_tlp_ack(sk, ack, flag);
+
+	if (tcp_ack_is_dubious(sk, flag)) {
+		if (!(flag & (FLAG_SND_UNA_ADVANCED |
+			      FLAG_NOT_DUP | FLAG_DSACKING_ACK))) {
+			num_dupack = 1;
+			/* Consider if pure acks were aggregated in tcp_add_backlog() */
+			if (!(flag & FLAG_DATA))
+				num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+		}
+		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
+				      &rexmit);
+	}
+
+	/* If needed, reset TLP/RTO timer when RACK doesn't set. */
+	if (flag & FLAG_SET_XMIT_TIMER)
+		tcp_set_xmit_timer(sk);
+
+	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
+		sk_dst_confirm(sk);
+
+	/* From here on, 'delivered' and 'lost' hold the deltas caused by
+	 * this ACK rather than the snapshots taken on entry.
+	 */
+	delivered = tcp_newly_delivered(sk, delivered, flag);
+	lost = tp->lost - lost;			/* freshly marked lost */
+	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
+	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
+	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
+	tcp_xmit_recovery(sk, rexmit);
+	return 1;
+
+no_queue:
+	/* If data was DSACKed, see if we can undo a cwnd reduction. */
+	if (flag & FLAG_DSACKING_ACK) {
+		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
+				      &rexmit);
+		tcp_newly_delivered(sk, delivered, flag);
+	}
+	/* If this ack opens up a zero window, clear backoff.  It was
+	 * being used to time the probes, and is probably far higher than
+	 * it needs to be for normal retransmission.
+	 */
+	tcp_ack_probe(sk);
+
+	if (tp->tlp_high_seq)
+		tcp_process_tlp_ack(sk, ack, flag);
+	return 1;
+
+old_ack:
+	/* If data was SACKed, tag it and see if we should send more data.
+	 * If data was DSACKed, see if we can undo a cwnd reduction.
+	 */
+	if (TCP_SKB_CB(skb)->sacked) {
+		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+						&sack_state);
+		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
+				      &rexmit);
+		tcp_newly_delivered(sk, delivered, flag);
+		tcp_xmit_recovery(sk, rexmit);
+	}
+
+	return 0;
+}
+
+/* Store a received Fast Open cookie in @foc.
+ *
+ * @len:     cookie length, i.e. option size minus the option base length
+ * @cookie:  start of the cookie bytes
+ * @syn:     whether the segment has SYN set
+ * @foc:     destination, may be NULL (nothing is stored then)
+ * @exp_opt: recorded in foc->exp
+ *
+ * An in-range length copies the cookie; an out-of-range non-zero length
+ * is recorded as -1; a zero length is recorded as 0.
+ */
+static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
+				      bool syn, struct tcp_fastopen_cookie *foc,
+				      bool exp_opt)
+{
+	/* Valid only in SYN or SYN-ACK with an even length.  */
+	if (!foc || !syn)
+		return;
+	if (len < 0 || (len & 1))
+		return;
+
+	if (len < TCP_FASTOPEN_COOKIE_MIN || len > TCP_FASTOPEN_COOKIE_MAX) {
+		if (len != 0)
+			len = -1;
+	} else {
+		memcpy(foc->val, cookie, len);
+	}
+
+	foc->len = len;
+	foc->exp = exp_opt;
+}
+
+/* Recognize the SMC experimental option.
+ *
+ * @th:     TCP header of the segment
+ * @opt_rx: parsed-options state; smc_ok is set on a match
+ * @ptr:    start of the option payload
+ * @opsize: total size of the option
+ *
+ * Returns true if the option was identified as SMC, false otherwise.
+ * Always returns false when CONFIG_SMC is not enabled.
+ */
+static bool smc_parse_options(const struct tcphdr *th,
+			      struct tcp_options_received *opt_rx,
+			      const unsigned char *ptr,
+			      int opsize)
+{
+#if IS_ENABLED(CONFIG_SMC)
+	if (static_branch_unlikely(&tcp_have_smc)) {
+		/* Only on SYN segments, with an even option size of at
+		 * least TCPOLEN_EXP_SMC_BASE and the SMC magic up front.
+		 */
+		if (th->syn && !(opsize & 1) &&
+		    opsize >= TCPOLEN_EXP_SMC_BASE &&
+		    get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
+			opt_rx->smc_ok = 1;
+			return true;
+		}
+	}
+#endif
+	return false;
+}
+
+/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
+ * value on success.
+ *
+ * A non-zero @user_mss smaller than the received MSS replaces it. Parsing
+ * stops at end-of-list or at the first malformed or truncated option;
+ * whatever MSS was found up to that point is returned.
+ */
+u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
+{
+	const unsigned char *ptr = (const unsigned char *)(th + 1);
+	int length = (th->doff * 4) - sizeof(struct tcphdr);
+	u16 mss = 0;
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		if (opcode == TCPOPT_EOL)
+			break;
+
+		if (opcode == TCPOPT_NOP) {	/* Ref: RFC 793 section 3.1 */
+			length--;
+			continue;
+		}
+
+		if (length < 2)
+			break;
+
+		opsize = *ptr++;
+		/* "silly options", and fail on partial options */
+		if (opsize < 2 || opsize > length)
+			break;
+
+		if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
+			u16 in_mss = get_unaligned_be16(ptr);
+
+			if (in_mss)
+				mss = (user_mss && user_mss < in_mss) ?
+				      user_mss : in_mss;
+		}
+
+		ptr += opsize - 2;
+		length -= opsize;
+	}
+
+	return mss;
+}
+EXPORT_SYMBOL_GPL(tcp_parse_mss_option);
+
+/* Look for tcp options. Normally only called on SYN and SYNACK packets.
+ * But, this can also be called on packets in the established flow when
+ * the fast version below fails.
+ *
+ * @net:    network namespace, consulted for the per-option sysctls
+ * @skb:    segment whose TCP header options are parsed
+ * @opt_rx: output; saw_tstamp and saw_unknown are cleared on entry
+ * @estab:  non-zero for established-flow parsing; the MSS, window scale
+ *          and SACK-permitted options are then ignored, and timestamps
+ *          are only accepted if opt_rx->tstamp_ok is already set
+ * @foc:    Fast Open cookie destination, may be NULL
+ *
+ * Parsing stops silently at end-of-list or at the first malformed or
+ * truncated option.
+ */
+void tcp_parse_options(const struct net *net,
+		       const struct sk_buff *skb,
+		       struct tcp_options_received *opt_rx, int estab,
+		       struct tcp_fastopen_cookie *foc)
+{
+	const unsigned char *ptr;
+	const struct tcphdr *th = tcp_hdr(skb);
+	int length = (th->doff * 4) - sizeof(struct tcphdr);
+
+	ptr = (const unsigned char *)(th + 1);
+	opt_rx->saw_tstamp = 0;
+	opt_rx->saw_unknown = 0;
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return;
+		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
+			length--;
+			continue;
+		default:
+			if (length < 2)
+				return;
+			opsize = *ptr++;
+			if (opsize < 2) /* "silly options" */
+				return;
+			if (opsize > length)
+				return;	/* don't parse partial options */
+			/* ptr now points at the option payload */
+			switch (opcode) {
+			case TCPOPT_MSS:
+				if (opsize == TCPOLEN_MSS && th->syn && !estab) {
+					u16 in_mss = get_unaligned_be16(ptr);
+					if (in_mss) {
+						if (opt_rx->user_mss &&
+						    opt_rx->user_mss < in_mss)
+							in_mss = opt_rx->user_mss;
+						opt_rx->mss_clamp = in_mss;
+					}
+				}
+				break;
+			case TCPOPT_WINDOW:
+				if (opsize == TCPOLEN_WINDOW && th->syn &&
+				    !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) {
+					__u8 snd_wscale = *(__u8 *)ptr;
+					opt_rx->wscale_ok = 1;
+					/* Clamp oversized shift counts instead of rejecting them */
+					if (snd_wscale > TCP_MAX_WSCALE) {
+						net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
+								     __func__,
+								     snd_wscale,
+								     TCP_MAX_WSCALE);
+						snd_wscale = TCP_MAX_WSCALE;
+					}
+					opt_rx->snd_wscale = snd_wscale;
+				}
+				break;
+			case TCPOPT_TIMESTAMP:
+				if ((opsize == TCPOLEN_TIMESTAMP) &&
+				    ((estab && opt_rx->tstamp_ok) ||
+				     (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) {
+					opt_rx->saw_tstamp = 1;
+					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
+					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
+				}
+				break;
+			case TCPOPT_SACK_PERM:
+				if (opsize == TCPOLEN_SACK_PERM && th->syn &&
+				    !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) {
+					opt_rx->sack_ok = TCP_SACK_SEEN;
+					tcp_sack_reset(opt_rx);
+				}
+				break;
+
+			case TCPOPT_SACK:
+				/* Needs at least one whole block and a size that is
+				 * a whole number of blocks; the offset of the option
+				 * from the start of the TCP header is recorded.
+				 */
+				if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
+				   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
+				   opt_rx->sack_ok) {
+					TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
+				}
+				break;
+#ifdef CONFIG_TCP_MD5SIG
+			case TCPOPT_MD5SIG:
+				/* The MD5 Hash has already been
+				 * checked (see tcp_v{4,6}_rcv()).
+				 */
+				break;
+#endif
+			case TCPOPT_FASTOPEN:
+				tcp_parse_fastopen_option(
+					opsize - TCPOLEN_FASTOPEN_BASE,
+					ptr, th->syn, foc, false);
+				break;
+
+			case TCPOPT_EXP:
+				/* Fast Open option shares code 254 using a
+				 * 16 bits magic number.
+				 */
+				if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
+				    get_unaligned_be16(ptr) ==
+				    TCPOPT_FASTOPEN_MAGIC) {
+					tcp_parse_fastopen_option(opsize -
+						TCPOLEN_EXP_FASTOPEN_BASE,
+						ptr + 2, th->syn, foc, true);
+					break;
+				}
+
+				if (smc_parse_options(th, opt_rx, ptr, opsize))
+					break;
+
+				opt_rx->saw_unknown = 1;
+				break;
+
+			default:
+				opt_rx->saw_unknown = 1;
+			}
+			/* Skip the payload; kind and length bytes were consumed above */
+			ptr += opsize-2;
+			length -= opsize;
+		}
+	}
+}
+EXPORT_SYMBOL(tcp_parse_options);
+
+/* Parse a timestamp option laid out as NOP, NOP, TIMESTAMP, length right
+ * after the TCP header. On a match, rcv_tsval and rcv_tsecr are filled in
+ * (a non-zero echo reply is adjusted by tp->tsoffset) and true is
+ * returned; otherwise nothing is modified and false is returned.
+ */
+static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
+{
+	const __be32 *ptr = (const __be32 *)(th + 1);
+
+	if (*ptr != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+			  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
+		return false;
+
+	tp->rx_opt.saw_tstamp = 1;
+	tp->rx_opt.rcv_tsval = ntohl(ptr[1]);
+
+	if (ptr[2])
+		tp->rx_opt.rcv_tsecr = ntohl(ptr[2]) - tp->tsoffset;
+	else
+		tp->rx_opt.rcv_tsecr = 0;
+
+	return true;
+}
+
+/* Fast parse options. This hopes to only see timestamps.
+ * If it is wrong it falls back on tcp_parse_options().
+ *
+ * Returns false when the header carries no options at all (saw_tstamp is
+ * then cleared), true once options have been parsed by either path.
+ */
+static bool tcp_fast_parse_options(const struct net *net,
+				   const struct sk_buff *skb,
+				   const struct tcphdr *th, struct tcp_sock *tp)
+{
+	/* In the spirit of fast parsing, compare doff directly to constant
+	 * values.  Because equality is used, short doff can be ignored here.
+	 */
+	if (th->doff == (sizeof(*th) / 4)) {
+		tp->rx_opt.saw_tstamp = 0;
+		return false;
+	}
+
+	if (tp->rx_opt.tstamp_ok &&
+	    th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4) &&
+	    tcp_parse_aligned_timestamp(tp, th))
+		return true;
+
+	/* Slow path; the echo reply still needs the tsoffset adjustment */
+	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
+	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
+	return true;
+}
+
+#ifdef CONFIG_TCP_MD5SIG
+/*
+ * Parse MD5 Signature option
+ *
+ * Returns a pointer to the option payload (just past the kind and length
+ * bytes) when an MD5 signature option of size TCPOLEN_MD5SIG is found,
+ * NULL otherwise, including when the option list is malformed.
+ */
+const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
+{
+	int length = (th->doff << 2) - sizeof(*th);
+	const u8 *ptr = (const u8 *)(th + 1);
+
+	/* If not enough data remaining, we can short cut */
+	while (length >= TCPOLEN_MD5SIG) {
+		int opcode = *ptr++;
+		int opsize;
+
+		if (opcode == TCPOPT_EOL)
+			return NULL;
+
+		if (opcode == TCPOPT_NOP) {
+			length--;
+			continue;
+		}
+
+		opsize = *ptr++;
+		if (opsize < 2 || opsize > length)
+			return NULL;
+
+		if (opcode == TCPOPT_MD5SIG)
+			return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
+
+		ptr += opsize - 2;
+		length -= opsize;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(tcp_parse_md5sig_option);
+#endif
+
+/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
+ *
+ * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
+ * it can pass through stack. So, the following predicate verifies that
+ * this segment is not used for anything but congestion avoidance or
+ * fast retransmit. Moreover, we even are able to eliminate most of such
+ * second order effects, if we apply some small "replay" window (~RTO)
+ * to timestamp space.
+ *
+ * All these measures still do not guarantee that we reject wrapped ACKs
+ * on networks with high bandwidth, when sequence space is recycled quickly,
+ * but it guarantees that such events will be very rare and do not affect
+ * connection seriously. This doesn't look nice, but alas, PAWS is really
+ * buggy extension.
+ *
+ * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
+ * states that events when retransmit arrives after original data are rare.
+ * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
+ * the biggest problem on large power networks even with minor reordering.
+ * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
+ * up to bandwidth of 18Gigabit/sec. 8) ]
+ */
+
+static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcphdr *th = tcp_hdr(skb);
+	u32 seq = TCP_SKB_CB(skb)->seq;
+	u32 ack = TCP_SKB_CB(skb)->ack_seq;
+
+	/* 1. Pure ACK with correct sequence number. */
+	if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq || seq != tp->rcv_nxt)
+		return 0;
+
+	/* 2. ... and duplicate ACK. */
+	if (ack != tp->snd_una)
+		return 0;
+
+	/* 3. ... and does not update window. */
+	if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale))
+		return 0;
+
+	/* 4. ... and sits in replay window. */
+	return (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ;
+}
+
+/* Tell whether a segment must be discarded by PAWS: it fails
+ * tcp_paws_check() and is not excused as a disordered ACK.
+ */
+static inline bool tcp_paws_discard(const struct sock *sk,
+				   const struct sk_buff *skb)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW))
+		return false;
+
+	return !tcp_disordered_ack(sk, skb);
+}
+
+/* Check segment sequence number for validity.
+ *
+ * Segment controls are considered valid, if the segment
+ * fits to the window after truncation to the window. Acceptability
+ * of data (and SYN, FIN, of course) is checked separately.
+ * See tcp_data_queue(), for example.
+ *
+ * Also, controls (RST is main one) are accepted using RCV.WUP instead
+ * of RCV.NXT. Peer still did not advance his SND.UNA when we
+ * delayed ACK, so that his SND.UNA <= our RCV.WUP.
+ * (borrowed from freebsd)
+ *
+ * Returns SKB_NOT_DROPPED_YET when the segment is acceptable, otherwise
+ * the drop reason describing why it is not.
+ */
+
+static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
+					 u32 seq, u32 end_seq)
+{
+	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
+
+	if (before(end_seq, tp->rcv_wup))
+		reason = SKB_DROP_REASON_TCP_OLD_SEQUENCE;
+	else if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
+		reason = SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
+
+	return reason;
+}
+
+/* When we get a reset we do this.
+ *
+ * Sets sk->sk_err according to the current state, purges the write queue
+ * and finishes the connection via tcp_done(). A socket already in
+ * TCP_CLOSE is left untouched apart from the tracepoint and the MPTCP
+ * option processing.
+ */
+void tcp_reset(struct sock *sk, struct sk_buff *skb)
+{
+	trace_tcp_receive_reset(sk);
+
+	/* mptcp can't tell us to ignore reset pkts,
+	 * so just ignore the return value of mptcp_incoming_options().
+	 */
+	if (sk_is_mptcp(sk))
+		mptcp_incoming_options(sk, skb);
+
+	/* We want the right error as BSD sees it (and indeed as we do). */
+	switch (sk->sk_state) {
+	case TCP_SYN_SENT:
+		WRITE_ONCE(sk->sk_err, ECONNREFUSED);
+		break;
+	case TCP_CLOSE_WAIT:
+		WRITE_ONCE(sk->sk_err, EPIPE);
+		break;
+	case TCP_CLOSE:
+		return;
+	default:
+		WRITE_ONCE(sk->sk_err, ECONNRESET);
+	}
+	/* This barrier is coupled with smp_rmb() in tcp_poll() */
+	smp_wmb();
+
+	tcp_write_queue_purge(sk);
+	tcp_done(sk);
+
+	/* Only report the error if the socket has not been orphaned */
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk_error_report(sk);
+}
+
+/*
+ * 	Process the FIN bit. This now behaves as it is supposed to work
+ *	and the FIN takes effect when it is validly part of sequence
+ *	space. Not before when we get holes.
+ *
+ *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
+ *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
+ *	TIME-WAIT)
+ *
+ *	If we are in FINWAIT-1, a received FIN indicates simultaneous
+ *	close and we go into CLOSING (and later onto TIME-WAIT)
+ *
+ *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
+ *
+ *	In every state an ACK is scheduled, RCV_SHUTDOWN is recorded,
+ *	the out-of-order queue is purged and, unless the socket is dead,
+ *	waiters are woken up.
+ */
+void tcp_fin(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	inet_csk_schedule_ack(sk);
+
+	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
+	sock_set_flag(sk, SOCK_DONE);
+
+	switch (sk->sk_state) {
+	case TCP_SYN_RECV:
+	case TCP_ESTABLISHED:
+		/* Move to CLOSE_WAIT */
+		tcp_set_state(sk, TCP_CLOSE_WAIT);
+		inet_csk_enter_pingpong_mode(sk);
+		break;
+
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSING:
+		/* Received a retransmission of the FIN, do
+		 * nothing.
+		 */
+		break;
+	case TCP_LAST_ACK:
+		/* RFC793: Remain in the LAST-ACK state. */
+		break;
+
+	case TCP_FIN_WAIT1:
+		/* This case occurs when a simultaneous close
+		 * happens, we must ack the received FIN and
+		 * enter the CLOSING state.
+		 */
+		tcp_send_ack(sk);
+		tcp_set_state(sk, TCP_CLOSING);
+		break;
+	case TCP_FIN_WAIT2:
+		/* Received a FIN -- send ACK and enter TIME_WAIT. */
+		tcp_send_ack(sk);
+		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+		break;
+	default:
+		/* Only TCP_LISTEN and TCP_CLOSE are left, in these
+		 * cases we should never reach this piece of code.
+		 */
+		pr_err("%s: Impossible, sk->sk_state=%d\n",
+		       __func__, sk->sk_state);
+		break;
+	}
+
+	/* It _is_ possible, that we have something out-of-order _after_ FIN.
+	 * Probably, we should reset in this case. For now drop them.
+	 */
+	skb_rbtree_purge(&tp->out_of_order_queue);
+	if (tcp_is_sack(tp))
+		tcp_sack_reset(&tp->rx_opt);
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		sk->sk_state_change(sk);
+
+		/* Do not send POLL_HUP for half duplex close. */
+		if (sk->sk_shutdown == SHUTDOWN_MASK ||
+		    sk->sk_state == TCP_CLOSE)
+			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+		else
+			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	}
+}
+
+/* Grow SACK block @sp so that it also covers [@seq, @end_seq), provided
+ * the two ranges overlap or touch. Returns true if @sp now covers the
+ * range, false if the ranges are disjoint and @sp was left unchanged.
+ */
+static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
+				  u32 end_seq)
+{
+	/* Disjoint: the new range starts after the block ends, or the
+	 * block starts after the new range ends.
+	 */
+	if (after(seq, sp->end_seq) || after(sp->start_seq, end_seq))
+		return false;
+
+	if (before(seq, sp->start_seq))
+		sp->start_seq = seq;
+	if (after(end_seq, sp->end_seq))
+		sp->end_seq = end_seq;
+
+	return true;
+}
+
+/* Record [@seq, @end_seq) as the D-SACK block to report, if SACK is in
+ * use on the connection and sysctl_tcp_dsack is enabled. The matching
+ * "old" or "out-of-order" D-SACK counter is bumped.
+ */
+static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mib_idx;
+
+	if (!tcp_is_sack(tp) || !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack))
+		return;
+
+	mib_idx = before(seq, tp->rcv_nxt) ? LINUX_MIB_TCPDSACKOLDSENT :
+					     LINUX_MIB_TCPDSACKOFOSENT;
+	NET_INC_STATS(sock_net(sk), mib_idx);
+
+	tp->rx_opt.dsack = 1;
+	tp->duplicate_sack[0].start_seq = seq;
+	tp->duplicate_sack[0].end_seq = end_seq;
+}
+
+/* Add [@seq, @end_seq) to the pending D-SACK: extend the existing block
+ * if one is already recorded, otherwise start a new one.
+ */
+static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tp->rx_opt.dsack) {
+		tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
+		return;
+	}
+
+	tcp_dsack_set(sk, seq, end_seq);
+}
+
+/* Re-route speculatively when the same segment is received again.
+ *
+ * When the ACK path fails or drops most ACKs, the sender would
+ * timeout and spuriously retransmit the same segment repeatedly.
+ * The receiver remembers and reflects via DSACKs. Leverage the
+ * DSACK state and change the txhash to re-route speculatively.
+ */
+static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
+{
+	/* Only a segment starting where the recorded D-SACK block starts */
+	if (TCP_SKB_CB(skb)->seq != tcp_sk(sk)->duplicate_sack[0].start_seq)
+		return;
+
+	if (sk_rethink_txhash(sk))
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
+}
+
+/* Send an ACK in response to @skb. If the segment carries data that
+ * starts before rcv_nxt, quick-ack mode is entered first and, when
+ * D-SACK is usable, the already-received part is recorded as a D-SACK
+ * block.
+ */
+static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 seq = TCP_SKB_CB(skb)->seq;
+	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+	if (end_seq != seq && before(seq, tp->rcv_nxt)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
+
+		if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
+			tcp_rcv_spurious_retrans(sk, skb);
+
+			/* Report only the part we already have */
+			if (after(end_seq, tp->rcv_nxt))
+				end_seq = tp->rcv_nxt;
+			tcp_dsack_set(sk, seq, end_seq);
+		}
+	}
+
+	tcp_send_ack(sk);
+}
+
+/* These routines update the SACK block as out-of-order packets arrive or
+ * in-order packets close up the sequence space.
+ *
+ * tcp_sack_maybe_coalesce() merges into the first SACK block every other
+ * block that overlaps or touches it, compacting the array and lowering
+ * tp->rx_opt.num_sacks for each block that was absorbed.
+ */
+static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
+{
+	int this_sack;
+	struct tcp_sack_block *sp = &tp->selective_acks[0];
+	struct tcp_sack_block *swalk = sp + 1;
+
+	/* See if the recent change to the first SACK eats into
+	 * or hits the sequence space of other SACK blocks, if so coalesce.
+	 */
+	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
+		if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
+			int i;
+
+			/* Zap SWALK, by moving every further SACK up by one slot.
+			 * Decrease num_sacks.
+			 */
+			tp->rx_opt.num_sacks--;
+			for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
+				sp[i] = sp[i + 1];
+			/* Neither index nor swalk advance: the slot now
+			 * holds the next block, which is examined next.
+			 */
+			continue;
+		}
+		this_sack++;
+		swalk++;
+	}
+}
+
+/* Flush pending compressed ACKs: if any were accumulated, cancel the
+ * compressed-ACK timer, account the compressed ACKs and send one ACK now.
+ * Does nothing when tp->compressed_ack is zero.
+ */
+void tcp_sack_compress_send_ack(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->compressed_ack)
+		return;
+
+	/* A return value of 1 means the timer was cancelled here; drop a
+	 * socket reference in that case.
+	 */
+	if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
+		__sock_put(sk);
+
+	/* Since we have to send one ack finally,
+	 * subtract one from tp->compressed_ack to keep
+	 * LINUX_MIB_TCPACKCOMPRESSED accurate.
+	 */
+	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+		      tp->compressed_ack - 1);
+
+	tp->compressed_ack = 0;
+	tcp_send_ack(sk);
+}
+
+/* Reasonable amount of sack blocks included in TCP SACK option
+ * The max is 4, but this becomes 3 if TCP timestamps are there.
+ * Given that SACK packets might be lost, be conservative and use 2.
+ */
+#define TCP_SACK_BLOCKS_EXPECTED 2
+
+/* Account a newly queued out-of-order range [@seq, @end_seq) in the SACK
+ * block array. The range either extends an existing block, which is then
+ * rotated to the front and coalesced with its neighbours, or becomes a
+ * new head block, pushing the others down and dropping the last one when
+ * the array is full.
+ */
+static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_sack_block *sp = &tp->selective_acks[0];
+	int cur_sacks = tp->rx_opt.num_sacks;
+	int this_sack;
+
+	if (!cur_sacks)
+		goto new_sack;
+
+	for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
+		if (tcp_sack_extend(sp, seq, end_seq)) {
+			/* The extended block sits beyond the expected number
+			 * of reported blocks: flush compressed ACKs first.
+			 */
+			if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
+				tcp_sack_compress_send_ack(sk);
+			/* Rotate this_sack to the first one. */
+			for (; this_sack > 0; this_sack--, sp--)
+				swap(*sp, *(sp - 1));
+			if (cur_sacks > 1)
+				tcp_sack_maybe_coalesce(tp);
+			return;
+		}
+	}
+
+	/* Here this_sack == cur_sacks and sp points one past the last block */
+	if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
+		tcp_sack_compress_send_ack(sk);
+
+	/* Could not find an adjacent existing SACK, build a new one,
+	 * put it at the front, and shift everyone else down.  We
+	 * always know there is at least one SACK present already here.
+	 *
+	 * If the sack array is full, forget about the last one.
+	 */
+	if (this_sack >= TCP_NUM_SACKS) {
+		this_sack--;
+		tp->rx_opt.num_sacks--;
+		sp--;
+	}
+	for (; this_sack > 0; this_sack--, sp--)
+		*sp = *(sp - 1);
+
+new_sack:
+	/* Build the new head SACK, and we're done. */
+	sp->start_seq = seq;
+	sp->end_seq = end_seq;
+	tp->rx_opt.num_sacks++;
+}
+
+/* RCV.NXT advances, some SACKs should be eaten.
+ *
+ * Drops every SACK block whose start is at or before rcv_nxt and compacts
+ * the remaining blocks. If the out-of-order queue is empty, all blocks
+ * are cleared at once.
+ */
+
+static void tcp_sack_remove(struct tcp_sock *tp)
+{
+	struct tcp_sack_block *sp = &tp->selective_acks[0];
+	int num_sacks = tp->rx_opt.num_sacks;
+	int this_sack;
+
+	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
+		tp->rx_opt.num_sacks = 0;
+		return;
+	}
+
+	for (this_sack = 0; this_sack < num_sacks;) {
+		/* Check if the start of the sack is covered by RCV.NXT. */
+		if (!before(tp->rcv_nxt, sp->start_seq)) {
+			int i;
+
+			/* RCV.NXT must cover all the block! */
+			WARN_ON(before(tp->rcv_nxt, sp->end_seq));
+
+			/* Zap this SACK, by moving forward any other SACKS. */
+			for (i = this_sack+1; i < num_sacks; i++)
+				tp->selective_acks[i-1] = tp->selective_acks[i];
+			num_sacks--;
+			/* Re-examine the same slot, which now holds the
+			 * block that followed the removed one.
+			 */
+			continue;
+		}
+		this_sack++;
+		sp++;
+	}
+	tp->rx_opt.num_sacks = num_sacks;
+}
+
+/**
+ * tcp_try_coalesce - try to merge an skb into the one queued before it
+ * @sk: socket whose receive memory accounting is charged on success
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: cleared on entry, then handed to skb_try_coalesce()
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Return: true if caller should free @from instead of queueing it.
+ */
+static bool tcp_try_coalesce(struct sock *sk,
+			     struct sk_buff *to,
+			     struct sk_buff *from,
+			     bool *fragstolen)
+{
+	int delta;
+
+	*fragstolen = false;
+
+	/* Only merge when @from starts exactly where @to ends (no gap/overlap). */
+	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
+		return false;
+
+	if (!mptcp_skb_can_collapse(to, from))
+		return false;
+
+#ifdef CONFIG_TLS_DEVICE
+	if (from->decrypted != to->decrypted)
+		return false;
+#endif
+
+	if (!skb_try_coalesce(to, from, fragstolen, &delta))
+		return false;
+	/* Merged: charge delta to the socket, let @to take over @from's range. */
+	atomic_add(delta, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, delta);
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+	TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
+	/* When @from carries an rx timestamp, @to inherits it. */
+	if (TCP_SKB_CB(from)->has_rxtstamp) {
+		TCP_SKB_CB(to)->has_rxtstamp = true;
+		to->tstamp = from->tstamp;
+		skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp;
+	}
+
+	return true;
+}
+
+static bool tcp_ooo_try_coalesce(struct sock *sk, struct sk_buff *to,
+				 struct sk_buff *from, bool *fragstolen)
+{
+	u32 segs;
+
+	if (!tcp_try_coalesce(sk, to, from, fragstolen))
+		return false;
+
+	/* Keep to->gso_segs usable in case tcp_drop_reason() is called
+	 * later: add both counts (each at least 1), capped at 0xFFFF.
+	 */
+	segs = max_t(u16, 1, skb_shinfo(to)->gso_segs);
+	segs += max_t(u16, 1, skb_shinfo(from)->gso_segs);
+	skb_shinfo(to)->gso_segs = min_t(u32, segs, 0xFFFF);
+	return true;
+}
+
+/* Call sk_drops_add() for @sk, then free @skb via kfree_skb_reason(). */
+static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
+{
+	sk_drops_add(sk, skb);
+	kfree_skb_reason(skb, reason);
+}
+
+/* Move every out_of_order_queue skb that RCV.NXT has reached into the
+ * receive_queue, extending the D-SACK for bytes that were received twice.
+ */
+static void tcp_ofo_queue(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u32 dsack_high = tp->rcv_nxt;
+	bool fin, fragstolen, eaten;
+	struct sk_buff *skb, *tail;
+	struct rb_node *p;
+
+	p = rb_first(&tp->out_of_order_queue);
+	while (p) {
+		skb = rb_to_skb(p);
+		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+			break;
+		/* D-SACK only [seq, min(end_seq, RCV.NXT at entry)) of this skb. */
+		if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
+			__u32 dsack = dsack_high;
+			if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
+				dsack = TCP_SKB_CB(skb)->end_seq;
+			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
+		}
+		p = rb_next(p);
+		rb_erase(&skb->rbnode, &tp->out_of_order_queue);
+		/* skb ends at or before RCV.NXT: it holds nothing new, drop it. */
+		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
+			tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_DROP);
+			continue;
+		}
+		/* Merge into the receive queue tail if possible, else append. */
+		tail = skb_peek_tail(&sk->sk_receive_queue);
+		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
+		tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+		if (!eaten)
+			__skb_queue_tail(&sk->sk_receive_queue, skb);
+		else
+			kfree_skb_partial(skb, fragstolen);
+
+		if (unlikely(fin)) {
+			tcp_fin(sk);
+			/* tcp_fin() purges tp->out_of_order_queue,
+			 * so we must end this loop right now.
+			 */
+			break;
+		}
+	}
+}
+
+static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
+static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
+
+static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+				 unsigned int size)
+{
+	/* Fast path: within sk_rcvbuf and the charge could be scheduled. */
+	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+	    sk_rmem_schedule(sk, skb, size))
+		return 0;
+	if (tcp_prune_queue(sk, skb) < 0)
+		return -1;
+	/* Prune the ofo queue until the charge fits or it stops shrinking. */
+	while (!sk_rmem_schedule(sk, skb, size)) {
+		if (!tcp_prune_ofo_queue(sk, skb))
+			return -1;
+	}
+	return 0;
+}
+
+static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct rb_node **p, *parent;
+	struct sk_buff *skb1;
+	u32 seq, end_seq;
+	bool fragstolen;
+
+	tcp_ecn_check_ce(sk, skb);
+	/* No receive memory could be scheduled for skb: count it and drop. */
+	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
+		sk->sk_data_ready(sk);
+		tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM);
+		return;
+	}
+
+	/* Disable header prediction. */
+	tp->pred_flags = 0;
+	inet_csk_schedule_ack(sk);
+
+	tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+	seq = TCP_SKB_CB(skb)->seq;
+	end_seq = TCP_SKB_CB(skb)->end_seq;
+
+	p = &tp->out_of_order_queue.rb_node;
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
+		/* Initial out of order segment, build 1 SACK. */
+		if (tcp_is_sack(tp)) {
+			tp->rx_opt.num_sacks = 1;
+			tp->selective_acks[0].start_seq = seq;
+			tp->selective_acks[0].end_seq = end_seq;
+		}
+		rb_link_node(&skb->rbnode, NULL, p);
+		rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+		tp->ooo_last_skb = skb;
+		goto end;
+	}
+
+	/* In the typical case, we are adding an skb to the end of the list.
+	 * Trying ooo_last_skb first avoids the O(Log(N)) rbtree lookup.
+	 */
+	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
+				 skb, &fragstolen)) {
+coalesce_done:
+		/* For non sack flows, do not grow window to force DUPACK
+		 * and trigger fast retransmit.
+		 */
+		if (tcp_is_sack(tp))
+			tcp_grow_window(sk, skb, true);
+		kfree_skb_partial(skb, fragstolen);
+		skb = NULL;
+		goto add_sack;
+	}
+	/* skb starts at or after ooo_last_skb's end: link it to its right. */
+	if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
+		parent = &tp->ooo_last_skb->rbnode;
+		p = &parent->rb_right;
+		goto insert;
+	}
+
+	/* Find place to insert this segment. Handle overlaps on the way. */
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+		skb1 = rb_to_skb(parent);
+		if (before(seq, TCP_SKB_CB(skb1)->seq)) {
+			p = &parent->rb_left;
+			continue;
+		}
+		if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+				/* skb1 covers skb: all the bits are present. Drop. */
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPOFOMERGE);
+				tcp_drop_reason(sk, skb,
+						SKB_DROP_REASON_TCP_OFOMERGE);
+				skb = NULL;
+				tcp_dsack_set(sk, seq, end_seq);
+				goto add_sack;
+			}
+			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+				/* Partial overlap: D-SACK the part skb1 holds. */
+				tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
+			} else {
+				/* skb's seq == skb1's seq and skb covers skb1.
+				 * Replace skb1 with skb.
+				 */
+				rb_replace_node(&skb1->rbnode, &skb->rbnode,
+						&tp->out_of_order_queue);
+				tcp_dsack_extend(sk,
+						 TCP_SKB_CB(skb1)->seq,
+						 TCP_SKB_CB(skb1)->end_seq);
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPOFOMERGE);
+				tcp_drop_reason(sk, skb1,
+						SKB_DROP_REASON_TCP_OFOMERGE);
+				goto merge_right;
+			}
+		} else if (tcp_ooo_try_coalesce(sk, skb1,
+						skb, &fragstolen)) {
+			goto coalesce_done;
+		}
+		p = &parent->rb_right;
+	}
+insert:
+	/* Insert segment into RB tree. */
+	rb_link_node(&skb->rbnode, parent, p);
+	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+
+merge_right:
+	/* Remove the following segments that skb covers completely. */
+	while ((skb1 = skb_rb_next(skb)) != NULL) {
+		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+			break;
+		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+					 end_seq);
+			break;
+		}
+		rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
+		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+				 TCP_SKB_CB(skb1)->end_seq);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+		tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE);
+	}
+	/* If there is no skb after us, we are the last_skb ! */
+	if (!skb1)
+		tp->ooo_last_skb = skb;
+
+add_sack:
+	if (tcp_is_sack(tp))
+		tcp_sack_new_ofo_skb(sk, seq, end_seq);
+end:
+	if (skb) {
+		/* skb was queued.  For non sack flows, do not grow window,
+		 * to force DUPACK and trigger fast retransmit.
+		 */
+		if (tcp_is_sack(tp))
+			tcp_grow_window(sk, skb, false);
+		skb_condense(skb);
+		skb_set_owner_r(skb, sk);
+	}
+}
+
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
+				      bool *fragstolen)
+{
+	struct sk_buff *last = skb_peek_tail(&sk->sk_receive_queue);
+	int merged = 0;
+
+	/* Returns 1 when skb was merged into the queue tail, 0 when queued. */
+	if (last && tcp_try_coalesce(sk, last, skb, fragstolen))
+		merged = 1;
+	tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
+	if (merged)
+		return merged;
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+	skb_set_owner_r(skb, sk);
+	return 0;
+}
+
+int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	struct sk_buff *skb;
+	int err = -ENOMEM;
+	int data_len = 0;
+	bool fragstolen;
+
+	if (size == 0)
+		return 0;
+	/* Above a page: whole pages (at most MAX_SKB_FRAGS) go to data_len. */
+	if (size > PAGE_SIZE) {
+		int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
+
+		data_len = npages << PAGE_SHIFT;
+		size = data_len + (size & ~PAGE_MASK);
+	}
+	skb = alloc_skb_with_frags(size - data_len, data_len,
+				   PAGE_ALLOC_COSTLY_ORDER,
+				   &err, sk->sk_allocation);
+	if (!skb)
+		goto err;
+
+	skb_put(skb, size - data_len);
+	skb->data_len = data_len;
+	skb->len = size;
+
+	if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
+		goto err_free;
+	}
+
+	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
+	if (err)
+		goto err_free;
+	/* Sequence the data to start exactly at the current rcv_nxt. */
+	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
+	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
+
+	if (tcp_queue_rcv(sk, skb, &fragstolen)) {
+		WARN_ON_ONCE(fragstolen); /* should not happen */
+		__kfree_skb(skb);
+	}
+	return size;	/* possibly reduced by the page rounding above */
+
+err_free:
+	kfree_skb(skb);
+err:
+	return err;
+
+}
+
+void tcp_data_ready(struct sock *sk)
+{
+	if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE))
+		sk->sk_data_ready(sk);	/* skipped when neither condition holds */
+}
+
+static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	enum skb_drop_reason reason;
+	bool fragstolen;
+	int eaten;
+
+	/* If a subflow has been reset, the packet should not continue
+	 * to be processed, drop the packet.
+	 */
+	if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
+		__kfree_skb(skb);
+		return;
+	}
+	/* A segment that occupies no sequence space has nothing to queue. */
+	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+		__kfree_skb(skb);
+		return;
+	}
+	skb_dst_drop(skb);
+	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
+
+	reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	tp->rx_opt.dsack = 0;
+
+	/*  Queue data for delivery to the user.
+	 *  Packets in sequence go to the receive queue.
+	 *  Out of sequence packets to the out_of_order_queue.
+	 */
+	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+		if (tcp_receive_window(tp) == 0) {
+			reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
+			goto out_of_window;
+		}
+
+		/* Ok. In sequence. In window. */
+queue_and_out:
+		if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+			/* TODO: maybe ratelimit these WIN 0 ACK ? */
+			inet_csk(sk)->icsk_ack.pending |=
+					(ICSK_ACK_NOMEM | ICSK_ACK_NOW);
+			inet_csk_schedule_ack(sk);
+			sk->sk_data_ready(sk);
+			/* Drop unless the receive queue is empty; then force it. */
+			if (skb_queue_len(&sk->sk_receive_queue)) {
+				reason = SKB_DROP_REASON_PROTO_MEM;
+				NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
+				goto drop;
+			}
+			sk_forced_mem_schedule(sk, skb->truesize);
+		}
+
+		eaten = tcp_queue_rcv(sk, skb, &fragstolen);
+		if (skb->len)
+			tcp_event_data_recv(sk, skb);
+		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+			tcp_fin(sk);
+
+		if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
+			tcp_ofo_queue(sk);
+
+			/* RFC5681. 4.2. SHOULD send immediate ACK, when
+			 * gap in queue is filled.
+			 */
+			if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+				inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+		}
+
+		if (tp->rx_opt.num_sacks)
+			tcp_sack_remove(tp);
+
+		tcp_fast_path_check(sk);
+
+		if (eaten > 0)
+			kfree_skb_partial(skb, fragstolen);
+		if (!sock_flag(sk, SOCK_DEAD))
+			tcp_data_ready(sk);
+		return;
+	}
+
+	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+		tcp_rcv_spurious_retrans(sk, skb);
+		/* A retransmit, 2nd most common case.  Force an immediate ack. */
+		reason = SKB_DROP_REASON_TCP_OLD_DATA;
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+out_of_window:
+		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
+		inet_csk_schedule_ack(sk);
+drop:
+		tcp_drop_reason(sk, skb, reason);
+		return;
+	}
+
+	/* Out of window. F.e. zero window probe. */
+	if (!before(TCP_SKB_CB(skb)->seq,
+		    tp->rcv_nxt + tcp_receive_window(tp))) {
+		reason = SKB_DROP_REASON_TCP_OVERWINDOW;
+		goto out_of_window;
+	}
+
+	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+		/* Partial packet, seq < rcv_next < end_seq */
+		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
+
+		/* If window is closed, drop tail of packet. But after
+		 * remembering D-SACK for its head made in previous line.
+		 */
+		if (!tcp_receive_window(tp)) {
+			reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
+			goto out_of_window;
+		}
+		goto queue_and_out;
+	}
+
+	tcp_data_queue_ofo(sk, skb);
+}
+
+static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
+{
+	/* Without a @list the successor is looked up with skb_rb_next(). */
+	if (!list)
+		return skb_rb_next(skb);
+	return skb_queue_is_last(list, skb) ? NULL : skb->next;
+}
+
+static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
+					struct sk_buff_head *list,
+					struct rb_root *root)
+{
+	/* Fetch the successor before @skb is unlinked and freed. */
+	struct sk_buff *following = tcp_skb_next(skb, list);
+
+	if (!list)
+		rb_erase(&skb->rbnode, root);
+	else
+		__skb_unlink(skb, list);
+	__kfree_skb(skb);
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
+
+	return following;
+}
+
+/* Link @skb into @root keyed by TCP_SKB_CB(skb)->seq; equal keys go right. */
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *above = NULL;
+	u32 seq = TCP_SKB_CB(skb)->seq;
+
+	while (*link) {
+		struct sk_buff *cur;
+
+		above = *link;
+		cur = rb_to_skb(above);
+		link = before(seq, TCP_SKB_CB(cur)->seq) ? &above->rb_left :
+							     &above->rb_right;
+	}
+	rb_link_node(&skb->rbnode, above, link);
+	rb_insert_color(&skb->rbnode, root);
+}
+
+/* Collapse contiguous sequence of skbs head..tail with
+ * sequence numbers start..end.  The skbs are on @list when it is
+ * non-NULL, otherwise in the rbtree @root.
+ *
+ * If tail is NULL, this means until the end of the queue.
+ * Segments with FIN/SYN are not collapsed (only because this
+ * simplifies code)
+ */
+static void
+tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+	     struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
+{
+	struct sk_buff *skb = head, *n;
+	struct sk_buff_head tmp;
+	bool end_of_skbs;
+
+	/* First, check that queue is collapsible and find
+	 * the point where collapsing can be useful.
+	 */
+restart:
+	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
+		n = tcp_skb_next(skb, list);
+
+		/* No new bits? It is possible on ofo queue. */
+		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+			skb = tcp_collapse_one(sk, skb, list, root);
+			if (!skb)
+				break;
+			goto restart;
+		}
+
+		/* The first skb to collapse is:
+		 * - not SYN/FIN and
+		 * - bloated or contains data before "start" or
+		 *   overlaps to the next one and mptcp allow collapsing.
+		 */
+		if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
+		    (tcp_win_from_space(sk, skb->truesize) > skb->len ||
+		     before(TCP_SKB_CB(skb)->seq, start))) {
+			end_of_skbs = false;
+			break;
+		}
+
+		if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
+		    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
+			end_of_skbs = false;
+			break;
+		}
+
+		/* Decided to skip this, advance start seq. */
+		start = TCP_SKB_CB(skb)->end_seq;
+	}
+	if (end_of_skbs ||
+	    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+		return;
+
+	__skb_queue_head_init(&tmp);
+	/* Rebuild [start, end) from new skbs of up to SKB_MAX_ORDER(0, 0) bytes. */
+	while (before(start, end)) {
+		int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
+		struct sk_buff *nskb;
+
+		nskb = alloc_skb(copy, GFP_ATOMIC);
+		if (!nskb)
+			break;
+
+		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+#ifdef CONFIG_TLS_DEVICE
+		nskb->decrypted = skb->decrypted;
+#endif
+		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
+		if (list)
+			__skb_queue_before(list, skb, nskb);
+		else
+			__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
+		skb_set_owner_r(nskb, sk);
+		mptcp_skb_ext_move(nskb, skb);
+
+		/* Copy data, releasing collapsed skbs. */
+		while (copy > 0) {
+			int offset = start - TCP_SKB_CB(skb)->seq;
+			int size = TCP_SKB_CB(skb)->end_seq - start;
+
+			BUG_ON(offset < 0);
+			if (size > 0) {
+				size = min(copy, size);
+				if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
+					BUG();
+				TCP_SKB_CB(nskb)->end_seq += size;
+				copy -= size;
+				start += size;
+			}
+			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+				skb = tcp_collapse_one(sk, skb, list, root);
+				if (!skb ||
+				    skb == tail ||
+				    !mptcp_skb_can_collapse(nskb, skb) ||
+				    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+					goto end;
+#ifdef CONFIG_TLS_DEVICE
+				if (skb->decrypted != nskb->decrypted)
+					goto end;
+#endif
+			}
+		}
+	}
+end:
+	skb_queue_walk_safe(&tmp, skb, n)
+		tcp_rbtree_insert(root, skb);
+}
+
+/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
+ * and tcp_collapse() them until all the queue is collapsed.
+ */
+static void tcp_collapse_ofo_queue(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 range_truesize, sum_tiny = 0;
+	struct sk_buff *skb, *head;
+	u32 start, end;
+
+	skb = skb_rb_first(&tp->out_of_order_queue);
+new_range:
+	if (!skb) {
+		tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
+		return;
+	}
+	start = TCP_SKB_CB(skb)->seq;
+	end = TCP_SKB_CB(skb)->end_seq;
+	range_truesize = skb->truesize;
+	/* Grow [start, end) over every following skb that touches the range. */
+	for (head = skb;;) {
+		skb = skb_rb_next(skb);
+
+		/* Range is terminated when we see a gap or when
+		 * we are at the queue end.
+		 */
+		if (!skb ||
+		    after(TCP_SKB_CB(skb)->seq, end) ||
+		    before(TCP_SKB_CB(skb)->end_seq, start)) {
+			/* Do not attempt collapsing a lone skb below a page. */
+			if (range_truesize != head->truesize ||
+			    end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) {
+				tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+					     head, skb, start, end);
+			} else {
+				sum_tiny += range_truesize;
+				if (sum_tiny > sk->sk_rcvbuf >> 3)
+					return;
+			}
+			goto new_range;
+		}
+
+		range_truesize += skb->truesize;
+		if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
+			start = TCP_SKB_CB(skb)->seq;
+		if (after(TCP_SKB_CB(skb)->end_seq, end))
+			end = TCP_SKB_CB(skb)->end_seq;
+	}
+}
+
+/*
+ * Clean the out-of-order queue to make room.
+ * We drop the packets with the highest sequence numbers, in order to:
+ * 1) Give holes a chance to be filled.
+ *    This means we do not drop packets from ooo queue if their sequence
+ *    is before incoming packet sequence.
+ * 2) Not add too much latency if thousands of packets sit there.
+ *    (But if application shrinks SO_RCVBUF, we could still end up
+ *     freeing whole queue here)
+ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
+ *
+ * Return true if queue has shrunk.
+ */
+static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct rb_node *node, *prev;
+	bool pruned = false;
+	int goal;
+
+	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+		return false;
+
+	goal = sk->sk_rcvbuf >> 3;
+	node = &tp->ooo_last_skb->rbnode;
+	/* Walk backwards from ooo_last_skb, freeing one skb per iteration. */
+	do {
+		struct sk_buff *skb = rb_to_skb(node);
+
+		/* If incoming skb would land last in ofo queue, stop pruning. */
+		if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
+			break;
+		pruned = true;
+		prev = rb_prev(node);
+		rb_erase(node, &tp->out_of_order_queue);
+		goal -= skb->truesize;
+		tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
+		tp->ooo_last_skb = rb_to_skb(prev);
+		if (!prev || goal <= 0) {
+			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+			    !tcp_under_memory_pressure(sk))
+				break;
+			goal = sk->sk_rcvbuf >> 3;
+		}
+		node = prev;
+	} while (node);
+
+	if (pruned) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+		/* Reset SACK state.  A conforming SACK implementation will
+		 * do the same at a timeout based retransmit.  When a connection
+		 * is in a sad state like this, we care only about integrity
+		 * of the connection not performance.
+		 */
+		if (tp->rx_opt.sack_ok)
+			tcp_sack_reset(&tp->rx_opt);
+	}
+	return pruned;
+}
+
+/* Reduce allocated memory if we can, trying to get
+ * the socket within its memory limits again.
+ *
+ * Return less than zero if we should start dropping frames
+ * until the socket owning process reads some of the data
+ * to stabilize the situation.
+ */
+static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
+
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+		tcp_clamp_window(sk);
+	else if (tcp_under_memory_pressure(sk))
+		tcp_adjust_rcv_ssthresh(sk);
+
+	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+		return 0;
+	/* Still over sk_rcvbuf: try collapsing the ofo and receive queues. */
+	tcp_collapse_ofo_queue(sk);
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
+			     skb_peek(&sk->sk_receive_queue),
+			     NULL,
+			     tp->copied_seq, tp->rcv_nxt);
+
+	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+		return 0;
+
+	/* Collapsing did not help, destructive actions follow.
+	 * This must not ever occur. */
+
+	tcp_prune_ofo_queue(sk, in_skb);
+
+	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+		return 0;
+
+	/* If we are really being abused, tell the caller to silently
+	 * drop receive data on the floor.  It will get retransmitted
+	 * and hopefully then we'll have sufficient space.
+	 */
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
+
+	/* Massive buffer overcommit. */
+	tp->pred_flags = 0;
+	return -1;
+}
+
+static bool tcp_should_expand_sndbuf(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	int unused_mem;
+
+	/* A send buffer size that the user locked in is left exactly
+	 * as it is.
+	 */
+	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+		return false;
+
+	if (!tcp_under_memory_pressure(sk)) {
+		/* No global TCP memory pressure.  Still do not expand
+		 * under soft global TCP memory pressure ...
+		 */
+		if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
+			return false;
+
+		/* ... nor once the congestion window has been filled. */
+		return tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp);
+	}
+
+	/* Global TCP memory pressure: never expand.
+	 *
+	 * Adjust sndbuf according to reserved mem. But make sure
+	 * it never goes below SOCK_MIN_SNDBUF.
+	 * See sk_stream_moderate_sndbuf() for more details.
+	 */
+	unused_mem = sk_unused_reserved_mem(sk);
+	if (unused_mem > SOCK_MIN_SNDBUF)
+		WRITE_ONCE(sk->sk_sndbuf, unused_mem);
+
+	return false;
+}
+
+static void tcp_new_space(struct sock *sk)
+{
+	/* Expand the send buffer when allowed and refresh snd_cwnd_stamp. */
+	if (tcp_should_expand_sndbuf(sk)) {
+		tcp_sndbuf_expand(sk);
+		tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
+	}
+
+	/* The write_space callback runs whether or not we expanded. */
+	INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
+}
+
+/* The caller has just made space, either by:
+ * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced), or
+ * 2) Sending skbs from the output queue (thus advancing tp->snd_nxt).
+ *
+ * EPOLLOUT may now be generated for the application if:
+ * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
+ * 2) notsent amount (tp->write_seq - tp->snd_nxt) became
+ *    small enough that tcp_stream_memory_free() decides it
+ *    is time to generate EPOLLOUT.
+ */
+void tcp_check_space(struct sock *sk)
+{
+	/* pairs with tcp_poll() */
+	smp_mb();
+	if (!sk->sk_socket || !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+		return;
+
+	tcp_new_space(sk);
+	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+}
+
+static inline void tcp_data_snd_check(struct sock *sk)
+{
+	tcp_push_pending_frames(sk);
+	tcp_check_space(sk);	/* run after the push, in this order */
+}
+
+/*
+ * Check if sending an ack is needed: right now, delayed, or compressed.
+ */
+static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long rtt, delay;
+
+	    /* More than one full frame received... */
+	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+	     /* ... and right edge of window advances far enough.
+	      * (tcp_recvmsg() will send ACK otherwise).
+	      * If application uses SO_RCVLOWAT, we want send ack now if
+	      * we have not received enough bytes to satisfy the condition.
+	      */
+	    (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
+	     __tcp_select_window(sk) >= tp->rcv_wnd)) ||
+	    /* We ACK each frame or... */
+	    tcp_in_quickack_mode(sk) ||
+	    /* Protocol state mandates a one-time immediate ACK */
+	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
+send_now:
+		tcp_send_ack(sk);
+		return;
+	}
+	/* Caller rules out ofo data, or none is queued: delay the ACK. */
+	if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
+		tcp_send_delayed_ack(sk);
+		return;
+	}
+
+	if (!tcp_is_sack(tp) ||
+	    tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
+		goto send_now;
+
+	if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
+		tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
+		tp->dup_ack_counter = 0;
+	}
+	if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
+		tp->dup_ack_counter++;
+		goto send_now;
+	}
+	tp->compressed_ack++;
+	if (hrtimer_is_queued(&tp->compressed_ack_timer))
+		return;
+
+	/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
+	/* NOTE(review): 5 % holds if rtt is in usec << 3 - TODO confirm. */
+	rtt = tp->rcv_rtt_est.rtt_us;
+	if (tp->srtt_us && tp->srtt_us < rtt)
+		rtt = tp->srtt_us;
+
+	delay = min_t(unsigned long,
+		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
+		      rtt * (NSEC_PER_USEC >> 3)/20);
+	sock_hold(sk);
+	hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
+			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
+			       HRTIMER_MODE_REL_PINNED_SOFT);
+}
+
+static inline void tcp_ack_snd_check(struct sock *sk)
+{
+	/* Nothing to do unless an ACK is scheduled: when it is not, we
+	 * sent a data segment already.
+	 */
+	if (inet_csk_ack_scheduled(sk))
+		__tcp_ack_snd_check(sk, 1);
+}
+
+/*
+ *	This routine is only called when we have urgent data
+ *	signaled. It is the 'slow' part of tcp_urg. It could be
+ *	moved inline now as tcp_urg is only called from one
+ *	place. We handle URGent data wrong. We have to - as
+ *	BSD still doesn't use the correction from RFC961.
+ *	For 1003.1g we should support a new option TCP_STDURG to permit
+ *	either form (or just set the sysctl tcp_stdurg).
+ */
+
+static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 ptr = ntohs(th->urg_ptr);
+	/* Without sysctl tcp_stdurg a non-zero pointer is moved back by one. */
+	if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg))
+		ptr--;
+	ptr += ntohl(th->seq);
+
+	/* Ignore urgent data that we've already seen and read. */
+	if (after(tp->copied_seq, ptr))
+		return;
+
+	/* Do not replay urg ptr.
+	 *
+	 * NOTE: interesting situation not covered by specs.
+	 * A misbehaving sender may send an urg ptr pointing to a segment
+	 * which we already have in the ofo queue. We are not able to fetch
+	 * such data and will stay in TCP_URG_NOTYET until it is eaten
+	 * by recvmsg(). It seems we are not obliged to handle such wicked
+	 * situations. But it is worth thinking about the possibility of
+	 * DoSes using some hypothetical application level deadlock.
+	 */
+	if (before(ptr, tp->rcv_nxt))
+		return;
+
+	/* Do we already have a newer (or duplicate) urgent pointer? */
+	if (tp->urg_data && !after(ptr, tp->urg_seq))
+		return;
+
+	/* Tell the world about our new urgent pointer. */
+	sk_send_sigurg(sk);
+
+	/* We may be adding urgent data when the last byte read was
+	 * urgent. To do this requires some care. We cannot just ignore
+	 * tp->copied_seq since we would read the last urgent byte again
+	 * as data, nor can we alter copied_seq until this data arrives
+	 * or we break the semantics of SIOCATMARK (and thus sockatmark())
+	 *
+	 * NOTE. Double Dutch. Rendering to plain English: the author of the
+	 * comment above did something like send("A", MSG_OOB); send("B", MSG_OOB);
+	 * and expected both A and B to disappear from the stream. This is _wrong_.
+	 * Though this happens in BSD with high probability, it is incidental.
+	 * Any application relying on this is buggy. Note also that the fix "works"
+	 * only in this artificial test. Insert some normal data between A and B
+	 * and we diverge from BSD again. Verdict: it is better to remove it to
+	 * trap buggy users.
+	 */
+	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
+	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
+		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+		tp->copied_seq++;
+		if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
+			__skb_unlink(skb, &sk->sk_receive_queue);
+			__kfree_skb(skb);
+		}
+	}
+
+	WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET);
+	WRITE_ONCE(tp->urg_seq, ptr);
+
+	/* Disable header prediction. */
+	tp->pred_flags = 0;
+}
+
+/* This is the 'fast' part of urgent handling. */
+static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Check if we get a new urgent pointer - normally not. */
+	if (unlikely(th->urg))
+		tcp_check_urg(sk, th);
+
+	/* Do we wait for any urgent data? - normally not... */
+	if (unlikely(tp->urg_data == TCP_URG_NOTYET)) {
+		u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
+			  th->syn;
+		/* NOTE(review): doff * 4 presumably skips a header at skb->data. */
+		/* Is the urgent pointer pointing into this packet? */
+		if (ptr < skb->len) {
+			u8 tmp;
+			if (skb_copy_bits(skb, ptr, &tmp, 1))
+				BUG();
+			WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp);
+			if (!sock_flag(sk, SOCK_DEAD))
+				sk->sk_data_ready(sk);
+		}
+	}
+}
+
+/* Accept a RST whose sequence is rcv_nxt - 1 once a FIN was received.
+ * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
+ * FIN is sent followed by a RST packet. The RST carries the same
+ * sequence number as the FIN, so according to RFC 5961 a challenge
+ * ACK should be sent. However, Mac OSX rate limits replies to challenge
+ * ACKs on the closed socket. In addition middleboxes can drop either the
+ * challenge ACK or a subsequent RST.
+ */
+static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
+{
+	const int post_fin = TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSING;
+
+	if (TCP_SKB_CB(skb)->seq != tcp_sk(sk)->rcv_nxt - 1)
+		return false;
+	return unlikely(((1 << sk->sk_state) & post_fin) != 0);
+}
+
+/* PAWS and seqno based validation of an incoming segment; the TCP flags
+ * matter. Returns true to go on, false once the skb was dropped or freed.
+ */
+static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
+				  const struct tcphdr *th, int syn_inerr)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	SKB_DR(reason);
+
+	/* RFC1323: H1. Apply PAWS check first. */
+	if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
+	    tp->rx_opt.saw_tstamp &&
+	    tcp_paws_discard(sk, skb)) {
+		if (!th->rst) {
+			if (unlikely(th->syn))
+				goto syn_challenge;
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+			if (!tcp_oow_rate_limited(sock_net(sk), skb,
+						  LINUX_MIB_TCPACKSKIPPEDPAWS,
+						  &tp->last_oow_ack_time))
+				tcp_send_dupack(sk, skb);
+			SKB_DR_SET(reason, TCP_RFC7323_PAWS);
+			goto discard;
+		}
+		/* Reset is accepted even if it did not pass PAWS. */
+	}
+
+	/* Step 1: check sequence number */
+	reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+	if (reason) {
+		/* RFC793, page 37: "In all states except SYN-SENT, all reset
+		 * (RST) segments are validated by checking their SEQ-fields."
+		 * And page 69: "If an incoming segment is not acceptable,
+		 * an acknowledgment should be sent in reply (unless the RST
+		 * bit is set, if so drop the segment and return)".
+		 */
+		if (!th->rst) {
+			if (th->syn)
+				goto syn_challenge;
+			if (!tcp_oow_rate_limited(sock_net(sk), skb,
+						  LINUX_MIB_TCPACKSKIPPEDSEQ,
+						  &tp->last_oow_ack_time))
+				tcp_send_dupack(sk, skb);
+		} else if (tcp_reset_check(sk, skb)) {
+			goto reset;
+		}
+		goto discard;
+	}
+
+	/* Step 2: check RST bit */
+	if (th->rst) {
+		/* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
+		 * FIN and SACK too if available):
+		 * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
+		 * the right-most SACK block,
+		 * then
+		 *     RESET the connection
+		 * else
+		 *     Send a challenge ACK
+		 */
+		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
+		    tcp_reset_check(sk, skb))
+			goto reset;
+
+		if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
+			struct tcp_sack_block *sp = &tp->selective_acks[0];
+			u32 max_sack = sp[0].end_seq;
+			int this_sack;
+
+			for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
+			     ++this_sack) {
+				/* Keep the right-most end_seq, wrap-safe. */
+				if (after(sp[this_sack].end_seq, max_sack))
+					max_sack = sp[this_sack].end_seq;
+			}
+
+			if (TCP_SKB_CB(skb)->seq == max_sack)
+				goto reset;
+		}
+
+		/* Disable TFO if RST is out-of-order
+		 * and no data has been received
+		 * for current active TFO socket
+		 */
+		if (tp->syn_fastopen && !tp->data_segs_in &&
+		    sk->sk_state == TCP_ESTABLISHED)
+			tcp_fastopen_active_disable(sk);
+		tcp_send_challenge_ack(sk);
+		SKB_DR_SET(reason, TCP_RESET);
+		goto discard;
+	}
+
+	/* step 3: check security and precedence [ignored] */
+
+	/* step 4: Check for a SYN
+	 * RFC 5961 4.2 : Send a challenge ack
+	 */
+	if (th->syn) {
+syn_challenge:
+		if (syn_inerr)
+			TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
+		tcp_send_challenge_ack(sk);
+		SKB_DR_SET(reason, TCP_INVALID_SYN);
+		goto discard;
+	}
+
+	bpf_skops_parse_hdr(sk, skb);
+
+	return true;
+
+discard:
+	tcp_drop_reason(sk, skb, reason);
+	return false;
+
+reset:
+	tcp_reset(sk, skb);
+	__kfree_skb(skb);
+	return false;
+}
+
+/*
+ *	TCP receive function for the ESTABLISHED state.
+ *
+ *	It is split into a fast path and a slow path. The fast path is
+ *	disabled when:
+ *	- A zero window was announced from us - zero window probing
+ *	  is only handled properly in the slow path.
+ *	- Out of order segments arrived.
+ *	- Urgent data is expected.
+ *	- There is no buffer space left.
+ *	- Unexpected TCP flags/window values/header lengths are received
+ *	  (detected by checking the TCP header against pred_flags).
+ *	- Data is sent in both directions. Fast path only supports pure senders
+ *	  or pure receivers (this means either the sequence number or the ack
+ *	  value must stay constant).
+ *	- Unexpected TCP option.
+ *
+ *	When the fast path cannot be taken it drops into a standard
+ *	receive procedure patterned after RFC793 to handle all cases.
+ *	The first three cases are guaranteed by proper pred_flags setting,
+ *	the rest is checked inline. Fast processing is turned on in
+ *	tcp_data_queue when everything is OK.
+ */
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
+{
+	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	const struct tcphdr *th = (const struct tcphdr *)skb->data;
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int len = skb->len;
+
+	/* TCP congestion window tracking */
+	trace_tcp_probe(sk, skb);
+
+	tcp_mstamp_refresh(tp);
+	if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
+		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
+	/*
+	 *	Header prediction.
+	 *	The code loosely follows the one in the famous
+	 *	"30 instruction TCP receive" Van Jacobson mail.
+	 *
+	 *	Van's trick is to deposit buffers into socket queue
+	 *	on a device interrupt, to call tcp_recv function
+	 *	on the receive process context and checksum and copy
+	 *	the buffer to user space. smart...
+	 *
+	 *	Our current scheme is not silly either but we take the
+	 *	extra cost of the net_bh soft interrupt processing...
+	 *	We do checksum and copy also but from device to kernel.
+	 */
+
+	tp->rx_opt.saw_tstamp = 0;
+
+	/*	pred_flags is 0xS?10 << 16 + snd_wnd
+	 *	if header_prediction is to be made
+	 *	'S' will always be tp->tcp_header_len >> 2
+	 *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
+	 *	turn it off (when there are holes in the receive
+	 *	space for instance)
+	 *	PSH flag is ignored.
+	 */
+
+	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
+	    TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+	    !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+		int tcp_header_len = tp->tcp_header_len;
+
+		/* Timestamp header prediction: tcp_header_len
+		 * is automatically equal to th->doff*4 due to pred_flags
+		 * match.
+		 */
+
+		/* Check timestamp */
+		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+			/* Aligned timestamp not parsed? Slow path! */
+			if (!tcp_parse_aligned_timestamp(tp, th))
+				goto slow_path;
+
+			/* If PAWS failed, check it more carefully in slow path */
+			if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+				goto slow_path;
+
+			/* DO NOT update ts_recent here: if the checksum fails
+			 * and the timestamp was the corrupted part, it will
+			 * result in a hung connection since we will drop all
+			 * future packets due to the PAWS test.
+			 */
+		}
+
+		if (len <= tcp_header_len) {
+			/* Bulk data transfer: sender */
+			if (len == tcp_header_len) {
+				/* Predicted packet is in window by definition.
+				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+				 * Hence, check seq<=rcv_wup reduces to:
+				 */
+				if (tcp_header_len ==
+				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+				    tp->rcv_nxt == tp->rcv_wup)
+					tcp_store_ts_recent(tp);
+
+				/* We know that such packets are checksummed
+				 * on entry.
+				 */
+				tcp_ack(sk, skb, 0);
+				__kfree_skb(skb);
+				tcp_data_snd_check(sk);
+				/* When receiving pure ack in fast path, update
+				 * last ts ecr directly instead of calling
+				 * tcp_rcv_rtt_measure_ts()
+				 */
+				tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
+				return;
+			} else { /* Header too small */
+				reason = SKB_DROP_REASON_PKT_TOO_SMALL;
+				TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+				goto discard;
+			}
+		} else {
+			int eaten = 0;
+			bool fragstolen = false;
+
+			if (tcp_checksum_complete(skb))
+				goto csum_error;
+
+			if ((int)skb->truesize > sk->sk_forward_alloc)
+				goto step5;
+
+			/* Predicted packet is in window by definition.
+			 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+			 * Hence, check seq<=rcv_wup reduces to:
+			 */
+			if (tcp_header_len ==
+			    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+			    tp->rcv_nxt == tp->rcv_wup)
+				tcp_store_ts_recent(tp);
+
+			tcp_rcv_rtt_measure_ts(sk, skb);
+
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
+
+			/* Bulk data transfer: receiver */
+			skb_dst_drop(skb);
+			__skb_pull(skb, tcp_header_len);
+			eaten = tcp_queue_rcv(sk, skb, &fragstolen);
+
+			tcp_event_data_recv(sk, skb);
+
+			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
+				/* Well, only one small jumplet in fast path... */
+				tcp_ack(sk, skb, FLAG_DATA);
+				tcp_data_snd_check(sk);
+				if (!inet_csk_ack_scheduled(sk))
+					goto no_ack;
+			} else {
+				tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
+			}
+
+			__tcp_ack_snd_check(sk, 0);
+no_ack:
+			if (eaten)
+				kfree_skb_partial(skb, fragstolen);
+			tcp_data_ready(sk);
+			return;
+		}
+	}
+
+slow_path:
+	if (len < (th->doff << 2) || tcp_checksum_complete(skb))
+		goto csum_error;
+
+	if (!th->ack && !th->rst && !th->syn) {
+		reason = SKB_DROP_REASON_TCP_FLAGS;
+		goto discard;
+	}
+
+	/*
+	 *	Standard slow path.
+	 */
+
+	if (!tcp_validate_incoming(sk, skb, th, 1))
+		return;
+
+step5:
+	reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT);
+	if ((int)reason < 0) {
+		reason = -reason;
+		goto discard;
+	}
+	tcp_rcv_rtt_measure_ts(sk, skb);
+
+	/* Process urgent data. */
+	tcp_urg(sk, skb, th);
+
+	/* step 7: process the segment text */
+	tcp_data_queue(sk, skb);
+
+	tcp_data_snd_check(sk);
+	tcp_ack_snd_check(sk);
+	return;
+
+csum_error:
+	reason = SKB_DROP_REASON_TCP_CSUM;
+	trace_tcp_bad_csum(skb);
+	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+
+discard:
+	tcp_drop_reason(sk, skb, reason);
+}
+EXPORT_SYMBOL(tcp_rcv_established);
+
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 init_cwnd;
+
+	tcp_mtup_init(sk);
+	icsk->icsk_af_ops->rebuild_header(sk);
+	tcp_init_metrics(sk);
+
+	/* Initial cwnd for the transfer: cut to 1 per RFC5681 if SYN or
+	 * SYN-ACK was retransmitted; given RFC6298's more aggressive 1sec
+	 * initRTO, only when more than one such retransmission occurred.
+	 */
+	if (tp->total_retrans > 1 && tp->undo_marker)
+		init_cwnd = 1;
+	else
+		init_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
+	tcp_snd_cwnd_set(tp, init_cwnd);
+	tp->snd_cwnd_stamp = tcp_jiffies32;
+
+	bpf_skops_established(sk, bpf_op, skb);
+	/* BPF may already have set up congestion control; do it otherwise. */
+	if (!icsk->icsk_ca_initialized)
+		tcp_init_congestion_control(sk);
+	tcp_init_buffer_space(sk);
+}
+
+void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_set_state(sk, TCP_ESTABLISHED);
+	icsk->icsk_ack.lrcvtime = tcp_jiffies32;
+
+	if (skb) {
+		icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
+		security_inet_conn_established(sk, skb);
+		sk_mark_napi_id(sk, skb);
+	}
+
+	tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
+
+	/* Stamp the last-send time now so that the first data packet
+	 * does not trigger a spurious tcp_cwnd_restart().
+	 */
+	tp->lsndtime = tcp_jiffies32;
+
+	if (sock_flag(sk, SOCK_KEEPOPEN))
+		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+	if (tp->rx_opt.snd_wscale)
+		tp->pred_flags = 0;
+	else
+		__tcp_fast_path_on(tp, tp->snd_wnd);
+}
+
+/* Handle the SYN-ACK answering a Fast Open SYN and update the fastopen cache.
+ * Returns true if data carried by the SYN had to be marked lost and resent.
+ */
+static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+				    struct tcp_fastopen_cookie *cookie)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
+	u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
+	bool syn_drop = false;
+
+	if (mss == tp->rx_opt.user_mss) {
+		struct tcp_options_received opt;
+
+		/* mss_clamp came from the user MSS: re-read the SYNACK's own */
+		tcp_clear_options(&opt);
+		opt.user_mss = opt.mss_clamp = 0;
+		tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
+		mss = opt.mss_clamp;
+	}
+
+	if (!tp->syn_fastopen) {
+		/* We never asked for a cookie: ignore an unsolicited one */
+		cookie->len = -1;
+	} else if (tp->total_retrans) {
+		/* SYN timed out and the SYN-ACK neither has a cookie nor
+		 * acknowledges data. Presumably the remote received only
+		 * the retransmitted (regular) SYNs: either the original
+		 * SYN-data or the corresponding SYN-ACK was dropped.
+		 */
+		syn_drop = (cookie->len < 0 && data);
+	} else if (cookie->len < 0 && !tp->syn_data) {
+		/* We requested a cookie but didn't get it. If we did not use
+		 * the (old) exp opt format then try so next time (try_exp=1).
+		 * Otherwise we go back to use the RFC7413 opt (try_exp=2).
+		 */
+		try_exp = tp->syn_fastopen_exp ? 2 : 1;
+	}
+
+	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
+
+	if (data) { /* Retransmit unacked data in SYN */
+		tp->fastopen_client_fail = tp->total_retrans ?
+			TFO_SYN_RETRANSMITTED : TFO_DATA_NOT_ACKED;
+		skb_rbtree_walk_from(data)
+			tcp_mark_skb_lost(sk, data);
+		tcp_xmit_retransmit_queue(sk);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+		return true;
+	}
+	tp->syn_data_acked = tp->syn_data;
+	if (tp->syn_data_acked) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+		/* SYN-data is counted as two separate packets in tcp_ack() */
+		if (tp->delivered > 1)
+			--tp->delivered;
+	}
+
+	tcp_fastopen_add_skb(sk, synack);
+
+	return false;
+}
+
+/* Clear tp->syn_smc when the received options did not set smc_ok. */
+static void smc_check_reset_syn(struct tcp_sock *tp)
+{
+#if IS_ENABLED(CONFIG_SMC)
+	if (static_branch_unlikely(&tcp_have_smc) &&
+	    tp->syn_smc && !tp->rx_opt.smc_ok)
+		tp->syn_smc = 0;
+#endif
+}
+
+static void tcp_try_undo_spurious_syn(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 syn_stamp = tp->retrans_stamp;
+
+	/* undo_marker is set when SYN or SYNACK times out. That timeout was
+	 * spurious if the timestamp option echoed in the ACK matches the
+	 * timestamp of the original SYN.
+	 */
+	if (!tp->undo_marker || !syn_stamp || !tp->rx_opt.saw_tstamp)
+		return;
+	if (syn_stamp == tp->rx_opt.rcv_tsecr)
+		tp->undo_marker = 0;
+}
+
+static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+					 const struct tcphdr *th)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_fastopen_cookie foc = { .len = -1 };
+	int saved_clamp = tp->rx_opt.mss_clamp;
+	bool fastopen_fail;
+	SKB_DR(reason);
+
+	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
+	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
+	if (th->ack) {
+		/* rfc793:
+		 * "If the state is SYN-SENT then
+		 *    first check the ACK bit
+		 *      If the ACK bit is set
+		 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
+		 *        a reset (unless the RST bit is set, if so drop
+		 *        the segment and return)"
+		 */
+		if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
+		    after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+			/* Previous FIN/ACK or RST/ACK might be ignored. */
+			if (icsk->icsk_retransmits == 0)
+				inet_csk_reset_xmit_timer(sk,
+						ICSK_TIME_RETRANS,
+						TCP_TIMEOUT_MIN, TCP_RTO_MAX);
+			goto reset_and_undo;
+		}
+
+		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+		    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
+			     tcp_time_stamp(tp))) {
+			NET_INC_STATS(sock_net(sk),
+					LINUX_MIB_PAWSACTIVEREJECTED);
+			goto reset_and_undo;
+		}
+
+		/* Now ACK is acceptable.
+		 *
+		 * "If the RST bit is set
+		 *    If the ACK was acceptable then signal the user "error:
+		 *    connection reset", drop the segment, enter CLOSED state,
+		 *    delete TCB, and return."
+		 */
+
+		if (th->rst) {
+			tcp_reset(sk, skb);
+consume:
+			__kfree_skb(skb);
+			return 0;
+		}
+
+		/* rfc793:
+		 *   "fifth, if neither of the SYN or RST bits is set then
+		 *    drop the segment and return."
+		 *
+		 *    See note below!
+		 *                                        --ANK(990513)
+		 */
+		if (!th->syn) {
+			SKB_DR_SET(reason, TCP_FLAGS);
+			goto discard_and_undo;
+		}
+		/* rfc793:
+		 *   "If the SYN bit is on ...
+		 *    are acceptable then ...
+		 *    (our SYN has been ACKed), change the connection
+		 *    state to ESTABLISHED..."
+		 */
+
+		tcp_ecn_rcv_synack(tp, th);
+
+		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+		tcp_try_undo_spurious_syn(sk);
+		tcp_ack(sk, skb, FLAG_SLOWPATH);
+
+		/* Ok.. it's good. Set up sequence numbers and
+		 * move to established.
+		 */
+		WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
+		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+		/* RFC1323: The window in SYN & SYN/ACK segments is
+		 * never scaled.
+		 */
+		tp->snd_wnd = ntohs(th->window);
+
+		if (!tp->rx_opt.wscale_ok) {
+			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
+			tp->window_clamp = min(tp->window_clamp, 65535U);
+		}
+
+		if (tp->rx_opt.saw_tstamp) {
+			tp->rx_opt.tstamp_ok	   = 1;
+			tp->tcp_header_len =
+				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+			tp->advmss	    -= TCPOLEN_TSTAMP_ALIGNED;
+			tcp_store_ts_recent(tp);
+		} else {
+			tp->tcp_header_len = sizeof(struct tcphdr);
+		}
+
+		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+		tcp_initialize_rcv_mss(sk);
+
+		/* Remember, tcp_poll() does not lock the socket!
+		 * Change state from SYN-SENT only after copied_seq
+		 * is initialized. */
+		WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
+
+		smc_check_reset_syn(tp);
+
+		smp_mb();
+
+		tcp_finish_connect(sk, skb);
+
+		fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
+				tcp_rcv_fastopen_synack(sk, skb, &foc);
+
+		if (!sock_flag(sk, SOCK_DEAD)) {
+			sk->sk_state_change(sk);
+			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+		}
+		if (fastopen_fail)
+			return -1;
+		if (sk->sk_write_pending ||
+		    READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) ||
+		    inet_csk_in_pingpong_mode(sk)) {
+			/* Save one ACK. Data will be ready after
+			 * several ticks, if write_pending is set.
+			 *
+			 * It may be deleted, but with this feature tcpdumps
+			 * look so _wonderfully_ clever, that I was not able
+			 * to stand against the temptation 8)     --ANK
+			 */
+			inet_csk_schedule_ack(sk);
+			tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+						  TCP_DELACK_MAX, TCP_RTO_MAX);
+			goto consume;
+		}
+		tcp_send_ack(sk);
+		return -1;
+	}
+
+	/* No ACK in the segment */
+
+	if (th->rst) {
+		/* rfc793:
+		 * "If the RST bit is set
+		 *
+		 *      Otherwise (no ACK) drop the segment and return."
+		 */
+		SKB_DR_SET(reason, TCP_RESET);
+		goto discard_and_undo;
+	}
+
+	/* PAWS check. */
+	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
+	    tcp_paws_reject(&tp->rx_opt, 0)) {
+		SKB_DR_SET(reason, TCP_RFC7323_PAWS);
+		goto discard_and_undo;
+	}
+	if (th->syn) {
+		/* We see a SYN without ACK. This is an attempt at a
+		 * simultaneous connect with crossed SYNs.
+		 * In particular, it can be a connect to self.
+		 */
+		tcp_set_state(sk, TCP_SYN_RECV);
+
+		if (tp->rx_opt.saw_tstamp) {
+			tp->rx_opt.tstamp_ok = 1;
+			tcp_store_ts_recent(tp);
+			tp->tcp_header_len =
+				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+		} else {
+			tp->tcp_header_len = sizeof(struct tcphdr);
+		}
+
+		WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
+		WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
+		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+		/* RFC1323: The window in SYN & SYN/ACK segments is
+		 * never scaled.
+		 */
+		tp->snd_wnd    = ntohs(th->window);
+		tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
+		tp->max_window = tp->snd_wnd;
+
+		tcp_ecn_rcv_syn(tp, th);
+
+		tcp_mtup_init(sk);
+		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+		tcp_initialize_rcv_mss(sk);
+
+		tcp_send_synack(sk);
+#if 0
+		/* Note, we could accept data and URG from this segment.
+		 * There are no obstacles to make this (except that we must
+		 * either change tcp_recvmsg() to prevent it from returning data
+		 * before 3WHS completes per RFC793, or employ TCP Fast Open).
+		 *
+		 * However, if we ignore data in ACKless segments sometimes,
+		 * we have no reasons to accept it sometimes.
+		 * Also, seems the code doing it in step6 of tcp_rcv_state_process
+		 * is not flawless. So, discard packet for sanity.
+		 * Uncomment this return to process the data.
+		 */
+		return -1;
+#else
+		goto consume;
+#endif
+	}
+	/* "fifth, if neither of the SYN or RST bits is set then
+	 * drop the segment and return."
+	 */
+
+discard_and_undo:
+	tcp_clear_options(&tp->rx_opt);
+	tp->rx_opt.mss_clamp = saved_clamp;
+	tcp_drop_reason(sk, skb, reason);
+	return 0;
+
+reset_and_undo:
+	tcp_clear_options(&tp->rx_opt);
+	tp->rx_opt.mss_clamp = saved_clamp;
+	return 1;
+}
+
+/* TFO part of leaving TCP_SYN_RECV/TCP_FIN_WAIT1: reset rtx state, drop req. */
+static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct request_sock *req;
+
+	/* If we are still handling the SYNACK RTO, see if timestamp ECR allows
+	 * undo. If peer SACKs triggered fast recovery, we can't undo here.
+	 */
+	if (icsk->icsk_ca_state == TCP_CA_Loss && !tp->packets_out)
+		tcp_try_undo_recovery(sk);
+
+	/* Reset rtx states to prevent spurious retransmits_timed_out() */
+	tp->retrans_stamp = 0;
+	icsk->icsk_retransmits = 0;
+
+	/* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
+	 * we no longer need req so release it.
+	 */
+	req = rcu_dereference_protected(tp->fastopen_rsk,
+					lockdep_sock_is_held(sk));
+	reqsk_fastopen_remove(sk, req, false);
+
+	/* Data may have been sent out already, so re-arm the timer just
+	 * as in the regular transmission case when new data was ack'ed.
+	 *
+	 * (TFO) - we could try to be more aggressive and retransmit any
+	 * data sooner, based on when it was sent out.
+	 */
+	tcp_rearm_rto(sk);
+}
+
+/*
+ *	This function implements the receiving procedure of RFC 793 for
+ *	all states except ESTABLISHED and TIME_WAIT.
+ *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
+ *	address independent.
+ *	Returns 0 if the skb was freed or queued here, nonzero otherwise.
+ */
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcphdr *th = tcp_hdr(skb);
+	struct request_sock *req;
+	int queued = 0;
+	bool acceptable;
+	SKB_DR(reason);
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+		SKB_DR_SET(reason, TCP_CLOSE);
+		goto discard;
+
+	case TCP_LISTEN:
+		if (th->ack)
+			return 1;
+
+		if (th->rst) {
+			SKB_DR_SET(reason, TCP_RESET);
+			goto discard;
+		}
+		if (th->syn) {
+			if (th->fin) {
+				SKB_DR_SET(reason, TCP_FLAGS);
+				goto discard;
+			}
+			/* It is possible that we process SYN packets from backlog,
+			 * so we need to make sure to disable BH and RCU right there.
+			 */
+			rcu_read_lock();
+			local_bh_disable();
+			acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
+			local_bh_enable();
+			rcu_read_unlock();
+
+			if (!acceptable)
+				return 1;
+			consume_skb(skb);
+			return 0;
+		}
+		SKB_DR_SET(reason, TCP_FLAGS);
+		goto discard;
+
+	case TCP_SYN_SENT:
+		tp->rx_opt.saw_tstamp = 0;
+		tcp_mstamp_refresh(tp);
+		queued = tcp_rcv_synsent_state_process(sk, skb, th);
+		if (queued >= 0)
+			return queued;
+
+		/* Do step6 onward by hand. */
+		tcp_urg(sk, skb, th);
+		__kfree_skb(skb);
+		tcp_data_snd_check(sk);
+		return 0;
+	}
+
+	tcp_mstamp_refresh(tp);
+	tp->rx_opt.saw_tstamp = 0;
+	req = rcu_dereference_protected(tp->fastopen_rsk,
+					lockdep_sock_is_held(sk));
+	if (req) {
+		bool req_stolen;
+
+		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
+		    sk->sk_state != TCP_FIN_WAIT1);
+
+		if (!tcp_check_req(sk, skb, req, true, &req_stolen)) {
+			SKB_DR_SET(reason, TCP_FASTOPEN);
+			goto discard;
+		}
+	}
+
+	if (!th->ack && !th->rst && !th->syn) {
+		SKB_DR_SET(reason, TCP_FLAGS);
+		goto discard;
+	}
+	if (!tcp_validate_incoming(sk, skb, th, 0))
+		return 0;
+
+	/* step 5: check the ACK field */
+	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+				      FLAG_UPDATE_TS_RECENT |
+				      FLAG_NO_CHALLENGE_ACK) > 0;
+
+	if (!acceptable) {
+		if (sk->sk_state == TCP_SYN_RECV)
+			return 1;	/* send one RST */
+		tcp_send_challenge_ack(sk);
+		SKB_DR_SET(reason, TCP_OLD_ACK);
+		goto discard;
+	}
+	switch (sk->sk_state) {
+	case TCP_SYN_RECV:
+		tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
+		if (!tp->srtt_us)
+			tcp_synack_rtt_meas(sk, req);
+
+		if (req) {
+			tcp_rcv_synrecv_state_fastopen(sk);
+		} else {
+			tcp_try_undo_spurious_syn(sk);
+			tp->retrans_stamp = 0;
+			tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
+					  skb);
+			WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
+		}
+		smp_mb();
+		tcp_set_state(sk, TCP_ESTABLISHED);
+		sk->sk_state_change(sk);
+
+		/* Note that this wakeup is only for the marginal crossed SYN
+		 * case. Passively opened sockets are not woken up, because
+		 * sk->sk_sleep == NULL and sk->sk_socket == NULL.
+		 */
+		if (sk->sk_socket)
+			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+
+		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
+		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
+		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+
+		if (tp->rx_opt.tstamp_ok)
+			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
+		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+			tcp_update_pacing_rate(sk);
+
+		/* Prevent spurious tcp_cwnd_restart() on first data packet */
+		tp->lsndtime = tcp_jiffies32;
+
+		tcp_initialize_rcv_mss(sk);
+		tcp_fast_path_on(tp);
+		break;
+
+	case TCP_FIN_WAIT1: {
+		int tmo;
+
+		if (req)
+			tcp_rcv_synrecv_state_fastopen(sk);
+
+		if (tp->snd_una != tp->write_seq)
+			break;
+
+		tcp_set_state(sk, TCP_FIN_WAIT2);
+		WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN);
+
+		sk_dst_confirm(sk);
+
+		if (!sock_flag(sk, SOCK_DEAD)) {
+			/* Wake up lingering close() */
+			sk->sk_state_change(sk);
+			break;
+		}
+
+		if (READ_ONCE(tp->linger2) < 0) {
+			tcp_done(sk);
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+			return 1;
+		}
+		if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+		    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+			/* Receive out of order FIN after close() */
+			if (tp->syn_fastopen && th->fin)
+				tcp_fastopen_active_disable(sk);
+			tcp_done(sk);
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+			return 1;
+		}
+
+		tmo = tcp_fin_time(sk);
+		if (tmo > TCP_TIMEWAIT_LEN) {
+			inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+		} else if (th->fin || sock_owned_by_user(sk)) {
+			/* Bad case. We could lose such a FIN otherwise.
+			 * It is not a big problem, but it looks confusing
+			 * and is not so rare an event. We can still lose it
+			 * now, if it spins in bh_lock_sock(), but that is a
+			 * really marginal case.
+			 */
+			inet_csk_reset_keepalive_timer(sk, tmo);
+		} else {
+			tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+			goto consume;
+		}
+		break;
+	}
+
+	case TCP_CLOSING:
+		if (tp->snd_una == tp->write_seq) {
+			tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+			goto consume;
+		}
+		break;
+
+	case TCP_LAST_ACK:
+		if (tp->snd_una == tp->write_seq) {
+			tcp_update_metrics(sk);
+			tcp_done(sk);
+			goto consume;
+		}
+		break;
+	}
+
+	/* step 6: check the URG bit */
+	tcp_urg(sk, skb, th);
+
+	/* step 7: process the segment text */
+	switch (sk->sk_state) {
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSING:
+	case TCP_LAST_ACK:
+		if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+			/* If a subflow has been reset, the packet should not
+			 * continue to be processed, drop the packet.
+			 */
+			if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
+				goto discard;
+			break;
+		}
+		fallthrough;
+	case TCP_FIN_WAIT1:
+	case TCP_FIN_WAIT2:
+		/* RFC 793 says to queue data in these states,
+		 * RFC 1122 says we MUST send a reset.
+		 * BSD 4.4 also does reset.
+		 */
+		if (sk->sk_shutdown & RCV_SHUTDOWN) {
+			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+				NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+				tcp_reset(sk, skb);
+				return 1;
+			}
+		}
+		fallthrough;
+	case TCP_ESTABLISHED:
+		tcp_data_queue(sk, skb);
+		queued = 1;
+		break;
+	}
+
+	/* tcp_data could move socket to TIME-WAIT */
+	if (sk->sk_state != TCP_CLOSE) {
+		tcp_data_snd_check(sk);
+		tcp_ack_snd_check(sk);
+	}
+
+	if (!queued) {
+discard:
+		tcp_drop_reason(sk, skb, reason);
+	}
+	return 0;
+
+consume:
+	__kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_rcv_state_process);
+
+/* Rate-limited debug log of the peer whose open request is being dropped;
+ * only AF_INET (and AF_INET6 when IPv6 is enabled) produce a message. */
+static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
+{
+	if (family == AF_INET)
+		net_dbg_ratelimited("drop open request from %pI4/%u\n",
+				    &inet_rsk(req)->ir_rmt_addr, port);
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (family == AF_INET6)
+		net_dbg_ratelimited("drop open request from %pI6/%u\n",
+				    &inet_rsk(req)->ir_v6_rmt_addr, port);
+#endif
+}
+
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negotiation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control: Linux DCTCP asserts ECT on all packets,
+ * including SYN, which is the optimal solution; however,
+ * others, such as FreeBSD, do not.
+ *
+ * Exception: At least one of the reserved bits of the TCP header (th->res1) is
+ * set, indicating the use of a future TCP extension (such as AccECN). See
+ * RFC8311 §4.3 which updates RFC3168 to allow the development of such
+ * extensions.
+ */
+static void tcp_ecn_create_request(struct request_sock *req,
+				   const struct sk_buff *skb,
+				   const struct sock *listen_sk,
+				   const struct dst_entry *dst)
+{
+	const struct net *net = sock_net(listen_sk);
+	const struct tcphdr *th = tcp_hdr(skb);
+	bool not_ect, sysctl_or_dst_ecn;
+	u32 dst_ecn;
+
+	/* Nothing to negotiate unless the SYN carries both ECE and CWR. */
+	if (!th->ece || !th->cwr)
+		return;
+
+	not_ect = INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+	dst_ecn = dst_feature(dst, DST_FEATURE_ECN_MASK);
+	sysctl_or_dst_ecn = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || dst_ecn;
+
+	if (((not_ect || th->res1) && sysctl_or_dst_ecn) ||
+	    tcp_ca_needs_ecn(listen_sk) || (dst_ecn & DST_FEATURE_ECN_CA) ||
+	    tcp_bpf_ca_needs_ecn((struct sock *)req))
+		inet_rsk(req)->ecn_ok = 1;
+}
+
+static void tcp_openreq_init(struct request_sock *req,
+			     const struct tcp_options_received *rx_opt,
+			     struct sk_buff *skb, const struct sock *sk)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+	struct tcp_request_sock *treq = tcp_rsk(req);
+
+	req->rsk_rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
+	treq->rcv_isn = TCP_SKB_CB(skb)->seq;
+	treq->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+	treq->snt_synack = 0;
+	treq->last_oow_ack_time = 0;
+	req->mss = rx_opt->mss_clamp;
+	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
+	ireq->tstamp_ok = rx_opt->tstamp_ok;
+	ireq->sack_ok = rx_opt->sack_ok;
+	ireq->snd_wscale = rx_opt->snd_wscale;
+	ireq->wscale_ok = rx_opt->wscale_ok;
+	ireq->acked = ireq->ecn_ok = 0;
+	ireq->ir_rmt_port = tcp_hdr(skb)->source;
+	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+	ireq->ir_mark = inet_request_mark(sk, skb);
+#if IS_ENABLED(CONFIG_SMC)
+	ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
+			tcp_sk(sk)->smc_hs_congested(sk));
+#endif
+}
+
+/* reqsk_alloc() plus inet defaults; returns NULL if the allocation fails. */
+struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
+				      struct sock *sk_listener,
+				      bool attach_listener)
+{
+	struct inet_request_sock *ireq;
+	struct request_sock *req;
+
+	req = reqsk_alloc(ops, sk_listener, attach_listener);
+	if (!req)
+		return NULL;
+	ireq = inet_rsk(req);
+	ireq->ireq_opt = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+	ireq->pktopts = NULL;
+#endif
+	atomic64_set(&ireq->ir_cookie, 0);
+	ireq->ireq_state = TCP_NEW_SYN_RECV;
+	write_pnet(&ireq->ireq_net, sock_net(sk_listener));
+	ireq->ireq_family = sk_listener->sk_family;
+	req->timeout = TCP_TIMEOUT_INIT;
+	return req;
+}
+EXPORT_SYMBOL(inet_reqsk_alloc);
+
+/* Bump the TCPREQQFULL* MIB counter and log a SYN-flood notice only while
+ * queue->synflood_warned is clear; return true if a syncookie should be sent.
+ */
+static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
+{
+	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+	const char *msg = "Dropping request";
+	struct net *net = sock_net(sk);
+	bool want_cookie = false;
+	u8 syncookies;
+
+	syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
+
+#ifdef CONFIG_SYN_COOKIES
+	if (syncookies) {
+		msg = "Sending cookies";
+		want_cookie = true;
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
+	} else
+#endif
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
+
+	if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 &&
+	    xchg(&queue->synflood_warned, 1) == 0) {
+		if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) {
+			net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n",
+					proto, inet6_rcv_saddr(sk),
+					sk->sk_num, msg);
+		} else {
+			net_info_ratelimited("%s: Possible SYN flooding on port %pI4:%u. %s.\n",
+					proto, &sk->sk_rcv_saddr,
+					sk->sk_num, msg);
+		}
+	}
+
+	return want_cookie;
+}
+
+static void tcp_reqsk_record_syn(const struct sock *sk,
+				 struct request_sock *req,
+				 const struct sk_buff *skb)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct saved_syn *saved_syn;
+	u32 mac_hdrlen = 0;
+	void *base;
+	u32 len;
+
+	if (!tp->save_syn)
+		return;
+
+	len = skb_network_header_len(skb) + tcp_hdrlen(skb);
+	if (tp->save_syn == 2) {	/* Save full header, MAC included. */
+		base = skb_mac_header(skb);
+		mac_hdrlen = skb_mac_header_len(skb);
+		len += mac_hdrlen;
+	} else {
+		base = skb_network_header(skb);
+	}
+	saved_syn = kmalloc(struct_size(saved_syn, data, len), GFP_ATOMIC);
+	if (!saved_syn)
+		return;		/* allocation failed: record nothing */
+	saved_syn->mac_hdrlen = mac_hdrlen;
+	saved_syn->network_hdrlen = skb_network_header_len(skb);
+	saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
+	memcpy(saved_syn->data, base, len);
+	req->saved_syn = saved_syn;
+}
+
+/* Return the MSS to encode in a SYN cookie for a SYN carrying header @th, or 0
+ * when no cookie is to be sent (not required, not supported, or queue full).
+ */
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
+			  const struct tcp_request_sock_ops *af_ops,
+			  struct sock *sk, struct tcphdr *th)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	u16 mss;
+
+	/* Only relevant when cookies are forced (== 2) or the SYN queue is full. */
+	if (!(READ_ONCE(net->ipv4.sysctl_tcp_syncookies) == 2 ||
+	      inet_csk_reqsk_queue_is_full(sk)))
+		return 0;
+
+	if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
+		return 0;
+
+	if (sk_acceptq_is_full(sk)) {
+		NET_INC_STATS(net, LINUX_MIB_LISTENOVERFLOWS);
+		return 0;
+	}
+
+	/* Fall back to the address family's clamp when no MSS was parsed. */
+	mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
+	return mss ? mss : af_ops->mss_clamp;
+}
+EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
+
+/* Handle a SYN received on listener @sk: build a request sock and answer with
+ * a SYN-ACK (normal, Fast Open or syncookie), or drop it.  Always returns 0.
+ */
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+		     const struct tcp_request_sock_ops *af_ops,
+		     struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_fastopen_cookie foc = { .len = -1 };
+	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
+	struct tcp_options_received tmp_opt;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	struct sock *fastopen_sk = NULL;
+	struct request_sock *req;
+	bool want_cookie = false;
+	struct dst_entry *dst;
+	struct flowi fl;
+	u8 syncookies;
+
+	syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
+
+	/* TW buckets are converted to open requests without
+	 * limitations, they conserve resources and peer is
+	 * evidently real one.
+	 */
+	if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+		want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
+		if (!want_cookie)
+			goto drop;
+	}
+
+	if (sk_acceptq_is_full(sk)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+		goto drop;
+	}
+
+	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
+	if (!req)
+		goto drop;
+
+	req->syncookie = want_cookie;
+	tcp_rsk(req)->af_specific = af_ops;
+	tcp_rsk(req)->ts_off = 0;
+#if IS_ENABLED(CONFIG_MPTCP)
+	tcp_rsk(req)->is_mptcp = 0;
+#endif
+
+	tcp_clear_options(&tmp_opt);
+	tmp_opt.mss_clamp = af_ops->mss_clamp;
+	tmp_opt.user_mss  = tp->rx_opt.user_mss;
+	tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
+			  want_cookie ? NULL : &foc);
+
+	if (want_cookie && !tmp_opt.saw_tstamp)
+		tcp_clear_options(&tmp_opt);
+
+	if (IS_ENABLED(CONFIG_SMC) && want_cookie)
+		tmp_opt.smc_ok = 0;
+
+	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+	tcp_openreq_init(req, &tmp_opt, skb, sk);
+	inet_rsk(req)->no_srccheck = inet_test_bit(TRANSPARENT, sk);
+
+	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
+	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
+
+	dst = af_ops->route_req(sk, skb, &fl, req);
+	if (!dst)
+		goto drop_and_free;
+
+	if (tmp_opt.tstamp_ok)
+		tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
+
+	if (!want_cookie && !isn) {
+		int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
+
+		/* Kill the following clause, if you dislike this way. */
+		if (!syncookies &&
+		    (max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+		     (max_syn_backlog >> 2)) &&
+		    !tcp_peer_is_proven(req, dst)) {
+			/* Without syncookies, the last quarter of the backlog
+			 * is reserved for peers proven to be alive, so we keep
+			 * talking to destinations known before the SYN flood.
+			 */
+			pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
+				    rsk_ops->family);
+			goto drop_and_release;
+		}
+
+		isn = af_ops->init_seq(skb);
+	}
+
+	tcp_ecn_create_request(req, skb, sk, dst);
+
+	if (want_cookie) {
+		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+		if (!tmp_opt.tstamp_ok)
+			inet_rsk(req)->ecn_ok = 0;
+	}
+
+	tcp_rsk(req)->snt_isn = isn;
+	tcp_rsk(req)->txhash = net_tx_rndhash();
+	tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
+	tcp_openreq_init_rwin(req, sk, dst);
+	sk_rx_queue_set(req_to_sk(req), skb);
+	if (!want_cookie) {
+		tcp_reqsk_record_syn(sk, req, skb);
+		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
+	}
+	if (fastopen_sk) {
+		af_ops->send_synack(fastopen_sk, dst, &fl, req,
+				    &foc, TCP_SYNACK_FASTOPEN, skb);
+		/* Add the child socket directly into the accept queue */
+		if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
+			reqsk_fastopen_remove(fastopen_sk, req, false);
+			bh_unlock_sock(fastopen_sk);
+			sock_put(fastopen_sk);
+			goto drop_and_free;
+		}
+		sk->sk_data_ready(sk);
+		bh_unlock_sock(fastopen_sk);
+		sock_put(fastopen_sk);
+	} else {
+		tcp_rsk(req)->tfo_listener = false;
+		if (!want_cookie) {
+			req->timeout = tcp_timeout_init((struct sock *)req);
+			inet_csk_reqsk_queue_hash_add(sk, req, req->timeout);
+		}
+		af_ops->send_synack(sk, dst, &fl, req, &foc,
+				    !want_cookie ? TCP_SYNACK_NORMAL :
+						   TCP_SYNACK_COOKIE,
+				    skb);
+		if (want_cookie) {
+			reqsk_free(req);	/* cookie request was never hashed */
+			return 0;
+		}
+	}
+	reqsk_put(req);
+	return 0;
+
+drop_and_release:
+	dst_release(dst);
+drop_and_free:
+	__reqsk_free(req);
+drop:
+	tcp_listendrop(sk);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
new file mode 100644
index 0000000000..c7ffab37a3
--- /dev/null
+++ b/net/ipv4/tcp_ipv4.c
@@ -0,0 +1,3411 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ *		IPv4 specific functions
+ *
+ *		code split from:
+ *		linux/ipv4/tcp.c
+ *		linux/ipv4/tcp_input.c
+ *		linux/ipv4/tcp_output.c
+ *
+ *		See tcp.c for author information
+ */
+
+/*
+ * Changes:
+ *		David S. Miller	:	New socket lookup architecture.
+ *					This code is dedicated to John Dyson.
+ *		David S. Miller :	Change semantics of established hash,
+ *					half is devoted to TIME_WAIT sockets
+ *					and the rest go in the other half.
+ *		Andi Kleen :		Add support for syncookies and fixed
+ *					some bugs: ip options weren't passed to
+ *					the TCP layer, missed a check for an
+ *					ACK bit.
+ *		Andi Kleen :		Implemented fast path mtu discovery.
+ *	     				Fixed many serious bugs in the
+ *					request_sock handling and moved
+ *					most of it into the af independent code.
+ *					Added tail drop and some other bugfixes.
+ *					Added new listen semantics.
+ *		Mike McLagan	:	Routing by source
+ *	Juan Jose Ciarlante:		ip_dynaddr bits
+ *		Andi Kleen:		various fixes.
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after year
+ *					coma.
+ *	Andi Kleen		:	Fix new listen.
+ *	Andi Kleen		:	Fix accept error reporting.
+ *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
+ *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
+ *					a single port at the same time.
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <linux/bottom_half.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/jhash.h>
+#include <linux/init.h>
+#include <linux/times.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include <net/net_namespace.h>
+#include <net/icmp.h>
+#include <net/inet_hashtables.h>
+#include <net/tcp.h>
+#include <net/transp_v6.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/timewait_sock.h>
+#include <net/xfrm.h>
+#include <net/secure_seq.h>
+#include <net/busy_poll.h>
+
+#include <linux/inet.h>
+#include <linux/ipv6.h>
+#include <linux/stddef.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/inetdevice.h>
+#include <linux/btf_ids.h>
+
+#include <crypto/hash.h>
+#include <linux/scatterlist.h>
+
+#include <trace/events/tcp.h>
+
+#ifdef CONFIG_TCP_MD5SIG
+static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
+#endif
+
+struct inet_hashinfo tcp_hashinfo;
+EXPORT_SYMBOL(tcp_hashinfo);
+
+static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
+
+static u32 tcp_v4_init_seq(const struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct tcphdr *th = tcp_hdr(skb);
+
+	return secure_tcp_seq(iph->daddr, iph->saddr, th->dest, th->source);
+}
+/* Timestamp offset from secure_tcp_ts_off() for the packet's daddr/saddr. */
+static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
+{
+	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
+}
+
+/* May @sk reuse TIME-WAIT socket @sktw?  Returns 1 with a reference held on
+ * @sktw (write_seq/timestamps seeded from it unless repairing), else 0.
+ */
+int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
+{
+	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
+	const struct inet_timewait_sock *tw = inet_twsk(sktw);
+	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (reuse == 2) {
+		/* Still does not detect *everything* that goes through
+		 * lo, since we require a loopback src or dst address
+		 * or direct binding to 'lo' interface.
+		 */
+		bool loopback = false;
+		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
+			loopback = true;
+#if IS_ENABLED(CONFIG_IPV6)
+		if (tw->tw_family == AF_INET6) {
+			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
+			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
+			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
+			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
+				loopback = true;
+		} else
+#endif
+		{
+			if (ipv4_is_loopback(tw->tw_daddr) ||
+			    ipv4_is_loopback(tw->tw_rcv_saddr))
+				loopback = true;
+		}
+		if (!loopback)
+			reuse = 0;
+	}
+
+	/* With PAWS, reuse is safe from the viewpoint of data integrity.
+	 * Even without PAWS it is safe provided sequence spaces do not
+	 * overlap, i.e. at data rates <= 80Mbit/sec.
+	 * The idea is close to VJ's one, only the timestamp cache is held
+	 * per port pair rather than per host, with the TW bucket as state
+	 * holder.  If the TW bucket has already been destroyed we fall back
+	 * to VJ's scheme and use the initial timestamp from the peer table.
+	 */
+	if (tcptw->tw_ts_recent_stamp &&
+	    (!twp || (reuse && time_after32(ktime_get_seconds(),
+					    tcptw->tw_ts_recent_stamp)))) {
+		/* The twsk becomes visible in the hash before its sk_refcnt
+		 * is set, so it may still be zero here: never sock_hold() it.
+		 */
+		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
+			return 0;
+
+		/* Under TCP repair the same safety checks apply, but the
+		 * sequence numbers and timestamps set by the repair process
+		 * must be honoured.  Without this check every reuse of a
+		 * TIME-WAIT socket would accumulate a further -1 on the
+		 * repair-assigned sequence number (-1, then -2, ...).
+		 */
+		if (likely(!tp->repair)) {
+			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
+
+			if (!seq)
+				seq = 1;
+			WRITE_ONCE(tp->write_seq, seq);
+			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
+			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+		}
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_twsk_unique);
+
+static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+			      int addr_len)
+{
+	/* Same length check as tcp_v4_connect(): it keeps the cgroup BPF
+	 * program run below from accessing bytes beyond the bound the
+	 * caller specified in addr_len.
+	 */
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	sock_owned_by_me(sk);
+
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
+}
+
+/* Initiate an outgoing IPv4 TCP connection to @uaddr; 0 or the step's error. */
+int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+	struct inet_timewait_death_row *tcp_death_row;
+	struct inet_sock *inet = inet_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct ip_options_rcu *inet_opt;
+	struct net *net = sock_net(sk);
+	__be16 orig_sport, orig_dport;
+	__be32 daddr, nexthop;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	int err;
+
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	if (usin->sin_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	nexthop = daddr = usin->sin_addr.s_addr;
+	inet_opt = rcu_dereference_protected(inet->inet_opt,
+					     lockdep_sock_is_held(sk));
+	if (inet_opt && inet_opt->opt.srr) {
+		if (!daddr)
+			return -EINVAL;
+		nexthop = inet_opt->opt.faddr;	/* with SRR, route via opt.faddr */
+	}
+
+	orig_sport = inet->inet_sport;
+	orig_dport = usin->sin_port;
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
+			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
+			      orig_dport, sk);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		if (err == -ENETUNREACH)
+			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+		return err;
+	}
+
+	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+		ip_rt_put(rt);
+		return -ENETUNREACH;
+	}
+
+	if (!inet_opt || !inet_opt->opt.srr)
+		daddr = fl4->daddr;
+
+	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+
+	if (!inet->inet_saddr) {
+		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
+		if (err) {
+			ip_rt_put(rt);
+			return err;
+		}
+	} else {
+		sk_rcv_saddr_set(sk, inet->inet_saddr);
+	}
+
+	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
+		/* Reset inherited state */
+		tp->rx_opt.ts_recent	   = 0;
+		tp->rx_opt.ts_recent_stamp = 0;
+		if (likely(!tp->repair))
+			WRITE_ONCE(tp->write_seq, 0);
+	}
+
+	inet->inet_dport = usin->sin_port;
+	sk_daddr_set(sk, daddr);
+
+	inet_csk(sk)->icsk_ext_hdr_len = 0;
+	if (inet_opt)
+		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+
+	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
+
+	/* Socket identity is still unknown (sport may be zero).
+	 * However we set state to SYN-SENT and, without releasing the
+	 * socket lock, select a source port, enter ourselves into the
+	 * hash tables and complete initialization after this.
+	 */
+	tcp_set_state(sk, TCP_SYN_SENT);
+	err = inet_hash_connect(tcp_death_row, sk);
+	if (err)
+		goto failure;
+
+	sk_set_txhash(sk);
+
+	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
+			       inet->inet_sport, inet->inet_dport, sk);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		rt = NULL;	/* so the ip_rt_put() under failure: gets NULL */
+		goto failure;
+	}
+	/* OK, now commit destination to socket.  */
+	sk->sk_gso_type = SKB_GSO_TCPV4;
+	sk_setup_caps(sk, &rt->dst);
+	rt = NULL;
+
+	if (likely(!tp->repair)) {
+		if (!tp->write_seq)
+			WRITE_ONCE(tp->write_seq,
+				   secure_tcp_seq(inet->inet_saddr,
+						  inet->inet_daddr,
+						  inet->inet_sport,
+						  usin->sin_port));
+		WRITE_ONCE(tp->tsoffset,
+			   secure_tcp_ts_off(net, inet->inet_saddr,
+					     inet->inet_daddr));
+	}
+
+	atomic_set(&inet->inet_id, get_random_u16());
+
+	if (tcp_fastopen_defer_connect(sk, &err))
+		return err;
+	if (err)
+		goto failure;
+
+	err = tcp_connect(sk);
+
+	if (err)
+		goto failure;
+
+	return 0;
+
+failure:
+	/*
+	 * This unhashes the socket and releases the local port,
+	 * if necessary.
+	 */
+	tcp_set_state(sk, TCP_CLOSE);
+	inet_bhash2_reset_saddr(sk);
+	ip_rt_put(rt);
+	sk->sk_route_caps = 0;
+	inet->inet_dport = 0;
+	return err;
+}
+EXPORT_SYMBOL(tcp_v4_connect);
+
+/*
+ * React to an ICMP_FRAG_NEEDED MTU indication (RFC1191) using tp->mtu_info.
+ * It can be called through tcp_release_cb() if the socket was owned by the
+ * user at the time tcp_v4_err() was called to handle the ICMP message.
+ */
+void tcp_v4_mtu_reduced(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct dst_entry *dst;
+	u32 mtu;
+
+	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+		return;
+	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
+	dst = inet_csk_update_pmtu(sk, mtu);
+	if (!dst)
+		return;
+
+	/* Something is about to go wrong... Remember a soft error
+	 * in case this connection is not able to recover.
+	 */
+	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
+
+	mtu = dst_mtu(dst);	/* from here on, use the route's MTU */
+
+	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+	    ip_sk_accept_pmtu(sk) &&
+	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
+		tcp_sync_mss(sk, mtu);
+
+		/* Resend the TCP packet because it's
+		 * clear that the old packet has been
+		 * dropped. This is the new "fast" path mtu
+		 * discovery.
+		 */
+		tcp_simple_retransmit(sk);
+	} /* else let the usual retransmit timer handle it */
+}
+EXPORT_SYMBOL(tcp_v4_mtu_reduced);
+
+static void do_redirect(struct sk_buff *skb, struct sock *sk)
+{
+	struct dst_entry *route = __sk_dst_check(sk, 0);
+
+	if (!route)
+		return;	/* __sk_dst_check() gave us no dst to act on */
+	route->ops->redirect(route, sk, skb);
+}
+
+/* Handle an ICMP error aimed at a TCP_NEW_SYN_RECV request socket. */
+void tcp_req_err(struct sock *sk, u32 seq, bool abort)
+{
+	struct request_sock *req = inet_reqsk(sk);
+	bool seq_is_isn = seq == tcp_rsk(req)->snt_isn;
+
+	/* ICMPs are not backlogged, hence we cannot get
+	 * an established socket here.
+	 */
+	if (!seq_is_isn) {
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_OUTOFWINDOWICMPS);
+	} else if (abort) {
+		/* The request is still in SYN_RECV: drop it from the
+		 * listener's queue without reporting anything.  There is
+		 * no good way to pass the error to the newly created
+		 * socket, and POSIX does not want network errors
+		 * returned from accept().
+		 */
+		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+		tcp_listendrop(req->rsk_listener);
+	}
+	reqsk_put(req);
+}
+EXPORT_SYMBOL(tcp_req_err);
+
+/* TCP-LD (RFC 6069): undo one RTO backoff step, then re-arm or retransmit. */
+void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	s32 remaining;
+	u32 delta_us;
+
+	if (sock_owned_by_user(sk))
+		return;
+
+	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
+	    !icsk->icsk_backoff)
+		return;	/* not about snd_una, or no backoff to undo */
+
+	skb = tcp_rtx_queue_head(sk);
+	if (WARN_ON_ONCE(!skb))
+		return;
+
+	icsk->icsk_backoff--;
+	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
+	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+
+	tcp_mstamp_refresh(tp);
+	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
+	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
+
+	if (remaining > 0) {
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  remaining, TCP_RTO_MAX);
+	} else {
+		/* RTO revert clocked out retransmission.
+		 * Will retransmit now.
+		 */
+		tcp_retransmit_timer(sk);
+	}
+}
+EXPORT_SYMBOL(tcp_ld_RTO_revert);
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.  skb->data is read as an IP header
+ * followed by the first 8 bytes of the TCP header, from which
+ * we find the appropriate socket.
+ * Returns -ENOENT if no socket matches, 0 otherwise.
+ * NOTE(review): older text here described an "err" argument
+ * that the current (skb, info) signature no longer has.
+ *
+ * The locking strategy used here is very "optimistic". When
+ * someone else accesses the socket the ICMP is just dropped
+ * and for some paths there is no check at all.
+ * A more general error queue to queue errors for later handling
+ * is probably better.
+ */
+
+int tcp_v4_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
+	struct tcp_sock *tp;
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	struct sock *sk;
+	struct request_sock *fastopen;
+	u32 seq, snd_una;
+	int err;
+	struct net *net = dev_net(skb->dev);
+
+	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+				       iph->daddr, th->dest, iph->saddr,
+				       ntohs(th->source), inet_iif(skb), 0);
+	if (!sk) {
+		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+		return -ENOENT;
+	}
+	if (sk->sk_state == TCP_TIME_WAIT) {
+		inet_twsk_put(inet_twsk(sk));
+		return 0;
+	}
+	seq = ntohl(th->seq);
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
+				     type == ICMP_TIME_EXCEEDED ||
+				     (type == ICMP_DEST_UNREACH &&
+				      (code == ICMP_NET_UNREACH ||
+				       code == ICMP_HOST_UNREACH)));
+		return 0;
+	}
+
+	bh_lock_sock(sk);
+	/* If too many ICMPs get dropped on busy
+	 * servers this needs to be solved differently.
+	 * We do take care of PMTU discovery (RFC1191) special case :
+	 * we can receive locally generated ICMP messages while socket is held.
+	 */
+	if (sock_owned_by_user(sk)) {
+		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
+			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
+	}
+	if (sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	if (static_branch_unlikely(&ip4_min_ttl)) {
+		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
+		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
+			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+			goto out;
+		}
+	}
+
+	tp = tcp_sk(sk);
+	/* XXX (TFO) - tp->snd_una should be ISN (see tcp_create_openreq_child()) */
+	fastopen = rcu_dereference(tp->fastopen_rsk);
+	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
+	if (sk->sk_state != TCP_LISTEN &&
+	    !between(seq, snd_una, tp->snd_nxt)) {
+		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
+		goto out;
+	}
+
+	switch (type) {
+	case ICMP_REDIRECT:
+		if (!sock_owned_by_user(sk))
+			do_redirect(skb, sk);
+		goto out;
+	case ICMP_SOURCE_QUENCH:
+		/* Just silently ignore these. */
+		goto out;
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code > NR_ICMP_UNREACH)
+			goto out;
+
+		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+			/* We are not interested in TCP_LISTEN and open_requests
+			 * (SYN-ACKs sent out by Linux are always <576 bytes so
+			 * they should go through unfragmented).
+			 */
+			if (sk->sk_state == TCP_LISTEN)
+				goto out;
+
+			WRITE_ONCE(tp->mtu_info, info);
+			if (!sock_owned_by_user(sk)) {
+				tcp_v4_mtu_reduced(sk);
+			} else {
+				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
+					sock_hold(sk);
+			}
+			goto out;
+		}
+
+		err = icmp_err_convert[code].errno;
+		/* check if this ICMP message allows revert of backoff.
+		 * (see RFC 6069)
+		 */
+		if (!fastopen &&
+		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
+			tcp_ld_RTO_revert(sk, seq);
+		break;
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	default:
+		goto out;
+	}
+
+	switch (sk->sk_state) {
+	case TCP_SYN_SENT:
+	case TCP_SYN_RECV:
+		/* Only in fast or simultaneous open. If a fast open socket is
+		 * already accepted it is treated as a connected one below.
+		 */
+		if (fastopen && !fastopen->sk)
+			break;
+
+		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
+
+		if (!sock_owned_by_user(sk)) {
+			WRITE_ONCE(sk->sk_err, err);
+
+			sk_error_report(sk);
+
+			tcp_done(sk);
+		} else {
+			WRITE_ONCE(sk->sk_err_soft, err);
+		}
+		goto out;
+	}
+
+	/* If we've already connected we will keep trying
+	 * until we time out, or the user gives up.
+	 *
+	 * rfc1122 4.2.3.9 allows to consider as hard errors
+	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
+	 * but it is obsoleted by pmtu discovery).
+	 *
+	 * Note, that in modern internet, where routing is unreliable
+	 * and in each dark corner broken firewalls sit, sending random
+	 * errors ordered by their masters even these two messages finally lose
+	 * their original sense (even Linux sends invalid PORT_UNREACHs)
+	 *
+	 * Now we are in compliance with RFCs.
+	 *							--ANK (980905)
+	 */
+
+	if (!sock_owned_by_user(sk) &&
+	    inet_test_bit(RECVERR, sk)) {
+		WRITE_ONCE(sk->sk_err, err);
+		sk_error_report(sk);
+	} else	{ /* Only an error on timeout */
+		WRITE_ONCE(sk->sk_err_soft, err);
+	}
+
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+	return 0;
+}
+
+void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
+{
+	/* Record where the check field sits: header start and field offset. */
+	skb->csum_start = skb_transport_header(skb) - skb->head;
+	skb->csum_offset = offsetof(struct tcphdr, check);
+
+	tcp_hdr(skb)->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
+}
+
+/* Set up the IPv4 TCP checksum fields of @skb on behalf of socket @sk. */
+void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
+{
+	/* Source and destination addresses come from the socket itself. */
+	__tcp_v4_send_check(skb, inet_sk(sk)->inet_saddr,
+			    inet_sk(sk)->inet_daddr);
+}
+EXPORT_SYMBOL(tcp_v4_send_check);
+
+/*
+ *	This routine will send an RST to the other tcp.
+ *
+ *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
+ *		      for reset.
+ *	Answer: if a packet caused RST, it is not for a socket
+ *		existing in our system, if it is matched to a socket,
+ *		it is just duplicate segment or bug in other side's TCP.
+ *		So we build the reply based only on the parameters
+ *		that arrived with the segment.
+ *	Exception: precedence violation. We do not implement it in any case.
+ */
+
+#ifdef CONFIG_TCP_MD5SIG
+#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
+#else
+#define OPTION_BYTES sizeof(__be32)
+#endif
+
+static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	struct {
+		struct tcphdr th;
+		__be32 opt[OPTION_BYTES / sizeof(__be32)];
+	} rep;
+	struct ip_reply_arg arg;
+#ifdef CONFIG_TCP_MD5SIG
+	struct tcp_md5sig_key *key = NULL;
+	const __u8 *hash_location = NULL;
+	unsigned char newhash[16];
+	int genhash;
+	struct sock *sk1 = NULL;
+#endif
+	u64 transmit_time = 0;
+	struct sock *ctl_sk;
+	struct net *net;
+	u32 txhash = 0;
+
+	/* Never send a reset in response to a reset. */
+	if (th->rst)
+		return;
+
+	/* If sk not NULL, it means we did a successful lookup and incoming
+	 * route had to be correct. prequeue might have dropped our dst.
+	 */
+	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
+		return;
+
+	/* Swap the send and the receive. */
+	memset(&rep, 0, sizeof(rep));
+	rep.th.dest   = th->source;
+	rep.th.source = th->dest;
+	rep.th.doff   = sizeof(struct tcphdr) / 4;
+	rep.th.rst    = 1;
+
+	if (th->ack) {
+		rep.th.seq = th->ack_seq;
+	} else {
+		rep.th.ack = 1;
+		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
+				       skb->len - (th->doff << 2));
+	}
+
+	memset(&arg, 0, sizeof(arg));
+	arg.iov[0].iov_base = (unsigned char *)&rep;
+	arg.iov[0].iov_len  = sizeof(rep.th);
+
+	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
+#ifdef CONFIG_TCP_MD5SIG
+	rcu_read_lock();
+	hash_location = tcp_parse_md5sig_option(th);
+	if (sk && sk_fullsock(sk)) {
+		const union tcp_md5_addr *addr;
+		int l3index;
+
+		/* sdif set, means packet ingressed via a device
+		 * in an L3 domain and inet_iif is set to it.
+		 */
+		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+	} else if (hash_location) {
+		const union tcp_md5_addr *addr;
+		int sdif = tcp_v4_sdif(skb);
+		int dif = inet_iif(skb);
+		int l3index;
+
+		/*
+		 * Active side is lost. Try to find a listening socket through
+		 * the source port, and then find the md5 key through it.
+		 * We do not lose security here:
+		 * the incoming packet is checked against the md5 hash of the
+		 * key found, and no RST is generated if the hash doesn't match.
+		 */
+		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
+					     NULL, 0, ip_hdr(skb)->saddr,
+					     th->source, ip_hdr(skb)->daddr,
+					     ntohs(th->source), dif, sdif);
+		/* don't send rst if it can't find key */
+		if (!sk1)
+			goto out;
+
+		/* sdif set, means packet ingressed via a device
+		 * in an L3 domain and dif is set to it.
+		 */
+		l3index = sdif ? dif : 0;
+		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
+		if (!key)
+			goto out;
+
+
+		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+		if (genhash || memcmp(hash_location, newhash, 16) != 0)
+			goto out;
+
+	}
+
+	if (key) {
+		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
+				   (TCPOPT_NOP << 16) |
+				   (TCPOPT_MD5SIG << 8) |
+				   TCPOLEN_MD5SIG);
+		/* Update length and the length the header thinks exists */
+		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
+		rep.th.doff = arg.iov[0].iov_len / 4;
+
+		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
+				     key, ip_hdr(skb)->saddr,
+				     ip_hdr(skb)->daddr, &rep.th);
+	}
+#endif
+	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
+	if (rep.opt[0] == 0) {
+		__be32 mrst = mptcp_reset_option(skb);
+
+		if (mrst) {
+			rep.opt[0] = mrst;
+			arg.iov[0].iov_len += sizeof(mrst);
+			rep.th.doff = arg.iov[0].iov_len / 4;
+		}
+	}
+
+	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
+				      ip_hdr(skb)->saddr, /* XXX */
+				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
+	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
+
+	/* When socket is gone, all binding information is lost.
+	 * routing might fail in this case. No choice here, if we choose to force
+	 * input interface, we will misroute in case of asymmetric route.
+	 */
+	if (sk) {
+		arg.bound_dev_if = sk->sk_bound_dev_if;
+		if (sk_fullsock(sk))
+			trace_tcp_send_reset(sk, skb);
+	}
+
+	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
+		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
+
+	arg.tos = ip_hdr(skb)->tos;
+	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
+	local_bh_disable();
+	ctl_sk = this_cpu_read(ipv4_tcp_sk);
+	sock_net_set(ctl_sk, net);
+	if (sk) {
+		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+				   inet_twsk(sk)->tw_priority : sk->sk_priority;
+		transmit_time = tcp_transmit_time(sk);
+		xfrm_sk_clone_policy(ctl_sk, sk);
+		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
+			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
+	} else {
+		ctl_sk->sk_mark = 0;
+		ctl_sk->sk_priority = 0;
+	}
+	ip_send_unicast_reply(ctl_sk,
+			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
+			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
+			      &arg, arg.iov[0].iov_len,
+			      transmit_time, txhash);
+
+	xfrm_sk_free_policy(ctl_sk);
+	sock_net_set(ctl_sk, &init_net);
+	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
+	local_bh_enable();
+
+#ifdef CONFIG_TCP_MD5SIG
+out:
+	rcu_read_unlock();
+#endif
+}
+
+/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
+ * outside socket context, is certainly ugly. What can I do?
+ */
+
+static void tcp_v4_send_ack(const struct sock *sk,
+			    struct sk_buff *skb, u32 seq, u32 ack,
+			    u32 win, u32 tsval, u32 tsecr, int oif,
+			    struct tcp_md5sig_key *key,
+			    int reply_flags, u8 tos, u32 txhash)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	struct {
+		struct tcphdr th;
+		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
+#ifdef CONFIG_TCP_MD5SIG
+			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
+#endif
+			];
+	} rep;
+	struct net *net = sock_net(sk);
+	struct ip_reply_arg arg;
+	struct sock *ctl_sk;
+	u64 transmit_time;
+
+	memset(&rep.th, 0, sizeof(struct tcphdr));
+	memset(&arg, 0, sizeof(arg));
+
+	arg.iov[0].iov_base = (unsigned char *)&rep;
+	arg.iov[0].iov_len  = sizeof(rep.th);
+	if (tsecr) {
+		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+				   (TCPOPT_TIMESTAMP << 8) |
+				   TCPOLEN_TIMESTAMP);
+		rep.opt[1] = htonl(tsval);
+		rep.opt[2] = htonl(tsecr);
+		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
+	}
+
+	/* Swap the send and the receive. */
+	rep.th.dest    = th->source;
+	rep.th.source  = th->dest;
+	rep.th.doff    = arg.iov[0].iov_len / 4;
+	rep.th.seq     = htonl(seq);
+	rep.th.ack_seq = htonl(ack);
+	rep.th.ack     = 1;
+	rep.th.window  = htons(win);
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (key) {
+		int offset = (tsecr) ? 3 : 0;
+
+		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
+					  (TCPOPT_NOP << 16) |
+					  (TCPOPT_MD5SIG << 8) |
+					  TCPOLEN_MD5SIG);
+		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
+		rep.th.doff = arg.iov[0].iov_len/4;
+
+		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
+				    key, ip_hdr(skb)->saddr,
+				    ip_hdr(skb)->daddr, &rep.th);
+	}
+#endif
+	arg.flags = reply_flags;
+	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
+				      ip_hdr(skb)->saddr, /* XXX */
+				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
+	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+	if (oif)
+		arg.bound_dev_if = oif;
+	arg.tos = tos;
+	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
+	local_bh_disable();
+	ctl_sk = this_cpu_read(ipv4_tcp_sk);
+	sock_net_set(ctl_sk, net);
+	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
+	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
+	transmit_time = tcp_transmit_time(sk);
+	ip_send_unicast_reply(ctl_sk,
+			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
+			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
+			      &arg, arg.iov[0].iov_len,
+			      transmit_time, txhash);
+
+	sock_net_set(ctl_sk, &init_net);
+	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+	local_bh_enable();
+}
+
+static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+{
+	struct inet_timewait_sock *tw = inet_twsk(sk);
+	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+
+	tcp_v4_send_ack(sk, skb,
+			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
+			tcptw->tw_ts_recent,
+			tw->tw_bound_dev_if,
+			tcp_twsk_md5_key(tcptw),
+			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
+			tw->tw_tos,
+			tw->tw_txhash
+			);
+
+	inet_twsk_put(tw);
+}
+
+static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
+				  struct request_sock *req)
+{
+	const union tcp_md5_addr *addr;
+	int l3index;
+
+	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+	 */
+	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
+					     tcp_sk(sk)->snd_nxt;
+
+	/* RFC 7323 2.3
+	 * The window field (SEG.WND) of every outgoing segment, with the
+	 * exception of <SYN> segments, MUST be right-shifted by
+	 * Rcv.Wind.Shift bits:
+	 */
+	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+	tcp_v4_send_ack(sk, skb, seq,
+			tcp_rsk(req)->rcv_nxt,
+			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
+			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
+			READ_ONCE(req->ts_recent),
+			0,
+			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
+			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
+			ip_hdr(skb)->tos,
+			READ_ONCE(tcp_rsk(req)->txhash));
+}
+
+/*
+ *	Send a SYN-ACK after having received a SYN.
+ *	This still operates on a request_sock only, not on a big
+ *	socket.
+ */
+static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
+			      struct flowi *fl,
+			      struct request_sock *req,
+			      struct tcp_fastopen_cookie *foc,
+			      enum tcp_synack_type synack_type,
+			      struct sk_buff *syn_skb)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct flowi4 fl4;
+	int err = -1;
+	struct sk_buff *skb;
+	u8 tos;
+
+	/* First, grab a route. */
+	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+		return -1;
+
+	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
+
+	if (skb) {
+		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
+
+		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
+				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+				(inet_sk(sk)->tos & INET_ECN_MASK) :
+				inet_sk(sk)->tos;
+
+		if (!INET_ECN_is_capable(tos) &&
+		    tcp_bpf_ca_needs_ecn((struct sock *)req))
+			tos |= INET_ECN_ECT_0;
+
+		rcu_read_lock();
+		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+					    ireq->ir_rmt_addr,
+					    rcu_dereference(ireq->ireq_opt),
+					    tos);
+		rcu_read_unlock();
+		err = net_xmit_eval(err);
+	}
+
+	return err;
+}
+
+/*
+ *	IPv4 request_sock destructor.
+ */
+static void tcp_v4_reqsk_destructor(struct request_sock *req)
+{
+	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
+}
+
+#ifdef CONFIG_TCP_MD5SIG
+/*
+ * RFC2385 MD5 checksumming requires a mapping of
+ * IP address->MD5 Key.
+ * We need to maintain these in the sk structure.
+ */
+
+DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
+EXPORT_SYMBOL(tcp_md5_needed);
+
+static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
+{
+	if (!old)
+		return true;
+
+	/* l3index always overrides non-l3index */
+	if (old->l3index && new->l3index == 0)
+		return false;
+	if (old->l3index == 0 && new->l3index)
+		return true;
+
+	return old->prefixlen < new->prefixlen;
+}
+
+/* Find the Key structure for an address.  */
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
+					   const union tcp_md5_addr *addr,
+					   int family)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_md5sig_key *key;
+	const struct tcp_md5sig_info *md5sig;
+	__be32 mask;
+	struct tcp_md5sig_key *best_match = NULL;
+	bool match;
+
+	/* caller either holds rcu_read_lock() or socket lock */
+	md5sig = rcu_dereference_check(tp->md5sig_info,
+				       lockdep_sock_is_held(sk));
+	if (!md5sig)
+		return NULL;
+
+	hlist_for_each_entry_rcu(key, &md5sig->head, node,
+				 lockdep_sock_is_held(sk)) {
+		if (key->family != family)
+			continue;
+		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
+			continue;
+		if (family == AF_INET) {
+			mask = inet_make_mask(key->prefixlen);
+			match = (key->addr.a4.s_addr & mask) ==
+				(addr->a4.s_addr & mask);
+#if IS_ENABLED(CONFIG_IPV6)
+		} else if (family == AF_INET6) {
+			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
+						  key->prefixlen);
+#endif
+		} else {
+			match = false;
+		}
+
+		if (match && better_md5_match(best_match, key))
+			best_match = key;
+	}
+	return best_match;
+}
+EXPORT_SYMBOL(__tcp_md5_do_lookup);
+
+static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
+						      const union tcp_md5_addr *addr,
+						      int family, u8 prefixlen,
+						      int l3index, u8 flags)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_md5sig_key *key;
+	unsigned int size = sizeof(struct in_addr);
+	const struct tcp_md5sig_info *md5sig;
+
+	/* caller either holds rcu_read_lock() or socket lock */
+	md5sig = rcu_dereference_check(tp->md5sig_info,
+				       lockdep_sock_is_held(sk));
+	if (!md5sig)
+		return NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+	if (family == AF_INET6)
+		size = sizeof(struct in6_addr);
+#endif
+	hlist_for_each_entry_rcu(key, &md5sig->head, node,
+				 lockdep_sock_is_held(sk)) {
+		if (key->family != family)
+			continue;
+		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
+			continue;
+		if (key->l3index != l3index)
+			continue;
+		if (!memcmp(&key->addr, addr, size) &&
+		    key->prefixlen == prefixlen)
+			return key;
+	}
+	return NULL;
+}
+
+struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
+					 const struct sock *addr_sk)
+{
+	const union tcp_md5_addr *addr;
+	int l3index;
+
+	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+						 addr_sk->sk_bound_dev_if);
+	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
+	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+}
+EXPORT_SYMBOL(tcp_v4_md5_lookup);
+
+static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_md5sig_info *md5sig;
+
+	md5sig = kmalloc(sizeof(*md5sig), gfp);
+	if (!md5sig)
+		return -ENOMEM;
+
+	sk_gso_disable(sk);
+	INIT_HLIST_HEAD(&md5sig->head);
+	rcu_assign_pointer(tp->md5sig_info, md5sig);
+	return 0;
+}
+
+/* This can be called on a newly created socket, from other files */
+static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
+			    int family, u8 prefixlen, int l3index, u8 flags,
+			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
+{
+	/* Add Key to the list */
+	struct tcp_md5sig_key *key;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_md5sig_info *md5sig;
+
+	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
+	if (key) {
+		/* Pre-existing entry - just update that one.
+		 * Note that the key might be used concurrently.
+		 * data_race() tells KCSAN that we do not care about
+		 * key mismatches, since changing an MD5 key on live flows
+		 * can lead to packet drops.
+		 */
+		data_race(memcpy(key->key, newkey, newkeylen));
+
+		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
+		 * Also note that a reader could catch new key->keylen value
+		 * but old key->key[], this is the reason we use __GFP_ZERO
+		 * at sock_kmalloc() time below these lines.
+		 */
+		WRITE_ONCE(key->keylen, newkeylen);
+
+		return 0;
+	}
+
+	md5sig = rcu_dereference_protected(tp->md5sig_info,
+					   lockdep_sock_is_held(sk));
+
+	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
+	if (!key)
+		return -ENOMEM;
+	if (!tcp_alloc_md5sig_pool()) {
+		sock_kfree_s(sk, key, sizeof(*key));
+		return -ENOMEM;
+	}
+
+	memcpy(key->key, newkey, newkeylen);
+	key->keylen = newkeylen;
+	key->family = family;
+	key->prefixlen = prefixlen;
+	key->l3index = l3index;
+	key->flags = flags;
+	memcpy(&key->addr, addr,
+	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
+								 sizeof(struct in_addr));
+	hlist_add_head_rcu(&key->node, &md5sig->head);
+	return 0;
+}
+
+int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
+		   int family, u8 prefixlen, int l3index, u8 flags,
+		   const u8 *newkey, u8 newkeylen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
+		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
+			return -ENOMEM;
+
+		if (!static_branch_inc(&tcp_md5_needed.key)) {
+			struct tcp_md5sig_info *md5sig;
+
+			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
+			rcu_assign_pointer(tp->md5sig_info, NULL);
+			kfree_rcu(md5sig, rcu);
+			return -EUSERS;
+		}
+	}
+
+	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
+				newkey, newkeylen, GFP_KERNEL);
+}
+EXPORT_SYMBOL(tcp_md5_do_add);
+
+int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
+		     int family, u8 prefixlen, int l3index,
+		     struct tcp_md5sig_key *key)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
+		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
+			return -ENOMEM;
+
+		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
+			struct tcp_md5sig_info *md5sig;
+
+			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
+			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
+			rcu_assign_pointer(tp->md5sig_info, NULL);
+			kfree_rcu(md5sig, rcu);
+			return -EUSERS;
+		}
+	}
+
+	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
+				key->flags, key->key, key->keylen,
+				sk_gfp_mask(sk, GFP_ATOMIC));
+}
+EXPORT_SYMBOL(tcp_md5_key_copy);
+
+int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
+		   u8 prefixlen, int l3index, u8 flags)
+{
+	struct tcp_md5sig_key *key;
+
+	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
+	if (!key)
+		return -ENOENT;
+	hlist_del_rcu(&key->node);
+	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+	kfree_rcu(key, rcu);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_md5_do_del);
+
+static void tcp_clear_md5_list(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_md5sig_key *key;
+	struct hlist_node *n;
+	struct tcp_md5sig_info *md5sig;
+
+	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
+
+	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
+		hlist_del_rcu(&key->node);
+		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+		kfree_rcu(key, rcu);
+	}
+}
+
+static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
+				 sockptr_t optval, int optlen)
+{
+	struct tcp_md5sig cmd;
+	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
+	const union tcp_md5_addr *addr;
+	u8 prefixlen = 32;
+	int l3index = 0;
+	u8 flags;
+
+	if (optlen < sizeof(cmd))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
+		return -EFAULT;
+
+	if (sin->sin_family != AF_INET)
+		return -EINVAL;
+
+	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
+
+	if (optname == TCP_MD5SIG_EXT &&
+	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
+		prefixlen = cmd.tcpm_prefixlen;
+		if (prefixlen > 32)
+			return -EINVAL;
+	}
+
+	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
+	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
+		struct net_device *dev;
+
+		rcu_read_lock();
+		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
+		if (dev && netif_is_l3_master(dev))
+			l3index = dev->ifindex;
+
+		rcu_read_unlock();
+
+		/* ok to reference set/not set outside of rcu;
+		 * right now device MUST be an L3 master
+		 */
+		if (!dev || !l3index)
+			return -EINVAL;
+	}
+
+	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
+
+	if (!cmd.tcpm_keylen)
+		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
+
+	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
+		return -EINVAL;
+
+	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
+			      cmd.tcpm_key, cmd.tcpm_keylen);
+}
+
+static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
+				   __be32 daddr, __be32 saddr,
+				   const struct tcphdr *th, int nbytes)
+{
+	struct tcp4_pseudohdr *bp;
+	struct scatterlist sg;
+	struct tcphdr *_th;
+
+	bp = hp->scratch;
+	bp->saddr = saddr;
+	bp->daddr = daddr;
+	bp->pad = 0;
+	bp->protocol = IPPROTO_TCP;
+	bp->len = cpu_to_be16(nbytes);
+
+	_th = (struct tcphdr *)(bp + 1);
+	memcpy(_th, th, sizeof(*th));
+	_th->check = 0;
+
+	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
+	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
+				sizeof(*bp) + sizeof(*th));
+	return crypto_ahash_update(hp->md5_req);
+}
+
+static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
+{
+	struct tcp_md5sig_pool *hp;
+	struct ahash_request *req;
+
+	hp = tcp_get_md5sig_pool();
+	if (!hp)
+		goto clear_hash_noput;
+	req = hp->md5_req;
+
+	if (crypto_ahash_init(req))
+		goto clear_hash;
+	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
+		goto clear_hash;
+	if (tcp_md5_hash_key(hp, key))
+		goto clear_hash;
+	ahash_request_set_crypt(req, NULL, md5_hash, 0);
+	if (crypto_ahash_final(req))
+		goto clear_hash;
+
+	tcp_put_md5sig_pool();
+	return 0;
+
+clear_hash:
+	tcp_put_md5sig_pool();
+clear_hash_noput:
+	memset(md5_hash, 0, 16);
+	return 1;
+}
+
+int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+			const struct sock *sk,
+			const struct sk_buff *skb)
+{
+	struct tcp_md5sig_pool *hp;
+	struct ahash_request *req;
+	const struct tcphdr *th = tcp_hdr(skb);
+	__be32 saddr, daddr;
+
+	if (sk) { /* valid for establish/request sockets */
+		saddr = sk->sk_rcv_saddr;
+		daddr = sk->sk_daddr;
+	} else {
+		const struct iphdr *iph = ip_hdr(skb);
+		saddr = iph->saddr;
+		daddr = iph->daddr;
+	}
+
+	hp = tcp_get_md5sig_pool();
+	if (!hp)
+		goto clear_hash_noput;
+	req = hp->md5_req;
+
+	if (crypto_ahash_init(req))
+		goto clear_hash;
+
+	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
+		goto clear_hash;
+	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
+		goto clear_hash;
+	if (tcp_md5_hash_key(hp, key))
+		goto clear_hash;
+	ahash_request_set_crypt(req, NULL, md5_hash, 0);
+	if (crypto_ahash_final(req))
+		goto clear_hash;
+
+	tcp_put_md5sig_pool();
+	return 0;
+
+clear_hash:
+	tcp_put_md5sig_pool();
+clear_hash_noput:
+	memset(md5_hash, 0, 16);
+	return 1;
+}
+EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
+
+#endif
+
+static void tcp_v4_init_req(struct request_sock *req,
+			    const struct sock *sk_listener,
+			    struct sk_buff *skb)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+	struct net *net = sock_net(sk_listener);
+
+	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
+}
+
+static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
+					  struct sk_buff *skb,
+					  struct flowi *fl,
+					  struct request_sock *req)
+{
+	tcp_v4_init_req(req, sk, skb);
+
+	if (security_inet_conn_request(sk, skb, req))
+		return NULL;
+
+	return inet_csk_route_req(sk, &fl->u.ip4, req);
+}
+
+struct request_sock_ops tcp_request_sock_ops __read_mostly = {
+	.family		=	PF_INET,
+	.obj_size	=	sizeof(struct tcp_request_sock),
+	.rtx_syn_ack	=	tcp_rtx_synack,
+	.send_ack	=	tcp_v4_reqsk_send_ack,
+	.destructor	=	tcp_v4_reqsk_destructor,
+	.send_reset	=	tcp_v4_send_reset,
+	.syn_ack_timeout =	tcp_syn_ack_timeout,
+};
+
+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+	.mss_clamp	=	TCP_MSS_DEFAULT,
+#ifdef CONFIG_TCP_MD5SIG
+	.req_md5_lookup	=	tcp_v4_md5_lookup,
+	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
+#endif
+#ifdef CONFIG_SYN_COOKIES
+	.cookie_init_seq =	cookie_v4_init_sequence,
+#endif
+	.route_req	=	tcp_v4_route_req,
+	.init_seq	=	tcp_v4_init_seq,
+	.init_ts_off	=	tcp_v4_init_ts_off,
+	.send_synack	=	tcp_v4_send_synack,
+};
+
+int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+	/* Never answer SYNs sent to broadcast or multicast */
+	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		goto drop;
+
+	return tcp_conn_request(&tcp_request_sock_ops,
+				&tcp_request_sock_ipv4_ops, sk, skb);
+
+drop:
+	tcp_listendrop(sk);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_v4_conn_request);
+
+
+/*
+ * The three-way handshake has completed - we got a valid ACK of our
+ * SYN-ACK - now create the new socket.
+ */
+struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
+				  struct request_sock *req,
+				  struct dst_entry *dst,
+				  struct request_sock *req_unhash,
+				  bool *own_req)
+{
+	struct inet_request_sock *ireq;
+	bool found_dup_sk = false;
+	struct inet_sock *newinet;
+	struct tcp_sock *newtp;
+	struct sock *newsk;
+#ifdef CONFIG_TCP_MD5SIG
+	const union tcp_md5_addr *addr;
+	struct tcp_md5sig_key *key;
+	int l3index;
+#endif
+	struct ip_options_rcu *inet_opt;
+
+	if (sk_acceptq_is_full(sk))
+		goto exit_overflow;
+
+	newsk = tcp_create_openreq_child(sk, req, skb);
+	if (!newsk)
+		goto exit_nonewsk;
+
+	newsk->sk_gso_type = SKB_GSO_TCPV4;
+	inet_sk_rx_dst_set(newsk, skb);
+
+	newtp		      = tcp_sk(newsk);
+	newinet		      = inet_sk(newsk);
+	ireq		      = inet_rsk(req);
+	sk_daddr_set(newsk, ireq->ir_rmt_addr);
+	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
+	newsk->sk_bound_dev_if = ireq->ir_iif;
+	newinet->inet_saddr   = ireq->ir_loc_addr;
+	inet_opt	      = rcu_dereference(ireq->ireq_opt);
+	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
+	newinet->mc_index     = inet_iif(skb);
+	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
+	newinet->rcv_tos      = ip_hdr(skb)->tos;
+	inet_csk(newsk)->icsk_ext_hdr_len = 0;
+	if (inet_opt)
+		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+	atomic_set(&newinet->inet_id, get_random_u16());
+
+	/* Set ToS of the new socket based upon the value of incoming SYN.
+	 * ECT bits are set later in tcp_init_transfer().
+	 */
+	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
+	if (!dst) {
+		dst = inet_csk_route_child_sock(sk, newsk, req);
+		if (!dst)
+			goto put_and_exit;
+	} else {
+		/* syncookie case : see end of cookie_v4_check() */
+	}
+	sk_setup_caps(newsk, dst);
+
+	tcp_ca_openreq_child(newsk, dst);
+
+	tcp_sync_mss(newsk, dst_mtu(dst));
+	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
+
+	tcp_initialize_rcv_mss(newsk);
+
+#ifdef CONFIG_TCP_MD5SIG
+	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
+	/* Copy over the MD5 key from the original socket */
+	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
+	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+	if (key) {
+		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
+			goto put_and_exit;
+		sk_gso_disable(newsk);
+	}
+#endif
+
+	if (__inet_inherit_port(sk, newsk) < 0)
+		goto put_and_exit;
+	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
+				       &found_dup_sk);
+	if (likely(*own_req)) {
+		tcp_move_syn(newtp, req);
+		ireq->ireq_opt = NULL;
+	} else {
+		newinet->inet_opt = NULL;
+
+		if (!req_unhash && found_dup_sk) {
+			/* This code path should be executed in the
+			 * syncookie case only
+			 */
+			bh_unlock_sock(newsk);
+			sock_put(newsk);
+			newsk = NULL;
+		}
+	}
+	return newsk;
+
+exit_overflow:
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+	dst_release(dst);
+exit:
+	tcp_listendrop(sk);
+	return NULL;
+put_and_exit:
+	newinet->inet_opt = NULL;
+	inet_csk_prepare_forced_close(newsk);
+	tcp_done(newsk);
+	goto exit;
+}
+EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
+
+static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
+{
+#ifdef CONFIG_SYN_COOKIES
+	const struct tcphdr *th = tcp_hdr(skb);
+
+	if (!th->syn)
+		sk = cookie_v4_check(sk, skb);
+#endif
+	return sk;
+}
+
+u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
+			 struct tcphdr *th, u32 *cookie)
+{
+	u16 mss = 0;
+#ifdef CONFIG_SYN_COOKIES
+	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
+				    &tcp_request_sock_ipv4_ops, sk, th);
+	if (mss) {
+		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
+		tcp_synq_overflow(sk);
+	}
+#endif
+	return mss;
+}
+
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
+							   u32));
+/* The socket must have its spinlock held when we get
+ * here, unless it is a TCP_LISTEN socket.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
+int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	enum skb_drop_reason reason;
+	struct sock *rsk;
+
+	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+		struct dst_entry *dst;
+
+		dst = rcu_dereference_protected(sk->sk_rx_dst,
+						lockdep_sock_is_held(sk));
+
+		sock_rps_save_rxhash(sk, skb);
+		sk_mark_napi_id(sk, skb);
+		if (dst) {
+			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
+			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
+					     dst, 0)) {
+				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
+				dst_release(dst);
+			}
+		}
+		tcp_rcv_established(sk, skb);
+		return 0;
+	}
+
+	reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	if (tcp_checksum_complete(skb))
+		goto csum_err;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
+
+		if (!nsk)
+			goto discard;
+		if (nsk != sk) {
+			if (tcp_child_process(sk, nsk, skb)) {
+				rsk = nsk;
+				goto reset;
+			}
+			return 0;
+		}
+	} else
+		sock_rps_save_rxhash(sk, skb);
+
+	if (tcp_rcv_state_process(sk, skb)) {
+		rsk = sk;
+		goto reset;
+	}
+	return 0;
+
+reset:
+	tcp_v4_send_reset(rsk, skb);
+discard:
+	kfree_skb_reason(skb, reason);
+	/* Be careful here. If this function gets more complicated and
+	 * gcc suffers from register pressure on the x86, sk (in %ebx)
+	 * might be destroyed here. This current version compiles correctly,
+	 * but you have been warned.
+	 */
+	return 0;
+
+csum_err:
+	reason = SKB_DROP_REASON_TCP_CSUM;
+	trace_tcp_bad_csum(skb);
+	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+	goto discard;
+}
+EXPORT_SYMBOL(tcp_v4_do_rcv);
+
+int tcp_v4_early_demux(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	struct sock *sk;
+
+	if (skb->pkt_type != PACKET_HOST)
+		return 0;
+
+	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
+		return 0;
+
+	iph = ip_hdr(skb);
+	th = tcp_hdr(skb);
+
+	if (th->doff < sizeof(struct tcphdr) / 4)
+		return 0;
+
+	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+				       iph->saddr, th->source,
+				       iph->daddr, ntohs(th->dest),
+				       skb->skb_iif, inet_sdif(skb));
+	if (sk) {
+		skb->sk = sk;
+		skb->destructor = sock_edemux;
+		if (sk_fullsock(sk)) {
+			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
+
+			if (dst)
+				dst = dst_check(dst, 0);
+			if (dst &&
+			    sk->sk_rx_dst_ifindex == skb->skb_iif)
+				skb_dst_set_noref(skb, dst);
+		}
+	}
+	return 0;
+}
+
+bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
+		     enum skb_drop_reason *reason)
+{
+	u32 limit, tail_gso_size, tail_gso_segs;
+	struct skb_shared_info *shinfo;
+	const struct tcphdr *th;
+	struct tcphdr *thtail;
+	struct sk_buff *tail;
+	unsigned int hdrlen;
+	bool fragstolen;
+	u32 gso_segs;
+	u32 gso_size;
+	int delta;
+
+	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
+	 * we can fix skb->truesize to its real value to avoid future drops.
+	 * This is valid because skb is not yet charged to the socket.
+	 * It has been noticed pure SACK packets were sometimes dropped
+	 * (if cooked by drivers without copybreak feature).
+	 */
+	skb_condense(skb);
+
+	skb_dst_drop(skb);
+
+	if (unlikely(tcp_checksum_complete(skb))) {
+		bh_unlock_sock(sk);
+		trace_tcp_bad_csum(skb);
+		*reason = SKB_DROP_REASON_TCP_CSUM;
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+		return true;
+	}
+
+	/* Attempt coalescing to last skb in backlog, even if we are
+	 * above the limits.
+	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+	 */
+	th = (const struct tcphdr *)skb->data;
+	hdrlen = th->doff * 4;
+
+	tail = sk->sk_backlog.tail;
+	if (!tail)
+		goto no_coalesce;
+	thtail = (struct tcphdr *)tail->data;
+
+	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+	    ((TCP_SKB_CB(tail)->tcp_flags |
+	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
+	    !((TCP_SKB_CB(tail)->tcp_flags &
+	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
+	    ((TCP_SKB_CB(tail)->tcp_flags ^
+	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+	    tail->decrypted != skb->decrypted ||
+#endif
+	    !mptcp_skb_can_collapse(tail, skb) ||
+	    thtail->doff != th->doff ||
+	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+		goto no_coalesce;
+
+	__skb_pull(skb, hdrlen);
+
+	shinfo = skb_shinfo(skb);
+	gso_size = shinfo->gso_size ?: skb->len;
+	gso_segs = shinfo->gso_segs ?: 1;
+
+	shinfo = skb_shinfo(tail);
+	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
+	tail_gso_segs = shinfo->gso_segs ?: 1;
+
+	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
+			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+			thtail->window = th->window;
+		}
+
+		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
+		 * thtail->fin, so that the fast path in tcp_rcv_established()
+		 * is not entered if we append a packet with a FIN.
+		 * SYN, RST, URG are not present.
+		 * ACK is set on both packets.
+		 * PSH : we do not really care in TCP stack,
+		 *       at least for 'GRO' packets.
+		 */
+		thtail->fin |= th->fin;
+		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+		if (TCP_SKB_CB(skb)->has_rxtstamp) {
+			TCP_SKB_CB(tail)->has_rxtstamp = true;
+			tail->tstamp = skb->tstamp;
+			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+		}
+
+		/* Not as strict as GRO. We only need to carry mss max value */
+		shinfo->gso_size = max(gso_size, tail_gso_size);
+		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
+
+		sk->sk_backlog.len += delta;
+		__NET_INC_STATS(sock_net(sk),
+				LINUX_MIB_TCPBACKLOGCOALESCE);
+		kfree_skb_partial(skb, fragstolen);
+		return false;
+	}
+	__skb_push(skb, hdrlen);
+
+no_coalesce:
+	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
+
+	/* Only the socket owner can try to collapse/prune rx queues
+	 * to reduce memory overhead, so add a little headroom here.
+	 * Few socket backlogs are likely to be non-empty concurrently.
+	 */
+	limit += 64 * 1024;
+
+	if (unlikely(sk_add_backlog(sk, skb, limit))) {
+		bh_unlock_sock(sk);
+		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+		return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL(tcp_add_backlog);
+
+int tcp_filter(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcphdr *th = (struct tcphdr *)skb->data;
+
+	return sk_filter_trim_cap(sk, skb, th->doff * 4);
+}
+EXPORT_SYMBOL(tcp_filter);
+
+static void tcp_v4_restore_cb(struct sk_buff *skb)
+{
+	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
+		sizeof(struct inet_skb_parm));
+}
+
+static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
+			   const struct tcphdr *th)
+{
+	/* This is tricky: we move IPCB to its correct location in TCP_SKB_CB().
+	 * barrier() makes sure the compiler won't play fool^Waliasing games.
+	 */
+	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
+		sizeof(struct inet_skb_parm));
+	barrier();
+
+	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+				    skb->len - th->doff * 4);
+	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
+	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
+	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
+	TCP_SKB_CB(skb)->sacked	 = 0;
+	TCP_SKB_CB(skb)->has_rxtstamp =
+			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
+}
+
+/*
+ *	Receive one IPv4 TCP skb (this code originally came from tcp_input.c).
+ */
+
+int tcp_v4_rcv(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	enum skb_drop_reason drop_reason;
+	int sdif = inet_sdif(skb);
+	int dif = inet_iif(skb);
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	bool refcounted;	/* true when a reference on sk must be put on exit */
+	struct sock *sk;
+	int ret;
+
+	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	if (skb->pkt_type != PACKET_HOST)
+		goto discard_it;
+
+	/* Count it even if it's bad */
+	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
+
+	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))	/* fixed header first */
+		goto discard_it;
+
+	th = (const struct tcphdr *)skb->data;
+
+	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {	/* doff is in 32-bit words */
+		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
+		goto bad_packet;
+	}
+	if (!pskb_may_pull(skb, th->doff * 4))	/* then the full doff * 4 bytes */
+		goto discard_it;
+
+	/* An explanation is required here, I think.
+	 * Packet length and doff are validated by header prediction,
+	 * provided the case of th->doff == 0 is eliminated.
+	 * So, we defer the checks. */
+
+	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
+		goto csum_error;
+
+	th = (const struct tcphdr *)skb->data;	/* re-read after the pulls above */
+	iph = ip_hdr(skb);
+lookup:	/* also re-entered when a request sock turned out to be unusable */
+	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
+			       skb, __tcp_hdrlen(th), th->source,
+			       th->dest, sdif, &refcounted);
+	if (!sk)
+		goto no_tcp_socket;
+
+process:	/* also re-entered from do_time_wait with a listener in sk */
+	if (sk->sk_state == TCP_TIME_WAIT)
+		goto do_time_wait;
+
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		struct request_sock *req = inet_reqsk(sk);
+		bool req_stolen = false;
+		struct sock *nsk;
+
+		sk = req->rsk_listener;	/* the checks below run against the listener */
+		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
+		else
+			drop_reason = tcp_inbound_md5_hash(sk, skb,
+						   &iph->saddr, &iph->daddr,
+						   AF_INET, dif, sdif);
+		if (unlikely(drop_reason)) {
+			sk_drops_add(sk, skb);
+			reqsk_put(req);
+			goto discard_it;
+		}
+		if (tcp_checksum_complete(skb)) {
+			reqsk_put(req);
+			goto csum_error;
+		}
+		if (unlikely(sk->sk_state != TCP_LISTEN)) {
+			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+			if (!nsk) {
+				inet_csk_reqsk_queue_drop_and_put(sk, req);
+				goto lookup;
+			}
+			sk = nsk;
+			/* reuseport_migrate_sock() has already held one sk_refcnt
+			 * before returning.
+			 */
+		} else {
+			/* We own a reference on the listener, increase it again
+			 * as we might lose it too soon.
+			 */
+			sock_hold(sk);
+		}
+		refcounted = true;	/* either branch above left us holding a ref on sk */
+		nsk = NULL;
+		if (!tcp_filter(sk, skb)) {
+			th = (const struct tcphdr *)skb->data;
+			iph = ip_hdr(skb);
+			tcp_v4_fill_cb(skb, iph, th);
+			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
+		} else {
+			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+		}
+		if (!nsk) {
+			reqsk_put(req);
+			if (req_stolen) {
+				/* Another cpu got exclusive access to req
+				 * and created a full blown socket.
+				 * Try to feed this packet to this socket
+				 * instead of discarding it.
+				 */
+				tcp_v4_restore_cb(skb);
+				sock_put(sk);
+				goto lookup;
+			}
+			goto discard_and_relse;
+		}
+		nf_reset_ct(skb);
+		if (nsk == sk) {	/* got the listener back: continue with it below */
+			reqsk_put(req);
+			tcp_v4_restore_cb(skb);
+		} else if (tcp_child_process(sk, nsk, skb)) {	/* non-zero: reset and drop */
+			tcp_v4_send_reset(nsk, skb);
+			goto discard_and_relse;
+		} else {
+			sock_put(sk);
+			return 0;
+		}
+	}
+
+	if (static_branch_unlikely(&ip4_min_ttl)) {
+		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
+		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
+			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
+			goto discard_and_relse;
+		}
+	}
+
+	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
+		goto discard_and_relse;
+	}
+
+	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
+					   &iph->daddr, AF_INET, dif, sdif);
+	if (drop_reason)
+		goto discard_and_relse;
+
+	nf_reset_ct(skb);
+
+	if (tcp_filter(sk, skb)) {
+		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+		goto discard_and_relse;
+	}
+	th = (const struct tcphdr *)skb->data;	/* re-read after tcp_filter() */
+	iph = ip_hdr(skb);
+	tcp_v4_fill_cb(skb, iph, th);
+
+	skb->dev = NULL;
+
+	if (sk->sk_state == TCP_LISTEN) {	/* listeners are handled without bh_lock_sock */
+		ret = tcp_v4_do_rcv(sk, skb);
+		goto put_and_return;
+	}
+
+	sk_incoming_cpu_update(sk);
+
+	bh_lock_sock_nested(sk);
+	tcp_segs_in(tcp_sk(sk), skb);
+	ret = 0;
+	if (!sock_owned_by_user(sk)) {	/* process now, otherwise queue on the backlog */
+		ret = tcp_v4_do_rcv(sk, skb);
+	} else {
+		if (tcp_add_backlog(sk, skb, &drop_reason))	/* true: dropped, sk unlocked there */
+			goto discard_and_relse;
+	}
+	bh_unlock_sock(sk);
+
+put_and_return:
+	if (refcounted)
+		sock_put(sk);
+
+	return ret;
+
+no_tcp_socket:	/* lookup found nothing: count an error or answer with a reset */
+	drop_reason = SKB_DROP_REASON_NO_SOCKET;
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+		goto discard_it;
+
+	tcp_v4_fill_cb(skb, iph, th);
+
+	if (tcp_checksum_complete(skb)) {
+csum_error:	/* jumped to from several places above and below */
+		drop_reason = SKB_DROP_REASON_TCP_CSUM;
+		trace_tcp_bad_csum(skb);
+		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
+bad_packet:
+		__TCP_INC_STATS(net, TCP_MIB_INERRS);
+	} else {
+		tcp_v4_send_reset(NULL, skb);
+	}
+
+discard_it:	/* free the skb with drop_reason; no socket reference is touched */
+	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
+	/* Discard frame. */
+	kfree_skb_reason(skb, drop_reason);
+	return 0;
+
+discard_and_relse:	/* account the drop on sk, put our reference, then discard */
+	sk_drops_add(sk, skb);
+	if (refcounted)
+		sock_put(sk);
+	goto discard_it;
+
+do_time_wait:	/* sk is a timewait sock here; each exit path puts it via inet_twsk_*() */
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
+		inet_twsk_put(inet_twsk(sk));
+		goto discard_it;
+	}
+
+	tcp_v4_fill_cb(skb, iph, th);
+
+	if (tcp_checksum_complete(skb)) {
+		inet_twsk_put(inet_twsk(sk));
+		goto csum_error;
+	}
+	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+	case TCP_TW_SYN: {
+		struct sock *sk2 = inet_lookup_listener(net,
+							net->ipv4.tcp_death_row.hashinfo,
+							skb, __tcp_hdrlen(th),
+							iph->saddr, th->source,
+							iph->daddr, th->dest,
+							inet_iif(skb),
+							sdif);
+		if (sk2) {
+			inet_twsk_deschedule_put(inet_twsk(sk));
+			sk = sk2;
+			tcp_v4_restore_cb(skb);	/* process: will fill the cb again */
+			refcounted = false;	/* so no sock_put() is done on sk2 */
+			goto process;
+		}
+	}
+		/* to ACK */
+		fallthrough;
+	case TCP_TW_ACK:
+		tcp_v4_timewait_ack(sk, skb);
+		break;
+	case TCP_TW_RST:
+		tcp_v4_send_reset(sk, skb);
+		inet_twsk_deschedule_put(inet_twsk(sk));
+		goto discard_it;
+	case TCP_TW_SUCCESS:;
+	}
+	goto discard_it;
+}
+
+static struct timewait_sock_ops tcp_timewait_sock_ops = {
+	.twsk_destructor = tcp_twsk_destructor,	/* wired up as tcp_prot.twsk_prot */
+	.twsk_unique = tcp_twsk_unique,
+	.twsk_obj_size = sizeof(struct tcp_timewait_sock),
+};
+
+void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+{
+	struct dst_entry *rx_dst = skb_dst(skb);
+
+	if (!rx_dst || !dst_hold_safe(rx_dst))
+		return;	/* no dst on the skb, or dst_hold_safe() refused it */
+	rcu_assign_pointer(sk->sk_rx_dst, rx_dst);
+	sk->sk_rx_dst_ifindex = skb->skb_iif;
+}
+EXPORT_SYMBOL(inet_sk_rx_dst_set);
+
+const struct inet_connection_sock_af_ops ipv4_specific = {
+	.net_header_len = sizeof(struct iphdr),	/* installed by tcp_v4_init_sock() */
+	.sockaddr_len = sizeof(struct sockaddr_in),
+	.addr2sockaddr = inet_csk_addr2sockaddr,
+	.queue_xmit = ip_queue_xmit,
+	.send_check = tcp_v4_send_check,
+	.rebuild_header = inet_sk_rebuild_header,
+	.mtu_reduced = tcp_v4_mtu_reduced,
+	.sk_rx_dst_set = inet_sk_rx_dst_set,
+	.conn_request = tcp_v4_conn_request,
+	.syn_recv_sock = tcp_v4_syn_recv_sock,
+	.setsockopt = ip_setsockopt,
+	.getsockopt = ip_getsockopt,
+};
+EXPORT_SYMBOL(ipv4_specific);
+
+#ifdef CONFIG_TCP_MD5SIG
+static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
+	.md5_parse = tcp_v4_parse_md5_keys,	/* installed by tcp_v4_init_sock() */
+	.md5_lookup = tcp_v4_md5_lookup,
+	.calc_md5_hash = tcp_v4_md5_hash_skb,
+};
+#endif
+
+/* Protocol .init hook of tcp_prot for IPv4 TCP sockets.
+ * NOTE: a lot of fields were already zeroed by sk_alloc(), so they are
+ *       not set again here.  Always returns 0.
+ */
+static int tcp_v4_init_sock(struct sock *sk)
+{
+	/* Common tcp_init_sock() setup comes first ... */
+	tcp_init_sock(sk);
+
+	/* ... then the IPv4 flavour of the operation tables is installed. */
+	inet_csk(sk)->icsk_af_ops = &ipv4_specific;
+#ifdef CONFIG_TCP_MD5SIG
+	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
+#endif
+
+	return 0;
+}
+
+void tcp_v4_destroy_sock(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	trace_tcp_destroy_sock(sk);
+	/* .destroy hook of tcp_prot: release what TCP attached to the socket. */
+	tcp_clear_xmit_timers(sk);
+
+	tcp_cleanup_congestion_control(sk);
+
+	tcp_cleanup_ulp(sk);
+
+	/* Clean up the write buffer. */
+	tcp_write_queue_purge(sk);
+
+	/* Check if we want to disable active TFO */
+	tcp_fastopen_active_disable_ofo_check(sk);
+
+	/* Cleans up our, hopefully empty, out_of_order_queue. */
+	skb_rbtree_purge(&tp->out_of_order_queue);
+
+#ifdef CONFIG_TCP_MD5SIG
+	/* Clean up the MD5 key list, if any */
+	if (tp->md5sig_info) {
+		tcp_clear_md5_list(sk);
+		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);	/* RCU-deferred free */
+		tp->md5sig_info = NULL;
+		static_branch_slow_dec_deferred(&tcp_md5_needed);
+	}
+#endif
+
+	/* Clean up a referenced TCP bind bucket. */
+	if (inet_csk(sk)->icsk_bind_hash)
+		inet_put_port(sk);
+
+	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));	/* must already be cleared here */
+
+	/* If socket is aborted during connect operation */
+	tcp_free_fastopen_req(tp);
+	tcp_fastopen_destroy_cipher(sk);
+	tcp_saved_syn_free(tp);
+
+	sk_sockets_allocated_dec(sk);
+}
+EXPORT_SYMBOL(tcp_v4_destroy_sock);
+
+#ifdef CONFIG_PROC_FS
+/* Proc filesystem TCP sock list dumping. */
+
+static unsigned short seq_file_family(const struct seq_file *seq);
+
+static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
+{
+	unsigned short want = seq_file_family(seq);
+
+	if (want != AF_UNSPEC && want != sk->sk_family)
+		return false;	/* AF_UNSPEC acts as a match-all family */
+	return net_eq(sock_net(sk), seq_file_net(seq));
+}
+
+/* Starting at st->bucket, scan the listening hash for the first socket
+ * accepted by seq_sk_match().  On success the bucket lock is left held.
+ */
+static void *listening_get_first(struct seq_file *seq)
+{
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	struct tcp_iter_state *st = seq->private;
+
+	st->offset = 0;
+	while (st->bucket <= hinfo->lhash2_mask) {
+		struct inet_listen_hashbucket *bkt = &hinfo->lhash2[st->bucket];
+		struct hlist_nulls_node *pos;
+		struct sock *sk;
+
+		/* Unlocked peek; only non-empty buckets are worth locking. */
+		if (!hlist_nulls_empty(&bkt->nulls_head)) {
+			spin_lock(&bkt->lock);
+			sk_nulls_for_each(sk, pos, &bkt->nulls_head) {
+				if (seq_sk_match(seq, sk))
+					return sk;	/* bkt->lock stays held */
+			}
+			spin_unlock(&bkt->lock);
+		}
+		st->bucket++;
+	}
+
+	return NULL;
+}
+
+/* Advance from "cur" to the next matching socket of the bucket being
+ * walked (st->bucket), whose lock is still held from the call that
+ * returned "cur".  Once that chain is exhausted the lock is dropped
+ * and listening_get_first() continues the search from the following
+ * bucket.
+ */
+static void *listening_get_next(struct seq_file *seq, void *cur)
+{
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	struct tcp_iter_state *st = seq->private;
+	struct hlist_nulls_node *pos;
+	struct sock *sk;
+
+	st->num++;
+	st->offset++;
+
+	sk = sk_nulls_next(cur);
+	sk_nulls_for_each_from(sk, pos) {
+		if (seq_sk_match(seq, sk))
+			return sk;
+	}
+
+	/* End of chain: release this bucket before moving to the next. */
+	spin_unlock(&hinfo->lhash2[st->bucket].lock);
+	st->bucket++;
+
+	return listening_get_first(seq);
+}
+
+static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
+{
+	struct tcp_iter_state *st = seq->private;
+	void *sk;
+
+	/* Restart from bucket 0 and step forward *pos entries; *pos is
+	 * decremented for every step, so the caller sees what is left.
+	 */
+	st->bucket = 0;
+	st->offset = 0;
+	for (sk = listening_get_first(seq); sk && *pos; --*pos)
+		sk = listening_get_next(seq, sk);
+
+	return sk;
+}
+
+static inline bool empty_bucket(struct inet_hashinfo *hinfo,
+				const struct tcp_iter_state *st)
+{
+	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);	/* ehash chain at st->bucket */
+}
+
+/*
+ * Return the first socket of the established hash accepted by
+ * seq_sk_match(), searching from st->bucket onwards (so from the very
+ * start when st->bucket is zero).  On success the bucket lock stays held.
+ */
+static void *established_get_first(struct seq_file *seq)
+{
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	struct tcp_iter_state *st = seq->private;
+
+	for (st->offset = 0; st->bucket <= hinfo->ehash_mask; st->bucket++) {
+		struct hlist_nulls_head *chain = &hinfo->ehash[st->bucket].chain;
+		struct hlist_nulls_node *pos;
+		struct sock *sk;
+
+		cond_resched();
+
+		/* Empty buckets are the common case: skip them without locking. */
+		if (empty_bucket(hinfo, st))
+			continue;
+
+		spin_lock_bh(inet_ehash_lockp(hinfo, st->bucket));
+		sk_nulls_for_each(sk, pos, chain) {
+			if (seq_sk_match(seq, sk))
+				return sk;
+		}
+		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+	}
+
+	return NULL;
+}
+
+static void *established_get_next(struct seq_file *seq, void *cur)
+{
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	struct tcp_iter_state *st = seq->private;
+	struct hlist_nulls_node *pos;
+	struct sock *sk;
+
+	st->num++;
+	st->offset++;
+
+	sk = sk_nulls_next(cur);
+	sk_nulls_for_each_from(sk, pos) {
+		if (seq_sk_match(seq, sk))
+			return sk;
+	}
+
+	/* Chain exhausted: unlock it and carry on with the next bucket. */
+	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+	st->bucket++;
+	return established_get_first(seq);
+}
+
+static void *established_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct tcp_iter_state *st = seq->private;
+	void *sk;
+
+	/* Rewind to the first bucket, then step over "pos" matching
+	 * sockets; NULL is returned if the table ends before that.
+	 */
+	st->bucket = 0;
+	for (sk = established_get_first(seq); sk && pos; pos--)
+		sk = established_get_next(seq, sk);
+
+	return sk;
+}
+
+static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct tcp_iter_state *st = seq->private;
+	void *sk;
+
+	/* Listeners come first; listening_get_idx() decrements pos for
+	 * every entry it steps over, so the remainder indexes ehash.
+	 */
+	st->state = TCP_SEQ_STATE_LISTENING;
+	sk = listening_get_idx(seq, &pos);
+	if (sk)
+		return sk;
+	st->state = TCP_SEQ_STATE_ESTABLISHED;
+	return established_get_idx(seq, pos);
+}
+
+static void *tcp_seek_last_pos(struct seq_file *seq)
+{
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	struct tcp_iter_state *st = seq->private;
+	int bucket = st->bucket;	/* bucket the iterator was last positioned in */
+	int offset = st->offset;	/* entries already stepped over in that bucket */
+	int orig_num = st->num;		/* the *_get_next() helpers bump st->num */
+	void *rc = NULL;
+
+	switch (st->state) {	/* resume in the table recorded in st->state */
+	case TCP_SEQ_STATE_LISTENING:
+		if (st->bucket > hinfo->lhash2_mask)
+			break;
+		rc = listening_get_first(seq);
+		while (offset-- && rc && bucket == st->bucket)	/* replay inside the same bucket only */
+			rc = listening_get_next(seq, rc);
+		if (rc)
+			break;
+		st->bucket = 0;	/* listeners exhausted: go on with the established table */
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+		fallthrough;
+	case TCP_SEQ_STATE_ESTABLISHED:
+		if (st->bucket > hinfo->ehash_mask)
+			break;
+		rc = established_get_first(seq);
+		while (offset-- && rc && bucket == st->bucket)
+			rc = established_get_next(seq, rc);
+	}
+
+	st->num = orig_num;	/* undo the counting done while replaying */
+
+	return rc;
+}
+
+void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct tcp_iter_state *st = seq->private;
+	void *rc = NULL;
+
+	/* Fast path: the reader continues exactly where the last call ended. */
+	if (*pos && *pos == st->last_pos)
+		rc = tcp_seek_last_pos(seq);
+
+	if (!rc) {
+		/* Otherwise walk again from the very beginning of the tables. */
+		st->state = TCP_SEQ_STATE_LISTENING;
+		st->num = 0;
+		st->bucket = 0;
+		st->offset = 0;
+		rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+	}
+
+	st->last_pos = *pos;
+	return rc;
+}
+EXPORT_SYMBOL(tcp_seq_start);
+
+void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct tcp_iter_state *st = seq->private;
+	void *rc = NULL;
+
+	if (v == SEQ_START_TOKEN) {
+		/* Only the header token was handed out so far: get entry 0. */
+		rc = tcp_get_idx(seq, 0);
+	} else if (st->state == TCP_SEQ_STATE_LISTENING) {
+		rc = listening_get_next(seq, v);
+		if (!rc) {
+			/* Listeners exhausted, move on to the ehash table. */
+			st->state = TCP_SEQ_STATE_ESTABLISHED;
+			st->bucket = 0;
+			st->offset = 0;
+			rc = established_get_first(seq);
+		}
+	} else if (st->state == TCP_SEQ_STATE_ESTABLISHED) {
+		rc = established_get_next(seq, v);
+	}
+
+	/* Record the position we advanced to so that a later
+	 * tcp_seq_start() at the same *pos can resume through
+	 * tcp_seek_last_pos() instead of walking from the start.
+	 */
+	++*pos;
+	st->last_pos = *pos;
+	return rc;
+}
+EXPORT_SYMBOL(tcp_seq_next);
+
+void tcp_seq_stop(struct seq_file *seq, void *v)
+{
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	struct tcp_iter_state *st = seq->private;
+
+	/* A socket returned by the get_first/get_next helpers comes with
+	 * its bucket lock held; drop that lock now.
+	 */
+	if (st->state == TCP_SEQ_STATE_LISTENING) {
+		if (v != SEQ_START_TOKEN)
+			spin_unlock(&hinfo->lhash2[st->bucket].lock);
+	} else if (st->state == TCP_SEQ_STATE_ESTABLISHED) {
+		if (v)
+			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+	}
+}
+EXPORT_SYMBOL(tcp_seq_stop);
+
+static void get_openreq4(const struct request_sock *req,
+			 struct seq_file *f, int i)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	long delta = req->rsk_timer.expires - jiffies;
+	uid_t uid = from_kuid_munged(seq_user_ns(f),
+				     sock_i_uid(req->rsk_listener));
+
+	/* One record for a pending request; unused columns are printed as 0. */
+	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
+		i,
+		ireq->ir_loc_addr, ireq->ir_num,
+		ireq->ir_rmt_addr, ntohs(ireq->ir_rmt_port),
+		TCP_SYN_RECV,
+		0, 0,	/* could print option size, but that is af dependent. */
+		1,	/* timers active (only the expire timer) */
+		jiffies_delta_to_clock_t(delta),
+		req->num_timeout,
+		uid,	/* uid of the listener's owner */
+		0,	/* non standard timer */
+		0,	/* open_requests have no inode */
+		0,
+		req);
+}
+
+static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
+{
+	int timer_active;	/* code printed in the "tr" column */
+	unsigned long timer_expires;
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
+	__be32 dest = inet->inet_daddr;
+	__be32 src = inet->inet_rcv_saddr;
+	__u16 destp = ntohs(inet->inet_dport);
+	__u16 srcp = ntohs(inet->inet_sport);
+	int rx_queue;
+	int state;
+
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+		timer_active	= 1;	/* any of the three icsk timers above */
+		timer_expires	= icsk->icsk_timeout;
+	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+		timer_active	= 4;	/* ICSK_TIME_PROBE0 */
+		timer_expires	= icsk->icsk_timeout;
+	} else if (timer_pending(&sk->sk_timer)) {
+		timer_active	= 2;	/* sk_timer is pending */
+		timer_expires	= sk->sk_timer.expires;
+	} else {
+		timer_active	= 0;	/* nothing pending */
+		timer_expires = jiffies;	/* so the delta printed below is about 0 */
+	}
+
+	state = inet_sk_state_load(sk);
+	if (state == TCP_LISTEN)
+		rx_queue = READ_ONCE(sk->sk_ack_backlog);	/* listeners report sk_ack_backlog */
+	else
+		/* Because we don't lock the socket,
+		 * we might find a transient negative value.
+		 */
+		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+				      READ_ONCE(tp->copied_seq), 0);
+
+	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
+			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
+		i, src, srcp, dest, destp, state,
+		READ_ONCE(tp->write_seq) - tp->snd_una,	/* tx_queue column */
+		rx_queue,
+		timer_active,
+		jiffies_delta_to_clock_t(timer_expires - jiffies),
+		icsk->icsk_retransmits,
+		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
+		icsk->icsk_probes_out,
+		sock_i_ino(sk),
+		refcount_read(&sk->sk_refcnt), sk,
+		jiffies_to_clock_t(icsk->icsk_rto),
+		jiffies_to_clock_t(icsk->icsk_ack.ato),
+		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
+		tcp_snd_cwnd(tp),
+		state == TCP_LISTEN ?	/* last column: fastopen max_qlen or ssthresh (-1 in initial slow start) */
+		    fastopenq->max_qlen :
+		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
+}
+
+static void get_timewait4_sock(const struct inet_timewait_sock *tw,
+			       struct seq_file *f, int i)
+{
+	long delta = tw->tw_timer.expires - jiffies;
+	__be32 src = tw->tw_rcv_saddr;
+	__be32 dest = tw->tw_daddr;
+	__u16 srcp = ntohs(tw->tw_sport);
+	__u16 destp = ntohs(tw->tw_dport);
+
+	/* The queue, retrnsmt, uid, timeout and inode columns are all 0
+	 * for a timewait sock; the "tr" column is hard-wired to 3.
+	 */
+	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
+		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
+		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
+		refcount_read(&tw->tw_refcnt), tw);
+}
+
+#define TMPSZ 150
+
+/* seq_file ->show of tcp4_seq_ops: emit the column header for
+ * SEQ_START_TOKEN, otherwise one record for the socket behind @v.
+ */
+static int tcp4_seq_show(struct seq_file *seq, void *v)
+{
+	struct tcp_iter_state *st = seq->private;
+	struct sock *sk = v;
+
+	seq_setwidth(seq, TMPSZ - 1);
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
+			   "rx_queue tr tm->when retrnsmt   uid  timeout "
+			   "inode");
+	} else if (sk->sk_state == TCP_TIME_WAIT) {
+		get_timewait4_sock(v, seq, st->num);
+	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		get_openreq4(v, seq, st->num);
+	} else {
+		get_tcp4_sock(v, seq, st->num);
+	}
+	/* Header and records alike are finished with seq_pad() and '\n'. */
+	seq_pad(seq, '\n');
+	return 0;
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+struct bpf_tcp_iter_state {
+	struct tcp_iter_state state;	/* position state used by tcp_seek_last_pos() */
+	unsigned int cur_sk;		/* index in batch[] of the sk being shown */
+	unsigned int end_sk;		/* one past the last filled slot of batch[] */
+	unsigned int max_sk;		/* number of slots allocated in batch[] */
+	struct sock **batch;		/* held socks collected from one bucket */
+	bool st_bucket_done;		/* batch holds the rest of st->bucket */
+};
+
+struct bpf_iter__tcp {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);	/* meta given to tcp_prog_seq_show() */
+	__bpf_md_ptr(struct sock_common *, sk_common);	/* socket visited; NULL on the call from seq_stop */
+	uid_t uid __aligned(8);	/* uid chosen by the caller (0 for TIME_WAIT and in seq_stop) */
+};
+
+static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
+			     struct sock_common *sk_common, uid_t uid)
+{
+	struct bpf_iter__tcp ctx;
+
+	ctx.uid = uid;
+	ctx.sk_common = sk_common;
+	ctx.meta = meta;
+	meta->seq_num -= 1;	/* skip SEQ_START_TOKEN */
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
+{
+	for (; iter->cur_sk < iter->end_sk; iter->cur_sk++)
+		sock_gen_put(iter->batch[iter->cur_sk]);	/* ref taken when batched */
+}
+
+static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
+				      unsigned int new_batch_sz)
+{
+	struct sock **new_batch;
+
+	/* kvmalloc_array() returns NULL if count * size would overflow. */
+	new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
+				   GFP_USER | __GFP_NOWARN);
+	if (!new_batch)
+		return -ENOMEM;
+	/* Only now put the socks of the old batch and free its array. */
+	bpf_iter_tcp_put_batch(iter);
+	kvfree(iter->batch);
+	iter->batch = new_batch;
+	iter->max_sk = new_batch_sz;
+	return 0;	/* -ENOMEM above leaves the old batch untouched */
+}
+
+static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
+						 struct sock *start_sk)
+{
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	struct bpf_tcp_iter_state *iter = seq->private;
+	struct tcp_iter_state *st = &iter->state;
+	unsigned int nr_match = 1;	/* start_sk itself */
+	struct hlist_nulls_node *pos;
+	struct sock *sk;
+
+	sock_hold(start_sk);
+	iter->batch[iter->end_sk++] = start_sk;
+
+	/* Count every remaining match, but hold only those that still fit. */
+	sk = sk_nulls_next(start_sk);
+	sk_nulls_for_each_from(sk, pos) {
+		if (!seq_sk_match(seq, sk))
+			continue;
+		nr_match++;
+		if (iter->end_sk >= iter->max_sk)
+			continue;
+		sock_hold(sk);
+		iter->batch[iter->end_sk++] = sk;
+	}
+	spin_unlock(&hinfo->lhash2[st->bucket].lock);
+	return nr_match;
+}
+
+static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
+						   struct sock *start_sk)
+{
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	struct bpf_tcp_iter_state *iter = seq->private;
+	struct tcp_iter_state *st = &iter->state;
+	unsigned int nr_match = 1;	/* start_sk itself */
+	struct hlist_nulls_node *pos;
+	struct sock *sk;
+
+	sock_hold(start_sk);
+	iter->batch[iter->end_sk++] = start_sk;
+
+	/* Count every remaining match, but hold only those that still fit. */
+	sk = sk_nulls_next(start_sk);
+	sk_nulls_for_each_from(sk, pos) {
+		if (!seq_sk_match(seq, sk))
+			continue;
+		nr_match++;
+		if (iter->end_sk >= iter->max_sk)
+			continue;
+		sock_hold(sk);
+		iter->batch[iter->end_sk++] = sk;
+	}
+	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+	return nr_match;
+}
+
+static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
+{
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	struct bpf_tcp_iter_state *iter = seq->private;
+	struct tcp_iter_state *st = &iter->state;
+	unsigned int expected;	/* matches counted in the bucket from the start sk on */
+	bool resized = false;	/* the batch array is grown at most once per call */
+	struct sock *sk;
+
+	/* The st->bucket is done.  Directly advance to the next
+	 * bucket instead of letting tcp_seek_last_pos() skip
+	 * one by one in the current bucket and eventually find out
+	 * that it has to advance to the next bucket.
+	 */
+	if (iter->st_bucket_done) {
+		st->offset = 0;
+		st->bucket++;
+		if (st->state == TCP_SEQ_STATE_LISTENING &&
+		    st->bucket > hinfo->lhash2_mask) {
+			st->state = TCP_SEQ_STATE_ESTABLISHED;
+			st->bucket = 0;
+		}
+	}
+
+again:
+	/* Get a new batch */
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+	iter->st_bucket_done = false;
+
+	sk = tcp_seek_last_pos(seq);
+	if (!sk)
+		return NULL; /* Done */
+
+	if (st->state == TCP_SEQ_STATE_LISTENING)
+		expected = bpf_iter_tcp_listening_batch(seq, sk);
+	else
+		expected = bpf_iter_tcp_established_batch(seq, sk);
+
+	if (iter->end_sk == expected) {	/* every counted match fit into the batch */
+		iter->st_bucket_done = true;
+		return sk;
+	}
+
+	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {	/* grow to 1.5x, retry once */
+		resized = true;
+		goto again;
+	}
+
+	return sk;	/* could not grow (again): go on with a partial batch */
+}
+
+static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* A bpf iter does not support lseek, so a non-zero position always
+	 * means continuing from where it was stop()-ped.
+	 */
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	return bpf_iter_tcp_batch(seq);
+}
+
+static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_tcp_iter_state *iter = seq->private;
+	struct tcp_iter_state *st = &iter->state;
+	struct sock *next_sk;
+
+	/* seq_show() has finished with batch[cur_sk] by the time we get
+	 * here, so that entry can be released and stepped over.
+	 */
+	if (iter->cur_sk < iter->end_sk) {
+		struct sock *shown = iter->batch[iter->cur_sk];
+
+		/* st->num is only kept consistent with tcp_iter_state;
+		 * bpf_iter_tcp itself uses meta.seq_num instead.
+		 */
+		st->num++;
+		/* A future start() resumes at st->offset inside st->bucket
+		 * (see tcp_seek_last_pos()), so keep it pointing past the
+		 * sockets that were already shown.
+		 */
+		st->offset++;
+		iter->cur_sk++;
+		sock_gen_put(shown);
+	}
+
+	/* Serve from the current batch while it lasts, else fetch another. */
+	if (iter->cur_sk >= iter->end_sk)
+		next_sk = bpf_iter_tcp_batch(seq);
+	else
+		next_sk = iter->batch[iter->cur_sk];
+
+	/* No lseek on a bpf iter, hence st->last_pos always equals *pos. */
+	++*pos;
+	st->last_pos = *pos;
+	return next_sk;
+}
+
+static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
+{
+	const struct request_sock *req = v;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	struct sock *sk = v;
+	int ret = SEQ_SKIP;
+	uid_t uid;
+
+	if (v == SEQ_START_TOKEN)
+		return 0;
+
+	/* Only full sockets are locked; see the matching release below. */
+	if (sk_fullsock(sk))
+		lock_sock(sk);
+
+	/* No longer hashed: report SEQ_SKIP instead of running the prog. */
+	if (unlikely(sk_unhashed(sk)))
+		goto unlock;
+
+	if (sk->sk_state == TCP_TIME_WAIT) {
+		uid = 0;
+	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		/* A request sock reports the uid of its listener. */
+		uid = from_kuid_munged(seq_user_ns(seq),
+				       sock_i_uid(req->rsk_listener));
+	} else {
+		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+	}
+
+	/* Hand the socket and its uid to the bpf program. */
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, false);
+	ret = tcp_prog_seq_show(prog, &meta, v, uid);
+
+unlock:
+	if (sk_fullsock(sk))
+		release_sock(sk);
+	return ret;
+}
+
+static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_tcp_iter_state *iter = seq->private;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	if (!v) {	/* no current element: run the prog once more with a NULL sk */
+		meta.seq = seq;
+		prog = bpf_iter_get_info(&meta, true);
+		if (prog)
+			(void)tcp_prog_seq_show(prog, &meta, NULL, 0);
+	}
+
+	if (iter->cur_sk >= iter->end_sk)
+		return;
+	bpf_iter_tcp_put_batch(iter);	/* stopped mid-batch: release what is left */
+	iter->st_bucket_done = false;
+}
+
+static const struct seq_operations bpf_iter_tcp_seq_ops = {
+	.start = bpf_iter_tcp_seq_start,	/* seq_file_family() compares seq->op with this table */
+	.next = bpf_iter_tcp_seq_next,
+	.show = bpf_iter_tcp_seq_show,
+	.stop = bpf_iter_tcp_seq_stop,
+};
+#endif
+static unsigned short seq_file_family(const struct seq_file *seq)
+{
+	const struct tcp_seq_afinfo *proc_afinfo;
+
+#ifdef CONFIG_BPF_SYSCALL
+	/* bpf_iter walk: no family pre-filter, the bpf prog filters instead. */
+	if (seq->op == &bpf_iter_tcp_seq_ops)
+		return AF_UNSPEC;
+#endif
+
+	/* procfs walk: the family comes from the afinfo in the pde data. */
+	proc_afinfo = pde_data(file_inode(seq->file));
+	return proc_afinfo->family;
+}
+
+static const struct seq_operations tcp4_seq_ops = {
+	.start = tcp_seq_start,	/* registered by tcp4_proc_init_net() */
+	.next = tcp_seq_next,
+	.show = tcp4_seq_show,
+	.stop = tcp_seq_stop,
+};
+
+static struct tcp_seq_afinfo tcp4_seq_afinfo = {
+	.family		= AF_INET,	/* read back by seq_file_family() through pde_data() */
+};
+
+static int __net_init tcp4_proc_init_net(struct net *net)
+{
+	struct proc_dir_entry *pde;	/* "tcp" entry under net->proc_net */
+
+	pde = proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
+			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo);
+	return pde ? 0 : -ENOMEM;
+}
+
+static void __net_exit tcp4_proc_exit_net(struct net *net)
+{
+	remove_proc_entry("tcp", net->proc_net);	/* entry created by tcp4_proc_init_net() */
+}
+
+static struct pernet_operations tcp4_net_ops = {
+	.exit = tcp4_proc_exit_net,	/* removes what .init created */
+	.init = tcp4_proc_init_net,
+};
+
+int __init tcp4_proc_init(void)
+{
+	return register_pernet_subsys(&tcp4_net_ops);	/* hooks that create/remove the "tcp" proc entry */
+}
+
+void tcp4_proc_exit(void)
+{
+	unregister_pernet_subsys(&tcp4_net_ops);	/* counterpart of tcp4_proc_init() */
+}
+#endif /* CONFIG_PROC_FS */
+
+/* Report whether the not-yet-sent byte count is under tcp_notsent_lowat().
+ * @wake is one when sk_stream_write_space() calls us; the count is then
+ * doubled before comparing, so EPOLLOUT is only sent below half the limit.
+ * This mimics the strategy used in sock_def_write_space().
+ */
+bool tcp_stream_memory_free(const struct sock *sk, int wake)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 unsent = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
+
+	return (unsent << wake) < tcp_notsent_lowat(tp);
+}
+EXPORT_SYMBOL(tcp_stream_memory_free);
+
+struct proto tcp_prot = {
+	.name = "TCP",			/* identity, object layout, helper tables */
+	.owner = THIS_MODULE,
+	.obj_size = sizeof(struct tcp_sock),
+	.slab_flags = SLAB_TYPESAFE_BY_RCU,
+	.max_header = MAX_TCP_HEADER,
+	.no_autobind = true,
+	.twsk_prot = &tcp_timewait_sock_ops,
+	.rsk_prot = &tcp_request_sock_ops,
+	.h.hashinfo = NULL,
+	/* socket life cycle, connection management and hashing */
+	.init = tcp_v4_init_sock,
+	.destroy = tcp_v4_destroy_sock,
+	.pre_connect = tcp_v4_pre_connect,
+	.connect = tcp_v4_connect,
+	.disconnect = tcp_disconnect,
+	.accept = inet_csk_accept,
+	.shutdown = tcp_shutdown,
+	.close = tcp_close,
+	.diag_destroy = tcp_abort,
+	.hash = inet_hash,
+	.unhash = inet_unhash,
+	.get_port = inet_csk_get_port,
+	.put_port = inet_put_port,
+	/* data path, ioctl and socket options */
+	.sendmsg = tcp_sendmsg,
+	.recvmsg = tcp_recvmsg,
+	.splice_eof = tcp_splice_eof,
+	.backlog_rcv = tcp_v4_do_rcv,
+	.release_cb = tcp_release_cb,
+	.ioctl = tcp_ioctl,
+	.setsockopt = tcp_setsockopt,
+	.getsockopt = tcp_getsockopt,
+	.bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
+	.keepalive = tcp_set_keepalive,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot = tcp_bpf_update_proto,
+#endif
+	.stream_memory_free = tcp_stream_memory_free,	/* memory accounting and limits */
+	.enter_memory_pressure = tcp_enter_memory_pressure,
+	.leave_memory_pressure = tcp_leave_memory_pressure,
+	.memory_pressure = &tcp_memory_pressure,
+	.memory_allocated = &tcp_memory_allocated,
+	.per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
+	.sockets_allocated = &tcp_sockets_allocated,
+	.orphan_count = &tcp_orphan_count,
+	.sysctl_mem = sysctl_tcp_mem,
+	.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
+	.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
+};
+EXPORT_SYMBOL(tcp_prot);
+
+static void __net_exit tcp_sk_exit(struct net *net)
+{
+	if (net->ipv4.tcp_congestion_control)	/* only if this netns has a congestion control set */
+		bpf_module_put(net->ipv4.tcp_congestion_control,
+			       net->ipv4.tcp_congestion_control->owner);
+}
+
+static void __net_init tcp_set_hashinfo(struct net *net)
+{
+	struct inet_hashinfo *hinfo = NULL;
+	unsigned int ehash_entries = 0;
+	struct net *old_net;
+
+	/* init_net always uses the global table; a child netns may get its own. */
+	if (!net_eq(net, &init_net)) {
+		old_net = current->nsproxy->net_ns;
+		ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
+	}
+	if (ehash_entries) {
+		ehash_entries = roundup_pow_of_two(ehash_entries);
+		hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
+		if (!hinfo)
+			pr_warn("Failed to allocate TCP ehash (entries: %u) "
+				"for a netns, fallback to the global one\n",
+				ehash_entries);
+	}
+	if (!hinfo) {
+		/* Nothing private was set up: share tcp_hashinfo and its size. */
+		hinfo = &tcp_hashinfo;
+		ehash_entries = tcp_hashinfo.ehash_mask + 1;
+	}
+
+	net->ipv4.tcp_death_row.hashinfo = hinfo;
+	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
+	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
+}
+
+/* Per-netns init hook (tcp_sk_ops.init).
+ *
+ * Fills in the default value of every TCP sysctl kept in net->ipv4, sets up
+ * the death-row refcount and hash table via tcp_set_hashinfo(), and selects
+ * the namespace's initial congestion control.  Always returns 0.
+ */
+static int __net_init tcp_sk_init(struct net *net)
+{
+	net->ipv4.sysctl_tcp_ecn = 2;
+	net->ipv4.sysctl_tcp_ecn_fallback = 1;
+
+	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
+	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
+	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
+
+	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
+	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
+	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
+
+	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
+	net->ipv4.sysctl_tcp_syncookies = 1;
+	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
+	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
+	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
+	net->ipv4.sysctl_tcp_orphan_retries = 0;
+	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
+	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
+	net->ipv4.sysctl_tcp_tw_reuse = 2;
+	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
+
+	/* This initial reference is dropped again in tcp_sk_exit_batch(). */
+	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
+	tcp_set_hashinfo(net);
+
+	net->ipv4.sysctl_tcp_sack = 1;
+	net->ipv4.sysctl_tcp_window_scaling = 1;
+	net->ipv4.sysctl_tcp_timestamps = 1;
+	net->ipv4.sysctl_tcp_early_retrans = 3;
+	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
+	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
+	net->ipv4.sysctl_tcp_retrans_collapse = 1;
+	net->ipv4.sysctl_tcp_max_reordering = 300;
+	net->ipv4.sysctl_tcp_dsack = 1;
+	net->ipv4.sysctl_tcp_app_win = 31;
+	net->ipv4.sysctl_tcp_adv_win_scale = 1;
+	net->ipv4.sysctl_tcp_frto = 2;
+	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
+	/* This limits the percentage of the congestion window which we
+	 * will allow a single TSO frame to consume.  Building TSO frames
+	 * which are too large can cause TCP streams to be bursty.
+	 */
+	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
+	/* Default TSQ limit of 16 TSO segments */
+	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
+
+	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
+	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
+
+	net->ipv4.sysctl_tcp_min_tso_segs = 2;
+	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
+	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
+	net->ipv4.sysctl_tcp_autocorking = 1;
+	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
+	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
+	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
+	/* A child netns starts from a copy of init_net's current
+	 * tcp_rmem/tcp_wmem; init_net's own values are not set here.
+	 */
+	if (net != &init_net) {
+		memcpy(net->ipv4.sysctl_tcp_rmem,
+		       init_net.ipv4.sysctl_tcp_rmem,
+		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
+		memcpy(net->ipv4.sysctl_tcp_wmem,
+		       init_net.ipv4.sysctl_tcp_wmem,
+		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
+	}
+	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
+	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
+	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
+	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
+	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
+	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
+
+	/* Set default values for PLB */
+	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
+	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
+	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
+	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
+	/* Default congestion threshold for PLB to mark a round is 50% */
+	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
+
+	/* Reno is always built in.  A child netns takes over init_net's
+	 * congestion control when a reference on it can be obtained;
+	 * init_net itself, or a failed reference, uses tcp_reno.
+	 */
+	if (!net_eq(net, &init_net) &&
+	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
+			       init_net.ipv4.tcp_congestion_control->owner))
+		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
+	else
+		net->ipv4.tcp_congestion_control = &tcp_reno;
+
+	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
+	net->ipv4.sysctl_tcp_shrink_window = 0;
+
+	return 0;
+}
+
+/* Batched per-netns exit hook (tcp_sk_ops.exit_batch).
+ *
+ * Calls tcp_twsk_purge() once for the whole list, then for every exiting
+ * netns frees its hash table, drops the death-row reference and destroys
+ * its fastopen context.
+ */
+static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+{
+	struct net *net;
+
+	tcp_twsk_purge(net_exit_list, AF_INET);
+
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		bool last_ref;
+
+		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
+
+		/* The reference set up in tcp_sk_init() is expected to be
+		 * the only one left at this point.
+		 */
+		last_ref = refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount);
+		WARN_ON_ONCE(!last_ref);
+
+		tcp_fastopen_ctx_destroy(net);
+	}
+}
+
+/* Per-netns lifecycle hooks for TCP; registered from tcp_v4_init(). */
+static struct pernet_operations __net_initdata tcp_sk_ops = {
+	.init		= tcp_sk_init,
+	.exit		= tcp_sk_exit,
+	.exit_batch	= tcp_sk_exit_batch,
+};
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+/* NOTE(review): presumably declares the program-facing entry point of the
+ * "tcp" BPF iterator; the macro is defined outside this file - confirm.
+ */
+DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
+		     struct sock_common *sk_common, uid_t uid)
+
+/* Batch size requested from bpf_iter_tcp_realloc_batch() at iterator init. */
+#define INIT_BATCH_SZ 16
+
+/* init_seq_private callback of the "tcp" iterator.
+ *
+ * Sets up the netns part of the seq private data, then sizes the batch to
+ * INIT_BATCH_SZ.  If the batch cannot be set up the netns part is torn
+ * down again.  Returns 0 on success or the error of the failing step.
+ */
+static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+	struct bpf_tcp_iter_state *state = priv_data;
+	int ret;
+
+	ret = bpf_iter_init_seq_net(priv_data, aux);
+	if (ret)
+		return ret;
+
+	ret = bpf_iter_tcp_realloc_batch(state, INIT_BATCH_SZ);
+	if (ret)
+		bpf_iter_fini_seq_net(priv_data);
+
+	return ret;
+}
+
+/* fini_seq_private callback of the "tcp" iterator: undoes
+ * bpf_iter_init_tcp() by tearing down the netns part of the private data
+ * and freeing the batch array.
+ */
+static void bpf_iter_fini_tcp(void *priv_data)
+{
+	struct bpf_tcp_iter_state *iter = priv_data;
+
+	bpf_iter_fini_seq_net(priv_data);
+	kvfree(iter->batch);
+}
+
+/* seq_file description of the "tcp" BPF iterator: private state size,
+ * its init/fini callbacks and the seq operations that walk the sockets.
+ */
+static const struct bpf_iter_seq_info tcp_seq_info = {
+	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
+	.init_seq_private	= bpf_iter_init_tcp,
+	.fini_seq_private	= bpf_iter_fini_tcp,
+	.seq_ops		= &bpf_iter_tcp_seq_ops,
+};
+
+/* Helper lookup for programs attached to the "tcp" iterator.
+ *
+ * Only the setsockopt and getsockopt helpers are provided here; NULL is
+ * returned for every other @func_id.  @prog is not used.
+ */
+static const struct bpf_func_proto *
+bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
+			    const struct bpf_prog *prog)
+{
+	if (func_id == BPF_FUNC_setsockopt)
+		return &bpf_sk_setsockopt_proto;
+
+	if (func_id == BPF_FUNC_getsockopt)
+		return &bpf_sk_getsockopt_proto;
+
+	return NULL;
+}
+
+/* Registration record of the "tcp" BPF iterator target.  The btf_id of the
+ * single context argument is filled in at boot by bpf_iter_register().
+ */
+static struct bpf_iter_reg tcp_reg_info = {
+	.target			= "tcp",
+	.seq_info		= &tcp_seq_info,
+	.get_func_proto		= bpf_iter_tcp_get_func_proto,
+	.ctx_arg_info_size	= 1,
+	.ctx_arg_info		= {
+		[0] = {
+			offsetof(struct bpf_iter__tcp, sk_common),
+			PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED,
+		},
+	},
+};
+
+/* Boot-time registration of the "tcp" BPF iterator target.  The BTF id of
+ * the sk_common context argument is patched in first; a registration
+ * failure is only logged.
+ */
+static void __init bpf_iter_register(void)
+{
+	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
+	if (bpf_iter_reg_target(&tcp_reg_info))
+		pr_warn("Warning: could not register bpf iterator tcp\n");
+}
+
+#endif
+
+/* Boot-time IPv4 TCP setup.
+ *
+ * Creates one raw IPPROTO_TCP control socket per possible CPU in init_net
+ * (stored in the per-cpu ipv4_tcp_sk), registers the per-netns operations
+ * and, when CONFIG_BPF_SYSCALL and CONFIG_PROC_FS are both set, the "tcp"
+ * BPF iterator.  A socket creation or pernet registration failure panics.
+ */
+void __init tcp_v4_init(void)
+{
+	int cpu, res;
+
+	for_each_possible_cpu(cpu) {
+		struct sock *sk;
+
+		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+					   IPPROTO_TCP, &init_net);
+		if (res)
+			panic("Failed to create the TCP control socket.\n");
+		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+		/* Please enforce IP_DF and IPID==0 for RST and
+		 * ACK sent in SYN-RECV and TIME-WAIT state.
+		 */
+		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
+
+		per_cpu(ipv4_tcp_sk, cpu) = sk;
+	}
+	/* Report this failure under its own message so that it cannot be
+	 * mistaken for a control socket creation failure.
+	 */
+	if (register_pernet_subsys(&tcp_sk_ops))
+		panic("Failed to register the TCP pernet operations.\n");
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+	bpf_iter_register();
+#endif
+}
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
new file mode 100644
index 0000000000..ae36780977
--- /dev/null
+++ b/net/ipv4/tcp_lp.c
@@ -0,0 +1,354 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * TCP Low Priority (TCP-LP)
+ *
+ * TCP Low Priority is a distributed algorithm whose goal is to utilize only
+ *   the excess network bandwidth as compared to the ``fair share`` of
+ *   bandwidth as targeted by TCP.
+ *
+ * As of 2.6.13, Linux supports pluggable congestion control algorithms.
+ * Due to the limitation of the API, we take the following changes from
+ * the original TCP-LP implementation:
+ *   o We use newReno in most core CA handling. Only add some checking
+ *     within cong_avoid.
+ *   o Error correction of the remote HZ: the remote HZ estimate is kept
+ *     under continuous checking and updating.
+ *   o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since
+ *     OWD has a similar meaning to RTT. Also correct the buggy formula.
+ *   o Handle reaction for Early Congestion Indication (ECI) within
+ *     pkts_acked, as mentioned within pseudo code.
+ *   o OWD is handled in relative format, where the local time stamp will be
+ *     in tcp_time_stamp format.
+ *
+ * Original Author:
+ *   Aleksandar Kuzmanovic <akuzma@northwestern.edu>
+ * Available from:
+ *   http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
+ * Original implementation for 2.4.19:
+ *   http://www-ece.rice.edu/networks/TCP-LP/
+ *
+ * 2.6.x module Authors:
+ *   Wong Hoi Sing, Edison <hswong3i@gmail.com>
+ *   Hung Hing Lun, Mike <hlhung3i@gmail.com>
+ * SourceForge project page:
+ *   http://tcp-lp-mod.sourceforge.net/
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* resolution of owd */
+#define LP_RESOL       TCP_TS_HZ
+
+/**
+ * enum tcp_lp_state - TCP-LP state flags
+ * @LP_VALID_RHZ: the remote HZ estimate is valid
+ * @LP_VALID_OWD: the OWD sample is valid
+ * @LP_WITHIN_THR: the smoothed OWD is below the threshold
+ * @LP_WITHIN_INF: we are within the inference period
+ *
+ * Bit flags kept in the flag member of struct lp.
+ * This set of state flags exists mainly for debugging.
+ */
+enum tcp_lp_state {
+	LP_VALID_RHZ	= 0x01,
+	LP_VALID_OWD	= 0x02,
+	LP_WITHIN_THR	= 0x08,
+	LP_WITHIN_INF	= 0x10,
+};
+
+/**
+ * struct lp - TCP-LP private per-socket state
+ * @flag: TCP-LP state flags (enum tcp_lp_state bits)
+ * @sowd: smoothed OWD << 3
+ * @owd_min: min OWD
+ * @owd_max: max OWD
+ * @owd_max_rsv: reserved max owd
+ * @remote_hz: estimated remote HZ
+ * @remote_ref_time: remote reference time
+ * @local_ref_time: local reference time
+ * @last_drop: time for last active drop
+ * @inference: current inference
+ *
+ * TCP-LP's private struct.
+ * The idea comes from the original TCP-LP implementation; only the fields
+ * we found to be really useful were kept.
+ */
+struct lp {
+	u32 flag;
+	u32 sowd;
+	u32 owd_min;
+	u32 owd_max;
+	u32 owd_max_rsv;
+	u32 remote_hz;
+	u32 remote_ref_time;
+	u32 local_ref_time;
+	u32 last_drop;
+	u32 inference;
+};
+
+/**
+ * tcp_lp_init
+ * @sk: socket to initialize congestion control algorithm for
+ *
+ * Reset the whole private state: every member starts at zero except
+ * owd_min, which starts at the largest u32 value so that the first valid
+ * sample becomes the minimum.
+ * The handling is cloned from the Vegas module implementation.
+ */
+static void tcp_lp_init(struct sock *sk)
+{
+	struct lp *lp = inet_csk_ca(sk);
+
+	/* Members not named in the initializer are zeroed. */
+	*lp = (struct lp){
+		.owd_min = 0xffffffff,
+	};
+}
+
+/**
+ * tcp_lp_cong_avoid
+ * @sk: socket to avoid congesting
+ * @ack: passed through to tcp_reno_cong_avoid()
+ * @acked: passed through to tcp_reno_cong_avoid()
+ *
+ * Implementation of cong_avoid.
+ * The newReno congestion avoidance is only run while we are outside the
+ * inference period; inside it the window is left alone.
+ * From TCP-LP's paper, this will be handled as additive increase.
+ */
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	const struct lp *lp = inet_csk_ca(sk);
+
+	if (lp->flag & LP_WITHIN_INF)
+		return;
+
+	tcp_reno_cong_avoid(sk, ack, acked);
+}
+
+/**
+ * tcp_lp_remote_hz_estimator
+ * @sk: socket which needs an estimate for the remote HZs
+ *
+ * Estimate remote HZ.
+ * We keep on updating the estimated value, whereas the original TCP-LP
+ * implementation only guessed it once and used that value forever.
+ *
+ * Sets or clears LP_VALID_RHZ and records the current timestamp pair as the
+ * new reference on every call.
+ *
+ * Return: the updated remote HZ estimate (0 when no estimate is available).
+ */
+static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct lp *lp = inet_csk_ca(sk);
+	s64 rhz = lp->remote_hz << 6;	/* remote HZ << 6 (shift done in u32) */
+	s64 m = 0;
+
+	/* no reference time recorded yet:
+	 * skip the estimate, just record the reference below */
+	if (lp->remote_ref_time == 0 || lp->local_ref_time == 0)
+		goto out;
+
+	/* we can't calc remote HZ when either timestamp did not advance */
+	if (tp->rx_opt.rcv_tsval == lp->remote_ref_time ||
+	    tp->rx_opt.rcv_tsecr == lp->local_ref_time)
+		goto out;
+
+	/* NOTE(review): the operands look like u32 (presumably rcv_tsval and
+	 * rcv_tsecr are u32 too - confirm), so this would be evaluated in
+	 * 32-bit unsigned arithmetic and the m < 0 test below could never
+	 * fire.  Left as is; verify before changing.
+	 */
+	m = TCP_TS_HZ *
+	    (tp->rx_opt.rcv_tsval - lp->remote_ref_time) /
+	    (tp->rx_opt.rcv_tsecr - lp->local_ref_time);
+	if (m < 0)
+		m = -m;
+
+	if (rhz > 0) {
+		m -= rhz >> 6;	/* m is now error in remote HZ est */
+		rhz += m;	/* 63/64 old + 1/64 new */
+	} else
+		rhz = m << 6;
+
+ out:
+	/* record time for successful remote HZ calc */
+	if ((rhz >> 6) > 0)
+		lp->flag |= LP_VALID_RHZ;
+	else
+		lp->flag &= ~LP_VALID_RHZ;
+
+	/* record reference time stamp */
+	lp->remote_ref_time = tp->rx_opt.rcv_tsval;
+	lp->local_ref_time = tp->rx_opt.rcv_tsecr;
+
+	return rhz >> 6;
+}
+
+/**
+ * tcp_lp_owd_calculator
+ * @sk: socket to calculate one way delay for
+ *
+ * Calculate one way delay (in relative format).
+ * The original implementation took OWD as the remote time difference minus
+ * the local time difference.  Since each difference is simply an RTT, the
+ * two are equal on a stable network and the OWD collapses to zero.
+ * That looked like a bug, so it is computed differently here.
+ *
+ * Refreshes the remote HZ estimate first, and sets or clears LP_VALID_OWD
+ * depending on whether a non-zero OWD was obtained.
+ */
+static u32 tcp_lp_owd_calculator(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct lp *lp = inet_csk_ca(sk);
+	s64 owd;
+
+	lp->remote_hz = tcp_lp_remote_hz_estimator(sk);
+
+	/* Without a usable remote HZ there is no OWD either. */
+	if (!(lp->flag & LP_VALID_RHZ)) {
+		lp->flag &= ~LP_VALID_OWD;
+		return 0;
+	}
+
+	owd =
+	    tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) -
+	    tp->rx_opt.rcv_tsecr * (LP_RESOL / TCP_TS_HZ);
+	if (owd < 0)
+		owd = -owd;
+
+	if (owd > 0)
+		lp->flag |= LP_VALID_OWD;
+	else
+		lp->flag &= ~LP_VALID_OWD;
+
+	return owd;
+}
+
+/**
+ * tcp_lp_rtt_sample
+ * @sk: socket to add a rtt sample to
+ * @rtt: round trip time, which is ignored!
+ *
+ * Implementation of rtt_sample.
+ * Takes the following actions:
+ *   1. calculate the OWD,
+ *   2. record the min/max OWD,
+ *   3. update the smoothed OWD (SOWD).
+ * Nothing is recorded unless both the remote HZ and the OWD are valid.
+ * Most ideas come from the original TCP-LP implementation.
+ */
+static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
+{
+	const u32 need = LP_VALID_RHZ | LP_VALID_OWD;
+	struct lp *lp = inet_csk_ca(sk);
+	s64 owd = tcp_lp_owd_calculator(sk);
+
+	/* sorry that we don't have valid data */
+	if ((lp->flag & need) != need)
+		return;
+
+	/* record the next min owd */
+	if (owd < lp->owd_min)
+		lp->owd_min = owd;
+
+	/* always forget the max of the max:
+	 * owd_max is kept one below it, the top value goes to owd_max_rsv */
+	if (owd > lp->owd_max) {
+		if (owd <= lp->owd_max_rsv) {
+			lp->owd_max = owd;
+		} else {
+			lp->owd_max = lp->owd_max_rsv ? lp->owd_max_rsv : owd;
+			lp->owd_max_rsv = owd;
+		}
+	}
+
+	/* first sample: take the measured time as the smoothed owd */
+	if (lp->sowd == 0) {
+		lp->sowd = owd << 3;
+		return;
+	}
+
+	/* calc for smoothed owd */
+	owd -= lp->sowd >> 3;	/* owd is now error in owd est */
+	lp->sowd += owd;	/* owd = 7/8 owd + 1/8 new */
+}
+
+/**
+ * tcp_lp_pkts_acked
+ * @sk: socket requiring congestion avoidance calculations
+ * @sample: ACK sample; only its rtt_us member is used
+ *
+ * Implementation of pkts_acked.
+ * Deals with the active drop under Early Congestion Indication.
+ * Only the drop to half and the drop to 1 are handled here, because we
+ * want to fall back on newReno for the increase case.
+ * This follows the idea from TCP-LP's paper directly.
+ */
+static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct lp *lp = inet_csk_ca(sk);
+	const u32 now = tcp_time_stamp(tp);
+	u32 elapsed, threshold, cwnd;
+
+	if (sample->rtt_us > 0)
+		tcp_lp_rtt_sample(sk, sample->rtt_us);
+
+	/* calc inference */
+	elapsed = now - tp->rx_opt.rcv_tsecr;
+	if ((s32)elapsed > 0)
+		lp->inference = 3 * elapsed;
+
+	/* test if within inference */
+	if (lp->last_drop && (now - lp->last_drop < lp->inference))
+		lp->flag |= LP_WITHIN_INF;
+	else
+		lp->flag &= ~LP_WITHIN_INF;
+
+	/* test if within threshold: 15% of the way from owd_min to owd_max */
+	threshold = lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100;
+	if (lp->sowd >> 3 < threshold)
+		lp->flag |= LP_WITHIN_THR;
+	else
+		lp->flag &= ~LP_WITHIN_THR;
+
+	pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag,
+		 tcp_snd_cwnd(tp), lp->remote_hz, lp->owd_min, lp->owd_max,
+		 lp->sowd >> 3);
+
+	/* below the threshold: no early congestion indication, we are done */
+	if (lp->flag & LP_WITHIN_THR)
+		return;
+
+	/* FIXME: try to reset owd_min and owd_max here
+	 * so decrease the chance the min/max is no longer suitable
+	 * and will usually within threshold when within inference */
+	lp->owd_min = lp->sowd >> 3;
+	lp->owd_max = lp->sowd >> 2;
+	lp->owd_max_rsv = lp->sowd >> 2;
+
+	/* happened within inference: drop snd_cwnd to 1,
+	 * happened after inference: cut snd_cwnd in half */
+	if (lp->flag & LP_WITHIN_INF)
+		cwnd = 1U;
+	else
+		cwnd = max(tcp_snd_cwnd(tp) >> 1U, 1U);
+	tcp_snd_cwnd_set(tp, cwnd);
+
+	/* record this drop time */
+	lp->last_drop = now;
+}
+
+/* Congestion control operations of TCP-LP ("lp"): Reno's ssthresh and
+ * undo_cwnd handlers combined with the TCP-LP specific hooks.
+ */
+static struct tcp_congestion_ops tcp_lp __read_mostly = {
+	.name		= "lp",
+	.owner		= THIS_MODULE,
+
+	.init		= tcp_lp_init,
+	.ssthresh	= tcp_reno_ssthresh,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+	.cong_avoid	= tcp_lp_cong_avoid,
+	.pkts_acked	= tcp_lp_pkts_acked,
+};
+
+/* Module init: register the "lp" congestion control.  The build fails if
+ * struct lp is larger than ICSK_CA_PRIV_SIZE.
+ * Returns the result of tcp_register_congestion_control().
+ */
+static int __init tcp_lp_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_lp);
+}
+
+/* Module exit: unregister the "lp" congestion control again. */
+static void __exit tcp_lp_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_lp);
+}
+
+module_init(tcp_lp_register);
+module_exit(tcp_lp_unregister);
+
+MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun Mike");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Low Priority");
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 0000000000..7aca12c59c
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,1050 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/tcp.h>
+#include <linux/hash.h>
+#include <linux/tcp_metrics.h>
+#include <linux/vmalloc.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+#include <net/genetlink.h>
+
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
+						   const struct inetpeer_addr *daddr,
+						   struct net *net, unsigned int hash);
+
+/* TCP Fast Open state cached inside a struct tcp_metrics_block.
+ * The members are read and written under fastopen_seqlock (see
+ * tcpm_suck_dst() and tcp_fastopen_cache_get()).
+ */
+struct tcp_fastopen_metrics {
+	u16	mss;			/* cached MSS, 0 when none is cached */
+	u16	syn_loss:10,		/* Recurring Fast Open SYN losses */
+		try_exp:2;		/* Request w/ exp. option (once) */
+	unsigned long	last_syn_loss;	/* Last Fast Open SYN loss */
+	struct	tcp_fastopen_cookie	cookie;
+};
+
+/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
+ * Kernel only stores RTT and RTTVAR in usec resolution
+ */
+#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)
+
+/* One cached metrics entry.  Entries are matched on (tcpm_net, tcpm_saddr,
+ * tcpm_daddr) and linked into a tcp_metrics_hash[] chain via tcpm_next.
+ */
+struct tcp_metrics_block {
+	struct tcp_metrics_block __rcu	*tcpm_next;
+	struct net			*tcpm_net;
+	struct inetpeer_addr		tcpm_saddr;
+	struct inetpeer_addr		tcpm_daddr;
+	unsigned long			tcpm_stamp;	/* jiffies of the last update */
+	u32				tcpm_lock;	/* bit (1 << idx): metric idx is locked */
+	u32				tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];	/* indexed by enum tcp_metric_index */
+	struct tcp_fastopen_metrics	tcpm_fastopen;
+
+	struct rcu_head			rcu_head;
+};
+
+/* Return the network namespace that the metrics entry @tm belongs to. */
+static inline struct net *tm_net(const struct tcp_metrics_block *tm)
+{
+	/* Paired with the WRITE_ONCE() in tcpm_new() */
+	return READ_ONCE(tm->tcpm_net);
+}
+
+/* Tell whether metric @idx of @tm is locked, i.e. whether its bit is set
+ * in tm->tcpm_lock.
+ */
+static bool tcp_metric_locked(struct tcp_metrics_block *tm,
+			      enum tcp_metric_index idx)
+{
+	/* Paired with WRITE_ONCE() in tcpm_suck_dst() */
+	u32 locked = READ_ONCE(tm->tcpm_lock);
+
+	return (locked & (1 << idx)) != 0;
+}
+
+/* Read the cached value of metric @idx from @tm. */
+static u32 tcp_metric_get(const struct tcp_metrics_block *tm,
+			  enum tcp_metric_index idx)
+{
+	/* Paired with WRITE_ONCE() in tcp_metric_set() */
+	return READ_ONCE(tm->tcpm_vals[idx]);
+}
+
+/* Store @val as the cached value of metric @idx in @tm. */
+static void tcp_metric_set(struct tcp_metrics_block *tm,
+			   enum tcp_metric_index idx,
+			   u32 val)
+{
+	/* Paired with READ_ONCE() in tcp_metric_get() */
+	WRITE_ONCE(tm->tcpm_vals[idx], val);
+}
+
+/* Two addresses are the same when they are of the same family and
+ * inetpeer_addr_cmp() reports no difference.
+ */
+static bool addr_same(const struct inetpeer_addr *a,
+		      const struct inetpeer_addr *b)
+{
+	if (a->family != b->family)
+		return false;
+
+	return inetpeer_addr_cmp(a, b) == 0;
+}
+
+/* One hash bucket: the head of an RCU-protected chain of metrics entries. */
+struct tcpm_hash_bucket {
+	struct tcp_metrics_block __rcu	*chain;
+};
+
+/* Bucket array and log2 of its size (the latter is fed to hash_32()). */
+static struct tcpm_hash_bucket	*tcp_metrics_hash __read_mostly;
+static unsigned int		tcp_metrics_hash_log __read_mostly;
+
+/* tcp_metrics_lock is held while tcpm_new() inserts or recycles entries;
+ * fastopen_seqlock guards the tcpm_fastopen members of the entries.
+ */
+static DEFINE_SPINLOCK(tcp_metrics_lock);
+static DEFINE_SEQLOCK(fastopen_seqlock);
+
+/* (Re)load the metrics entry @tm from the route metrics of @dst.
+ *
+ * Stamps the entry with the current jiffies, mirrors the route's per-metric
+ * lock state into tcpm_lock and copies RTT, RTTVAR, SSTHRESH, CWND and
+ * REORDERING.  RTT and RTTVAR are multiplied by USEC_PER_MSEC on the way.
+ * With @fastopen_clear set the cached Fast Open state is reset as well.
+ */
+static void tcpm_suck_dst(struct tcp_metrics_block *tm,
+			  const struct dst_entry *dst,
+			  bool fastopen_clear)
+{
+	u32 locked = 0;
+
+	WRITE_ONCE(tm->tcpm_stamp, jiffies);
+
+	locked |= dst_metric_locked(dst, RTAX_RTT) ?
+		  (1 << TCP_METRIC_RTT) : 0;
+	locked |= dst_metric_locked(dst, RTAX_RTTVAR) ?
+		  (1 << TCP_METRIC_RTTVAR) : 0;
+	locked |= dst_metric_locked(dst, RTAX_SSTHRESH) ?
+		  (1 << TCP_METRIC_SSTHRESH) : 0;
+	locked |= dst_metric_locked(dst, RTAX_CWND) ?
+		  (1 << TCP_METRIC_CWND) : 0;
+	locked |= dst_metric_locked(dst, RTAX_REORDERING) ?
+		  (1 << TCP_METRIC_REORDERING) : 0;
+	/* Paired with READ_ONCE() in tcp_metric_locked() */
+	WRITE_ONCE(tm->tcpm_lock, locked);
+
+	tcp_metric_set(tm, TCP_METRIC_RTT,
+		       dst_metric_raw(dst, RTAX_RTT) * USEC_PER_MSEC);
+	tcp_metric_set(tm, TCP_METRIC_RTTVAR,
+		       dst_metric_raw(dst, RTAX_RTTVAR) * USEC_PER_MSEC);
+	tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+		       dst_metric_raw(dst, RTAX_SSTHRESH));
+	tcp_metric_set(tm, TCP_METRIC_CWND,
+		       dst_metric_raw(dst, RTAX_CWND));
+	tcp_metric_set(tm, TCP_METRIC_REORDERING,
+		       dst_metric_raw(dst, RTAX_REORDERING));
+
+	if (!fastopen_clear)
+		return;
+
+	write_seqlock(&fastopen_seqlock);
+	tm->tcpm_fastopen.mss = 0;
+	tm->tcpm_fastopen.syn_loss = 0;
+	tm->tcpm_fastopen.try_exp = 0;
+	tm->tcpm_fastopen.cookie.exp = false;
+	tm->tcpm_fastopen.cookie.len = 0;
+	write_sequnlock(&fastopen_seqlock);
+}
+
+#define TCP_METRICS_TIMEOUT		(60 * 60 * HZ)
+
+/* Reload @tm from @dst when it was last stamped more than
+ * TCP_METRICS_TIMEOUT ago.  The cached Fast Open state is kept.
+ * A NULL @tm is accepted and ignored.
+ */
+static void tcpm_check_stamp(struct tcp_metrics_block *tm,
+			     const struct dst_entry *dst)
+{
+	unsigned long expires;
+
+	if (tm) {
+		expires = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT;
+		if (unlikely(time_after(jiffies, expires)))
+			tcpm_suck_dst(tm, dst, false);
+	}
+}
+
+/* A lookup that misses after walking more than TCP_METRICS_RECLAIM_DEPTH
+ * chain entries returns TCP_METRICS_RECLAIM_PTR instead of NULL (see
+ * tcp_get_encode()); tcpm_new() then recycles the oldest entry of the chain
+ * rather than allocating a new one.
+ *
+ * The sentinel is fully parenthesized so that it expands safely in any
+ * expression context.
+ */
+#define TCP_METRICS_RECLAIM_DEPTH	5
+#define TCP_METRICS_RECLAIM_PTR		((struct tcp_metrics_block *)0x1UL)
+
+/* Dereference an __rcu pointer from a context that holds tcp_metrics_lock. */
+#define deref_locked(p)	\
+	rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock))
+
+/* Get a metrics entry for (@saddr, @daddr) in bucket @hash, creating it if
+ * needed.  Runs under tcp_metrics_lock.
+ *
+ * If the entry showed up in the meantime it is returned as is (after the
+ * usual staleness check).  Otherwise a new entry is allocated and pushed at
+ * the head of the chain, or - when the lookup signalled an over-long chain -
+ * the entry with the oldest stamp is overwritten in place.
+ *
+ * Returns the entry, or NULL when the allocation failed.
+ */
+static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
+					  struct inetpeer_addr *saddr,
+					  struct inetpeer_addr *daddr,
+					  unsigned int hash)
+{
+	struct tcp_metrics_block *tm;
+	struct net *net;
+	bool reclaim = false;
+
+	spin_lock_bh(&tcp_metrics_lock);
+	net = dev_net(dst->dev);
+
+	/* While waiting for the spin-lock the cache might have been populated
+	 * with this entry and so we have to check again.
+	 */
+	tm = __tcp_get_metrics(saddr, daddr, net, hash);
+	if (tm == TCP_METRICS_RECLAIM_PTR) {
+		reclaim = true;
+		tm = NULL;
+	}
+	if (tm) {
+		tcpm_check_stamp(tm, dst);
+		goto out_unlock;
+	}
+
+	if (unlikely(reclaim)) {
+		struct tcp_metrics_block *oldest;
+
+		/* Pick the chain entry with the oldest stamp for reuse. */
+		oldest = deref_locked(tcp_metrics_hash[hash].chain);
+		for (tm = deref_locked(oldest->tcpm_next); tm;
+		     tm = deref_locked(tm->tcpm_next)) {
+			if (time_before(READ_ONCE(tm->tcpm_stamp),
+					READ_ONCE(oldest->tcpm_stamp)))
+				oldest = tm;
+		}
+		tm = oldest;
+	} else {
+		tm = kzalloc(sizeof(*tm), GFP_ATOMIC);
+		if (!tm)
+			goto out_unlock;
+	}
+	/* Paired with the READ_ONCE() in tm_net() */
+	WRITE_ONCE(tm->tcpm_net, net);
+
+	tm->tcpm_saddr = *saddr;
+	tm->tcpm_daddr = *daddr;
+
+	/* A recycled entry also gets its Fast Open state cleared. */
+	tcpm_suck_dst(tm, dst, reclaim);
+
+	/* A recycled entry is already on the chain; only link new ones. */
+	if (likely(!reclaim)) {
+		tm->tcpm_next = tcp_metrics_hash[hash].chain;
+		rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm);
+	}
+
+out_unlock:
+	spin_unlock_bh(&tcp_metrics_lock);
+	return tm;
+}
+
+/* Encode a chain lookup result: a hit is returned unchanged, a miss after
+ * more than TCP_METRICS_RECLAIM_DEPTH steps becomes TCP_METRICS_RECLAIM_PTR
+ * and any other miss stays NULL.
+ */
+static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
+{
+	if (!tm && depth > TCP_METRICS_RECLAIM_DEPTH)
+		return TCP_METRICS_RECLAIM_PTR;
+
+	return tm;
+}
+
+/* Walk bucket @hash for the entry matching (@saddr, @daddr, @net).
+ *
+ * Returns the entry on a hit.  On a miss the result is NULL, or
+ * TCP_METRICS_RECLAIM_PTR when the chain walked was longer than
+ * TCP_METRICS_RECLAIM_DEPTH (see tcp_get_encode()).
+ */
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
+						   const struct inetpeer_addr *daddr,
+						   struct net *net, unsigned int hash)
+{
+	struct tcp_metrics_block *tm;
+	int depth = 0;
+
+	tm = rcu_dereference(tcp_metrics_hash[hash].chain);
+	while (tm) {
+		if (addr_same(&tm->tcpm_saddr, saddr) &&
+		    addr_same(&tm->tcpm_daddr, daddr) &&
+		    net_eq(tm_net(tm), net))
+			break;
+
+		depth++;
+		tm = rcu_dereference(tm->tcpm_next);
+	}
+
+	return tcp_get_encode(tm, depth);
+}
+
+/* Look up the metrics entry for the addresses of request socket @req.
+ *
+ * The entry is never created here.  A hit is refreshed from @dst when it
+ * has gone stale.  Returns NULL on a miss or for an unsupported family.
+ */
+static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
+						       struct dst_entry *dst)
+{
+	struct inetpeer_addr saddr, daddr;
+	struct tcp_metrics_block *tm;
+	unsigned int hash;
+	struct net *net;
+
+	saddr.family = req->rsk_ops->family;
+	daddr.family = req->rsk_ops->family;
+	switch (daddr.family) {
+	case AF_INET:
+		inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr);
+		inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr);
+		hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr);
+		inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr);
+		hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
+		break;
+#endif
+	default:
+		return NULL;
+	}
+
+	net = dev_net(dst->dev);
+	hash = hash_32(hash ^ net_hash_mix(net), tcp_metrics_hash_log);
+
+	/* Same chain walk as the socket path; the reclaim hint is of no
+	 * use here, so it is folded back into a plain miss.
+	 */
+	tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
+	if (tm == TCP_METRICS_RECLAIM_PTR)
+		tm = NULL;
+
+	tcpm_check_stamp(tm, dst);
+	return tm;
+}
+
+/* Look up the metrics entry for the addresses of socket @sk, optionally
+ * creating it.
+ *
+ * An AF_INET6 socket with a v4-mapped destination is keyed by its IPv4
+ * addresses.  A hit is refreshed from @dst when it has gone stale; on a
+ * miss the entry is created through tcpm_new() only if @create is set.
+ * Returns NULL for other families, on a miss without @create, or when
+ * tcpm_new() fails.
+ */
+static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
+						 struct dst_entry *dst,
+						 bool create)
+{
+	struct inetpeer_addr saddr, daddr;
+	struct tcp_metrics_block *tm;
+	unsigned int hash;
+	struct net *net;
+
+	switch (sk->sk_family) {
+	case AF_INET:
+		inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
+		inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
+		hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
+			inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
+			inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
+			hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
+		} else {
+			inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr);
+			inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr);
+			hash = ipv6_addr_hash(&sk->sk_v6_daddr);
+		}
+		break;
+#endif
+	default:
+		return NULL;
+	}
+
+	net = dev_net(dst->dev);
+	hash = hash_32(hash ^ net_hash_mix(net), tcp_metrics_hash_log);
+
+	tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
+	if (tm == TCP_METRICS_RECLAIM_PTR)
+		tm = NULL;
+
+	if (tm) {
+		tcpm_check_stamp(tm, dst);
+		return tm;
+	}
+
+	if (create)
+		return tcpm_new(dst, &saddr, &daddr, hash);
+
+	return NULL;
+}
+
+/* Save metrics learned by this TCP session.  This function is called
+ * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
+ * or goes from LAST-ACK to CLOSE.
+ *
+ * Nothing is saved when sysctl_tcp_nometrics_save is set or the socket has
+ * no cached route.  Metrics that are locked in the entry are never
+ * modified, and ssthresh is additionally left alone while
+ * sysctl_tcp_no_ssthresh_metrics_save is set.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	struct tcp_metrics_block *tm;
+	unsigned long rtt;
+	u32 val;
+	int m;
+
+	sk_dst_confirm(sk);
+	if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst)
+		return;
+
+	rcu_read_lock();
+	if (icsk->icsk_backoff || !tp->srtt_us) {
+		/* This session failed to estimate rtt. Why?
+		 * Probably, no packets returned in time.  Reset our
+		 * results.
+		 */
+		tm = tcp_get_metrics(sk, dst, false);
+		if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
+			tcp_metric_set(tm, TCP_METRIC_RTT, 0);
+		goto out_unlock;
+	} else
+		tm = tcp_get_metrics(sk, dst, true);
+
+	if (!tm)
+		goto out_unlock;
+
+	/* m > 0: the cached RTT is larger than this session's srtt. */
+	rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
+	m = rtt - tp->srtt_us;
+
+	/* If newly calculated rtt larger than stored one, store new
+	 * one. Otherwise, use EWMA. Remember, rtt overestimation is
+	 * always better than underestimation.
+	 */
+	if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
+		if (m <= 0)
+			rtt = tp->srtt_us;
+		else
+			rtt -= (m >> 3);
+		tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
+	}
+
+	if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
+		unsigned long var;
+
+		if (m < 0)
+			m = -m;
+
+		/* Scale deviation to rttvar fixed point */
+		m >>= 1;
+		if (m < tp->mdev_us)
+			m = tp->mdev_us;
+
+		/* Raise the cached rttvar at once, lower it gradually. */
+		var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
+		if (m >= var)
+			var = m;
+		else
+			var -= (var - m) >> 2;
+
+		tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
+	}
+
+	if (tcp_in_initial_slowstart(tp)) {
+		/* Slow start still did not finish. */
+		if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
+		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+			if (val && (tcp_snd_cwnd(tp) >> 1) > val)
+				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+					       tcp_snd_cwnd(tp) >> 1);
+		}
+		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+			val = tcp_metric_get(tm, TCP_METRIC_CWND);
+			if (tcp_snd_cwnd(tp) > val)
+				tcp_metric_set(tm, TCP_METRIC_CWND,
+					       tcp_snd_cwnd(tp));
+		}
+	} else if (!tcp_in_slow_start(tp) &&
+		   icsk->icsk_ca_state == TCP_CA_Open) {
+		/* Cong. avoidance phase, cwnd is reliable. */
+		if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
+		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+			tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+				       max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh));
+		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+			val = tcp_metric_get(tm, TCP_METRIC_CWND);
+			tcp_metric_set(tm, TCP_METRIC_CWND, (val + tcp_snd_cwnd(tp)) >> 1);
+		}
+	} else {
+		/* Else slow start did not finish, cwnd is non-sense,
+		 * ssthresh may be also invalid.
+		 */
+		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+			val = tcp_metric_get(tm, TCP_METRIC_CWND);
+			tcp_metric_set(tm, TCP_METRIC_CWND,
+				       (val + tp->snd_ssthresh) >> 1);
+		}
+		if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
+		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+			if (val && tp->snd_ssthresh > val)
+				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+					       tp->snd_ssthresh);
+		}
+		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
+			val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+			if (val < tp->reordering &&
+			    tp->reordering !=
+			    READ_ONCE(net->ipv4.sysctl_tcp_reordering))
+				tcp_metric_set(tm, TCP_METRIC_REORDERING,
+					       tp->reordering);
+		}
+	}
+	WRITE_ONCE(tm->tcpm_stamp, jiffies);
+out_unlock:
+	rcu_read_unlock();
+}
+
+/* Initialize metrics on socket.
+ *
+ * Seeds snd_cwnd_clamp (only when the cached cwnd is locked), snd_ssthresh
+ * and reordering from the cached entry of the socket's destination, if
+ * there is one, and then derives the initial RTO as described below.
+ */
+
+void tcp_init_metrics(struct sock *sk)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	struct tcp_metrics_block *tm;
+	u32 val, crtt = 0; /* cached RTT scaled by 8 */
+
+	sk_dst_confirm(sk);
+	/* ssthresh may have been reduced unnecessarily during
+	 * 3WHS. Restore it back to its initial default.
+	 */
+	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	if (!dst)
+		goto reset;
+
+	rcu_read_lock();
+	tm = tcp_get_metrics(sk, dst, false);
+	if (!tm) {
+		rcu_read_unlock();
+		goto reset;
+	}
+
+	if (tcp_metric_locked(tm, TCP_METRIC_CWND))
+		tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
+
+	/* The cached ssthresh is ignored while saving it is disabled. */
+	val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ?
+	      0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+	if (val) {
+		tp->snd_ssthresh = val;
+		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+			tp->snd_ssthresh = tp->snd_cwnd_clamp;
+	}
+	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+	if (val && tp->reordering != val)
+		tp->reordering = val;
+
+	crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
+	rcu_read_unlock();
+reset:
+	/* The initial RTT measurement from the SYN/SYN-ACK is not ideal
+	 * to seed the RTO for later data packets because SYN packets are
+	 * small. Use the per-dst cached values to seed the RTO but keep
+	 * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
+	 * Later the RTO will be updated immediately upon obtaining the first
+	 * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
+	 * influences the first RTO but not later RTT estimation.
+	 *
+	 * But if RTT is not available from the SYN (due to retransmits or
+	 * syn cookies) or the cache, force a conservative 3secs timeout.
+	 *
+	 * A bit of theory. RTT is time passed after "normal" sized packet
+	 * is sent until it is ACKed. In normal circumstances sending small
+	 * packets force peer to delay ACKs and calculation is correct too.
+	 * The algorithm is adaptive and, provided we follow specs, it
+	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
+	 * tricks sort of "quick acks" for time long enough to decrease RTT
+	 * to low value, and then abruptly stops to do it and starts to delay
+	 * ACKs, wait for troubles.
+	 */
+	if (crtt > tp->srtt_us) {
+		/* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
+		crtt /= 8 * USEC_PER_SEC / HZ;
+		inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
+	} else if (tp->srtt_us == 0) {
+		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
+		 * 3WHS. This is most likely due to retransmission,
+		 * including spurious one. Reset the RTO back to 3secs
+		 * from the more aggressive 1sec to avoid more spurious
+		 * retransmission.
+		 */
+		tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
+		tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;
+
+		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
+	}
+}
+
+/* A peer counts as proven once the metrics cache holds a non-zero RTT for
+ * the destination of @req; without a dst there is nothing to look up.
+ */
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
+{
+	struct tcp_metrics_block *entry;
+	bool proven = false;
+
+	if (!dst)
+		return false;
+
+	rcu_read_lock();
+	entry = __tcp_get_metrics_req(req, dst);
+	if (entry)
+		proven = tcp_metric_get(entry, TCP_METRIC_RTT) != 0;
+	rcu_read_unlock();
+	return proven;
+}
+
+void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+			    struct tcp_fastopen_cookie *cookie)
+{
+	struct tcp_fastopen_metrics *fo;
+	struct tcp_metrics_block *entry;
+	unsigned int seq;
+
+	rcu_read_lock();
+	entry = tcp_get_metrics(sk, __sk_dst_get(sk), false);
+	if (entry) {	/* outputs stay untouched when nothing is cached */
+		fo = &entry->tcpm_fastopen;
+		do {	/* re-read if a writer raced with us */
+			seq = read_seqbegin(&fastopen_seqlock);
+			if (fo->mss)
+				*mss = fo->mss;
+			*cookie = fo->cookie;
+			if (cookie->len <= 0 && fo->try_exp == 1)
+				cookie->exp = true;
+		} while (read_seqretry(&fastopen_seqlock, seq));
+	}
+	rcu_read_unlock();
+}
+
+/* Record Fast Open results (MSS, cookie, SYN loss) for sk's destination. */
+void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+			    struct tcp_fastopen_cookie *cookie, bool syn_lost,
+			    u16 try_exp)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_metrics_block *entry;
+
+	if (!dst)
+		return;
+	rcu_read_lock();
+	entry = tcp_get_metrics(sk, dst, true);
+	if (entry) {
+		struct tcp_fastopen_metrics *fo = &entry->tcpm_fastopen;
+
+		write_seqlock_bh(&fastopen_seqlock);
+		if (mss)
+			fo->mss = mss;
+		if (cookie && cookie->len > 0)
+			fo->cookie = *cookie;
+		else if (try_exp > fo->try_exp && fo->cookie.len <= 0 && !fo->cookie.exp)
+			fo->try_exp = try_exp;
+		if (syn_lost) {
+			fo->syn_loss++;
+			fo->last_syn_loss = jiffies;
+		} else
+			fo->syn_loss = 0;
+		write_sequnlock_bh(&fastopen_seqlock);
+	}
+	rcu_read_unlock();
+}
+
+static struct genl_family tcp_metrics_nl_family;	/* forward declaration; initialized further down */
+
+static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
+	[TCP_METRICS_ATTR_ADDR_IPV4]	= { .type = NLA_U32, },
+	[TCP_METRICS_ATTR_ADDR_IPV6]	= { .type = NLA_BINARY,
+					    .len = sizeof(struct in6_addr), },
+	/* The following attributes are never received for GET/DEL;
+	 * they are kept (compiled out by #if 0) for reference only.
+	 */
+#if 0
+	[TCP_METRICS_ATTR_AGE]		= { .type = NLA_MSECS, },
+	[TCP_METRICS_ATTR_TW_TSVAL]	= { .type = NLA_U32, },
+	[TCP_METRICS_ATTR_TW_TS_STAMP]	= { .type = NLA_S32, },
+	[TCP_METRICS_ATTR_VALS]		= { .type = NLA_NESTED, },
+	[TCP_METRICS_ATTR_FOPEN_MSS]	= { .type = NLA_U16, },
+	[TCP_METRICS_ATTR_FOPEN_SYN_DROPS]	= { .type = NLA_U16, },
+	[TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS]	= { .type = NLA_MSECS, },
+	[TCP_METRICS_ATTR_FOPEN_COOKIE]	= { .type = NLA_BINARY,
+					    .len = TCP_FASTOPEN_COOKIE_MAX, },
+#endif
+};
+
+/* Add the attributes describing @tm to @msg; on failure the caller cancels its header. */
+static int tcp_metrics_fill_info(struct sk_buff *msg,
+				 struct tcp_metrics_block *tm)
+{
+	struct nlattr *nest;
+	int i;
+
+	switch (tm->tcpm_daddr.family) {	/* destination and source address, per family */
+	case AF_INET:
+		if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4,
+				    inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0)
+			goto nla_put_failure;
+		if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4,
+				    inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0)
+			goto nla_put_failure;
+		break;
+	case AF_INET6:
+		if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6,
+				     inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0)
+			goto nla_put_failure;
+		if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6,
+				     inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0)
+			goto nla_put_failure;
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+
+	if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
+			  jiffies - READ_ONCE(tm->tcpm_stamp),	/* jiffies elapsed since tcpm_stamp */
+			  TCP_METRICS_ATTR_PAD) < 0)
+		goto nla_put_failure;
+
+	{
+		int n = 0;	/* number of values placed in the nest */
+
+		nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS);
+		if (!nest)
+			goto nla_put_failure;
+		for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {	/* nested attribute type is metric index + 1 */
+			u32 val = tcp_metric_get(tm, i);
+
+			if (!val)	/* zero-valued metrics are not reported */
+				continue;
+			if (i == TCP_METRIC_RTT) {
+				if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
+						val) < 0)
+					goto nla_put_failure;
+				n++;
+				val = max(val / 1000, 1U);	/* TCP_METRIC_RTT itself carries val / 1000, at least 1 */
+			}
+			if (i == TCP_METRIC_RTTVAR) {
+				if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
+						val) < 0)
+					goto nla_put_failure;
+				n++;
+				val = max(val / 1000, 1U);	/* same scaling for TCP_METRIC_RTTVAR */
+			}
+			if (nla_put_u32(msg, i + 1, val) < 0)
+				goto nla_put_failure;
+			n++;
+		}
+		if (n)
+			nla_nest_end(msg, nest);
+		else
+			nla_nest_cancel(msg, nest);	/* nothing was added: drop the empty nest */
+	}
+
+	{
+		struct tcp_fastopen_metrics tfom_copy[1], *tfom;
+		unsigned int seq;
+
+		do {	/* copy the Fast Open state, retrying if a writer interfered */
+			seq = read_seqbegin(&fastopen_seqlock);
+			tfom_copy[0] = tm->tcpm_fastopen;
+		} while (read_seqretry(&fastopen_seqlock, seq));
+
+		tfom = tfom_copy;	/* everything below reads the local copy */
+		if (tfom->mss &&
+		    nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
+				tfom->mss) < 0)
+			goto nla_put_failure;
+		if (tfom->syn_loss &&
+		    (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
+				tfom->syn_loss) < 0 ||
+		     nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
+				jiffies - tfom->last_syn_loss,
+				TCP_METRICS_ATTR_PAD) < 0))
+			goto nla_put_failure;
+		if (tfom->cookie.len > 0 &&
+		    nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
+			    tfom->cookie.len, tfom->cookie.val) < 0)
+			goto nla_put_failure;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;	/* every failed put is reported as -EMSGSIZE */
+}
+
+/* Emit one NLM_F_MULTI GET message describing @tm into the dump skb.
+ * Returns 0 on success, -EMSGSIZE otherwise (a started message is cancelled).
+ */
+static int tcp_metrics_dump_info(struct sk_buff *skb,
+				 struct netlink_callback *cb,
+				 struct tcp_metrics_block *tm)
+{
+	void *msg_hdr;
+
+	msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			      &tcp_metrics_nl_family, NLM_F_MULTI, TCP_METRICS_CMD_GET);
+	if (!msg_hdr)
+		return -EMSGSIZE;
+
+	if (tcp_metrics_fill_info(skb, tm) < 0) {
+		genlmsg_cancel(skb, msg_hdr);
+		return -EMSGSIZE;
+	}
+
+	genlmsg_end(skb, msg_hdr);
+	return 0;
+}
+
+static int tcp_metrics_nl_dump(struct sk_buff *skb,
+			       struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	unsigned int max_rows = 1U << tcp_metrics_hash_log;
+	unsigned int row, s_row = cb->args[0];	/* bucket to resume from */
+	int s_col = cb->args[1], col = s_col;	/* chain position to resume from */
+
+	for (row = s_row; row < max_rows; row++, s_col = 0) {	/* s_col only applies to the first row visited */
+		struct tcp_metrics_block *tm;
+		struct tcpm_hash_bucket *hb = tcp_metrics_hash + row;
+
+		rcu_read_lock();
+		for (col = 0, tm = rcu_dereference(hb->chain); tm;
+		     tm = rcu_dereference(tm->tcpm_next), col++) {
+			if (!net_eq(tm_net(tm), net))	/* foreign netns entries still advance col */
+				continue;
+			if (col < s_col)	/* already dumped by an earlier call */
+				continue;
+			if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
+				rcu_read_unlock();
+				goto done;	/* stop here; row/col mark where to resume */
+			}
+		}
+		rcu_read_unlock();
+	}
+
+done:
+	cb->args[0] = row;	/* save the position for the next invocation */
+	cb->args[1] = col;
+	return skb->len;
+}
+
+/* Parse attribute @v4 or else @v6 into @addr (and @hash when non-NULL).
+ * With neither present: 1 if @optional, otherwise -EAFNOSUPPORT.
+ */
+static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+			   unsigned int *hash, int optional, int v4, int v6)
+{
+	struct nlattr *attr4 = info->attrs[v4];
+	struct nlattr *attr6 = info->attrs[v6];
+	struct in6_addr in6;
+
+	if (attr4) {
+		inetpeer_set_addr_v4(addr, nla_get_in_addr(attr4));
+		if (hash)
+			*hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr));
+		return 0;
+	}
+	if (!attr6)
+		return optional ? 1 : -EAFNOSUPPORT;
+	if (nla_len(attr6) != sizeof(in6))
+		return -EINVAL;
+	in6 = nla_get_in6_addr(attr6);
+	inetpeer_set_addr_v6(addr, &in6);
+	if (hash)
+		*hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr));
+	return 0;
+}
+
+/* Parse the destination address attributes (IPv4 first, then IPv6). */
+static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+			 unsigned int *hash, int optional)
+{
+	return __parse_nl_addr(info, addr, hash, optional,
+			       TCP_METRICS_ATTR_ADDR_IPV4, TCP_METRICS_ATTR_ADDR_IPV6);
+}
+
+/* Parse the source address attributes; absent ones yield -EAFNOSUPPORT. */
+static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr)
+{
+	return __parse_nl_addr(info, addr, NULL, 0,
+			       TCP_METRICS_ATTR_SADDR_IPV4, TCP_METRICS_ATTR_SADDR_IPV6);
+}
+
+/* GET doit handler: reply with the entry matching the destination address
+ * (and the source address, if one was supplied) within the request's netns.
+ */
+static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct inetpeer_addr saddr, daddr;
+	struct tcp_metrics_block *tm;
+	struct sk_buff *msg;
+	unsigned int hash;
+	bool match_src;
+	void *reply;
+	int ret;
+
+	ret = parse_nl_addr(info, &daddr, &hash, 0);
+	if (ret < 0)
+		return ret;
+	match_src = parse_nl_saddr(info, &saddr) >= 0;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
+				  info->genlhdr->cmd);
+	if (!reply) {
+		ret = -EMSGSIZE;
+		goto out_free;
+	}
+
+	hash = hash_32(hash ^ net_hash_mix(net), tcp_metrics_hash_log);
+	ret = -ESRCH;
+	rcu_read_lock();
+	for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
+	     tm = rcu_dereference(tm->tcpm_next)) {
+		if (!addr_same(&tm->tcpm_daddr, &daddr))
+			continue;
+		if (match_src && !addr_same(&tm->tcpm_saddr, &saddr))
+			continue;
+		if (!net_eq(tm_net(tm), net))
+			continue;
+		ret = tcp_metrics_fill_info(msg, tm);
+		break;
+	}
+	rcu_read_unlock();
+	if (ret < 0)
+		goto out_free;
+
+	genlmsg_end(msg, reply);
+	return genlmsg_reply(msg, info);
+
+out_free:
+	nlmsg_free(msg);
+	return ret;
+}
+
+/* Drop all entries of @net; with a NULL @net, those of netns whose refcount is zero. */
+static void tcp_metrics_flush_all(struct net *net)
+{
+	unsigned int nr_rows = 1U << tcp_metrics_hash_log;
+	struct tcp_metrics_block __rcu **link;
+	struct tcp_metrics_block *tm;
+	unsigned int row;
+
+	for (row = 0; row < nr_rows; row++) {
+		spin_lock_bh(&tcp_metrics_lock);
+		link = &tcp_metrics_hash[row].chain;
+		while ((tm = deref_locked(*link)) != NULL) {
+			bool doomed = net ? net_eq(tm_net(tm), net) :
+					    !refcount_read(&tm_net(tm)->ns.count);
+
+			if (doomed) {
+				/* Unlink now; the memory is released through kfree_rcu(). */
+				rcu_assign_pointer(*link, tm->tcpm_next);
+				kfree_rcu(tm, rcu_head);
+			} else {
+				link = &tm->tcpm_next;
+			}
+		}
+		spin_unlock_bh(&tcp_metrics_lock);
+	}
+}
+
+/* DEL doit handler: without a destination address flush the whole netns,
+ * otherwise remove every entry matching the destination (and source, when
+ * supplied). Returns -ESRCH if nothing matched.
+ */
+static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct tcp_metrics_block __rcu **link;
+	struct inetpeer_addr saddr, daddr;
+	struct tcp_metrics_block *tm;
+	bool match_src, found = false;
+	unsigned int hash;
+	int ret;
+
+	ret = parse_nl_addr(info, &daddr, &hash, 1);
+	if (ret < 0)
+		return ret;
+	if (ret > 0) {
+		tcp_metrics_flush_all(net);
+		return 0;
+	}
+
+	match_src = parse_nl_saddr(info, &saddr) >= 0;
+
+	hash = hash_32(hash ^ net_hash_mix(net), tcp_metrics_hash_log);
+	link = &tcp_metrics_hash[hash].chain;
+	spin_lock_bh(&tcp_metrics_lock);
+	/* Unlink every match; the memory is released through kfree_rcu(). */
+	while ((tm = deref_locked(*link)) != NULL) {
+		if (!addr_same(&tm->tcpm_daddr, &daddr) ||
+		    (match_src && !addr_same(&tm->tcpm_saddr, &saddr)) ||
+		    !net_eq(tm_net(tm), net)) {
+			link = &tm->tcpm_next;
+			continue;
+		}
+		rcu_assign_pointer(*link, tm->tcpm_next);
+		kfree_rcu(tm, rcu_head);
+		found = true;
+	}
+	spin_unlock_bh(&tcp_metrics_lock);
+
+	return found ? 0 : -ESRCH;
+}
+
+static const struct genl_small_ops tcp_metrics_nl_ops[] = {
+	{
+		.cmd = TCP_METRICS_CMD_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = tcp_metrics_nl_cmd_get,	/* single-entry lookup */
+		.dumpit = tcp_metrics_nl_dump,	/* walk of the whole hash table */
+	},
+	{
+		.cmd = TCP_METRICS_CMD_DEL,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = tcp_metrics_nl_cmd_del,	/* delete matching entries, or flush when no address is given */
+		.flags = GENL_ADMIN_PERM,	/* NOTE(review): presumably restricts DEL to privileged callers - confirm */
+	},
+};
+
+static struct genl_family tcp_metrics_nl_family __ro_after_init = {
+	.hdrsize	= 0,	/* no family-specific header */
+	.name		= TCP_METRICS_GENL_NAME,
+	.version	= TCP_METRICS_GENL_VERSION,
+	.maxattr	= TCP_METRICS_ATTR_MAX,
+	.policy = tcp_metrics_nl_policy,	/* only the two destination address attributes have entries */
+	.netnsok	= true,
+	.module		= THIS_MODULE,
+	.small_ops	= tcp_metrics_nl_ops,
+	.n_small_ops	= ARRAY_SIZE(tcp_metrics_nl_ops),
+	.resv_start_op	= TCP_METRICS_CMD_DEL + 1,	/* one past the last command defined above */
+};
+
+static unsigned int tcpmhash_entries __initdata;
+
+/* "tcpmhash_entries=" boot option; returns 1 when parsed, 0 otherwise. */
+static int __init set_tcpmhash_entries(char *str)
+{
+	if (!str)
+		return 0;
+
+	/* kstrtouint() returns 0 on success; base 0 auto-detects the radix. */
+	if (kstrtouint(str, 0, &tcpmhash_entries))
+		return 0;
+
+	return 1;
+}
+__setup("tcpmhash_entries=", set_tcpmhash_entries);
+
+/* Size and allocate the global hash table; panics if allocation fails.
+ * Default: 16K buckets with at least 128K pages of RAM, 8K otherwise.
+ */
+static void __init tcp_metrics_hash_alloc(void)
+{
+	unsigned int nr_buckets = tcpmhash_entries;
+	size_t bytes;
+
+	if (!nr_buckets)
+		nr_buckets = (totalram_pages() >= 128 * 1024) ? 16 * 1024 : 8 * 1024;
+
+	/* Only the log2 is kept, so the table holds 1 << log buckets. */
+	tcp_metrics_hash_log = order_base_2(nr_buckets);
+	bytes = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log;
+
+	tcp_metrics_hash = kvzalloc(bytes, GFP_KERNEL);
+	if (!tcp_metrics_hash)
+		panic("Could not allocate the tcp_metrics hash table\n");
+}
+
+static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list)
+{
+	tcp_metrics_flush_all(NULL);	/* NULL: drop entries of every netns whose refcount is zero */
+}
+
+static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
+	.exit_batch	=	tcp_net_metrics_exit_batch,	/* the only hook set here */
+};
+
+/* Boot-time setup of the TCP metrics cache: allocate the hash table,
+ * register the pernet exit hook, then the generic netlink family.
+ * Any failure panics.
+ */
+void __init tcp_metrics_init(void)
+{
+	tcp_metrics_hash_alloc();
+
+	if (register_pernet_subsys(&tcp_net_metrics_ops) < 0)
+		panic("Could not register tcp_net_metrics_ops\n");
+
+	if (genl_register_family(&tcp_metrics_nl_family) < 0)
+		panic("Could not register tcp_metrics generic netlink\n");
+}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
new file mode 100644
index 0000000000..b98d476f15
--- /dev/null
+++ b/net/ipv4/tcp_minisocks.c
@@ -0,0 +1,895 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <net/tcp.h>
+#include <net/xfrm.h>
+#include <net/busy_poll.h>
+
+/* A segment is in window if it starts at s_win, overlaps the window, or
+ * is empty (seq == end_seq) and sits exactly on the right edge e_win.
+ */
+static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+{
+	return seq == s_win || (after(end_seq, s_win) && before(seq, e_win)) ||
+	       (seq == e_win && seq == end_seq);
+}
+
+static enum tcp_tw_status
+tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
+				  const struct sk_buff *skb, int mib_idx)
+{
+	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+	bool limited;
+
+	limited = tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
+				       &tcptw->tw_last_oow_ack_time);
+	if (limited) {
+		/* Rate-limited: release the tw sock here and drop the skb. */
+		inet_twsk_put(tw);
+		return TCP_TW_SUCCESS;
+	}
+
+	/* Send an ACK; the bucket is not put here, the caller releases it. */
+	return TCP_TW_ACK;
+}
+
+/*
+ * * Main purpose of TIME-WAIT state is to close connection gracefully,
+ *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
+ *   (and, probably, tail of data) and one or more our ACKs are lost.
+ * * What is TIME-WAIT timeout? It is associated with maximal packet
+ *   lifetime in the internet, which results in wrong conclusion, that
+ *   it is set to catch "old duplicate segments" wandering out of their path.
+ *   It is not quite correct. This timeout is calculated so that it exceeds
+ *   maximal retransmission timeout enough to allow to lose one (or more)
+ *   segments sent by peer and our ACKs. This time may be calculated from RTO.
+ * * When TIME-WAIT socket receives RST, it means that another end
+ *   finally closed and we are allowed to kill TIME-WAIT too.
+ * * Second purpose of TIME-WAIT is catching old duplicate segments.
+ *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
+ *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
+ * * If we invented some more clever way to catch duplicates
+ *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
+ *
+ * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
+ * When you compare it to RFCs, please, read section SEGMENT ARRIVES
+ * from the very beginning.
+ *
+ * NOTE. With recycling (and later with fin-wait-2) TW bucket
+ * is _not_ stateless. It means, that strictly speaking we must
+ * spinlock it. I do not want! Well, probability of misbehaviour
+ * is ridiculously low and, seems, we could use some mb() tricks
+ * to avoid misread sequence numbers, states etc.  --ANK
+ *
+ * We don't need to initialize tmp_opt.sack_ok as we don't use the results
+ */
+enum tcp_tw_status
+tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+			   const struct tcphdr *th)
+{
+	struct tcp_options_received tmp_opt;
+	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+	bool paws_reject = false;
+
+	tmp_opt.saw_tstamp = 0;	/* read below even when options are never parsed */
+	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {	/* header carries options and a timestamp was recorded */
+		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
+
+		if (tmp_opt.saw_tstamp) {
+			if (tmp_opt.rcv_tsecr)
+				tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
+			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
+			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
+			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
+		}
+	}
+
+	if (tw->tw_substate == TCP_FIN_WAIT2) {
+		/* Just repeat all the checks of tcp_rcv_state_process() */
+
+		/* Out of window, send ACK */
+		if (paws_reject ||
+		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+				   tcptw->tw_rcv_nxt,
+				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
+			return tcp_timewait_check_oow_rate_limit(
+				tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);
+
+		if (th->rst)
+			goto kill;	/* label lives inside the TIME-WAIT RST branch below */
+
+		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
+			return TCP_TW_RST;
+
+		/* Dup ACK? */
+		if (!th->ack ||
+		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
+		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
+			inet_twsk_put(tw);
+			return TCP_TW_SUCCESS;
+		}
+
+		/* New data or FIN. If new data arrive after half-duplex close,
+		 * reset.
+		 */
+		if (!th->fin ||
+		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1)
+			return TCP_TW_RST;
+
+		/* FIN arrived, enter true time-wait state. */
+		tw->tw_substate	  = TCP_TIME_WAIT;
+		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		if (tmp_opt.saw_tstamp) {
+			tcptw->tw_ts_recent_stamp = ktime_get_seconds();
+			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
+		}
+
+		inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
+		return TCP_TW_ACK;
+	}
+
+	/*
+	 *	Now real TIME-WAIT state.
+	 *
+	 *	RFC 1122:
+	 *	"When a connection is [...] on TIME-WAIT state [...]
+	 *	[a TCP] MAY accept a new SYN from the remote TCP to
+	 *	reopen the connection directly, if it:
+	 *
+	 *	(1)  assigns its initial sequence number for the new
+	 *	connection to be larger than the largest sequence
+	 *	number it used on the previous connection incarnation,
+	 *	and
+	 *
+	 *	(2)  returns to TIME-WAIT state if the SYN turns out
+	 *	to be an old duplicate".
+	 */
+
+	if (!paws_reject &&
+	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
+	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
+		/* In window segment, it may be only reset or bare ack. */
+
+		if (th->rst) {
+			/* This is TIME_WAIT assassination, in two flavors.
+			 * Oh well... nobody has a sufficient solution to this
+			 * protocol bug yet.
+			 */
+			if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) {
+kill:
+				inet_twsk_deschedule_put(tw);
+				return TCP_TW_SUCCESS;
+			}
+		} else {
+			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
+		}
+
+		if (tmp_opt.saw_tstamp) {
+			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
+			tcptw->tw_ts_recent_stamp = ktime_get_seconds();
+		}
+
+		inet_twsk_put(tw);
+		return TCP_TW_SUCCESS;
+	}
+
+	/* Out of window segment.
+
+	   All the segments are ACKed immediately.
+
+	   The only exception is new SYN. We accept it, if it is
+	   not old duplicate and we are not in danger to be killed
+	   by delayed old duplicates. RFC check is that it has
+	   newer sequence number works at rates <40Mbit/sec.
+	   However, if paws works, it is reliable AND even more,
+	   we even may relax silly seq space cutoff.
+
+	   RED-PEN: we violate main RFC requirement, if this SYN will appear
+	   old duplicate (i.e. we receive RST in reply to SYN-ACK),
+	   we must return socket to time-wait state. It is not good,
+	   but not fatal yet.
+	 */
+
+	if (th->syn && !th->rst && !th->ack && !paws_reject &&
+	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+	     (tmp_opt.saw_tstamp &&
+	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
+		if (isn == 0)	/* NOTE(review): presumably 0 is reserved to mean "no ISN chosen" - confirm */
+			isn++;
+		TCP_SKB_CB(skb)->tcp_tw_isn = isn;
+		return TCP_TW_SYN;
+	}
+
+	if (paws_reject)
+		__NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
+
+	if (!th->rst) {
+		/* In this case we must reset the TIMEWAIT timer.
+		 *
+		 * If it is ACKless SYN it may be both old duplicate
+		 * and new good SYN with random sequence number <rcv_nxt.
+		 * Do not reschedule in the last case.
+		 */
+		if (paws_reject || th->ack)
+			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
+
+		return tcp_timewait_check_oow_rate_limit(
+			tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
+	}
+	inet_twsk_put(tw);	/* out-of-window RST: drop our reference without replying */
+	return TCP_TW_SUCCESS;
+}
+EXPORT_SYMBOL(tcp_timewait_state_process);
+
+/* Give the timewait sock a private copy of the MD5 key in use, if any. */
+static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
+{
+#ifdef CONFIG_TCP_MD5SIG
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_md5sig_key *orig;
+
+	/* The timewait bucket does not have the key DB of the sock structure,
+	 * so duplicate the key now: the code generating timewait ACKs needs it.
+	 */
+	tcptw->tw_md5_key = NULL;
+	if (!static_branch_unlikely(&tcp_md5_needed.key))
+		return;
+
+	orig = tp->af_specific->md5_lookup(sk, sk);
+	if (!orig)
+		return;
+
+	tcptw->tw_md5_key = kmemdup(orig, sizeof(*orig), GFP_ATOMIC);
+	if (!tcptw->tw_md5_key)
+		return;
+
+	/* Both the sig pool and a tcp_md5_needed reference must be obtained. */
+	if (tcp_alloc_md5sig_pool() &&
+	    static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key))
+		return;
+
+	/* Unexpected failure: warn once and give the copy back. */
+	WARN_ON_ONCE(1);
+	kfree(tcptw->tw_md5_key);
+	tcptw->tw_md5_key = NULL;
+#endif
+}
+
+/*
+ * Move a socket to time-wait or dead fin-wait-2 state.
+ */
+void tcp_time_wait(struct sock *sk, int state, int timeo)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	struct inet_timewait_sock *tw;
+
+	tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state);
+
+	if (tw) {
+		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);	/* 4x - x/2, i.e. 3.5 * icsk_rto */
+
+		tw->tw_transparent	= inet_test_bit(TRANSPARENT, sk);
+		tw->tw_mark		= sk->sk_mark;
+		tw->tw_priority		= sk->sk_priority;
+		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
+		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
+		tcptw->tw_snd_nxt	= tp->snd_nxt;
+		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
+		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
+		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+		tcptw->tw_ts_offset	= tp->tsoffset;
+		tcptw->tw_last_oow_ack_time = 0;
+		tcptw->tw_tx_delay	= tp->tcp_tx_delay;
+		tw->tw_txhash		= sk->sk_txhash;
+#if IS_ENABLED(CONFIG_IPV6)
+		if (tw->tw_family == PF_INET6) {
+			struct ipv6_pinfo *np = inet6_sk(sk);
+
+			tw->tw_v6_daddr = sk->sk_v6_daddr;
+			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+			tw->tw_tclass = np->tclass;
+			tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
+			tw->tw_ipv6only = sk->sk_ipv6only;
+		}
+#endif
+
+		tcp_time_wait_init(sk, tcptw);
+
+		/* Get the TIME_WAIT timeout firing. */
+		if (timeo < rto)	/* never wait less than 3.5 * RTO */
+			timeo = rto;
+
+		if (state == TCP_TIME_WAIT)	/* true TIME-WAIT always uses the fixed length */
+			timeo = TCP_TIMEWAIT_LEN;
+
+		/* tw_timer is pinned, so we need to make sure BH are disabled
+		 * in following section, otherwise timer handler could run before
+		 * we complete the initialization.
+		 */
+		local_bh_disable();
+		inet_twsk_schedule(tw, timeo);
+		/* Linkage updates.
+		 * Note that access to tw after this point is illegal.
+		 */
+		inet_twsk_hashdance(tw, sk, net->ipv4.tcp_death_row.hashinfo);
+		local_bh_enable();
+	} else {
+		/* Sorry, if we're out of memory, just CLOSE this
+		 * socket up.  We've got bigger problems than
+		 * non-graceful socket closings.
+		 */
+		NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW);
+	}
+
+	tcp_update_metrics(sk);	/* runs whether or not a tw sock was allocated */
+	tcp_done(sk);
+}
+EXPORT_SYMBOL(tcp_time_wait);
+
+/* Release the MD5 key copy of a timewait sock and its tcp_md5_needed ref. */
+void tcp_twsk_destructor(struct sock *sk)
+{
+#ifdef CONFIG_TCP_MD5SIG
+	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+
+	if (!static_branch_unlikely(&tcp_md5_needed.key) || !twsk->tw_md5_key)
+		return;
+
+	kfree_rcu(twsk->tw_md5_key, rcu);
+	static_branch_slow_dec_deferred(&tcp_md5_needed);
+#endif
+}
+EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+
+/* Purge per-netns tables one by one, the shared tcp_hashinfo at most once. */
+void tcp_twsk_purge(struct list_head *net_exit_list, int family)
+{
+	bool global_purged = false;
+	struct net *net;
+
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		if (net->ipv4.tcp_death_row.hashinfo->pernet) {
+			/* Even if tw_refcount == 1, we must clean up kernel reqsk */
+			inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family);
+			continue;
+		}
+		/* The last refcount is decremented in tcp_sk_exit_batch() */
+		if (global_purged || refcount_read(&net->ipv4.tcp_death_row.tw_refcount) == 1)
+			continue;
+		inet_twsk_purge(&tcp_hashinfo, family);
+		global_purged = true;
+	}
+}
+EXPORT_SYMBOL_GPL(tcp_twsk_purge);
+
+/* Warning: this function is called without sk_listener being locked.
+ * Read each socket field only once, as its value could change under us.
+ */
+void tcp_openreq_init_rwin(struct request_sock *req,
+			   const struct sock *sk_listener,
+			   const struct dst_entry *dst)
+{
+	const struct tcp_sock *listener_tp = tcp_sk(sk_listener);
+	struct inet_request_sock *ireq = inet_rsk(req);
+	int full_space = tcp_full_space(sk_listener);
+	u32 window_clamp;
+	u32 rcv_wnd;
+	__u8 rcv_wscale;
+	int opt_mss;
+	int mss;
+
+	mss = tcp_mss_clamp(listener_tp, dst_metric_advmss(dst));
+	window_clamp = READ_ONCE(listener_tp->window_clamp);
+	/* Set this up on the first call only */
+	if (!window_clamp)
+		window_clamp = dst_metric(dst, RTAX_WINDOW);
+	req->rsk_window_clamp = window_clamp;
+
+	/* limit the window selection if the user enforces a smaller rx buffer */
+	if ((sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+	    (req->rsk_window_clamp == 0 || req->rsk_window_clamp > full_space))
+		req->rsk_window_clamp = full_space;
+
+	rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req);
+	if (rcv_wnd == 0)
+		rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+	else if (full_space < rcv_wnd * mss)
+		full_space = rcv_wnd * mss;
+
+	/* tcp_full_space because it is guaranteed to be the first packet */
+	opt_mss = mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0);
+	tcp_select_initial_window(sk_listener, full_space, opt_mss,
+				  &req->rsk_rcv_wnd, &req->rsk_window_clamp,
+				  ireq->wscale_ok, &rcv_wscale, rcv_wnd);
+	ireq->rcv_wscale = rcv_wscale;
+}
+EXPORT_SYMBOL(tcp_openreq_init_rwin);
+
+static void tcp_ecn_openreq_child(struct tcp_sock *tp, const struct request_sock *req)
+{
+	/* The child gets TCP_ECN_OK only when the request has ecn_ok set. */
+	tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
+}
+
+/* Choose the child's CA: route metric first, then the socket's own, then the default. */
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+	const struct tcp_congestion_ops *ca;
+	bool from_dst = false;
+
+	/* A congestion control named by the route metric is tried first. */
+	if (ca_key != TCP_CA_UNSPEC) {
+		rcu_read_lock();
+		ca = tcp_ca_find_key(ca_key);
+		if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
+			icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+			icsk->icsk_ca_ops = ca;
+			from_dst = true;
+		}
+		rcu_read_unlock();
+	}
+
+	/* If no valid choice made yet, assign current system default ca. */
+	if (!from_dst && !(icsk->icsk_ca_setsockopt &&
+			   bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner)))
+		tcp_assign_congestion_control(sk);
+
+	tcp_set_ca_state(sk, TCP_CA_Open);
+}
+EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
+
+/* Clear syn_smc on the child when the request does not have smc_ok set. */
+static void smc_check_reset_syn_req(const struct tcp_sock *oldtp,
+				    struct request_sock *req,
+				    struct tcp_sock *newtp)
+{
+#if IS_ENABLED(CONFIG_SMC)
+	const struct inet_request_sock *ireq = inet_rsk(req);
+
+	if (!static_branch_unlikely(&tcp_have_smc))
+		return;
+	if (oldtp->syn_smc && !ireq->smc_ok)
+		newtp->syn_smc = 0;
+#endif
+}
+
+/* This is not only more efficient than what we used to do, it eliminates
+ * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ *
+ * Actually, we could save lots of memory writes here. tp of listening
+ * socket contains all necessary default parameters.
+ */
+struct sock *tcp_create_openreq_child(const struct sock *sk,
+				      struct request_sock *req,
+				      struct sk_buff *skb)
+{
+	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct tcp_request_sock *treq = tcp_rsk(req);
+	struct inet_connection_sock *newicsk;
+	const struct tcp_sock *oldtp;
+	struct tcp_sock *newtp;
+	u32 seq;
+
+	if (!newsk)	/* cloning failed: no child is created */
+		return NULL;
+
+	newicsk = inet_csk(newsk);
+	newtp = tcp_sk(newsk);
+	oldtp = tcp_sk(sk);
+
+	smc_check_reset_syn_req(oldtp, req, newtp);
+
+	/* Now setup tcp_sock */
+	newtp->pred_flags = 0;
+
+	seq = treq->rcv_isn + 1;	/* receive side starts one past the peer's ISN */
+	newtp->rcv_wup = seq;
+	WRITE_ONCE(newtp->copied_seq, seq);
+	WRITE_ONCE(newtp->rcv_nxt, seq);
+	newtp->segs_in = 1;
+
+	seq = treq->snt_isn + 1;	/* send side starts one past our own ISN */
+	newtp->snd_sml = newtp->snd_una = seq;
+	WRITE_ONCE(newtp->snd_nxt, seq);
+	newtp->snd_up = seq;
+
+	INIT_LIST_HEAD(&newtp->tsq_node);
+	INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
+
+	tcp_init_wl(newtp, treq->rcv_isn);
+
+	minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
+	newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
+
+	newtp->lsndtime = tcp_jiffies32;
+	newsk->sk_txhash = READ_ONCE(treq->txhash);
+	newtp->total_retrans = req->num_retrans;
+
+	tcp_init_xmit_timers(newsk);
+	WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);
+
+	if (sock_flag(newsk, SOCK_KEEPOPEN))
+		inet_csk_reset_keepalive_timer(newsk,
+					       keepalive_time_when(newtp));
+
+	newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
+	newtp->rx_opt.sack_ok = ireq->sack_ok;
+	newtp->window_clamp = req->rsk_window_clamp;
+	newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+	newtp->rcv_wnd = req->rsk_rcv_wnd;
+	newtp->rx_opt.wscale_ok = ireq->wscale_ok;
+	if (newtp->rx_opt.wscale_ok) {
+		newtp->rx_opt.snd_wscale = ireq->snd_wscale;
+		newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
+	} else {
+		newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
+		newtp->window_clamp = min(newtp->window_clamp, 65535U);	/* without window scaling the clamp is capped at 65535 */
+	}
+	newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale;
+	newtp->max_window = newtp->snd_wnd;
+
+	if (newtp->rx_opt.tstamp_ok) {
+		newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent);
+		newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
+		newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+	} else {
+		newtp->rx_opt.ts_recent_stamp = 0;
+		newtp->tcp_header_len = sizeof(struct tcphdr);
+	}
+	if (req->num_timeout) {	/* only when the request timed out at least once */
+		newtp->undo_marker = treq->snt_isn;
+		newtp->retrans_stamp = div_u64(treq->snt_synack,
+					       USEC_PER_SEC / TCP_TS_HZ);
+	}
+	newtp->tsoffset = treq->ts_off;
+#ifdef CONFIG_TCP_MD5SIG
+	newtp->md5sig_info = NULL;	/*XXX*/
+#endif
+	if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
+		newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
+	newtp->rx_opt.mss_clamp = req->mss;
+	tcp_ecn_openreq_child(newtp, req);
+	newtp->fastopen_req = NULL;
+	RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
+
+	newtp->bpf_chg_cc_inprogress = 0;
+	tcp_bpf_clone(sk, newsk);
+
+	__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);	/* counted against the netns of the parent sk */
+
+	return newsk;
+}
+EXPORT_SYMBOL(tcp_create_openreq_child);
+
+/*
+ * Process an incoming packet for SYN_RECV sockets represented as a
+ * request_sock. Normally sk is the listener socket but for TFO it
+ * points to the child socket.
+ *
+ * XXX (TFO) - The current impl contains a special check for ack
+ * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
+ *
+ * We don't need to initialize tmp_opt.sack_ok as we don't use the results
+ *
+ * Note: If @fastopen is true, this can be called from process context.
+ *       Otherwise, this is from BH context.
+ */
+
+struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
+			   struct request_sock *req,
+			   bool fastopen, bool *req_stolen)
+{
+	struct tcp_options_received tmp_opt;
+	struct sock *child;
+	const struct tcphdr *th = tcp_hdr(skb);
+	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+	bool paws_reject = false;
+	bool own_req;
+
+	tmp_opt.saw_tstamp = 0;
+	if (th->doff > (sizeof(struct tcphdr)>>2)) {
+		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
+
+		if (tmp_opt.saw_tstamp) {
+			tmp_opt.ts_recent = READ_ONCE(req->ts_recent);
+			if (tmp_opt.rcv_tsecr)
+				tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
+			/* We do not store the true stamp, but it is not
+			 * required: it can be estimated (approximately)
+			 * from other data.
+			 */
+			tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ;
+			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
+		}
+	}
+
+	/* Check for pure retransmitted SYN. */
+	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
+	    flg == TCP_FLAG_SYN &&
+	    !paws_reject) {
+		/*
+		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
+		 * this case on figure 6 and figure 8, but formal
+		 * protocol description says NOTHING.
+		 * To be more exact, it says that we should send ACK,
+		 * because this segment (at least, if it has no data)
+		 * is out of window.
+		 *
+		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
+		 *  describe SYN-RECV state. All the description
+		 *  is wrong, we cannot believe it and should
+		 *  rely only on common sense and implementation
+		 *  experience.
+		 *
+		 * Enforce "SYN-ACK" according to figure 8, figure 6
+		 * of RFC793, fixed by RFC1122.
+		 *
+		 * Note that even if there is new data in the SYN packet
+		 * they will be thrown away too.
+		 *
+		 * Reset timer after retransmitting SYNACK, similar to
+		 * the idea of fast retransmit in recovery.
+		 */
+		if (!tcp_oow_rate_limited(sock_net(sk), skb,
+					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
+					  &tcp_rsk(req)->last_oow_ack_time) &&
+
+		    !inet_rtx_syn_ack(sk, req)) {
+			unsigned long expires = jiffies;
+
+			expires += reqsk_timeout(req, TCP_RTO_MAX);
+			if (!fastopen)
+				mod_timer_pending(&req->rsk_timer, expires);
+			else
+				req->rsk_timer.expires = expires;
+		}
+		return NULL;
+	}
+
+	/* Further reproduces section "SEGMENT ARRIVES"
+	   for state SYN-RECEIVED of RFC793.
+	   It is broken, however, it does not work only
+	   when SYNs are crossed.
+
+	   You would think that SYN crossing is impossible here, since
+	   we should have a SYN_SENT socket (from connect()) on our end,
+	   but this is not true if the crossed SYNs were sent to both
+	   ends by a malicious third party.  We must defend against this,
+	   and to do that we first verify the ACK (as per RFC793, page
+	   36) and reset if it is invalid.  Is this a true full defense?
+	   To convince ourselves, let us consider a way in which the ACK
+	   test can still pass in this 'malicious crossed SYNs' case.
+	   Malicious sender sends identical SYNs (and thus identical sequence
+	   numbers) to both A and B:
+
+		A: gets SYN, seq=7
+		B: gets SYN, seq=7
+
+	   By our good fortune, both A and B select the same initial
+	   send sequence number of seven :-)
+
+		A: sends SYN|ACK, seq=7, ack_seq=8
+		B: sends SYN|ACK, seq=7, ack_seq=8
+
+	   So we are now A eating this SYN|ACK, ACK test passes.  So
+	   does sequence test, SYN is truncated, and thus we consider
+	   it a bare ACK.
+
+	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
+	   bare ACK.  Otherwise, we create an established connection.  Both
+	   ends (listening sockets) accept the new incoming connection and try
+	   to talk to each other. 8-)
+
+	   Note: This case is both harmless, and rare.  Possibility is about the
+	   same as us discovering intelligent life on another planet tomorrow.
+
+	   But generally, we should (RFC lies!) accept ACK
+	   from SYNACK both here and in tcp_rcv_state_process().
+	   tcp_rcv_state_process() does not, hence, neither do we.
+
+	   Note that the case is absolutely generic:
+	   we cannot optimize anything here without
+	   violating protocol. All the checks must be made
+	   before attempt to create socket.
+	 */
+
+	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
+	 *                  and the incoming segment acknowledges something not yet
+	 *                  sent (the segment carries an unacceptable ACK) ...
+	 *                  a reset is sent."
+	 *
+	 * Invalid ACK: reset will be sent by listening socket.
+	 * Note that the ACK validity check for a Fast Open socket is done
+	 * elsewhere and is checked directly against the child socket rather
+	 * than req because user data may have been sent out.
+	 */
+	if ((flg & TCP_FLAG_ACK) && !fastopen &&
+	    (TCP_SKB_CB(skb)->ack_seq !=
+	     tcp_rsk(req)->snt_isn + 1))
+		return sk;
+
+	/* Also, it would not be a bad idea to check rcv_tsecr, which
+	 * is essentially ACK extension and too early or too late values
+	 * should cause reset in unsynchronized states.
+	 */
+
+	/* RFC793: "first check sequence number". */
+
+	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+					  tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
+		/* Out of window: send ACK and drop. */
+		if (!(flg & TCP_FLAG_RST) &&
+		    !tcp_oow_rate_limited(sock_net(sk), skb,
+					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
+					  &tcp_rsk(req)->last_oow_ack_time))
+			req->rsk_ops->send_ack(sk, skb, req);
+		if (paws_reject)
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+		return NULL;
+	}
+
+	/* In sequence, PAWS is OK. */
+
+	/* TODO: We probably should defer ts_recent change once
+	 * we take ownership of @req.
+	 */
+	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
+		WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval);
+
+	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
+		/* Truncate SYN, it is out of window starting
+		   at tcp_rsk(req)->rcv_isn + 1. */
+		flg &= ~TCP_FLAG_SYN;
+	}
+
+	/* RFC793: "second check the RST bit" and
+	 *	   "fourth, check the SYN bit"
+	 */
+	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
+		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+		goto embryonic_reset;
+	}
+
+	/* ACK sequence verified above, just make sure ACK is
+	 * set.  If ACK not set, just silently drop the packet.
+	 *
+	 * XXX (TFO) - if we ever allow "data after SYN", the
+	 * following check needs to be removed.
+	 */
+	if (!(flg & TCP_FLAG_ACK))
+		return NULL;
+
+	/* For Fast Open no more processing is needed (sk is the
+	 * child socket).
+	 */
+	if (fastopen)
+		return sk;
+
+	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
+	if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
+	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+		inet_rsk(req)->acked = 1;
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
+		return NULL;
+	}
+
+	/* OK, ACK is valid, create big socket and
+	 * feed this segment to it. It will repeat all
+	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
+	 * ESTABLISHED STATE. If it is dropped after the
+	 * socket is created, expect trouble.
+	 */
+	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+							 req, &own_req);
+	if (!child)
+		goto listen_overflow;
+
+	if (own_req && rsk_drop_req(req)) {
+		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+		inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
+		return child;
+	}
+
+	sock_rps_save_rxhash(child, skb);
+	tcp_synack_rtt_meas(child, req);
+	*req_stolen = !own_req;
+	return inet_csk_complete_hashdance(sk, child, req, own_req);
+
+listen_overflow:
+	if (sk != req->rsk_listener)
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
+
+	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) {
+		inet_rsk(req)->acked = 1;
+		return NULL;
+	}
+
+embryonic_reset:
+	if (!(flg & TCP_FLAG_RST)) {
+		/* Received a bad SYN pkt - for TFO we try not to reset
+		 * the local connection unless it's really necessary to
+		 * avoid becoming vulnerable to outside attack aiming at
+		 * resetting legit local connections.
+		 */
+		req->rsk_ops->send_reset(sk, skb);
+	} else if (fastopen) { /* received a valid RST pkt */
+		reqsk_fastopen_remove(sk, req, true);
+		tcp_reset(sk, skb);
+	}
+	if (!fastopen) {
+		bool unlinked = inet_csk_reqsk_queue_drop(sk, req);
+
+		if (unlinked)
+			__NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+		*req_stolen = !unlinked;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(tcp_check_req);
+
+/*
+ * Queue segment on the new socket if the new socket is active,
+ * otherwise we just shortcircuit this and continue with
+ * the new socket.
+ *
+ * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
+ * when entering. But other states are possible due to a race condition
+ * where after __inet_lookup_established() fails but before the listener
+ * lock is obtained, other packets cause the same connection to
+ * be created.
+ */
+
+int tcp_child_process(struct sock *parent, struct sock *child,
+		      struct sk_buff *skb)
+	__releases(&((child)->sk_lock.slock))
+{
+	const int old_state = child->sk_state;
+	int rc = 0;
+
+	/* Remember sk_napi_id and sk_rx_queue_mapping on the child. */
+	sk_mark_napi_id_set(child, skb);
+
+	tcp_segs_in(tcp_sk(child), skb);
+	if (sock_owned_by_user(child)) {
+		/* The child is owned by user context. The lookup went
+		 * through the main socket hash table, so the listener lock
+		 * does not protect us here: park the segment on the backlog.
+		 */
+		__sk_add_backlog(child, skb);
+	} else {
+		rc = tcp_rcv_state_process(child, skb);
+		/* Child left SYN_RECV: wake up the parent, send SIGIO */
+		if (old_state == TCP_SYN_RECV && child->sk_state != old_state)
+			parent->sk_data_ready(parent);
+	}
+
+	bh_unlock_sock(child);
+	sock_put(child);
+	return rc;
+}
+EXPORT_SYMBOL(tcp_child_process);
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
new file mode 100644
index 0000000000..a60662f4bd
--- /dev/null
+++ b/net/ipv4/tcp_nv.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * TCP NV: TCP with Congestion Avoidance
+ *
+ * TCP-NV is a successor of TCP-Vegas that has been developed to
+ * deal with the issues that occur in modern networks.
+ * Like TCP-Vegas, TCP-NV supports true congestion avoidance,
+ * the ability to detect congestion before packet losses occur.
+ * When congestion (queue buildup) starts to occur, TCP-NV
+ * predicts what the cwnd size should be for the current
+ * throughput and it reduces the cwnd proportionally to
+ * the difference between the current cwnd and the predicted cwnd.
+ *
+ * NV is only recommended for traffic within a data center, and when
+ * all the flows are NV (at least those within the data center). This
+ * is due to the inherent unfairness between flows using losses to
+ * detect congestion (congestion control) and those that use queue
+ * buildup to detect congestion (congestion avoidance).
+ *
+ * Note: High NIC coalescence values may lower the performance of NV
+ * due to the increased noise in RTT values. In particular, we have
+ * seen issues with rx-frames values greater than 8.
+ *
+ * TODO:
+ * 1) Add mechanism to deal with reverse congestion.
+ */
+
+#include <linux/module.h>
+#include <linux/math64.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+
+/* TCP NV parameters
+ *
+ * nv_pad		Max number of queued packets allowed in network
+ * nv_pad_buffer	Do not grow cwnd if this close to nv_pad
+ * nv_reset_period	How often (in seconds) to reset min_rtt
+ * nv_min_cwnd		Don't decrease cwnd below this if there are no losses
+ * nv_cong_dec_mult	Decrease cwnd by X% (30%) of congestion when detected
+ * nv_ssthresh_factor	On congestion set ssthresh to this * <desired cwnd> / 8
+ * nv_rtt_factor	RTT averaging factor
+ * nv_loss_dec_factor	Decrease cwnd to this (80%) when losses occur
+ * nv_dec_eval_min_calls	Wait this many RTT measurements before dec cwnd
+ * nv_inc_eval_min_calls	Wait this many RTT measurements before inc cwnd
+ * nv_ssthresh_eval_min_calls	Wait this many RTT measurements before stopping
+ *				slow-start due to congestion
+ * nv_stop_rtt_cnt	Only grow cwnd for this many RTTs after non-congestion
+ * nv_rtt_min_cnt	Wait this many RTTs before making a congestion decision
+ * nv_cwnd_growth_rate_neg
+ * nv_cwnd_growth_rate_pos
+ *	How quickly to double growth rate (not rate) of cwnd when not
+ *	congested. One value (nv_cwnd_growth_rate_neg) for when
+ *	rate < 1 pkt/RTT (after losses). The other (nv_cwnd_growth_rate_pos)
+ *	otherwise.
+ */
+
+static int nv_pad __read_mostly = 10;
+static int nv_pad_buffer __read_mostly = 2;
+static int nv_reset_period __read_mostly = 5; /* in seconds */
+static int nv_min_cwnd __read_mostly = 2;
+static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */
+static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */
+static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */
+static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */
+static int nv_cwnd_growth_rate_neg __read_mostly = 8;
+static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */
+static int nv_dec_eval_min_calls __read_mostly = 60;
+static int nv_inc_eval_min_calls __read_mostly = 20;
+static int nv_ssthresh_eval_min_calls __read_mostly = 30;
+static int nv_stop_rtt_cnt __read_mostly = 10;
+static int nv_rtt_min_cnt __read_mostly = 2;
+
+module_param(nv_pad, int, 0644);
+MODULE_PARM_DESC(nv_pad, "max queued packets allowed in network");
+module_param(nv_reset_period, int, 0644);
+MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)");
+module_param(nv_min_cwnd, int, 0644);
+MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value"
+		 " without losses");
+
+/* TCP NV per-socket congestion control state (see inet_csk_ca() users) */
+struct tcpnv {
+	unsigned long nv_min_rtt_reset_jiffies;  /* when to switch to
+						  * nv_min_rtt_new */
+	s8  cwnd_growth_factor;	/* Current cwnd growth factor,
+				 * < 0 => less than 1 packet/RTT */
+	u8  available8;
+	u16 available16;
+	u8  nv_allow_cwnd_growth:1, /* whether cwnd can grow */
+		nv_reset:1,	    /* whether to reset values */
+		nv_catchup:1;	    /* whether we are growing because
+				     * of temporary cwnd decrease */
+	u8  nv_eval_call_cnt;	/* call count since last eval */
+	u8  nv_min_cwnd;	/* nv won't make a ca decision if cwnd is
+				 * smaller than this. It may grow to handle
+				 * TSO, LRO and interrupt coalescence because
+				 * with these a small cwnd cannot saturate
+				 * the link. Note that this is different from
+				 * the file local nv_min_cwnd */
+	u8  nv_rtt_cnt;		/* RTTs without making ca decision */
+	u32 nv_last_rtt;	/* last rtt */
+	u32 nv_min_rtt;		/* active min rtt. Used to determine slope */
+	u32 nv_min_rtt_new;	/* min rtt for future use */
+	u32 nv_base_rtt;        /* If non-zero it represents the threshold for
+				 * congestion */
+	u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is
+				 * set to 80% of nv_base_rtt. It helps reduce
+				 * unfairness between flows */
+	u32 nv_rtt_max_rate;	/* max rate seen during current RTT */
+	u32 nv_rtt_start_seq;	/* current RTT ends when packet arrives
+				 * acking beyond nv_rtt_start_seq */
+	u32 nv_last_snd_una;	/* Previous value of tp->snd_una. It is
+				 * used to determine bytes acked since last
+				 * call to tcpnv_acked */
+	u32 nv_no_cong_cnt;	/* Consecutive no congestion decisions */
+};
+
+#define NV_INIT_RTT	  U32_MAX	/* initial value of the min RTT trackers */
+#define NV_MIN_CWND	  4	/* initial value and floor of ca->nv_min_cwnd */
+#define NV_MIN_CWND_GROW  2	/* step by which ca->nv_min_cwnd is raised */
+#define NV_TSO_CWND_BOUND 80	/* ca->nv_min_cwnd is capped at this + 1 */
+
+static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
+{
+	const u32 snd_una = tcp_sk(sk)->snd_una;
+
+	ca->nv_rtt_start_seq = snd_una;	/* RTT tracking restarts here */
+	ca->nv_last_snd_una = snd_una;
+	ca->nv_last_rtt = 0;
+	ca->nv_rtt_max_rate = 0;
+	ca->nv_rtt_cnt = 0;
+	ca->nv_eval_call_cnt = 0;
+	ca->nv_no_cong_cnt = 0;
+	ca->nv_reset = 0;
+}
+
+static void tcpnv_init(struct sock *sk)
+{
+	struct tcpnv *ca = inet_csk_ca(sk);
+	int bpf_rtt;
+
+	tcpnv_reset(ca, sk);
+
+	/* A socket_ops bpf program may supply a base RTT. This is
+	 * intended for environments such as intra-datacenter traffic,
+	 * where reasonable RTT estimates are known in advance. Without
+	 * a positive value, both RTT bounds are set to zero.
+	 */
+	bpf_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL);
+	if (bpf_rtt <= 0) {
+		ca->nv_base_rtt = 0;
+		ca->nv_lower_bound_rtt = 0;
+	} else {
+		ca->nv_base_rtt = bpf_rtt;
+		ca->nv_lower_bound_rtt = (bpf_rtt * 205) >> 8; /* 80% */
+	}
+
+	ca->nv_min_rtt = NV_INIT_RTT;
+	ca->nv_min_rtt_new = NV_INIT_RTT;
+	ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ;
+	ca->nv_min_cwnd = NV_MIN_CWND;
+	ca->cwnd_growth_factor = 0;
+	ca->nv_allow_cwnd_growth = 1;
+	ca->nv_catchup = 0;
+}
+
+/* Clamp @val to [nv_lower_bound_rtt, nv_base_rtt]. A bound that is zero
+ * is treated as "not provided" and is not applied. File-local helper.
+ */
+static inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val)
+{
+	if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt)
+		return ca->nv_lower_bound_rtt;
+	else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt)
+		return ca->nv_base_rtt;
+	else
+		return val;
+}
+
+static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct tcpnv *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 cnt;
+
+	/* Nothing to do unless the flow is cwnd limited and NV has not
+	 * detected congestion (growth is still allowed).
+	 */
+	if (!tcp_is_cwnd_limited(sk) || !ca->nv_allow_cwnd_growth)
+		return;
+
+	if (tcp_in_slow_start(tp)) {
+		acked = tcp_slow_start(tp, acked);
+		if (!acked)
+			return;
+	}
+
+	/* A negative growth factor scales cnt up, a non-negative one
+	 * scales it down, but never below 4.
+	 */
+	if (ca->cwnd_growth_factor < 0)
+		cnt = tcp_snd_cwnd(tp) << -ca->cwnd_growth_factor;
+	else
+		cnt = max(4U, tcp_snd_cwnd(tp) >> ca->cwnd_growth_factor);
+	tcp_cong_avoid_ai(tp, cnt, acked);
+}
+
+static u32 tcpnv_recalc_ssthresh(struct sock *sk)
+{
+	u32 scaled = (tcp_snd_cwnd(tcp_sk(sk)) * nv_loss_dec_factor) >> 10;
+
+	return max(scaled, 2U);	/* never report less than 2 */
+}
+
+static void tcpnv_state(struct sock *sk, u8 new_state)
+{
+	struct tcpnv *ca = inet_csk_ca(sk);
+
+	if (new_state == TCP_CA_Open && ca->nv_reset) {
+		tcpnv_reset(ca, sk);
+		return;
+	}
+	if (new_state != TCP_CA_Loss && new_state != TCP_CA_CWR &&
+	    new_state != TCP_CA_Recovery)
+		return;
+	ca->nv_reset = 1;
+	ca->nv_allow_cwnd_growth = 0;
+	if (new_state != TCP_CA_Loss)
+		return;
+	/* On loss: cap growth factor at Reno's 0, then decrease if allowed */
+	if (ca->cwnd_growth_factor > 0)
+		ca->cwnd_growth_factor = 0;
+	if (nv_cwnd_growth_rate_neg > 0 && ca->cwnd_growth_factor > -8)
+		ca->cwnd_growth_factor--;
+}
+
+/* Do congestion avoidance calculations for TCP-NV from an ACK sample
+ * (uses sample->rtt_us and sample->in_flight) */
+static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcpnv *ca = inet_csk_ca(sk);
+	unsigned long now = jiffies;
+	u64 rate64;
+	u32 rate, max_win, cwnd_by_slope;
+	u32 avg_rtt;
+	u32 bytes_acked = 0;
+
+	/* Some calls are for duplicates without timestamps */
+	if (sample->rtt_us < 0)
+		return;
+
+	/* If not in TCP_CA_Open or TCP_CA_Disorder states, skip. */
+	if (icsk->icsk_ca_state != TCP_CA_Open &&
+	    icsk->icsk_ca_state != TCP_CA_Disorder)
+		return;
+
+	/* Stop cwnd growth if we were in catch up mode */
+	if (ca->nv_catchup && tcp_snd_cwnd(tp) >= nv_min_cwnd) {
+		ca->nv_catchup = 0;
+		ca->nv_allow_cwnd_growth = 0;
+	}
+
+	bytes_acked = tp->snd_una - ca->nv_last_snd_una;
+	ca->nv_last_snd_una = tp->snd_una;
+
+	if (sample->in_flight == 0)
+		return;
+
+	/* Calculate moving average of RTT */
+	if (nv_rtt_factor > 0) {
+		if (ca->nv_last_rtt > 0) {
+			avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor +
+				   ((u64)ca->nv_last_rtt)
+				   * (256 - nv_rtt_factor)) >> 8;
+		} else {
+			avg_rtt = sample->rtt_us;
+			ca->nv_min_rtt = avg_rtt << 1;
+		}
+		ca->nv_last_rtt = avg_rtt;
+	} else {
+		avg_rtt = sample->rtt_us;
+	}
+
+	/* rate in 100's bits per second */
+	rate64 = ((u64)sample->in_flight) * 80000;
+	do_div(rate64, avg_rtt ?: 1);
+	rate = (u32)rate64;
+
+	/* Remember the maximum rate seen during this RTT
+	 * Note: It may be more than one RTT. This function should be
+	 *       called at least nv_dec_eval_min_calls times.
+	 */
+	if (ca->nv_rtt_max_rate < rate)
+		ca->nv_rtt_max_rate = rate;
+
+	/* We have valid information, increment counter */
+	if (ca->nv_eval_call_cnt < 255)
+		ca->nv_eval_call_cnt++;
+
+	/* Apply bounds to rtt. Only used to update min_rtt */
+	avg_rtt = nv_get_bounded_rtt(ca, avg_rtt);
+
+	/* update min rtt if necessary */
+	if (avg_rtt < ca->nv_min_rtt)
+		ca->nv_min_rtt = avg_rtt;
+
+	/* update future min_rtt if necessary */
+	if (avg_rtt < ca->nv_min_rtt_new)
+		ca->nv_min_rtt_new = avg_rtt;
+
+	/* nv_min_rtt is updated with the minimum (possibly averaged) rtt
+	 * seen in the last nv_reset_period seconds (i.e. a
+	 * warm reset). This new nv_min_rtt will continue to be updated
+	 * and be used for another nv_reset_period seconds,
+	 * when it will be updated again.
+	 * In practice we introduce some randomness, so the actual period used
+	 * is chosen randomly from the range:
+	 *   [nv_reset_period*3/4, nv_reset_period*5/4)
+	 */
+	if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) {
+		unsigned char rand;
+
+		ca->nv_min_rtt = ca->nv_min_rtt_new;
+		ca->nv_min_rtt_new = NV_INIT_RTT;
+		get_random_bytes(&rand, 1);
+		ca->nv_min_rtt_reset_jiffies =
+			now + ((nv_reset_period * (384 + rand) * HZ) >> 9);
+		/* Every so often we decrease ca->nv_min_cwnd in case previous
+		 *  value is no longer accurate.
+		 */
+		ca->nv_min_cwnd = max(ca->nv_min_cwnd / 2, NV_MIN_CWND);
+	}
+
+	/* Once per RTT check if we need to do congestion avoidance */
+	if (before(ca->nv_rtt_start_seq, tp->snd_una)) {
+		ca->nv_rtt_start_seq = tp->snd_nxt;
+		if (ca->nv_rtt_cnt < 0xff)
+			/* Increase counter for RTTs without CA decision */
+			ca->nv_rtt_cnt++;
+
+		/* If this function is only called once within an RTT
+		 * the cwnd is probably too small (in some cases due to
+		 * tso, lro or interrupt coalescence), so we increase
+		 * ca->nv_min_cwnd.
+		 */
+		if (ca->nv_eval_call_cnt == 1 &&
+		    bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache &&
+		    ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)) {
+			ca->nv_min_cwnd = min(ca->nv_min_cwnd
+					      + NV_MIN_CWND_GROW,
+					      NV_TSO_CWND_BOUND + 1);
+			ca->nv_rtt_start_seq = tp->snd_nxt +
+				ca->nv_min_cwnd * tp->mss_cache;
+			ca->nv_eval_call_cnt = 0;
+			ca->nv_allow_cwnd_growth = 1;
+			return;
+		}
+
+		/* Find the ideal cwnd for current rate from slope
+		 * slope = 80000.0 * mss / nv_min_rtt
+		 * cwnd_by_slope = nv_rtt_max_rate / slope
+		 */
+		cwnd_by_slope = (u32)
+			div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt,
+				  80000ULL * tp->mss_cache);
+		max_win = cwnd_by_slope + nv_pad;
+
+		/* If cwnd > max_win, decrease cwnd
+		 * if cwnd < max_win, grow cwnd
+		 * else leave the same
+		 */
+		if (tcp_snd_cwnd(tp) > max_win) {
+			/* there is congestion, check that it is ok
+			 * to make a CA decision
+			 * 1. We should have at least nv_dec_eval_min_calls
+			 *    data points before making a CA  decision
+			 * 2. We only make a congestion decision after
+			 *    nv_rtt_min_cnt RTTs
+			 */
+			if (ca->nv_rtt_cnt < nv_rtt_min_cnt) {
+				return;
+			} else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
+				if (ca->nv_eval_call_cnt <
+				    nv_ssthresh_eval_min_calls)
+					return;
+				/* otherwise we will decrease cwnd */
+			} else if (ca->nv_eval_call_cnt <
+				   nv_dec_eval_min_calls) {
+				if (ca->nv_allow_cwnd_growth &&
+				    ca->nv_rtt_cnt > nv_stop_rtt_cnt)
+					ca->nv_allow_cwnd_growth = 0;
+				return;
+			}
+
+			/* We have enough data to determine we are congested */
+			ca->nv_allow_cwnd_growth = 0;
+			tp->snd_ssthresh =
+				(nv_ssthresh_factor * max_win) >> 3;
+			if (tcp_snd_cwnd(tp) - max_win > 2) {
+				/* gap > 2, we do exponential cwnd decrease */
+				int dec;
+
+				dec = max(2U, ((tcp_snd_cwnd(tp) - max_win) *
+					       nv_cong_dec_mult) >> 7);
+				tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - dec);
+			} else if (nv_cong_dec_mult > 0) {
+				tcp_snd_cwnd_set(tp, max_win);
+			}
+			if (ca->cwnd_growth_factor > 0)
+				ca->cwnd_growth_factor = 0;
+			ca->nv_no_cong_cnt = 0;
+		} else if (tcp_snd_cwnd(tp) <= max_win - nv_pad_buffer) {
+			/* There is no congestion, grow cwnd if allowed */
+			if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls)
+				return;
+
+			ca->nv_allow_cwnd_growth = 1;
+			ca->nv_no_cong_cnt++;
+			if (ca->cwnd_growth_factor < 0 &&
+			    nv_cwnd_growth_rate_neg > 0 &&
+			    ca->nv_no_cong_cnt > nv_cwnd_growth_rate_neg) {
+				ca->cwnd_growth_factor++;
+				ca->nv_no_cong_cnt = 0;
+			} else if (ca->cwnd_growth_factor >= 0 &&
+				   nv_cwnd_growth_rate_pos > 0 &&
+				   ca->nv_no_cong_cnt >
+				   nv_cwnd_growth_rate_pos) {
+				ca->cwnd_growth_factor++;
+				ca->nv_no_cong_cnt = 0;
+			}
+		} else {
+			/* cwnd is in-between, so do nothing */
+			return;
+		}
+
+		/* update state */
+		ca->nv_eval_call_cnt = 0;
+		ca->nv_rtt_cnt = 0;
+		ca->nv_rtt_max_rate = 0;
+
+		/* Don't want to make cwnd < nv_min_cwnd
+		 * (it wasn't before; if it is now, it is because nv
+		 *  decreased it).
+		 */
+		if (tcp_snd_cwnd(tp) < nv_min_cwnd)
+			tcp_snd_cwnd_set(tp, nv_min_cwnd);
+	}
+}
+
+/* Extract info for TCP socket info provided via netlink (Vegas format) */
+static size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr,
+			     union tcp_cc_info *info)
+{
+	const struct tcpnv *ca = inet_csk_ca(sk);
+
+	/* Nothing to report unless the Vegas info extension was requested */
+	if (!(ext & (1 << (INET_DIAG_VEGASINFO - 1))))
+		return 0;
+
+	*attr = INET_DIAG_VEGASINFO;
+	info->vegas.tcpv_minrtt = ca->nv_min_rtt;
+	info->vegas.tcpv_rtt = ca->nv_last_rtt;
+	info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt;
+	info->vegas.tcpv_enabled = 1;
+	return sizeof(struct tcpvegas_info);
+}
+
+static struct tcp_congestion_ops tcpnv __read_mostly = {
+	.name		= "nv",
+	.owner		= THIS_MODULE,
+
+	.init		= tcpnv_init,
+	.set_state	= tcpnv_state,
+	.pkts_acked	= tcpnv_acked,
+	.cong_avoid	= tcpnv_cong_avoid,
+	.ssthresh	= tcpnv_recalc_ssthresh,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+	.get_info	= tcpnv_get_info,
+};
+
+static int __init tcpnv_register(void)
+{
+	/* Build-time check: struct tcpnv must fit in ICSK_CA_PRIV_SIZE */
+	BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcpnv);
+}
+
+static void __exit tcpnv_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcpnv); /* undo tcpnv_register() */
+}
+
+module_init(tcpnv_register);	/* registers the "nv" ops table */
+module_exit(tcpnv_unregister);	/* unregisters it again */
+
+MODULE_AUTHOR("Lawrence Brakmo");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP NV");
+MODULE_VERSION("1.0");
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
new file mode 100644
index 0000000000..8311c38267
--- /dev/null
+++ b/net/ipv4/tcp_offload.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *	IPV4 GSO/GRO offload support
+ *	Linux INET implementation
+ *
+ *	TCPv4 GSO/GRO support
+ */
+
+#include <linux/indirect_call_wrapper.h>
+#include <linux/skbuff.h>
+#include <net/gro.h>
+#include <net/gso.h>
+#include <net/tcp.h>
+#include <net/protocol.h>
+
+/* Walk the segment list and tag the first segment for which ts_seq is before
+ * seq + mss with SKBTX_SW_TSTAMP and tskey = ts_seq; stop after tagging one.
+ */
+static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
+			   unsigned int seq, unsigned int mss)
+{
+	for (; skb; skb = skb->next, seq += mss) {
+		if (!before(ts_seq, seq + mss))
+			continue;
+		skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP;
+		skb_shinfo(skb)->tskey = ts_seq;
+		return;
+	}
+}
+
+/* IPv4 front end for TCP GSO: validate the skb, seed the TCP checksum if the
+ * stack has not already done so, then defer to tcp_gso_segment().
+ */
+static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
+					netdev_features_t features)
+{
+	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) ||
+	    !pskb_may_pull(skb, sizeof(struct tcphdr)))
+		return ERR_PTR(-EINVAL);
+
+	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+		struct tcphdr *tcph = tcp_hdr(skb);
+		const struct iphdr *ip4h = ip_hdr(skb);
+
+		/* Set up checksum pseudo header, usually expect stack to
+		 * have done this already.
+		 */
+		tcph->check = 0;
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		__tcp_v4_send_check(skb, ip4h->saddr, ip4h->daddr);
+	}
+
+	return tcp_gso_segment(skb, features);
+}
+
+struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
+				netdev_features_t features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);	/* returned unchanged by the early "goto out" paths */
+	unsigned int sum_truesize = 0;
+	struct tcphdr *th;
+	unsigned int thlen;
+	unsigned int seq;
+	unsigned int oldlen;
+	unsigned int mss;
+	struct sk_buff *gso_skb = skb;	/* original skb; "skb" is reused below to walk the segments */
+	__sum16 newcheck;
+	bool ooo_okay, copy_destructor;
+	__wsum delta;
+
+	th = tcp_hdr(skb);
+	thlen = th->doff * 4;	/* header length in bytes */
+	if (thlen < sizeof(*th))
+		goto out;
+
+	if (!pskb_may_pull(skb, thlen))
+		goto out;
+
+	oldlen = ~skb->len;	/* complement of the pre-pull length, used in both checksum deltas */
+	__skb_pull(skb, thlen);
+
+	mss = skb_shinfo(skb)->gso_size;
+	if (unlikely(skb->len <= mss))	/* remaining length fits in one mss */
+		goto out;
+
+	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+		/* Packet is from an untrusted source, reset gso_segs. */
+
+		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+		segs = NULL;	/* NULL, not an error pointer, is returned on this path */
+		goto out;
+	}
+
+	copy_destructor = gso_skb->destructor == tcp_wfree;
+	ooo_okay = gso_skb->ooo_okay;
+	/* All segments but the first should have ooo_okay cleared */
+	skb->ooo_okay = 0;
+
+	segs = skb_segment(skb, features);
+	if (IS_ERR(segs))
+		goto out;
+
+	/* Only first segment might have ooo_okay set */
+	segs->ooo_okay = ooo_okay;
+
+	/* GSO partial and frag_list segmentation only requires splitting
+	 * the frame into an MSS multiple and possibly a remainder, both
+	 * cases return a GSO skb. So update the mss now.
+	 */
+	if (skb_is_gso(segs))
+		mss *= skb_shinfo(segs)->gso_segs;
+
+	delta = (__force __wsum)htonl(oldlen + thlen + mss);
+
+	skb = segs;
+	th = tcp_hdr(skb);
+	seq = ntohl(th->seq);
+
+	if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
+		tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
+
+	newcheck = ~csum_fold(csum_add(csum_unfold(th->check), delta));
+
+	while (skb->next) {	/* every segment except the last one */
+		th->fin = th->psh = 0;
+		th->check = newcheck;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL)
+			gso_reset_checksum(skb, ~th->check);
+		else
+			th->check = gso_make_checksum(skb, ~th->check);
+
+		seq += mss;
+		if (copy_destructor) {
+			skb->destructor = gso_skb->destructor;
+			skb->sk = gso_skb->sk;
+			sum_truesize += skb->truesize;
+		}
+		skb = skb->next;
+		th = tcp_hdr(skb);
+
+		th->seq = htonl(seq);
+		th->cwr = 0;	/* cwr is cleared on every segment after the first */
+	}
+
+	/* Following permits TCP Small Queues to work well with GSO :
+	 * The callback to TCP stack will be called at the time last frag
+	 * is freed at TX completion, and not right now when gso_skb
+	 * is freed by GSO engine
+	 */
+	if (copy_destructor) {
+		int delta;	/* NOTE: shadows the function-scope __wsum delta inside this block */
+
+		swap(gso_skb->sk, skb->sk);
+		swap(gso_skb->destructor, skb->destructor);
+		sum_truesize += skb->truesize;
+		delta = sum_truesize - gso_skb->truesize;
+		/* In some pathological cases, delta can be negative.
+		 * We need to either use refcount_add() or refcount_sub_and_test()
+		 */
+		if (likely(delta >= 0))
+			refcount_add(delta, &skb->sk->sk_wmem_alloc);
+		else
+			WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc));
+	}
+
+	delta = (__force __wsum)htonl(oldlen +	/* last segment: its own length replaces mss */
+				      (skb_tail_pointer(skb) -
+				       skb_transport_header(skb)) +
+				      skb->data_len);
+	th->check = ~csum_fold(csum_add(csum_unfold(th->check), delta));
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		gso_reset_checksum(skb, ~th->check);
+	else
+		th->check = gso_make_checksum(skb, ~th->check);
+out:
+	return segs;
+}
+
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
+{
+	struct sk_buff *pp = NULL;	/* return value: non-NULL only when p is set and a flush is wanted */
+	struct sk_buff *p;
+	struct tcphdr *th;
+	struct tcphdr *th2;
+	unsigned int len;
+	unsigned int thlen;
+	__be32 flags;
+	unsigned int mss = 1;
+	unsigned int hlen;
+	unsigned int off;
+	int flush = 1;	/* value used by the early "goto out" paths */
+	int i;
+
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*th);
+	th = skb_gro_header(skb, hlen, off);
+	if (unlikely(!th))
+		goto out;
+
+	thlen = th->doff * 4;	/* header length in bytes */
+	if (thlen < sizeof(*th))
+		goto out;
+
+	hlen = off + thlen;
+	if (skb_gro_header_hard(skb, hlen)) {
+		th = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!th))
+			goto out;
+	}
+
+	skb_gro_pull(skb, thlen);
+
+	len = skb_gro_len(skb);
+	flags = tcp_flag_word(th);
+
+	list_for_each_entry(p, head, list) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		th2 = tcp_hdr(p);
+
+		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {	/* 32-bit compare starting at the source port */
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		goto found;
+	}
+	p = NULL;	/* no held packet matched */
+	goto out_check_final;
+
+found:
+	/* Include the IP ID check below from the inner most IP hdr */
+	flush = NAPI_GRO_CB(p)->flush;
+	flush |= (__force int)(flags & TCP_FLAG_CWR);
+	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
+	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
+	for (i = sizeof(*th); i < thlen; i += 4)	/* any differing option word forces a flush */
+		flush |= *(u32 *)((u8 *)th + i) ^
+			 *(u32 *)((u8 *)th2 + i);
+
+	/* When we receive our second frame we can make a decision on whether
+	 * we continue this flow as an atomic flow with a fixed ID or use
+	 * an incrementing ID.
+	 */
+	if (NAPI_GRO_CB(p)->flush_id != 1 ||
+	    NAPI_GRO_CB(p)->count != 1 ||
+	    !NAPI_GRO_CB(p)->is_atomic)
+		flush |= NAPI_GRO_CB(p)->flush_id;
+	else
+		NAPI_GRO_CB(p)->is_atomic = false;
+
+	mss = skb_shinfo(p)->gso_size;
+
+	/* If skb is a GRO packet, make sure its gso_size matches prior packet mss.
+	 * If it is a single frame, do not aggregate it if its length
+	 * is bigger than our mss.
+	 */
+	if (unlikely(skb_is_gso(skb)))
+		flush |= (mss != skb_shinfo(skb)->gso_size);
+	else
+		flush |= (len - 1) >= mss;	/* len is unsigned, so len == 0 also sets flush here */
+
+	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);	/* skb must start where p ends */
+#ifdef CONFIG_TLS_DEVICE
+	flush |= p->decrypted ^ skb->decrypted;
+#endif
+
+	if (flush || skb_gro_receive(p, skb)) {
+		mss = 1;
+		goto out_check_final;
+	}
+
+	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
+
+out_check_final:
+	/* Force a flush if last segment is smaller than mss. */
+	if (unlikely(skb_is_gso(skb)))
+		flush = len != NAPI_GRO_CB(skb)->count * skb_shinfo(skb)->gso_size;
+	else
+		flush = len < mss;
+
+	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
+					TCP_FLAG_RST | TCP_FLAG_SYN |
+					TCP_FLAG_FIN));
+
+	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
+		pp = p;
+
+out:
+	NAPI_GRO_CB(skb)->flush |= (flush != 0);
+
+	return pp;
+}
+
+void tcp_gro_complete(struct sk_buff *skb)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+
+	skb->csum_start = (unsigned char *)th - skb->head;	/* TCP header offset from skb->head */
+	skb->csum_offset = offsetof(struct tcphdr, check);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;	/* GRO count becomes the GSO segment count */
+
+	if (th->cwr)
+		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+	if (skb->encapsulation)
+		skb->inner_transport_header = skb->transport_header;
+}
+EXPORT_SYMBOL(tcp_gro_complete);
+
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
+{
+	/* The checksum is only validated when no flush is already pending. */
+	if (NAPI_GRO_CB(skb)->flush ||
+	    !skb_gro_checksum_validate(skb, IPPROTO_TCP,
+				       inet_gro_compute_pseudo))
+		return tcp_gro_receive(head, skb);
+
+	/* Validation returned nonzero: request a flush, merge nothing. */
+	NAPI_GRO_CB(skb)->flush = 1;
+	return NULL;
+}
+
+INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct tcphdr *th = tcp_hdr(skb);
+
+	th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
+				  iph->daddr, 0);	/* base sum 0: only length and addresses go in */
+	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+
+	if (NAPI_GRO_CB(skb)->is_atomic)
+		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID;
+
+	tcp_gro_complete(skb);
+	return 0;	/* always reports success */
+}
+
+static const struct net_offload tcpv4_offload = {	/* TCP-over-IPv4 GSO/GRO callbacks */
+	.callbacks = {
+		.gso_segment	=	tcp4_gso_segment,
+		.gro_receive	=	tcp4_gro_receive,
+		.gro_complete	=	tcp4_gro_complete,
+	},
+};
+
+int __init tcpv4_offload_init(void)
+{
+	return inet_add_offload(&tcpv4_offload, IPPROTO_TCP);	/* register the callbacks for IPPROTO_TCP */
+}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
new file mode 100644
index 0000000000..ab3b7b4b44
--- /dev/null
+++ b/net/ipv4/tcp_output.c
@@ -0,0 +1,4273 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
+ *				:	Fragmentation on mtu decrease
+ *				:	Segment collapse on retransmit
+ *				:	AF independence
+ *
+ *		Linus Torvalds	:	send_delayed_ack
+ *		David S. Miller	:	Charge memory using the right skb
+ *					during syn/ack processing.
+ *		David S. Miller :	Output engine completely rewritten.
+ *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
+ *		Cacophonix Gaul :	draft-minshall-nagle-01
+ *		J Hadi Salim	:	ECN support
+ *
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <net/tcp.h>
+#include <net/mptcp.h>
+
+#include <linux/compiler.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/static_key.h>
+
+#include <trace/events/tcp.h>
+
+/* Refresh clocks of a TCP socket,
+ * ensuring monotonically increasing values.
+ */
+void tcp_mstamp_refresh(struct tcp_sock *tp)
+{
+	u64 val = tcp_clock_ns();
+
+	tp->tcp_clock_cache = val;	/* nanoseconds */
+	tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);	/* same reading in microseconds */
+}
+
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+			   int push_one, gfp_t gfp);	/* forward declaration of a static function */
+
+/* Account for new data that has been sent to the network. */
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int prior_packets = tp->packets_out;	/* sampled before this skb is counted */
+
+	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
+
+	__skb_unlink(skb, &sk->sk_write_queue);	/* move skb from the write queue ... */
+	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);	/* ... into the retransmit rbtree */
+
+	if (tp->highest_sack == NULL)
+		tp->highest_sack = skb;
+
+	tp->packets_out += tcp_skb_pcount(skb);
+	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)	/* packets_out was 0, or a loss probe is pending */
+		tcp_rearm_rto(sk);
+
+	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
+		      tcp_skb_pcount(skb));
+	tcp_check_space(sk);
+}
+
+/* Returns SND.NXT if the window was not shrunk, or if it was shrunk by less
+ * than one window-scaling unit (attributed to loss of precision).
+ * If the window has been shrunk further, the right answer is not clear:
+ * SND.UNA would fail to open the window and SND.NXT is out of window, so the
+ * end of the send window is used for now.
+ */
+static inline __u32 tcp_acceptable_seq(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
+		return tp->snd_nxt;
+	/* Shrunk by less than one scaling unit still counts as "not shrunk". */
+	if (tp->rx_opt.wscale_ok &&
+	    (tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))
+		return tp->snd_nxt;
+	return tcp_wnd_end(tp);
+}
+
+/* Calculate mss to advertise in SYN segment.
+ * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
+ *
+ * 1. It is independent of path mtu.
+ * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
+ * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
+ *    attached devices, because some buggy hosts are confused by
+ *    large MSS.
+ * 4. We do not do 3; we advertise an MSS calculated from the first
+ *    hop device mtu, but allow raising it to ip_rt_min_advmss.
+ *    This may be overridden via information stored in routing table.
+ * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
+ *    probably even Jumbo".
+ */
+static __u16 tcp_advertise_mss(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct dst_entry *dst = __sk_dst_get(sk);
+	int mss = tp->advmss;
+
+	if (dst) {
+		unsigned int metric = dst_metric_advmss(dst);
+
+		if (metric < mss) {	/* route metric is smaller: lower tp->advmss to it */
+			mss = metric;
+			tp->advmss = mss;
+		}
+	}
+
+	return (__u16)mss;
+}
+
+/* RFC2861. Reset CWND after idle period longer than RTO to "restart window".
+ * This is the first part of cwnd validation mechanism.
+ */
+void tcp_cwnd_restart(struct sock *sk, s32 delta)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
+	u32 cwnd = tcp_snd_cwnd(tp);
+
+	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
+
+	tp->snd_ssthresh = tcp_current_ssthresh(sk);
+	restart_cwnd = min(restart_cwnd, cwnd);	/* never restart above the current cwnd */
+
+	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)	/* one halving per RTO taken out of delta */
+		cwnd >>= 1;
+	tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd));	/* halving may undershoot; floor at restart_cwnd */
+	tp->snd_cwnd_stamp = tcp_jiffies32;
+	tp->snd_cwnd_used = 0;
+}
+
+/* Congestion state accounting after a packet has been sent. */
+static void tcp_event_data_sent(struct tcp_sock *tp,
+				struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const u32 now = tcp_jiffies32;
+
+	if (tcp_packets_in_flight(tp) == 0)	/* nothing in flight: signal a transmit start */
+		tcp_ca_event(sk, CA_EVENT_TX_START);
+
+	tp->lsndtime = now;
+
+	/* If this send happens less than ato after the last received
+	 * packet, treat it as a reply and enter pingpong mode.
+	 */
+	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+		inet_csk_enter_pingpong_mode(sk);
+}
+
+/* Account for an ACK we sent. */
+static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (unlikely(tp->compressed_ack)) {
+		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+			      tp->compressed_ack);
+		tp->compressed_ack = 0;
+		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)	/* presumably 1 == a pending timer was cancelled */
+			__sock_put(sk);
+	}
+
+	if (unlikely(rcv_nxt != tp->rcv_nxt))
+		return;  /* Special ACK sent by DCTCP to reflect ECN */
+	tcp_dec_quickack_mode(sk);
+	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);	/* the delayed-ACK timer is cleared */
+}
+
+/* Determine a window scaling and initial window to offer.
+ * Based on the assumption that the given amount of space
+ * will be offered. Store the results through the rcv_wnd,
+ * window_clamp and rcv_wscale pointers.
+ * NOTE: for smooth operation initial space offering should
+ * be a multiple of mss if possible. We assume here that mss >= 1.
+ * This MUST be enforced by all callers.
+ */
+void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
+			       __u32 *rcv_wnd, __u32 *window_clamp,
+			       int wscale_ok, __u8 *rcv_wscale,
+			       __u32 init_rcv_wnd)
+{
+	unsigned int space = (__space < 0 ? 0 : __space);	/* negative input is treated as 0 */
+
+	/* If no clamp set the clamp to the max possible scaled window */
+	if (*window_clamp == 0)
+		(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
+	space = min(*window_clamp, space);
+
+	/* Quantize space offering to a multiple of mss if possible. */
+	if (space > mss)
+		space = rounddown(space, mss);
+
+	/* NOTE: offering an initial window larger than 32767
+	 * will break some buggy TCP stacks. If the admin tells us
+	 * it is likely we could be speaking with such a buggy stack
+	 * we will truncate our initial window offering to 32K-1
+	 * unless the remote has sent us a window scaling option,
+	 * which we interpret as a sign the remote TCP is not
+	 * misinterpreting the window field as a signed quantity.
+	 */
+	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
+		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+	else
+		(*rcv_wnd) = min_t(u32, space, U16_MAX);
+
+	if (init_rcv_wnd)	/* 0 means no extra cap */
+		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
+
+	*rcv_wscale = 0;
+	if (wscale_ok) {
+		/* Set window scaling on max possible window */
+		space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
+		space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
+		space = min_t(u32, space, *window_clamp);
+		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
+				      0, TCP_MAX_WSCALE);	/* result kept within 0..TCP_MAX_WSCALE */
+	}
+	/* Set the clamp no higher than max representable value */
+	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
+}
+EXPORT_SYMBOL(tcp_select_initial_window);
+
+/* Choose a new window to advertise, update state in tcp_sock for the
+ * socket, and return result with RFC1323 scaling applied.  The return
+ * value can be stuffed directly into th->window for an outgoing
+ * frame.
+ */
+static u16 tcp_select_window(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	u32 old_win = tp->rcv_wnd;	/* previous value, used for the zero-window counters below */
+	u32 cur_win, new_win;
+
+	/* Make the window 0 if we failed to queue the data because we
+	 * are out of memory. The window is temporary, so we don't store
+	 * it on the socket.
+	 */
+	if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM))
+		return 0;
+
+	cur_win = tcp_receive_window(tp);
+	new_win = __tcp_select_window(sk);
+	if (new_win < cur_win) {
+		/* Danger Will Robinson!
+		 * Don't update rcv_wup/rcv_wnd here or else
+		 * we will not be able to advertise a zero
+		 * window in time.  --DaveM
+		 *
+		 * Relax Will Robinson.
+		 */
+		if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) {
+			/* Never shrink the offered window */
+			if (new_win == 0)
+				NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV);
+			new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);	/* keep cur_win, rounded up to the scale unit */
+		}
+	}
+
+	tp->rcv_wnd = new_win;
+	tp->rcv_wup = tp->rcv_nxt;
+
+	/* Make sure we do not exceed the maximum possible
+	 * scaled window.
+	 */
+	if (!tp->rx_opt.rcv_wscale &&
+	    READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows))
+		new_win = min(new_win, MAX_TCP_WINDOW);
+	else
+		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
+
+	/* RFC1323 scaling applied */
+	new_win >>= tp->rx_opt.rcv_wscale;
+
+	/* If we advertise zero window, disable fast path. */
+	if (new_win == 0) {
+		tp->pred_flags = 0;
+		if (old_win)
+			NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV);
+	} else if (old_win == 0) {
+		NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV);
+	}
+
+	return new_win;
+}
+
+/* SYN-ACK ECN flags: CWR always cleared, ECE cleared unless TCP_ECN_OK. */
+static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
+
+	if (!(tcp_sk(sk)->ecn_flags & TCP_ECN_OK)) {
+		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
+		return;
+	}
+	if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk))
+		INET_ECN_xmit(sk);
+}
+
+/* Packet ECN state for a SYN.  */
+static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
+	bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
+		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
+
+	if (!use_ecn) {	/* last chance: the route may carry RTAX_FEATURE_ECN */
+		const struct dst_entry *dst = __sk_dst_get(sk);
+
+		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
+			use_ecn = true;
+	}
+
+	tp->ecn_flags = 0;	/* reset; TCP_ECN_OK is set below only when use_ecn */
+
+	if (use_ecn) {
+		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
+		tp->ecn_flags = TCP_ECN_OK;
+		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
+			INET_ECN_xmit(sk);
+	}
+}
+
+static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
+{
+	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
+		/* tp->ecn_flags are cleared at a later point in time when
+		 * the SYN ACK is ultimately received.
+		 */
+		TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
+}
+
+static void
+tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
+{
+	if (inet_rsk(req)->ecn_ok)
+		th->ece = 1;	/* ECE is only ever set here, never cleared */
+}
+
+/* Apply ECN state to a segment that an ESTABLISHED socket is about to send.
+ * Does nothing unless TCP_ECN_OK is set in tp->ecn_flags.
+ */
+static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
+			 struct tcphdr *th, int tcp_header_len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!(tp->ecn_flags & TCP_ECN_OK))
+		return;
+	/* Not-retransmitted data segment: set ECT and inject CWR. */
+	if (skb->len != tcp_header_len &&
+	    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
+		INET_ECN_xmit(sk);
+		if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
+			tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
+			th->cwr = 1;
+			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+		}
+	} else if (!tcp_ca_needs_ecn(sk)) {
+		/* ACK or retransmitted segment: clear ECT|CE */
+		INET_ECN_dontxmit(sk);
+	}
+	if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
+		th->ece = 1;
+}
+
+/* Constructs common control bits of non-data skb. If SYN/FIN is present,
+ * auto increment end seqno.
+ */
+static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
+{
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	TCP_SKB_CB(skb)->tcp_flags = flags;
+
+	tcp_skb_pcount_set(skb, 1);	/* counted as a single packet */
+
+	TCP_SKB_CB(skb)->seq = seq;
+	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
+		seq++;	/* end_seq is seq + 1 when SYN or FIN is set */
+	TCP_SKB_CB(skb)->end_seq = seq;
+}
+
+static inline bool tcp_urg_mode(const struct tcp_sock *tp)
+{
+	return tp->snd_una != tp->snd_up;	/* true while snd_up differs from snd_una */
+}
+
+#define OPTION_SACK_ADVERTISE	BIT(0)	/* write SACK_PERM */
+#define OPTION_TS		BIT(1)	/* write timestamps (tsval/tsecr) */
+#define OPTION_MD5		BIT(2)	/* write the MD5SIG option header and leave room for the hash */
+#define OPTION_WSCALE		BIT(3)	/* write window scale */
+#define OPTION_FAST_OPEN_COOKIE	BIT(8)	/* write the Fast Open cookie; bits 4-7 are not defined here */
+#define OPTION_SMC		BIT(9)	/* write the SMC experimental option */
+#define OPTION_MPTCP		BIT(10)	/* write MPTCP options */
+
+static void smc_options_write(__be32 *ptr, u16 *options)	/* NOTE(review): ptr is by value; the caller's pointer is not advanced */
+{
+#if IS_ENABLED(CONFIG_SMC)
+	if (static_branch_unlikely(&tcp_have_smc)) {
+		if (unlikely(OPTION_SMC & *options)) {
+			*ptr++ = htonl((TCPOPT_NOP  << 24) |	/* word 1: NOP, NOP, EXP kind, option length */
+				       (TCPOPT_NOP  << 16) |
+				       (TCPOPT_EXP <<  8) |
+				       (TCPOLEN_EXP_SMC_BASE));
+			*ptr++ = htonl(TCPOPT_SMC_MAGIC);	/* word 2: SMC magic value */
+		}
+	}
+#endif
+}
+
+struct tcp_out_options {	/* options computed for a segment, before wire encoding */
+	u16 options;		/* bit field of OPTION_* */
+	u16 mss;		/* 0 to disable */
+	u8 ws;			/* window scale, 0 to disable */
+	u8 num_sack_blocks;	/* number of SACK blocks to include */
+	u8 hash_size;		/* bytes in hash_location */
+	u8 bpf_opt_len;		/* length of BPF hdr option */
+	__u8 *hash_location;	/* temporary pointer, overloaded */
+	__u32 tsval, tsecr;	/* need to include OPTION_TS */
+	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
+	struct mptcp_out_options mptcp;	/* consumed when OPTION_MPTCP is set */
+};
+
+static void mptcp_options_write(struct tcphdr *th, __be32 *ptr,
+				struct tcp_sock *tp,
+				struct tcp_out_options *opts)
+{
+#if IS_ENABLED(CONFIG_MPTCP)
+	if (unlikely(OPTION_MPTCP & opts->options))	/* no-op unless OPTION_MPTCP was requested */
+		mptcp_write_options(th, ptr, tp, &opts->mptcp);
+#endif
+}
+
+#ifdef CONFIG_CGROUP_BPF
+static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
+					enum tcp_synack_type synack_type)
+{
+	if (unlikely(!skb))	/* no skb supplied: report the current-MSS context */
+		return BPF_WRITE_HDR_TCP_CURRENT_MSS;
+
+	if (unlikely(synack_type == TCP_SYNACK_COOKIE))
+		return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
+
+	return 0;	/* no special context flag */
+}
+
+/* req, syn_skb and synack_type are used when writing synack */
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+				  struct request_sock *req,
+				  struct sk_buff *syn_skb,
+				  enum tcp_synack_type synack_type,
+				  struct tcp_out_options *opts,
+				  unsigned int *remaining)
+{
+	struct bpf_sock_ops_kern sock_ops;
+	int err;
+
+	if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+					   BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
+	    !*remaining)	/* callback flag not set, or no option space left */
+		return;
+
+	/* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
+
+	/* init sock_ops */
+	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));	/* zero only the fields before 'temp' */
+
+	sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
+
+	if (req) {
+		/* The listen "sk" cannot be passed here because
+		 * it is not locked.  It would not make too much
+		 * sense to do bpf_setsockopt(listen_sk) based
+		 * on individual connection request also.
+		 *
+		 * Thus, "req" is passed here and the cgroup-bpf-progs
+		 * of the listen "sk" will be run.
+		 *
+		 * "req" is also used here for fastopen even the "sk" here is
+		 * a fullsock "child" sk.  It is to keep the behavior
+		 * consistent between fastopen and non-fastopen on
+		 * the bpf programming side.
+		 */
+		sock_ops.sk = (struct sock *)req;
+		sock_ops.syn_skb = syn_skb;
+	} else {
+		sock_owned_by_me(sk);
+
+		sock_ops.is_fullsock = 1;
+		sock_ops.sk = sk;
+	}
+
+	sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+	sock_ops.remaining_opt_len = *remaining;
+	/* tcp_current_mss() does not pass a skb */
+	if (skb)
+		bpf_skops_init_skb(&sock_ops, skb, 0);
+
+	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+	if (err || sock_ops.remaining_opt_len == *remaining)	/* program failed or reserved nothing */
+		return;
+
+	opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;	/* bytes the program reserved */
+	/* round up to 4 bytes */
+	opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
+
+	*remaining -= opts->bpf_opt_len;
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+				    struct request_sock *req,
+				    struct sk_buff *syn_skb,
+				    enum tcp_synack_type synack_type,
+				    struct tcp_out_options *opts)
+{
+	u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
+	struct bpf_sock_ops_kern sock_ops;
+	int err;
+
+	if (likely(!max_opt_len))	/* nothing was reserved for BPF options */
+		return;
+
+	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));	/* zero only the fields before 'temp' */
+
+	sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+
+	if (req) {
+		sock_ops.sk = (struct sock *)req;
+		sock_ops.syn_skb = syn_skb;
+	} else {
+		sock_owned_by_me(sk);
+
+		sock_ops.is_fullsock = 1;
+		sock_ops.sk = sk;
+	}
+
+	sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+	sock_ops.remaining_opt_len = max_opt_len;
+	first_opt_off = tcp_hdrlen(skb) - max_opt_len;	/* BPF area is the last max_opt_len bytes of the TCP header */
+	bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
+
+	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+	if (err)
+		nr_written = 0;	/* on error the whole area is treated as unwritten */
+	else
+		nr_written = max_opt_len - sock_ops.remaining_opt_len;
+
+	if (nr_written < max_opt_len)	/* fill the unused reserved bytes with NOPs */
+		memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
+		       max_opt_len - nr_written);
+}
+#else
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+				  struct request_sock *req,
+				  struct sk_buff *syn_skb,
+				  enum tcp_synack_type synack_type,
+				  struct tcp_out_options *opts,
+				  unsigned int *remaining)
+{	/* empty stub used when CONFIG_CGROUP_BPF is not defined */
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+				    struct request_sock *req,
+				    struct sk_buff *syn_skb,
+				    enum tcp_synack_type synack_type,
+				    struct tcp_out_options *opts)
+{	/* empty stub used when CONFIG_CGROUP_BPF is not defined */
+}
+#endif
+
+/* Write previously computed TCP options to the packet.
+ *
+ * Beware: Something in the Internet is very sensitive to the ordering of
+ * TCP options, we learned this the hard way, so be careful here.
+ * Luckily we can at least blame others for their non-compliance but from
+ * inter-operability perspective it seems that we're somewhat stuck with
+ * the ordering which we have been using if we want to keep working with
+ * those broken things (not that it currently hurts anybody as there isn't
+ * particular reason why the ordering would need to be changed).
+ *
+ * At least SACK_PERM as the first option is known to lead to a disaster
+ * (but it may well be that other scenarios fail similarly).
+ */
+static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
+			      struct tcp_out_options *opts)
+{
+	__be32 *ptr = (__be32 *)(th + 1);	/* options start right after the fixed header */
+	u16 options = opts->options;	/* mungable copy */
+
+	if (unlikely(OPTION_MD5 & options)) {
+		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
+		/* overload cookie hash location */
+		opts->hash_location = (__u8 *)ptr;
+		ptr += 4;	/* skip 4 words (16 bytes) at hash_location; not written here */
+	}
+
+	if (unlikely(opts->mss)) {
+		*ptr++ = htonl((TCPOPT_MSS << 24) |
+			       (TCPOLEN_MSS << 16) |
+			       opts->mss);
+	}
+
+	if (likely(OPTION_TS & options)) {
+		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
+			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |	/* SACK_PERM takes the place of the two NOPs */
+				       (TCPOLEN_SACK_PERM << 16) |
+				       (TCPOPT_TIMESTAMP << 8) |
+				       TCPOLEN_TIMESTAMP);
+			options &= ~OPTION_SACK_ADVERTISE;	/* already emitted; skip the standalone copy below */
+		} else {
+			*ptr++ = htonl((TCPOPT_NOP << 24) |
+				       (TCPOPT_NOP << 16) |
+				       (TCPOPT_TIMESTAMP << 8) |
+				       TCPOLEN_TIMESTAMP);
+		}
+		*ptr++ = htonl(opts->tsval);
+		*ptr++ = htonl(opts->tsecr);
+	}
+
+	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
+		*ptr++ = htonl((TCPOPT_NOP << 24) |
+			       (TCPOPT_NOP << 16) |
+			       (TCPOPT_SACK_PERM << 8) |
+			       TCPOLEN_SACK_PERM);
+	}
+
+	if (unlikely(OPTION_WSCALE & options)) {
+		*ptr++ = htonl((TCPOPT_NOP << 24) |
+			       (TCPOPT_WINDOW << 16) |
+			       (TCPOLEN_WINDOW << 8) |
+			       opts->ws);
+	}
+
+	if (unlikely(opts->num_sack_blocks)) {
+		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
+			tp->duplicate_sack : tp->selective_acks;
+		int this_sack;
+
+		*ptr++ = htonl((TCPOPT_NOP  << 24) |
+			       (TCPOPT_NOP  << 16) |
+			       (TCPOPT_SACK <<  8) |
+			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
+						     TCPOLEN_SACK_PERBLOCK)));
+
+		for (this_sack = 0; this_sack < opts->num_sack_blocks;
+		     ++this_sack) {
+			*ptr++ = htonl(sp[this_sack].start_seq);
+			*ptr++ = htonl(sp[this_sack].end_seq);
+		}
+
+		tp->rx_opt.dsack = 0;	/* dsack flag is reset once SACK blocks have been written */
+	}
+
+	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
+		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+		u8 *p = (u8 *)ptr;
+		u32 len; /* Fast Open option length */
+
+		if (foc->exp) {
+			len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+			*ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
+				     TCPOPT_FASTOPEN_MAGIC);
+			p += TCPOLEN_EXP_FASTOPEN_BASE;
+		} else {
+			len = TCPOLEN_FASTOPEN_BASE + foc->len;
+			*p++ = TCPOPT_FASTOPEN;
+			*p++ = len;
+		}
+
+		memcpy(p, foc->val, foc->len);
+		if ((len & 3) == 2) {	/* two bytes short of a word boundary: pad with two NOPs */
+			p[foc->len] = TCPOPT_NOP;
+			p[foc->len + 1] = TCPOPT_NOP;
+		}
+		ptr += (len + 3) >> 2;	/* advance by len rounded up to whole 32-bit words */
+	}
+
+	smc_options_write(ptr, &options);	/* NOTE(review): ptr passed by value; not advanced for the call below */
+
+	mptcp_options_write(th, ptr, tp, opts);
+}
+
+/* Request the SMC option when tp->syn_smc is set and *remaining has room.
+ * The body is compiled out entirely without CONFIG_SMC.
+ */
+static void smc_set_option(const struct tcp_sock *tp,
+			   struct tcp_out_options *opts,
+			   unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_SMC)
+	if (!static_branch_unlikely(&tcp_have_smc) || !tp->syn_smc ||
+	    *remaining < TCPOLEN_EXP_SMC_BASE_ALIGNED)
+		return;
+	opts->options |= OPTION_SMC;
+	*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
+#endif
+}
+
+/* Like smc_set_option(), but additionally requires ireq->smc_ok. */
+static void smc_set_option_cond(const struct tcp_sock *tp,
+				const struct inet_request_sock *ireq,
+				struct tcp_out_options *opts,
+				unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_SMC)
+	if (!static_branch_unlikely(&tcp_have_smc) ||
+	    !tp->syn_smc || !ireq->smc_ok ||
+	    *remaining < TCPOLEN_EXP_SMC_BASE_ALIGNED)
+		return;
+
+	opts->options |= OPTION_SMC;
+	*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
+#endif
+}
+
+/* Reserve room for the MPTCP SYN-ACK option when it fits in *remaining. */
+static void mptcp_set_option_cond(const struct request_sock *req,
+				  struct tcp_out_options *opts,
+				  unsigned int *remaining)
+{
+	unsigned int size;
+
+	if (!rsk_is_mptcp(req) ||
+	    !mptcp_synack_options(req, &size, &opts->mptcp))
+		return;
+	if (*remaining < size)
+		return;
+	opts->options |= OPTION_MPTCP;
+	*remaining -= size;
+}
+
+/* Compute TCP options for SYN packets. This is not the final
+ * network wire format yet.
+ *
+ * @sk:   socket sending the SYN
+ * @skb:  the SYN segment; its timestamp seeds the TS option value
+ * @opts: option descriptor to fill in.  Flags are OR-ed into
+ *        opts->options, so the caller must have cleared it beforehand.
+ * @md5:  set to the MD5 key found for this socket, or NULL
+ *
+ * Returns the number of option bytes reserved, i.e. MAX_TCP_OPTION_SPACE
+ * minus whatever is left once every option has been accounted for.
+ */
+static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
+				struct tcp_out_options *opts,
+				struct tcp_md5sig_key **md5)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int remaining = MAX_TCP_OPTION_SPACE;
+	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
+
+	*md5 = NULL;
+#ifdef CONFIG_TCP_MD5SIG
+	if (static_branch_unlikely(&tcp_md5_needed.key) &&
+	    rcu_access_pointer(tp->md5sig_info)) {
+		*md5 = tp->af_specific->md5_lookup(sk, sk);
+		if (*md5) {
+			opts->options |= OPTION_MD5;
+			remaining -= TCPOLEN_MD5SIG_ALIGNED;
+		}
+	}
+#endif
+
+	/* We always get an MSS option.  The option bytes which will be seen in
+	 * normal data packets should timestamps be used, must be in the MSS
+	 * advertised.  But we subtract them from tp->mss_cache so that
+	 * calculations in tcp_sendmsg are simpler etc.  So account for this
+	 * fact here if necessary.  If we don't do this correctly, as a
+	 * receiver we won't recognize data packets as being full sized when we
+	 * should, and thus we won't abide by the delayed ACK rules correctly.
+	 * SACKs don't matter, we never delay an ACK when we have any of those
+	 * going out.  */
+	opts->mss = tcp_advertise_mss(sk);
+	remaining -= TCPOLEN_MSS_ALIGNED;
+
+	/* Timestamps are not sent together with an MD5 signature. */
+	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
+		opts->options |= OPTION_TS;
+		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
+		opts->tsecr = tp->rx_opt.ts_recent;
+		remaining -= TCPOLEN_TSTAMP_ALIGNED;
+	}
+	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
+		opts->ws = tp->rx_opt.rcv_wscale;
+		opts->options |= OPTION_WSCALE;
+		remaining -= TCPOLEN_WSCALE_ALIGNED;
+	}
+	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
+		opts->options |= OPTION_SACK_ADVERTISE;
+		/* SACK-permitted only costs extra space when no timestamp
+		 * option is being sent.
+		 */
+		if (unlikely(!(OPTION_TS & opts->options)))
+			remaining -= TCPOLEN_SACKPERM_ALIGNED;
+	}
+
+	if (fastopen && fastopen->cookie.len >= 0) {
+		u32 need = fastopen->cookie.len;
+
+		need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+					       TCPOLEN_FASTOPEN_BASE;
+		need = (need + 3) & ~3U;  /* Align to 32 bits */
+		if (remaining >= need) {
+			opts->options |= OPTION_FAST_OPEN_COOKIE;
+			opts->fastopen_cookie = &fastopen->cookie;
+			remaining -= need;
+			tp->syn_fastopen = 1;
+			tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
+		}
+	}
+
+	smc_set_option(tp, opts, &remaining);
+
+	if (sk_is_mptcp(sk)) {
+		unsigned int size;
+
+		if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
+			/* Like every other optional option above, only take
+			 * the space if it is really there: "remaining" is
+			 * unsigned and must never wrap, otherwise the length
+			 * returned below would exceed MAX_TCP_OPTION_SPACE.
+			 */
+			if (remaining >= size) {
+				opts->options |= OPTION_MPTCP;
+				remaining -= size;
+			}
+		}
+	}
+
+	bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
+	return MAX_TCP_OPTION_SPACE - remaining;
+}
+
+/* Set up TCP options for SYN-ACKs.
+ *
+ * @sk:          listener socket
+ * @req:         request socket the SYN-ACK answers
+ * @mss:         MSS value to advertise
+ * @skb:         the SYN-ACK being built; its timestamp seeds the TS value
+ * @opts:        option descriptor; flags are OR-ed into opts->options
+ * @md5:         MD5 key to sign with, or NULL
+ * @foc:         Fast Open cookie to echo, or NULL
+ * @synack_type: kind of SYN-ACK, forwarded to the BPF hook
+ * @syn_skb:     forwarded to the BPF hook
+ *
+ * Returns the number of option bytes reserved
+ * (MAX_TCP_OPTION_SPACE minus what is left).
+ */
+static unsigned int tcp_synack_options(const struct sock *sk,
+				       struct request_sock *req,
+				       unsigned int mss, struct sk_buff *skb,
+				       struct tcp_out_options *opts,
+				       const struct tcp_md5sig_key *md5,
+				       struct tcp_fastopen_cookie *foc,
+				       enum tcp_synack_type synack_type,
+				       struct sk_buff *syn_skb)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+	unsigned int remaining = MAX_TCP_OPTION_SPACE;
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (md5) {
+		opts->options |= OPTION_MD5;
+		remaining -= TCPOLEN_MD5SIG_ALIGNED;
+
+		/* We can't fit any SACK blocks in a packet with MD5 + TS
+		 * options. There was discussion about disabling SACK
+		 * rather than TS in order to fit in better with old,
+		 * buggy kernels, but that was deemed to be unnecessary.
+		 */
+		if (synack_type != TCP_SYNACK_COOKIE)
+			ireq->tstamp_ok &= !ireq->sack_ok;
+	}
+#endif
+
+	/* We always send an MSS option. */
+	opts->mss = mss;
+	remaining -= TCPOLEN_MSS_ALIGNED;
+
+	if (likely(ireq->wscale_ok)) {
+		opts->ws = ireq->rcv_wscale;
+		opts->options |= OPTION_WSCALE;
+		remaining -= TCPOLEN_WSCALE_ALIGNED;
+	}
+	if (likely(ireq->tstamp_ok)) {
+		opts->options |= OPTION_TS;
+		opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
+		opts->tsecr = READ_ONCE(req->ts_recent);
+		remaining -= TCPOLEN_TSTAMP_ALIGNED;
+	}
+	if (likely(ireq->sack_ok)) {
+		opts->options |= OPTION_SACK_ADVERTISE;
+		/* SACK-permitted only costs extra space without timestamps. */
+		if (unlikely(!ireq->tstamp_ok))
+			remaining -= TCPOLEN_SACKPERM_ALIGNED;
+	}
+	/* The Fast Open cookie is dropped silently if it does not fit. */
+	if (foc != NULL && foc->len >= 0) {
+		u32 need = foc->len;
+
+		need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+				   TCPOLEN_FASTOPEN_BASE;
+		need = (need + 3) & ~3U;  /* Align to 32 bits */
+		if (remaining >= need) {
+			opts->options |= OPTION_FAST_OPEN_COOKIE;
+			opts->fastopen_cookie = foc;
+			remaining -= need;
+		}
+	}
+
+	mptcp_set_option_cond(req, opts, &remaining);
+
+	smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+
+	/* BPF gets whatever space is left after all built-in options. */
+	bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
+			      synack_type, opts, &remaining);
+
+	return MAX_TCP_OPTION_SPACE - remaining;
+}
+
+/* Compute TCP options for ESTABLISHED sockets. This is not the
+ * final wire format yet.
+ *
+ * @sk:   socket the segment belongs to
+ * @skb:  segment being built, or NULL when the caller only wants the
+ *        option size (the timestamp value is then left at 0)
+ * @opts: option descriptor; opts->options and opts->num_sack_blocks are
+ *        always (re)initialised here
+ * @md5:  set to the MD5 key in use for this socket, or NULL
+ *
+ * Returns the number of option bytes that will be written.
+ */
+static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
+					struct tcp_out_options *opts,
+					struct tcp_md5sig_key **md5)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int size = 0;
+	unsigned int eff_sacks;
+
+	opts->options = 0;
+
+	*md5 = NULL;
+#ifdef CONFIG_TCP_MD5SIG
+	if (static_branch_unlikely(&tcp_md5_needed.key) &&
+	    rcu_access_pointer(tp->md5sig_info)) {
+		*md5 = tp->af_specific->md5_lookup(sk, sk);
+		if (*md5) {
+			opts->options |= OPTION_MD5;
+			size += TCPOLEN_MD5SIG_ALIGNED;
+		}
+	}
+#endif
+
+	if (likely(tp->rx_opt.tstamp_ok)) {
+		opts->options |= OPTION_TS;
+		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
+		opts->tsecr = tp->rx_opt.ts_recent;
+		size += TCPOLEN_TSTAMP_ALIGNED;
+	}
+
+	/* MPTCP options have precedence over SACK for the limited TCP
+	 * option space because a MPTCP connection would be forced to
+	 * fall back to regular TCP if a required multipath option is
+	 * missing. SACK still gets a chance to use whatever space is
+	 * left.
+	 */
+	if (sk_is_mptcp(sk)) {
+		unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+		unsigned int opt_size = 0;
+
+		if (mptcp_established_options(sk, skb, &opt_size, remaining,
+					      &opts->mptcp)) {
+			opts->options |= OPTION_MPTCP;
+			size += opt_size;
+		}
+	}
+
+	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
+	if (unlikely(eff_sacks)) {
+		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+
+		/* Emit as many SACK blocks as fit.  When not even one block
+		 * fits, send none, but do NOT return early: the BPF
+		 * header-option hook below must still get a chance to use
+		 * the space that is left.
+		 */
+		if (likely(remaining >= TCPOLEN_SACK_BASE_ALIGNED +
+					TCPOLEN_SACK_PERBLOCK)) {
+			opts->num_sack_blocks =
+				min_t(unsigned int, eff_sacks,
+				      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
+				      TCPOLEN_SACK_PERBLOCK);
+
+			size += TCPOLEN_SACK_BASE_ALIGNED +
+				opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+		} else {
+			opts->num_sack_blocks = 0;
+		}
+	} else {
+		/* Not every caller zeroes @opts before calling us. */
+		opts->num_sack_blocks = 0;
+	}
+
+	if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
+					    BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
+		unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+
+		bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
+		size = MAX_TCP_OPTION_SPACE - remaining;
+	}
+
+	return size;
+}
+
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * The goal of TSQ is to keep a small amount of skbs per TCP flow in the tx
+ * queues (qdisc + device) to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * It is important that tcp_wfree() can be replaced by sock_wfree() in the
+ * event the skb needs to be reallocated in a driver.
+ * The invariant is that skb->truesize is subtracted from sk->sk_wmem_alloc.
+ *
+ * Since transmitting from an skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+	struct tasklet_struct	tasklet;
+	struct list_head	head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+/* Push out more data for a socket that was throttled by TSQ.
+ *
+ * Only acts while the socket is in ESTABLISHED, FIN_WAIT1, CLOSING,
+ * CLOSE_WAIT or LAST_ACK.  Pending retransmits go first when there are
+ * lost segments not yet retransmitted and cwnd has room for them.
+ */
+static void tcp_tsq_write(struct sock *sk)
+{
+	struct tcp_sock *tp;
+
+	if (!((1 << sk->sk_state) &
+	      (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
+	       TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)))
+		return;
+
+	tp = tcp_sk(sk);
+
+	if (tp->lost_out > tp->retrans_out &&
+	    tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) {
+		tcp_mstamp_refresh(tp);
+		tcp_xmit_retransmit_queue(sk);
+	}
+
+	tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, 0, GFP_ATOMIC);
+}
+
+/* Run tcp_tsq_write() now, or defer it if the socket is owned by user
+ * context.  In the deferred case TCP_TSQ_DEFERRED is set and, if it was
+ * not already set, a socket reference is taken for the deferred work.
+ */
+static void tcp_tsq_handler(struct sock *sk)
+{
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+			sock_hold(sk);
+	} else {
+		tcp_tsq_write(sk);
+	}
+	bh_unlock_sock(sk);
+}
+/*
+ * One tasklet per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transferring tsq->head because tcp_wfree() might
+ * interrupt us (non NAPI drivers)
+ */
+static void tcp_tasklet_func(struct tasklet_struct *t)
+{
+	struct tsq_tasklet *tsq = from_tasklet(tsq,  t, tasklet);
+	LIST_HEAD(list);
+	unsigned long flags;
+	struct list_head *q, *n;
+	struct tcp_sock *tp;
+	struct sock *sk;
+
+	/* Detach the whole pending queue onto a private list. */
+	local_irq_save(flags);
+	list_splice_init(&tsq->head, &list);
+	local_irq_restore(flags);
+
+	list_for_each_safe(q, n, &list) {
+		tp = list_entry(q, struct tcp_sock, tsq_node);
+		list_del(&tp->tsq_node);
+
+		sk = (struct sock *)tp;
+		/* Clear TSQ_QUEUED only after the node has been unlinked,
+		 * so that tcp_wfree() may queue this socket again.
+		 */
+		smp_mb__before_atomic();
+		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+
+		tcp_tsq_handler(sk);
+		/* Drop the sk_wmem_alloc reference kept by tcp_wfree(). */
+		sk_free(sk);
+	}
+}
+
+/* All "work was deferred because the socket was owned" flag bits that
+ * tcp_release_cb() consumes.
+ */
+#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
+			  TCPF_WRITE_TIMER_DEFERRED |	\
+			  TCPF_DELACK_TIMER_DEFERRED |	\
+			  TCPF_MTU_REDUCED_DEFERRED)
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ *
+ * Atomically clears every TCP_DEFERRED_ALL bit found in sk->sk_tsq_flags
+ * and then runs the handler for each bit that was set, dropping one
+ * socket reference per handled bit.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+	unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags);
+	unsigned long nflags;
+
+	/* perform an atomic operation only if at least one flag is set */
+	do {
+		if (!(flags & TCP_DEFERRED_ALL))
+			return;
+		nflags = flags & ~TCP_DEFERRED_ALL;
+	} while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags));
+
+	/* From here on "flags" holds the bits that were set before clearing. */
+	if (flags & TCPF_TSQ_DEFERRED) {
+		tcp_tsq_write(sk);
+		/* Pairs with the sock_hold() in tcp_tsq_handler(). */
+		__sock_put(sk);
+	}
+	/* Here begins the tricky part :
+	 * We are called from release_sock() with :
+	 * 1) BH disabled
+	 * 2) sk_lock.slock spinlock held
+	 * 3) socket owned by us (sk->sk_lock.owned == 1)
+	 *
+	 * But following code is meant to be called from BH handlers,
+	 * so we should keep BH disabled, but early release socket ownership
+	 */
+	sock_release_ownership(sk);
+
+	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
+		tcp_write_timer_handler(sk);
+		__sock_put(sk);
+	}
+	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
+		tcp_delack_timer_handler(sk);
+		__sock_put(sk);
+	}
+	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
+		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
+		__sock_put(sk);
+	}
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+/* Boot-time setup of the per-cpu TSQ state: for every possible cpu,
+ * give it an empty socket queue and bind its tasklet to
+ * tcp_tasklet_func().
+ */
+void __init tcp_tasklet_init(void)
+{
+	struct tsq_tasklet *tsq;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		tsq = &per_cpu(tsq_tasklet, cpu);
+		INIT_LIST_HEAD(&tsq->head);
+		tasklet_setup(&tsq->tasklet, tcp_tasklet_func);
+	}
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We can't xmit new skbs from this context, as we might already
+ * hold qdisc lock.
+ *
+ * If the socket is marked TSQF_THROTTLED and not yet TSQF_QUEUED, it is
+ * put on this cpu's tsq_tasklet queue so the tasklet can resume
+ * transmission; otherwise the last sk_wmem_alloc reference is simply
+ * dropped with sk_free().
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long flags, nval, oval;
+	struct tsq_tasklet *tsq;
+	bool empty;
+
+	/* Keep one reference on sk_wmem_alloc.
+	 * Will be released by sk_free() from here or tcp_tasklet_func()
+	 */
+	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
+
+	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
+	 * Wait until our queues (qdisc + devices) are drained.
+	 * This gives :
+	 * - less callbacks to tcp_write_xmit(), reducing stress (batches)
+	 * - chance for incoming ACK (processed by another cpu maybe)
+	 *   to migrate this flow (skb->ooo_okay will be eventually set)
+	 */
+	if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+		goto out;
+
+	/* Atomically turn THROTTLED into QUEUED; bail out if the socket is
+	 * not throttled or is already queued.
+	 */
+	oval = smp_load_acquire(&sk->sk_tsq_flags);
+	do {
+		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+			goto out;
+
+		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
+	} while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval));
+
+	/* queue this socket to tasklet queue */
+	local_irq_save(flags);
+	tsq = this_cpu_ptr(&tsq_tasklet);
+	empty = list_empty(&tsq->head);
+	list_add(&tp->tsq_node, &tsq->head);
+	/* Only the empty -> non-empty transition schedules the tasklet. */
+	if (empty)
+		tasklet_schedule(&tsq->tasklet);
+	local_irq_restore(flags);
+	/* The kept reference is released later by tcp_tasklet_func(). */
+	return;
+out:
+	sk_free(sk);
+}
+
+/* Pacing hrtimer callback.
+ *
+ * Note: called under soft irq.  The TCP stack can be entered right away
+ * unless the socket is owned by user; tcp_tsq_handler() copes with both
+ * cases.  One socket reference is dropped before returning, and the
+ * timer is never restarted from here.
+ */
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
+{
+	struct sock *sk;
+
+	sk = (struct sock *)container_of(timer, struct tcp_sock, pacing_timer);
+
+	tcp_tsq_handler(sk);
+	sock_put(sk);
+
+	return HRTIMER_NORESTART;
+}
+
+/* Bookkeeping after @skb was handed to the lower layers.
+ *
+ * When pacing is in use, advance tp->tcp_wstamp_ns by the time @skb needs
+ * on the wire at the current pacing rate, then move @skb to the tail of
+ * the time-sorted sent queue.
+ *
+ * @prior_wstamp: value of tp->tcp_wstamp_ns before this transmission
+ */
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
+				      u64 prior_wstamp)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long rate = 0;
+
+	/* A zero rate means "do not pace" below. */
+	if (sk->sk_pacing_status != SK_PACING_NONE)
+		rate = sk->sk_pacing_rate;
+
+	/* Original sch_fq does not pace first 10 MSS
+	 * Note that tp->data_segs_out overflows after 2^32 packets,
+	 * this is a minor annoyance.
+	 */
+	if (rate && rate != ~0UL && tp->data_segs_out >= 10) {
+		u64 xmit_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
+		u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
+
+		/* take into account OS jitter */
+		xmit_ns -= min_t(u64, xmit_ns / 2, credit);
+		tp->tcp_wstamp_ns += xmit_ns;
+	}
+
+	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+}
+
+/* Prototypes of functions named as INDIRECT_CALL_INET() targets in
+ * __tcp_transmit_skb().
+ */
+INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
+INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
+INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
+
+/* This routine actually transmits TCP packets queued in by
+ * tcp_do_sendmsg().  This is used by both the initial
+ * transmission and possible later retransmissions.
+ * All SKB's seen here are completely headerless.  It is our
+ * job to build the TCP header, and pass the packet down to
+ * IP so it can do the same plus pass the packet off to the
+ * device.
+ *
+ * We are working here with either a clone of the original
+ * SKB, or a fresh unique copy made by the retransmit engine.
+ *
+ * @sk:       sending socket
+ * @skb:      headerless segment to send
+ * @clone_it: when non-zero, @skb is kept and a clone (or a private copy
+ *            if @skb is already cloned) is transmitted instead
+ * @gfp_mask: allocation flags used for that clone/copy
+ * @rcv_nxt:  value placed in the ACK sequence field
+ *
+ * Returns -ENOBUFS if the clone/copy could not be allocated, otherwise
+ * the queue_xmit() result; positive results enter CWR and are mapped
+ * through net_xmit_eval().
+ */
+static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
+			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_sock *inet;
+	struct tcp_sock *tp;
+	struct tcp_skb_cb *tcb;
+	struct tcp_out_options opts;
+	unsigned int tcp_options_size, tcp_header_size;
+	struct sk_buff *oskb = NULL;
+	struct tcp_md5sig_key *md5;
+	struct tcphdr *th;
+	u64 prior_wstamp;
+	int err;
+
+	BUG_ON(!skb || !tcp_skb_pcount(skb));
+	tp = tcp_sk(sk);
+	/* Remember the previous departure time; it is needed by
+	 * tcp_update_skb_after_send() once the packet has been sent.
+	 */
+	prior_wstamp = tp->tcp_wstamp_ns;
+	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
+	skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
+	if (clone_it) {
+		oskb = skb;
+
+		tcp_skb_tsorted_save(oskb) {
+			if (unlikely(skb_cloned(oskb)))
+				skb = pskb_copy(oskb, gfp_mask);
+			else
+				skb = skb_clone(oskb, gfp_mask);
+		} tcp_skb_tsorted_restore(oskb);
+
+		if (unlikely(!skb))
+			return -ENOBUFS;
+		/* retransmit skbs might have a non zero value in skb->dev
+		 * because skb->dev is aliased with skb->rbnode.rb_left
+		 */
+		skb->dev = NULL;
+	}
+
+	inet = inet_sk(sk);
+	tcb = TCP_SKB_CB(skb);
+	/* The option helpers OR flags into opts, so start from zero. */
+	memset(&opts, 0, sizeof(opts));
+
+	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
+		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
+	} else {
+		tcp_options_size = tcp_established_options(sk, skb, &opts,
+							   &md5);
+		/* Force a PSH flag on all (GSO) packets to expedite GRO flush
+		 * at receiver : This slightly improve GRO performance.
+		 * Note that we do not force the PSH flag for non GSO packets,
+		 * because they might be sent under high congestion events,
+		 * and in this case it is better to delay the delivery of 1-MSS
+		 * packets and thus the corresponding ACK packet that would
+		 * release the following packet.
+		 */
+		if (tcp_skb_pcount(skb) > 1)
+			tcb->tcp_flags |= TCPHDR_PSH;
+	}
+	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
+
+	/* We set skb->ooo_okay to one if this packet can select
+	 * a different TX queue than prior packets of this flow,
+	 * to avoid self inflicted reorders.
+	 * The 'other' queue decision is based on current cpu number
+	 * if XPS is enabled, or sk->sk_txhash otherwise.
+	 * We can switch to another (and better) queue if:
+	 * 1) No packet with payload is in qdisc/device queues.
+	 *    Delays in TX completion can defeat the test
+	 *    even if packets were already sent.
+	 * 2) Or rtx queue is empty.
+	 *    This mitigates above case if ACK packets for
+	 *    all prior packets were already processed.
+	 */
+	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) ||
+			tcp_rtx_queue_empty(sk);
+
+	/* If we had to use memory reserve to allocate this skb,
+	 * this might cause drops if packet is looped back :
+	 * Other socket might not have SOCK_MEMALLOC.
+	 * Packets not looped back do not care about pfmemalloc.
+	 */
+	skb->pfmemalloc = 0;
+
+	skb_push(skb, tcp_header_size);
+	skb_reset_transport_header(skb);
+
+	/* Take ownership of the skb for write-memory accounting; pure ACKs
+	 * use __sock_wfree, everything else goes through the TSQ destructor.
+	 */
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
+	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+
+	skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm));
+
+	/* Build TCP header and checksum it. */
+	th = (struct tcphdr *)skb->data;
+	th->source		= inet->inet_sport;
+	th->dest		= inet->inet_dport;
+	th->seq			= htonl(tcb->seq);
+	th->ack_seq		= htonl(rcv_nxt);
+	/* 16-bit word 6 of the header: data offset (in 32-bit words) in the
+	 * top 4 bits, TCP flags in the low bits.
+	 */
+	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
+					tcb->tcp_flags);
+
+	th->check		= 0;
+	th->urg_ptr		= 0;
+
+	/* The urg_mode check is necessary during a below snd_una win probe */
+	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
+		if (before(tp->snd_up, tcb->seq + 0x10000)) {
+			th->urg_ptr = htons(tp->snd_up - tcb->seq);
+			th->urg = 1;
+		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
+			th->urg_ptr = htons(0xFFFF);
+			th->urg = 1;
+		}
+	}
+
+	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
+	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
+		th->window      = htons(tcp_select_window(sk));
+		tcp_ecn_send(sk, skb, th, tcp_header_size);
+	} else {
+		/* RFC1323: The window in SYN & SYN/ACK segments
+		 * is never scaled.
+		 */
+		th->window	= htons(min(tp->rcv_wnd, 65535U));
+	}
+
+	tcp_options_write(th, tp, &opts);
+
+#ifdef CONFIG_TCP_MD5SIG
+	/* Calculate the MD5 hash, as we have all we need now */
+	if (md5) {
+		sk_gso_disable(sk);
+		tp->af_specific->calc_md5_hash(opts.hash_location,
+					       md5, sk, skb);
+	}
+#endif
+
+	/* BPF prog is the last one writing header option */
+	bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
+
+	INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
+			   tcp_v6_send_check, tcp_v4_send_check,
+			   sk, skb);
+
+	if (likely(tcb->tcp_flags & TCPHDR_ACK))
+		tcp_event_ack_sent(sk, rcv_nxt);
+
+	/* skb->len now includes the header: anything beyond it is payload. */
+	if (skb->len != tcp_header_size) {
+		tcp_event_data_sent(tp, sk);
+		tp->data_segs_out += tcp_skb_pcount(skb);
+		tp->bytes_sent += skb->len - tcp_header_size;
+	}
+
+	/* OUTSEGS counts segments carrying new sequence space and segments
+	 * with no sequence space at all, but not retransmissions.
+	 */
+	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
+		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
+			      tcp_skb_pcount(skb));
+
+	tp->segs_out += tcp_skb_pcount(skb);
+	skb_set_hash_from_sk(skb, sk);
+	/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
+	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
+	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
+
+	/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
+
+	/* Cleanup our debris for IP stacks */
+	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
+			       sizeof(struct inet6_skb_parm)));
+
+	tcp_add_tx_delay(skb, tp);
+
+	err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
+				 inet6_csk_xmit, ip_queue_xmit,
+				 sk, skb, &inet->cork.fl);
+
+	if (unlikely(err > 0)) {
+		tcp_enter_cwr(sk);
+		err = net_xmit_eval(err);
+	}
+	/* Post-send bookkeeping applies to the original skb, and only when a
+	 * clone was transmitted successfully.
+	 */
+	if (!err && oskb) {
+		tcp_update_skb_after_send(sk, oskb, prior_wstamp);
+		tcp_rate_skb_sent(sk, oskb);
+	}
+	return err;
+}
+
+/* Transmit @skb, acknowledging everything received so far: a thin wrapper
+ * around __tcp_transmit_skb() that supplies the socket's current rcv_nxt.
+ */
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+			    gfp_t gfp_mask)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, tp->rcv_nxt);
+}
+
+/* Append @skb to the socket's write queue without sending it.
+ *
+ * NOTE: the probe0 timer is not checked here; callers must not forget
+ * tcp_push_pending_frames(), otherwise the socket can stall.
+ */
+static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+	/* write_seq moves past the new segment before it is queued. */
+	WRITE_ONCE(tp->write_seq, end_seq);
+	__skb_header_release(skb);
+	tcp_add_write_queue_tail(sk, skb);
+
+	/* Account the buffer against the socket's send memory. */
+	sk_wmem_queued_add(sk, skb->truesize);
+	sk_mem_charge(sk, skb->truesize);
+}
+
+/* Initialize the TSO segment count and GSO size of @skb for @mss_now.
+ *
+ * A packet that fits in one MSS is a single segment with gso size 0;
+ * a larger one is split into ceil(len / mss_now) segments of mss_now.
+ */
+static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
+{
+	unsigned int segs = 1;
+	unsigned int gso_size = 0;
+
+	/* The costly divide is only paid in the TSO case. */
+	if (skb->len > mss_now) {
+		segs = DIV_ROUND_UP(skb->len, mss_now);
+		gso_size = mss_now;
+	}
+
+	tcp_skb_pcount_set(skb, segs);
+	TCP_SKB_CB(skb)->tcp_gso_size = gso_size;
+}
+
+/* The pcount of an skb in the middle of the write queue changed by @decr:
+ * fix up every per-socket packet counter that included it, according to
+ * the skb's SACK/retransmit/lost state.
+ */
+static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const u8 state = TCP_SKB_CB(skb)->sacked;
+
+	tp->packets_out -= decr;
+
+	if (state & TCPCB_SACKED_ACKED)
+		tp->sacked_out -= decr;
+	if (state & TCPCB_SACKED_RETRANS)
+		tp->retrans_out -= decr;
+	if (state & TCPCB_LOST)
+		tp->lost_out -= decr;
+
+	/* Reno case is special. Sigh... */
+	if (tcp_is_reno(tp) && decr > 0)
+		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
+
+	/* Keep the lost-count hint in step when a SACKed skb located before
+	 * the hint skb changed.
+	 */
+	if (tp->lost_skb_hint &&
+	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
+	    (state & TCPCB_SACKED_ACKED))
+		tp->lost_cnt_hint -= decr;
+
+	tcp_verify_left_out(tp);
+}
+
+/* True if @skb requests any tx timestamp: either the ACK timestamp flag
+ * in its control block or one of the SKBTX_ANY_TSTAMP flags.
+ */
+static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
+{
+	if (TCP_SKB_CB(skb)->txstamp_ack)
+		return true;
+
+	return (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP) != 0;
+}
+
+/* After @skb was split and @skb2 received its tail, move the tx timestamp
+ * request to @skb2 when the timestamp key is not before @skb2's first
+ * sequence number.
+ */
+static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
+{
+	struct skb_shared_info *head = skb_shinfo(skb);
+	struct skb_shared_info *tail;
+	u8 tsflags;
+
+	if (likely(!tcp_has_tx_tstamp(skb)))
+		return;
+
+	if (before(head->tskey, TCP_SKB_CB(skb2)->seq))
+		return;
+
+	tail = skb_shinfo(skb2);
+	tsflags = head->tx_flags & SKBTX_ANY_TSTAMP;
+
+	head->tx_flags &= ~tsflags;
+	tail->tx_flags |= tsflags;
+	swap(head->tskey, tail->tskey);
+
+	TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
+	TCP_SKB_CB(skb)->txstamp_ack = 0;
+}
+
+/* Hand the end-of-record mark from @skb over to @skb2, the new tail
+ * fragment, and clear it on @skb.
+ */
+static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
+{
+	struct tcp_skb_cb *head_cb = TCP_SKB_CB(skb);
+	struct tcp_skb_cb *tail_cb = TCP_SKB_CB(skb2);
+
+	tail_cb->eor = head_cb->eor;
+	head_cb->eor = 0;
+}
+
+/* Link @buff into the queue selected by @tcp_queue: directly after @skb
+ * on the write queue, or into the retransmit rbtree otherwise.
+ */
+static void tcp_insert_write_queue_after(struct sk_buff *skb,
+					 struct sk_buff *buff,
+					 struct sock *sk,
+					 enum tcp_queue tcp_queue)
+{
+	if (tcp_queue != TCP_FRAG_IN_WRITE_QUEUE) {
+		tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+		return;
+	}
+
+	__skb_queue_after(&sk->sk_write_queue, skb, buff);
+}
+
+/* Function to create two new TCP segments.  Shrinks the given segment
+ * to the specified size and appends a new segment with the rest of the
+ * packet to the list.  This won't be called frequently, I hope.
+ * Remember, these are still headerless SKBs at this point.
+ *
+ * @sk:        owning socket
+ * @tcp_queue: queue @skb currently sits on (write queue or rtx queue)
+ * @skb:       segment to split
+ * @len:       number of bytes that stay in @skb
+ * @mss_now:   MSS used to recompute the TSO factor of both halves
+ * @gfp:       allocation flags for the new segment
+ *
+ * Returns 0 on success, -EINVAL if @len exceeds the skb length, and
+ * -ENOMEM if the queue is over its limit, the skb cannot be uncloned,
+ * or the new skb cannot be allocated.
+ */
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+		 struct sk_buff *skb, u32 len,
+		 unsigned int mss_now, gfp_t gfp)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *buff;
+	int old_factor;
+	long limit;
+	int nlen;
+	u8 flags;
+
+	if (WARN_ON(len > skb->len))
+		return -EINVAL;
+
+	DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
+
+	/* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
+	 * We need some allowance to not penalize applications setting small
+	 * SO_SNDBUF values.
+	 * Also allow first and last skb in retransmit queue to be split.
+	 */
+	limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE);
+	if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
+		     tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
+		     skb != tcp_rtx_queue_head(sk) &&
+		     skb != tcp_rtx_queue_tail(sk))) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
+		return -ENOMEM;
+	}
+
+	if (skb_unclone_keeptruesize(skb, gfp))
+		return -ENOMEM;
+
+	/* Get a new skb... force flag on. */
+	buff = tcp_stream_alloc_skb(sk, gfp, true);
+	if (!buff)
+		return -ENOMEM; /* We'll just try again later. */
+	skb_copy_decrypted(buff, skb);
+	mptcp_skb_ext_copy(buff, skb);
+
+	/* Charge the new (still empty) skb, then move the truesize of the
+	 * nlen payload bytes from @skb to @buff.
+	 */
+	sk_wmem_queued_add(sk, buff->truesize);
+	sk_mem_charge(sk, buff->truesize);
+	nlen = skb->len - len;
+	buff->truesize += nlen;
+	skb->truesize -= nlen;
+
+	/* Correct the sequence numbers. */
+	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+	/* PSH and FIN should only be set in the second packet. */
+	flags = TCP_SKB_CB(skb)->tcp_flags;
+	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
+	TCP_SKB_CB(buff)->tcp_flags = flags;
+	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+	tcp_skb_fragment_eor(skb, buff);
+
+	skb_split(skb, buff, len);
+
+	skb_set_delivery_time(buff, skb->tstamp, true);
+	tcp_fragment_tstamp(skb, buff);
+
+	/* Sample the old TSO factor before it is recomputed below. */
+	old_factor = tcp_skb_pcount(skb);
+
+	/* Fix up tso_factor for both original and new SKB.  */
+	tcp_set_skb_tso_segs(skb, mss_now);
+	tcp_set_skb_tso_segs(buff, mss_now);
+
+	/* Update delivered info for the new segment */
+	TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
+
+	/* If this packet has been sent out already, we must
+	 * adjust the various packet counters.
+	 */
+	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
+		int diff = old_factor - tcp_skb_pcount(skb) -
+			tcp_skb_pcount(buff);
+
+		if (diff)
+			tcp_adjust_pcount(sk, skb, diff);
+	}
+
+	/* Link BUFF into the send queue. */
+	__skb_header_release(buff);
+	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
+	/* On the rtx queue the new skb also joins the time-sorted list,
+	 * right after the skb it was split from.
+	 */
+	if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
+		list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
+
+	return 0;
+}
+
+/* This is similar to __pskb_pull_tail(). The difference is that pulled
+ * data is not copied, but immediately discarded.
+ *
+ * Drops the first @len bytes of @skb's page fragments, compacting the
+ * fragment array in place.  The skb is expected to have no linear data.
+ * Returns @len.
+ */
+static int __pskb_trim_head(struct sk_buff *skb, int len)
+{
+	struct skb_shared_info *shinfo;
+	int i, k, eat;
+
+	DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
+	eat = len;	/* bytes still to discard */
+	k = 0;		/* next slot for a surviving fragment */
+	shinfo = skb_shinfo(skb);
+	for (i = 0; i < shinfo->nr_frags; i++) {
+		int size = skb_frag_size(&shinfo->frags[i]);
+
+		if (size <= eat) {
+			/* Fragment is consumed entirely: release it. */
+			skb_frag_unref(skb, i);
+			eat -= size;
+		} else {
+			/* Fragment survives; trim its front if needed. */
+			shinfo->frags[k] = shinfo->frags[i];
+			if (eat) {
+				skb_frag_off_add(&shinfo->frags[k], eat);
+				skb_frag_size_sub(&shinfo->frags[k], eat);
+				eat = 0;
+			}
+			k++;
+		}
+	}
+	shinfo->nr_frags = k;
+
+	skb->data_len -= len;
+	skb->len = skb->data_len;
+	return len;
+}
+
+/* Remove @len bytes of acked data from the front of @skb, a packet in
+ * the transmit queue, and give the freed memory back to the socket.
+ *
+ * Returns 0 on success or -ENOMEM if the skb could not be uncloned.
+ */
+int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
+{
+	u32 freed;
+
+	if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
+		return -ENOMEM;
+
+	freed = __pskb_trim_head(skb, len);
+
+	/* The segment now starts @len bytes further into the stream. */
+	TCP_SKB_CB(skb)->seq += len;
+
+	skb->truesize -= freed;
+	sk_wmem_queued_add(sk, -freed);
+	if (!skb_zcopy_pure(skb))
+		sk_mem_uncharge(sk, freed);
+
+	/* Any change of skb->len requires recalculation of tso factor. */
+	if (tcp_skb_pcount(skb) > 1)
+		tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
+
+	return 0;
+}
+
+/* Calculate MSS not accounting any TCP options.
+ *
+ * Converts path MTU @pmtu into an MSS: subtracts the network and bare TCP
+ * headers (plus the IPv6 fragment header when the route requires it),
+ * clamps to the negotiated mss_clamp, subtracts the extension header
+ * length and finally enforces the tcp_min_snd_mss sysctl as a floor.
+ */
+static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	int mss_now;
+
+	/* Calculate base mss without TCP options:
+	   It is MMS_S - sizeof(tcphdr) of rfc1122
+	 */
+	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+
+	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+	if (icsk->icsk_af_ops->net_frag_header_len) {
+		const struct dst_entry *dst = __sk_dst_get(sk);
+
+		if (dst && dst_allfrag(dst))
+			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
+	}
+
+	/* Clamp it (mss_clamp does not include tcp options) */
+	if (mss_now > tp->rx_opt.mss_clamp)
+		mss_now = tp->rx_opt.mss_clamp;
+
+	/* Now subtract optional transport overhead */
+	mss_now -= icsk->icsk_ext_hdr_len;
+
+	/* Never go below the floor configured by the tcp_min_snd_mss sysctl */
+	mss_now = max(mss_now,
+		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss));
+	return mss_now;
+}
+
+/* Convert path MTU @pmtu into an MSS that already leaves room for the
+ * TCP options counted in tp->tcp_header_len.  SACK blocks are not
+ * accounted for here.
+ */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+	/* Option bytes are whatever tcp_header_len holds beyond the bare
+	 * TCP header.
+	 */
+	const size_t options_len = tcp_sk(sk)->tcp_header_len -
+				   sizeof(struct tcphdr);
+
+	return __tcp_mtu_to_mss(sk, pmtu) - options_len;
+}
+EXPORT_SYMBOL(tcp_mtu_to_mss);
+
+/* Inverse of tcp_mtu_to_mss(): the MTU needed to carry @mss bytes of
+ * payload, i.e. @mss plus the TCP header (with options), the extension
+ * headers and the network header, plus the IPv6 fragment header when the
+ * route requires it.
+ */
+int tcp_mss_to_mtu(struct sock *sk, int mss)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	int mtu = mss;
+
+	mtu += tp->tcp_header_len;
+	mtu += icsk->icsk_ext_hdr_len;
+	mtu += icsk->icsk_af_ops->net_header_len;
+
+	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+	if (icsk->icsk_af_ops->net_frag_header_len) {
+		const struct dst_entry *dst = __sk_dst_get(sk);
+
+		if (dst && dst_allfrag(dst))
+			mtu += icsk->icsk_af_ops->net_frag_header_len;
+	}
+
+	return mtu;
+}
+EXPORT_SYMBOL(tcp_mss_to_mtu);
+
+/* Per-socket initialisation of MTU probing state.
+ *
+ * Probing is enabled only when the tcp_mtu_probing sysctl is above 1.
+ * The search range runs from the MTU matching the tcp_base_mss sysctl up
+ * to the MTU matching the negotiated mss_clamp.
+ */
+void tcp_mtup_init(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	int base_mss;
+
+	icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1;
+	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
+			       icsk->icsk_af_ops->net_header_len;
+
+	base_mss = READ_ONCE(net->ipv4.sysctl_tcp_base_mss);
+	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, base_mss);
+
+	icsk->icsk_mtup.probe_size = 0;
+	if (icsk->icsk_mtup.enabled)
+		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
+}
+EXPORT_SYMBOL(tcp_mtup_init);
+
+/* This function synchronizes the sending MSS with the current pmtu/exthdr
+   set.
+
+   tp->rx_opt.user_mss is the MSS set by the user via TCP_MAXSEG. It does
+   NOT account for TCP options, only for the bare TCP header.
+
+   tp->rx_opt.mss_clamp is the MSS negotiated at connection setup.
+   It is the minimum of user_mss and the MSS received with the SYN.
+   It does not include TCP options either.
+
+   inet_csk(sk)->icsk_pmtu_cookie is the last pmtu seen by this function.
+
+   tp->mss_cache is the current effective sending MSS, including
+   all TCP options except for SACKs. It is evaluated
+   taking the current pmtu into account, but never exceeds
+   tp->rx_opt.mss_clamp.
+
+   NOTE1. rfc1122 clearly states that the advertised MSS
+   DOES NOT include either tcp or ip options.
+
+   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
+   are READ ONLY outside this function.		--ANK (980731)
+ */
+unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now;
+
+	/* The probing upper bound can never exceed the known path MTU. */
+	if (pmtu < icsk->icsk_mtup.search_high)
+		icsk->icsk_mtup.search_high = pmtu;
+
+	mss_now = tcp_bound_to_half_wnd(tp, tcp_mtu_to_mss(sk, pmtu));
+
+	/* And store cached results */
+	icsk->icsk_pmtu_cookie = pmtu;
+	if (icsk->icsk_mtup.enabled) {
+		int probe_floor_mss = tcp_mtu_to_mss(sk,
+						     icsk->icsk_mtup.search_low);
+
+		/* While probing, stay at or below the MSS of search_low. */
+		mss_now = min(mss_now, probe_floor_mss);
+	}
+	tp->mss_cache = mss_now;
+
+	return mss_now;
+}
+EXPORT_SYMBOL(tcp_sync_mss);
+
+/* Compute the current effective MSS, taking SACKs and IP options,
+ * and even PMTU discovery events (dst MTU != icsk_pmtu_cookie) into account.
+ */
+unsigned int tcp_current_mss(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct dst_entry *dst = __sk_dst_get(sk);
+	u32 mss_now;
+	unsigned int header_len;
+	struct tcp_out_options opts;
+	struct tcp_md5sig_key *md5;
+
+	mss_now = tp->mss_cache;
+
+	if (dst) {
+		u32 mtu = dst_mtu(dst);
+		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
+			mss_now = tcp_sync_mss(sk, mtu);
+	}
+
+	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
+		     sizeof(struct tcphdr);
+	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
+	 * some common options. If this is an odd packet (because we have SACK
+	 * blocks etc) then our calculated header_len will be different, and
+	 * we have to shrink (or grow) mss_now by that difference. */
+	if (header_len != tp->tcp_header_len) {
+		int delta = (int) header_len - tp->tcp_header_len;
+		mss_now -= delta;
+	}
+
+	return mss_now;
+}
+
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * or if the application hit its sndbuf limit recently.
+ */
+static void tcp_cwnd_application_limited(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		/* App or rwnd limited: move cwnd halfway down to win_used. */
+		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+		u32 win_used = max(tp->snd_cwnd_used, init_win);
+		if (win_used < tcp_snd_cwnd(tp)) {
+			tp->snd_ssthresh = tcp_current_ssthresh(sk);
+			tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1);
+		}
+		tp->snd_cwnd_used = 0;
+	}
+	tp->snd_cwnd_stamp = tcp_jiffies32;
+}
+
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
+{
+	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Track the strongest available signal of the degree to which the cwnd
+	 * is fully utilized. If cwnd-limited then remember that fact for the
+	 * current window. If not cwnd-limited then track the maximum number of
+	 * outstanding packets in the current window. (If cwnd-limited then we
+	 * chose to not update tp->max_packets_out to avoid an extra else
+	 * clause with no functional impact.)
+	 */
+	if (!before(tp->snd_una, tp->cwnd_usage_seq) ||
+	    is_cwnd_limited ||
+	    (!tp->is_cwnd_limited &&
+	     tp->packets_out > tp->max_packets_out)) {
+		tp->is_cwnd_limited = is_cwnd_limited;
+		tp->max_packets_out = tp->packets_out;
+		tp->cwnd_usage_seq = tp->snd_nxt;
+	}
+
+	if (tcp_is_cwnd_limited(sk)) {
+		/* Network is fully fed. */
+		tp->snd_cwnd_used = 0;
+		tp->snd_cwnd_stamp = tcp_jiffies32;
+	} else {
+		/* Network starves: remember the peak number of packets out. */
+		if (tp->packets_out > tp->snd_cwnd_used)
+			tp->snd_cwnd_used = tp->packets_out;
+
+		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
+		    (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
+		    !ca_ops->cong_control)
+			tcp_cwnd_application_limited(sk);
+
+		/* The following conditions together indicate the starvation
+		 * is caused by insufficient sender buffer:
+		 * 1) just sent some data (see tcp_write_xmit)
+		 * 2) not cwnd limited (this else condition)
+		 * 3) no more data to send (tcp_write_queue_empty())
+		 * 4) application is hitting buffer limit (SOCK_NOSPACE)
+		 */
+		if (tcp_write_queue_empty(sk) && sk->sk_socket &&
+		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
+		    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+			tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
+	}
+}
+
+/* Minshall's variant of the Nagle send check: is a small segment unacked? */
+static bool tcp_minshall_check(const struct tcp_sock *tp)
+{
+	return after(tp->snd_sml, tp->snd_una) &&
+		!after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Update snd_sml if this skb ends with a sub-mss segment.
+ * Note that a TSO packet might end with a sub-mss segment.
+ * The test is really :
+ * if ((skb->len % mss) != 0)
+ *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+ * But we can avoid doing the divide again given we already have
+ *  skb_pcount = skb->len / mss_now (presumably rounded up; TODO confirm)
+ */
+static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+				const struct sk_buff *skb)
+{
+	if (skb->len < tcp_skb_pcount(skb) * mss_now)
+		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+}
+
+/* Return false if the packet can be sent now without violating Nagle's rules:
+ * 1. It is full sized. (provided by caller in %partial bool)
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ *    With Minshall's modification: all sent small packets are ACKed.
+ */
+static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+			    int nonagle)
+{
+	return partial &&
+		((nonagle & TCP_NAGLE_CORK) ||
+		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
+
+/* Return how many segs we'd like on a TSO packet,
+ * depending on current pacing rate, and how close the peer is.
+ *
+ * Rationale is:
+ * - For close peers, we rather send bigger packets to reduce
+ *   cpu costs, because occasional losses will be repaired fast.
+ * - For long distance/rtt flows, we would like to get ACK clocking
+ *   with 1 ACK per ms.
+ *
+ * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
+ * in bigger TSO bursts. We cut the RTT-based allowance in half
+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+			    int min_tso_segs)
+{
+	unsigned long bytes;
+	u32 r;
+
+	bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift);
+
+	r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
+	if (r < BITS_PER_TYPE(sk->sk_gso_max_size))	/* keep the shift in range */
+		bytes += sk->sk_gso_max_size >> r;
+
+	bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size);
+
+	return max_t(u32, bytes / mss_now, min_tso_segs);
+}
+
+/* Return the number of segments we want in the skb we are transmitting.
+ * The CA module may supply the minimum; the result is capped by gso_max_segs.
+ */
+static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+	u32 min_tso, tso_segs;
+
+	min_tso = ca_ops->min_tso_segs ?
+			ca_ops->min_tso_segs(sk) :
+			READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+
+	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
+	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
+}
+
+/* Returns the length of the leading part of skb that can be sent right away */
+static unsigned int tcp_mss_split_point(const struct sock *sk,
+					const struct sk_buff *skb,
+					unsigned int mss_now,
+					unsigned int max_segs,
+					int nonagle)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 partial, needed, window, max_len;
+
+	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+	max_len = mss_now * max_segs;
+
+	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
+		return max_len;
+
+	needed = min(skb->len, window);
+
+	if (max_len <= needed)
+		return max_len;
+
+	partial = needed % mss_now;
+	/* If the last segment is not a full MSS, check if Nagle rules allow us
+	 * to include this last segment in this skb.
+	 * Otherwise, we'll split the skb at the last MSS boundary.
+	 */
+	if (tcp_nagle_check(partial != 0, tp, nonagle))
+		return needed - partial;
+
+	return needed;
+}
+
+/* Can at least one segment of SKB be sent right now, according to the
+ * congestion window rules?  Returns the allowed segment count, 0 if none.
+ */
+static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
+					 const struct sk_buff *skb)
+{
+	u32 in_flight, cwnd, halfcwnd;
+
+	/* Don't be strict about the congestion window for the final FIN.  */
+	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
+	    tcp_skb_pcount(skb) == 1)
+		return 1;
+
+	in_flight = tcp_packets_in_flight(tp);
+	cwnd = tcp_snd_cwnd(tp);
+	if (in_flight >= cwnd)
+		return 0;
+
+	/* For better scheduling, ensure we have at least
+	 * 2 GSO packets in flight: hand out at most half of cwnd at once.
+	 */
+	halfcwnd = max(cwnd >> 1, 1U);
+	return min(halfcwnd, cwnd - in_flight);
+}
+
+/* Initialize TSO state of a skb.
+ * This must be invoked the first time we consider transmitting
+ * SKB onto the wire. Returns the skb's (possibly recomputed) segment count.
+ */
+static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
+{
+	int tso_segs = tcp_skb_pcount(skb);
+
+	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
+		tcp_set_skb_tso_segs(skb, mss_now);
+		tso_segs = tcp_skb_pcount(skb);
+	}
+	return tso_segs;
+}
+
+
+/* Return true if the Nagle test allows this packet to be
+ * sent now (pushed, urgent, FIN, or passing tcp_nagle_check()).
+ */
+static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+				  unsigned int cur_mss, int nonagle)
+{
+	/* Nagle rule does not apply to frames, which sit in the middle of the
+	 * write_queue (they have no chances to get new data).
+	 *
+	 * This is implemented in the callers, where they modify the 'nonagle'
+	 * argument based upon the location of SKB in the send queue.
+	 */
+	if (nonagle & TCP_NAGLE_PUSH)
+		return true;
+
+	/* Don't use the nagle rule for urgent data (or for the final FIN). */
+	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+		return true;
+
+	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
+		return true;
+
+	return false;
+}
+
+/* Does at least the first segment of SKB fit into the send window? */
+static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
+			     const struct sk_buff *skb,
+			     unsigned int cur_mss)
+{
+	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+	if (skb->len > cur_mss)	/* only the first cur_mss bytes must fit */
+		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
+
+	return !after(end_seq, tcp_wnd_end(tp));
+}
+
+/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
+ * which is put after SKB on the list.  It is very much like
+ * tcp_fragment() except that it may make several assumptions to speed
+ * up the split: all the data is in scatter-gather pages, and the packet
+ * has never been sent out before (and thus is not cloned).
+ * Returns 0 on success or -ENOMEM if the new skb cannot be allocated.
+ */
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+			unsigned int mss_now, gfp_t gfp)
+{
+	int nlen = skb->len - len;
+	struct sk_buff *buff;
+	u8 flags;
+
+	/* All of a TSO frame must be composed of paged data.  */
+	DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len);
+
+	buff = tcp_stream_alloc_skb(sk, gfp, true);
+	if (unlikely(!buff))
+		return -ENOMEM;
+	skb_copy_decrypted(buff, skb);
+	mptcp_skb_ext_copy(buff, skb);
+
+	sk_wmem_queued_add(sk, buff->truesize);
+	sk_mem_charge(sk, buff->truesize);
+	buff->truesize += nlen;
+	skb->truesize -= nlen;
+
+	/* Correct the sequence numbers: buff covers [seq + len, end_seq). */
+	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+	/* PSH and FIN should only be set in the second packet. */
+	flags = TCP_SKB_CB(skb)->tcp_flags;
+	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
+	TCP_SKB_CB(buff)->tcp_flags = flags;
+
+	tcp_skb_fragment_eor(skb, buff);
+
+	skb_split(skb, buff, len);
+	tcp_fragment_tstamp(skb, buff);
+
+	/* Fix up tso_factor for both original and new SKB.  */
+	tcp_set_skb_tso_segs(skb, mss_now);
+	tcp_set_skb_tso_segs(buff, mss_now);
+
+	/* Link BUFF into the send queue, right after SKB. */
+	__skb_header_release(buff);
+	tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
+
+	return 0;
+}
+
+/* Try to defer sending, if possible, in order to minimize the amount
+ * of TSO splitting we do.  View it as a kind of TSO Nagle test.
+ * Returns true to defer, false to send now.
+ * This algorithm is from John Heffner.
+ */
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+				 bool *is_cwnd_limited,
+				 bool *is_rwnd_limited,
+				 u32 max_segs)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	u32 send_win, cong_win, limit, in_flight;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *head;
+	int win_divisor;
+	s64 delta;
+
+	if (icsk->icsk_ca_state >= TCP_CA_Recovery)
+		goto send_now;
+
+	/* Avoid bursty behavior by allowing defer
+	 * only if the last write was recent (1 ms).
+	 * Note that tp->tcp_wstamp_ns can be in the future if we have
+	 * packets waiting in a qdisc or device for EDT delivery.
+	 */
+	delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
+	if (delta > 0)
+		goto send_now;
+
+	in_flight = tcp_packets_in_flight(tp);
+
+	BUG_ON(tcp_skb_pcount(skb) <= 1);
+	BUG_ON(tcp_snd_cwnd(tp) <= in_flight);
+
+	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+
+	/* From in_flight test above, we know that cwnd > in_flight.  */
+	cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache;
+
+	limit = min(send_win, cong_win);
+
+	/* If a full-sized TSO skb can be sent, do it. */
+	if (limit >= max_segs * tp->mss_cache)
+		goto send_now;
+
+	/* Middle in queue won't get any more data, full sendable already? */
+	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
+		goto send_now;
+
+	win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
+	if (win_divisor) {
+		u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache);
+
+		/* If at least some fraction of a window is available,
+		 * just use it.
+		 */
+		chunk /= win_divisor;
+		if (limit >= chunk)
+			goto send_now;
+	} else {
+		/* Different approach, try not to defer past a single
+		 * ACK.  Receiver should ACK every other full sized
+		 * frame, so if we have space for more than 3 frames
+		 * then send now.
+		 */
+		if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
+			goto send_now;
+	}
+
+	/* TODO : use tsorted_sent_queue ? */
+	head = tcp_rtx_queue_head(sk);
+	if (!head)
+		goto send_now;
+	delta = tp->tcp_clock_cache - head->tstamp;
+	/* If next ACK is likely to come too late (half srtt), do not defer */
+	if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
+		goto send_now;
+
+	/* Ok, it looks like it is advisable to defer.
+	 * Three cases are tracked :
+	 * 1) We are cwnd-limited
+	 * 2) We are rwnd-limited
+	 * 3) We are application limited.
+	 */
+	if (cong_win < send_win) {
+		if (cong_win <= skb->len) {
+			*is_cwnd_limited = true;
+			return true;
+		}
+	} else {
+		if (send_win <= skb->len) {
+			*is_rwnd_limited = true;
+			return true;
+		}
+	}
+
+	/* If this packet won't get more data, do not wait. */
+	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
+	    TCP_SKB_CB(skb)->eor)
+		goto send_now;
+
+	return true;
+
+send_now:
+	return false;
+}
+
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	u32 interval;
+	s32 delta;
+
+	interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval);
+	delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
+	if (unlikely(delta >= interval * HZ)) {
+		int mss = tcp_current_mss(sk);
+
+		/* Drop probe state; restart the range from the current mss */
+		icsk->icsk_mtup.probe_size = 0;
+		icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+			sizeof(struct tcphdr) +
+			icsk->icsk_af_ops->net_header_len;
+		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+		/* Update probe time stamp so the interval starts over */
+		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
+	}
+}
+
+static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
+{
+	struct sk_buff *skb, *next;
+
+	skb = tcp_send_head(sk);
+	tcp_for_write_queue_from_safe(skb, next, sk) {
+		if (len <= skb->len)
+			break;	/* remaining len is covered by this skb */
+
+		if (unlikely(TCP_SKB_CB(skb)->eor) ||
+		    tcp_has_tx_tstamp(skb) ||
+		    !skb_pure_zcopy_same(skb, next))
+			return false;	/* eor, tx tstamp or zcopy mismatch */
+
+		len -= skb->len;
+	}
+
+	return true;
+}
+
+static int tcp_clone_payload(struct sock *sk, struct sk_buff *to,
+			     int probe_size)
+{
+	skb_frag_t *lastfrag = NULL, *fragto = skb_shinfo(to)->frags;
+	int i, todo, len = 0, nr_frags = 0;
+	const struct sk_buff *skb;
+
+	if (!sk_wmem_schedule(sk, to->truesize + probe_size))
+		return -ENOMEM;
+
+	skb_queue_walk(&sk->sk_write_queue, skb) {
+		const skb_frag_t *fragfrom = skb_shinfo(skb)->frags;
+
+		if (skb_headlen(skb))
+			return -EINVAL;	/* skb_headlen() != 0: give up */
+
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) {
+			if (len >= probe_size)
+				goto commit;	/* collected probe_size bytes */
+			todo = min_t(int, skb_frag_size(fragfrom),
+				     probe_size - len);
+			len += todo;
+			if (lastfrag &&
+			    skb_frag_page(fragfrom) == skb_frag_page(lastfrag) &&
+			    skb_frag_off(fragfrom) == skb_frag_off(lastfrag) +
+						      skb_frag_size(lastfrag)) {
+				skb_frag_size_add(lastfrag, todo);
+				continue;	/* contiguous: merged into lastfrag */
+			}
+			if (unlikely(nr_frags == MAX_SKB_FRAGS))
+				return -E2BIG;	/* no free frag slot in 'to' */
+			skb_frag_page_copy(fragto, fragfrom);
+			skb_frag_off_copy(fragto, fragfrom);
+			skb_frag_size_set(fragto, todo);
+			nr_frags++;
+			lastfrag = fragto++;
+		}
+	}
+commit:
+	WARN_ON_ONCE(len != probe_size);
+	for (i = 0; i < nr_frags; i++)
+		skb_frag_ref(to, i);
+
+	skb_shinfo(to)->nr_frags = nr_frags;
+	to->truesize += probe_size;
+	to->len += probe_size;
+	to->data_len += probe_size;
+	__skb_header_release(to);
+	return 0;
+}
+
+/* Create a new MTU probe if we are ready.
+ * MTU probe is regularly attempting to increase the path MTU by
+ * deliberately sending larger packets.  This discovers routing
+ * changes resulting in larger path MTUs.
+ *
+ * Returns 0 if we should wait to probe (no cwnd available),
+ *         1 if a probe was sent,
+ *         -1 otherwise
+ */
+static int tcp_mtu_probe(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb, *nskb, *next;
+	struct net *net = sock_net(sk);
+	int probe_size;
+	int size_needed;
+	int copy, len;
+	int mss_now;
+	int interval;
+
+	/* Not currently probing/verifying,
+	 * not in recovery,
+	 * have enough cwnd, and
+	 * not SACKing (the variable headers throw things off)
+	 */
+	if (likely(!icsk->icsk_mtup.enabled ||
+		   icsk->icsk_mtup.probe_size ||
+		   inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+		   tcp_snd_cwnd(tp) < 11 ||
+		   tp->rx_opt.num_sacks || tp->rx_opt.dsack))
+		return -1;
+
+	/* Use binary search for probe_size between tcp_mss_base,
+	 * and current mss_clamp. If (search_high - search_low) is
+	 * smaller than a threshold, back off from probing.
+	 */
+	mss_now = tcp_current_mss(sk);
+	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+				    icsk->icsk_mtup.search_low) >> 1);
+	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
+	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+	/* When misfortune happens, we are reprobing actively,
+	 * and then reprobe timer has expired. We stick with current
+	 * probing process by not resetting search range to its original.
+	 */
+	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+	    interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) {
+		/* Check whether enough time has elapsed for
+		 * another round of probing.
+		 */
+		tcp_mtu_check_reprobe(sk);
+		return -1;
+	}
+
+	/* Have enough data in the send queue to probe? */
+	if (tp->write_seq - tp->snd_nxt < size_needed)
+		return -1;
+
+	if (tp->snd_wnd < size_needed)
+		return -1;
+	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
+		return 0;
+
+	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
+	if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) {
+		if (!tcp_packets_in_flight(tp))
+			return -1;
+		else
+			return 0;
+	}
+
+	if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
+		return -1;
+
+	/* We're allowed to probe.  Build it now. */
+	nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false);
+	if (!nskb)
+		return -1;
+
+	/* Build the payload, and be prepared to abort if this fails. */
+	if (tcp_clone_payload(sk, nskb, probe_size)) {
+		tcp_skb_tsorted_anchor_cleanup(nskb);
+		consume_skb(nskb);
+		return -1;
+	}
+	sk_wmem_queued_add(sk, nskb->truesize);
+	sk_mem_charge(sk, nskb->truesize);
+
+	skb = tcp_send_head(sk);
+	skb_copy_decrypted(nskb, skb);
+	mptcp_skb_ext_copy(nskb, skb);
+
+	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
+
+	tcp_insert_write_queue_before(nskb, skb, sk);
+	tcp_highest_sack_replace(sk, skb, nskb);
+
+	len = 0;
+	tcp_for_write_queue_from_safe(skb, next, sk) {
+		copy = min_t(int, skb->len, probe_size - len);
+
+		if (skb->len <= copy) {
+			/* We've eaten all the data from this skb.
+			 * Throw it away. */
+			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+			/* If this is the last SKB we copy and eor is set
+			 * we need to propagate it to the new skb.
+			 */
+			TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
+			tcp_skb_collapse_tstamp(nskb, skb);
+			tcp_unlink_write_queue(skb, sk);
+			tcp_wmem_free_skb(sk, skb);
+		} else {
+			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
+						   ~(TCPHDR_FIN|TCPHDR_PSH);
+			__pskb_trim_head(skb, copy);
+			tcp_set_skb_tso_segs(skb, mss_now);
+			TCP_SKB_CB(skb)->seq += copy;
+		}
+
+		len += copy;
+
+		if (len >= probe_size)
+			break;
+	}
+	tcp_init_tso_segs(nskb, nskb->len);
+
+	/* We're ready to send.  If this fails, the probe will
+	 * be resegmented into mss-sized pieces by tcp_write_xmit().
+	 */
+	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+		/* Decrement cwnd here because we are sending
+		 * effectively two packets. */
+		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
+		tcp_event_new_data_sent(sk, nskb);
+
+		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
+		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+
+		return 1;
+	}
+
+	return -1;
+}
+
+static bool tcp_pacing_check(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tcp_needs_internal_pacing(sk))
+		return false;
+
+	if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
+		return false;	/* tcp_wstamp_ns is not ahead of the clock */
+
+	if (!hrtimer_is_queued(&tp->pacing_timer)) {
+		hrtimer_start(&tp->pacing_timer,
+			      ns_to_ktime(tp->tcp_wstamp_ns),
+			      HRTIMER_MODE_ABS_PINNED_SOFT);
+		sock_hold(sk);	/* NOTE(review): presumably put by the timer */
+	}
+	return true;	/* caller must hold off until tcp_wstamp_ns */
+}
+
+static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk)
+{
+	const struct rb_node *node = sk->tcp_rtx_queue.rb_node;
+
+	/* No skb in the rtx queue: the rb-tree root is NULL. */
+	if (!node)
+		return true;
+
+	/* Only one skb in rtx queue: the root has no children. */
+	return !node->rb_left && !node->rb_right;
+}
+
+/* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * (These limits are doubled for retransmits)
+ * This allows for :
+ *  - better RTT estimation and ACK scheduling
+ *  - faster recovery
+ *  - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate, e.g. wifi aggregation (802.11 AMPDU).
+ * Returns true when the skb must wait (sk_wmem_alloc above the limit).
+ */
+static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
+				  unsigned int factor)
+{
+	unsigned long limit;
+
+	limit = max_t(unsigned long,
+		      2 * skb->truesize,
+		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
+	if (sk->sk_pacing_status == SK_PACING_NONE)
+		limit = min_t(unsigned long, limit,
+			      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
+	limit <<= factor;
+
+	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
+	    tcp_sk(sk)->tcp_tx_delay) {
+		u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
+
+		/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
+		 * approximate our needs assuming an ~100% skb->truesize overhead.
+		 * USEC_PER_SEC is approximated by 2^20.
+		 * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
+		 */
+		extra_bytes >>= (20 - 1);
+		limit += extra_bytes;
+	}
+	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
+		/* Always send skb if rtx queue is empty or has one skb.
+		 * No need to wait for TX completion to call us back,
+		 * after softirq/tasklet schedule.
+		 * This helps when TX completions are delayed too much.
+		 */
+		if (tcp_rtx_queue_empty_or_single_skb(sk))
+			return false;
+
+		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+		/* It is possible TX completion already happened
+		 * before we set TSQ_THROTTLED, so we must
+		 * test the condition again.
+		 */
+		smp_mb__after_atomic();
+		if (refcount_read(&sk->sk_wmem_alloc) > limit)
+			return true;
+	}
+	return false;
+}
+
+static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
+{
+	const u32 now = tcp_jiffies32;
+	enum tcp_chrono old = tp->chrono_type;
+
+	if (old > TCP_CHRONO_UNSPEC)	/* bank time spent in the old chrono */
+		tp->chrono_stat[old - 1] += now - tp->chrono_start;
+	tp->chrono_start = now;
+	tp->chrono_type = new;	/* timing of the new type starts at 'now' */
+}
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* If there are multiple conditions worthy of tracking in a
+	 * chronograph then the highest priority enum takes precedence
+	 * over the other conditions, so that if something "more interesting"
+	 * starts happening, we stop the previous chrono and start a new one.
+	 */
+	if (type > tp->chrono_type)
+		tcp_chrono_set(tp, type);
+}
+
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+
+	/* There are multiple conditions worthy of tracking in a
+	 * chronograph, so that the highest priority enum takes
+	 * precedence over the other conditions (see tcp_chrono_start).
+	 * If a condition stops, we only stop chrono tracking if
+	 * it's the "most interesting" or current chrono we are
+	 * tracking, and we start the busy chrono if data is still pending.
+	 */
+	if (tcp_rtx_and_write_queues_empty(sk))
+		tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
+	else if (type == tp->chrono_type)
+		tcp_chrono_set(tp, TCP_CHRONO_BUSY);
+}
+
+/* This routine writes packets to the network.  It advances the
+ * send_head.  This happens as incoming acks open up the remote
+ * window for us.
+ *
+ * LARGESEND note: !tcp_urg_mode is overkill, only frames between
+ * snd_up-64k-mss .. snd_up cannot be large. However, taking into
+ * account rare use of URG, this is not a big flaw.
+ *
+ * Send at most one packet when push_one > 0. Temporarily ignore
+ * cwnd limit to force at most one packet out when push_one == 2.
+ *
+ * Returns true, if no segments are in flight and we have queued segments,
+ * but cannot send anything now because of SWS or another problem.
+ */
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+			   int push_one, gfp_t gfp)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	unsigned int tso_segs, sent_pkts;
+	int cwnd_quota;
+	int result;
+	bool is_cwnd_limited = false, is_rwnd_limited = false;
+	u32 max_segs;
+
+	sent_pkts = 0;
+
+	tcp_mstamp_refresh(tp);
+	if (!push_one) {
+		/* Do MTU probing. */
+		result = tcp_mtu_probe(sk);
+		if (!result) {
+			return false;	/* probe must wait: send nothing now */
+		} else if (result > 0) {
+			sent_pkts = 1;
+		}
+	}
+
+	max_segs = tcp_tso_segs(sk, mss_now);
+	while ((skb = tcp_send_head(sk))) {
+		unsigned int limit;
+
+		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+			/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
+			tp->tcp_wstamp_ns = tp->tcp_clock_cache;
+			skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
+			list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+			tcp_init_tso_segs(skb, mss_now);
+			goto repair; /* Skip network transmission */
+		}
+
+		if (tcp_pacing_check(sk))
+			break;
+
+		tso_segs = tcp_init_tso_segs(skb, mss_now);
+		BUG_ON(!tso_segs);
+
+		cwnd_quota = tcp_cwnd_test(tp, skb);
+		if (!cwnd_quota) {
+			if (push_one == 2)
+				/* Force out a loss probe pkt. */
+				cwnd_quota = 1;
+			else
+				break;
+		}
+
+		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
+			is_rwnd_limited = true;
+			break;
+		}
+
+		if (tso_segs == 1) {
+			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
+						     (tcp_skb_is_last(sk, skb) ?
+						      nonagle : TCP_NAGLE_PUSH))))
+				break;
+		} else {
+			if (!push_one &&
+			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+						 &is_rwnd_limited, max_segs))
+				break;
+		}
+
+		limit = mss_now;
+		if (tso_segs > 1 && !tcp_urg_mode(tp))
+			limit = tcp_mss_split_point(sk, skb, mss_now,
+						    min_t(unsigned int,
+							  cwnd_quota,
+							  max_segs),
+						    nonagle);
+
+		if (skb->len > limit &&
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+			break;
+
+		if (tcp_small_queue_check(sk, skb, 0))
+			break;
+
+		/* Argh, we hit an empty skb(), presumably a thread
+		 * is sleeping in sendmsg()/sk_stream_wait_memory().
+		 * We do not want to send a pure-ack packet and have
+		 * a strange looking rtx queue with empty packet(s).
+		 */
+		if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
+			break;
+
+		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
+			break;
+
+repair:
+		/* Advance the send_head.  This one is sent out.
+		 * This call will increment packets_out.
+		 */
+		tcp_event_new_data_sent(sk, skb);
+
+		tcp_minshall_update(tp, mss_now, skb);
+		sent_pkts += tcp_skb_pcount(skb);
+
+		if (push_one)
+			break;
+	}
+
+	if (is_rwnd_limited)
+		tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
+	else
+		tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
+
+	is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp));
+	if (likely(sent_pkts || is_cwnd_limited))
+		tcp_cwnd_validate(sk, is_cwnd_limited);
+
+	if (likely(sent_pkts)) {
+		if (tcp_in_cwnd_reduction(sk))
+			tp->prr_out += sent_pkts;
+
+		/* Send one loss probe per tail loss episode. */
+		if (push_one != 2)
+			tcp_schedule_loss_probe(sk, false);
+		return false;
+	}
+	return !tp->packets_out && !tcp_write_queue_empty(sk);
+}
+
+bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 timeout, timeout_us, rto_delta_us;
+	int early_retrans;
+
+	/* Don't do any loss probe on a Fast Open connection before 3WHS
+	 * finishes.
+	 */
+	if (rcu_access_pointer(tp->fastopen_rsk))
+		return false;
+
+	early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans);
+	/* Schedule a loss probe in 2*RTT for SACK capable connections
+	 * not in loss recovery, that are either limited by cwnd or application.
+	 */
+	if ((early_retrans != 3 && early_retrans != 4) ||
+	    !tp->packets_out || !tcp_is_sack(tp) ||
+	    (icsk->icsk_ca_state != TCP_CA_Open &&
+	     icsk->icsk_ca_state != TCP_CA_CWR))
+		return false;
+
+	/* Probe timeout is 2*rtt. Add minimum RTO to account
+	 * for delayed ack when there's one outstanding packet. If no RTT
+	 * sample is available then probe after TCP_TIMEOUT_INIT.
+	 */
+	if (tp->srtt_us) {
+		timeout_us = tp->srtt_us >> 2;
+		if (tp->packets_out == 1)
+			timeout_us += tcp_rto_min_us(sk);
+		else
+			timeout_us += TCP_TIMEOUT_MIN_US;
+		timeout = usecs_to_jiffies(timeout_us);
+	} else {
+		timeout = TCP_TIMEOUT_INIT;
+	}
+
+	/* If the RTO formula yields an earlier time, then use that time. */
+	rto_delta_us = advancing_rto ?
+			jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
+			tcp_rto_delta_us(sk);  /* How far in future is RTO? */
+	if (rto_delta_us > 0)
+		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
+
+	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
+	return true;	/* loss probe timer has been (re)armed */
+}
+
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue, in which case there is
+ * very little point doing a retransmit: returns true then, false otherwise.
+ */
+static bool skb_still_in_host_queue(struct sock *sk,
+				    const struct sk_buff *skb)
+{
+	if (unlikely(skb_fclone_busy(sk, skb))) {
+		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+		smp_mb__after_atomic(); /* set_bit() before the re-check */
+		if (skb_fclone_busy(sk, skb)) {
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+			return true;
+		}
+	}
+	return false;
+}
+
+/* When probe timeout (PTO) fires, try to send a new segment if possible,
+ * else retransmit the last segment.
+ */
+void tcp_send_loss_probe(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	int pcount;
+	int mss = tcp_current_mss(sk);
+
+	/* At most one outstanding TLP */
+	if (tp->tlp_high_seq)
+		goto rearm_timer;
+
+	tp->tlp_retrans = 0;
+	skb = tcp_send_head(sk);
+	if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
+		pcount = tp->packets_out;
+		tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+		if (tp->packets_out > pcount) /* new data went out */
+			goto probe_sent;
+		goto rearm_timer;
+	}
+	skb = skb_rb_last(&sk->tcp_rtx_queue);
+	if (unlikely(!skb)) {
+		WARN_ONCE(tp->packets_out,
+			  "invalid inflight: %u state %u cwnd %u mss %d\n",
+			  tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss);
+		inet_csk(sk)->icsk_pending = 0;
+		return;
+	}
+
+	if (skb_still_in_host_queue(sk, skb))
+		goto rearm_timer;
+
+	pcount = tcp_skb_pcount(skb);
+	if (WARN_ON(!pcount))
+		goto rearm_timer;
+
+	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
+		if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+					  (pcount - 1) * mss, mss,
+					  GFP_ATOMIC)))
+			goto rearm_timer;
+		skb = skb_rb_next(skb); /* presumably the split-off tail */
+	}
+
+	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
+		goto rearm_timer;
+
+	if (__tcp_retransmit_skb(sk, skb, 1))
+		goto rearm_timer;
+
+	tp->tlp_retrans = 1;
+
+probe_sent:
+	/* Record snd_nxt for loss detection. */
+	tp->tlp_high_seq = tp->snd_nxt;
+
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
+	/* Reset s.t. tcp_rearm_rto will restart timer from now */
+	inet_csk(sk)->icsk_pending = 0;
+rearm_timer:
+	tcp_rearm_rto(sk);
+}
+
+/* Transmit frames that were held back by TCP_CORK or by an attempt at
+ * coalescing tiny packets, and check the probe timer when tcp_write_xmit()
+ * returns true. The socket must be locked by the caller.
+ */
+void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
+			       int nonagle)
+{
+	gfp_t gfp;
+
+	/* Once closed, the queued bytes stay here until closedown finishes
+	 * and the write queue is emptied.
+	 */
+	if (unlikely(sk->sk_state == TCP_CLOSE))
+		return;
+	gfp = sk_gfp_mask(sk, GFP_ATOMIC);
+	if (tcp_write_xmit(sk, cur_mss, nonagle, 0, gfp))
+		tcp_check_probe_timer(sk);
+}
+
+/* Transmit only the skb sitting at the send head (push_one == 1). A true
+ * push of pending frames is still required to set up the probe timer etc.
+ */
+void tcp_push_one(struct sock *sk, unsigned int mss_now)
+{
+	const struct sk_buff *head = tcp_send_head(sk);
+
+	/* A send head holding at least mss_now bytes is a hard requirement. */
+	BUG_ON(!head || mss_now > head->len);
+	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
+}
+
+/* This function returns the amount that we can raise the
+ * usable window based on the following constraints
+ *
+ * 1. The window can never be shrunk once it is offered (RFC 793)
+ * 2. We limit memory per socket
+ *
+ * RFC 1122:
+ * "the suggested [SWS] avoidance algorithm for the receiver is to keep
+ *  RECV.NEXT + RCV.WIN fixed until:
+ *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
+ *
+ * i.e. don't raise the right edge of the window until you can raise
+ * it at least MSS bytes.
+ *
+ * Unfortunately, the recommended algorithm breaks header prediction,
+ * since header prediction assumes th->window stays fixed.
+ *
+ * Strictly speaking, keeping th->window fixed violates the receiver
+ * side SWS prevention criteria. The problem is that under this rule
+ * a stream of single byte packets will cause the right side of the
+ * window to always advance by a single byte.
+ *
+ * Of course, if the sender implements sender side SWS prevention
+ * then this will not be a problem.
+ *
+ * BSD seems to make the following compromise:
+ *
+ *	If the free space is less than the 1/4 of the maximum
+ *	space available and the free space is less than 1/2 mss,
+ *	then set the window to 0.
+ *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
+ *	Otherwise, just prevent the window from shrinking
+ *	and from being larger than the largest representable value.
+ *
+ * This prevents incremental opening of the window in the regime
+ * where TCP is limited by the speed of the reader side taking
+ * data out of the TCP receive queue. It does nothing about
+ * those cases where the window is constrained on the sender side
+ * because the pipeline is full.
+ *
+ * BSD also seems to "accidentally" limit itself to windows that are a
+ * multiple of MSS, at least until the free space gets quite small.
+ * This would appear to be a side effect of the mbuf implementation.
+ * Combining these two algorithms results in the observed behavior
+ * of having a fixed window size at almost all times.
+ *
+ * Below we obtain similar behavior by forcing the offered window to
+ * a multiple of the mss when it is feasible to do so.
+ *
+ * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
+ * Regular options like TIMESTAMP are taken into account.
+ */
+u32 __tcp_select_window(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	/* MSS for the peer's data.  Previous versions used mss_clamp
+	 * here.  I don't know if the value based on our guesses
+	 * of peer's MSS is better for the performance.  It's more correct
+	 * but may be worse for the performance because of rcv_mss
+	 * fluctuations.  --SAW  1998/11/1
+	 */
+	int mss = icsk->icsk_ack.rcv_mss;
+	int free_space = tcp_space(sk);
+	int allowed_space = tcp_full_space(sk);
+	int full_space, window;
+
+	if (sk_is_mptcp(sk))
+		mptcp_space(sk, &free_space, &allowed_space);
+
+	full_space = min_t(int, tp->window_clamp, allowed_space);
+
+	if (unlikely(mss > full_space)) {
+		mss = full_space;
+		if (mss <= 0)
+			return 0;
+	}
+
+	/* Only allow window shrink if the sysctl is enabled and we have
+	 * a non-zero scaling factor in effect.
+	 */
+	if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale)
+		goto shrink_window_allowed;
+
+	/* Default path: do not allow the window to shrink. */
+
+	if (free_space < (full_space >> 1)) {
+		icsk->icsk_ack.quick = 0;
+
+		if (tcp_under_memory_pressure(sk))
+			tcp_adjust_rcv_ssthresh(sk);
+
+		/* free_space might become our new window, make sure we don't
+		 * increase it due to wscale.
+		 */
+		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
+
+		/* if free space is less than mss estimate, or is below 1/16th
+		 * of the maximum allowed, try to move to zero-window, else
+		 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
+		 * new incoming data is dropped due to memory limits.
+		 * With large window, mss test triggers way too late in order
+		 * to announce zero window in time before rmem limit kicks in.
+		 */
+		if (free_space < (allowed_space >> 4) || free_space < mss)
+			return 0;
+	}
+
+	if (free_space > tp->rcv_ssthresh)
+		free_space = tp->rcv_ssthresh;
+
+	/* Don't do rounding if we are using window scaling, since the
+	 * scaled window will not line up with the MSS boundary anyway.
+	 */
+	if (tp->rx_opt.rcv_wscale) {
+		window = free_space;
+
+		/* Advertise enough space so that it won't get scaled away.
+		 * Important case: prevent zero window announcement if
+		 * 1<<rcv_wscale > mss.
+		 */
+		window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
+	} else {
+		window = tp->rcv_wnd;
+		/* Get the largest window that is a nice multiple of mss.
+		 * Window clamp already applied above.
+		 * If our current window offering is within 1 mss of the
+		 * free space we just keep it. This prevents the divide
+		 * and multiply from happening most of the time.
+		 * We also don't do any window rounding when the free space
+		 * is too small.
+		 */
+		if (window <= free_space - mss || window > free_space)
+			window = rounddown(free_space, mss);
+		else if (mss == full_space &&
+			 free_space > window + (full_space >> 1))
+			window = free_space;
+	}
+
+	return window;
+
+shrink_window_allowed:
+	/* new window should always be an exact multiple of scaling factor */
+	free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
+
+	if (free_space < (full_space >> 1)) {
+		icsk->icsk_ack.quick = 0;
+
+		if (tcp_under_memory_pressure(sk))
+			tcp_adjust_rcv_ssthresh(sk);
+
+		/* if free space is too low, return a zero window */
+		if (free_space < (allowed_space >> 4) || free_space < mss ||
+			free_space < (1 << tp->rx_opt.rcv_wscale))
+			return 0;
+	}
+
+	if (free_space > tp->rcv_ssthresh) {
+		free_space = tp->rcv_ssthresh;
+		/* new window should always be an exact multiple of scaling factor
+		 *
+		 * For this case, we ALIGN "up" (increase free_space) because
+		 * we know free_space is not zero here, it has been reduced from
+		 * the memory-based limit, and rcv_ssthresh is not a hard limit
+		 * (unlike sk_rcvbuf).
+		 */
+		free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale));
+	}
+
+	return free_space;
+}
+
+/* Carry the tx timestamp requests of @next_skb over to @skb, if any. */
+void tcp_skb_collapse_tstamp(struct sk_buff *skb,
+			     const struct sk_buff *next_skb)
+{
+	const struct skb_shared_info *from = skb_shinfo(next_skb);
+	struct skb_shared_info *to = skb_shinfo(skb);
+
+	if (likely(!tcp_has_tx_tstamp(next_skb)))
+		return;
+
+	to->tx_flags |= from->tx_flags & SKBTX_ANY_TSTAMP;
+	to->tskey = from->tskey;
+	TCP_SKB_CB(skb)->txstamp_ack |= TCP_SKB_CB(next_skb)->txstamp_ack;
+}
+
+/* Merge @skb's successor in the rtx queue into @skb; false on failure. */
+static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *next_skb = skb_rb_next(skb);
+	int next_skb_size;
+
+	next_skb_size = next_skb->len;
+
+	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
+
+	if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size))
+		return false;
+
+	tcp_highest_sack_replace(sk, next_skb, skb);
+
+	/* Update sequence range on original skb. */
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+
+	/* Merge over control information. This moves PSH/FIN etc. over */
+	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
+
+	/* All done, get rid of second SKB and account for it so
+	 * packet counting does not break.
+	 */
+	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
+
+	/* changed transmit queue under us so clear hints */
+	tcp_clear_retrans_hints_partial(tp);
+	if (next_skb == tp->retransmit_skb_hint)
+		tp->retransmit_skb_hint = skb;
+
+	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
+
+	tcp_skb_collapse_tstamp(skb, next_skb);
+
+	tcp_rtx_queue_unlink_and_free(next_skb, sk);
+	return true;
+}
+
+/* Check whether @skb may take part in coalescing: it has to be a single,
+ * uncloned segment that has not been SACKed.
+ */
+static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
+{
+	if (tcp_skb_pcount(skb) > 1 || skb_cloned(skb))
+		return false;
+
+	/* Some heuristics for collapsing over SACK'd could be invented,
+	 * until then such skbs are refused.
+	 */
+	return !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+}
+
+/* Collapse packets in the retransmit queue to create
+ * fewer packets on the wire. This is only done on retransmission.
+ */
+static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
+				     int space)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb = to, *tmp;
+	bool first = true;
+
+	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse))
+		return;
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+		return;
+
+	skb_rbtree_walk_from_safe(skb, tmp) {
+		if (!tcp_can_collapse(sk, skb))
+			break;
+
+		if (!tcp_skb_can_collapse(to, skb))
+			break;
+
+		space -= skb->len;
+
+		if (first) { /* skb == to: nothing to merge yet */
+			first = false;
+			continue;
+		}
+
+		if (space < 0)
+			break;
+
+		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
+			break;
+
+		if (!tcp_collapse_retrans(sk, to))
+			break;
+	}
+}
+
+/* This retransmits one SKB.  Policy decisions and retransmit queue
+ * state updates are done by the caller.  Returns non-zero if an
+ * error occurred which prevented the send.
+ */
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int cur_mss;
+	int diff, len, err;
+	int avail_wnd;
+
+	/* Inconclusive MTU probe */
+	if (icsk->icsk_mtup.probe_size)
+		icsk->icsk_mtup.probe_size = 0;
+
+	if (skb_still_in_host_queue(sk, skb))
+		return -EBUSY;
+
+start:
+	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
+		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
+			TCP_SKB_CB(skb)->seq++;
+			goto start;
+		}
+		if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
+			WARN_ON_ONCE(1);
+			return -EINVAL;
+		}
+		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
+			return -ENOMEM;
+	}
+
+	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
+		return -EHOSTUNREACH; /* Routing failure or similar. */
+
+	cur_mss = tcp_current_mss(sk);
+	avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+
+	/* If the receiver has shrunk its window, and skb is out of
+	 * the new window, do not retransmit it. The exception is the
+	 * case when the window is shrunk to zero. In this case
+	 * our retransmit of one segment serves as a zero window probe.
+	 */
+	if (avail_wnd <= 0) {
+		if (TCP_SKB_CB(skb)->seq != tp->snd_una)
+			return -EAGAIN;
+		avail_wnd = cur_mss;
+	}
+
+	len = cur_mss * segs;
+	if (len > avail_wnd) {
+		len = rounddown(avail_wnd, cur_mss);
+		if (!len)
+			len = avail_wnd; /* window below one MSS */
+	}
+	if (skb->len > len) {
+		if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
+				 cur_mss, GFP_ATOMIC))
+			return -ENOMEM; /* We'll try again later. */
+	} else {
+		if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
+			return -ENOMEM;
+
+		diff = tcp_skb_pcount(skb);
+		tcp_set_skb_tso_segs(skb, cur_mss);
+		diff -= tcp_skb_pcount(skb);
+		if (diff)
+			tcp_adjust_pcount(sk, skb, diff);
+		avail_wnd = min_t(int, avail_wnd, cur_mss);
+		if (skb->len < avail_wnd)
+			tcp_retrans_try_collapse(sk, skb, avail_wnd);
+	}
+
+	/* RFC3168, section 6.1.1.1. ECN fallback */
+	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
+		tcp_ecn_clear_syn(sk, skb);
+
+	/* Update global and local TCP statistics. */
+	segs = tcp_skb_pcount(skb);
+	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+	tp->total_retrans += segs;
+	tp->bytes_retrans += skb->len;
+
+	/* make sure skb->data is aligned on arches that require it
+	 * and check if ack-trimming & collapsing extended the headroom
+	 * beyond what csum_start can cover.
+	 */
+	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
+		     skb_headroom(skb) >= 0xFFFF)) {
+		struct sk_buff *nskb;
+
+		tcp_skb_tsorted_save(skb) {
+			nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
+			if (nskb) {
+				nskb->dev = NULL;
+				err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
+			} else {
+				err = -ENOBUFS;
+			}
+		} tcp_skb_tsorted_restore(skb);
+
+		if (!err) {
+			tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
+			tcp_rate_skb_sent(sk, skb);
+		}
+	} else {
+		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+	}
+
+	/* To avoid taking spuriously low RTT samples based on a timestamp
+	 * for a transmit that never happened, always mark EVER_RETRANS
+	 */
+	TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
+		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
+				  TCP_SKB_CB(skb)->seq, segs, err);
+
+	if (likely(!err)) {
+		trace_tcp_retransmit_skb(sk, skb);
+	} else if (err != -EBUSY) {
+		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
+	}
+	return err;
+}
+
+/* Retransmit @skb and update the retransmit accounting in the tcp_sock. */
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int err;
+
+	err = __tcp_retransmit_skb(sk, skb, segs);
+	if (!err) {
+#if FASTRETRANS_DEBUG > 0
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+			net_dbg_ratelimited("retrans_out leaked\n");
+#endif
+		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
+		tp->retrans_out += tcp_skb_pcount(skb);
+	}
+
+	/* Save stamp of the first (attempted) retransmit. */
+	if (tp->retrans_stamp == 0)
+		tp->retrans_stamp = tcp_skb_timestamp(skb);
+	if (tp->undo_retrans < 0)
+		tp->undo_retrans = 0;
+	tp->undo_retrans += tcp_skb_pcount(skb);
+	return err;
+}
+
+/* This gets called after a retransmit timeout, and the initially
+ * retransmitted data is acknowledged.  It tries to continue
+ * resending the rest of the retransmit queue, until either
+ * we've sent it all or the congestion window limit is reached.
+ */
+void tcp_xmit_retransmit_queue(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *skb, *rtx_head, *hole = NULL;
+	struct tcp_sock *tp = tcp_sk(sk);
+	bool rearm_timer = false;
+	u32 max_segs;
+	int mib_idx;
+
+	if (!tp->packets_out)
+		return;
+
+	rtx_head = tcp_rtx_queue_head(sk);
+	skb = tp->retransmit_skb_hint ?: rtx_head;
+	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
+	skb_rbtree_walk_from(skb) {
+		__u8 sacked;
+		int segs;
+
+		if (tcp_pacing_check(sk))
+			break;
+
+		/* we could do better than to assign each time */
+		if (!hole)
+			tp->retransmit_skb_hint = skb;
+
+		segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp);
+		if (segs <= 0)
+			break;
+		sacked = TCP_SKB_CB(skb)->sacked;
+		/* In case tcp_shift_skb_data() has aggregated large skbs,
+		 * we need to make sure not to send too big TSO packets
+		 */
+		segs = min_t(int, segs, max_segs);
+
+		if (tp->retrans_out >= tp->lost_out) {
+			break;
+		} else if (!(sacked & TCPCB_LOST)) {
+			if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
+				hole = skb;
+			continue;
+
+		} else {
+			if (icsk->icsk_ca_state != TCP_CA_Loss)
+				mib_idx = LINUX_MIB_TCPFASTRETRANS;
+			else
+				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
+		}
+
+		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
+			continue;
+
+		if (tcp_small_queue_check(sk, skb, 1))
+			break;
+
+		if (tcp_retransmit_skb(sk, skb, segs))
+			break;
+
+		NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
+
+		if (tcp_in_cwnd_reduction(sk))
+			tp->prr_out += tcp_skb_pcount(skb);
+
+		if (skb == rtx_head &&
+		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
+			rearm_timer = true;
+
+	}
+	if (rearm_timer)
+		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				     inet_csk(sk)->icsk_rto,
+				     TCP_RTO_MAX);
+}
+
+/* Make room for @size bytes in sk_forward_alloc, with no limit check.
+ * Exceeding memory limits is allowed for FIN packets to expedite
+ * connection tear down and (memory) recovery; otherwise tcp_send_fin()
+ * could be tempted to either delay the FIN or even be forced to close
+ * the flow without any FIN. In general, we want to allow one skb per
+ * socket to avoid hangs with edge trigger epoll()
+ */
+void sk_forced_mem_schedule(struct sock *sk, int size)
+{
+	int shortfall = size - sk->sk_forward_alloc;
+	int pages;
+
+	/* Already covered by what is forward allocated. */
+	if (shortfall <= 0)
+		return;
+	pages = sk_mem_pages(shortfall);
+	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
+	sk_memory_allocated_add(sk, pages);
+	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+		mem_cgroup_charge_skmem(sk->sk_memcg, pages,
+					gfp_memcg_charge() | __GFP_NOFAIL);
+}
+
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
+ */
+void tcp_send_fin(struct sock *sk)
+{
+	struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Optimization, tack on the FIN if we have one skb in write queue and
+	 * this skb was not yet sent, or we are under memory pressure.
+	 * Note: in the latter case, FIN packet will be sent after a timeout,
+	 * as TCP stack thinks it has already been transmitted.
+	 */
+	tskb = tail;
+	if (!tskb && tcp_under_memory_pressure(sk))
+		tskb = skb_rb_last(&sk->tcp_rtx_queue);
+
+	if (tskb) {
+		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+		TCP_SKB_CB(tskb)->end_seq++;
+		tp->write_seq++;
+		if (!tail) {
+			/* This means tskb was already sent.
+			 * Pretend we included the FIN on previous transmit.
+			 * We need to set tp->snd_nxt to the value it would have
+			 * if FIN had been sent. This is because retransmit path
+			 * does not change tp->snd_nxt.
+			 */
+			WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
+			return;
+		}
+	} else {
+		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+		if (unlikely(!skb))
+			return; /* allocation failed: no FIN queued */
+
+		INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
+		skb_reserve(skb, MAX_TCP_HEADER);
+		sk_forced_mem_schedule(sk, skb->truesize);
+		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
+		tcp_init_nondata_skb(skb, tp->write_seq,
+				     TCPHDR_ACK | TCPHDR_FIN);
+		tcp_queue_skb(sk, skb);
+	}
+	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
+}
+
+/* We get here when a process closes a file descriptor (either due to
+ * an explicit close() or as a byproduct of exit()'ing) and there
+ * was unread data in the receive queue.  This behavior is recommended
+ * by RFC 2525, section 2.17.  -DaveM
+ */
+void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+{
+	struct sk_buff *skb;
+
+	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
+
+	/* NOTE: No TCP options attached and we never retransmit this. */
+	skb = alloc_skb(MAX_TCP_HEADER, priority);
+	if (!skb) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
+		return;
+	}
+
+	/* Reserve space for headers and prepare control bits. */
+	skb_reserve(skb, MAX_TCP_HEADER);
+	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
+			     TCPHDR_ACK | TCPHDR_RST);
+	tcp_mstamp_refresh(tcp_sk(sk));
+	/* Send it off. */
+	if (tcp_transmit_skb(sk, skb, 0, priority))
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
+
+	/* trace_tcp_send_reset() takes the skb that caused the RST; the skb
+	 * built here is not that troublesome skb, so pass NULL.
+	 */
+	trace_tcp_send_reset(sk, NULL);
+}
+
+/* Send a crossed SYN-ACK during socket establishment. Returns -EFAULT if
+ * the rtx queue head is not a SYN, -ENOMEM if a cloned SYN cannot be copied,
+ * else the tcp_transmit_skb() result. WARNING: only call this once we have
+ * sent a SYN that crossed the incoming SYN which caused this call; if that
+ * assumption fails, the initial rcv_wnd and rcv_wscale will not be correct.
+ */
+int tcp_send_synack(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	skb = tcp_rtx_queue_head(sk);
+	if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+		pr_err("%s: wrong queue state\n", __func__);
+		return -EFAULT;
+	}
+	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
+		if (skb_cloned(skb)) { /* swap in a private copy */
+			struct sk_buff *nskb;
+
+			tcp_skb_tsorted_save(skb) {
+				nskb = skb_copy(skb, GFP_ATOMIC);
+			} tcp_skb_tsorted_restore(skb);
+			if (!nskb)
+				return -ENOMEM;
+			INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
+			tcp_highest_sack_replace(sk, skb, nskb);
+			tcp_rtx_queue_unlink_and_free(skb, sk);
+			__skb_header_release(nskb);
+			tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
+			sk_wmem_queued_add(sk, nskb->truesize);
+			sk_mem_charge(sk, nskb->truesize);
+			skb = nskb;
+		}
+
+		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
+		tcp_ecn_send_synack(sk, skb);
+	}
+	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+}
+
+/**
+ * tcp_make_synack - Allocate one skb and build a SYNACK packet.
+ * @sk: listener socket
+ * @dst: dst entry attached to the SYNACK. It is consumed and caller
+ *       should not use it again.
+ * @req: request_sock pointer
+ * @foc: cookie for tcp fast open
+ * @synack_type: Type of synack to prepare
+ * @syn_skb: SYN packet just received.  It could be NULL for rtx case.
+ *
+ * Return: the SYNACK skb, or NULL if the skb allocation failed.
+ */
+struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
+				struct request_sock *req,
+				struct tcp_fastopen_cookie *foc,
+				enum tcp_synack_type synack_type,
+				struct sk_buff *syn_skb)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_md5sig_key *md5 = NULL;
+	struct tcp_out_options opts;
+	struct sk_buff *skb;
+	int tcp_header_size;
+	struct tcphdr *th;
+	int mss;
+	u64 now;
+
+	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+	if (unlikely(!skb)) {
+		dst_release(dst);
+		return NULL;
+	}
+	/* Reserve space for headers. */
+	skb_reserve(skb, MAX_TCP_HEADER);
+
+	switch (synack_type) {
+	case TCP_SYNACK_NORMAL:
+		skb_set_owner_w(skb, req_to_sk(req));
+		break;
+	case TCP_SYNACK_COOKIE:
+		/* Under synflood: no socket attach, to avoid false sharing. */
+		break;
+	case TCP_SYNACK_FASTOPEN:
+		/* sk is a const pointer, because we want to express multiple
+		 * cpu might call us concurrently.
+		 * sk->sk_wmem_alloc is an atomic, we can promote to rw.
+		 */
+		skb_set_owner_w(skb, (struct sock *)sk);
+		break;
+	}
+	skb_dst_set(skb, dst);
+
+	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
+
+	memset(&opts, 0, sizeof(opts));
+	now = tcp_clock_ns();
+#ifdef CONFIG_SYN_COOKIES
+	if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
+		skb_set_delivery_time(skb, cookie_init_timestamp(req, now),
+				      true);
+	else
+#endif
+	{
+		skb_set_delivery_time(skb, now, true);
+		if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
+			tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
+	}
+
+#ifdef CONFIG_TCP_MD5SIG
+	rcu_read_lock();
+	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
+#endif
+	skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4);
+	/* bpf program will be interested in the tcp_flags */
+	TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
+	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
+					     foc, synack_type,
+					     syn_skb) + sizeof(*th);
+
+	skb_push(skb, tcp_header_size);
+	skb_reset_transport_header(skb);
+
+	th = (struct tcphdr *)skb->data;
+	memset(th, 0, sizeof(struct tcphdr));
+	th->syn = 1;
+	th->ack = 1;
+	tcp_ecn_make_synack(req, th);
+	th->source = htons(ireq->ir_num);
+	th->dest = ireq->ir_rmt_port;
+	skb->mark = ireq->ir_mark;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	th->seq = htonl(tcp_rsk(req)->snt_isn);
+	/* XXX data is queued and acked as is. No buffer/window check */
+	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
+
+	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
+	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
+	tcp_options_write(th, NULL, &opts);
+	th->doff = (tcp_header_size >> 2);
+	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
+
+#ifdef CONFIG_TCP_MD5SIG
+	/* Okay, we have all we need - do the md5 hash if needed */
+	if (md5)
+		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
+					       md5, req_to_sk(req), skb);
+	rcu_read_unlock();
+#endif
+
+	bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
+				synack_type, &opts);
+
+	skb_set_delivery_time(skb, now, true);
+	tcp_add_tx_delay(skb, tp);
+
+	return skb;
+}
+EXPORT_SYMBOL(tcp_make_synack);
+
+/* Adopt the congestion control selected by the dst's RTAX_CC_ALGO metric. */
+static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	u32 key = dst_metric(dst, RTAX_CC_ALGO);
+	const struct tcp_congestion_ops *new_ca;
+
+	if (key == TCP_CA_UNSPEC)
+		return;
+	rcu_read_lock();
+	new_ca = tcp_ca_find_key(key);
+	if (likely(new_ca && bpf_try_module_get(new_ca, new_ca->owner))) {
+		bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
+		icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+		icsk->icsk_ca_ops = new_ca;
+	}
+	rcu_read_unlock();
+}
+
+/* Do all connect socket setups that can be done AF independent. */
+static void tcp_connect_init(struct sock *sk)
+{
+	const struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u8 rcv_wscale;
+	u32 rcv_wnd;
+
+	/* We'll fix this up when we get a response from the other end.
+	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
+	 */
+	tp->tcp_header_len = sizeof(struct tcphdr);
+	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
+		tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
+
+	/* If the user set TCP_MAXSEG, record it as the MSS clamp */
+	if (tp->rx_opt.user_mss)
+		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+	tp->max_window = 0;
+	tcp_mtup_init(sk);
+	tcp_sync_mss(sk, dst_mtu(dst));
+
+	tcp_ca_dst_init(sk, dst);
+
+	if (!tp->window_clamp)
+		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
+	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
+
+	tcp_initialize_rcv_mss(sk);
+
+	/* Limit the window selection if the user enforces a smaller rx buffer */
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
+		tp->window_clamp = tcp_full_space(sk);
+
+	rcv_wnd = tcp_rwnd_init_bpf(sk);
+	if (rcv_wnd == 0)
+		rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+
+	tcp_select_initial_window(sk, tcp_full_space(sk),
+				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+				  &tp->rcv_wnd,
+				  &tp->window_clamp,
+				  READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
+				  &rcv_wscale,
+				  rcv_wnd);
+
+	tp->rx_opt.rcv_wscale = rcv_wscale;
+	tp->rcv_ssthresh = tp->rcv_wnd;
+
+	WRITE_ONCE(sk->sk_err, 0);
+	sock_reset_flag(sk, SOCK_DONE);
+	tp->snd_wnd = 0;
+	tcp_init_wl(tp, 0);
+	tcp_write_queue_purge(sk);
+	tp->snd_una = tp->write_seq;
+	tp->snd_sml = tp->write_seq;
+	tp->snd_up = tp->write_seq;
+	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
+
+	if (likely(!tp->repair))
+		tp->rcv_nxt = 0;
+	else
+		tp->rcv_tstamp = tcp_jiffies32;
+	tp->rcv_wup = tp->rcv_nxt;
+	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
+
+	inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
+	inet_csk(sk)->icsk_retransmits = 0;
+	tcp_clear_retrans(tp);
+}
+
+/* Account @skb on the socket and advance write_seq past its payload. */
+static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+	unsigned int truesize = skb->truesize;
+
+	TCP_SKB_CB(skb)->end_seq += skb->len;
+	__skb_header_release(skb);
+	sk_wmem_queued_add(sk, truesize);
+	sk_mem_charge(sk, truesize);
+	WRITE_ONCE(tcp_sk(sk)->write_seq, TCP_SKB_CB(skb)->end_seq);
+	tcp_sk(sk)->packets_out += tcp_skb_pcount(skb);
+}
+
+/* Build and send a SYN with data and (cached) Fast Open cookie. However,
+ * queue a data-only packet after the regular SYN, such that regular SYNs
+ * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
+ * only the SYN sequence, the data are retransmitted in the first ACK.
+ * If cookie is not cached or other error occurs, falls back to send a
+ * regular SYN with Fast Open cookie request option.
+ */
+static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_fastopen_request *fo = tp->fastopen_req;
+	struct page_frag *pfrag = sk_page_frag(sk);
+	struct sk_buff *syn_data;
+	int space, err = 0;
+
+	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
+	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
+		goto fallback;
+
+	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
+	 * user-MSS. Reserve maximum option space for middleboxes that add
+	 * private TCP options. The cost is reduced data space in SYN :(
+	 */
+	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+	/* Sync mss_cache after updating the mss_clamp */
+	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+
+	space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
+		MAX_TCP_OPTION_SPACE;
+
+	space = min_t(size_t, space, fo->size);
+
+	if (space &&
+	    !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE),
+				  pfrag, sk->sk_allocation))
+		goto fallback;
+	syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false);
+	if (!syn_data)
+		goto fallback;
+	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
+	if (space) {
+		space = min_t(size_t, space, pfrag->size - pfrag->offset);
+		space = tcp_wmem_schedule(sk, space);
+	}
+	if (space) {
+		space = copy_page_from_iter(pfrag->page, pfrag->offset,
+					    space, &fo->data->msg_iter);
+		if (unlikely(!space)) {
+			tcp_skb_tsorted_anchor_cleanup(syn_data);
+			kfree_skb(syn_data);
+			goto fallback;
+		}
+		skb_fill_page_desc(syn_data, 0, pfrag->page,
+				   pfrag->offset, space);
+		page_ref_inc(pfrag->page);
+		pfrag->offset += space;
+		skb_len_add(syn_data, space);
+		skb_zcopy_set(syn_data, fo->uarg, NULL);
+	}
+	/* No more data pending in inet_wait_for_connect() */
+	if (space == fo->size)
+		fo->data = NULL;
+	fo->copied = space;
+
+	tcp_connect_queue_skb(sk, syn_data);
+	if (syn_data->len)
+		tcp_chrono_start(sk, TCP_CHRONO_BUSY);
+
+	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
+
+	skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true);
+
+	/* Now full SYN+DATA was cloned and sent (or not),
+	 * remove the SYN from the original skb (syn_data)
+	 * we keep in write queue in case of a retransmit, as we
+	 * also have the SYN packet (with no data) in the same queue.
+	 */
+	TCP_SKB_CB(syn_data)->seq++;
+	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
+	if (!err) {
+		tp->syn_data = (fo->copied > 0);
+		tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
+		goto done;
+	}
+
+	/* data was not sent, put it in write_queue */
+	__skb_queue_tail(&sk->sk_write_queue, syn_data);
+	tp->packets_out -= tcp_skb_pcount(syn_data);
+
+fallback:
+	/* Send a regular SYN with Fast Open cookie request option */
+	if (fo->cookie.len > 0)
+		fo->cookie.len = 0;
+	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
+	if (err)
+		tp->syn_fastopen = 0;
+done:
+	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
+	return err;
+}
+
+/* Build a SYN and send it off. */
+int tcp_connect(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *buff;
+	int err;
+
+	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
+
+	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
+		return -EHOSTUNREACH; /* Routing failure or similar. */
+
+	tcp_connect_init(sk);
+
+	if (unlikely(tp->repair)) {
+		tcp_finish_connect(sk, NULL);
+		return 0;
+	}
+
+	buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true);
+	if (unlikely(!buff))
+		return -ENOBUFS;
+
+	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+	tcp_mstamp_refresh(tp);
+	tp->retrans_stamp = tcp_time_stamp(tp);
+	tcp_connect_queue_skb(sk, buff);
+	tcp_ecn_send_syn(sk, buff);
+	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+
+	/* Send off SYN; include data in Fast Open. */
+	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
+	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+	if (err == -ECONNREFUSED)
+		return err;
+
+	/* We change tp->snd_nxt after the tcp_transmit_skb() call
+	 * in order to make this packet get counted in tcpOutSegs.
+	 */
+	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
+	tp->pushed_seq = tp->write_seq;
+	buff = tcp_send_head(sk);
+	if (unlikely(buff)) {
+		WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
+		tp->pushed_seq	= TCP_SKB_CB(buff)->seq;
+	}
+	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
+
+	/* Timer for repeating the SYN until an answer. */
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_connect);
+
+u32 tcp_delack_max(const struct sock *sk)
+{
+	const struct dst_entry *dst = __sk_dst_get(sk);
+	u32 delack_max = inet_csk(sk)->icsk_delack_max;
+
+	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) {
+		u32 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
+		u32 delack_from_rto_min = max_t(int, 1, rto_min - 1);
+
+		delack_max = min_t(u32, delack_max, delack_from_rto_min);
+	}
+	return delack_max;
+}
+
+/* Send out a delayed ack, the caller does the policy checking
+ * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
+ * for details.
+ */
+void tcp_send_delayed_ack(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int ato = icsk->icsk_ack.ato;
+	unsigned long timeout;
+
+	if (ato > TCP_DELACK_MIN) {
+		const struct tcp_sock *tp = tcp_sk(sk);
+		int max_ato = HZ / 2;
+
+		if (inet_csk_in_pingpong_mode(sk) ||
+		    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
+			max_ato = TCP_DELACK_MAX;
+
+		/* Slow path, intersegment interval is "high". */
+
+		/* If some rtt estimate is known, use it to bound delayed ack.
+		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
+		 * directly.
+		 */
+		if (tp->srtt_us) {
+			int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
+					TCP_DELACK_MIN);
+
+			if (rtt < max_ato)
+				max_ato = rtt;
+		}
+
+		ato = min(ato, max_ato);
+	}
+
+	ato = min_t(u32, ato, tcp_delack_max(sk));
+
+	/* Stay within the limit we were given */
+	timeout = jiffies + ato;
+
+	/* Use the new timeout only if no pending timer expires earlier. */
+	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
+		/* If delack timer is about to expire, send ACK now. */
+		if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
+			tcp_send_ack(sk);
+			return;
+		}
+
+		if (!time_before(timeout, icsk->icsk_ack.timeout))
+			timeout = icsk->icsk_ack.timeout;
+	}
+	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+	icsk->icsk_ack.timeout = timeout;
+	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+
+/* This routine sends an ack and also updates the window. */
+void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
+{
+	struct sk_buff *buff;
+
+	/* If we have been reset, we may not send again. */
+	if (sk->sk_state == TCP_CLOSE)
+		return;
+
+	/* We are not putting this on the write queue, so
+	 * tcp_transmit_skb() will set the ownership to this
+	 * sock.
+	 */
+	buff = alloc_skb(MAX_TCP_HEADER,
+			 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
+	if (unlikely(!buff)) {
+		struct inet_connection_sock *icsk = inet_csk(sk);
+		unsigned long delay;
+
+		delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
+		if (delay < TCP_RTO_MAX)
+			icsk->icsk_ack.retry++;
+		inet_csk_schedule_ack(sk);
+		icsk->icsk_ack.ato = TCP_ATO_MIN;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
+		return;
+	}
+
+	/* Reserve space for headers and prepare control bits. */
+	skb_reserve(buff, MAX_TCP_HEADER);
+	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
+
+	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
+	 * too much.
+	 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
+	 */
+	skb_set_tcp_pure_ack(buff);
+
+	/* Send it off, this clears delayed acks for us. */
+	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
+}
+EXPORT_SYMBOL_GPL(__tcp_send_ack);
+
+void tcp_send_ack(struct sock *sk)
+{
+	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
+}
+
+/* This routine sends a packet with an out of date sequence
+ * number. It assumes the other end will try to ack it.
+ *
+ * Question: what should we do in urgent mode?
+ * 4.4BSD forces sending single byte of data. We cannot send
+ * out of window data, because we have SND.NXT==SND.MAX...
+ *
+ * Current solution: to send TWO zero-length segments in urgent mode:
+ * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
+ * out-of-date with SND.UNA-1 to probe window.
+ */
+static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	/* We don't queue it, tcp_transmit_skb() sets ownership. */
+	skb = alloc_skb(MAX_TCP_HEADER,
+			sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
+	if (!skb)
+		return -1;
+
+	/* Reserve space for headers and set control bits. */
+	skb_reserve(skb, MAX_TCP_HEADER);
+	/* Use a previous sequence.  This should cause the other
+	 * end to send an ack.  Don't queue or clone SKB, just
+	 * send it.
+	 */
+	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
+	NET_INC_STATS(sock_net(sk), mib);
+	return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
+}
+
+/* Called from setsockopt( ... TCP_REPAIR ) */
+void tcp_send_window_probe(struct sock *sk)
+{
+	if (sk->sk_state == TCP_ESTABLISHED) {
+		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+		tcp_mstamp_refresh(tcp_sk(sk));
+		tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
+	}
+}
+
+/* Initiate keepalive or window probe from timer. */
+int tcp_write_wakeup(struct sock *sk, int mib)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	if (sk->sk_state == TCP_CLOSE)
+		return -1;
+
+	skb = tcp_send_head(sk);
+	if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
+		int err;
+		unsigned int mss = tcp_current_mss(sk);
+		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+
+		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
+			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
+
+		/* We are probing the opening of a window, but the window
+		 * size is != 0; this must have been a result of
+		 * sender-side SWS avoidance.
+		 */
+		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
+		    skb->len > mss) {
+			seg_size = min(seg_size, mss);
+			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+			if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+					 skb, seg_size, mss, GFP_ATOMIC))
+				return -1;
+		} else if (!tcp_skb_pcount(skb))
+			tcp_set_skb_tso_segs(skb, mss);
+
+		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+		if (!err)
+			tcp_event_new_data_sent(sk, skb);
+		return err;
+	} else {
+		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
+			tcp_xmit_probe_skb(sk, 1, mib);
+		return tcp_xmit_probe_skb(sk, 0, mib);
+	}
+}
+
+/* A window probe timeout has occurred.  If window is not closed send
+ * a partial packet else a zero probe.
+ */
+void tcp_send_probe0(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	unsigned long timeout;
+	int err;
+
+	err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
+
+	if (tp->packets_out || tcp_write_queue_empty(sk)) {
+		/* Cancel probe timer, if it is not required. */
+		icsk->icsk_probes_out = 0;
+		icsk->icsk_backoff = 0;
+		icsk->icsk_probes_tstamp = 0;
+		return;
+	}
+
+	icsk->icsk_probes_out++;
+	if (err <= 0) {
+		if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
+			icsk->icsk_backoff++;
+		timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
+	} else {
+		/* If packet was not sent due to local congestion,
+		 * Let senders fight for local resources conservatively.
+		 */
+		timeout = TCP_RESOURCE_PROBE_INTERVAL;
+	}
+
+	timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
+	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
+}
+
+int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
+{
+	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
+	struct flowi fl;
+	int res;
+
+	/* Paired with WRITE_ONCE() in sock_setsockopt() */
+	if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
+		WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash());
+	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
+				  NULL);
+	if (!res) {
+		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+		if (unlikely(tcp_passive_fastopen(sk))) {
+			/* sk has const attribute because listeners are lockless.
+			 * However in this case, we are dealing with a passive fastopen
+			 * socket thus we can change total_retrans value.
+			 */
+			tcp_sk_rw(sk)->total_retrans++;
+		}
+		trace_tcp_retransmit_synack(sk, req);
+	}
+	return res;
+}
+EXPORT_SYMBOL(tcp_rtx_synack);
diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c
new file mode 100644
index 0000000000..4bcf7eff95
--- /dev/null
+++ b/net/ipv4/tcp_plb.c
@@ -0,0 +1,109 @@
+/* Protective Load Balancing (PLB)
+ *
+ * PLB was designed to reduce link load imbalance across datacenter
+ * switches. PLB is a host-based optimization; it leverages congestion
+ * signals from the transport layer to randomly change the path of the
+ * connection experiencing sustained congestion. PLB prefers to repath
+ * after idle periods to minimize packet reordering. It repaths by
+ * changing the IPv6 Flow Label on the packets of a connection, which
+ * datacenter switches include as part of ECMP/WCMP hashing.
+ *
+ * PLB is described in detail in:
+ *
+ *	Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu,
+ *	Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson,
+ *	David Wetherall, Abdul Kabbani:
+ *	"PLB: Congestion Signals are Simple and Effective for
+ *	 Network Load Balancing"
+ *	In ACM SIGCOMM 2022, Amsterdam Netherlands.
+ *
+ */
+
+#include <net/tcp.h>
+
+/* Called once per round-trip to update PLB state for a connection. */
+void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb,
+			  const int cong_ratio)
+{
+	struct net *net = sock_net(sk);
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
+		return;
+
+	if (cong_ratio >= 0) {
+		if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh))
+			plb->consec_cong_rounds = 0;
+		else if (plb->consec_cong_rounds <
+			 READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds))
+			plb->consec_cong_rounds++;
+	}
+}
+EXPORT_SYMBOL_GPL(tcp_plb_update_state);
+
+/* Check whether recent congestion has been persistent enough to warrant
+ * a load balancing decision that switches the connection to another path.
+ */
+void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb)
+{
+	struct net *net = sock_net(sk);
+	u32 max_suspend;
+	bool forced_rehash = false, idle_rehash = false;
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
+		return;
+
+	forced_rehash = plb->consec_cong_rounds >=
+			READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds);
+	/* If sender goes idle then we check whether to rehash. */
+	idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) &&
+		      !tcp_sk(sk)->packets_out &&
+		      plb->consec_cong_rounds >=
+		      READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds);
+
+	if (!forced_rehash && !idle_rehash)
+		return;
+
+	/* Note that tcp_jiffies32 can wrap; we detect wraps by checking for
+	 * cases where the max suspension end is before the actual suspension
+	 * end. We clear pause_until to 0 to indicate there is no recent
+	 * RTO event that constrains PLB rehashing.
+	 */
+	max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ;
+	if (plb->pause_until &&
+	    (!before(tcp_jiffies32, plb->pause_until) ||
+	     before(tcp_jiffies32 + max_suspend, plb->pause_until)))
+		plb->pause_until = 0;
+
+	if (plb->pause_until)
+		return;
+
+	sk_rethink_txhash(sk);
+	plb->consec_cong_rounds = 0;
+	tcp_sk(sk)->plb_rehash++;
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH);
+}
+EXPORT_SYMBOL_GPL(tcp_plb_check_rehash);
+
+/* Upon RTO, disallow load balancing for a while, to avoid having load
+ * balancing decisions switch traffic to a black-holed path that was
+ * previously avoided with a sk_rethink_txhash() call at RTO time.
+ */
+void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb)
+{
+	struct net *net = sock_net(sk);
+	u32 pause;
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
+		return;
+
+	pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ;
+	pause += get_random_u32_below(pause);
+	plb->pause_until = tcp_jiffies32 + pause;
+
+	/* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call
+	 * that may switch this connection to a path with completely different
+	 * congestion characteristics.
+	 */
+	plb->consec_cong_rounds = 0;
+}
+EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto);
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
new file mode 100644
index 0000000000..a8f6d9d06f
--- /dev/null
+++ b/net/ipv4/tcp_rate.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <net/tcp.h>
+
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ *    send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ *    ack_rate  = #pkts_delivered/(last_ack_time - first_ack_time)
+ *    bw = min(send_rate, ack_rate)
+ *
+ * Notice the estimator essentially estimates the goodput, not always the
+ * network bottleneck link rate when the sending or receiving is limited by
+ * other factors like applications or receiver window limits.  The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ *
+ * TCP flows can often be application-limited in request/response workloads.
+ * The estimator marks a bandwidth sample as application-limited if there
+ * was some moment during the sampled window of packets when there was no data
+ * ready to send in the write queue.
+ */
+
+/* Snapshot the current delivery information in the skb, to generate
+ * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
+ */
+void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	 /* In general we need to start delivery rate samples from the
+	  * time we received the most recent ACK, to ensure we include
+	  * the full time the network needs to deliver all in-flight
+	  * packets. If there are no packets in flight yet, then we
+	  * know that any ACKs after now indicate that the network was
+	  * able to deliver those packets completely in the sampling
+	  * interval between now and the next ACK.
+	  *
+	  * Note that we use packets_out instead of tcp_packets_in_flight(tp)
+	  * because the latter is a guess based on RTO and loss-marking
+	  * heuristics. We don't want spurious RTOs or loss markings to cause
+	  * a spuriously small time interval, causing a spuriously high
+	  * bandwidth estimate.
+	  */
+	if (!tp->packets_out) {
+		u64 tstamp_us = tcp_skb_timestamp_us(skb);
+
+		tp->first_tx_mstamp  = tstamp_us;
+		tp->delivered_mstamp = tstamp_us;
+	}
+
+	TCP_SKB_CB(skb)->tx.first_tx_mstamp	= tp->first_tx_mstamp;
+	TCP_SKB_CB(skb)->tx.delivered_mstamp	= tp->delivered_mstamp;
+	TCP_SKB_CB(skb)->tx.delivered		= tp->delivered;
+	TCP_SKB_CB(skb)->tx.delivered_ce	= tp->delivered_ce;
+	TCP_SKB_CB(skb)->tx.is_app_limited	= tp->app_limited ? 1 : 0;
+}
+
+/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
+ * delivery information when the skb was last transmitted.
+ *
+ * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
+ * called multiple times. We favor the information from the most recently
+ * sent skb, i.e., the skb with the most recently sent time and the highest
+ * sequence.
+ */
+void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+			    struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+	u64 tx_tstamp;
+
+	if (!scb->tx.delivered_mstamp)
+		return;
+
+	tx_tstamp = tcp_skb_timestamp_us(skb);
+	if (!rs->prior_delivered ||
+	    tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
+			       scb->end_seq, rs->last_end_seq)) {
+		rs->prior_delivered_ce  = scb->tx.delivered_ce;
+		rs->prior_delivered  = scb->tx.delivered;
+		rs->prior_mstamp     = scb->tx.delivered_mstamp;
+		rs->is_app_limited   = scb->tx.is_app_limited;
+		rs->is_retrans	     = scb->sacked & TCPCB_RETRANS;
+		rs->last_end_seq     = scb->end_seq;
+
+		/* Record send time of most recently ACKed packet: */
+		tp->first_tx_mstamp  = tx_tstamp;
+		/* Find the duration of the "send phase" of this window: */
+		rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
+						     scb->tx.first_tx_mstamp);
+
+	}
+	/* Mark off the skb delivered once it's sacked to avoid being
+	 * used again when it's cumulatively acked. For acked packets
+	 * we don't need to reset since it'll be freed soon.
+	 */
+	if (scb->sacked & TCPCB_SACKED_ACKED)
+		scb->tx.delivered_mstamp = 0;
+}
+
+/* Update the connection delivery information and generate a rate sample. */
+void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+		  bool is_sack_reneg, struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 snd_us, ack_us;
+
+	/* Clear app limited if bubble is acked and gone. */
+	if (tp->app_limited && after(tp->delivered, tp->app_limited))
+		tp->app_limited = 0;
+
+	/* TODO: there are multiple places throughout tcp_ack() to get
+	 * current time. Refactor the code using a new "tcp_acktag_state"
+	 * to carry current time, flags, stats like "tcp_sacktag_state".
+	 */
+	if (delivered)
+		tp->delivered_mstamp = tp->tcp_mstamp;
+
+	rs->acked_sacked = delivered;	/* freshly ACKed or SACKed */
+	rs->losses = lost;		/* freshly marked lost */
+	/* Return an invalid sample if no timing information is available or
+	 * in recovery from loss with SACK reneging. Rate samples taken during
+	 * a SACK reneging event may overestimate bw by including packets that
+	 * were SACKed before the reneg.
+	 */
+	if (!rs->prior_mstamp || is_sack_reneg) {
+		rs->delivered = -1;
+		rs->interval_us = -1;
+		return;
+	}
+	rs->delivered   = tp->delivered - rs->prior_delivered;
+
+	rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
+	/* delivered_ce occupies less than 32 bits in the skb control block */
+	rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
+
+	/* Model sending data and receiving ACKs as separate pipeline phases
+	 * for a window. Usually the ACK phase is longer, but with ACK
+	 * compression the send phase can be longer. To be safe we use the
+	 * longer phase.
+	 */
+	snd_us = rs->interval_us;				/* send phase */
+	ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+				    rs->prior_mstamp); /* ack phase */
+	rs->interval_us = max(snd_us, ack_us);
+
+	/* Record both segment send and ack receive intervals */
+	rs->snd_interval_us = snd_us;
+	rs->rcv_interval_us = ack_us;
+
+	/* Normally we expect interval_us >= min-rtt.
+	 * Note that rate may still be over-estimated when a spuriously
+	 * retransmitted skb was first (s)acked because "interval_us"
+	 * is under-estimated (up to an RTT). However continuously
+	 * measuring the delivery rate during loss recovery is crucial
+	 * for connections that suffer heavy or prolonged losses.
+	 */
+	if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+		if (!rs->is_retrans)
+			pr_debug("tcp rate: %ld %d %u %u %u\n",
+				 rs->interval_us, rs->delivered,
+				 inet_csk(sk)->icsk_ca_state,
+				 tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+		rs->interval_us = -1;
+		return;
+	}
+
+	/* Record the last non-app-limited or the highest app-limited bw */
+	if (!rs->is_app_limited ||
+	    ((u64)rs->delivered * tp->rate_interval_us >=
+	     (u64)tp->rate_delivered * rs->interval_us)) {
+		tp->rate_delivered = rs->delivered;
+		tp->rate_interval_us = rs->interval_us;
+		tp->rate_app_limited = rs->is_app_limited;
+	}
+}
+
+/* If a gap is detected between sends, mark the socket application-limited. */
+void tcp_rate_check_app_limited(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (/* We have less than one packet to send. */
+	    tp->write_seq - tp->snd_nxt < tp->mss_cache &&
+	    /* Nothing in sending host's qdisc queues or NIC tx queue. */
+	    sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
+	    /* We are not limited by CWND. */
+	    tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
+	    /* All lost packets have been retransmitted. */
+	    tp->lost_out <= tp->retrans_out)
+		tp->app_limited =
+			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+}
+EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
new file mode 100644
index 0000000000..bba10110fb
--- /dev/null
+++ b/net/ipv4/tcp_recovery.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+static u32 tcp_rack_reo_wnd(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->reord_seen) {
+		/* If reordering has not been observed, be aggressive during
+		 * the recovery or starting the recovery by DUPACK threshold.
+		 */
+		if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery)
+			return 0;
+
+		if (tp->sacked_out >= tp->reordering &&
+		    !(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
+		      TCP_RACK_NO_DUPTHRESH))
+			return 0;
+	}
+
+	/* To be more reordering resilient, allow min_rtt/4 settling delay.
+	 * Use min_rtt instead of the smoothed RTT because reordering is
+	 * often a path property and less related to queuing or delayed ACKs.
+	 * Upon receiving DSACKs, linearly increase the window up to the
+	 * smoothed RTT.
+	 */
+	return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps,
+		   tp->srtt_us >> 3);
+}
+
+s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
+{
+	return tp->rack.rtt_us + reo_wnd -
+	       tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb));
+}
+
+/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
+ *
+ * Marks a packet lost, if some packet sent later has been (s)acked.
+ * The underlying idea is similar to the traditional dupthresh and FACK
+ * but they look at different metrics:
+ *
+ * dupthresh: 3 OOO packets delivered (packet count)
+ * FACK: sequence delta to highest sacked sequence (sequence space)
+ * RACK: sent time delta to the latest delivered packet (time domain)
+ *
+ * The advantage of RACK is it applies to both original and retransmitted
+ * packet and therefore is robust against tail losses. Another advantage
+ * is being more resilient to reordering by simply allowing some
+ * "settling delay", instead of tweaking the dupthresh.
+ *
+ * When tcp_rack_detect_loss() detects some packets are lost and we
+ * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
+ * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
+ * make us enter the CA_Recovery state.
+ */
+static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb, *n;
+	u32 reo_wnd;
+
+	*reo_timeout = 0;
+	reo_wnd = tcp_rack_reo_wnd(sk);
+	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
+				 tcp_tsorted_anchor) {
+		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+		s32 remaining;
+
+		/* Skip ones marked lost but not yet retransmitted */
+		if ((scb->sacked & TCPCB_LOST) &&
+		    !(scb->sacked & TCPCB_SACKED_RETRANS))
+			continue;
+
+		if (!tcp_skb_sent_after(tp->rack.mstamp,
+					tcp_skb_timestamp_us(skb),
+					tp->rack.end_seq, scb->end_seq))
+			break;
+
+		/* A packet is lost if it has not been s/acked beyond
+		 * the recent RTT plus the reordering window.
+		 */
+		remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd);
+		if (remaining <= 0) {
+			tcp_mark_skb_lost(sk, skb);
+			list_del_init(&skb->tcp_tsorted_anchor);
+		} else {
+			/* Record maximum wait time */
+			*reo_timeout = max_t(u32, *reo_timeout, remaining);
+		}
+	}
+}
+
+bool tcp_rack_mark_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 timeout;
+
+	if (!tp->rack.advanced)
+		return false;
+
+	/* Reset the advanced flag to avoid unnecessary queue scanning */
+	tp->rack.advanced = 0;
+	tcp_rack_detect_loss(sk, &timeout);
+	if (timeout) {
+		timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
+					  timeout, inet_csk(sk)->icsk_rto);
+	}
+	return !!timeout;
+}
+
+/* Record the most recently (re)sent time among the (s)acked packets
+ * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
+ * draft-cheng-tcpm-rack-00.txt
+ */
+void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
+		      u64 xmit_time)
+{
+	u32 rtt_us;
+
+	rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
+	if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
+		/* If the sacked packet was retransmitted, it's ambiguous
+		 * whether the retransmission or the original (or the prior
+		 * retransmission) was sacked.
+		 *
+		 * If the original is lost, there is no ambiguity. Otherwise
+		 * we assume the original can be delayed up to aRTT + min_rtt.
+		 * The aRTT term is bounded by the fast recovery or timeout,
+		 * so it's at least one RTT (i.e., retransmission is at least
+		 * an RTT later).
+		 */
+		return;
+	}
+	tp->rack.advanced = 1;
+	tp->rack.rtt_us = rtt_us;
+	if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
+			       end_seq, tp->rack.end_seq)) {
+		tp->rack.mstamp = xmit_time;
+		tp->rack.end_seq = end_seq;
+	}
+}
+
+/* We have waited long enough to accommodate reordering. Mark the expired
+ * packets lost and retransmit them.
+ */
+void tcp_rack_reo_timeout(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 timeout, prior_inflight;
+	u32 lost = tp->lost;
+
+	prior_inflight = tcp_packets_in_flight(tp);
+	tcp_rack_detect_loss(sk, &timeout);
+	if (prior_inflight != tcp_packets_in_flight(tp)) {
+		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
+			tcp_enter_recovery(sk, false);
+			if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+				tcp_cwnd_reduction(sk, 1, tp->lost - lost, 0);
+		}
+		tcp_xmit_retransmit_queue(sk);
+	}
+	if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
+		tcp_rearm_rto(sk);
+}
+
+/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
+ *
+ * If a DSACK is received that seems like it may have been due to reordering
+ * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded
+ * by srtt), since there is possibility that spurious retransmission was
+ * due to reordering delay longer than reo_wnd.
+ *
+ * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
+ * no. of successful recoveries (accounts for full DSACK-based loss
+ * recovery undo). After that, reset it to default (min_rtt/4).
+ *
+ * At max, reo_wnd is incremented only once per rtt. So that the new
+ * DSACK on which we are reacting, is due to the spurious retx (approx)
+ * after the reo_wnd has been updated last time.
+ *
+ * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
+ * absolute value to account for change in rtt.
+ */
+void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
+	     TCP_RACK_STATIC_REO_WND) ||
+	    !rs->prior_delivered)
+		return;
+
+	/* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
+	if (before(rs->prior_delivered, tp->rack.last_delivered))
+		tp->rack.dsack_seen = 0;
+
+	/* Adjust the reo_wnd if update is pending */
+	if (tp->rack.dsack_seen) {
+		tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
+					       tp->rack.reo_wnd_steps + 1);
+		tp->rack.dsack_seen = 0;
+		tp->rack.last_delivered = tp->delivered;
+		tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
+	} else if (!tp->rack.reo_wnd_persist) {
+		tp->rack.reo_wnd_steps = 1;
+	}
+}
+
+/* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits
+ * the next unacked packet upon receiving
+ * a) three or more DUPACKs to start the fast recovery
+ * b) an ACK acknowledging new data during the fast recovery.
+ */
+void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
+{
+	const u8 state = inet_csk(sk)->icsk_ca_state;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) ||
+	    (state == TCP_CA_Recovery && snd_una_advanced)) {
+		struct sk_buff *skb = tcp_rtx_queue_head(sk);
+		u32 mss;
+
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+			return;
+
+		mss = tcp_skb_mss(skb);
+		if (tcp_skb_pcount(skb) > 1 && skb->len > mss)
+			tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+				     mss, mss, GFP_ATOMIC);
+
+		tcp_mark_skb_lost(sk, skb);
+	}
+}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 0000000000..862b96248a
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Tom Kelly's Scalable TCP
+ *
+ * See http://www.deneholme.net/tom/scalable/
+ *
+ * John Heffner <jheffner@sc.edu>
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* These factors derived from the recommended values in the paper:
+ * .01 and 7/8.
+ */
+#define TCP_SCALABLE_AI_CNT	100U
+#define TCP_SCALABLE_MD_SCALE	3
+
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 ai_cnt;
+
+	if (!tcp_is_cwnd_limited(sk))
+		return;
+	if (tcp_in_slow_start(tp)) {
+		acked = tcp_slow_start(tp, acked);
+		if (!acked)
+			return;
+	}
+	ai_cnt = min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT); /* never above AI_CNT */
+	tcp_cong_avoid_ai(tp, ai_cnt, acked);
+}
+
+static u32 tcp_scalable_ssthresh(struct sock *sk)
+{
+	const u32 cwnd = tcp_snd_cwnd(tcp_sk(sk));	/* cwnd minus cwnd/2^MD_SCALE, min 2 */
+
+	return max(cwnd - (cwnd >> TCP_SCALABLE_MD_SCALE), 2U);
+}
+
+static struct tcp_congestion_ops tcp_scalable __read_mostly = {
+	.ssthresh	= tcp_scalable_ssthresh,
+	.undo_cwnd	= tcp_reno_undo_cwnd,	/* no Scalable-specific undo hook */
+	.cong_avoid	= tcp_scalable_cong_avoid,
+
+	.owner		= THIS_MODULE,
+	.name		= "scalable",	/* algorithm name carried by this ops table */
+};
+
+static int __init tcp_scalable_register(void)
+{
+	return tcp_register_congestion_control(&tcp_scalable); /* module init status */
+}
+
+static void __exit tcp_scalable_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_scalable); /* undoes the module init */
+}
+
+module_init(tcp_scalable_register);
+module_exit(tcp_scalable_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Scalable TCP");
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
new file mode 100644
index 0000000000..984ab4a042
--- /dev/null
+++ b/net/ipv4/tcp_timer.c
@@ -0,0 +1,841 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <net/tcp.h>
+
+static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const u32 retrans_start = tcp_sk(sk)->retrans_stamp;
+	const u32 limit_ms = READ_ONCE(icsk->icsk_user_timeout);
+	s32 left_ms;
+
+	/* No user timeout configured: the RTO is used unclamped. */
+	if (!limit_ms)
+		return icsk->icsk_rto;
+
+	left_ms = limit_ms - (tcp_time_stamp(tcp_sk(sk)) - retrans_start);
+	if (left_ms <= 0)
+		return 1; /* user timeout has passed; fire ASAP */
+
+	return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(left_ms));
+}
+
+u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const u32 limit_ms = READ_ONCE(icsk->icsk_user_timeout);
+	s32 probing_for;
+	u32 left;
+
+	if (!limit_ms || !icsk->icsk_probes_tstamp)
+		return when;
+
+	/* Jiffies since probing started; a negative delta counts as zero. */
+	probing_for = tcp_jiffies32 - icsk->icsk_probes_tstamp;
+	if (unlikely(probing_for < 0))
+		probing_for = 0;
+	left = msecs_to_jiffies(limit_ms) - probing_for;
+
+	return min_t(u32, max_t(u32, left, TCP_TIMEOUT_MIN), when);
+}
+
+/**
+ *  tcp_write_err() - close socket and save error info
+ *  @sk:  The socket the error has appeared on.
+ *
+ *  Stores sk_err_soft (or ETIMEDOUT when no soft error is pending) in
+ *  sk_err, reports it, purges the write queue and calls tcp_done().
+ */
+static void tcp_write_err(struct sock *sk)
+{
+	WRITE_ONCE(sk->sk_err, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT);
+	sk_error_report(sk);
+
+	tcp_write_queue_purge(sk);
+	tcp_done(sk);
+	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
+}
+
+/**
+ *  tcp_out_of_resources() - Close socket if out of resources
+ *  @sk:        pointer to current socket
+ *  @do_reset:  send a last packet with reset flag
+ *
+ *  Do not allow orphaned sockets to eat all our resources. This is a direct
+ *  violation of TCP specs, but it is required to prevent DoS attacks. Called
+ *  when a retransmission or zero probe timeout occurs on an orphaned socket.
+ *
+ *  Also close if our net namespace is exiting; in that case there is no
+ *  hope of ever communicating again since all netns interfaces are already
+ *  down (or about to be down), and we need to release our dst references,
+ *  which have been moved to the netns loopback interface, so the namespace
+ *  can finish exiting.  This condition is only possible if we are a kernel
+ *  socket, as those do not hold references to the namespace.
+ *
+ *  The criteria are still not confirmed experimentally and may change.
+ *  We kill the socket, if:
+ *  1. The number of orphaned sockets exceeds the administratively set limit.
+ *  2. We have strong memory pressure.
+ *  3. Our net namespace is exiting.
+ *
+ *  Return: 1 if the socket was closed here, 0 otherwise.
+ */
+static int tcp_out_of_resources(struct sock *sk, bool do_reset)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int shift = 0;
+
+	/* If peer does not open window for long time, or did not transmit
+	 * anything for long time, penalize it. */
+	if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
+		shift++;
+
+	/* If some dubious ICMP arrived, penalize even more. */
+	if (READ_ONCE(sk->sk_err_soft))
+		shift++;
+
+	if (tcp_check_oom(sk, shift)) {
+		/* Catch exceptional cases, when connection requires reset.
+		 *      1. Last segment was sent recently. */
+		if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
+		    /*  2. Window is closed. */
+		    (!tp->snd_wnd && !tp->packets_out))
+			do_reset = true;
+		if (do_reset)
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+		tcp_done(sk);
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
+		return 1;
+	}
+
+	if (!check_net(sock_net(sk))) {
+		/* Not possible to send reset; just close */
+		tcp_done(sk);
+		return 1;
+	}
+
+	return 0;
+}
+
+/**
+ *  tcp_orphan_retries() - Returns maximal number of retries on an orphaned socket
+ *  @sk:    Pointer to the current socket.
+ *  @alive: bool, socket alive state
+ */
+static int tcp_orphan_retries(struct sock *sk, bool alive)
+{
+	int budget = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries); /* May be zero. */
+
+	/* A socket that sent something recently gets a safe number of
+	 * retries: 8 corresponds to >100 seconds with minimal RTO of 200msec.
+	 */
+	if (alive)
+		return budget ? : 8;
+
+	/* Not alive, and we know from an ICMP that something is wrong. */
+	if (READ_ONCE(sk->sk_err_soft))
+		return 0;
+	return budget;
+}
+
+static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
+{
+	const struct net *net = sock_net(sk);
+	int low;
+
+	/* Black hole detection only runs when sysctl_tcp_mtu_probing is set */
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing))
+		return;
+
+	if (icsk->icsk_mtup.enabled) {
+		low = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
+		low = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), low);
+		low = max(low, READ_ONCE(net->ipv4.sysctl_tcp_mtu_probe_floor));
+		low = max(low, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss));
+		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, low);
+	} else {
+		icsk->icsk_mtup.enabled = 1;
+		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
+	}
+	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+}
+
+static unsigned int tcp_model_timeout(struct sock *sk,
+				      unsigned int boundary,
+				      unsigned int rto_base)
+{
+	unsigned int exp_steps = ilog2(TCP_RTO_MAX / rto_base);
+	unsigned int total;
+
+	/* Doubling phase first; every retry past it costs a full TCP_RTO_MAX */
+	total = ((2 << min(boundary, exp_steps)) - 1) * rto_base;
+	if (boundary > exp_steps)
+		total += (boundary - exp_steps) * TCP_RTO_MAX;
+
+	return jiffies_to_msecs(total);
+}
+/**
+ *  retransmits_timed_out() - returns true if this connection has timed out
+ *  @sk:       The current socket
+ *  @boundary: max number of retransmissions
+ *  @timeout:  A custom timeout value.
+ *             If set to 0 the default timeout is calculated and used.
+ *             Using TCP_RTO_MIN and the number of unsuccessful retransmits.
+ *
+ * The default "timeout" value this function can calculate and use
+ * is equivalent to the timeout of a TCP Connection
+ * after "boundary" unsuccessful, exponentially backed-off
+ * retransmissions with an initial RTO of TCP_RTO_MIN.
+ */
+static bool retransmits_timed_out(struct sock *sk,
+				  unsigned int boundary,
+				  unsigned int timeout)
+{
+	unsigned int rto_base, first_rtx;
+
+	if (!inet_csk(sk)->icsk_retransmits)
+		return false;
+
+	first_rtx = tcp_sk(sk)->retrans_stamp;
+	if (likely(!timeout)) {
+		/* No custom timeout: model one from the back-off schedule */
+		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+			rto_base = tcp_timeout_init(sk);
+		else
+			rto_base = TCP_RTO_MIN;
+		timeout = tcp_model_timeout(sk, boundary, rto_base);
+	}
+	return (s32)(tcp_time_stamp(tcp_sk(sk)) - first_rtx - timeout) >= 0;
+}
+
+/* A write timeout has occurred. Returns 1 if the socket was closed, else 0. */
+static int tcp_write_timeout(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	bool expired = false, do_reset;
+	int retry_until, max_retransmits;
+
+	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		if (icsk->icsk_retransmits)
+			__dst_negative_advice(sk);
+		/* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */
+		retry_until = READ_ONCE(icsk->icsk_syn_retries) ? :
+			READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
+
+		max_retransmits = retry_until;
+		if (sk->sk_state == TCP_SYN_SENT)
+			max_retransmits += READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts);
+
+		expired = icsk->icsk_retransmits >= max_retransmits;
+	} else {
+		if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) {
+			/* Black hole detection once tcp_retries1 is exceeded */
+			tcp_mtu_probing(icsk, sk);
+
+			__dst_negative_advice(sk);
+		}
+
+		retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2);
+		if (sock_flag(sk, SOCK_DEAD)) {
+			const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
+
+			retry_until = tcp_orphan_retries(sk, alive);
+			do_reset = alive ||
+				!retransmits_timed_out(sk, retry_until, 0);
+
+			if (tcp_out_of_resources(sk, do_reset))
+				return 1;
+		}
+	}
+	if (!expired)	/* handshake count not exhausted, or not a handshake state */
+		expired = retransmits_timed_out(sk, retry_until,
+						READ_ONCE(icsk->icsk_user_timeout));
+	tcp_fastopen_active_detect_blackhole(sk, expired);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
+		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
+				  icsk->icsk_retransmits,
+				  icsk->icsk_rto, (int)expired);
+
+	if (expired) {
+		/* Has it gone just too far? */
+		tcp_write_err(sk);
+		return 1;
+	}
+
+	if (sk_rethink_txhash(sk)) {
+		tp->timeout_rehash++;
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
+	}
+
+	return 0;
+}
+
+/* Delayed ACK timer work. Called with BH disabled */
+void tcp_delack_timer_handler(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+		return;
+
+	/* SACK compression case: send that ACK and do nothing else */
+	if (tp->compressed_ack) {
+		tcp_mstamp_refresh(tp);
+		tcp_sack_compress_send_ack(sk);
+		return;
+	}
+
+	if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+		return;
+
+	if (time_after(icsk->icsk_ack.timeout, jiffies)) {	/* not due yet: re-arm */
+		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
+		return;
+	}
+	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
+
+	if (inet_csk_ack_scheduled(sk)) {
+		if (!inet_csk_in_pingpong_mode(sk)) {
+			/* Delayed ACK missed: double ATO, capped at the RTO. */
+			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
+		} else {
+			/* Delayed ACK missed: leave pingpong mode and
+			 * deflate ATO.
+			 */
+			inet_csk_exit_pingpong_mode(sk);
+			icsk->icsk_ack.ato      = TCP_ATO_MIN;
+		}
+		tcp_mstamp_refresh(tp);
+		tcp_send_ack(sk);
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
+	}
+}
+
+
+/**
+ *  tcp_delack_timer() - The TCP delayed ACK timeout handler
+ *  @t:  Pointer to the timer, embedded in the socket's inet_connection_sock.
+ *
+ *  This function gets (indirectly) called when the kernel timer for a TCP packet
+ *  of this socket expires. Calls tcp_delack_timer_handler() to do the actual work,
+ *  unless the socket is owned by user context: then the work is deferred.
+ *  Drops one socket reference before returning.
+ */
+static void tcp_delack_timer(struct timer_list *t)
+{
+	struct inet_connection_sock *icsk =
+			from_timer(icsk, t, icsk_delack_timer);
+	struct sock *sk = &icsk->icsk_inet.sk;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk)) {
+		tcp_delack_timer_handler(sk);
+	} else {
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+		/* delegate our work to tcp_release_cb() */
+		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
+			sock_hold(sk);
+	}
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+static void tcp_probe_timer(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *skb = tcp_send_head(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int max_probes;
+
+	if (tp->packets_out || !skb) {	/* reset the probe bookkeeping */
+		icsk->icsk_probes_out = 0;
+		icsk->icsk_probes_tstamp = 0;
+		return;
+	}
+
+	/* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
+	 * long as the receiver continues to respond to probes. We support this
+	 * by default and reset icsk_probes_out with incoming ACKs. But if the
+	 * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we
+	 * kill the socket when the retry count and the time exceeds the
+	 * corresponding system limit. We also implement similar policy when
+	 * we use RTO to probe window in tcp_retransmit_timer().
+	 */
+	if (!icsk->icsk_probes_tstamp) {
+		icsk->icsk_probes_tstamp = tcp_jiffies32;
+	} else {
+		u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+
+		if (user_timeout &&
+		    (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
+		     msecs_to_jiffies(user_timeout))
+		goto abort;
+	}
+	max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
+	if (sock_flag(sk, SOCK_DEAD)) {
+		const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
+
+		max_probes = tcp_orphan_retries(sk, alive);
+		if (!alive && icsk->icsk_backoff >= max_probes)
+			goto abort;
+		if (tcp_out_of_resources(sk, true))
+			return;
+	}
+
+	if (icsk->icsk_probes_out >= max_probes) {
+abort:		tcp_write_err(sk);	/* also reached by the gotos above */
+	} else {
+		/* Only send another probe if we didn't close things up. */
+		tcp_send_probe0(sk);
+	}
+}
+
+/*
+ *	Timer for Fast Open socket to retransmit SYNACK. Note that the
+ *	sk here is the child socket, not the parent (listener) socket.
+ */
+static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int max_retries;
+
+	req->rsk_ops->syn_ack_timeout(req);
+
+	/* Add one more retry for fastopen; '+ 1' applies to the sysctl
+	 * fallback only. Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
+	 */
+	max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
+		READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;
+
+	if (req->num_timeout >= max_retries) {
+		tcp_write_err(sk);
+		return;
+	}
+	/* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */
+	if (icsk->icsk_retransmits == 1)
+		tcp_enter_loss(sk);
+	/* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
+	 * returned from rtx_syn_ack() to make it more persistent like
+	 * regular retransmit because if the child socket has been accepted
+	 * it's not good to give up too easily.
+	 */
+	inet_rtx_syn_ack(sk, req);
+	req->num_timeout++;
+	icsk->icsk_retransmits++;
+	if (!tp->retrans_stamp)
+		tp->retrans_stamp = tcp_time_stamp(tp);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+			  req->timeout << req->num_timeout, TCP_RTO_MAX);
+}
+
+static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
+				     const struct sk_buff *skb)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const int timeout = TCP_RTO_MAX * 2;	/* give-up threshold, in jiffies */
+	u32 rtx_delta;
+	s32 rcv_delta;
+
+	/* Signed on purpose: the timer may run late and rcv_tstamp may have
+	 * been written after icsk_timeout, so the delta can be negative.
+	 */
+	rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
+	if (rcv_delta <= timeout)
+		return false;
+	rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) -
+			(tp->retrans_stamp ?: tcp_skb_timestamp(skb)));
+	return rtx_delta > timeout;
+}
+
+/**
+ *  tcp_retransmit_timer() - The TCP retransmit timeout handler
+ *  @sk:  Pointer to the current socket.
+ *
+ *  This function gets called when the kernel timer for a TCP packet
+ *  of this socket expires.
+ *
+ *  It handles retransmission, timer adjustment and other necessary measures.
+ *  A pending Fast Open request only gets its SYN-ACK retransmitted; with a
+ *  zero send window on a live socket the retransmit acts as a window probe.
+ */
+void tcp_retransmit_timer(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock *req;
+	struct sk_buff *skb;
+
+	req = rcu_dereference_protected(tp->fastopen_rsk,
+					lockdep_sock_is_held(sk));
+	if (req) {
+		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
+			     sk->sk_state != TCP_FIN_WAIT1);
+		tcp_fastopen_synack_timer(sk, req);
+		/* Before we receive ACK to our SYN-ACK don't retransmit
+		 * anything else (e.g., data or FIN segments).
+		 */
+		return;
+	}
+
+	if (!tp->packets_out)
+		return;
+
+	skb = tcp_rtx_queue_head(sk);
+	if (WARN_ON_ONCE(!skb))
+		return;
+
+	tp->tlp_high_seq = 0;
+
+	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
+	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
+		/* Receiver dastardly shrinks window. Our retransmits
+		 * become zero probes, but we should not timeout this
+		 * connection. If the socket is an orphan, time it out,
+		 * we cannot allow such beasts to hang infinitely.
+		 */
+		struct inet_sock *inet = inet_sk(sk);
+		u32 rtx_delta;
+
+		rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp(skb));
+		if (sk->sk_family == AF_INET) {
+			net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
+				&inet->inet_daddr, ntohs(inet->inet_dport),
+				inet->inet_num, tp->snd_una, tp->snd_nxt,
+				jiffies_to_msecs(jiffies - tp->rcv_tstamp),
+				rtx_delta);
+		}
+#if IS_ENABLED(CONFIG_IPV6)
+		else if (sk->sk_family == AF_INET6) {
+			net_dbg_ratelimited("Probing zero-window on %pI6:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
+				&sk->sk_v6_daddr, ntohs(inet->inet_dport),
+				inet->inet_num, tp->snd_una, tp->snd_nxt,
+				jiffies_to_msecs(jiffies - tp->rcv_tstamp),
+				rtx_delta);
+		}
+#endif
+		if (tcp_rtx_probe0_timed_out(sk, skb)) {
+			tcp_write_err(sk);
+			goto out;
+		}
+		tcp_enter_loss(sk);
+		tcp_retransmit_skb(sk, skb, 1);
+		__sk_dst_reset(sk);
+		goto out_reset_timer;
+	}
+
+	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
+	if (tcp_write_timeout(sk))
+		goto out;
+
+	if (icsk->icsk_retransmits == 0) {
+		int mib_idx = 0;	/* 0: no failure counter applies */
+
+		if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+			if (tcp_is_sack(tp))
+				mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
+			else
+				mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
+		} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
+			mib_idx = LINUX_MIB_TCPLOSSFAILURES;
+		} else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
+			   tp->sacked_out) {
+			if (tcp_is_sack(tp))
+				mib_idx = LINUX_MIB_TCPSACKFAILURES;
+			else
+				mib_idx = LINUX_MIB_TCPRENOFAILURES;
+		}
+		if (mib_idx)
+			__NET_INC_STATS(sock_net(sk), mib_idx);
+	}
+
+	tcp_enter_loss(sk);
+
+	icsk->icsk_retransmits++;
+	if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
+		/* Retransmission failed because of local congestion,
+		 * Let senders fight for local resources conservatively.
+		 */
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  TCP_RESOURCE_PROBE_INTERVAL,
+					  TCP_RTO_MAX);
+		goto out;
+	}
+
+	/* Increase the timeout each time we retransmit.  Note that
+	 * we do not increase the rtt estimate.  rto is initialized
+	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
+	 * that doubling rto each time is the least we can get away with.
+	 * In KA9Q, Karn uses this for the first few times, and then
+	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
+	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
+	 * defined in the protocol as the maximum possible RTT.  I guess
+	 * we'll have to use something other than TCP to talk to the
+	 * University of Mars.
+	 *
+	 * PAWS allows us longer timeouts and large windows, so once
+	 * implemented ftp to mars will work nicely. We will have to fix
+	 * the 120 second clamps though!
+	 */
+	icsk->icsk_backoff++;
+
+out_reset_timer:
+	/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+	 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+	 * might be increased if the stream oscillates between thin and thick,
+	 * thus the old value might already be too high compared to the value
+	 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
+	 * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
+	 * exponential backoff behaviour to avoid continuing to hammer
+	 * linear-timeout retransmissions into a black hole
+	 */
+	if (sk->sk_state == TCP_ESTABLISHED &&
+	    (tp->thin_lto || READ_ONCE(net->ipv4.sysctl_tcp_thin_linear_timeouts)) &&
+	    tcp_stream_is_thin(tp) &&
+	    icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
+		icsk->icsk_backoff = 0;
+		icsk->icsk_rto = clamp(__tcp_set_rto(tp),
+				       tcp_rto_min(sk),
+				       TCP_RTO_MAX);
+	} else if (sk->sk_state != TCP_SYN_SENT ||
+		   icsk->icsk_backoff >
+		   READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) {
+		/* Use normal (exponential) backoff unless linear timeouts are
+		 * activated.
+		 */
+		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+	}
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
+	if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0))
+		__sk_dst_reset(sk);
+
+out:;
+}
+
+/* Runs the action of whichever xmit timer event is pending on the socket.
+ * Called with bottom-half processing disabled, by tcp_write_timer().
+ */
+void tcp_write_timer_handler(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+		return;
+	if (!icsk->icsk_pending)
+		return;
+
+	/* Expiry still lies ahead: re-arm for it instead of acting now. */
+	if (time_after(icsk->icsk_timeout, jiffies)) {
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
+		return;
+	}
+
+	tcp_mstamp_refresh(tcp_sk(sk));
+	switch (icsk->icsk_pending) {
+	case ICSK_TIME_REO_TIMEOUT:
+		tcp_rack_reo_timeout(sk);
+		break;
+	case ICSK_TIME_LOSS_PROBE:
+		tcp_send_loss_probe(sk);
+		break;
+	case ICSK_TIME_RETRANS:
+		icsk->icsk_pending = 0;
+		tcp_retransmit_timer(sk);
+		break;
+	case ICSK_TIME_PROBE0:
+		icsk->icsk_pending = 0;
+		tcp_probe_timer(sk);
+		break;
+	}
+}
+
+static void tcp_write_timer(struct timer_list *t)
+{
+	struct inet_connection_sock *icsk =
+			from_timer(icsk, t, icsk_retransmit_timer);
+	struct sock *sk = &icsk->icsk_inet.sk;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Owner is busy: delegate our work to tcp_release_cb() */
+		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
+			sock_hold(sk);
+	} else {
+		tcp_write_timer_handler(sk);
+	}
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+void tcp_syn_ack_timeout(const struct request_sock *req)
+{
+	/* Count the timeout in the netns recorded in the request itself. */
+	__NET_INC_STATS(read_pnet(&inet_rsk(req)->ireq_net),
+			LINUX_MIB_TCPTIMEOUTS);
+}
+EXPORT_SYMBOL(tcp_syn_ack_timeout);
+
+void tcp_set_keepalive(struct sock *sk, int val)
+{
+	/* Closed and listening sockets are left untouched. */
+	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+		return;
+	if (!val)
+		inet_csk_delete_keepalive_timer(sk);
+	else if (!sock_flag(sk, SOCK_KEEPOPEN))
+		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+}
+EXPORT_SYMBOL_GPL(tcp_set_keepalive);
+
+
+static void tcp_keepalive_timer (struct timer_list *t)
+{
+	struct sock *sk = from_timer(sk, t, sk_timer);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 elapsed;
+
+	/* Only process if socket is not in use. */
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Try again later. */
+		inet_csk_reset_keepalive_timer (sk, HZ/20);
+		goto out;
+	}
+
+	if (sk->sk_state == TCP_LISTEN) {
+		pr_err("Hmm... keepalive on a LISTEN ???\n");
+		goto out;
+	}
+
+	tcp_mstamp_refresh(tp);
+	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
+		if (READ_ONCE(tp->linger2) >= 0) {
+			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
+
+			if (tmo > 0) {
+				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+				goto out;
+			}
+		}
+		tcp_send_active_reset(sk, GFP_ATOMIC);
+		goto death;
+	}
+
+	if (!sock_flag(sk, SOCK_KEEPOPEN) ||
+	    ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)))
+		goto out;
+
+	elapsed = keepalive_time_when(tp);
+
+	/* It is alive without keepalive 8) (data in flight or still queued) */
+	if (tp->packets_out || !tcp_write_queue_empty(sk))
+		goto resched;
+
+	elapsed = keepalive_time_elapsed(tp);
+
+	if (elapsed >= keepalive_time_when(tp)) {
+		u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+
+		/* If the TCP_USER_TIMEOUT option is enabled, use that
+		 * to determine when to timeout instead.
+		 */
+		if ((user_timeout != 0 &&
+		    elapsed >= msecs_to_jiffies(user_timeout) &&
+		    icsk->icsk_probes_out > 0) ||
+		    (user_timeout == 0 &&
+		    icsk->icsk_probes_out >= keepalive_probes(tp))) {
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+			tcp_write_err(sk);
+			goto out;
+		}
+		if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
+			icsk->icsk_probes_out++;
+			elapsed = keepalive_intvl_when(tp);
+		} else {
+			/* If keepalive was lost due to local congestion,
+			 * try harder.
+			 */
+			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
+		}
+	} else {
+		/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
+		elapsed = keepalive_time_when(tp) - elapsed;
+	}
+
+resched:
+	inet_csk_reset_keepalive_timer (sk, elapsed);
+	goto out;
+
+death:
+	tcp_done(sk);
+
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);	/* every exit path drops one socket reference */
+}
+
+static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
+{
+	struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer);
+	struct sock *sk = (struct sock *)tp;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Socket is busy: leave the ACK to the deferred delack work,
+		 * taking a reference unless that work is already flagged.
+		 */
+		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
+			sock_hold(sk);
+	} else if (tp->compressed_ack) {
+		/* One ACK has to go out in the end, so take it off the
+		 * compressed count to keep LINUX_MIB_TCPACKCOMPRESSED
+		 * accurate.
+		 */
+		tp->compressed_ack--;
+		tcp_send_ack(sk);
+	}
+	bh_unlock_sock(sk);
+
+	sock_put(sk);
+
+	return HRTIMER_NORESTART;
+}
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+				  &tcp_keepalive_timer);
+	/* Pacing is armed with absolute expiries, ACK compression with relative */
+	hrtimer_init(&tp->pacing_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_SOFT);
+	tp->pacing_timer.function = tcp_pace_kick;
+	hrtimer_init(&tp->compressed_ack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_SOFT);
+	tp->compressed_ack_timer.function = tcp_compressed_ack_kick;
+}
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
new file mode 100644
index 0000000000..2aa4421286
--- /dev/null
+++ b/net/ipv4/tcp_ulp.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Pluggable TCP upper layer protocol support.
+ *
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <net/tcp.h>
+
+static DEFINE_SPINLOCK(tcp_ulp_list_lock);
+static LIST_HEAD(tcp_ulp_list);
+
+/* Simple linear search, don't expect many entries! */
+static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
+{
+	struct tcp_ulp_ops *ops;
+
+	/* Callers hold either rcu_read_lock() or tcp_ulp_list_lock. */
+	list_for_each_entry_rcu(ops, &tcp_ulp_list, list,
+				lockdep_is_held(&tcp_ulp_list_lock))
+		if (!strcmp(ops->name, name))
+			return ops;
+
+	return NULL;
+}
+
+static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
+{
+	const struct tcp_ulp_ops *ulp;
+
+	rcu_read_lock();
+	ulp = tcp_ulp_find(name);
+#ifdef CONFIG_MODULES
+	if (!ulp && capable(CAP_NET_ADMIN)) {
+		/* request_module() may sleep: leave the RCU section around it */
+		rcu_read_unlock();
+		request_module("tcp-ulp-%s", name);
+		rcu_read_lock();
+		ulp = tcp_ulp_find(name);
+	}
+#endif
+	if (ulp && !try_module_get(ulp->owner))
+		ulp = NULL;
+
+	rcu_read_unlock();
+	return ulp;
+}
+
+/* Attach new upper layer protocol to the list
+ * of available protocols.
+ */
+int tcp_register_ulp(struct tcp_ulp_ops *ulp)
+{
+	int ret = -EEXIST;
+
+	/* Names are unique: registering an already taken one is refused. */
+	spin_lock(&tcp_ulp_list_lock);
+	if (!tcp_ulp_find(ulp->name)) {
+		list_add_tail_rcu(&ulp->list, &tcp_ulp_list);
+		ret = 0;
+	}
+	spin_unlock(&tcp_ulp_list_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_ulp);
+
+void tcp_unregister_ulp(struct tcp_ulp_ops *ulp)
+{
+	spin_lock(&tcp_ulp_list_lock);
+	list_del_rcu(&ulp->list);
+	spin_unlock(&tcp_ulp_list_lock);
+
+	synchronize_rcu();	/* wait out RCU readers that may still see @ulp */
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_ulp);
+
+/* Build string with list of available upper layer protocol values */
+void tcp_get_available_ulp(char *buf, size_t maxlen)
+{
+	struct tcp_ulp_ops *ops;
+	size_t used = 0;
+
+	*buf = '\0';
+	rcu_read_lock();
+	list_for_each_entry_rcu(ops, &tcp_ulp_list, list) {
+		const char *sep = used == 0 ? "" : " ";
+
+		used += snprintf(buf + used, maxlen - used, "%s%s", sep, ops->name);
+		/* snprintf() reports the untruncated length: stop once full */
+		if (WARN_ON_ONCE(used >= maxlen))
+			break;
+	}
+	rcu_read_unlock();
+}
+
+void tcp_update_ulp(struct sock *sk, struct proto *proto,
+		    void (*write_space)(struct sock *sk))
+{
+	const struct tcp_ulp_ops *ops = inet_csk(sk)->icsk_ulp_ops;
+
+	if (ops->update)	/* the update hook is optional */
+		ops->update(sk, proto, write_space);
+}
+
+void tcp_cleanup_ulp(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	/* sock_owned_by_me() is deliberately not checked: by the time
+	 * the stack calls this function, the socket is dead and
+	 * about to be destroyed.
+	 */
+	if (icsk->icsk_ulp_ops) {
+		/* Optional release hook first, then drop the module reference */
+		if (icsk->icsk_ulp_ops->release)
+			icsk->icsk_ulp_ops->release(sk);
+		module_put(icsk->icsk_ulp_ops->owner);
+
+		icsk->icsk_ulp_ops = NULL;
+	}
+}
+
+static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int err;
+
+	err = -EEXIST;	/* a ULP is already attached to this socket */
+	if (icsk->icsk_ulp_ops)
+		goto out_err;
+
+	if (sk->sk_socket)
+		clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+
+	err = -ENOTCONN;	/* a listener needs a ULP that provides ->clone */
+	if (!ulp_ops->clone && sk->sk_state == TCP_LISTEN)
+		goto out_err;
+
+	err = ulp_ops->init(sk);
+	if (err)
+		goto out_err;
+
+	icsk->icsk_ulp_ops = ulp_ops;
+	return 0;
+out_err:
+	module_put(ulp_ops->owner);	/* drop the reference the caller passed in */
+	return err;
+}
+
+int tcp_set_ulp(struct sock *sk, const char *name)
+{
+	const struct tcp_ulp_ops *found;
+
+	sock_owned_by_me(sk);
+
+	/* A successful lookup holds a module reference; from here on
+	 * __tcp_set_ulp() keeps it on success and drops it on failure.
+	 */
+	found = __tcp_ulp_find_autoload(name);
+	return found ? __tcp_set_ulp(sk, found) : -ENOENT;
+}
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 0000000000..786848ad37
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * TCP Vegas congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ *    Lawrence S. Brakmo and Larry L. Peterson.
+ *    "TCP Vegas: End to end congestion avoidance on a global internet."
+ *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ *    October 1995. Available from:
+ *	ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ *   o We do not change the loss detection or recovery mechanisms of
+ *     Linux in any way. Linux already recovers from losses quite well,
+ *     using fine-grained timers, NewReno, and FACK.
+ *   o To avoid the performance penalty imposed by increasing cwnd
+ *     only every-other RTT during slow start, we increase during
+ *     every RTT during slow start, just like Reno.
+ *   o Largely to allow continuous cwnd growth during slow start,
+ *     we use the rate at which ACKs come back as the "actual"
+ *     rate, rather than the rate at which data is sent.
+ *   o To speed convergence to the right rate, we set the cwnd
+ *     to achieve the right ("actual") rate when we exit slow start.
+ *   o To filter out the noise caused by delayed ACKs, we use the
+ *     minimum RTT sample observed during the last RTT to calculate
+ *     the actual rate.
+ *   o When the sender re-starts from idle, it waits until it has
+ *     received ACKs for an entire flight of new data before making
+ *     a cwnd adjustment decision. The original Vegas implementation
+ *     assumed senders never went idle.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+#include "tcp_vegas.h"
+
+static int alpha = 2;
+static int beta  = 4;
+static int gamma = 1;
+
+module_param(alpha, int, 0644);
+MODULE_PARM_DESC(alpha, "lower bound of packets in network");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "upper bound of packets in network");
+module_param(gamma, int, 0644);
+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
+
+/* There are several situations when we must "re-start" Vegas:
+ *
+ *  o when a connection is established
+ *  o after an RTO
+ *  o after fast recovery
+ *  o when we send a packet and there is no outstanding
+ *    unacknowledged data (restarting an idle connection)
+ *
+ * In these circumstances we cannot do a Vegas calculation at the
+ * end of the first RTT, because any calculation we do is using
+ * stale info -- both the saved cwnd and congestion feedback are
+ * stale.
+ *
+ * Instead we must wait until the completion of an RTT during
+ * which we actually receive ACKs.
+ */
+static void vegas_enable(struct sock *sk)
+{
+	struct vegas *ca = inet_csk_ca(sk);
+
+	/* Forget the samples collected so far. */
+	ca->cntRTT = 0;
+	ca->minRTT = 0x7fffffff;
+
+	/* The next send window begins at the current snd_nxt. */
+	ca->beg_snd_nxt = tcp_sk(sk)->snd_nxt;
+
+	/* Begin taking Vegas samples next time we send something. */
+	ca->doing_vegas_now = 1;
+}
+
+/* Suspend Vegas; tcp_vegas_cong_avoid() acts like Reno while it is off. */
+static inline void vegas_disable(struct sock *sk)
+{
+	struct vegas *ca = inet_csk_ca(sk);
+
+	ca->doing_vegas_now = 0;
+}
+
+void tcp_vegas_init(struct sock *sk)
+{
+	struct vegas *ca = inet_csk_ca(sk);
+
+	ca->baseRTT = 0x7fffffff;	/* no base RTT measured yet */
+	vegas_enable(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_init);
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ *   o min-filter RTT samples from within an RTT to get the current
+ *     propagation delay + queuing delay (we are min-filtering to try to
+ *     avoid the effects of delayed ACKs)
+ *   o min-filter RTT samples from a much longer window (forever for now)
+ *     to find the propagation delay (baseRTT)
+ */
+void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample)
+{
+	struct vegas *ca = inet_csk_ca(sk);
+	u32 sample_rtt;
+
+	/* A negative rtt_us is not a usable sample. */
+	if (sample->rtt_us < 0)
+		return;
+
+	/* Bias by one so that neither rtt nor baseRTT can be zero. */
+	sample_rtt = sample->rtt_us + 1;
+
+	/* Long-term minimum: estimate of the propagation delay. */
+	ca->baseRTT = min(ca->baseRTT, sample_rtt);
+
+	/* Minimum within this RTT: propagation plus queuing delay. */
+	if (sample_rtt < ca->minRTT)
+		ca->minRTT = sample_rtt;
+
+	ca->cntRTT++;
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked);
+
+void tcp_vegas_state(struct sock *sk, u8 ca_state)
+{
+	if (ca_state != TCP_CA_Open)	/* sample only in the Open state */
+		vegas_disable(sk);
+	else
+		vegas_enable(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_state);
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Vegas calculations
+ * until we get fresh RTT samples.  So when we
+ * restart, we reset our Vegas state to a clean
+ * slate. After we get acks for this flight of
+ * packets, _then_ we can make Vegas calculations
+ * again.
+ */
+void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	if (event != CA_EVENT_CWND_RESTART && event != CA_EVENT_TX_START)
+		return;		/* every other event is ignored */
+	tcp_vegas_init(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
+
+static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
+{
+	return min(tcp_snd_cwnd(tp), tp->snd_ssthresh);	/* never above cwnd */
+}
+
+/* cong_avoid hook: performs the Vegas cwnd adjustment once per RTT. */
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct vegas *vegas = inet_csk_ca(sk);
+
+	if (!vegas->doing_vegas_now) {
+		tcp_reno_cong_avoid(sk, ack, acked);
+		return;
+	}
+
+	if (after(ack, vegas->beg_snd_nxt)) {
+		/* Do the Vegas once-per-RTT cwnd adjustment. */
+
+		/* Save the extent of the current window so we can use this
+		 * at the end of the next RTT.
+		 */
+		vegas->beg_snd_nxt  = tp->snd_nxt;
+
+		/* We do the Vegas calculations only if we got enough RTT
+		 * samples that we can be reasonably sure that we got
+		 * at least one RTT sample that wasn't from a delayed ACK.
+		 * If we only had 2 samples total,
+		 * then that means we're getting only 1 ACK per RTT, which
+		 * means they're almost certainly delayed ACKs.
+		 * If  we have 3 samples, we should be OK.
+		 */
+
+		if (vegas->cntRTT <= 2) {
+			/* We don't have enough RTT samples to do the Vegas
+			 * calculation, so we'll behave like Reno.
+			 */
+			tcp_reno_cong_avoid(sk, ack, acked);
+		} else {
+			u32 rtt, diff;
+			u64 target_cwnd, excess;
+
+			/* We have enough RTT samples, so, using the Vegas
+			 * algorithm, we determine if we should increase or
+			 * decrease cwnd, and by how much.
+			 */
+
+			/* Pluck out the RTT we are using for the Vegas
+			 * calculations. This is the min RTT seen during the
+			 * last RTT. Taking the min filters out the effects
+			 * of delayed ACKs, at the cost of noticing congestion
+			 * a bit later.
+			 */
+			rtt = vegas->minRTT;
+
+			/* Calculate the cwnd we should have, if we weren't
+			 * going too fast.
+			 *
+			 * This is:
+			 *     (actual rate in segments) * baseRTT
+			 */
+			target_cwnd = (u64)tcp_snd_cwnd(tp) * vegas->baseRTT;
+			do_div(target_cwnd, rtt);
+
+			/* "Diff" from the Arizona Vegas papers: the window we
+			 * had vs. the one we want. 64-bit math avoids overflow.
+			 */
+			excess = (u64)tcp_snd_cwnd(tp) * (rtt - vegas->baseRTT);
+			do_div(excess, vegas->baseRTT);
+			diff = min_t(u64, excess, U32_MAX);
+
+			if (diff > gamma && tcp_in_slow_start(tp)) {
+				/* Going too fast. Time to slow down
+				 * and switch to congestion avoidance.
+				 */
+
+				/* Set cwnd to match the actual rate
+				 * exactly:
+				 *   cwnd = (actual rate) * baseRTT
+				 * Then we add 1 because the integer
+				 * truncation robs us of full link
+				 * utilization.
+				 */
+				tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp),
+							 (u32)target_cwnd + 1));
+				tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
+
+			} else if (tcp_in_slow_start(tp)) {
+				/* Slow start.  */
+				tcp_slow_start(tp, acked);
+			} else {
+				/* Congestion avoidance. */
+
+				/* Figure out where we would like cwnd
+				 * to be.
+				 */
+				if (diff > beta) {
+					/* The old window was too fast, so
+					 * we slow down.
+					 */
+					tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
+					tp->snd_ssthresh
+						= tcp_vegas_ssthresh(tp);
+				} else if (diff < alpha) {
+					/* We don't have enough extra packets
+					 * in the network, so speed up.
+					 */
+					tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
+				} else {
+					/* Sending just as fast as we should be. */
+				}
+			}
+
+			if (tcp_snd_cwnd(tp) < 2)
+				tcp_snd_cwnd_set(tp, 2);
+			else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp)
+				tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp);
+
+			tp->snd_ssthresh = tcp_current_ssthresh(sk);
+		}
+
+		/* Wipe the slate clean for the next RTT. */
+		vegas->cntRTT = 0;
+		vegas->minRTT = 0x7fffffff;
+	}
+	/* Use normal slow start */
+	else if (tcp_in_slow_start(tp))
+		tcp_slow_start(tp, acked);
+}
+
+/* Extract Vegas info for the TCP socket info provided via netlink. */
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+			  union tcp_cc_info *info)
+{
+	const struct vegas *ca = inet_csk_ca(sk);
+
+	/* Nothing to report unless INET_DIAG_VEGASINFO was requested. */
+	if (!(ext & (1 << (INET_DIAG_VEGASINFO - 1))))
+		return 0;
+
+	info->vegas.tcpv_enabled = ca->doing_vegas_now;
+	info->vegas.tcpv_rttcnt = ca->cntRTT;
+	info->vegas.tcpv_rtt = ca->baseRTT;
+	info->vegas.tcpv_minrtt = ca->minRTT;
+	*attr = INET_DIAG_VEGASINFO;
+	return sizeof(struct tcpvegas_info);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
+
+static struct tcp_congestion_ops tcp_vegas __read_mostly = {
+	.name		= "vegas",
+	.owner		= THIS_MODULE,
+
+	.init		= tcp_vegas_init,
+	.set_state	= tcp_vegas_state,
+	.cwnd_event	= tcp_vegas_cwnd_event,
+	.pkts_acked	= tcp_vegas_pkts_acked,
+	.cong_avoid	= tcp_vegas_cong_avoid,
+	.ssthresh	= tcp_reno_ssthresh,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+	.get_info	= tcp_vegas_get_info,
+};
+
+static int __init tcp_vegas_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE);
+	/* Report a failed registration instead of claiming success. */
+	return tcp_register_congestion_control(&tcp_vegas);
+}
+
+static void __exit tcp_vegas_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_vegas);	/* undo tcp_vegas_register() */
+}
+
+module_init(tcp_vegas_register);
+module_exit(tcp_vegas_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Vegas");
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
new file mode 100644
index 0000000000..4f24d0e37d
--- /dev/null
+++ b/net/ipv4/tcp_vegas.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * TCP Vegas congestion control interface
+ */
+#ifndef __TCP_VEGAS_H
+#define __TCP_VEGAS_H 1
+
+/* Vegas per-connection state; also embedded as the first member of struct yeah */
+struct vegas {
+	u32	beg_snd_nxt;	/* right edge during last RTT */
+	u32	beg_snd_una;	/* left edge  during last RTT */
+	u32	beg_snd_cwnd;	/* saves the size of the cwnd */
+	u8	doing_vegas_now;/* if true, do vegas for this RTT */
+	u16	cntRTT;		/* # of RTT samples taken within last RTT */
+	u32	minRTT;		/* min of RTTs measured within last RTT (in usec) */
+	u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */
+};
+
+void tcp_vegas_init(struct sock *sk);
+void tcp_vegas_state(struct sock *sk, u8 ca_state);
+void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample);
+void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+			  union tcp_cc_info *info);
+
+#endif	/* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
new file mode 100644
index 0000000000..366ff6f214
--- /dev/null
+++ b/net/ipv4/tcp_veno.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * TCP Veno congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ *    C. P. Fu, S. C. Liew.
+ *    "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
+ *    IEEE Journal on Selected Areas in Communication,
+ *    Feb. 2003.
+ * 	See https://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+/* Default values of the Veno variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+static const int beta = 3 << V_PARAM_SHIFT;
+
+/* Veno per-connection state, accessed through inet_csk_ca(sk) */
+struct veno {
+	u8 doing_veno_now;	/* if true, do veno for this rtt */
+	u16 cntrtt;		/* # of rtt samples counted by tcp_veno_pkts_acked() */
+	u32 minrtt;		/* min of rtts measured within last rtt (in usec) */
+	u32 basertt;		/* the min of all Veno rtt measurements seen (in usec) */
+	u32 inc;		/* if set, the next "congestive" cwnd step may increase cwnd */
+	u32 diff;		/* cwnd minus target cwnd, both scaled by V_PARAM_SHIFT */
+};
+
+/* There are several situations when we must "re-start" Veno:
+ *
+ *  o when a connection is established
+ *  o after an RTO
+ *  o after fast recovery
+ *  o when we send a packet and there is no outstanding
+ *    unacknowledged data (restarting an idle connection)
+ *
+ */
+static inline void veno_enable(struct sock *sk)
+{
+	struct veno *ca = inet_csk_ca(sk);
+
+	/* start the min-rtt filter afresh */
+	ca->minrtt = 0x7fffffff;
+
+	ca->doing_veno_now = 1;	/* turn on Veno */
+}
+
+static inline void veno_disable(struct sock *sk)
+{
+	struct veno *ca = inet_csk_ca(sk);
+
+	/* Veno off: tcp_veno_cong_avoid() then defers to Reno */
+	ca->doing_veno_now = 0;
+}
+
+static void tcp_veno_init(struct sock *sk)
+{
+	struct veno *ca = inet_csk_ca(sk);
+
+	ca->inc = 1;
+	ca->basertt = 0x7fffffff;	/* no base rtt measured yet */
+	veno_enable(sk);
+}
+
+/* Do rtt sampling needed for Veno. */
+static void tcp_veno_pkts_acked(struct sock *sk,
+				const struct ack_sample *sample)
+{
+	struct veno *ca = inet_csk_ca(sk);
+	u32 sample_rtt;
+
+	/* A negative rtt_us is not a usable sample. */
+	if (sample->rtt_us < 0)
+		return;
+
+	/* Bias by one so that neither rtt nor basertt can be zero. */
+	sample_rtt = sample->rtt_us + 1;
+
+	/* Long-term minimum: estimate of the propagation delay. */
+	ca->basertt = min(ca->basertt, sample_rtt);
+
+	/* Minimum within this rtt: propagation plus queuing delay. */
+	if (sample_rtt < ca->minrtt)
+		ca->minrtt = sample_rtt;
+
+	ca->cntrtt++;
+}
+
+static void tcp_veno_state(struct sock *sk, u8 ca_state)
+{
+	if (ca_state != TCP_CA_Open)	/* Veno only runs in the Open state */
+		veno_disable(sk);
+	else
+		veno_enable(sk);
+}
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Veno calculations
+ * until we get fresh rtt samples.  So when we
+ * restart, we reset our Veno state to a clean
+ * state. After we get acks for this flight of
+ * packets, _then_ we can make Veno calculations
+ * again.
+ */
+static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	if (event == CA_EVENT_TX_START || event == CA_EVENT_CWND_RESTART)
+		tcp_veno_init(sk);	/* restart from a clean state */
+}
+
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct veno *veno = inet_csk_ca(sk);
+
+	if (!veno->doing_veno_now) {
+		tcp_reno_cong_avoid(sk, ack, acked);
+		return;
+	}
+
+	/* not cwnd-limited (limited by the application): leave cwnd alone */
+	if (!tcp_is_cwnd_limited(sk))
+		return;
+
+	/* We do the Veno calculations only if we got enough rtt samples */
+	if (veno->cntrtt <= 2) {
+		/* We don't have enough rtt samples to do the Veno
+		 * calculation, so we'll behave like Reno.
+		 */
+		tcp_reno_cong_avoid(sk, ack, acked);
+	} else {
+		u64 target_cwnd;
+		u32 rtt;
+
+		/* We have enough rtt samples, so, using the Veno
+		 * algorithm, we determine the state of the network.
+		 */
+
+		rtt = veno->minrtt;
+
+		target_cwnd = (u64)tcp_snd_cwnd(tp) * veno->basertt;
+		target_cwnd <<= V_PARAM_SHIFT;
+		do_div(target_cwnd, rtt);
+
+		veno->diff = (tcp_snd_cwnd(tp) << V_PARAM_SHIFT) - target_cwnd;
+
+		if (tcp_in_slow_start(tp)) {
+			/* Slow start; skip the rest if it used up all of acked. */
+			acked = tcp_slow_start(tp, acked);
+			if (!acked)
+				goto done;
+		}
+
+		/* Congestion avoidance. */
+		if (veno->diff < beta) {
+			/* In the "non-congestive state", increase cwnd
+			 * every rtt.
+			 */
+			tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
+		} else {
+			/* In the "congestive state", increase cwnd
+			 * every other rtt.
+			 */
+			if (tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) {
+				if (veno->inc &&
+				    tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp) {
+					tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
+					veno->inc = 0;
+				} else
+					veno->inc = 1;
+				tp->snd_cwnd_cnt = 0;
+			} else
+				tp->snd_cwnd_cnt += acked;
+		}
+done:
+		if (tcp_snd_cwnd(tp) < 2)
+			tcp_snd_cwnd_set(tp, 2);
+		else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp)
+			tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp);
+	}
+	/* Wipe the slate clean for the next rtt. */
+	/* NOTE(review): the cntrtt reset is disabled; only minrtt is reset. */
+	veno->minrtt = 0x7fffffff;
+}
+
+/* Veno MD phase: value for the .ssthresh hook, never below 2 */
+static u32 tcp_veno_ssthresh(struct sock *sk)
+{
+	struct veno *ca = inet_csk_ca(sk);
+	u32 cwnd = tcp_snd_cwnd(tcp_sk(sk));
+
+	/* "congestive state": cut cwnd by 1/2 */
+	if (ca->diff >= beta)
+		return max(cwnd >> 1U, 2U);
+
+	/* "non-congestive state": cut cwnd by 1/5 */
+	return max(cwnd * 4 / 5, 2U);
+}
+
+static struct tcp_congestion_ops tcp_veno __read_mostly = {
+	.name		= "veno",
+	.owner		= THIS_MODULE,
+
+	.init		= tcp_veno_init,
+	.set_state	= tcp_veno_state,
+	.cwnd_event	= tcp_veno_cwnd_event,
+	.pkts_acked	= tcp_veno_pkts_acked,
+	.cong_avoid	= tcp_veno_cong_avoid,
+	.ssthresh	= tcp_veno_ssthresh,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+};
+
+static int __init tcp_veno_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE);
+	/* Report a failed registration instead of claiming success. */
+	return tcp_register_congestion_control(&tcp_veno);
+}
+
+static void __exit tcp_veno_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_veno);	/* undo tcp_veno_register() */
+}
+
+module_init(tcp_veno_register);
+module_exit(tcp_veno_unregister);
+
+MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Veno");
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 0000000000..c6e97141ee
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * TCP Westwood+: end-to-end bandwidth estimation for TCP
+ *
+ *      Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4
+ *
+ * Support at http://c3lab.poliba.it/index.php/Westwood
+ * Main references in literature:
+ *
+ * - Mascolo S, Casetti, M. Gerla et al.
+ *   "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001
+ *
+ * - A. Grieco, s. Mascolo
+ *   "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer
+ *     Comm. Review, 2004
+ *
+ * - A. Dell'Aera, L. Grieco, S. Mascolo.
+ *   "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving :
+ *    A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004
+ *
+ * Westwood+ employs end-to-end bandwidth measurement to set cwnd and
+ * ssthresh after packet loss. The probing phase is as the original Reno.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <net/tcp.h>
+
+/* TCP Westwood per-connection state, accessed through inet_csk_ca(sk) */
+struct westwood {
+	u32    bw_ns_est;        /* first bandwidth estimation..not too smoothed 8) */
+	u32    bw_est;           /* bandwidth estimate */
+	u32    rtt_win_sx;       /* here starts a new evaluation... */
+	u32    bk;               /* acked bytes accumulated in the current window */
+	u32    snd_una;          /* used for evaluating the number of acked bytes */
+	u32    cumul_ack;        /* bytes credited for the ack being processed */
+	u32    accounted;        /* bytes already credited while handling dupacks */
+	u32    rtt;              /* latest RTT sample, in jiffies */
+	u32    rtt_min;          /* minimum observed RTT */
+	u8     first_ack;        /* flag which infers that this is the first ack */
+	u8     reset_rtt_min;    /* Reset RTT min to next RTT sample*/
+};
+
+/* TCP Westwood functions and constants */
+#define TCP_WESTWOOD_RTT_MIN   (HZ/20)	/* 50ms */
+#define TCP_WESTWOOD_INIT_RTT  (20*HZ)	/* maybe too conservative?! */
+
+/*
+ * @tcp_westwood_init
+ * This function initializes the fields used in TCP Westwood+.
+ * It is called after the initial SYN, so the sequence numbers
+ * are correct, but for new passive connections we have no
+ * information about RTTmin at this time, so we simply set it to
+ * TCP_WESTWOOD_INIT_RTT. This value was deliberately chosen to be too
+ * conservative, since that way we are sure it will be updated in a
+ * consistent way as soon as possible. That will reasonably happen
+ * within the first RTT period of the connection lifetime.
+ */
+static void tcp_westwood_init(struct sock *sk)
+{
+	struct westwood *w = inet_csk_ca(sk);
+
+	/* Start with an empty bandwidth filter and zeroed byte counters. */
+	w->bw_est = w->bw_ns_est = 0;
+	w->bk = w->cumul_ack = w->accounted = 0;
+	/* No RTT sample yet; rtt_min restarts from the next value of rtt. */
+	w->rtt = TCP_WESTWOOD_INIT_RTT;
+	w->rtt_min = TCP_WESTWOOD_INIT_RTT;
+	w->reset_rtt_min = 1;
+	w->first_ack = 1;
+	w->snd_una = tcp_sk(sk)->snd_una;
+	w->rtt_win_sx = tcp_jiffies32;
+}
+
+/*
+ * @westwood_do_filter
+ * Low-pass filter. Implemented using constant coefficients.
+ */
+static inline u32 westwood_do_filter(u32 a, u32 b)
+{
+	return (a * 7 + b) >> 3;	/* i.e. (7 * a + b) / 8 */
+}
+
+static void westwood_filter(struct westwood *w, u32 delta)
+{
+	u32 sample = w->bk / delta;	/* bandwidth sample of this window */
+
+	if (w->bw_ns_est || w->bw_est) {
+		w->bw_ns_est = westwood_do_filter(w->bw_ns_est, sample);
+		w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
+	} else {
+		w->bw_ns_est = w->bw_est = sample;	/* seed an empty filter */
+	}
+}
+
+/*
+ * @tcp_westwood_pkts_acked
+ * Called after processing a group of packets,
+ * but all Westwood needs is the last RTT sample.
+ */
+static void tcp_westwood_pkts_acked(struct sock *sk,
+				    const struct ack_sample *sample)
+{
+	struct westwood *ca = inet_csk_ca(sk);
+
+	if (sample->rtt_us > 0)		/* keep only the latest positive sample */
+		ca->rtt = usecs_to_jiffies(sample->rtt_us);
+}
+
+/*
+ * @westwood_update_window
+ * It updates RTT evaluation window if it is the right moment to do
+ * it. If so it calls filter for evaluating bandwidth.
+ */
+static void westwood_update_window(struct sock *sk)
+{
+	struct westwood *w = inet_csk_ca(sk);
+	s32 delta = tcp_jiffies32 - w->rtt_win_sx;
+	u32 min_span;
+
+	/* On the first ack, resync w->snd_una with tp->snd_una to fix the
+	 * mismatch between the two for the first bandwidth sample.
+	 */
+	if (w->first_ack) {
+		w->first_ack = 0;
+		w->snd_una = tcp_sk(sk)->snd_una;
+	}
+
+	/* Without an RTT value we just keep building the sample. */
+	if (!w->rtt)
+		return;
+
+	/* A window lasts at least TCP_WESTWOOD_RTT_MIN (50ms) even when
+	 * the RTT is shorter, since an estimation on small time
+	 * intervals is better to avoid. On a LAN the window will
+	 * reasonably always be TCP_WESTWOOD_RTT_MIN wide.
+	 */
+	min_span = max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN);
+	if (delta > min_span) {
+		westwood_filter(w, delta);
+		w->bk = 0;
+		w->rtt_win_sx = tcp_jiffies32;
+	}
+}
+
+static inline void update_rtt_min(struct westwood *w)
+{
+	u32 latest = w->rtt;
+
+	/* A pending reset restarts the minimum from the latest RTT. */
+	w->rtt_min = w->reset_rtt_min ? latest : min(latest, w->rtt_min);
+	w->reset_rtt_min = 0;
+}
+
+/*
+ * @westwood_fast_bw
+ * It is called when we are in fast path. In particular it is called when
+ * header prediction is successful. In such case in fact update is
+ * straight forward and doesn't need any particular care.
+ */
+static inline void westwood_fast_bw(struct sock *sk)
+{
+	struct westwood *w = inet_csk_ca(sk);
+	u32 una = tcp_sk(sk)->snd_una;
+
+	westwood_update_window(sk);
+	/* Everything newly acked counts towards the current sample. */
+	w->bk += una - w->snd_una;
+	w->snd_una = una;
+	update_rtt_min(w);
+}
+
+/*
+ * @westwood_acked_count
+ * This function evaluates cumul_ack for evaluating bk in case of
+ * delayed or partial acks.
+ */
+static inline u32 westwood_acked_count(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct westwood *w = inet_csk_ca(sk);
+	u32 mss = tp->mss_cache;
+	u32 acked = tp->snd_una - w->snd_una;
+
+	if (!acked) {
+		/* tp->snd_una did not move, so this is a dupack: credit
+		 * one mss now and remember it in w->accounted.
+		 */
+		w->accounted += mss;
+		acked = mss;
+	}
+
+	if (acked > mss) {
+		/* Partial or delayed ack: discount bytes already credited. */
+		if (w->accounted < acked) {
+			acked -= w->accounted;
+			w->accounted = 0;
+		} else {
+			w->accounted -= acked;
+			acked = mss;
+		}
+	}
+
+	w->cumul_ack = acked;
+	w->snd_una = tp->snd_una;
+	return acked;
+}
+
+/*
+ * TCP Westwood
+ * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it
+ * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
+ * so avoids ever returning 0.
+ */
+static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
+{
+	const struct westwood *w = inet_csk_ca(sk);
+	u32 pkts = (w->bw_est * w->rtt_min) / tcp_sk(sk)->mss_cache;
+
+	return max_t(u32, pkts, 2);	/* never less than 2 */
+}
+
+static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
+{
+	struct westwood *w;
+
+	/* Not the slow path: the simple fast-path accounting is enough. */
+	if (!(ack_flags & CA_ACK_SLOWPATH)) {
+		westwood_fast_bw(sk);
+		return;
+	}
+	w = inet_csk_ca(sk);
+	westwood_update_window(sk);
+	w->bk += westwood_acked_count(sk);
+	update_rtt_min(w);
+}
+
+static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct westwood *w = inet_csk_ca(sk);
+
+	if (event == CA_EVENT_COMPLETE_CWR) {
+		/* Set both ssthresh and cwnd to the bandwidth
+		 * estimate times the minimum RTT.
+		 */
+		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+		tcp_snd_cwnd_set(tp, tp->snd_ssthresh);
+	} else if (event == CA_EVENT_LOSS) {
+		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+		/* Restart RTT_min from the RTT seen when the
+		 * next ack arrives.
+		 */
+		w->reset_rtt_min = 1;
+	}
+	/* all other events: don't care */
+}
+
+/* Extract Westwood info for the TCP socket info provided via netlink. */
+static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr,
+				union tcp_cc_info *info)
+{
+	const struct westwood *ca = inet_csk_ca(sk);
+
+	/* Only INET_DIAG_VEGASINFO requests are answered. */
+	if (!(ext & (1 << (INET_DIAG_VEGASINFO - 1))))
+		return 0;
+
+	info->vegas.tcpv_enabled = 1;
+	info->vegas.tcpv_rttcnt = 0;
+	info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt);
+	info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
+	*attr = INET_DIAG_VEGASINFO;
+	return sizeof(struct tcpvegas_info);
+}
+
+static struct tcp_congestion_ops tcp_westwood __read_mostly = {
+	.name		= "westwood",
+	.owner		= THIS_MODULE,
+
+	.init		= tcp_westwood_init,
+	.cwnd_event	= tcp_westwood_event,
+	.in_ack_event	= tcp_westwood_ack,
+	.pkts_acked	= tcp_westwood_pkts_acked,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.ssthresh	= tcp_reno_ssthresh,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+	.get_info	= tcp_westwood_info
+};
+
+static int __init tcp_westwood_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE);	/* build-time size check */
+	return tcp_register_congestion_control(&tcp_westwood);
+}
+
+static void __exit tcp_westwood_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_westwood);	/* undo tcp_westwood_register() */
+}
+
+module_init(tcp_westwood_register);
+module_exit(tcp_westwood_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Westwood+");
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
new file mode 100644
index 0000000000..18b07ff5d2
--- /dev/null
+++ b/net/ipv4/tcp_yeah.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ *   YeAH TCP
+ *
+ * For further details look at:
+ *   https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+ *
+ */
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+#include "tcp_vegas.h"
+
+#define TCP_YEAH_ALPHA       80 /* number of packets queued at the bottleneck */
+#define TCP_YEAH_GAMMA        1 /* fraction of queue to be removed per rtt */
+#define TCP_YEAH_DELTA        3 /* log minimum fraction of cwnd to be removed on loss */
+#define TCP_YEAH_EPSILON      1 /* log maximum fraction to be removed on early decongestion */
+#define TCP_YEAH_PHY          8 /* maximum delta from base */
+#define TCP_YEAH_RHO         16 /* minimum number of consecutive rtt to consider competition on loss */
+#define TCP_YEAH_ZETA        50 /* minimum number of state switches to reset reno_count */
+
+#define TCP_SCALABLE_AI_CNT	 100U
+
+/* YeAH per-connection state, accessed through inet_csk_ca(sk) */
+struct yeah {
+	struct vegas vegas;	/* must be first */
+
+	/* YeAH */
+	u32 lastQ;		/* queue estimate from the latest Vegas calculation */
+	u32 doing_reno_now;	/* 0: Scalable increase; else capped count of Reno rounds */
+
+	u32 reno_count;		/* lower bound applied to cwnd after a queue reduction */
+	u32 fast_count;		/* rounds without congestion; reset above TCP_YEAH_ZETA */
+};
+
+static void tcp_yeah_init(struct sock *sk)
+{
+	struct yeah *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* The embedded Vegas state is set up by tcp_vegas_init(). */
+	tcp_vegas_init(sk);
+
+	ca->lastQ = 0;
+	ca->reno_count = 2;
+	ca->doing_reno_now = 0;
+
+	/* Limit the clamp so that the MD arithmetic works (pedantic). */
+	if (tp->snd_cwnd_clamp > 0xffffffff / 128)
+		tp->snd_cwnd_clamp = 0xffffffff / 128;
+}
+
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct yeah *yeah = inet_csk_ca(sk);
+
+	if (!tcp_is_cwnd_limited(sk))
+		return;
+
+	if (tcp_in_slow_start(tp)) {
+		acked = tcp_slow_start(tp, acked);
+		if (!acked)
+			goto do_vegas;
+	}
+
+	if (!yeah->doing_reno_now) {
+		/* Scalable */
+		tcp_cong_avoid_ai(tp, min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT),
+				  acked);
+	} else {
+		/* Reno */
+		tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
+	}
+
+	/* The key players are vegas.beg_snd_una and vegas.beg_snd_nxt.
+	 *
+	 * These are so named because they represent the approximate values
+	 * of snd_una and snd_nxt at the beginning of the current RTT. More
+	 * precisely, they represent the amount of data sent during the RTT.
+	 * At the end of the RTT, when we receive an ACK for vegas.beg_snd_nxt,
+	 * we will calculate that (vegas.beg_snd_nxt - vegas.beg_snd_una)
+	 * outstanding bytes of data have been ACKed during the course of the
+	 * RTT, giving an "actual" rate of:
+	 *
+	 *     (vegas.beg_snd_nxt - vegas.beg_snd_una) / (rtt duration)
+	 *
+	 * Unfortunately, vegas.beg_snd_una is not exactly equal to snd_una,
+	 * because delayed ACKs can cover more than one segment, so they
+	 * don't line up nicely with the boundaries of RTTs.
+	 *
+	 * Another unfortunate fact of life is that delayed ACKs delay the
+	 * advance of the left edge of our send window, so that the number
+	 * of bytes we send in an RTT is often less than our cwnd will allow.
+	 * So we keep track of our cwnd separately, in vegas.beg_snd_cwnd.
+	 */
+do_vegas:
+	if (after(ack, yeah->vegas.beg_snd_nxt)) {
+		/* We do the Vegas calculations only if we got enough RTT
+		 * samples that we can be reasonably sure that we got
+		 * at least one RTT sample that wasn't from a delayed ACK.
+		 * If we only had 2 samples total,
+		 * then that means we're getting only 1 ACK per RTT, which
+		 * means they're almost certainly delayed ACKs.
+		 * If  we have 3 samples, we should be OK.
+		 */
+
+		if (yeah->vegas.cntRTT > 2) {
+			u32 rtt, queue;
+			u64 bw;
+
+			/* We have enough RTT samples, so, using the Vegas
+			 * algorithm, we determine if we should increase or
+			 * decrease cwnd, and by how much.
+			 */
+
+			/* Pluck out the RTT we are using for the Vegas
+			 * calculations. This is the min RTT seen during the
+			 * last RTT. Taking the min filters out the effects
+			 * of delayed ACKs, at the cost of noticing congestion
+			 * a bit later.
+			 */
+			rtt = yeah->vegas.minRTT;
+
+			/* Compute excess number of packets above bandwidth
+			 * Avoid doing full 64 bit divide.
+			 */
+			bw = tcp_snd_cwnd(tp);
+			bw *= rtt - yeah->vegas.baseRTT;
+			do_div(bw, rtt);
+			queue = bw;
+
+			if (queue > TCP_YEAH_ALPHA ||
+			    rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
+				if (queue > TCP_YEAH_ALPHA &&
+				    tcp_snd_cwnd(tp) > yeah->reno_count) {
+					u32 reduction = min(queue / TCP_YEAH_GAMMA ,
+							    tcp_snd_cwnd(tp) >> TCP_YEAH_EPSILON);
+
+					tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - reduction);
+
+					tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp),
+								 yeah->reno_count));
+
+					tp->snd_ssthresh = tcp_snd_cwnd(tp);
+				}
+
+				if (yeah->reno_count <= 2)
+					yeah->reno_count = max(tcp_snd_cwnd(tp)>>1, 2U);
+				else
+					yeah->reno_count++;
+
+				yeah->doing_reno_now = min(yeah->doing_reno_now + 1,
+							   0xffffffU);
+			} else {
+				yeah->fast_count++;
+
+				if (yeah->fast_count > TCP_YEAH_ZETA) {
+					yeah->reno_count = 2;
+					yeah->fast_count = 0;
+				}
+
+				yeah->doing_reno_now = 0;
+			}
+
+			yeah->lastQ = queue;
+		}
+
+		/* Save the extent of the current window so we can use this
+		 * at the end of the next RTT.
+		 */
+		yeah->vegas.beg_snd_una  = yeah->vegas.beg_snd_nxt;
+		yeah->vegas.beg_snd_nxt  = tp->snd_nxt;
+		yeah->vegas.beg_snd_cwnd = tcp_snd_cwnd(tp);
+
+		/* Wipe the slate clean for the next RTT. */
+		yeah->vegas.cntRTT = 0;
+		yeah->vegas.minRTT = 0x7fffffff;
+	}
+}
+
+static u32 tcp_yeah_ssthresh(struct sock *sk)
+{
+	struct yeah *ca = inet_csk_ca(sk);
+	u32 cwnd = tcp_snd_cwnd(tcp_sk(sk));
+	u32 half = max(cwnd >> 1, 2U);
+	u32 reduction = half;
+
+	if (ca->doing_reno_now < TCP_YEAH_RHO) {
+		/* Remove the last queue estimate, but at most @half and
+		 * at least cwnd >> TCP_YEAH_DELTA.
+		 */
+		reduction = min(ca->lastQ, half);
+		reduction = max(reduction, cwnd >> TCP_YEAH_DELTA);
+	}
+
+	ca->fast_count = 0;
+	ca->reno_count = max(ca->reno_count >> 1, 2U);
+
+	return max_t(int, cwnd - reduction, 2);
+}
+
+static struct tcp_congestion_ops tcp_yeah __read_mostly = {
+	.name		= "yeah",
+	.owner		= THIS_MODULE,
+
+	.init		= tcp_yeah_init,
+	.set_state	= tcp_vegas_state,
+	.cwnd_event	= tcp_vegas_cwnd_event,
+	.pkts_acked	= tcp_vegas_pkts_acked,
+	.cong_avoid	= tcp_yeah_cong_avoid,
+	.ssthresh	= tcp_yeah_ssthresh,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+	.get_info	= tcp_vegas_get_info,
+};
+
+static int __init tcp_yeah_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE);
+	/* Report a failed registration instead of claiming success. */
+	return tcp_register_congestion_control(&tcp_yeah);
+}
+
+static void __exit tcp_yeah_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_yeah);	/* undo tcp_yeah_register() */
+}
+
+module_init(tcp_yeah_register);
+module_exit(tcp_yeah_unregister);
+
+MODULE_AUTHOR("Angelo P. Castellani");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("YeAH TCP");
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
new file mode 100644
index 0000000000..5048c47c79
--- /dev/null
+++ b/net/ipv4/tunnel4.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* tunnel4.c: Generic IP tunnel transformer.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/mpls.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;	/* AF_INET */
+static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;	/* AF_INET6 */
+static struct xfrm_tunnel __rcu *tunnelmpls4_handlers __read_mostly;	/* other */
+static DEFINE_MUTEX(tunnel4_mutex);	/* held while a chain is modified */
+
+static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
+{
+	if (family == AF_INET)		/* IPv4 payload chain */
+		return &tunnel4_handlers;
+	/* AF_INET6 selects the 6in4 chain; anything else maps to MPLS. */
+	return family == AF_INET6 ? &tunnel64_handlers : &tunnelmpls4_handlers;
+}
+
+int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
+{
+	struct xfrm_tunnel __rcu **pprev;
+	struct xfrm_tunnel *t;
+
+	int ret = -EEXIST;	/* result if the priority is already taken */
+	int priority = handler->priority;
+
+	mutex_lock(&tunnel4_mutex);
+
+	for (pprev = fam_handlers(family);
+	     (t = rcu_dereference_protected(*pprev,
+			lockdep_is_held(&tunnel4_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t->priority > priority)
+			break;		/* insert before t: ascending order */
+		if (t->priority == priority)
+			goto err;	/* one handler per priority value */
+	}
+
+	handler->next = *pprev;	/* link the tail before publishing */
+	rcu_assign_pointer(*pprev, handler);
+
+	ret = 0;
+
+err:
+	mutex_unlock(&tunnel4_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL(xfrm4_tunnel_register);
+
+int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
+{
+	struct xfrm_tunnel __rcu **pprev;
+	struct xfrm_tunnel *t;
+	int ret = -ENOENT;	/* result if handler is not on the chain */
+
+	mutex_lock(&tunnel4_mutex);
+
+	for (pprev = fam_handlers(family);
+	     (t = rcu_dereference_protected(*pprev,
+			lockdep_is_held(&tunnel4_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t == handler) {
+			*pprev = handler->next;	/* unlink from the chain */
+			ret = 0;
+			break;
+		}
+	}
+
+	mutex_unlock(&tunnel4_mutex);
+
+	synchronize_net();	/* runs on the -ENOENT path as well */
+
+	return ret;
+}
+EXPORT_SYMBOL(xfrm4_tunnel_deregister);
+
+#define for_each_tunnel_rcu(head, handler) /* RCU walk of a handler chain */ \
+	for (handler = rcu_dereference(head);		\
+	     handler != NULL;				\
+	     handler = rcu_dereference(handler->next))	\
+
+static int tunnel4_rcv(struct sk_buff *skb)
+{
+	struct xfrm_tunnel *t;
+
+	if (pskb_may_pull(skb, sizeof(struct iphdr))) {
+		/* The first handler returning 0 has taken the packet. */
+		for_each_tunnel_rcu(tunnel4_handlers, t) {
+			if (t->handler(skb) == 0)
+				return 0;
+		}
+		/* Nobody claimed it: report port unreachable, then drop. */
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+	}
+
+	kfree_skb(skb);
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err)
+{
+	struct xfrm_tunnel __rcu *chain = tunnel64_handlers;
+	struct xfrm_tunnel *t;
+	int rc;
+
+	if (proto == IPPROTO_IPIP)
+		chain = tunnel4_handlers;
+
+	for_each_tunnel_rcu(chain, t) {
+		if (!t->cb_handler)
+			continue;	/* this handler has no callback */
+		rc = t->cb_handler(skb, err);
+		if (rc <= 0)
+			return rc;	/* zero or negative ends the walk */
+	}
+	return 0;
+}
+
+static const struct xfrm_input_afinfo tunnel4_input_afinfo = {
+	.family		=	AF_INET,
+	.is_ipip	=	true,
+	.callback	=	tunnel4_rcv_cb,	/* walks the handlers' cb_handler hooks */
+};
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int tunnel64_rcv(struct sk_buff *skb)
+{
+	struct xfrm_tunnel *t;
+
+	if (pskb_may_pull(skb, sizeof(struct ipv6hdr))) {
+		/* The first handler returning 0 has taken the packet. */
+		for_each_tunnel_rcu(tunnel64_handlers, t) {
+			if (t->handler(skb) == 0)
+				return 0;
+		}
+		/* Nobody claimed it: report port unreachable, then drop. */
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+	}
+
+	kfree_skb(skb);
+	return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_MPLS)
+static int tunnelmpls4_rcv(struct sk_buff *skb)
+{
+	struct xfrm_tunnel *t;
+
+	if (pskb_may_pull(skb, sizeof(struct mpls_label))) {
+		/* The first handler returning 0 has taken the packet. */
+		for_each_tunnel_rcu(tunnelmpls4_handlers, t) {
+			if (t->handler(skb) == 0)
+				return 0;
+		}
+		/* Nobody claimed it: report port unreachable, then drop. */
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+	}
+
+	kfree_skb(skb);
+	return 0;
+}
+#endif
+
+static int tunnel4_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm_tunnel *t;
+
+	for_each_tunnel_rcu(tunnel4_handlers, t) {	/* in chain order */
+		if (t->err_handler(skb, info) == 0)
+			return 0;	/* error claimed by this handler */
+	}
+	return -ENOENT;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int tunnel64_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm_tunnel *t;
+
+	for_each_tunnel_rcu(tunnel64_handlers, t) {	/* in chain order */
+		if (t->err_handler(skb, info) == 0)
+			return 0;	/* error claimed by this handler */
+	}
+	return -ENOENT;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_MPLS)
+static int tunnelmpls4_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm_tunnel *t;
+
+	for_each_tunnel_rcu(tunnelmpls4_handlers, t) {	/* in chain order */
+		if (t->err_handler(skb, info) == 0)
+			return 0;	/* error claimed by this handler */
+	}
+	return -ENOENT;
+}
+#endif
+
+static const struct net_protocol tunnel4_protocol = {
+	.handler	=	tunnel4_rcv,	/* added for IPPROTO_IPIP in tunnel4_init() */
+	.err_handler	=	tunnel4_err,
+	.no_policy	=	1,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static const struct net_protocol tunnel64_protocol = {
+	.handler	=	tunnel64_rcv,	/* added for IPPROTO_IPV6 in tunnel4_init() */
+	.err_handler	=	tunnel64_err,
+	.no_policy	=	1,
+};
+#endif
+
+#if IS_ENABLED(CONFIG_MPLS)
+static const struct net_protocol tunnelmpls4_protocol = {
+	.handler	=	tunnelmpls4_rcv,	/* added for IPPROTO_MPLS in tunnel4_init() */
+	.err_handler	=	tunnelmpls4_err,
+	.no_policy	=	1,
+};
+#endif
+
+static int __init tunnel4_init(void)
+{
+	if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP))
+		goto err;	/* nothing registered yet, nothing to undo */
+#if IS_ENABLED(CONFIG_IPV6)
+	if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) {
+		inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
+		goto err;
+	}
+#endif
+#if IS_ENABLED(CONFIG_MPLS)
+	if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) {
+		/* Roll back everything registered above. */
+		inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
+#if IS_ENABLED(CONFIG_IPV6)
+		inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6);
+#endif
+		goto err;
+	}
+#endif
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+	if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) {
+		/* Roll back everything registered above. */
+		inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
+#if IS_ENABLED(CONFIG_IPV6)
+		inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6);
+#endif
+#if IS_ENABLED(CONFIG_MPLS)
+		inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS);
+#endif
+		goto err;
+	}
+#endif
+	return 0;
+
+err:
+	pr_err("%s: can't add protocol\n", __func__);
+	return -EAGAIN;	/* same code whichever registration failed */
+}
+
+static void __exit tunnel4_fini(void)
+{
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+	if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo))
+		pr_err("tunnel4 close: can't remove input afinfo\n");
+#endif
+#if IS_ENABLED(CONFIG_MPLS)
+	if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS))
+		pr_err("tunnelmpls4 close: can't remove protocol\n");
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6))
+		pr_err("tunnel64 close: can't remove protocol\n");
+#endif
+	if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP))	/* added first, removed last */
+		pr_err("tunnel4 close: can't remove protocol\n");
+}
+
+module_init(tunnel4_init);
+module_exit(tunnel4_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
new file mode 100644
index 0000000000..7be4ddc80d
--- /dev/null
+++ b/net/ipv4/udp.c
@@ -0,0 +1,3634 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The User Datagram Protocol (UDP).
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ *		Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ * Fixes:
+ *		Alan Cox	:	verify_area() calls
+ *		Alan Cox	: 	stopped close while in use off icmp
+ *					messages. Not a fix but a botch that
+ *					for udp at least is 'valid'.
+ *		Alan Cox	:	Fixed icmp handling properly
+ *		Alan Cox	: 	Correct error for oversized datagrams
+ *		Alan Cox	:	Tidied select() semantics.
+ *		Alan Cox	:	udp_err() fixed properly, also now
+ *					select and read wake correctly on errors
+ *		Alan Cox	:	udp_send verify_area moved to avoid mem leak
+ *		Alan Cox	:	UDP can count its memory
+ *		Alan Cox	:	send to an unknown connection causes
+ *					an ECONNREFUSED off the icmp, but
+ *					does NOT close.
+ *		Alan Cox	:	Switched to new sk_buff handlers. No more backlog!
+ *		Alan Cox	:	Using generic datagram code. Even smaller and the PEEK
+ *					bug no longer crashes it.
+ *		Fred Van Kempen	: 	Net2e support for sk->broadcast.
+ *		Alan Cox	:	Uses skb_free_datagram
+ *		Alan Cox	:	Added get/set sockopt support.
+ *		Alan Cox	:	Broadcasting without option set returns EACCES.
+ *		Alan Cox	:	No wakeup calls. Instead we now use the callbacks.
+ *		Alan Cox	:	Use ip_tos and ip_ttl
+ *		Alan Cox	:	SNMP Mibs
+ *		Alan Cox	:	MSG_DONTROUTE, and 0.0.0.0 support.
+ *		Matt Dillon	:	UDP length checks.
+ *		Alan Cox	:	Smarter af_inet used properly.
+ *		Alan Cox	:	Use new kernel side addressing.
+ *		Alan Cox	:	Incorrect return on truncated datagram receive.
+ *	Arnt Gulbrandsen 	:	New udp_send and stuff
+ *		Alan Cox	:	Cache last socket
+ *		Alan Cox	:	Route cache
+ *		Jon Peatfield	:	Minor efficiency fix to sendto().
+ *		Mike Shaver	:	RFC1122 checks.
+ *		Alan Cox	:	Nonblocking error fix.
+ *	Willy Konynenberg	:	Transparent proxying support.
+ *		Mike McLagan	:	Routing by source
+ *		David S. Miller	:	New socket lookup architecture.
+ *					Last socket cache retained as it
+ *					does have a high hit rate.
+ *		Olaf Kirch	:	Don't linearise iovec on sendmsg.
+ *		Andi Kleen	:	Some cleanups, cache destination entry
+ *					for connect.
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
+ *		Melvin Smith	:	Check msg_name not msg_namelen in sendto(),
+ *					return ENOTCONN for unconnected sockets (POSIX)
+ *		Janos Farkas	:	don't deliver multi/broadcasts to a different
+ *					bound-to-device socket
+ *	Hirokazu Takahashi	:	HW checksumming for outgoing UDP
+ *					datagrams.
+ *	Hirokazu Takahashi	:	sendfile() on UDP works now.
+ *		Arnaldo C. Melo :	convert /proc/net/udp to seq_file
+ *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
+ *	Alexey Kuznetsov:		allow both IPv4 and IPv6 sockets to bind
+ *					a single port at the same time.
+ *	Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
+ *	James Chapman		:	Add L2TP encapsulation type.
+ */
+
+#define pr_fmt(fmt) "UDP: " fmt
+
+#include <linux/bpf-cgroup.h>
+#include <linux/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/memblock.h>
+#include <linux/highmem.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/tcp_states.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/net_namespace.h>
+#include <net/icmp.h>
+#include <net/inet_hashtables.h>
+#include <net/ip_tunnels.h>
+#include <net/route.h>
+#include <net/checksum.h>
+#include <net/gso.h>
+#include <net/xfrm.h>
+#include <trace/events/udp.h>
+#include <linux/static_key.h>
+#include <linux/btf_ids.h>
+#include <trace/events/skb.h>
+#include <net/busy_poll.h>
+#include "udp_impl.h"
+#include <net/sock_reuseport.h>
+#include <net/addrconf.h>
+#include <net/udp_tunnel.h>
+#include <net/gro.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6_stubs.h>
+#endif
+
+struct udp_table udp_table __read_mostly;
+EXPORT_SYMBOL(udp_table);
+
+long sysctl_udp_mem[3] __read_mostly;
+EXPORT_SYMBOL(sysctl_udp_mem);
+
+atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
+EXPORT_SYMBOL(udp_memory_allocated);
+DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
+EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
+
+#define MAX_UDP_PORTS 65536	/* 2^16 port numbers */
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET)	/* bitmap bits in udp_lib_get_port() */
+
+static struct udp_table *udp_get_table_prot(struct sock *sk)
+{
+	return sk->sk_prot->h.udp_table ? : sock_net(sk)->ipv4.udp_table; /* netns fallback */
+}
+
+static int udp_lib_lport_inuse(struct net *net, __u16 num,
+			       const struct udp_hslot *hslot,
+			       unsigned long *bitmap,	/* NULL: test @num only */
+			       struct sock *sk, unsigned int log)
+{
+	struct sock *sk2;
+	kuid_t uid = sock_i_uid(sk);
+
+	sk_for_each(sk2, &hslot->head) {
+		if (net_eq(sock_net(sk2), net) &&
+		    sk2 != sk &&
+		    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
+		    (!sk2->sk_reuse || !sk->sk_reuse) &&
+		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+		    inet_rcv_saddr_equal(sk, sk2, true)) {
+			if (sk2->sk_reuseport && sk->sk_reuseport &&
+			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
+			    uid_eq(uid, sock_i_uid(sk2))) {
+				if (!bitmap)
+					return 0;	/* same-uid reuseport: no conflict */
+			} else {
+				if (!bitmap)
+					return 1;	/* @num conflicts with sk2 */
+				__set_bit(udp_sk(sk2)->udp_port_hash >> log,
+					  bitmap);	/* bitmap mode: record and go on */
+			}
+		}
+	}
+	return 0;	/* no conflict found, or conflicts recorded in @bitmap */
+}
+
+/*
+ * Note: the caller still holds the spinlock of the primary hash chain, so
+ * no other writer can insert/delete a socket with local_port == num.
+ */
+static int udp_lib_lport_inuse2(struct net *net, __u16 num,
+				struct udp_hslot *hslot2,
+				struct sock *sk)
+{
+	struct sock *sk2;
+	kuid_t uid = sock_i_uid(sk);
+	int res = 0;	/* 1 if @num conflicts with a socket on @hslot2 */
+
+	spin_lock(&hslot2->lock);
+	udp_portaddr_for_each_entry(sk2, &hslot2->head) {
+		if (net_eq(sock_net(sk2), net) &&
+		    sk2 != sk &&
+		    (udp_sk(sk2)->udp_port_hash == num) &&
+		    (!sk2->sk_reuse || !sk->sk_reuse) &&
+		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+		    inet_rcv_saddr_equal(sk, sk2, true)) {
+			if (sk2->sk_reuseport && sk->sk_reuseport &&
+			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
+			    uid_eq(uid, sock_i_uid(sk2))) {
+				res = 0;	/* same-uid reuseport sharing */
+			} else {
+				res = 1;
+			}
+			break;	/* the first matching socket decides */
+		}
+	}
+	spin_unlock(&hslot2->lock);
+	return res;
+}
+
+static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
+{
+	struct net *net = sock_net(sk);
+	kuid_t uid = sock_i_uid(sk);
+	struct sock *sk2;
+
+	sk_for_each(sk2, &hslot->head) {
+		if (net_eq(sock_net(sk2), net) &&
+		    sk2 != sk &&
+		    sk2->sk_family == sk->sk_family &&
+		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+		    (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
+		    (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+		    inet_rcv_saddr_equal(sk, sk2, false)) {
+			return reuseport_add_sock(sk, sk2,	/* first match wins */
+						  inet_rcv_saddr_any(sk));
+		}
+	}
+
+	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));	/* no match on the chain */
+}
+
+/**
+ *  udp_lib_get_port  -  UDP/-Lite port lookup for IPv4 and IPv6
+ *
+ *  @sk:          socket struct in question
+ *  @snum:        port number to look up
+ *  @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
+ *                   with NULL address
+ */
+int udp_lib_get_port(struct sock *sk, unsigned short snum,
+		     unsigned int hash2_nulladdr)
+{
+	struct udp_table *udptable = udp_get_table_prot(sk);
+	struct udp_hslot *hslot, *hslot2;
+	struct net *net = sock_net(sk);
+	int error = -EADDRINUSE;	/* result on every failure path */
+
+	if (!snum) {	/* no port requested: search for a free one */
+		DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
+		unsigned short first, last;
+		int low, high, remaining;
+		unsigned int rand;
+
+		inet_sk_get_local_port_range(sk, &low, &high);
+		remaining = (high - low) + 1;
+
+		rand = get_random_u32();
+		first = reciprocal_scale(rand, remaining) + low;
+		/*
+		 * force rand to be an odd multiple of UDP_HTABLE_SIZE
+		 */
+		rand = (rand | 1) * (udptable->mask + 1);
+		last = first + udptable->mask + 1;
+		do {
+			hslot = udp_hashslot(udptable, net, first);
+			bitmap_zero(bitmap, PORTS_PER_CHAIN);
+			spin_lock_bh(&hslot->lock);
+			udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
+					    udptable->log);	/* mark busy ports */
+
+			snum = first;
+			/*
+			 * Iterate on all possible values of snum for this hash.
+			 * Using steps of an odd multiple of UDP_HTABLE_SIZE
+			 * give us randomization and full range coverage.
+			 */
+			do {
+				if (low <= snum && snum <= high &&
+				    !test_bit(snum >> udptable->log, bitmap) &&
+				    !inet_is_local_reserved_port(net, snum))
+					goto found;	/* hslot->lock still held */
+				snum += rand;
+			} while (snum != first);
+			spin_unlock_bh(&hslot->lock);
+			cond_resched();
+		} while (++first != last);
+		goto fail;	/* no free port; no lock is held here */
+	} else {
+		hslot = udp_hashslot(udptable, net, snum);
+		spin_lock_bh(&hslot->lock);
+		if (hslot->count > 10) {	/* long chain: consider the secondary hash */
+			int exist;
+			unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
+
+			slot2          &= udptable->mask;
+			hash2_nulladdr &= udptable->mask;
+
+			hslot2 = udp_hashslot2(udptable, slot2);
+			if (hslot->count < hslot2->count)
+				goto scan_primary_hash;	/* primary chain is shorter */
+
+			exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
+			if (!exist && (hash2_nulladdr != slot2)) {
+				hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
+				exist = udp_lib_lport_inuse2(net, snum, hslot2,
+							     sk);
+			}
+			if (exist)
+				goto fail_unlock;
+			else
+				goto found;
+		}
+scan_primary_hash:
+		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
+			goto fail_unlock;
+	}
+found:
+	inet_sk(sk)->inet_num = snum;
+	udp_sk(sk)->udp_port_hash = snum;
+	udp_sk(sk)->udp_portaddr_hash ^= snum;	/* fold the port into the hash */
+	if (sk_unhashed(sk)) {
+		if (sk->sk_reuseport &&
+		    udp_reuseport_add_sock(sk, hslot)) {
+			inet_sk(sk)->inet_num = 0;
+			udp_sk(sk)->udp_port_hash = 0;
+			udp_sk(sk)->udp_portaddr_hash ^= snum;	/* undo the XOR above */
+			goto fail_unlock;
+		}
+
+		sk_add_node_rcu(sk, &hslot->head);
+		hslot->count++;
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+		spin_lock(&hslot2->lock);
+		if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
+		    sk->sk_family == AF_INET6)
+			hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
+					   &hslot2->head);
+		else
+			hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+					   &hslot2->head);
+		hslot2->count++;
+		spin_unlock(&hslot2->lock);
+	}
+	sock_set_flag(sk, SOCK_RCU_FREE);
+	error = 0;
+fail_unlock:
+	spin_unlock_bh(&hslot->lock);
+fail:
+	return error;
+}
+EXPORT_SYMBOL(udp_lib_get_port);
+
+int udp_v4_get_port(struct sock *sk, unsigned short snum)
+{
+	struct net *net = sock_net(sk);
+	unsigned int wildcard_hash;
+
+	wildcard_hash = ipv4_portaddr_hash(net, htonl(INADDR_ANY), snum);
+	/* Seed the secondary hash with the address part only (port 0). */
+	udp_sk(sk)->udp_portaddr_hash =
+		ipv4_portaddr_hash(net, inet_sk(sk)->inet_rcv_saddr, 0);
+	return udp_lib_get_port(sk, snum, wildcard_hash);
+}
+
+static int compute_score(struct sock *sk, struct net *net,
+			 __be32 saddr, __be16 sport,
+			 __be32 daddr, unsigned short hnum,
+			 int dif, int sdif)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int score = 1;
+
+	/* Wrong netns, wrong port or an IPv6-only socket never matches. */
+	if (!net_eq(sock_net(sk), net))
+		return -1;
+	if (udp_sk(sk)->udp_port_hash != hnum || ipv6_only_sock(sk))
+		return -1;
+	/* The bound local address has to equal the looked-up one. */
+	if (sk->sk_rcv_saddr != daddr)
+		return -1;
+
+	if (sk->sk_family == PF_INET)
+		score = 2;	/* PF_INET sockets start one point higher */
+
+	/* A set remote address or port must match, and adds weight. */
+	if (inet->inet_daddr) {
+		if (inet->inet_daddr != saddr)
+			return -1;
+		score += 4;
+	}
+
+	if (inet->inet_dport) {
+		if (inet->inet_dport != sport)
+			return -1;
+		score += 4;
+	}
+
+	/* Same for the bound device: it must match, and adds weight if set. */
+	if (!udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+		return -1;
+	if (sk->sk_bound_dev_if)
+		score += 4;
+
+	if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
+		score++;
+	return score;
+}
+
+INDIRECT_CALLABLE_SCOPE
+u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
+		const __be32 faddr, const __be16 fport)
+{
+	static u32 udp_ehash_secret __read_mostly;	/* shared by all callers */
+
+	net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
+
+	return __inet_ehashfn(laddr, lport, faddr, fport,
+			      udp_ehash_secret + net_hash_mix(net));	/* netns-specific seed */
+}
+
+/* called with rcu_read_lock() */
+static struct sock *udp4_lib_lookup2(struct net *net,
+				     __be32 saddr, __be16 sport,
+				     __be32 daddr, unsigned int hnum,
+				     int dif, int sdif,
+				     struct udp_hslot *hslot2,
+				     struct sk_buff *skb)
+{
+	struct sock *sk, *result;
+	int score, badness;
+
+	result = NULL;
+	badness = 0;	/* best score seen so far */
+	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+		score = compute_score(sk, net, saddr, sport,
+				      daddr, hnum, dif, sdif);
+		if (score > badness) {
+			badness = score;
+
+			if (sk->sk_state == TCP_ESTABLISHED) {
+				result = sk;	/* connected: no reuseport selection */
+				continue;
+			}
+
+			result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
+						       saddr, sport, daddr, hnum, udp_ehashfn);
+			if (!result) {
+				result = sk;	/* nothing selected: keep sk itself */
+				continue;
+			}
+
+			/* Fall back to scoring if group has connections */
+			if (!reuseport_has_conns(sk))
+				return result;
+
+			/* Reuseport logic returned an error, keep original score. */
+			if (IS_ERR(result))
+				continue;
+
+			badness = compute_score(result, net, saddr, sport,
+						daddr, hnum, dif, sdif);
+
+		}
+	}
+	return result;	/* may be NULL or an ERR_PTR; callers check */
+}
+
+/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
+ * harder than this. -DaveM
+ */
+struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
+		__be16 sport, __be32 daddr, __be16 dport, int dif,
+		int sdif, struct udp_table *udptable, struct sk_buff *skb)
+{
+	unsigned short hnum = ntohs(dport);	/* port in host byte order */
+	unsigned int hash2, slot2;
+	struct udp_hslot *hslot2;
+	struct sock *result, *sk;
+
+	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+	slot2 = hash2 & udptable->mask;
+	hslot2 = &udptable->hash2[slot2];
+
+	/* Lookup connected or non-wildcard socket */
+	result = udp4_lib_lookup2(net, saddr, sport,
+				  daddr, hnum, dif, sdif,
+				  hslot2, skb);
+	if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
+		goto done;	/* connected socket found */
+
+	/* Lookup redirect from BPF */
+	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
+	    udptable == net->ipv4.udp_table) {
+		sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr),
+					       saddr, sport, daddr, hnum, dif,
+					       udp_ehashfn);
+		if (sk) {
+			result = sk;
+			goto done;
+		}
+	}
+
+	/* Got non-wildcard socket or error on first lookup */
+	if (result)
+		goto done;
+
+	/* Lookup wildcard sockets */
+	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+	slot2 = hash2 & udptable->mask;
+	hslot2 = &udptable->hash2[slot2];
+
+	result = udp4_lib_lookup2(net, saddr, sport,
+				  htonl(INADDR_ANY), hnum, dif, sdif,
+				  hslot2, skb);
+done:
+	if (IS_ERR(result))
+		return NULL;	/* errors are reported as "no socket" */
+	return result;
+}
+EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
+
+static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
+						 __be16 sport, __be16 dport,
+						 struct udp_table *udptable)
+{
+	const struct iphdr *iph = ip_hdr(skb);	/* addresses come from the IP header */
+
+	return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
+				 iph->daddr, dport, inet_iif(skb),
+				 inet_sdif(skb), udptable, skb);
+}
+
+struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
+				 __be16 sport, __be16 dport)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+	int iif, sdif;
+
+	inet_get_iif_sdif(skb, &iif, &sdif);
+
+	return __udp4_lib_lookup(net, iph->saddr, sport,
+				 iph->daddr, dport, iif,
+				 sdif, net->ipv4.udp_table, NULL);	/* netns table, no skb passed on */
+}
+
+/* Caller must hold rcu_read_lock().
+ * A socket returned from here has had its refcount incremented.
+ */
+#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
+struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
+			     __be32 daddr, __be16 dport, int dif)
+{
+	struct sock *found = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
+					       dif, 0, net->ipv4.udp_table,
+					       NULL);
+
+	if (!found || !refcount_inc_not_zero(&found->sk_refcnt))
+		return NULL;	/* no match, or refcount already zero */
+	return found;
+}
+EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+#endif
+
+static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk,
+				       __be16 loc_port, __be32 loc_addr,
+				       __be16 rmt_port, __be32 rmt_addr,
+				       int dif, int sdif, unsigned short hnum)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+
+	if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum)
+		return false;
+	/* Connected and bound fields only constrain when they are set. */
+	if ((inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
+	    (inet->inet_dport && inet->inet_dport != rmt_port) ||
+	    (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr))
+		return false;
+	if (ipv6_only_sock(sk) ||
+	    !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+		return false;
+	return ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif) != 0;
+}
+
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);	/* off until udp_encap_enable() */
+void udp_encap_enable(void)
+{
+	static_branch_inc(&udp_encap_needed_key);	/* counted: pair with disable */
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
+void udp_encap_disable(void)
+{
+	static_branch_dec(&udp_encap_needed_key);	/* drops one enable count */
+}
+EXPORT_SYMBOL(udp_encap_disable);
+
+/* Used for tunnels whose destination port is arbitrary: skip the socket
+ * lookup and let each registered encapsulation try to claim the error.
+ */
+static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
+{
+	const struct ip_tunnel_encap_ops *ops;
+	int slot;
+
+	for (slot = 0; slot < MAX_IPTUN_ENCAP_OPS; slot++) {
+		ops = rcu_dereference(iptun_encaps[slot]);
+		/* Skip empty slots and encaps without an error hook. */
+		if (!ops || !ops->err_handler)
+			continue;
+		/* A zero return means this encapsulation took the error. */
+		if (ops->err_handler(skb, info) == 0)
+			return 0;
+	}
+
+	/* No encapsulation recognised the packet. */
+	return -ENOENT;
+}
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
+ * lwtunnels might actually break this assumption by being configured with
+ * different destination ports on endpoints, in this case we won't be able to
+ * trace ICMP messages back to them.
+ *
+ * If this doesn't match any socket, probe tunnels with arbitrary destination
+ * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
+ * we've sent packets to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
+ *
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
+ */
+static struct sock *__udp4_lib_err_encap(struct net *net,
+					 const struct iphdr *iph,
+					 struct udphdr *uh,
+					 struct udp_table *udptable,
+					 struct sock *sk,
+					 struct sk_buff *skb, u32 info)
+{
+	int (*lookup)(struct sock *sk, struct sk_buff *skb);
+	int network_offset, transport_offset;	/* saved to restore on exit */
+	struct udp_sock *up;
+
+	network_offset = skb_network_offset(skb);
+	transport_offset = skb_transport_offset(skb);
+
+	/* Network header needs to point to the outer IPv4 header inside ICMP */
+	skb_reset_network_header(skb);
+
+	/* Transport header needs to point to the UDP header */
+	skb_set_transport_header(skb, iph->ihl << 2);
+
+	if (sk) {	/* caller already found a socket */
+		up = udp_sk(sk);
+
+		lookup = READ_ONCE(up->encap_err_lookup);
+		if (lookup && lookup(sk, skb))
+			sk = NULL;	/* hook returned non-zero: drop the socket */
+
+		goto out;
+	}
+
+	sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
+			       iph->saddr, uh->dest, skb->dev->ifindex, 0,
+			       udptable, NULL);
+	if (sk) {
+		up = udp_sk(sk);
+
+		lookup = READ_ONCE(up->encap_err_lookup);
+		if (!lookup || lookup(sk, skb))
+			sk = NULL;	/* here a missing hook also rejects the socket */
+	}
+
+out:
+	if (!sk)
+		sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));
+
+	skb_set_transport_header(skb, transport_offset);
+	skb_set_network_header(skb, network_offset);
+
+	return sk;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.  If err < 0 then the socket should
+ * be closed and the error returned to the user.  If err > 0
+ * it's just the icmp type << 8 | icmp code.
+ * Header points to the ip header of the error packet. We move
+ * on past this. Then (as it used to claim before adjustment)
+ * header points to the first 8 bytes of the udp header.  We need
+ * to find the appropriate port.
+ */
+
+int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
+{
+	struct inet_sock *inet;
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	bool tunnel = false;	/* true once the tunnel path is taken */
+	struct sock *sk;
+	int harderr;
+	int err;
+	struct net *net = dev_net(skb->dev);
+
+	sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
+			       iph->saddr, uh->source, skb->dev->ifindex,
+			       inet_sdif(skb), udptable, NULL);
+
+	if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) {
+		/* No socket for error: try tunnels before discarding */
+		if (static_branch_unlikely(&udp_encap_needed_key)) {
+			sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb,
+						  info);
+			if (!sk)
+				return 0;	/* an encap error handler returned 0 */
+		} else
+			sk = ERR_PTR(-ENOENT);
+
+		if (IS_ERR(sk)) {
+			__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+			return PTR_ERR(sk);
+		}
+
+		tunnel = true;
+	}
+
+	err = 0;
+	harderr = 0;
+	inet = inet_sk(sk);
+
+	switch (type) {
+	default:
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	case ICMP_SOURCE_QUENCH:
+		goto out;	/* nothing is reported to the socket */
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		harderr = 1;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+			ipv4_sk_update_pmtu(skb, sk, info);
+			if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+				err = EMSGSIZE;
+				harderr = 1;
+				break;
+			}
+			goto out;
+		}
+		err = EHOSTUNREACH;	/* default for codes outside the table */
+		if (code <= NR_ICMP_UNREACH) {
+			harderr = icmp_err_convert[code].fatal;
+			err = icmp_err_convert[code].errno;
+		}
+		break;
+	case ICMP_REDIRECT:
+		ipv4_sk_redirect(skb, sk);
+		goto out;
+	}
+
+	/*
+	 *      RFC1122: OK.  Passes ICMP errors back to application, as per
+	 *	4.1.3.3.
+	 */
+	if (tunnel) {
+		/* ...not for tunnels though: we don't have a sending socket */
+		if (udp_sk(sk)->encap_err_rcv)
+			udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info,
+						  (u8 *)(uh+1));
+		goto out;
+	}
+	if (!inet_test_bit(RECVERR, sk)) {
+		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+			goto out;	/* soft error or unconnected: not reported */
+	} else
+		ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
+
+	sk->sk_err = err;
+	sk_error_report(sk);
+out:
+	return 0;
+}
+
+int udp_err(struct sk_buff *skb, u32 info)
+{	/* use the UDP table of the netns that skb->dev belongs to */
+	return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table);
+}
+
+/*
+ * Discard any corked data and leave the corked state. Socket is locked.
+ */
+void udp_flush_pending_frames(struct sock *sk)
+{
+	struct udp_sock *up = udp_sk(sk);
+
+	if (!up->pending)
+		return;
+	up->len = 0;
+	WRITE_ONCE(up->pending, 0);
+	ip_flush_pending_frames(sk);
+}
+EXPORT_SYMBOL(udp_flush_pending_frames);
+
+/**
+ *	udp4_hwcsum  -  handle outgoing HW checksumming
+ *	@skb:	sk_buff containing the filled-in UDP header
+ *		(checksum field must be zeroed out)
+ *	@src:	source IP address
+ *	@dst:	destination IP address
+ */
+void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
+{
+	struct udphdr *uh = udp_hdr(skb);
+	int offset = skb_transport_offset(skb);
+	int len = skb->len - offset;	/* UDP header plus payload */
+	int hlen = len;	/* reduced below by each frag_list skb's length */
+	__wsum csum = 0;
+
+	if (!skb_has_frag_list(skb)) {
+		/*
+		 * Only one fragment on the socket.
+		 */
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = offsetof(struct udphdr, check);
+		uh->check = ~csum_tcpudp_magic(src, dst, len,
+					       IPPROTO_UDP, 0);
+	} else {
+		struct sk_buff *frags;
+
+		/*
+		 * HW-checksum won't work as there are two or more
+		 * fragments on the socket so that all csums of sk_buffs
+		 * should be together
+		 */
+		skb_walk_frags(skb, frags) {
+			csum = csum_add(csum, frags->csum);
+			hlen -= frags->len;
+		}
+
+		csum = skb_checksum(skb, offset, hlen, csum);
+		skb->ip_summed = CHECKSUM_NONE;	/* checksum is computed here */
+
+		uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
+		if (uh->check == 0)
+			uh->check = CSUM_MANGLED_0;	/* never leave the field 0 */
+	}
+}
+EXPORT_SYMBOL_GPL(udp4_hwcsum);
+
+/* Set the UDP checksum field of an IPv4 UDP packet from @saddr/@daddr/@len.
+ * Intended for simple cases such as a UDP tunnel; @nocheck stores 0 instead.
+ */
+void udp_set_csum(bool nocheck, struct sk_buff *skb,
+		  __be32 saddr, __be32 daddr, int len)
+{
+	struct udphdr *uh = udp_hdr(skb);
+
+	if (nocheck) {
+		uh->check = 0;
+	} else if (skb_is_gso(skb)) {
+		uh->check = ~udp_v4_check(len, saddr, daddr, 0);
+	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		uh->check = 0; /* cleared before lco_csum(skb) is evaluated */
+		uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
+		if (uh->check == 0)
+			uh->check = CSUM_MANGLED_0;
+	} else { /* any other ip_summed: mark PARTIAL, store ~pseudo-header sum */
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = offsetof(struct udphdr, check);
+		uh->check = ~udp_v4_check(len, saddr, daddr, 0);
+	}
+}
+EXPORT_SYMBOL(udp_set_csum);
+
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
+			struct inet_cork *cork)
+{ /* Fill in the UDP header and checksum of @skb, then ip_send_skb() it. */
+	struct sock *sk = skb->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	struct udphdr *uh;
+	int err;
+	int is_udplite = IS_UDPLITE(sk);
+	int offset = skb_transport_offset(skb);
+	int len = skb->len - offset;
+	int datalen = len - sizeof(*uh);
+	__wsum csum = 0;
+
+	/*
+	 * Create a UDP header: ports, length, and a zeroed checksum field
+	 */
+	uh = udp_hdr(skb);
+	uh->source = inet->inet_sport;
+	uh->dest = fl4->fl4_dport;
+	uh->len = htons(len);
+	uh->check = 0;
+
+	if (cork->gso_size) { /* GSO requested: validate, or free skb and fail */
+		const int hlen = skb_network_header_len(skb) +
+				 sizeof(struct udphdr);
+
+		if (hlen + cork->gso_size > cork->fragsize) {
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+		if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+		if (sk->sk_no_check_tx) {
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
+		    dst_xfrm(skb_dst(skb))) {
+			kfree_skb(skb);
+			return -EIO;
+		}
+
+		if (datalen > cork->gso_size) { /* more than one segment */
+			skb_shinfo(skb)->gso_size = cork->gso_size;
+			skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+			skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
+								 cork->gso_size);
+		}
+		goto csum_partial; /* jumps into the CHECKSUM_PARTIAL branch */
+	}
+
+	if (is_udplite)  				 /*     UDP-Lite      */
+		csum = udplite_csum(skb);
+
+	else if (sk->sk_no_check_tx) {			 /* UDP csum off */
+
+		skb->ip_summed = CHECKSUM_NONE;
+		goto send;
+
+	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+csum_partial:
+
+		udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
+		goto send;
+
+	} else
+		csum = udp_csum(skb);
+
+	/* add protocol-dependent pseudo-header; a 0 result is sent mangled */
+	uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
+				      sk->sk_protocol, csum);
+	if (uh->check == 0)
+		uh->check = CSUM_MANGLED_0;
+
+send:
+	err = ip_send_skb(sock_net(sk), skb);
+	if (err) {
+		if (err == -ENOBUFS &&
+		    !inet_test_bit(RECVERR, sk)) {
+			UDP_INC_STATS(sock_net(sk),
+				      UDP_MIB_SNDBUFERRORS, is_udplite);
+			err = 0; /* without RECVERR, -ENOBUFS is only counted */
+		}
+	} else
+		UDP_INC_STATS(sock_net(sk),
+			      UDP_MIB_OUTDATAGRAMS, is_udplite);
+	return err;
+}
+
+/*
+ * Push out all pending data as one UDP datagram. Socket is locked.
+ * The corked state (len, pending) is reset whether or not anything
+ * could be sent; the result of the send, if any, is returned.
+ */
+int udp_push_pending_frames(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct flowi4 *flow = &inet->cork.fl.u.ip4;
+	struct udp_sock *usk = udp_sk(sk);
+	struct sk_buff *skb;
+	int rc = 0;
+
+	/* ip_finish_skb() may hand back no skb; rc then stays 0 */
+	skb = ip_finish_skb(sk, flow);
+	if (skb)
+		rc = udp_send_skb(skb, flow, &inet->cork.base);
+
+	usk->len = 0;
+	WRITE_ONCE(usk->pending, 0);
+	return rc;
+}
+EXPORT_SYMBOL(udp_push_pending_frames);
+
+static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
+{
+	/* UDP_SEGMENT is the only type handled here, and its payload has to
+	 * be exactly one __u16.
+	 */
+	if (cmsg->cmsg_type != UDP_SEGMENT ||
+	    cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
+		return -EINVAL;
+
+	*gso_size = *(__u16 *)CMSG_DATA(cmsg);
+	return 0;
+}
+
+int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
+{
+	bool other_level = false;
+	struct cmsghdr *hdr;
+	int rc;
+
+	/* <0: error, 0: no cmsg outside SOL_UDP, 1: other levels were seen */
+	for_each_cmsghdr(hdr, msg) {
+		if (!CMSG_OK(msg, hdr))
+			return -EINVAL;
+
+		if (hdr->cmsg_level == SOL_UDP) {
+			rc = __udp_cmsg_send(hdr, gso_size);
+			if (rc)
+				return rc;
+		} else {
+			other_level = true;
+		}
+	}
+
+	return other_level;
+}
+EXPORT_SYMBOL_GPL(udp_cmsg_send);
+
+/*
+ * Send @len bytes from @msg as (part of) an IPv4 UDP datagram on @sk.
+ * Lockless unless UDP_CORK/MSG_MORE is in effect or frames are pending.
+ * Returns @len on success or a negative errno.
+ */
+int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct udp_sock *up = udp_sk(sk);
+	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+	struct flowi4 fl4_stack;
+	struct flowi4 *fl4;
+	int ulen = len;
+	struct ipcm_cookie ipc;
+	struct rtable *rt = NULL;
+	int free = 0;
+	int connected = 0;
+	__be32 daddr, faddr, saddr;
+	u8 tos, scope;
+	__be16 dport;
+	int err, is_udplite = IS_UDPLITE(sk);
+	int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
+	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+	struct sk_buff *skb;
+	struct ip_options_data opt_copy;
+
+	if (len > 0xFFFF)
+		return -EMSGSIZE;
+
+	/* Check the flags. */
+	if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
+		return -EOPNOTSUPP;
+
+	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
+
+	fl4 = &inet->cork.fl.u.ip4;
+	if (READ_ONCE(up->pending)) {
+		/*
+		 * There are pending frames.
+		 * The socket lock must be held while it's corked.
+		 */
+		lock_sock(sk);
+		if (likely(up->pending)) {
+			if (unlikely(up->pending != AF_INET)) {
+				release_sock(sk);
+				return -EINVAL;
+			}
+			goto do_append_data;
+		}
+		release_sock(sk);
+	}
+	ulen += sizeof(struct udphdr);
+
+	/* Get and verify the address. */
+	if (usin) {
+		if (msg->msg_namelen < sizeof(*usin))
+			return -EINVAL;
+		if (usin->sin_family != AF_INET) {
+			if (usin->sin_family != AF_UNSPEC)
+				return -EAFNOSUPPORT;
+		}
+
+		daddr = usin->sin_addr.s_addr;
+		dport = usin->sin_port;
+		if (dport == 0)
+			return -EINVAL;
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EDESTADDRREQ;
+		daddr = inet->inet_daddr;
+		dport = inet->inet_dport;
+		/* Open fast path for connected socket.
+		   Route will not be used, if at least one option is set.
+		 */
+		connected = 1;
+	}
+
+	ipcm_init_sk(&ipc, inet);
+	ipc.gso_size = READ_ONCE(up->gso_size);
+
+	if (msg->msg_controllen) {
+		err = udp_cmsg_send(sk, msg, &ipc.gso_size);
+		if (err > 0) {
+			/* only cmsgs left to ip_cmsg_send() force a new route */
+			err = ip_cmsg_send(sk, msg, &ipc,
+					   sk->sk_family == AF_INET6);
+			connected = 0;
+		}
+		if (unlikely(err < 0)) {
+			kfree(ipc.opt);
+			return err;
+		}
+		if (ipc.opt)
+			free = 1;
+	}
+	if (!ipc.opt) {
+		struct ip_options_rcu *inet_opt;
+
+		rcu_read_lock();
+		inet_opt = rcu_dereference(inet->inet_opt);
+		if (inet_opt) {
+			memcpy(&opt_copy, inet_opt,
+			       sizeof(*inet_opt) + inet_opt->opt.optlen);
+			ipc.opt = &opt_copy.opt;
+		}
+		rcu_read_unlock();
+	}
+
+	if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
+		err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
+					    (struct sockaddr *)usin,
+					    &msg->msg_namelen,
+					    &ipc.addr);
+		if (err)
+			goto out_free;
+		if (usin) {
+			if (usin->sin_port == 0) {
+				/* BPF program set invalid port. Reject it. */
+				err = -EINVAL;
+				goto out_free;
+			}
+			daddr = usin->sin_addr.s_addr;
+			dport = usin->sin_port;
+		}
+	}
+
+	saddr = ipc.addr;
+	ipc.addr = faddr = daddr;
+
+	if (ipc.opt && ipc.opt->opt.srr) {
+		if (!daddr) {
+			err = -EINVAL;
+			goto out_free;
+		}
+		faddr = ipc.opt->opt.faddr;
+		connected = 0;
+	}
+	tos = get_rttos(&ipc, inet);
+	scope = ip_sendmsg_scope(inet, &ipc, msg);
+	if (scope == RT_SCOPE_LINK)
+		connected = 0;
+
+	if (ipv4_is_multicast(daddr)) {
+		if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
+			ipc.oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+		connected = 0;
+	} else if (!ipc.oif) {
+		ipc.oif = inet->uc_index;
+	} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
+		/* oif is set, packet is to local broadcast and
+		 * uc_index is set. oif is most likely set
+		 * by sk_bound_dev_if. If uc_index != oif check if the
+		 * oif is an L3 master and uc_index is an L3 slave.
+		 * If so, we want to allow the send using the uc_index.
+		 */
+		if (ipc.oif != inet->uc_index &&
+		    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
+							      inet->uc_index)) {
+			ipc.oif = inet->uc_index;
+		}
+	}
+
+	if (connected) /* still on the fast path: try the route cached on sk */
+		rt = (struct rtable *)sk_dst_check(sk, 0);
+
+	if (!rt) {
+		struct net *net = sock_net(sk);
+		__u8 flow_flags = inet_sk_flowi_flags(sk);
+
+		fl4 = &fl4_stack;
+
+		flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope,
+				   sk->sk_protocol, flow_flags, faddr, saddr,
+				   dport, inet->inet_sport, sk->sk_uid);
+
+		security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
+		rt = ip_route_output_flow(net, fl4, sk);
+		if (IS_ERR(rt)) {
+			err = PTR_ERR(rt);
+			rt = NULL;
+			if (err == -ENETUNREACH)
+				IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+			goto out;
+		}
+
+		err = -EACCES;
+		if ((rt->rt_flags & RTCF_BROADCAST) &&
+		    !sock_flag(sk, SOCK_BROADCAST))
+			goto out;
+		if (connected)
+			sk_dst_set(sk, dst_clone(&rt->dst));
+	}
+
+	if (msg->msg_flags&MSG_CONFIRM)
+		goto do_confirm;
+back_from_confirm:
+
+	saddr = fl4->saddr;
+	if (!ipc.addr)
+		daddr = ipc.addr = fl4->daddr;
+
+	/* Lockless fast path for the non-corking case. */
+	if (!corkreq) {
+		struct inet_cork cork;
+
+		skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
+				  sizeof(struct udphdr), &ipc, &rt,
+				  &cork, msg->msg_flags);
+		err = PTR_ERR(skb);
+		if (!IS_ERR_OR_NULL(skb))
+			err = udp_send_skb(skb, fl4, &cork);
+		goto out;
+	}
+
+	lock_sock(sk);
+	if (unlikely(up->pending)) {
+		/* The socket is already corked while preparing it. */
+		/* ... which is an evident application bug. --ANK */
+		release_sock(sk);
+
+		net_dbg_ratelimited("socket already corked\n");
+		err = -EINVAL;
+		goto out;
+	}
+	/* Now cork the socket to pend data. */
+	fl4 = &inet->cork.fl.u.ip4;
+	fl4->daddr = daddr;
+	fl4->saddr = saddr;
+	fl4->fl4_dport = dport;
+	fl4->fl4_sport = inet->inet_sport;
+	WRITE_ONCE(up->pending, AF_INET);
+
+do_append_data:
+	up->len += ulen;
+	err = ip_append_data(sk, fl4, getfrag, msg, ulen,
+			     sizeof(struct udphdr), &ipc, &rt,
+			     corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
+	if (err)
+		udp_flush_pending_frames(sk);
+	else if (!corkreq)
+		err = udp_push_pending_frames(sk);
+	else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
+		WRITE_ONCE(up->pending, 0);
+	release_sock(sk);
+
+out:
+	ip_rt_put(rt);
+out_free:
+	if (free)
+		kfree(ipc.opt);
+	if (!err)
+		return len;
+	/*
+	 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
+	 * ENOBUFS might not be good (it's not tunable per se), but otherwise
+	 * we don't have a good statistic (IpOutDiscards but it can be too many
+	 * things).  We could add another new stat but at least for now that
+	 * seems like overkill.
+	 */
+	if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		UDP_INC_STATS(sock_net(sk),
+			      UDP_MIB_SNDBUFERRORS, is_udplite);
+	}
+	return err;
+
+do_confirm:
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(&rt->dst, &fl4->daddr);
+	if (!(msg->msg_flags&MSG_PROBE) || len)
+		goto back_from_confirm;
+	err = 0;
+	goto out;
+}
+EXPORT_SYMBOL(udp_sendmsg);
+
+void udp_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct udp_sock *usk = udp_sk(sk);
+
+	/* lockless pre-check; the same test is repeated under the lock */
+	if (READ_ONCE(usk->pending) && !udp_test_bit(CORK, sk)) {
+		lock_sock(sk);
+		if (usk->pending && !udp_test_bit(CORK, sk))
+			udp_push_pending_frames(sk);
+		release_sock(sk);
+	}
+}
+EXPORT_SYMBOL_GPL(udp_splice_eof);
+
+#define UDP_SKB_IS_STATELESS 0x80000000 /* flag bit or'ed into _tsize_state */
+
+/* all head states (dst, sk, nf conntrack) except skb extensions are
+ * cleared by udp_rcv().
+ *
+ * A secpath, if present, has to be preserved so that IP_CMSG_PASSSEC
+ * can still be processed at recvmsg() time.
+ *
+ * Any other extension is dropped here; returns true when none is left.
+ */
+static bool udp_try_make_stateless(struct sk_buff *skb)
+{
+	if (skb_has_extensions(skb)) {
+		/* keep everything as soon as a secpath is attached */
+		if (secpath_exists(skb))
+			return false;
+
+		skb_ext_reset(skb);
+	}
+
+	return true;
+}
+
+static void udp_set_dev_scratch(struct sk_buff *skb)
+{ /* Cache truesize (on 64bit also len/csum/linear bits) in the skb scratch. */
+	struct udp_dev_scratch *scratch = udp_skb_scratch(skb);
+
+	BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long));
+	scratch->_tsize_state = skb->truesize;
+#if BITS_PER_LONG == 64
+	scratch->len = skb->len;
+	scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
+	scratch->is_linear = !skb_is_nonlinear(skb);
+#endif
+	if (udp_try_make_stateless(skb)) /* no extension left on the skb */
+		scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
+}
+
+static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
+{
+	/* We come here after udp_lib_checksum_complete() returned 0.
+	 * This means that __skb_checksum_complete() might have
+	 * set skb->csum_valid to 1.
+	 * On 64bit platforms, we can record that in the scratch area by
+	 * setting csum_unnecessary, but only if the skb is not shared.
+	 */
+#if BITS_PER_LONG == 64
+	if (!skb_shared(skb))
+		udp_skb_scratch(skb)->csum_unnecessary = true;
+#endif
+}
+
+static int udp_skb_truesize(struct sk_buff *skb)
+{ /* Scratch copy of the truesize, with the stateless flag bit masked off. */
+	return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
+}
+
+static bool udp_skb_has_head_state(struct sk_buff *skb)
+{ /* True unless UDP_SKB_IS_STATELESS was recorded in the scratch area. */
+	return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS);
+}
+
+/* reclaim rmem/fwd memory for @size bytes; @partial batches small releases */
+static void udp_rmem_release(struct sock *sk, int size, int partial,
+			     bool rx_queue_lock_held)
+{
+	struct udp_sock *up = udp_sk(sk);
+	struct sk_buff_head *sk_queue;
+	int amt;
+
+	if (likely(partial)) {
+		up->forward_deficit += size;
+		size = up->forward_deficit;
+		if (size < READ_ONCE(up->forward_threshold) &&
+		    !skb_queue_empty(&up->reader_queue))
+			return; /* below threshold: keep accumulating */
+	} else {
+		size += up->forward_deficit;
+	}
+	up->forward_deficit = 0;
+
+	/* acquire the sk_receive_queue for fwd allocated memory scheduling,
+	 * if the caller doesn't hold it already
+	 */
+	sk_queue = &sk->sk_receive_queue;
+	if (!rx_queue_lock_held)
+		spin_lock(&sk_queue->lock);
+
+	/* give @size back, then return whole pages beyond @partial bytes */
+	sk_forward_alloc_add(sk, size);
+	amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
+	sk_forward_alloc_add(sk, -amt);
+
+	if (amt)
+		__sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
+
+	atomic_sub(size, &sk->sk_rmem_alloc);
+
+	/* this can save us from acquiring the rx queue lock on next receive */
+	skb_queue_splice_tail_init(sk_queue, &up->reader_queue);
+
+	if (!rx_queue_lock_held)
+		spin_unlock(&sk_queue->lock);
+}
+
+/* Note: called with reader_queue.lock held.
+ * Instead of using skb->truesize here, use the copy kept in the skb scratch
+ * area. This avoids a cache line miss while receive_queue lock is held.
+ * The copy is made by udp_set_dev_scratch() in __udp_enqueue_schedule_skb().
+ */
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
+{
+	prefetch(&skb->data);
+	udp_rmem_release(sk, udp_skb_truesize(skb), 1, false);
+}
+EXPORT_SYMBOL(udp_skb_destructor);
+
+/* same as udp_skb_destructor(), but the caller holds the rx queue lock too */
+static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
+{
+	prefetch(&skb->data);
+	udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
+}
+
+/* The idea of busylocks is to let producers grab an extra spinlock
+ * to relieve pressure on the receive_queue spinlock shared by consumer.
+ * Under flood, this means that only one producer can be in line
+ * trying to acquire the receive_queue spinlock.
+ * These busylocks can be allocated in a per cpu manner, instead of a
+ * per socket one (that would consume a cache line per socket)
+ */
+static int udp_busylocks_log __read_mostly;
+static spinlock_t *udp_busylocks __read_mostly;
+
+static spinlock_t *busylock_acquire(void *ptr)
+{
+	/* the slot is chosen by hashing @ptr into udp_busylocks_log bits */
+	spinlock_t *lock = &udp_busylocks[hash_ptr(ptr, udp_busylocks_log)];
+
+	spin_lock(lock);
+	return lock;
+}
+
+static void busylock_release(spinlock_t *busy)
+{ /* @busy may be NULL when no busylock was taken; then nothing is done. */
+	if (busy)
+		spin_unlock(busy);
+}
+
+static int udp_rmem_schedule(struct sock *sk, int size)
+{
+	int missing = size - sk->sk_forward_alloc;
+
+	/* sk_forward_alloc already covers @size: nothing to schedule */
+	if (missing <= 0)
+		return 0;
+
+	return __sk_mem_schedule(sk, missing, SK_MEM_RECV) ? 0 : -ENOBUFS;
+}
+
+int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
+{ /* Returns 0 once queued, else -ENOMEM/-ENOBUFS; @skb is not freed here. */
+	struct sk_buff_head *list = &sk->sk_receive_queue;
+	int rmem, err = -ENOMEM;
+	spinlock_t *busy = NULL;
+	int size;
+
+	/* try to avoid the costly atomic add/sub pair when the receive
+	 * queue is full; always allow at least a packet
+	 */
+	rmem = atomic_read(&sk->sk_rmem_alloc);
+	if (rmem > sk->sk_rcvbuf)
+		goto drop;
+
+	/* Under mem pressure, it might be helpful to help udp_recvmsg()
+	 * having linear skbs :
+	 * - Reduce memory overhead and thus increase receive queue capacity
+	 * - Less cache line misses at copyout() time
+	 * - Less work at consume_skb() (less alien page frag freeing)
+	 */
+	if (rmem > (sk->sk_rcvbuf >> 1)) {
+		skb_condense(skb);
+
+		busy = busylock_acquire(sk);
+	}
+	size = skb->truesize; /* read after skb_condense() may have run */
+	udp_set_dev_scratch(skb);
+
+	/* we drop only if the receive buf is full and the receive
+	 * queue contains some other skb
+	 */
+	rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
+	if (rmem > (size + (unsigned int)sk->sk_rcvbuf))
+		goto uncharge_drop;
+
+	spin_lock(&list->lock);
+	err = udp_rmem_schedule(sk, size);
+	if (err) {
+		spin_unlock(&list->lock);
+		goto uncharge_drop;
+	}
+
+	sk_forward_alloc_add(sk, -size);
+
+	/* no need to setup a destructor, we will explicitly release the
+	 * forward allocated memory on dequeue
+	 */
+	sock_skb_set_dropcount(sk, skb);
+
+	__skb_queue_tail(list, skb);
+	spin_unlock(&list->lock);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk);
+
+	busylock_release(busy);
+	return 0;
+
+uncharge_drop: /* undo the sk_rmem_alloc charge taken above */
+	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+
+drop:
+	atomic_inc(&sk->sk_drops);
+	busylock_release(busy);
+	return err;
+}
+EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
+
+void udp_destruct_common(struct sock *sk)
+{
+	struct sk_buff_head *rq = &udp_sk(sk)->reader_queue;
+	unsigned int freed = 0;
+	struct sk_buff *skb;
+
+	/* drain both queues, then reclaim all the memory they accounted */
+	skb_queue_splice_tail_init(&sk->sk_receive_queue, rq);
+	for (skb = __skb_dequeue(rq); skb; skb = __skb_dequeue(rq)) {
+		freed += skb->truesize;
+		kfree_skb(skb);
+	}
+	udp_rmem_release(sk, freed, 0, true);
+}
+EXPORT_SYMBOL_GPL(udp_destruct_common);
+
+static void udp_destruct_sock(struct sock *sk)
+{ /* UDP queue cleanup first, then the generic inet socket destructor. */
+	udp_destruct_common(sk);
+	inet_sock_destruct(sk);
+}
+
+int udp_init_sock(struct sock *sk)
+{ /* Common UDP init, install udp_destruct_sock, flag SOCK_SUPPORT_ZC. */
+	udp_lib_init_sock(sk);
+	sk->sk_destruct = udp_destruct_sock;
+	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+	return 0; /* always succeeds */
+}
+
+void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
+{ /* Drop one reference to @skb; move the peek offset back by @len if set. */
+	if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
+		bool slow = lock_sock_fast(sk);
+
+		sk_peek_offset_bwd(sk, len);
+		unlock_sock_fast(sk, slow);
+	}
+
+	if (!skb_unref(skb)) /* other references remain: nothing to free */
+		return;
+
+	/* In the more common cases we cleared the head states previously,
+	 * see __udp_queue_rcv_skb().
+	 */
+	if (unlikely(udp_skb_has_head_state(skb)))
+		skb_release_head_state(skb);
+	__consume_stateless_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_consume_udp);
+
+static struct sk_buff *__first_packet_length(struct sock *sk,
+					     struct sk_buff_head *rcvq,
+					     int *total)
+{
+	struct sk_buff *skb;
+
+	for (skb = skb_peek(rcvq); skb; skb = skb_peek(rcvq)) {
+		if (!udp_lib_checksum_complete(skb)) {
+			udp_skb_csum_unnecessary_set(skb);
+			return skb;
+		}
+		/* bad checksum: count, unlink, add truesize to *total, free */
+		__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
+				IS_UDPLITE(sk));
+		__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
+				IS_UDPLITE(sk));
+		atomic_inc(&sk->sk_drops);
+		__skb_unlink(skb, rcvq);
+		*total += skb->truesize;
+		kfree_skb(skb);
+	}
+	return NULL;
+}
+
+/**
+ *	first_packet_length	- return length of first packet in receive queue
+ *	@sk: socket
+ *
+ *	Drops all bad checksum frames, until a valid one is found.
+ *	Returns the length of found skb, or -1 if none is found.
+ */
+static int first_packet_length(struct sock *sk)
+{
+	struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue;
+	struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
+	struct sk_buff *skb;
+	int total = 0; /* truesize of the frames dropped along the way */
+	int res;
+
+	spin_lock_bh(&rcvq->lock);
+	skb = __first_packet_length(sk, rcvq, &total);
+	if (!skb && !skb_queue_empty_lockless(sk_queue)) {
+		spin_lock(&sk_queue->lock); /* refill the reader queue */
+		skb_queue_splice_tail_init(sk_queue, rcvq);
+		spin_unlock(&sk_queue->lock);
+
+		skb = __first_packet_length(sk, rcvq, &total);
+	}
+	res = skb ? skb->len : -1;
+	if (total)
+		udp_rmem_release(sk, total, 1, false);
+	spin_unlock_bh(&rcvq->lock);
+	return res;
+}
+
+/*
+ *	IOCTL requests applicable to the UDP protocol:
+ *	SIOCOUTQ reports sk_wmem_alloc_get(), SIOCINQ the length of the
+ *	first valid queued datagram (0 when there is none); anything else
+ *	is answered with -ENOIOCTLCMD.
+ */
+
+int udp_ioctl(struct sock *sk, int cmd, int *karg)
+{
+	int first_len;
+
+	switch (cmd) {
+	case SIOCOUTQ:
+		*karg = sk_wmem_alloc_get(sk);
+		break;
+	case SIOCINQ:
+		first_len = first_packet_length(sk);
+		*karg = first_len > 0 ? first_len : 0;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(udp_ioctl);
+
+struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
+			       int *off, int *err)
+{ /* Returns the next skb (only peeked with MSG_PEEK), or NULL with *err set. */
+	struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
+	struct sk_buff_head *queue;
+	struct sk_buff *last;
+	long timeo;
+	int error;
+
+	queue = &udp_sk(sk)->reader_queue;
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	do {
+		struct sk_buff *skb;
+
+		error = sock_error(sk);
+		if (error)
+			break;
+
+		error = -EAGAIN;
+		do {
+			spin_lock_bh(&queue->lock);
+			skb = __skb_try_recv_from_queue(sk, queue, flags, off,
+							err, &last);
+			if (skb) {
+				if (!(flags & MSG_PEEK))
+					udp_skb_destructor(sk, skb);
+				spin_unlock_bh(&queue->lock);
+				return skb;
+			}
+
+			if (skb_queue_empty_lockless(sk_queue)) {
+				spin_unlock_bh(&queue->lock);
+				goto busy_check;
+			}
+
+			/* refill the reader queue and walk it again;
+			 * keep both queues locked to avoid re-acquiring
+			 * the sk_receive_queue lock if fwd memory scheduling
+			 * is needed.
+			 */
+			spin_lock(&sk_queue->lock);
+			skb_queue_splice_tail_init(sk_queue, queue);
+
+			skb = __skb_try_recv_from_queue(sk, queue, flags, off,
+							err, &last);
+			if (skb && !(flags & MSG_PEEK))
+				udp_skb_dtor_locked(sk, skb);
+			spin_unlock(&sk_queue->lock);
+			spin_unlock_bh(&queue->lock);
+			if (skb)
+				return skb;
+
+busy_check:
+			if (!sk_can_busy_loop(sk))
+				break;
+
+			sk_busy_loop(sk, flags & MSG_DONTWAIT);
+		} while (!skb_queue_empty_lockless(sk_queue));
+
+		/* sk_queue is empty, reader_queue may contain peeked packets */
+	} while (timeo &&
+		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
+					      &error, &timeo,
+					      (struct sk_buff *)sk_queue));
+
+	*err = error; /* sock_error() result, -EAGAIN, or set by the wait */
+	return NULL;
+}
+EXPORT_SYMBOL(__skb_recv_udp);
+
+int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
+{
+	struct sk_buff *skb;
+	int err;
+
+	/* dequeue without blocking until a datagram passes the checksum */
+	for (;;) {
+		skb = skb_recv_udp(sk, MSG_DONTWAIT, &err);
+		if (!skb)
+			return err;
+		if (!udp_lib_checksum_complete(skb))
+			break;
+
+		__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
+				IS_UDPLITE(sk));
+		__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
+				IS_UDPLITE(sk));
+		atomic_inc(&sk->sk_drops);
+		kfree_skb(skb);
+	}
+
+	WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
+	return recv_actor(sk, skb);
+}
+EXPORT_SYMBOL(udp_read_skb);
+
+/*
+ * Receive one datagram from @sk into @msg. Returns the number of bytes
+ * copied (the full datagram length with MSG_TRUNC) or a negative errno.
+ */
+
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+		int *addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+	struct sk_buff *skb;
+	unsigned int ulen, copied;
+	int off, err, peeking = flags & MSG_PEEK;
+	int is_udplite = IS_UDPLITE(sk);
+	bool checksum_valid = false;
+
+	if (flags & MSG_ERRQUEUE) /* error queue reads are handled by IP */
+		return ip_recv_error(sk, msg, len, addr_len);
+
+try_again:
+	off = sk_peek_offset(sk, flags);
+	skb = __skb_recv_udp(sk, flags, &off, &err);
+	if (!skb)
+		return err;
+
+	ulen = udp_skb_len(skb);
+	copied = len;
+	if (copied > ulen - off)
+		copied = ulen - off;
+	else if (copied < ulen)
+		msg->msg_flags |= MSG_TRUNC;
+
+	/*
+	 * If checksum is needed at all, try to do it while copying the
+	 * data.  If the data is truncated, or if we only want a partial
+	 * coverage checksum (UDP-Lite), do it before the copy.
+	 */
+
+	if (copied < ulen || peeking ||
+	    (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
+		checksum_valid = udp_skb_csum_unnecessary(skb) ||
+				!__udp_lib_checksum_complete(skb);
+		if (!checksum_valid)
+			goto csum_copy_err;
+	}
+
+	if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
+		if (udp_skb_is_linear(skb))
+			err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
+		else
+			err = skb_copy_datagram_msg(skb, off, msg, copied);
+	} else { /* not verified yet: checksum while copying */
+		err = skb_copy_and_csum_datagram_msg(skb, off, msg);
+
+		if (err == -EINVAL)
+			goto csum_copy_err;
+	}
+
+	if (unlikely(err)) {
+		if (!peeking) {
+			atomic_inc(&sk->sk_drops);
+			UDP_INC_STATS(sock_net(sk),
+				      UDP_MIB_INERRORS, is_udplite);
+		}
+		kfree_skb(skb);
+		return err;
+	}
+
+	if (!peeking)
+		UDP_INC_STATS(sock_net(sk),
+			      UDP_MIB_INDATAGRAMS, is_udplite);
+
+	sock_recv_cmsgs(msg, sk, skb);
+
+	/* Copy the address. */
+	if (sin) {
+		sin->sin_family = AF_INET;
+		sin->sin_port = udp_hdr(skb)->source;
+		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+		*addr_len = sizeof(*sin);
+
+		BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
+						      (struct sockaddr *)sin,
+						      addr_len);
+	}
+
+	if (udp_test_bit(GRO_ENABLED, sk))
+		udp_cmsg_recv(msg, sk, skb);
+
+	if (inet_cmsg_flags(inet))
+		ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
+
+	err = copied;
+	if (flags & MSG_TRUNC) /* caller asked for the real datagram length */
+		err = ulen;
+
+	skb_consume_udp(sk, skb, peeking ? -err : err);
+	return err;
+
+csum_copy_err: /* bad checksum: drop the skb from the queue and retry */
+	if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
+				 udp_skb_destructor)) {
+		UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+		UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+	}
+	kfree_skb(skb);
+
+	/* starting over for a new packet, but check if we need to yield */
+	cond_resched();
+	msg->msg_flags &= ~MSG_TRUNC;
+	goto try_again;
+}
+
+int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	/* Same length check as in __ip4_datagram_connect(): it keeps the
+	 * BPF program run below from accessing bytes beyond the bound the
+	 * user specified in addr_len.
+	 */
+	if (addr_len >= sizeof(struct sockaddr_in))
+		return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(udp_pre_connect);
+
+int __udp_disconnect(struct sock *sk, int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	/*
+	 *	1003.1g - break association. @flags is unused; returns 0.
+	 */
+
+	sk->sk_state = TCP_CLOSE;
+	inet->inet_daddr = 0;
+	inet->inet_dport = 0;
+	sock_rps_reset_rxhash(sk);
+	sk->sk_bound_dev_if = 0;
+	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
+		inet_reset_saddr(sk);
+		if (sk->sk_prot->rehash &&
+		    (sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+			sk->sk_prot->rehash(sk);
+	}
+
+	if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
+		sk->sk_prot->unhash(sk);
+		inet->inet_sport = 0;
+	}
+	sk_dst_reset(sk);
+	return 0;
+}
+EXPORT_SYMBOL(__udp_disconnect);
+
+int udp_disconnect(struct sock *sk, int flags)
+{ /* __udp_disconnect() under the socket lock; always returns 0. */
+	lock_sock(sk);
+	__udp_disconnect(sk, flags);
+	release_sock(sk);
+	return 0;
+}
+EXPORT_SYMBOL(udp_disconnect);
+
+void udp_lib_unhash(struct sock *sk)
+{
+	struct udp_hslot *hslot, *hslot2;
+	struct udp_table *udptable;
+
+	if (!sk_hashed(sk))
+		return;
+
+	udptable = udp_get_table_prot(sk);
+	hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash);
+	hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+	spin_lock_bh(&hslot->lock);
+	if (rcu_access_pointer(sk->sk_reuseport_cb))
+		reuseport_detach_sock(sk);
+	if (sk_del_node_init_rcu(sk)) {
+		hslot->count--;
+		inet_sk(sk)->inet_num = 0;
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+		/* the secondary slot lock nests inside the primary one */
+		spin_lock(&hslot2->lock);
+		hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+		hslot2->count--;
+		spin_unlock(&hslot2->lock);
+	}
+	spin_unlock_bh(&hslot->lock);
+}
+EXPORT_SYMBOL(udp_lib_unhash);
+
+/*
+ * inet_rcv_saddr was changed: move @sk to the secondary slot of @newhash
+ */
+void udp_lib_rehash(struct sock *sk, u16 newhash)
+{
+	struct udp_hslot *hslot, *old_slot2, *new_slot2;
+	struct udp_table *udptable;
+
+	if (!sk_hashed(sk))
+		return;
+
+	udptable = udp_get_table_prot(sk);
+	old_slot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+	new_slot2 = udp_hashslot2(udptable, newhash);
+	udp_sk(sk)->udp_portaddr_hash = newhash;
+
+	/* same secondary slot and no reuseport group: nothing to move */
+	if (old_slot2 == new_slot2 && !rcu_access_pointer(sk->sk_reuseport_cb))
+		return;
+
+	/* we must lock primary chain too */
+	hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash);
+	spin_lock_bh(&hslot->lock);
+	if (rcu_access_pointer(sk->sk_reuseport_cb))
+		reuseport_detach_sock(sk);
+	if (old_slot2 != new_slot2) {
+		spin_lock(&old_slot2->lock);
+		hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+		old_slot2->count--;
+		spin_unlock(&old_slot2->lock);
+
+		spin_lock(&new_slot2->lock);
+		hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+				   &new_slot2->head);
+		new_slot2->count++;
+		spin_unlock(&new_slot2->lock);
+	}
+	spin_unlock_bh(&hslot->lock);
+}
+EXPORT_SYMBOL(udp_lib_rehash);
+
+void udp_v4_rehash(struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+
+	/* new hash comes from sock_net(sk), inet_rcv_saddr and inet_num */
+	udp_lib_rehash(sk, ipv4_portaddr_hash(sock_net(sk), inet->inet_rcv_saddr,
+					      inet->inet_num));
+}
+
+static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{ /* Queue @skb on @sk: 0 on success; -1 after counting and freeing it. */
+	int rc;
+
+	if (inet_sk(sk)->inet_daddr) { /* inet_daddr is set */
+		sock_rps_save_rxhash(sk, skb);
+		sk_mark_napi_id(sk, skb);
+		sk_incoming_cpu_update(sk);
+	} else {
+		sk_mark_napi_id_once(sk, skb);
+	}
+
+	rc = __udp_enqueue_schedule_skb(sk, skb);
+	if (rc < 0) {
+		int is_udplite = IS_UDPLITE(sk);
+		int drop_reason;
+
+		/* Note that an ENOMEM error is charged twice */
+		if (rc == -ENOMEM) {
+			UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+					is_udplite);
+			drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+		} else {
+			UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
+				      is_udplite);
+			drop_reason = SKB_DROP_REASON_PROTO_MEM;
+		}
+		UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+		kfree_skb_reason(skb, drop_reason);
+		trace_udp_fail_queue_rcv_skb(rc, sk);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* returns:
+ *  -1: error
+ *   0: success
+ *  >0: "udp encap" protocol resubmission
+ *
+ * Note that in the success and error cases, the skb is assumed to
+ * have either been requeued or freed.
+ */
+static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	struct udp_sock *up = udp_sk(sk);
+	int is_udplite = IS_UDPLITE(sk);
+
+	/*
+	 *	Inbound xfrm policy check first; the queueing itself comes last.
+	 */
+	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
+		goto drop;
+	}
+	nf_reset_ct(skb);
+
+	if (static_branch_unlikely(&udp_encap_needed_key) &&
+	    READ_ONCE(up->encap_type)) {
+		int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
+
+		/*
+		 * This is an encapsulation socket so pass the skb to
+		 * the socket's udp_encap_rcv() hook. Otherwise, just
+		 * fall through and pass this up the UDP socket.
+		 * up->encap_rcv() returns the following value:
+		 * =0 if skb was successfully passed to the encap
+		 *    handler or was discarded by it.
+		 * >0 if skb should be passed on to UDP.
+		 * <0 if skb should be resubmitted as proto -N
+		 */
+
+		/* paired with WRITE_ONCE() of encap_rcv in udp_lib_setsockopt() */
+		encap_rcv = READ_ONCE(up->encap_rcv);
+		if (encap_rcv) {
+			int ret;
+
+			/* Verify checksum before giving to encap */
+			if (udp_lib_checksum_complete(skb))
+				goto csum_error;
+
+			ret = encap_rcv(sk, skb);
+			if (ret <= 0) {
+				__UDP_INC_STATS(sock_net(sk),
+						UDP_MIB_INDATAGRAMS,
+						is_udplite);
+				return -ret;
+			}
+		}
+
+		/* FALLTHROUGH -- it's a UDP Packet */
+	}
+
+	/*
+	 * 	UDP-Lite specific tests, ignored on UDP sockets
+	 */
+	if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+		u16 pcrlen = READ_ONCE(up->pcrlen);
+
+		/*
+		 * MIB statistics other than incrementing the error count are
+		 * disabled for the following two types of errors: these depend
+		 * on the application settings, not on the functioning of the
+		 * protocol stack as such.
+		 *
+		 * RFC 3828 here recommends (sec 3.3): "There should also be a
+		 * way ... to ... at least let the receiving application block
+		 * delivery of packets with coverage values less than a value
+		 * provided by the application."
+		 */
+		if (pcrlen == 0) {          /* full coverage was set  */
+			net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
+					    UDP_SKB_CB(skb)->cscov, skb->len);
+			goto drop;
+		}
+		/* The next case involves violating the min. coverage requested
+		 * by the receiver. This is subtle: if receiver wants x and x is
+		 * greater than the buffersize/MTU then receiver will complain
+		 * that it wants x while sender emits packets of smaller size y.
+		 * Therefore the above ...()->partial_cov statement is essential.
+		 */
+		if (UDP_SKB_CB(skb)->cscov < pcrlen) {
+			net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
+					    UDP_SKB_CB(skb)->cscov, pcrlen);
+			goto drop;
+		}
+	}
+
+	prefetch(&sk->sk_rmem_alloc);
+	if (rcu_access_pointer(sk->sk_filter) &&
+	    udp_lib_checksum_complete(skb))
+			goto csum_error;
+
+	if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
+		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+		goto drop;
+	}
+
+	udp_csum_pull_header(skb);
+
+	ipv4_pktinfo_prepare(sk, skb);
+	return __udp_queue_rcv_skb(sk, skb);
+
+csum_error:
+	drop_reason = SKB_DROP_REASON_UDP_CSUM;
+	__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+drop:
+	__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+	atomic_inc(&sk->sk_drops);
+	kfree_skb_reason(skb, drop_reason);
+	return -1;
+}
+
+static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff *next, *segs;
+	int ret;
+	/* Queue skb on sk, segmenting an unexpected GSO packet beforehand. */
+	if (likely(!udp_unexpected_gso(sk, skb)))
+		return udp_queue_rcv_one_skb(sk, skb);
+
+	BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET);
+	__skb_push(skb, -skb_mac_offset(skb));
+	segs = udp_rcv_segment(sk, skb, true);
+	skb_list_walk_safe(segs, skb, next) {
+		__skb_pull(skb, skb_transport_offset(skb));
+
+		udp_post_segment_fix_csum(skb);
+		ret = udp_queue_rcv_one_skb(sk, skb);
+		if (ret > 0)	/* encap resubmission requested for this segment */
+			ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
+	}
+	return 0;
+}
+
+/* TCP serialises sk_rx_dst updates with the socket lock; UDP instead
+ * swaps the pointer with xchg() to cope with concurrent changes.
+ */
+bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
+{
+	struct dst_entry *prev;
+
+	if (!dst_hold_safe(dst))
+		return false;
+
+	prev = xchg((__force struct dst_entry **)&sk->sk_rx_dst, dst);
+	dst_release(prev);
+	return prev != dst;
+}
+EXPORT_SYMBOL(udp_sk_rx_dst_set);
+
+/*
+ *	Multicasts and broadcasts go to each listener: later matches get a
+ *	clone, the first match gets the original skb.  Always returns 0.
+ *	Note: called only from the BH handler context.
+ */
+static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
+				    struct udphdr  *uh,
+				    __be32 saddr, __be32 daddr,
+				    struct udp_table *udptable,
+				    int proto)
+{
+	struct sock *sk, *first = NULL;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+	unsigned int offset = offsetof(typeof(*sk), sk_node);
+	int dif = skb->dev->ifindex;
+	int sdif = inet_sdif(skb);
+	struct hlist_node *node;
+	struct sk_buff *nskb;
+	/* A long port chain switches the walk to the (address, port) hash2 slots. */
+	if (use_hash2) {
+		hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+			    udptable->mask;
+		hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
+start_lookup:
+		hslot = &udptable->hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
+
+	sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
+		if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
+					 uh->source, saddr, dif, sdif, hnum))
+			continue;
+
+		if (!first) {
+			first = sk;
+			continue;
+		}
+		nskb = skb_clone(skb, GFP_ATOMIC);
+
+		if (unlikely(!nskb)) {
+			atomic_inc(&sk->sk_drops);
+			__UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
+					IS_UDPLITE(sk));
+			__UDP_INC_STATS(net, UDP_MIB_INERRORS,
+					IS_UDPLITE(sk));
+			continue;
+		}
+		if (udp_queue_rcv_skb(sk, nskb) > 0)
+			consume_skb(nskb);
+	}
+
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
+	if (first) {
+		if (udp_queue_rcv_skb(first, skb) > 0)
+			consume_skb(skb);
+	} else {
+		kfree_skb(skb);
+		__UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
+				proto == IPPROTO_UDPLITE);
+	}
+	return 0;
+}
+
+/* Initialize UDP checksum state.  On a zero (success) return,
+ * CHECKSUM_UNNECESSARY means that no more checks are required.
+ * Otherwise, csum completion requires checksumming the packet body,
+ * including the udp header, and folding it into skb->csum.
+ */
+static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
+				 int proto)
+{
+	int err;
+	/* Start from full coverage; the UDP-Lite init below may flag partial. */
+	UDP_SKB_CB(skb)->partial_cov = 0;
+	UDP_SKB_CB(skb)->cscov = skb->len;
+
+	if (proto == IPPROTO_UDPLITE) {
+		err = udplite_checksum_init(skb, uh);
+		if (err)
+			return err;
+
+		if (UDP_SKB_CB(skb)->partial_cov) {
+			skb->csum = inet_compute_pseudo(skb, proto);
+			return 0;
+		}
+	}
+
+	/* Note, we are only interested in != 0 or == 0, thus the
+	 * force to int.
+	 */
+	err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+							inet_compute_pseudo);
+	if (err)
+		return err;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) {
+		/* If SW calculated the value, we know it's bad */
+		if (skb->csum_complete_sw)
+			return 1;
+
+		/* HW says the value is bad. Let's validate that.
+		 * skb->csum is no longer the full packet checksum,
+		 * so don't treat it as such.
+		 */
+		skb_checksum_complete_unset(skb);
+	}
+
+	return 0;
+}
+
+/* Wrapper for udp_queue_rcv_skb() taking care of csum conversion and of
+ * translating the return code into what the ip layer consumes.
+ */
+static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
+			       struct udphdr *uh)
+{
+	int rc;
+
+	if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
+		skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);
+
+	rc = udp_queue_rcv_skb(sk, skb);
+
+	/* Only a positive result asks for resubmission, which the caller
+	 * wants expressed as -protocol; everything else becomes 0.
+	 */
+	if (rc <= 0)
+		return 0;
+	return -rc;
+}
+
+/*
+ *	Validate the datagram, then deliver it to the matching socket(s).
+ */
+
+int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
+		   int proto)
+{
+	struct sock *sk;
+	struct udphdr *uh;
+	unsigned short ulen;
+	struct rtable *rt = skb_rtable(skb);
+	__be32 saddr, daddr;
+	struct net *net = dev_net(skb->dev);
+	bool refcounted;
+	int drop_reason;
+
+	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+
+	/*
+	 *  Validate the packet.
+	 */
+	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+		goto drop;		/* No space for header. */
+
+	uh   = udp_hdr(skb);
+	ulen = ntohs(uh->len);
+	saddr = ip_hdr(skb)->saddr;
+	daddr = ip_hdr(skb)->daddr;
+
+	if (ulen > skb->len)
+		goto short_packet;
+
+	if (proto == IPPROTO_UDP) {
+		/* UDP validates ulen. */
+		if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
+			goto short_packet;
+		uh = udp_hdr(skb);
+	}
+
+	if (udp4_csum_init(skb, uh, proto))
+		goto csum_error;
+
+	sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest,
+			     &refcounted, udp_ehashfn);
+	if (IS_ERR(sk))
+		goto no_sk;
+
+	if (sk) {
+		struct dst_entry *dst = skb_dst(skb);
+		int ret;
+
+		if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
+			udp_sk_rx_dst_set(sk, dst);
+
+		ret = udp_unicast_rcv_skb(sk, skb, uh);
+		if (refcounted)
+			sock_put(sk);
+		return ret;
+	}
+
+	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+		return __udp4_lib_mcast_deliver(net, skb, uh,
+						saddr, daddr, udptable, proto);
+
+	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+	if (sk)
+		return udp_unicast_rcv_skb(sk, skb, uh);
+no_sk:
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+		goto drop;
+	nf_reset_ct(skb);
+
+	/* No socket. Drop packet silently, if checksum is wrong */
+	if (udp_lib_checksum_complete(skb))
+		goto csum_error;
+
+	drop_reason = SKB_DROP_REASON_NO_SOCKET;
+	__UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	/*
+	 * We got a UDP packet for a port nobody listens on: the ICMP
+	 * port-unreachable was sent above, now free the skb.
+	 */
+	kfree_skb_reason(skb, drop_reason);
+	return 0;
+
+short_packet:
+	drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
+	net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
+			    proto == IPPROTO_UDPLITE ? "Lite" : "",
+			    &saddr, ntohs(uh->source),
+			    ulen, skb->len,
+			    &daddr, ntohs(uh->dest));
+	goto drop;
+
+csum_error:
+	/*
+	 * RFC1122: OK.  Discards the bad packet silently (as far as
+	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
+	 */
+	drop_reason = SKB_DROP_REASON_UDP_CSUM;
+	net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
+			    proto == IPPROTO_UDPLITE ? "Lite" : "",
+			    &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
+			    ulen);
+	__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
+drop:
+	__UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
+	kfree_skb_reason(skb, drop_reason);
+	return 0;
+}
+
+/* Early demux of multicast is only possible when exactly one socket
+ * matches; NULL is returned when none or several do.
+ */
+static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
+						  __be16 loc_port, __be32 loc_addr,
+						  __be16 rmt_port, __be32 rmt_addr,
+						  int dif, int sdif)
+{
+	struct udp_table *udptable = net->ipv4.udp_table;
+	unsigned short hnum = ntohs(loc_port);
+	struct sock *sk, *match;
+	struct udp_hslot *hslot;
+	unsigned int idx;
+
+	idx = udp_hashfn(net, hnum, udptable->mask);
+	hslot = &udptable->hash[idx];
+
+	/* Long chains are not worth walking here */
+	if (hslot->count > 10)
+		return NULL;
+
+	match = NULL;
+	sk_for_each_rcu(sk, &hslot->head) {
+		if (!__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
+					 rmt_port, rmt_addr, dif, sdif, hnum))
+			continue;
+		if (match)
+			return NULL;
+		match = sk;
+	}
+
+	return match;
+}
+
+/* For unicast we should only early demux connected sockets or we can
+ * break forwarding setups.  The chains here can be long so only check
+ * if the first socket is an exact match and if not move on.
+ */
+static struct sock *__udp4_lib_demux_lookup(struct net *net,
+					    __be16 loc_port, __be32 loc_addr,
+					    __be16 rmt_port, __be32 rmt_addr,
+					    int dif, int sdif)
+{
+	struct udp_table *udptable = net->ipv4.udp_table;
+	INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
+	unsigned short hnum = ntohs(loc_port);
+	unsigned int hash2, slot2;
+	struct udp_hslot *hslot2;
+	__portpair ports;
+	struct sock *sk;
+	/* The hash2 slot is selected by (local address, local port). */
+	hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
+	slot2 = hash2 & udptable->mask;
+	hslot2 = &udptable->hash2[slot2];
+	ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
+	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+		if (inet_match(net, sk, acookie, ports, dif, sdif))
+			return sk;
+		/* Only check first socket in chain */
+		break;
+	}
+	return NULL;
+}
+
+int udp_v4_early_demux(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	struct in_device *in_dev = NULL;
+	const struct iphdr *iph;
+	const struct udphdr *uh;
+	struct sock *sk = NULL;
+	struct dst_entry *dst;
+	int dif = skb->dev->ifindex;
+	int sdif = inet_sdif(skb);
+	int ours;
+	/* Try to attach a matching socket and its cached rx dst to the skb. */
+	/* validate the packet */
+	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
+		return 0;
+
+	iph = ip_hdr(skb);
+	uh = udp_hdr(skb);
+
+	if (skb->pkt_type == PACKET_MULTICAST) {
+		in_dev = __in_dev_get_rcu(skb->dev);
+
+		if (!in_dev)
+			return 0;
+
+		ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
+				       iph->protocol);
+		if (!ours)
+			return 0;
+
+		sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
+						   uh->source, iph->saddr,
+						   dif, sdif);
+	} else if (skb->pkt_type == PACKET_HOST) {
+		sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
+					     uh->source, iph->saddr, dif, sdif);
+	}
+
+	if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
+		return 0;
+
+	skb->sk = sk;
+	skb->destructor = sock_efree;
+	dst = rcu_dereference(sk->sk_rx_dst);
+
+	if (dst)
+		dst = dst_check(dst, 0);
+	if (dst) {
+		u32 itag = 0;
+
+		/* set noref for now.
+		 * any place which wants to hold dst has to call
+		 * dst_hold_safe()
+		 */
+		skb_dst_set_noref(skb, dst);
+
+		/* for unconnected multicast sockets we need to validate
+		 * the source on each packet
+		 */
+		if (!inet_sk(sk)->inet_daddr && in_dev)
+			return ip_mc_validate_source(skb, iph->daddr,
+						     iph->saddr,
+						     iph->tos & IPTOS_RT_MASK,
+						     skb->dev, in_dev, &itag);
+	}
+	return 0;
+}
+
+int udp_rcv(struct sk_buff *skb)
+{	/* Plain UDP input: common receive path on the netns UDP table. */
+	return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
+}
+
+void udp_destroy_sock(struct sock *sk)
+{
+	struct udp_sock *up = udp_sk(sk);
+	bool slow = lock_sock_fast(sk);
+	/* Mark sk dead, flush pending frames, then run the encap teardown. */
+	/* protects from races with udp_abort() */
+	sock_set_flag(sk, SOCK_DEAD);
+	udp_flush_pending_frames(sk);
+	unlock_sock_fast(sk, slow);
+	if (static_branch_unlikely(&udp_encap_needed_key)) {
+		if (up->encap_type) {
+			void (*encap_destroy)(struct sock *sk);
+			encap_destroy = READ_ONCE(up->encap_destroy);
+			if (encap_destroy)
+				encap_destroy(sk);
+		}
+		if (udp_test_bit(ENCAP_ENABLED, sk))
+			static_branch_dec(&udp_encap_needed_key);
+	}
+}
+
+/*
+ *	Set a socket option on a UDP or UDP-Lite socket.
+ */
+int udp_lib_setsockopt(struct sock *sk, int level, int optname,
+		       sockptr_t optval, unsigned int optlen,
+		       int (*push_pending_frames)(struct sock *))
+{
+	struct udp_sock *up = udp_sk(sk);
+	int val, valbool;
+	int err = 0;
+	int is_udplite = IS_UDPLITE(sk);
+	/* SOL_SOCKET is forwarded; SO_RCVBUF* also refreshes forward_threshold. */
+	if (level == SOL_SOCKET) {
+		err = sk_setsockopt(sk, level, optname, optval, optlen);
+
+		if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) {
+			sockopt_lock_sock(sk);
+			/* paired with READ_ONCE in udp_rmem_release() */
+			WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2);
+			sockopt_release_sock(sk);
+		}
+		return err;
+	}
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&val, optval, sizeof(val)))
+		return -EFAULT;
+
+	valbool = val ? 1 : 0;
+
+	switch (optname) {
+	case UDP_CORK:
+		if (val != 0) {
+			udp_set_bit(CORK, sk);
+		} else {
+			udp_clear_bit(CORK, sk);
+			lock_sock(sk);
+			push_pending_frames(sk);
+			release_sock(sk);
+		}
+		break;
+
+	case UDP_ENCAP:
+		switch (val) {
+		case 0:
+#ifdef CONFIG_XFRM
+		case UDP_ENCAP_ESPINUDP:
+		case UDP_ENCAP_ESPINUDP_NON_IKE:
+#if IS_ENABLED(CONFIG_IPV6)
+			if (sk->sk_family == AF_INET6)
+				WRITE_ONCE(up->encap_rcv,
+					   ipv6_stub->xfrm6_udp_encap_rcv);
+			else
+#endif
+				WRITE_ONCE(up->encap_rcv,
+					   xfrm4_udp_encap_rcv);
+#endif
+			fallthrough;
+		case UDP_ENCAP_L2TPINUDP:
+			WRITE_ONCE(up->encap_type, val);
+			udp_tunnel_encap_enable(sk);
+			break;
+		default:
+			err = -ENOPROTOOPT;
+			break;
+		}
+		break;
+
+	case UDP_NO_CHECK6_TX:
+		udp_set_no_check6_tx(sk, valbool);
+		break;
+
+	case UDP_NO_CHECK6_RX:
+		udp_set_no_check6_rx(sk, valbool);
+		break;
+
+	case UDP_SEGMENT:
+		if (val < 0 || val > USHRT_MAX)
+			return -EINVAL;
+		WRITE_ONCE(up->gso_size, val);
+		break;
+
+	case UDP_GRO:
+
+		/* when enabling GRO, accept the related GSO packet type */
+		if (valbool)
+			udp_tunnel_encap_enable(sk);
+		udp_assign_bit(GRO_ENABLED, sk, valbool);
+		udp_assign_bit(ACCEPT_L4, sk, valbool);
+		break;
+
+	/*
+	 * 	UDP-Lite's partial checksum coverage (RFC 3828).
+	 */
+	/* The sender sets actual checksum coverage length via this option.
+	 * The case coverage > packet length is handled by send module. */
+	case UDPLITE_SEND_CSCOV:
+		if (!is_udplite)         /* Disable the option on UDP sockets */
+			return -ENOPROTOOPT;
+		if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
+			val = 8;
+		else if (val > USHRT_MAX)
+			val = USHRT_MAX;
+		WRITE_ONCE(up->pcslen, val);
+		udp_set_bit(UDPLITE_SEND_CC, sk);
+		break;
+
+	/* The receiver specifies a minimum checksum coverage value. To make
+	 * sense, this should be set to at least 8 (as done below). If zero is
+	 * used, this again means full checksum coverage.                     */
+	case UDPLITE_RECV_CSCOV:
+		if (!is_udplite)         /* Disable the option on UDP sockets */
+			return -ENOPROTOOPT;
+		if (val != 0 && val < 8) /* Avoid silly minimal values.       */
+			val = 8;
+		else if (val > USHRT_MAX)
+			val = USHRT_MAX;
+		WRITE_ONCE(up->pcrlen, val);
+		udp_set_bit(UDPLITE_RECV_CC, sk);
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(udp_lib_setsockopt);
+
+int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+		   unsigned int optlen)
+{
+	if (level != SOL_UDP && level != SOL_UDPLITE && level != SOL_SOCKET)
+		return ip_setsockopt(sk, level, optname, optval, optlen);
+	/* UDP, UDP-Lite and socket-level options share one handler. */
+	return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+				  udp_push_pending_frames);
+}
+
+/* Copy int option @optname to @optval; *@optlen is in/out and is clamped
+ * to sizeof(int).  Returns 0, -EINVAL, -EFAULT or -ENOPROTOOPT. */
+int udp_lib_getsockopt(struct sock *sk, int level, int optname,
+		       char __user *optval, int __user *optlen)
+{
+	struct udp_sock *up = udp_sk(sk);
+	int val, len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	/* Must precede the unsigned clamp, which would hide a negative len. */
+	if (len < 0)
+		return -EINVAL;
+	len = min_t(unsigned int, len, sizeof(int));
+
+	switch (optname) {
+	case UDP_CORK:
+		val = udp_test_bit(CORK, sk);
+		break;
+
+	case UDP_ENCAP:
+		val = READ_ONCE(up->encap_type);
+		break;
+
+	case UDP_NO_CHECK6_TX:
+		val = udp_get_no_check6_tx(sk);
+		break;
+
+	case UDP_NO_CHECK6_RX:
+		val = udp_get_no_check6_rx(sk);
+		break;
+
+	case UDP_SEGMENT:
+		val = READ_ONCE(up->gso_size);
+		break;
+
+	case UDP_GRO:
+		val = udp_test_bit(GRO_ENABLED, sk);
+		break;
+
+	/* The following two cannot be changed on UDP sockets, the return is
+	 * always 0 (which corresponds to the full checksum coverage of UDP). */
+	case UDPLITE_SEND_CSCOV:
+		val = READ_ONCE(up->pcslen);
+		break;
+
+	case UDPLITE_RECV_CSCOV:
+		val = READ_ONCE(up->pcrlen);
+		break;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL(udp_lib_getsockopt);
+
+int udp_getsockopt(struct sock *sk, int level, int optname,
+		   char __user *optval, int __user *optlen)
+{
+	if (level != SOL_UDP && level != SOL_UDPLITE)
+		return ip_getsockopt(sk, level, optname, optval, optlen);
+	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
+}
+
+/**
+ * 	udp_poll - wait for a UDP event.
+ *	@file: - file struct
+ *	@sock: - socket
+ *	@wait: - poll table
+ *
+ *	This is the same as datagram poll, except for the special case of
+ *	blocking sockets. If an application is using a blocking fd
+ *	and a packet with a checksum error is in the queue,
+ *	then select could indicate that data is available
+ *	and the read would then block. Add special case code
+ *	to work around these arguably broken applications.
+ */
+__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+	__poll_t mask = datagram_poll(file, sock, wait);
+	struct sock *sk = sock->sk;
+
+	if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
+		mask |= EPOLLIN | EPOLLRDNORM;
+
+	/* Check for false positives due to checksum errors */
+	if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
+	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
+		mask &= ~(EPOLLIN | EPOLLRDNORM);
+
+	/* psock ingress_msg queue should not contain any bad checksum frames */
+	if (sk_is_readable(sk))
+		mask |= EPOLLIN | EPOLLRDNORM;
+	return mask;
+
+}
+EXPORT_SYMBOL(udp_poll);
+
+int udp_abort(struct sock *sk, int err)
+{
+	if (!has_current_bpf_ctx())
+		lock_sock(sk);
+
+	/* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
+	 * with close()
+	 */
+	if (sock_flag(sk, SOCK_DEAD))
+		goto out;
+	/* Report err on the socket, then disconnect it. */
+	sk->sk_err = err;
+	sk_error_report(sk);
+	__udp_disconnect(sk, 0);
+
+out:
+	if (!has_current_bpf_ctx())
+		release_sock(sk);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(udp_abort);
+
+struct proto udp_prot = {
+	.name			= "UDP",
+	.owner			= THIS_MODULE,
+	.close			= udp_lib_close,
+	.pre_connect		= udp_pre_connect,
+	.connect		= ip4_datagram_connect,
+	.disconnect		= udp_disconnect,
+	.ioctl			= udp_ioctl,
+	.init			= udp_init_sock,
+	.destroy		= udp_destroy_sock,
+	.setsockopt		= udp_setsockopt,
+	.getsockopt		= udp_getsockopt,
+	.sendmsg		= udp_sendmsg,
+	.recvmsg		= udp_recvmsg,
+	.splice_eof		= udp_splice_eof,
+	.release_cb		= ip4_datagram_release_cb,
+	.hash			= udp_lib_hash,
+	.unhash			= udp_lib_unhash,
+	.rehash			= udp_v4_rehash,
+	.get_port		= udp_v4_get_port,
+	.put_port		= udp_lib_unhash,	/* same handler as .unhash */
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= udp_bpf_update_proto,
+#endif
+	.memory_allocated	= &udp_memory_allocated,
+	.per_cpu_fw_alloc	= &udp_memory_per_cpu_fw_alloc,
+
+	.sysctl_mem		= sysctl_udp_mem,
+	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_udp_wmem_min),
+	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_udp_rmem_min),
+	.obj_size		= sizeof(struct udp_sock),
+	.h.udp_table		= NULL,
+	.diag_destroy		= udp_abort,	/* reports the error and disconnects */
+};
+EXPORT_SYMBOL(udp_prot);
+
+/* ------------------------------------------------------------------------ */
+#ifdef CONFIG_PROC_FS
+
+static unsigned short seq_file_family(const struct seq_file *seq);
+static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
+{
+	unsigned short af = seq_file_family(seq);
+
+	/* An AF_UNSPEC iterator accepts sockets of any family. */
+	if (af != AF_UNSPEC && af != sk->sk_family)
+		return false;
+	return net_eq(sock_net(sk), seq_file_net(seq));
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static const struct seq_operations bpf_iter_udp_seq_ops;
+#endif
+static struct udp_table *udp_get_table_seq(struct seq_file *seq,
+					   struct net *net)
+{
+	const struct udp_seq_afinfo *afinfo;
+	/* BPF iterators always use the netns table; procfs afinfo may override. */
+#ifdef CONFIG_BPF_SYSCALL
+	if (seq->op == &bpf_iter_udp_seq_ops)
+		return net->ipv4.udp_table;
+#endif
+
+	afinfo = pde_data(file_inode(seq->file));
+	return afinfo->udp_table ? : net->ipv4.udp_table;
+}
+
+static struct sock *udp_get_first(struct seq_file *seq, int start)
+{
+	struct udp_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct udp_table *udptable;
+	struct sock *sk;
+	/* Scan from bucket @start; on a match, return with that bucket locked. */
+	udptable = udp_get_table_seq(seq, net);
+
+	for (state->bucket = start; state->bucket <= udptable->mask;
+	     ++state->bucket) {
+		struct udp_hslot *hslot = &udptable->hash[state->bucket];
+
+		if (hlist_empty(&hslot->head))
+			continue;
+
+		spin_lock_bh(&hslot->lock);
+		sk_for_each(sk, &hslot->head) {
+			if (seq_sk_match(seq, sk))
+				goto found;
+		}
+		spin_unlock_bh(&hslot->lock);
+	}
+	sk = NULL;
+found:
+	return sk;
+}
+
+static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
+{
+	struct udp_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct udp_table *udptable;
+	/* Advance within the current bucket, else unlock it and scan onwards. */
+	do {
+		sk = sk_next(sk);
+	} while (sk && !seq_sk_match(seq, sk));
+
+	if (!sk) {
+		udptable = udp_get_table_seq(seq, net);
+
+		if (state->bucket <= udptable->mask)
+			spin_unlock_bh(&udptable->hash[state->bucket].lock);
+
+		return udp_get_first(seq, state->bucket + 1);
+	}
+	return sk;
+}
+
+static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct sock *cur;
+
+	/* Step pos entries past the first socket; NULL if the table ends. */
+	for (cur = udp_get_first(seq, 0); cur && pos; pos--)
+		cur = udp_get_next(seq, cur);
+	return pos ? NULL : cur;
+}
+
+void *udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct udp_iter_state *state = seq->private;
+	state->bucket = MAX_UDP_PORTS;	/* no bucket locked yet (presumably > mask) */
+
+	return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
+}
+EXPORT_SYMBOL(udp_seq_start);
+
+void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock *next_sk;
+
+	/* The start token is followed by the first socket of the table. */
+	next_sk = (v == SEQ_START_TOKEN) ? udp_get_idx(seq, 0)
+					 : udp_get_next(seq, v);
+
+	*pos += 1;
+
+	return next_sk;
+}
+EXPORT_SYMBOL(udp_seq_next);
+
+void udp_seq_stop(struct seq_file *seq, void *v)
+{
+	struct udp_iter_state *state = seq->private;
+	struct udp_table *udptable;
+
+	udptable = udp_get_table_seq(seq, seq_file_net(seq));
+	/* Drop the bucket lock left held by udp_get_first(), if any. */
+	if (state->bucket <= udptable->mask)
+		spin_unlock_bh(&udptable->hash[state->bucket].lock);
+}
+EXPORT_SYMBOL(udp_seq_stop);
+
+/* ------------------------------------------------------------------------ */
+static void udp4_format_sock(struct sock *sp, struct seq_file *f,
+		int bucket)
+{
+	struct inet_sock *inet = inet_sk(sp);
+	__be32 dest = inet->inet_daddr;
+	__be32 src  = inet->inet_rcv_saddr;
+	__u16 destp	  = ntohs(inet->inet_dport);
+	__u16 srcp	  = ntohs(inet->inet_sport);
+	/* Emit one row of the "udp" proc file for socket sp. */
+	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u",
+		bucket, src, srcp, dest, destp, sp->sk_state,
+		sk_wmem_alloc_get(sp),
+		udp_rqueue_get(sp),
+		0, 0L, 0,	/* tr, tm->when, retrnsmt columns: constant zero */
+		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+		0, sock_i_ino(sp),	/* timeout column is constant zero */
+		refcount_read(&sp->sk_refcnt), sp,
+		atomic_read(&sp->sk_drops));
+}
+
+int udp4_seq_show(struct seq_file *seq, void *v)
+{
+	struct udp_iter_state *state = seq->private;
+
+	/* Every record, header included, is padded to a fixed width. */
+	seq_setwidth(seq, 127);
+	if (v != SEQ_START_TOKEN)
+		udp4_format_sock(v, seq, state->bucket);
+	else
+		seq_puts(seq, "   sl  local_address rem_address   st tx_queue "
+			   "rx_queue tr tm->when retrnsmt   uid  timeout "
+			   "inode ref pointer drops");
+	seq_pad(seq, '\n');
+	return 0;
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+struct bpf_iter__udp {	/* context filled in by udp_prog_seq_show() */
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct udp_sock *, udp_sk);	/* NULL on the final call from stop() */
+	uid_t uid __aligned(8);	/* socket owner uid, munged to the reader's user ns */
+	int bucket __aligned(8);	/* hash bucket the socket was found in */
+};
+
+struct bpf_udp_iter_state {
+	struct udp_iter_state state;
+	unsigned int cur_sk;	/* index in batch[] of the socket being shown */
+	unsigned int end_sk;	/* number of sockets currently held in batch[] */
+	unsigned int max_sk;	/* capacity of batch[] */
+	int offset;	/* matching sockets of the bucket already iterated */
+	struct sock **batch;	/* sockets held with sock_hold() */
+	bool st_bucket_done;	/* batch[] captured the whole bucket */
+};
+
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+				      unsigned int new_batch_sz);
+static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct udp_iter_state *state = &iter->state;
+	struct net *net = seq_file_net(seq);
+	int resume_bucket, resume_offset;
+	struct udp_table *udptable;
+	unsigned int batch_sks = 0;
+	bool resized = false;
+	struct sock *sk;
+	/* Remember where the previous batch stopped so that part is skipped. */
+	resume_bucket = state->bucket;
+	resume_offset = iter->offset;
+
+	/* The current batch is done, so advance the bucket. */
+	if (iter->st_bucket_done)
+		state->bucket++;
+
+	udptable = udp_get_table_seq(seq, net);
+
+again:
+	/* New batch for the next bucket.
+	 * Iterate over the hash table to find a bucket with sockets matching
+	 * the iterator attributes, and return the first matching socket from
+	 * the bucket. The remaining matched sockets from the bucket are batched
+	 * before releasing the bucket lock. This allows BPF programs that are
+	 * called in seq_show to acquire the bucket lock if needed.
+	 */
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+	iter->st_bucket_done = false;
+	batch_sks = 0;
+
+	for (; state->bucket <= udptable->mask; state->bucket++) {
+		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
+
+		if (hlist_empty(&hslot2->head))
+			continue;
+
+		iter->offset = 0;
+		spin_lock_bh(&hslot2->lock);
+		udp_portaddr_for_each_entry(sk, &hslot2->head) {
+			if (seq_sk_match(seq, sk)) {
+				/* Resume from the last iterated socket at the
+				 * offset in the bucket before iterator was stopped.
+				 */
+				if (state->bucket == resume_bucket &&
+				    iter->offset < resume_offset) {
+					++iter->offset;
+					continue;
+				}
+				if (iter->end_sk < iter->max_sk) {
+					sock_hold(sk);
+					iter->batch[iter->end_sk++] = sk;
+				}
+				batch_sks++;	/* counts matches even when batch[] is full */
+			}
+		}
+		spin_unlock_bh(&hslot2->lock);
+
+		if (iter->end_sk)
+			break;
+	}
+
+	/* All done: no batch made. */
+	if (!iter->end_sk)
+		return NULL;
+
+	if (iter->end_sk == batch_sks) {
+		/* Batching is done for the current bucket; return the first
+		 * socket to be iterated from the batch.
+		 */
+		iter->st_bucket_done = true;
+		goto done;
+	}
+	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
+		resized = true;
+		/* After allocating a larger batch, retry one more time to grab
+		 * the whole bucket.
+		 */
+		goto again;
+	}
+done:
+	return iter->batch[0];
+}
+
+static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct sock *sk;
+
+	/* Whenever seq_next() is called, the iter->cur_sk is
+	 * done with seq_show(), so unref the iter->cur_sk.
+	 */
+	if (iter->cur_sk < iter->end_sk) {
+		sock_put(iter->batch[iter->cur_sk++]);
+		++iter->offset;	/* one more socket of this bucket consumed */
+	}
+
+	/* After updating iter->cur_sk, check if there are more sockets
+	 * available in the current bucket batch.
+	 */
+	if (iter->cur_sk < iter->end_sk)
+		sk = iter->batch[iter->cur_sk];
+	else
+		/* Prepare a new batch. */
+		sk = bpf_iter_udp_batch(seq);
+
+	++*pos;
+	return sk;
+}
+
+static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* A bpf iterator has no lseek support, so a non-zero position
+	 * always means: carry on from where it was stop()-ped.
+	 */
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	return bpf_iter_udp_batch(seq);
+}
+
+static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
+			     struct udp_sock *udp_sk, uid_t uid, int bucket)
+{
+	struct bpf_iter__udp ctx;
+	/* Fill the iterator context and run prog on it. */
+	meta->seq_num--;  /* skip SEQ_START_TOKEN */
+	ctx.meta = meta;
+	ctx.udp_sk = udp_sk;
+	ctx.uid = uid;
+	ctx.bucket = bucket;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
+{
+	struct udp_iter_state *state = seq->private;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	struct sock *sk = v;
+	uid_t uid;
+	int ret;
+	/* Run the BPF program on one socket while holding the socket lock. */
+	if (v == SEQ_START_TOKEN)
+		return 0;
+
+	lock_sock(sk);
+
+	if (unlikely(sk_unhashed(sk))) {
+		ret = SEQ_SKIP;
+		goto unlock;
+	}
+
+	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, false);
+	ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+
+unlock:
+	release_sock(sk);
+	return ret;
+}
+
+static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
+{
+	for (; iter->cur_sk < iter->end_sk; iter->cur_sk++)
+		sock_put(iter->batch[iter->cur_sk]);
+}
+
+static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_udp_iter_state *iter = seq->private;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	/* A NULL v gets one last program run with a NULL socket (in_stop). */
+	if (!v) {
+		meta.seq = seq;
+		prog = bpf_iter_get_info(&meta, true);
+		if (prog)
+			(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
+	}
+
+	if (iter->cur_sk < iter->end_sk) {
+		bpf_iter_udp_put_batch(iter);
+		iter->st_bucket_done = false;	/* next batch must not skip this bucket */
+	}
+}
+
+static const struct seq_operations bpf_iter_udp_seq_ops = {	/* also used as the "is BPF iterator" tag */
+	.start		= bpf_iter_udp_seq_start,
+	.next		= bpf_iter_udp_seq_next,
+	.stop		= bpf_iter_udp_seq_stop,
+	.show		= bpf_iter_udp_seq_show,
+};
+#endif
+
+static unsigned short seq_file_family(const struct seq_file *seq)
+{	/* Address family tied to @seq: AF_UNSPEC for bpf, else the procfs afinfo's. */
+	const struct udp_seq_afinfo *afinfo;
+
+#ifdef CONFIG_BPF_SYSCALL
+	/* BPF iterator: no family is imposed, the bpf program does the filtering. */
+	if (seq->op == &bpf_iter_udp_seq_ops)
+		return AF_UNSPEC;
+#endif
+
+	/* Proc fs iterator: the family comes from the afinfo stored with the entry. */
+	afinfo = pde_data(file_inode(seq->file));
+	return afinfo->family;
+}
+
+const struct seq_operations udp_seq_ops = {	/* seq_file callbacks used for the "udp" proc entry */
+	.start		= udp_seq_start,
+	.next		= udp_seq_next,
+	.stop		= udp_seq_stop,
+	.show		= udp4_seq_show,
+};
+EXPORT_SYMBOL(udp_seq_ops);
+
+static struct udp_seq_afinfo udp4_seq_afinfo = {	/* private data of the "udp" proc entry */
+	.family		= AF_INET,
+	.udp_table	= NULL,	/* NOTE(review): presumably "use the netns table" - confirm */
+};
+
+static int __net_init udp4_proc_init_net(struct net *net)
+{
+	/* Create the per-netns "udp" proc entry; a NULL result is -ENOMEM. */
+	return proc_create_net_data("udp", 0444, net->proc_net, &udp_seq_ops,
+				    sizeof(struct udp_iter_state),
+				    &udp4_seq_afinfo) ? 0 : -ENOMEM;
+}
+
+static void __net_exit udp4_proc_exit_net(struct net *net)
+{	/* Remove the "udp" proc entry that udp4_proc_init_net() created. */
+	remove_proc_entry("udp", net->proc_net);
+}
+
+static struct pernet_operations udp4_net_ops = {	/* per-netns proc entry setup/teardown */
+	.init = udp4_proc_init_net,
+	.exit = udp4_proc_exit_net,
+};
+
+int __init udp4_proc_init(void)
+{	/* Register udp4_net_ops; returns register_pernet_subsys()'s result. */
+	return register_pernet_subsys(&udp4_net_ops);
+}
+
+void udp4_proc_exit(void)
+{	/* Counterpart of udp4_proc_init(): unregister udp4_net_ops. */
+	unregister_pernet_subsys(&udp4_net_ops);
+}
+#endif /* CONFIG_PROC_FS */
+
+static __initdata unsigned long uhash_entries;
+static int __init set_uhash_entries(char *str)
+{
+	/* Handler for the "uhash_entries=" boot parameter.  Yields 0 when
+	 * the value is absent or kstrtoul() rejects it, and 1 once the
+	 * number has been stored in uhash_entries (base is auto-detected,
+	 * since kstrtoul() is called with base 0).
+	 */
+	if (!str || kstrtoul(str, 0, &uhash_entries))
+		return 0;
+
+	/* Zero is kept as is; other values below the minimum are raised. */
+	if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
+		uhash_entries = UDP_HTABLE_SIZE_MIN;
+	return 1;
+}
+__setup("uhash_entries=", set_uhash_entries);
+
+void __init udp_table_init(struct udp_table *table, const char *name)
+{	/* Allocate @table's two hash arrays in one block and initialise every slot. */
+	unsigned int i;
+
+	table->hash = alloc_large_system_hash(name,
+					      2 * sizeof(struct udp_hslot),
+					      uhash_entries,
+					      21, /* one slot per 2 MB */
+					      0,
+					      &table->log,
+					      &table->mask,
+					      UDP_HTABLE_SIZE_MIN,
+					      UDP_HTABLE_SIZE_MAX);
+
+	table->hash2 = table->hash + (table->mask + 1);	/* hash2 follows the mask + 1 hash slots */
+	for (i = 0; i <= table->mask; i++) {	/* primary table */
+		INIT_HLIST_HEAD(&table->hash[i].head);
+		table->hash[i].count = 0;
+		spin_lock_init(&table->hash[i].lock);
+	}
+	for (i = 0; i <= table->mask; i++) {	/* secondary table */
+		INIT_HLIST_HEAD(&table->hash2[i].head);
+		table->hash2[i].count = 0;
+		spin_lock_init(&table->hash2[i].lock);
+	}
+}
+
+u32 udp_flow_hashrnd(void)
+{
+	static u32 flow_seed __read_mostly;
+
+	/* net_get_random_once() is expected to fill the seed only once. */
+	net_get_random_once(&flow_seed, sizeof(flow_seed));
+	return flow_seed;
+}
+EXPORT_SYMBOL(udp_flow_hashrnd);
+
+static void __net_init udp_sysctl_init(struct net *net)
+{	/* Give a new netns its default UDP sysctl values. */
+	net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
+	net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+	net->ipv4.sysctl_udp_l3mdev_accept = 0;	/* starts as 0 */
+#endif
+}
+
+static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries)
+{	/* Build a private table of @hash_entries slots; NULL if an allocation fails. */
+	struct udp_table *udptable;
+	int i;
+
+	udptable = kmalloc(sizeof(*udptable), GFP_KERNEL);
+	if (!udptable)
+		goto out;
+
+	udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot),
+				      GFP_KERNEL_ACCOUNT);
+	if (!udptable->hash)
+		goto free_table;
+
+	udptable->hash2 = udptable->hash + hash_entries;	/* second half of the same buffer */
+	udptable->mask = hash_entries - 1;
+	udptable->log = ilog2(hash_entries);
+
+	for (i = 0; i < hash_entries; i++) {	/* initialise both tables in one pass */
+		INIT_HLIST_HEAD(&udptable->hash[i].head);
+		udptable->hash[i].count = 0;
+		spin_lock_init(&udptable->hash[i].lock);
+
+		INIT_HLIST_HEAD(&udptable->hash2[i].head);
+		udptable->hash2[i].count = 0;
+		spin_lock_init(&udptable->hash2[i].lock);
+	}
+
+	return udptable;
+
+free_table:
+	kfree(udptable);	/* slot buffer allocation failed: drop the descriptor */
+out:
+	return NULL;
+}
+
+static void __net_exit udp_pernet_table_free(struct net *net)
+{
+	struct udp_table *tbl = net->ipv4.udp_table;
+
+	/* The shared global table is left alone; a private one is freed. */
+	if (tbl != &udp_table) {
+		kvfree(tbl->hash);
+		kfree(tbl);
+	}
+}
+
+static void __net_init udp_set_table(struct net *net)
+{	/* Pick the UDP hash table for @net: a private one or the global udp_table. */
+	struct udp_table *udptable;
+	unsigned int hash_entries;
+	struct net *old_net;
+
+	if (net_eq(net, &init_net))	/* init_net always gets the global table */
+		goto fallback;
+
+	old_net = current->nsproxy->net_ns;	/* netns of the current task */
+	hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries);
+	if (!hash_entries)	/* zero: no private table wanted */
+		goto fallback;
+
+	/* Set min to keep the bitmap on stack in udp_lib_get_port() */
+	if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET)
+		hash_entries = UDP_HTABLE_SIZE_MIN_PERNET;
+	else
+		hash_entries = roundup_pow_of_two(hash_entries);
+
+	udptable = udp_pernet_table_alloc(hash_entries);
+	if (udptable) {
+		net->ipv4.udp_table = udptable;
+	} else {
+		pr_warn("Failed to allocate UDP hash table (entries: %u) "
+			"for a netns, fallback to the global one\n",
+			hash_entries);
+fallback:	/* also reached by the gotos above, which skip the warning */
+		net->ipv4.udp_table = &udp_table;
+	}
+}
+
+static int __net_init udp_pernet_init(struct net *net)
+{	/* Per-netns init: sysctl defaults first, then the hash table choice. */
+	udp_sysctl_init(net);
+	udp_set_table(net);
+
+	return 0;	/* neither step reports an error */
+}
+
+static void __net_exit udp_pernet_exit(struct net *net)
+{	/* Per-netns teardown: release the table unless it is the global one. */
+	udp_pernet_table_free(net);
+}
+
+static struct pernet_operations __net_initdata udp_sysctl_ops = {	/* registered by udp_init() */
+	.init	= udp_pernet_init,
+	.exit	= udp_pernet_exit,
+};
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
+		     struct udp_sock *udp_sk, uid_t uid, int bucket)	/* same fields as bpf_iter__udp */
+
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+				      unsigned int new_batch_sz)
+{	/* Swap in a batch array of @new_batch_sz pointers; -ENOMEM leaves @iter untouched. */
+	struct sock **new_batch;
+
+	new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
+				   GFP_USER | __GFP_NOWARN);
+	if (!new_batch)
+		return -ENOMEM;
+
+	bpf_iter_udp_put_batch(iter);	/* release sockets still held by the old batch */
+	kvfree(iter->batch);
+	iter->batch = new_batch;
+	iter->max_sk = new_batch_sz;
+
+	return 0;
+}
+
+#define INIT_BATCH_SZ 16
+
+static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+	int err = bpf_iter_init_seq_net(priv_data, aux);
+
+	if (err)
+		return err;
+
+	/* Start with a small batch array; if even that cannot be allocated,
+	 * unwind the seq_net setup done just above.
+	 */
+	err = bpf_iter_udp_realloc_batch(priv_data, INIT_BATCH_SZ);
+	if (err)
+		bpf_iter_fini_seq_net(priv_data);
+	return err;
+}
+
+static void bpf_iter_fini_udp(void *priv_data)
+{	/* Undo bpf_iter_init_udp(): seq_net teardown, then free the batch array. */
+	struct bpf_udp_iter_state *iter = priv_data;
+
+	bpf_iter_fini_seq_net(priv_data);
+	kvfree(iter->batch);
+}
+
+static const struct bpf_iter_seq_info udp_seq_info = {	/* seq_file glue for the bpf udp iterator */
+	.seq_ops		= &bpf_iter_udp_seq_ops,
+	.init_seq_private	= bpf_iter_init_udp,
+	.fini_seq_private	= bpf_iter_fini_udp,
+	.seq_priv_size		= sizeof(struct bpf_udp_iter_state),
+};
+
+static struct bpf_iter_reg udp_reg_info = {	/* registration record of the "udp" iterator target */
+	.target			= "udp",
+	.ctx_arg_info_size	= 1,
+	.ctx_arg_info		= {	/* .btf_id is filled in by bpf_iter_register() */
+		{ offsetof(struct bpf_iter__udp, udp_sk),
+		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+	},
+	.seq_info		= &udp_seq_info,
+};
+
+static void __init bpf_iter_register(void)
+{	/* Set the UDP socket BTF id, then register the target; failure only warns. */
+	udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP];
+	if (bpf_iter_reg_target(&udp_reg_info))
+		pr_warn("Warning: could not register bpf iterator udp\n");
+}
+#endif
+
+void __init udp_init(void)
+{	/* Boot-time setup: global table, memory limits, busylocks, pernet ops, bpf iterator. */
+	unsigned long limit;
+	unsigned int i;
+
+	udp_table_init(&udp_table, "UDP");
+	limit = nr_free_buffer_pages() / 8;
+	limit = max(limit, 128UL);	/* never below 128 */
+	sysctl_udp_mem[0] = limit / 4 * 3;	/* about 3/4 of limit */
+	sysctl_udp_mem[1] = limit;
+	sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;	/* twice entry 0 */
+
+	/* 16 spinlocks per cpu */
+	udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
+	udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
+				GFP_KERNEL);
+	if (!udp_busylocks)
+		panic("UDP: failed to alloc udp_busylocks\n");
+	for (i = 0; i < (1U << udp_busylocks_log); i++)
+		spin_lock_init(udp_busylocks + i);
+
+	if (register_pernet_subsys(&udp_sysctl_ops))	/* a failure here is fatal */
+		panic("UDP: failed to init sysctl parameters.\n");
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+	bpf_iter_register();
+#endif
+}
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
new file mode 100644
index 0000000000..0735d820e4
--- /dev/null
+++ b/net/ipv4/udp_bpf.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Cloudflare Ltd https://cloudflare.com */
+
+#include <linux/skmsg.h>
+#include <net/sock.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+
+#include "udp_impl.h"
+
+static struct proto *udpv6_prot_saved __read_mostly;	/* set by udp_bpf_check_v6_needs_rebuild() */
+
+static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+			  int flags, int *addr_len)
+{	/* Call the plain (non-bpf) UDP recvmsg that matches the socket's family. */
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		return udpv6_prot_saved->recvmsg(sk, msg, len, flags, addr_len);
+#endif
+	return udp_prot.recvmsg(sk, msg, len, flags, addr_len);
+}
+
+static bool udp_sk_has_data(struct sock *sk)
+{	/* True unless the reader queue and the receive queue are both empty. */
+	return !(skb_queue_empty(&udp_sk(sk)->reader_queue) &&
+		 skb_queue_empty(&sk->sk_receive_queue));
+}
+
+static bool psock_has_data(struct sk_psock *psock)
+{	/* True unless ingress_skb and the psock message queue are both empty. */
+	return !(skb_queue_empty(&psock->ingress_skb) &&
+		 sk_psock_queue_empty(psock));
+}
+
+#define udp_msg_has_data(__sk, __psock)	/* data on the socket or on the psock? */ \
+		({ udp_sk_has_data(__sk) || psock_has_data(__psock); })
+
+static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
+			     long timeo)
+{	/* Wait up to @timeo for data on @sk or @psock; non-zero tells the caller to look. */
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int ret = 0;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)	/* receive side shut down: do not wait */
+		return 1;
+
+	if (!timeo)	/* zero timeout: return 0 without waiting */
+		return ret;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	ret = udp_msg_has_data(sk, psock);	/* checked only after joining the wait queue */
+	if (!ret) {
+		wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+		ret = udp_msg_has_data(sk, psock);	/* re-check once the wait returns */
+	}
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+
+static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+			   int flags, int *addr_len)
+{	/* recvmsg for psock-attached UDP sockets: psock data first, else the plain path. */
+	struct sk_psock *psock;
+	int copied, ret;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return inet_recv_error(sk, msg, len, addr_len);
+
+	if (!len)
+		return 0;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))	/* no psock obtained: behave like an ordinary socket */
+		return sk_udp_recvmsg(sk, msg, len, flags, addr_len);
+
+	if (!psock_has_data(psock)) {	/* nothing queued on the psock side */
+		ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len);
+		goto out;
+	}
+
+msg_bytes_ready:
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	if (!copied) {	/* nothing copied: wait for data, honouring MSG_DONTWAIT */
+		long timeo;
+		int data;
+
+		timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+		data = udp_msg_wait_data(sk, psock, timeo);
+		if (data) {
+			if (psock_has_data(psock))	/* retry the psock queues */
+				goto msg_bytes_ready;
+			ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len);
+			goto out;
+		}
+		copied = -EAGAIN;	/* the wait reported no data */
+	}
+	ret = copied;
+out:
+	sk_psock_put(sk, psock);	/* pairs with sk_psock_get() above */
+	return ret;
+}
+
+enum {	/* indices into udp_bpf_prots[] */
+	UDP_BPF_IPV4,
+	UDP_BPF_IPV6,
+	UDP_BPF_NUM_PROTS,
+};
+
+static DEFINE_SPINLOCK(udpv6_prot_lock);	/* taken while rebuilding the IPv6 entry */
+static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];	/* filled by udp_bpf_rebuild_protos() */
+
+static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+{	/* Copy @base into @prot, then override close, recvmsg and sock_is_readable. */
+	*prot        = *base;
+	prot->close  = sock_map_close;
+	prot->recvmsg = udp_bpf_recvmsg;
+	prot->sock_is_readable = sk_msg_is_readable;
+}
+
+static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
+{	/* Rebuild the IPv6 entry when @ops is not the proto it was last built from. */
+	if (unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) {	/* lockless first look */
+		spin_lock_bh(&udpv6_prot_lock);
+		if (likely(ops != udpv6_prot_saved)) {	/* checked again now that the lock is held */
+			udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops);
+			smp_store_release(&udpv6_prot_saved, ops);	/* store follows the rebuild */
+		}
+		spin_unlock_bh(&udpv6_prot_lock);
+	}
+}
+
+static int __init udp_bpf_v4_build_proto(void)
+{	/* Build the IPv4 entry of udp_bpf_prots[] from udp_prot; always returns 0. */
+	udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV4], &udp_prot);
+	return 0;
+}
+late_initcall(udp_bpf_v4_build_proto);
+
+int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{	/* Install the bpf proto on @sk, or put the saved one back if @restore; returns 0. */
+	int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6;
+
+	if (restore) {	/* undo: saved write_space callback and original proto */
+		sk->sk_write_space = psock->saved_write_space;
+		sock_replace_proto(sk, psock->sk_proto);
+		return 0;
+	}
+
+	if (sk->sk_family == AF_INET6)	/* make sure the IPv6 entry matches this proto */
+		udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
+
+	sock_replace_proto(sk, &udp_bpf_prots[family]);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(udp_bpf_update_proto);
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
new file mode 100644
index 0000000000..de3f2d31f5
--- /dev/null
+++ b/net/ipv4/udp_diag.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * udp_diag.c	Module for monitoring UDP transport protocols sockets.
+ *
+ * Authors:	Pavel Emelyanov, <xemul@parallels.com>
+ */
+
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+#include <linux/udp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <linux/sock_diag.h>
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+			struct netlink_callback *cb,
+			const struct inet_diag_req_v2 *req,
+			struct nlattr *bc, bool net_admin)
+{	/* Append @sk to @skb as a multi-part record if it passes the @bc filter. */
+	if (!inet_diag_bc_sk(bc, sk))	/* filtered out: nothing written, not an error */
+		return 0;
+
+	return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI,
+				 net_admin);
+}
+
+static int udp_dump_one(struct udp_table *tbl,
+			struct netlink_callback *cb,
+			const struct inet_diag_req_v2 *req)
+{	/* Look up the one socket described by @req in @tbl and unicast its record back. */
+	struct sk_buff *in_skb = cb->skb;
+	int err;
+	struct sock *sk = NULL;
+	struct sk_buff *rep;
+	struct net *net = sock_net(in_skb->sk);
+
+	rcu_read_lock();
+	if (req->sdiag_family == AF_INET)
+		/* src and dst are swapped for historical reasons */
+		sk = __udp4_lib_lookup(net,
+				req->id.idiag_src[0], req->id.idiag_sport,
+				req->id.idiag_dst[0], req->id.idiag_dport,
+				req->id.idiag_if, 0, tbl, NULL);
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (req->sdiag_family == AF_INET6)
+		sk = __udp6_lib_lookup(net,
+				(struct in6_addr *)req->id.idiag_src,
+				req->id.idiag_sport,
+				(struct in6_addr *)req->id.idiag_dst,
+				req->id.idiag_dport,
+				req->id.idiag_if, 0, tbl, NULL);
+#endif
+	if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))	/* refcount already zero: treat as not found */
+		sk = NULL;
+	rcu_read_unlock();
+	err = -ENOENT;
+	if (!sk)
+		goto out_nosk;
+
+	err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+	if (err)
+		goto out;
+
+	err = -ENOMEM;
+	rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
+			inet_diag_msg_attrs_size() +
+			nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
+			GFP_KERNEL);
+	if (!rep)
+		goto out;
+
+	err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0,
+				netlink_net_capable(in_skb, CAP_NET_ADMIN));
+	if (err < 0) {
+		WARN_ON(err == -EMSGSIZE);	/* the reply was sized above to hold the record */
+		kfree_skb(rep);
+		goto out;
+	}
+	err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
+out:
+	if (sk)
+		sock_put(sk);	/* drop the reference taken after the lookup */
+out_nosk:
+	return err;
+}
+
+static void udp_dump(struct udp_table *table, struct sk_buff *skb,
+		     struct netlink_callback *cb,
+		     const struct inet_diag_req_v2 *r)
+{	/* Walk @table and dump matching sockets into @skb, resumable via cb->args[]. */
+	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+	struct net *net = sock_net(skb->sk);
+	struct inet_diag_dump_data *cb_data;
+	int num, s_num, slot, s_slot;
+	struct nlattr *bc;
+
+	cb_data = cb->data;
+	bc = cb_data->inet_diag_nla_bc;
+	s_slot = cb->args[0];	/* slot to resume from */
+	num = s_num = cb->args[1];	/* entry index to resume from within that slot */
+
+	for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) {
+		struct udp_hslot *hslot = &table->hash[slot];
+		struct sock *sk;
+
+		num = 0;
+
+		if (hlist_empty(&hslot->head))	/* skip empty slots without locking */
+			continue;
+
+		spin_lock_bh(&hslot->lock);
+		sk_for_each(sk, &hslot->head) {
+			struct inet_sock *inet = inet_sk(sk);
+
+			if (!net_eq(sock_net(sk), net))	/* other netns: not counted in num */
+				continue;
+			if (num < s_num)	/* before the resume point */
+				goto next;
+			if (!(r->idiag_states & (1 << sk->sk_state)))
+				goto next;
+			if (r->sdiag_family != AF_UNSPEC &&
+					sk->sk_family != r->sdiag_family)
+				goto next;
+			if (r->id.idiag_sport != inet->inet_sport &&
+			    r->id.idiag_sport)
+				goto next;
+			if (r->id.idiag_dport != inet->inet_dport &&
+			    r->id.idiag_dport)
+				goto next;
+
+			if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
+				spin_unlock_bh(&hslot->lock);
+				goto done;	/* stop here; slot and num record this entry */
+			}
+next:
+			num++;
+		}
+		spin_unlock_bh(&hslot->lock);
+	}
+done:
+	cb->args[0] = slot;	/* save the position for the next call */
+	cb->args[1] = num;
+}
+
+static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			  const struct inet_diag_req_v2 *r)
+{	/* UDP ->dump: walk the udp_table of the requesting socket's netns. */
+	udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r);
+}
+
+static int udp_diag_dump_one(struct netlink_callback *cb,
+			     const struct inet_diag_req_v2 *req)
+{	/* UDP ->dump_one: look up in the udp_table of the requesting socket's netns. */
+	return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req);
+}
+
+static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+		void *info)
+{	/* Fill the queue fields of @r from @sk; @info is not used. */
+	r->idiag_rqueue = udp_rqueue_get(sk);
+	r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int __udp_diag_destroy(struct sk_buff *in_skb,
+			      const struct inet_diag_req_v2 *req,
+			      struct udp_table *tbl)
+{	/* Find the socket described by @req in @tbl and destroy it with ECONNABORTED. */
+	struct net *net = sock_net(in_skb->sk);
+	struct sock *sk;
+	int err;
+
+	rcu_read_lock();
+
+	if (req->sdiag_family == AF_INET)
+		sk = __udp4_lib_lookup(net,
+				req->id.idiag_dst[0], req->id.idiag_dport,
+				req->id.idiag_src[0], req->id.idiag_sport,
+				req->id.idiag_if, 0, tbl, NULL);
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (req->sdiag_family == AF_INET6) {
+		if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+		    ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+			sk = __udp4_lib_lookup(net,	/* both v4-mapped: IPv4 lookup on word 3 */
+					req->id.idiag_dst[3], req->id.idiag_dport,
+					req->id.idiag_src[3], req->id.idiag_sport,
+					req->id.idiag_if, 0, tbl, NULL);
+
+		else
+			sk = __udp6_lib_lookup(net,
+					(struct in6_addr *)req->id.idiag_dst,
+					req->id.idiag_dport,
+					(struct in6_addr *)req->id.idiag_src,
+					req->id.idiag_sport,
+					req->id.idiag_if, 0, tbl, NULL);
+	}
+#endif
+	else {	/* any other family is rejected */
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))	/* refcount already zero: treat as not found */
+		sk = NULL;
+
+	rcu_read_unlock();
+
+	if (!sk)
+		return -ENOENT;
+
+	if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {	/* cookie mismatch also reads as ENOENT */
+		sock_put(sk);
+		return -ENOENT;
+	}
+
+	err = sock_diag_destroy(sk, ECONNABORTED);
+
+	sock_put(sk);	/* drop the reference taken after the lookup */
+
+	return err;
+}
+
+static int udp_diag_destroy(struct sk_buff *in_skb,
+			    const struct inet_diag_req_v2 *req)
+{	/* UDP ->destroy: operate on the udp_table of the requesting socket's netns. */
+	return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table);
+}
+
+static int udplite_diag_destroy(struct sk_buff *in_skb,
+				const struct inet_diag_req_v2 *req)
+{	/* UDP-Lite ->destroy: same as UDP, but on the global udplite_table. */
+	return __udp_diag_destroy(in_skb, req, &udplite_table);
+}
+
+#endif
+
+static const struct inet_diag_handler udp_diag_handler = {	/* inet_diag hooks for IPPROTO_UDP */
+	.dump		 = udp_diag_dump,
+	.dump_one	 = udp_diag_dump_one,
+	.idiag_get_info  = udp_diag_get_info,
+	.idiag_type	 = IPPROTO_UDP,
+	.idiag_info_size = 0,
+#ifdef CONFIG_INET_DIAG_DESTROY
+	.destroy	 = udp_diag_destroy,
+#endif
+};
+
+static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			      const struct inet_diag_req_v2 *r)
+{	/* UDP-Lite ->dump: walk the global udplite_table. */
+	udp_dump(&udplite_table, skb, cb, r);
+}
+
+static int udplite_diag_dump_one(struct netlink_callback *cb,
+				 const struct inet_diag_req_v2 *req)
+{	/* UDP-Lite ->dump_one: look up in the global udplite_table. */
+	return udp_dump_one(&udplite_table, cb, req);
+}
+
+static const struct inet_diag_handler udplite_diag_handler = {	/* inet_diag hooks for IPPROTO_UDPLITE */
+	.dump		 = udplite_diag_dump,
+	.dump_one	 = udplite_diag_dump_one,
+	.idiag_get_info  = udp_diag_get_info,	/* shared with the UDP handler */
+	.idiag_type	 = IPPROTO_UDPLITE,
+	.idiag_info_size = 0,
+#ifdef CONFIG_INET_DIAG_DESTROY
+	.destroy	 = udplite_diag_destroy,
+#endif
+};
+
+static int __init udp_diag_init(void)
+{
+	int ret;
+
+	/* Register the UDP handler first; nothing to undo if that fails. */
+	ret = inet_diag_register(&udp_diag_handler);
+	if (ret)
+		return ret;
+
+	/* A UDP-Lite failure must roll back the UDP registration. */
+	ret = inet_diag_register(&udplite_diag_handler);
+	if (ret)
+		inet_diag_unregister(&udp_diag_handler);
+
+	return ret;
+}
+
+static void __exit udp_diag_exit(void)
+{	/* Unregister both handlers, in the reverse of the order udp_diag_init() used. */
+	inet_diag_unregister(&udplite_diag_handler);
+	inet_diag_unregister(&udp_diag_handler);
+}
+
+module_init(udp_diag_init);	/* registers the UDP and UDP-Lite diag handlers */
+module_exit(udp_diag_exit);	/* unregisters them again */
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
new file mode 100644
index 0000000000..e1ff3a3759
--- /dev/null
+++ b/net/ipv4/udp_impl.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _UDP4_IMPL_H	/* include guard for net/ipv4/udp_impl.h */
+#define _UDP4_IMPL_H
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/protocol.h>
+#include <net/inet_common.h>
+
+int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
+int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+
+int udp_v4_get_port(struct sock *sk, unsigned short snum);
+void udp_v4_rehash(struct sock *sk);
+
+int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+		   unsigned int optlen);
+int udp_getsockopt(struct sock *sk, int level, int optname,
+		   char __user *optval, int __user *optlen);
+
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+		int *addr_len);
+void udp_destroy_sock(struct sock *sk);
+
+#ifdef CONFIG_PROC_FS
+int udp4_seq_show(struct seq_file *seq, void *v);	/* the .show callback of udp_seq_ops */
+#endif
+#endif	/* _UDP4_IMPL_H */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
new file mode 100644
index 0000000000..6c95d28d0c
--- /dev/null
+++ b/net/ipv4/udp_offload.c
@@ -0,0 +1,751 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *	IPV4 GSO/GRO offload support
+ *	Linux INET implementation
+ *
+ *	UDPv4 GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <net/gro.h>
+#include <net/gso.h>
+#include <net/udp.h>
+#include <net/protocol.h>
+#include <net/inet_common.h>
+
+static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+	netdev_features_t features,
+	struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
+					     netdev_features_t features),
+	__be16 new_protocol, bool is_ipv6)
+{	/* Segment the inner packet via @gso_inner_segment, then redo each outer UDP header. */
+	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+	bool remcsum, need_csum, offload_csum, gso_partial;
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct udphdr *uh = udp_hdr(skb);
+	u16 mac_offset = skb->mac_header;
+	__be16 protocol = skb->protocol;
+	u16 mac_len = skb->mac_len;
+	int udp_offset, outer_hlen;
+	__wsum partial;
+	bool need_ipsec;
+
+	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+		goto out;
+
+	/* Adjust partial header checksum to negate old length.
+	 * We cannot rely on the value contained in uh->len as it is
+	 * possible that the actual value exceeds the boundaries of the
+	 * 16 bit length field due to the header being added outside of an
+	 * IP or IPv6 frame that was already limited to 64K - 1.
+	 */
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)
+		partial = (__force __wsum)uh->len;
+	else
+		partial = (__force __wsum)htonl(skb->len);
+	partial = csum_sub(csum_unfold(uh->check), partial);
+
+	/* setup inner skb. */
+	skb->encapsulation = 0;
+	SKB_GSO_CB(skb)->encap_level = 0;
+	__skb_pull(skb, tnl_hlen);
+	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, skb_inner_network_offset(skb));
+	skb_set_transport_header(skb, skb_inner_transport_offset(skb));
+	skb->mac_len = skb_inner_network_offset(skb);
+	skb->protocol = new_protocol;
+
+	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
+	skb->encap_hdr_csum = need_csum;
+
+	remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+	skb->remcsum_offload = remcsum;
+
+	need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
+	/* Try to offload checksum if possible */
+	offload_csum = !!(need_csum &&
+			  !need_ipsec &&
+			  (skb->dev->features &
+			   (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) :
+				      (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))));
+
+	features &= skb->dev->hw_enc_features;
+	if (need_csum)
+		features &= ~NETIF_F_SCTP_CRC;
+
+	/* The only checksum offload we care about from here on out is the
+	 * outer one so strip the existing checksum feature flags and
+	 * instead set the flag based on our outer checksum offload value.
+	 */
+	if (remcsum) {
+		features &= ~NETIF_F_CSUM_MASK;
+		if (!need_csum || offload_csum)
+			features |= NETIF_F_HW_CSUM;
+	}
+
+	/* segment inner packet. */
+	segs = gso_inner_segment(skb, features);
+	if (IS_ERR_OR_NULL(segs)) {	/* put back the header state saved on entry */
+		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+				     mac_len);
+		goto out;
+	}
+
+	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
+	outer_hlen = skb_tnl_header_len(skb);
+	udp_offset = outer_hlen - tnl_hlen;
+	skb = segs;
+	do {	/* one pass per resulting segment */
+		unsigned int len;
+
+		if (remcsum)
+			skb->ip_summed = CHECKSUM_NONE;
+
+		/* Set up inner headers if we are offloading inner checksum */
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			skb_reset_inner_headers(skb);
+			skb->encapsulation = 1;
+		}
+
+		skb->mac_len = mac_len;
+		skb->protocol = protocol;
+
+		__skb_push(skb, outer_hlen);	/* re-expose the outer headers */
+		skb_reset_mac_header(skb);
+		skb_set_network_header(skb, mac_len);
+		skb_set_transport_header(skb, udp_offset);
+		len = skb->len - udp_offset;
+		uh = udp_hdr(skb);
+
+		/* If we are only performing partial GSO the inner header
+		 * will be using a length value equal to only one MSS sized
+		 * segment instead of the entire frame.
+		 */
+		if (gso_partial && skb_is_gso(skb)) {
+			uh->len = htons(skb_shinfo(skb)->gso_size +
+					SKB_GSO_CB(skb)->data_offset +
+					skb->head - (unsigned char *)uh);
+		} else {
+			uh->len = htons(len);
+		}
+
+		if (!need_csum)	/* no outer checksum requested for this tunnel */
+			continue;
+
+		uh->check = ~csum_fold(csum_add(partial,
+				       (__force __wsum)htonl(len)));
+
+		if (skb->encapsulation || !offload_csum) {	/* finish the checksum in software */
+			uh->check = gso_make_checksum(skb, ~uh->check);
+			if (uh->check == 0)
+				uh->check = CSUM_MANGLED_0;
+		} else {	/* otherwise mark the skb CHECKSUM_PARTIAL with the UDP check offsets */
+			skb->ip_summed = CHECKSUM_PARTIAL;
+			skb->csum_start = skb_transport_header(skb) - skb->head;
+			skb->csum_offset = offsetof(struct udphdr, check);
+		}
+	} while ((skb = skb->next));
+out:
+	return segs;
+}
+
+struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
+				       netdev_features_t features,
+				       bool is_ipv6)
+{	/* Choose the inner segmentation routine from the inner protocol type, then segment. */
+	const struct net_offload __rcu **offloads;
+	__be16 protocol = skb->protocol;
+	const struct net_offload *ops;
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
+					     netdev_features_t features);
+
+	rcu_read_lock();
+
+	switch (skb->inner_protocol_type) {
+	case ENCAP_TYPE_ETHER:	/* inner Ethernet frame: segment at MAC level */
+		protocol = skb->inner_protocol;
+		gso_inner_segment = skb_mac_gso_segment;
+		break;
+	case ENCAP_TYPE_IPPROTO:	/* inner IP protocol: use its registered offload */
+		offloads = is_ipv6 ? inet6_offloads : inet_offloads;
+		ops = rcu_dereference(offloads[skb->inner_ipproto]);
+		if (!ops || !ops->callbacks.gso_segment)
+			goto out_unlock;
+		gso_inner_segment = ops->callbacks.gso_segment;
+		break;
+	default:	/* unknown encapsulation: segs stays ERR_PTR(-EINVAL) */
+		goto out_unlock;
+	}
+
+	segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment,
+					protocol, is_ipv6);
+
+out_unlock:
+	rcu_read_unlock();
+
+	return segs;
+}
+EXPORT_SYMBOL(skb_udp_tunnel_segment);
+
+static void __udpv4_gso_segment_csum(struct sk_buff *seg,
+				     __be32 *oldip, __be32 *newip,
+				     __be16 *oldport, __be16 *newport)
+{	/* Overwrite *oldip and *oldport with the new values and patch @seg's checksums. */
+	struct udphdr *uh;
+	struct iphdr *iph;
+
+	if (*oldip == *newip && *oldport == *newport)	/* nothing to rewrite */
+		return;
+
+	uh = udp_hdr(seg);
+	iph = ip_hdr(seg);
+
+	if (uh->check) {	/* a zero UDP checksum field is left untouched */
+		inet_proto_csum_replace4(&uh->check, seg, *oldip, *newip,
+					 true);
+		inet_proto_csum_replace2(&uh->check, seg, *oldport, *newport,
+					 false);
+		if (!uh->check)
+			uh->check = CSUM_MANGLED_0;
+	}
+	*oldport = *newport;
+
+	csum_replace4(&iph->check, *oldip, *newip);	/* IP header checksum follows the address */
+	*oldip = *newip;
+}
+
+static struct sk_buff *__udpv4_gso_segment_list_csum(struct sk_buff *segs)
+{	/* Give every later segment the first segment's addresses and ports; returns @segs. */
+	struct sk_buff *seg;
+	struct udphdr *uh, *uh2;
+	struct iphdr *iph, *iph2;
+
+	seg = segs;
+	uh = udp_hdr(seg);
+	iph = ip_hdr(seg);
+
+	/* First two segments already agree: return without touching the rest. */
+	if ((udp_hdr(seg)->dest == udp_hdr(seg->next)->dest) &&
+	    (udp_hdr(seg)->source == udp_hdr(seg->next)->source) &&
+	    (ip_hdr(seg)->daddr == ip_hdr(seg->next)->daddr) &&
+	    (ip_hdr(seg)->saddr == ip_hdr(seg->next)->saddr))
+		return segs;
+
+	while ((seg = seg->next)) {
+		uh2 = udp_hdr(seg);
+		iph2 = ip_hdr(seg);
+
+		__udpv4_gso_segment_csum(seg,
+					 &iph2->saddr, &iph->saddr,
+					 &uh2->source, &uh->source);
+		__udpv4_gso_segment_csum(seg,
+					 &iph2->daddr, &iph->daddr,
+					 &uh2->dest, &uh->dest);
+	}
+
+	return segs;
+}
+
+static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb,
+					      netdev_features_t features,
+					      bool is_ipv6)
+{	/* Split a frag_list GSO skb with skb_segment_list() and fix the first UDP length. */
+	unsigned int mss = skb_shinfo(skb)->gso_size;
+
+	skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+	if (IS_ERR(skb))
+		return skb;
+
+	udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss);
+
+	return is_ipv6 ? skb : __udpv4_gso_segment_list_csum(skb);	/* csum fixup is IPv4 only */
+}
+
+struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
+				  netdev_features_t features, bool is_ipv6)
+{	/* Split a UDP GSO skb into gso_size payload segments and fix each UDP header. */
+	struct sock *sk = gso_skb->sk;
+	unsigned int sum_truesize = 0;
+	struct sk_buff *segs, *seg;
+	struct udphdr *uh;
+	unsigned int mss;
+	bool copy_dtor;
+	__sum16 check;
+	__be16 newlen;
+
+	mss = skb_shinfo(gso_skb)->gso_size;
+	if (gso_skb->len <= sizeof(*uh) + mss)	/* must exceed one header plus one mss */
+		return ERR_PTR(-EINVAL);
+
+	if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) {
+		/* Packet is from an untrusted source, reset gso_segs. */
+		skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh),
+							     mss);
+		return NULL;
+	}
+
+	if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST)
+		return __udp_gso_segment_list(gso_skb, features, is_ipv6);
+
+	skb_pull(gso_skb, sizeof(*uh));	/* segment the payload only */
+
+	/* clear destructor to avoid skb_segment assigning it to tail */
+	copy_dtor = gso_skb->destructor == sock_wfree;
+	if (copy_dtor)
+		gso_skb->destructor = NULL;
+
+	segs = skb_segment(gso_skb, features);
+	if (IS_ERR_OR_NULL(segs)) {
+		if (copy_dtor)	/* restore the destructor cleared above */
+			gso_skb->destructor = sock_wfree;
+		return segs;
+	}
+
+	/* GSO partial and frag_list segmentation only requires splitting
+	 * the frame into an MSS multiple and possibly a remainder, both
+	 * cases return a GSO skb. So update the mss now.
+	 */
+	if (skb_is_gso(segs))
+		mss *= skb_shinfo(segs)->gso_segs;
+
+	seg = segs;
+	uh = udp_hdr(seg);
+
+	/* preserve TX timestamp flags and TS key for first segment */
+	skb_shinfo(seg)->tskey = skb_shinfo(gso_skb)->tskey;
+	skb_shinfo(seg)->tx_flags |=
+			(skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP);
+
+	/* compute checksum adjustment based on old length versus new */
+	newlen = htons(sizeof(*uh) + mss);
+	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
+	for (;;) {	/* every segment except the last is finished in this loop */
+		if (copy_dtor) {
+			seg->destructor = sock_wfree;
+			seg->sk = sk;
+			sum_truesize += seg->truesize;
+		}
+
+		if (!seg->next)	/* last segment: its length is handled after the loop */
+			break;
+
+		uh->len = newlen;
+		uh->check = check;
+
+		if (seg->ip_summed == CHECKSUM_PARTIAL)
+			gso_reset_checksum(seg, ~check);
+		else
+			uh->check = gso_make_checksum(seg, ~check) ? :
+				    CSUM_MANGLED_0;
+
+		seg = seg->next;
+		uh = udp_hdr(seg);
+	}
+
+	/* last packet can be partial gso_size, account for that in checksum */
+	newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) +
+		       seg->data_len);
+	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
+	uh->len = newlen;
+	uh->check = check;
+
+	if (seg->ip_summed == CHECKSUM_PARTIAL)
+		gso_reset_checksum(seg, ~check);
+	else
+		uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
+
+	/* update refcount for the packet */
+	if (copy_dtor) {
+		int delta = sum_truesize - gso_skb->truesize;
+
+		/* In some pathological cases, delta can be negative.
+		 * We need to either use refcount_add() or refcount_sub_and_test()
+		 */
+		if (likely(delta >= 0))
+			refcount_add(delta, &sk->sk_wmem_alloc);
+		else
+			WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc));
+	}
+	return segs;
+}
+EXPORT_SYMBOL_GPL(__udp_gso_segment);
+
+static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
+					 netdev_features_t features)
+{	/* Segment an IPv4 UDP GSO skb: tunnel, UDP_L4 or software UFO path. */
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	unsigned int mss;
+	__wsum csum;
+	struct udphdr *uh;
+	struct iphdr *iph;
+
+	if (skb->encapsulation &&
+	    (skb_shinfo(skb)->gso_type &
+	     (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
+		segs = skb_udp_tunnel_segment(skb, features, false);	/* false: not IPv6 */
+		goto out;
+	}
+
+	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
+		goto out;
+
+	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+		goto out;
+
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+		return __udp_gso_segment(skb, features, false);
+
+	mss = skb_shinfo(skb)->gso_size;
+	if (unlikely(skb->len <= mss))	/* fits in one segment: return the -EINVAL default */
+		goto out;
+
+	/* Do software UFO. Complete and fill in the UDP checksum as
+	 * HW cannot do checksum of UDP packets sent as multiple
+	 * IP fragments.
+	 */
+
+	uh = udp_hdr(skb);
+	iph = ip_hdr(skb);
+
+	uh->check = 0;
+	csum = skb_checksum(skb, 0, skb->len, 0);
+	uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum);
+	if (uh->check == 0)
+		uh->check = CSUM_MANGLED_0;
+
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	/* If there is no outer header we can fake a checksum offload
+	 * due to the fact that we have already done the checksum in
+	 * software prior to segmenting the frame.
+	 */
+	if (!skb->encap_hdr_csum)
+		features |= NETIF_F_HW_CSUM;
+
+	/* Fragment the skb. IP headers of the fragments are updated in
+	 * inet_gso_segment()
+	 */
+	segs = skb_segment(skb, features);
+out:
+	return segs;
+}
+
+static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{	/* Chain @skb onto @p's frag_list; -E2BIG if combined len >= 65536 */
+	if (unlikely(p->len + skb->len >= 65536))
+		return -E2BIG;
+
+	if (NAPI_GRO_CB(p)->last == p)
+		skb_shinfo(p)->frag_list = skb;
+	else
+		NAPI_GRO_CB(p)->last->next = skb;
+
+	skb_pull(skb, skb_gro_offset(skb));
+	NAPI_GRO_CB(p)->last = skb;
+	NAPI_GRO_CB(p)->count++;
+	p->data_len += skb->len;
+
+	/* sk ownership - if any - completely transferred to the aggregated packet */
+	skb->destructor = NULL;
+	skb->sk = NULL;	/* no destructor left, so drop the socket pointer too */
+	p->truesize += skb->truesize;
+	p->len += skb->len;
+
+	NAPI_GRO_CB(skb)->same_flow = 1;
+
+	return 0;
+}
+
+
+#define UDP_GRO_CNT_MAX 64	/* a flow is completed once it holds this many */
+static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+					       struct sk_buff *skb)
+{	/* Plain UDP GRO: try to merge @skb into a held packet with same ports */
+	struct udphdr *uh = udp_gro_udphdr(skb);
+	struct sk_buff *pp = NULL;	/* held packet to complete, if any */
+	struct udphdr *uh2;
+	struct sk_buff *p;
+	unsigned int ulen;
+	int ret = 0;
+
+	/* requires non zero csum, for symmetry with GSO */
+	if (!uh->check) {
+		NAPI_GRO_CB(skb)->flush = 1;
+		return NULL;
+	}
+
+	/* Do not deal with padded or malicious packets, sorry ! */
+	ulen = ntohs(uh->len);
+	if (ulen <= sizeof(*uh) || ulen != skb_gro_len(skb)) {
+		NAPI_GRO_CB(skb)->flush = 1;
+		return NULL;
+	}
+	/* pull encapsulating udp header */
+	skb_gro_pull(skb, sizeof(struct udphdr));
+
+	list_for_each_entry(p, head, list) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		uh2 = udp_hdr(p);
+
+		/* Match ports only, as csum is always non zero */
+		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		if (NAPI_GRO_CB(skb)->is_flist != NAPI_GRO_CB(p)->is_flist) {
+			NAPI_GRO_CB(skb)->flush = 1;	/* modes differ: no merge */
+			return p;
+		}
+
+		/* Terminate the flow on len mismatch or if it grows "too much".
+		 * Under small packet flood GRO count could otherwise grow a lot
+		 * leading to excessive truesize values.
+		 * On len mismatch merge the first packet shorter than gso_size,
+		 * otherwise complete the GRO packet.
+		 */
+		if (ulen > ntohs(uh2->len)) {
+			pp = p;	/* @skb is longer than the flow's segments */
+		} else {
+			if (NAPI_GRO_CB(skb)->is_flist) {
+				if (!pskb_may_pull(skb, skb_gro_offset(skb))) {
+					NAPI_GRO_CB(skb)->flush = 1;
+					return NULL;
+				}
+				if ((skb->ip_summed != p->ip_summed) ||
+				    (skb->csum_level != p->csum_level)) {
+					NAPI_GRO_CB(skb)->flush = 1;
+					return NULL;
+				}
+				ret = skb_gro_receive_list(p, skb);
+			} else {
+				skb_gro_postpull_rcsum(skb, uh,
+						       sizeof(struct udphdr));
+
+				ret = skb_gro_receive(p, skb);
+			}
+		}
+
+		if (ret || ulen != ntohs(uh2->len) ||	/* error, short tail, or */
+		    NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)	/* flow is full */
+			pp = p;
+
+		return pp;
+	}
+
+	/* mismatch, but we never need to flush */
+	return NULL;
+}
+
+struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
+				struct udphdr *uh, struct sock *sk)
+{	/* Common UDP GRO receive: plain UDP aggregation or tunnel (@sk) GRO */
+	struct sk_buff *pp = NULL;
+	struct sk_buff *p;
+	struct udphdr *uh2;
+	unsigned int off = skb_gro_offset(skb);
+	int flush = 1;	/* cleared only once the tunnel checks below pass */
+
+	/* we can do L4 aggregation only if the packet can't land in a tunnel
+	 * otherwise we could corrupt the inner stream
+	 */
+	NAPI_GRO_CB(skb)->is_flist = 0;
+	if (!sk || !udp_sk(sk)->gro_receive) {
+		if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
+			NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1;
+
+		if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
+		    (sk && udp_test_bit(GRO_ENABLED, sk)) || NAPI_GRO_CB(skb)->is_flist)
+			return call_gro_receive(udp_gro_receive_segment, head, skb);
+
+		/* no GRO, be sure to flush the current packet */
+		goto out;
+	}
+
+	if (NAPI_GRO_CB(skb)->encap_mark ||	/* already went through a tunnel */
+	    (uh->check && skb->ip_summed != CHECKSUM_PARTIAL &&
+	     NAPI_GRO_CB(skb)->csum_cnt == 0 &&
+	     !NAPI_GRO_CB(skb)->csum_valid))
+		goto out;
+
+	/* mark that this skb passed once through the tunnel gro layer */
+	NAPI_GRO_CB(skb)->encap_mark = 1;
+
+	flush = 0;
+
+	list_for_each_entry(p, head, list) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		uh2 = (struct udphdr   *)(p->data + off);
+
+		/* Match ports, and require the checksums to be either both
+		 * zero or both nonzero.
+		 */
+		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) ||
+		    (!uh->check ^ !uh2->check)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+	}
+
+	skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
+	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+	pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
+
+out:
+	skb_gro_flush_final(skb, pp, flush);
+	return pp;
+}
+EXPORT_SYMBOL(udp_gro_receive);
+
+static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
+					__be16 dport)
+{	/* Look up the IPv4 UDP socket matching this GRO skb's addresses/ports */
+	const struct iphdr *ip4h = skb_gro_network_header(skb);
+	struct net *netns = dev_net(skb->dev);
+	int in_if, sdif;
+
+	inet_get_iif_sdif(skb, &in_if, &sdif);
+	/* Search the netns UDP table; no skb is passed to the lookup itself */
+	return __udp4_lib_lookup(netns, ip4h->saddr, sport,
+				 ip4h->daddr, dport, in_if,
+				 sdif, netns->ipv4.udp_table, NULL);
+}
+
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
+{	/* .gro_receive for IPv4 UDP: check the csum, then udp_gro_receive() */
+	struct udphdr *uh = udp_gro_udphdr(skb);
+	struct sock *sk = NULL;	/* stays NULL unless the encap lookup runs */
+	struct sk_buff *pp;
+
+	if (unlikely(!uh))	/* no UDP header obtained */
+		goto flush;
+
+	/* Don't bother verifying checksum if we're going to flush anyway. */
+	if (NAPI_GRO_CB(skb)->flush)
+		goto skip;
+
+	if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check,
+						 inet_gro_compute_pseudo))
+		goto flush;	/* checksum validation failed */
+	else if (uh->check)
+		skb_gro_checksum_try_convert(skb, IPPROTO_UDP,
+					     inet_gro_compute_pseudo);
+skip:
+	NAPI_GRO_CB(skb)->is_ipv6 = 0;
+
+	if (static_branch_unlikely(&udp_encap_needed_key))	/* encap in use */
+		sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest);
+
+	pp = udp_gro_receive(head, skb, uh, sk);
+	return pp;
+
+flush:
+	NAPI_GRO_CB(skb)->flush = 1;
+	return NULL;
+}
+
+static int udp_gro_complete_segment(struct sk_buff *skb)
+{	/* Finish a plain (non-tunnel) UDP GRO packet as a UDP_L4 GSO skb */
+	struct udphdr *udph = udp_hdr(skb);
+
+	/* Describe where the UDP checksum lives and mark it CHECKSUM_PARTIAL */
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum_offset = offsetof(struct udphdr, check);
+	skb->csum_start = (unsigned char *)udph - skb->head;
+	skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
+	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+	if (skb->encapsulation)
+		skb->inner_transport_header = skb->transport_header;
+
+	return 0;
+}
+
+int udp_gro_complete(struct sk_buff *skb, int nhoff,
+		     udp_lookup_t lookup)
+{	/* Finalize a UDP GRO packet: fix uh->len, then tunnel or plain path */
+	__be16 newlen = htons(skb->len - nhoff);	/* bytes from UDP header on */
+	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+	struct sock *sk;
+	int err;
+
+	uh->len = newlen;
+
+	sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
+				udp4_lib_lookup_skb, skb, uh->source, uh->dest);
+	if (sk && udp_sk(sk)->gro_complete) {	/* socket has a tunnel hook */
+		skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
+					: SKB_GSO_UDP_TUNNEL;
+
+		/* clear the encap mark, so that inner frag_list gro_complete
+		 * can take place
+		 */
+		NAPI_GRO_CB(skb)->encap_mark = 0;
+
+		/* Set encapsulation before calling into inner gro_complete()
+		 * functions to make them set up the inner offsets.
+		 */
+		skb->encapsulation = 1;
+		err = udp_sk(sk)->gro_complete(sk, skb,
+				nhoff + sizeof(struct udphdr));
+	} else {
+		err = udp_gro_complete_segment(skb);	/* plain UDP_L4 result */
+	}
+
+	if (skb->remcsum_offload)
+		skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM;
+
+	return err;
+}
+EXPORT_SYMBOL(udp_gro_complete);
+
+INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
+{	/* .gro_complete for IPv4 UDP: finalize an aggregated packet */
+	const struct iphdr *iph = ip_hdr(skb);
+	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+
+	/* do fraglist only if there is no outer UDP encap (or we already processed it) */
+	if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
+		uh->len = htons(skb->len - nhoff);
+
+		skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
+		skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+		/* Convert only CHECKSUM_NONE; PARTIAL and COMPLETE are kept as-is */
+		if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+			if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
+				skb->csum_level++;
+		} else if (skb->ip_summed == CHECKSUM_NONE) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			skb->csum_level = 0;
+		}
+
+		return 0;
+	}
+
+	if (uh->check)	/* seed the field with the folded pseudo-header sum */
+		uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
+					  iph->daddr, 0);
+
+	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
+}
+
+static const struct net_offload udpv4_offload = {	/* IPv4 UDP GSO/GRO hooks */
+	.callbacks = {
+		.gso_segment = udp4_ufo_fragment,
+		.gro_receive  =	udp4_gro_receive,
+		.gro_complete =	udp4_gro_complete,
+	},
+};
+
+int __init udpv4_offload_init(void)
+{	/* Register udpv4_offload for IPPROTO_UDP; returns inet_add_offload()'s result */
+	return inet_add_offload(&udpv4_offload, IPPROTO_UDP);
+}
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
new file mode 100644
index 0000000000..1e7e4aecdc
--- /dev/null
+++ b/net/ipv4/udp_tunnel_core.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <net/dst_metadata.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+
+int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
+		     struct socket **sockp)
+{	/* Create a kernel IPv4 UDP socket per @cfg; 0 or negative error */
+	int err;
+	struct socket *sock = NULL;	/* NULL tells the error path to skip release */
+	struct sockaddr_in udp_addr;
+
+	err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock);
+	if (err < 0)
+		goto error;
+
+	if (cfg->bind_ifindex) {	/* optional bind to a device index */
+		err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true);
+		if (err < 0)
+			goto error;
+	}
+
+	udp_addr.sin_family = AF_INET;
+	udp_addr.sin_addr = cfg->local_ip;
+	udp_addr.sin_port = cfg->local_udp_port;
+	err = kernel_bind(sock, (struct sockaddr *)&udp_addr,
+			  sizeof(udp_addr));
+	if (err < 0)
+		goto error;
+
+	if (cfg->peer_udp_port) {	/* connect only when a peer port is set */
+		udp_addr.sin_family = AF_INET;
+		udp_addr.sin_addr = cfg->peer_ip;
+		udp_addr.sin_port = cfg->peer_udp_port;
+		err = kernel_connect(sock, (struct sockaddr *)&udp_addr,
+				     sizeof(udp_addr), 0);
+		if (err < 0)
+			goto error;
+	}
+
+	sock->sk->sk_no_check_tx = !cfg->use_udp_checksums;
+
+	*sockp = sock;	/* success: the caller now owns the socket */
+	return 0;
+
+error:
+	if (sock) {
+		kernel_sock_shutdown(sock, SHUT_RDWR);
+		sock_release(sock);
+	}
+	*sockp = NULL;	/* on failure the caller always sees NULL */
+	return err;
+}
+EXPORT_SYMBOL(udp_sock_create4);
+
+void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
+			   struct udp_tunnel_sock_cfg *cfg)
+{	/* Turn @sock into a UDP tunnel socket configured from @cfg */
+	struct sock *sk = sock->sk;
+	struct udp_sock *up = udp_sk(sk);
+
+	/* Disable multicast loopback */
+	inet_clear_bit(MC_LOOP, sk);
+	/* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */
+	inet_inc_convert_csum(sk);
+	rcu_assign_sk_user_data(sk, cfg->sk_user_data);
+
+	/* Install the tunnel's encap and GRO callbacks before enabling encap */
+	up->encap_type = cfg->encap_type;
+	up->encap_rcv = cfg->encap_rcv;
+	up->encap_err_rcv = cfg->encap_err_rcv;
+	up->encap_err_lookup = cfg->encap_err_lookup;
+	up->encap_destroy = cfg->encap_destroy;
+	up->gro_receive = cfg->gro_receive;
+	up->gro_complete = cfg->gro_complete;
+
+	udp_tunnel_encap_enable(sk);
+}
+EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
+
+void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
+			     unsigned short type)
+{	/* Tell @dev that @sock's local port carries tunnels of @type */
+	struct sock *sk = sock->sk;
+	struct udp_tunnel_info ti = {	/* unnamed members (hw_priv) are zeroed */
+		.type		= type,
+		.sa_family	= sk->sk_family,
+		.port		= inet_sk(sk)->inet_sport,
+	};
+
+	udp_tunnel_nic_add_port(dev, &ti);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port);
+
+void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock,
+			     unsigned short type)
+{	/* Tell @dev that @sock's local port no longer carries @type tunnels */
+	struct sock *sk = sock->sk;
+	struct udp_tunnel_info ti = {	/* unnamed members (hw_priv) are zeroed */
+		.type		= type,
+		.sa_family	= sk->sk_family,
+		.port		= inet_sk(sk)->inet_sport,
+	};
+
+	udp_tunnel_nic_del_port(dev, &ti);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port);
+
+/* Notify netdevs that UDP port started listening */
+void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
+{	/* Walks every netdev of the socket's netns under rcu_read_lock() */
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct net_device *dev;
+	struct udp_tunnel_info ti = {	/* unnamed members (hw_priv) are zeroed */
+		.type		= type,
+		.sa_family	= sk->sk_family,
+		.port		= inet_sk(sk)->inet_sport,
+	};
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		udp_tunnel_nic_add_port(dev, &ti);
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port);
+
+/* Notify netdevs that UDP port is no longer listening */
+void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
+{	/* Walks every netdev of the socket's netns under rcu_read_lock() */
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct net_device *dev;
+	struct udp_tunnel_info ti = {	/* unnamed members (hw_priv) are zeroed */
+		.type		= type,
+		.sa_family	= sk->sk_family,
+		.port		= inet_sk(sk)->inet_sport,
+	};
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		udp_tunnel_nic_del_port(dev, &ti);
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port);
+
+void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
+			 __be32 src, __be32 dst, __u8 tos, __u8 ttl,
+			 __be16 df, __be16 src_port, __be16 dst_port,
+			 bool xnet, bool nocheck)
+{	/* Prepend a UDP header to @skb and hand it to iptunnel_xmit() */
+	struct udphdr *uh;
+
+	__skb_push(skb, sizeof(*uh));	/* make room for the UDP header */
+	skb_reset_transport_header(skb);
+	uh = udp_hdr(skb);
+
+	uh->dest = dst_port;
+	uh->source = src_port;
+	uh->len = htons(skb->len);	/* skb->len now includes the UDP header */
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+	udp_set_csum(nocheck, skb, src, dst, skb->len);
+
+	iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
+
+void udp_tunnel_sock_release(struct socket *sock)
+{	/* Detach the tunnel's user data, then shut down and release @sock */
+	rcu_assign_sk_user_data(sock->sk, NULL);
+	synchronize_rcu();	/* wait for RCU readers before tearing down */
+	kernel_sock_shutdown(sock, SHUT_RDWR);
+	sock_release(sock);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
+
+struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb,  unsigned short family,
+				    __be16 flags, __be64 tunnel_id, int md_size)
+{	/* Build rx tunnel metadata and record the outer UDP ports/csum flag */
+	struct metadata_dst *md;
+	struct ip_tunnel_key *key;
+
+	if (family != AF_INET)
+		md = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size);
+	else
+		md = ip_tun_rx_dst(skb, flags, tunnel_id, md_size);
+	if (!md)
+		return NULL;
+
+	key = &md->u.tun_info.key;
+	key->tp_src = udp_hdr(skb)->source;
+	key->tp_dst = udp_hdr(skb)->dest;
+	if (udp_hdr(skb)->check)
+		key->tun_flags |= TUNNEL_CSUM;
+	return md;
+}
+EXPORT_SYMBOL_GPL(udp_tun_rx_dst);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
new file mode 100644
index 0000000000..0292197497
--- /dev/null
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -0,0 +1,975 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (c) 2020 Facebook Inc.
+
+#include <linux/ethtool_netlink.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <net/udp_tunnel.h>
+#include <net/vxlan.h>
+
+enum udp_tunnel_nic_table_entry_flags {	/* bits of entry->flags */
+	UDP_TUNNEL_NIC_ENTRY_ADD	= BIT(0), /* add to device is queued */
+	UDP_TUNNEL_NIC_ENTRY_DEL	= BIT(1), /* removal from device is queued */
+	UDP_TUNNEL_NIC_ENTRY_OP_FAIL	= BIT(2), /* last device op returned error */
+	UDP_TUNNEL_NIC_ENTRY_FROZEN	= BIT(3), /* use count adjustments skipped */
+};
+
+struct udp_tunnel_nic_table_entry {	/* one slot of a device port table */
+	__be16 port;	/* UDP port, network byte order */
+	u8 type;	/* tunnel type, copied from udp_tunnel_info.type */
+	u8 flags;	/* enum udp_tunnel_nic_table_entry_flags bits */
+	u16 use_cnt;	/* users of this port; 0 means not in use */
+#define UDP_TUNNEL_NIC_USE_CNT_MAX	U16_MAX
+	u8 hw_priv;	/* value set through the set_port_priv op */
+};
+
+/**
+ * struct udp_tunnel_nic - UDP tunnel port offload state
+ * @work:	async work for talking to hardware from process context
+ * @dev:	netdev pointer
+ * @need_sync:	at least one port state changed
+ * @need_replay: space was freed, we need a replay of all ports
+ * @work_pending: @work is currently scheduled
+ * @n_tables:	number of tables under @entries
+ * @missed:	bitmap of tables which overflowed
+ * @entries:	table of tables of ports currently offloaded
+ */
+struct udp_tunnel_nic {
+	struct work_struct work;
+
+	struct net_device *dev;
+
+	u8 need_sync:1;
+	u8 need_replay:1;
+	u8 work_pending:1;
+
+	unsigned int n_tables;
+	unsigned long missed;
+	struct udp_tunnel_nic_table_entry **entries;
+};
+
+/* We ensure all work structs are done using driver state, but not the code.
+ * We need a workqueue we can flush before module gets removed.
+ */
+static struct workqueue_struct *udp_tunnel_nic_workqueue;
+
+static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type)
+{	/* Map a UDP_TUNNEL_TYPE_* value to a printable name */
+	if (type == UDP_TUNNEL_TYPE_VXLAN)
+		return "vxlan";
+
+	if (type == UDP_TUNNEL_TYPE_GENEVE)
+		return "geneve";
+
+	if (type == UDP_TUNNEL_TYPE_VXLAN_GPE)
+		return "vxlan-gpe";
+
+	return "unknown";
+}
+
+static bool /* slot is unused: no users and no flag bits set */
+udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry)
+{
+	return !(entry->use_cnt || entry->flags);
+}
+
+static bool /* slot is in use and has no flag set other than FROZEN */
+udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry)
+{
+	return !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN) && entry->use_cnt;
+}
+
+static bool /* slot currently carries the FROZEN flag */
+udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry)
+{
+	return !!(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN);
+}
+
+static void /* set FROZEN on the slot unless it is completely free */
+udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry)
+{
+	if (entry->use_cnt || entry->flags)
+		entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN;
+}
+
+static void /* clear FROZEN, leaving every other flag untouched */
+udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry)
+{
+	entry->flags = entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN;
+}
+
+static bool /* an ADD or DEL towards the device is pending for this slot */
+udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry)
+{
+	return (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) ||
+	       (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL);
+}
+
+static void
+udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn,
+			   struct udp_tunnel_nic_table_entry *entry,
+			   unsigned int flag)
+{	/* Set the pending-op @flag on @entry and request a device sync */
+	utn->need_sync = 1;
+	entry->flags |= flag;
+}
+
+static void
+udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry,
+			     struct udp_tunnel_info *ti)
+{	/* Zero @ti, then copy the slot's port, type and hw_priv into it */
+	memset(ti, 0, sizeof(struct udp_tunnel_info));
+	ti->hw_priv = entry->hw_priv;
+	ti->type = entry->type;
+	ti->port = entry->port;
+}
+
+static bool
+udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn)
+{	/* true when every slot of every table is free */
+	const struct udp_tunnel_nic_info *nic_info = dev->udp_tunnel_nic_info;
+	unsigned int tbl, slot;
+
+	for (tbl = 0; tbl < utn->n_tables; tbl++)
+		for (slot = 0; slot < nic_info->tables[tbl].n_entries; slot++)
+			if (!udp_tunnel_nic_entry_is_free(&utn->entries[tbl][slot]))
+				return false;
+	return true;
+}
+
+static bool
+udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
+{	/* true if a table flagged in @missed has at least one free slot */
+	const struct udp_tunnel_nic_table_info *tbl;
+	unsigned int t, e;
+
+	if (!utn->missed)
+		return false;
+
+	for (t = 0; t < utn->n_tables; t++) {
+		if (!test_bit(t, &utn->missed))
+			continue;
+
+		tbl = &dev->udp_tunnel_nic_info->tables[t];
+		for (e = 0; e < tbl->n_entries; e++)
+			if (udp_tunnel_nic_entry_is_free(&utn->entries[t][e]))
+				return true;
+	}
+
+	return false;
+}
+
+static void
+__udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table,
+			  unsigned int idx, struct udp_tunnel_info *ti)
+{	/* get_port op: describe slot [table][idx] in @ti when it is in use */
+	struct udp_tunnel_nic *utn = dev->udp_tunnel_nic;
+	struct udp_tunnel_nic_table_entry *slot = &utn->entries[table][idx];
+
+	/* Slots without users leave @ti untouched */
+	if (!slot->use_cnt)
+		return;
+
+	udp_tunnel_nic_ti_from_entry(slot, ti);
+}
+
+static void
+__udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table,
+			       unsigned int idx, u8 priv)
+{	/* set_port_priv op: store @priv as hw_priv of slot [table][idx] */
+	dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv;
+}
+
+static void
+udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry,
+				 int err)
+{	/* Update @entry's flags after a device op finished with status @err */
+	bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; /* failed before */
+
+	WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD &&
+		     entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL);
+
+	if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD &&	/* -EEXIST after an */
+	    (!err || (err == -EEXIST && dodgy)))	/* earlier failure is ok */
+		entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD;
+
+	if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL &&	/* likewise -ENOENT */
+	    (!err || (err == -ENOENT && dodgy)))
+		entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL;
+
+	if (!err)	/* remember whether this op failed for the next attempt */
+		entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
+	else
+		entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
+}
+
+static void
+udp_tunnel_nic_device_sync_one(struct net_device *dev,
+			       struct udp_tunnel_nic *utn,
+			       unsigned int table, unsigned int idx)
+{	/* Push the queued ADD or DEL of one slot via set_port/unset_port */
+	struct udp_tunnel_nic_table_entry *entry;
+	struct udp_tunnel_info ti;
+	int err;
+
+	entry = &utn->entries[table][idx];
+	if (!udp_tunnel_nic_entry_is_queued(entry))
+		return;	/* nothing pending for this slot */
+
+	udp_tunnel_nic_ti_from_entry(entry, &ti);
+	if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD)
+		err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti);
+	else
+		err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx,
+							   &ti);
+	udp_tunnel_nic_entry_update_done(entry, err);
+
+	if (err)
+		netdev_warn(dev,
+			    "UDP tunnel port sync failed port %d type %s: %d\n",
+			    be16_to_cpu(entry->port),
+			    udp_tunnel_nic_tunnel_type_name(entry->type),
+			    err);
+}
+
+static void
+udp_tunnel_nic_device_sync_by_port(struct net_device *dev,
+				   struct udp_tunnel_nic *utn)
+{	/* Per-port sync: visit every slot of every table in order */
+	const struct udp_tunnel_nic_info *nic_info = dev->udp_tunnel_nic_info;
+	unsigned int tbl, slot;
+
+	for (tbl = 0; tbl < utn->n_tables; tbl++)
+		for (slot = 0; slot < nic_info->tables[tbl].n_entries; slot++)
+			udp_tunnel_nic_device_sync_one(dev, utn, tbl, slot);
+}
+
+static void
+udp_tunnel_nic_device_sync_by_table(struct net_device *dev,
+				    struct udp_tunnel_nic *utn)
+{	/* Per-table sync: call sync_table once for each table with queued ops */
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	unsigned int i, j;
+	int err;
+
+	for (i = 0; i < utn->n_tables; i++) {
+		/* Find something that needs sync in this table */
+		for (j = 0; j < info->tables[i].n_entries; j++)
+			if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j]))
+				break;
+		if (j == info->tables[i].n_entries)
+			continue;	/* no queued entry in this table */
+
+		err = info->sync_table(dev, i);
+		if (err)
+			netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n",
+				    i, err);
+
+		for (j = 0; j < info->tables[i].n_entries; j++) {
+			struct udp_tunnel_nic_table_entry *entry;
+
+			entry = &utn->entries[i][j];
+			if (udp_tunnel_nic_entry_is_queued(entry))	/* same @err */
+				udp_tunnel_nic_entry_update_done(entry, err);
+		}
+	}
+}
+
+static void
+__udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
+{	/* Run the pending sync now, by table if the driver has sync_table */
+	if (!utn->need_sync)
+		return;
+
+	if (dev->udp_tunnel_nic_info->sync_table)
+		udp_tunnel_nic_device_sync_by_table(dev, utn);
+	else
+		udp_tunnel_nic_device_sync_by_port(dev, utn);
+
+	utn->need_sync = 0;
+	/* Can't replay directly here, in case we come from the tunnel driver's
+	 * notification - trying to replay may deadlock inside tunnel driver.
+	 */
+	utn->need_replay = udp_tunnel_nic_should_replay(dev, utn);
+}
+
+static void
+udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
+{	/* Sync now when the driver may not sleep, else defer to the workqueue */
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	bool may_sleep;
+
+	if (!utn->need_sync)
+		return;
+
+	/* Drivers which sleep in the callback need to update from
+	 * the workqueue, if we come from the tunnel driver's notification.
+	 */
+	may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP;
+	if (!may_sleep)
+		__udp_tunnel_nic_device_sync(dev, utn);
+	if (may_sleep || utn->need_replay) {	/* work left for process context */
+		queue_work(udp_tunnel_nic_workqueue, &utn->work);
+		utn->work_pending = 1;
+	}
+}
+
+static bool /* does @table's tunnel_types mask include @ti's type? */
+udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table,
+				struct udp_tunnel_info *ti)
+{
+	return !!(table->tunnel_types & ti->type);
+}
+
+static bool
+udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn,
+			  struct udp_tunnel_info *ti)
+{	/* Can any table of @dev take a port of @ti's type and family? */
+	const struct udp_tunnel_nic_info *nic_info = dev->udp_tunnel_nic_info;
+	unsigned int tbl;
+
+	/* IPv4-only NICs reject ports of any other address family */
+	if (ti->sa_family != AF_INET &&
+	    nic_info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY)
+		return false;
+
+	for (tbl = 0; tbl < utn->n_tables; tbl++)
+		if (nic_info->tables[tbl].tunnel_types & ti->type)
+			return true;
+	return false;
+}
+
+static bool
+udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn,
+			     struct udp_tunnel_info *ti)
+{	/* true if @ti's port is tracked with another type; marks table missed */
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic_table_entry *entry;
+	unsigned int i, j;
+
+	for (i = 0; i < utn->n_tables; i++)
+		for (j = 0; j < info->tables[i].n_entries; j++) {
+			entry = &utn->entries[i][j];
+
+			if (!udp_tunnel_nic_entry_is_free(entry) &&
+			    entry->port == ti->port &&
+			    entry->type != ti->type) {
+				__set_bit(i, &utn->missed);	/* replay it later */
+				return true;
+			}
+		}
+	return false;
+}
+
+static void
+udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn,
+			 unsigned int table, unsigned int idx, int use_cnt_adj)
+{	/* Apply @use_cnt_adj to a slot and queue an ADD/DEL when needed */
+	struct udp_tunnel_nic_table_entry *entry =  &utn->entries[table][idx];
+	bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
+	unsigned int from, to;	/* op to cancel / op to queue */
+
+	WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX);
+
+	/* If not going from used to unused or vice versa - all done.
+	 * For dodgy entries make sure we try to sync again (queue the entry).
+	 */
+	entry->use_cnt += use_cnt_adj;
+	if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj))
+		return;
+
+	/* Cancel the op before it was sent to the device, if possible,
+	 * otherwise we'd need to take special care to issue commands
+	 * in the same order the ports arrived.
+	 */
+	if (use_cnt_adj < 0) {
+		from = UDP_TUNNEL_NIC_ENTRY_ADD;
+		to = UDP_TUNNEL_NIC_ENTRY_DEL;
+	} else {
+		from = UDP_TUNNEL_NIC_ENTRY_DEL;
+		to = UDP_TUNNEL_NIC_ENTRY_ADD;
+	}
+
+	if (entry->flags & from) {	/* opposite op still queued: drop it */
+		entry->flags &= ~from;
+		if (!dodgy)
+			return;
+	}
+
+	udp_tunnel_nic_entry_queue(utn, entry, to);
+}
+
+static bool
+udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn,
+			     unsigned int table, unsigned int idx,
+			     struct udp_tunnel_info *ti, int use_cnt_adj)
+{	/* Adjust slot [table][idx] if it tracks @ti; returns whether it does */
+	struct udp_tunnel_nic_table_entry *slot = &utn->entries[table][idx];
+	bool tracks_ti;
+
+	tracks_ti = !udp_tunnel_nic_entry_is_free(slot) &&
+		    slot->port == ti->port && slot->type == ti->type;
+	if (!tracks_ti)
+		return false;
+
+	/* Frozen slots still count as a match but keep their use count */
+	if (!udp_tunnel_nic_entry_is_frozen(slot))
+		udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj);
+	return true;
+}
+
+/* Look for a slot that already tracks @ti and adjust its use count by
+ * @use_cnt_adj instead of adding a new one. Returns true if such a slot was
+ * found. On delete the use count may drop to zero, in which case the slot
+ * gets queued for removal.
+ */
+static bool
+udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn,
+			    struct udp_tunnel_info *ti, int use_cnt_adj)
+{
+	const struct udp_tunnel_nic_table_info *tbl;
+	unsigned int t, e;
+
+	for (t = 0; t < utn->n_tables; t++) {
+		tbl = &dev->udp_tunnel_nic_info->tables[t];
+		if (!(tbl->tunnel_types & ti->type))
+			continue;
+
+		for (e = 0; e < tbl->n_entries; e++)
+			if (udp_tunnel_nic_entry_try_adj(utn, t, e, ti,
+							 use_cnt_adj))
+				return true;
+	}
+
+	return false;
+}
+
+static bool /* take one more reference on a slot already tracking @ti */
+udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn,
+			    struct udp_tunnel_info *ti)
+{
+	return udp_tunnel_nic_try_existing(dev, utn, ti, +1);
+}
+
+static bool /* drop one reference from the slot tracking @ti, if any */
+udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn,
+			    struct udp_tunnel_info *ti)
+{
+	return udp_tunnel_nic_try_existing(dev, utn, ti, -1);
+}
+
+static bool
+udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn,
+		       struct udp_tunnel_info *ti)
+{	/* Claim the first free slot in a capable table; false if none */
+	const struct udp_tunnel_nic_table_info *table;
+	unsigned int i, j;
+
+	for (i = 0; i < utn->n_tables; i++) {
+		table = &dev->udp_tunnel_nic_info->tables[i];
+		if (!udp_tunnel_nic_table_is_capable(table, ti))
+			continue;
+
+		for (j = 0; j < table->n_entries; j++) {
+			struct udp_tunnel_nic_table_entry *entry;
+
+			entry = &utn->entries[i][j];
+			if (!udp_tunnel_nic_entry_is_free(entry))
+				continue;
+
+			entry->port = ti->port;
+			entry->type = ti->type;
+			entry->use_cnt = 1;	/* first user of this slot */
+			udp_tunnel_nic_entry_queue(utn, entry,
+						   UDP_TUNNEL_NIC_ENTRY_ADD);
+			return true;
+		}
+
+		/* The different table may still fit this port in, but there
+		 * are no devices currently which have multiple tables accepting
+		 * the same tunnel type, and false positives are okay.
+		 */
+		__set_bit(i, &utn->missed);
+	}
+
+	return false;
+}
+
+static void
+__udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti)
+{	/* add_port op: start tracking @ti on @dev and sync the device */
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic *utn;
+
+	utn = dev->udp_tunnel_nic;
+	if (!utn)	/* device has no offload state */
+		return;
+	if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)
+		return;	/* OPEN_ONLY devices take ports only while running */
+	if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN &&
+	    ti->port == htons(IANA_VXLAN_UDP_PORT)) {
+		if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
+			netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n");
+		return;	/* this port is never added to the tables here */
+	}
+
+	if (!udp_tunnel_nic_is_capable(dev, utn, ti))
+		return;
+
+	/* It may happen that a tunnel of one type is removed and different
+	 * tunnel type tries to reuse its port before the device was informed.
+	 * Rely on utn->missed to re-add this port later.
+	 */
+	if (udp_tunnel_nic_has_collision(dev, utn, ti))
+		return;
+
+	if (!udp_tunnel_nic_add_existing(dev, utn, ti))
+		udp_tunnel_nic_add_new(dev, utn, ti);
+
+	udp_tunnel_nic_device_sync(dev, utn);
+}
+
+static void
+__udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti)
+{	/* del_port op: drop one use of @ti on @dev and sync the device */
+	struct udp_tunnel_nic *utn;
+
+	utn = dev->udp_tunnel_nic;
+	if (!utn)	/* device has no offload state */
+		return;
+
+	if (!udp_tunnel_nic_is_capable(dev, utn, ti))
+		return;
+
+	udp_tunnel_nic_del_existing(dev, utn, ti);	/* result is not checked */
+
+	udp_tunnel_nic_device_sync(dev, utn);
+}
+
+static void __udp_tunnel_nic_reset_ntf(struct net_device *dev)
+{	/* reset_ntf op: re-queue an ADD for every used slot, then sync */
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic *utn;
+	unsigned int i, j;
+
+	ASSERT_RTNL();	/* caller is expected to hold RTNL */
+
+	utn = dev->udp_tunnel_nic;
+	if (!utn)
+		return;
+
+	utn->need_sync = false;	/* set again below if any slot is re-queued */
+	for (i = 0; i < utn->n_tables; i++)
+		for (j = 0; j < info->tables[i].n_entries; j++) {
+			struct udp_tunnel_nic_table_entry *entry;
+
+			entry = &utn->entries[i][j];
+
+			entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL |
+					  UDP_TUNNEL_NIC_ENTRY_OP_FAIL);
+			/* We don't release rtnl across ops */
+			WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN);
+			if (!entry->use_cnt)
+				continue;	/* unused slot: nothing to re-add */
+
+			udp_tunnel_nic_entry_queue(utn, entry,
+						   UDP_TUNNEL_NIC_ENTRY_ADD);
+		}
+
+	__udp_tunnel_nic_device_sync(dev, utn);
+}
+
+static size_t
+__udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table)
+{	/* dump_size op: netlink bytes needed for @table's present entries */
+	const struct udp_tunnel_nic_info *nic_info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic *utn = dev->udp_tunnel_nic;
+	size_t entry_sz, total = 0;
+	unsigned int idx;
+
+	if (!utn)
+		return 0;
+
+	/* Every dumped entry has the same footprint */
+	entry_sz = nla_total_size(0) +		    /* _TABLE_ENTRY */
+		   nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */
+		   nla_total_size(sizeof(u32));	    /* _ENTRY_TYPE */
+
+	/* Only entries reported as present contribute to the size */
+	for (idx = 0; idx < nic_info->tables[table].n_entries; idx++) {
+		if (udp_tunnel_nic_entry_is_present(&utn->entries[table][idx]))
+			total += entry_sz;
+	}
+
+	return total;
+}
+
+static int
+__udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table,
+			    struct sk_buff *skb)
+{	/* dump_write op: emit @table's present entries; 0 or -EMSGSIZE */
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic *utn;
+	struct nlattr *nest;
+	unsigned int j;
+
+	utn = dev->udp_tunnel_nic;
+	if (!utn)
+		return 0;	/* no offload state: nothing to write */
+
+	for (j = 0; j < info->tables[table].n_entries; j++) {
+		if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j]))
+			continue;
+
+		nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY);
+		if (!nest)
+			return -EMSGSIZE;
+
+		if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT,
+				 utn->entries[table][j].port) ||
+		    nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE,
+				ilog2(utn->entries[table][j].type)))	/* bit -> index */
+			goto err_cancel;
+
+		nla_nest_end(skb, nest);
+	}
+
+	return 0;
+
+err_cancel:
+	nla_nest_cancel(skb, nest);	/* undo the partially written nest */
+	return -EMSGSIZE;
+}
+
+static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = {	/* published via udp_tunnel_nic_ops at module init */
+	.get_port	= __udp_tunnel_nic_get_port,
+	.set_port_priv	= __udp_tunnel_nic_set_port_priv,
+	.add_port	= __udp_tunnel_nic_add_port,
+	.del_port	= __udp_tunnel_nic_del_port,
+	.reset_ntf	= __udp_tunnel_nic_reset_ntf,
+	.dump_size	= __udp_tunnel_nic_dump_size,
+	.dump_write	= __udp_tunnel_nic_dump_write,
+};
+
+static void
+udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	unsigned int i, j;
+	/* Flush: adjust each entry by minus its use count, sync, then wipe. */
+	for (i = 0; i < utn->n_tables; i++)
+		for (j = 0; j < info->tables[i].n_entries; j++) {
+			int adj_cnt = -utn->entries[i][j].use_cnt;
+
+			if (adj_cnt)
+				udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt);
+		}
+	/* Sync while the entries still carry the adjustments made above. */
+	__udp_tunnel_nic_device_sync(dev, utn);
+	/* Zero each table in full, sized by its driver-declared entry count. */
+	for (i = 0; i < utn->n_tables; i++)
+		memset(utn->entries[i], 0, array_size(info->tables[i].n_entries,
+						      sizeof(**utn->entries)));
+	WARN_ON(utn->need_sync);	/* a sync still pending here is unexpected */
+	utn->need_replay = 0;
+}
+
+static void
+udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic_shared_node *node;
+	unsigned int i, j;
+	/* Replay: request RX tunnel info again for @dev or for every sharer. */
+	/* Freeze all the ports we are already tracking so that the replay
+	 * does not double up the refcount.
+	 */
+	for (i = 0; i < utn->n_tables; i++)
+		for (j = 0; j < info->tables[i].n_entries; j++)
+			udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]);
+	utn->missed = 0;
+	utn->need_replay = 0;
+	/* A non-shared table replays one device; a shared one walks its list. */
+	if (!info->shared) {
+		udp_tunnel_get_rx_info(dev);
+	} else {
+		list_for_each_entry(node, &info->shared->devices, list)
+			udp_tunnel_get_rx_info(node->dev);
+	}
+	/* Replay requested: undo the freeze applied before it. */
+	for (i = 0; i < utn->n_tables; i++)
+		for (j = 0; j < info->tables[i].n_entries; j++)
+			udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]);
+}
+
+static void udp_tunnel_nic_device_sync_work(struct work_struct *work)
+{
+	struct udp_tunnel_nic *state;
+
+	/* The work item is embedded in the per-device tunnel state. */
+	state = container_of(work, struct udp_tunnel_nic, work);
+	rtnl_lock();
+	state->work_pending = 0;
+	__udp_tunnel_nic_device_sync(state->dev, state);
+	if (state->need_replay)
+		udp_tunnel_nic_replay(state->dev, state);
+	rtnl_unlock();
+}
+
+static struct udp_tunnel_nic *
+udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info,
+		     unsigned int n_tables)
+{
+	struct udp_tunnel_nic *state;
+	unsigned int t;
+
+	state = kzalloc(sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return NULL;
+	state->n_tables = n_tables;
+	INIT_WORK(&state->work, udp_tunnel_nic_device_sync_work);
+	state->entries = kmalloc_array(n_tables, sizeof(void *), GFP_KERNEL);
+	if (!state->entries)
+		goto free_state;
+
+	for (t = 0; t < n_tables; t++) {
+		state->entries[t] = kcalloc(info->tables[t].n_entries,
+					    sizeof(*state->entries[t]), GFP_KERNEL);
+		if (state->entries[t])
+			continue;
+		/* Unwind only the tables allocated so far, newest first. */
+		for (; t > 0; t--)
+			kfree(state->entries[t - 1]);
+		kfree(state->entries);
+		goto free_state;
+	}
+
+	return state;
+
+free_state:
+	kfree(state);
+	return NULL;
+}
+
+static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn)
+{
+	unsigned int t = 0;
+	/* Release each table, then the pointer array, then the state. */
+	while (t < utn->n_tables)
+		kfree(utn->entries[t++]);
+	kfree(utn->entries);
+	kfree(utn);
+}
+
+static int udp_tunnel_nic_register(struct net_device *dev)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic_shared_node *node = NULL;
+	struct udp_tunnel_nic *utn;
+	unsigned int n_tables, i;
+	/* Register: validate driver info, then create or join tunnel state. */
+	BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE <
+		     UDP_TUNNEL_NIC_MAX_TABLES);
+	/* Expect use count of at most 2 (IPv4, IPv6) per device */
+	BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX <
+		     UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2);
+
+	/* Check that the driver info is sane */
+	if (WARN_ON(!info->set_port != !info->unset_port) ||
+	    WARN_ON(!info->set_port == !info->sync_table) ||
+	    WARN_ON(!info->tables[0].n_entries))
+		return -EINVAL;
+
+	if (WARN_ON(info->shared &&
+		    info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
+		return -EINVAL;
+	/* Count tables; a populated table after an empty one is rejected. */
+	n_tables = 1;
+	for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) {
+		if (!info->tables[i].n_entries)
+			continue;
+
+		n_tables++;
+		if (WARN_ON(!info->tables[i - 1].n_entries))
+			return -EINVAL;
+	}
+
+	/* Create UDP tunnel state structures */
+	if (info->shared) {
+		node = kzalloc(sizeof(*node), GFP_KERNEL);
+		if (!node)
+			return -ENOMEM;
+
+		node->dev = dev;
+	}
+	/* Reuse state already published through info->shared, else allocate. */
+	if (info->shared && info->shared->udp_tunnel_nic_info) {
+		utn = info->shared->udp_tunnel_nic_info;
+	} else {
+		utn = udp_tunnel_nic_alloc(info, n_tables);
+		if (!utn) {
+			kfree(node);
+			return -ENOMEM;
+		}
+	}
+	/* The first sharer publishes the state; every sharer joins the list. */
+	if (info->shared) {
+		if (!info->shared->udp_tunnel_nic_info) {
+			INIT_LIST_HEAD(&info->shared->devices);
+			info->shared->udp_tunnel_nic_info = utn;
+		}
+
+		list_add_tail(&node->list, &info->shared->devices);
+	}
+	/* Point the state at @dev, hold the device, and attach the state. */
+	utn->dev = dev;
+	dev_hold(dev);
+	dev->udp_tunnel_nic = utn;
+	/* Without the OPEN_ONLY flag, RX info is requested straight away. */
+	if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
+		udp_tunnel_get_rx_info(dev);
+
+	return 0;
+}
+
+static void
+udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	/* Unregister: detach @dev; free the state unless it is still needed. */
+	/* For a shared table remove this dev from the list of sharing devices
+	 * and if there are other devices just detach.
+	 */
+	if (info->shared) {
+		struct udp_tunnel_nic_shared_node *node, *first;
+
+		list_for_each_entry(node, &info->shared->devices, list)
+			if (node->dev == dev)
+				break;
+		if (list_entry_is_head(node, &info->shared->devices, list))	/* @dev is not on the list */
+			return;
+		/* @dev was found: unlink and free its list node. */
+		list_del(&node->list);
+		kfree(node);
+		/* Any remaining sharer takes over as the state's device. */
+		first = list_first_entry_or_null(&info->shared->devices,
+						 typeof(*first), list);
+		if (first) {
+			udp_tunnel_drop_rx_info(dev);
+			utn->dev = first->dev;
+			goto release_dev;
+		}
+		/* Last sharer gone: unpublish the shared state. */
+		info->shared->udp_tunnel_nic_info = NULL;
+	}
+
+	/* Flush before we check work, so we don't waste time adding entries
+	 * from the work which we will boot immediately.
+	 */
+	udp_tunnel_nic_flush(dev, utn);
+
+	/* Wait for the work to be done using the state, netdev core will
+	 * retry unregister until we give up our reference on this device.
+	 */
+	if (utn->work_pending)
+		return;
+
+	udp_tunnel_nic_free(utn);
+release_dev:
+	dev->udp_tunnel_nic = NULL;
+	dev_put(dev);
+}
+
+static int
+udp_tunnel_nic_netdevice_event(struct notifier_block *unused,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic *utn;
+	int err;
+
+	/* Devices that publish no tunnel info are not our concern. */
+	if (!info)
+		return NOTIFY_DONE;
+
+	/* Registration is the only event handled before state exists. */
+	if (event == NETDEV_REGISTER) {
+		err = udp_tunnel_nic_register(dev);
+		if (err)
+			netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err);
+		return notifier_from_errno(err);
+	}
+
+	/* All other events will need the udp_tunnel_nic state */
+	utn = dev->udp_tunnel_nic;
+	if (!utn)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		udp_tunnel_nic_unregister(dev, utn);
+		return NOTIFY_OK;
+	case NETDEV_UP:
+		/* Up/down only matter if NIC has to be programmed open */
+		if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
+			break;
+		WARN_ON(!udp_tunnel_nic_is_empty(dev, utn));
+		udp_tunnel_get_rx_info(dev);
+		return NOTIFY_OK;
+	case NETDEV_GOING_DOWN:
+		if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
+			break;
+		udp_tunnel_nic_flush(dev, utn);
+		return NOTIFY_OK;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = {
+	.notifier_call = udp_tunnel_nic_netdevice_event,	/* netdevice event entry point */
+};
+
+static int __init udp_tunnel_nic_init_module(void)
+{
+	int err;
+
+	udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0);
+	if (!udp_tunnel_nic_workqueue)
+		return -ENOMEM;
+
+	/* Publish the ops under rtnl before listening for netdev events. */
+	rtnl_lock();
+	udp_tunnel_nic_ops = &__udp_tunnel_nic_ops;
+	rtnl_unlock();
+
+	err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block);
+	if (!err)
+		return 0;
+
+	/* Notifier registration failed: retract the ops and the workqueue. */
+	rtnl_lock();
+	udp_tunnel_nic_ops = NULL;
+	rtnl_unlock();
+	destroy_workqueue(udp_tunnel_nic_workqueue);
+
+	return err;
+}
+late_initcall(udp_tunnel_nic_init_module);
+
+static void __exit udp_tunnel_nic_cleanup_module(void)
+{
+	unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block);
+	/* With the notifier gone, retract the ops pointer under rtnl. */
+	rtnl_lock();
+	udp_tunnel_nic_ops = NULL;
+	rtnl_unlock();
+	/* The workqueue is destroyed last. */
+	destroy_workqueue(udp_tunnel_nic_workqueue);
+}
+module_exit(udp_tunnel_nic_cleanup_module);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udp_tunnel_stub.c b/net/ipv4/udp_tunnel_stub.c
new file mode 100644
index 0000000000..c4b2888f5f
--- /dev/null
+++ b/net/ipv4/udp_tunnel_stub.c
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (c) 2020 Facebook Inc.
+
+#include <net/udp_tunnel.h>
+
+const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops;	/* no initializer here: starts out NULL */
+EXPORT_SYMBOL_GPL(udp_tunnel_nic_ops);
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
new file mode 100644
index 0000000000..af37af3ab7
--- /dev/null
+++ b/net/ipv4/udplite.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *  UDPLITE     An implementation of the UDP-Lite protocol (RFC 3828).
+ *
+ *  Authors:    Gerrit Renker       <gerrit@erg.abdn.ac.uk>
+ *
+ *  Changes:
+ *  Fixes:
+ */
+
+#define pr_fmt(fmt) "UDPLite: " fmt
+
+#include <linux/export.h>
+#include <linux/proc_fs.h>
+#include "udp_impl.h"
+
+struct udp_table 	udplite_table __read_mostly;	/* set up by udp_table_init() in udplite4_register() */
+EXPORT_SYMBOL(udplite_table);
+
+/* Designate sk as UDP-Lite socket */
+static int udplite_sk_init(struct sock *sk)
+{
+	udp_init_sock(sk);	/* socket setup is delegated to udp_init_sock() */
+	pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, "
+		     "please contact the netdev mailing list\n");
+	return 0;	/* always reports success */
+}
+
+static int udplite_rcv(struct sk_buff *skb)
+{
+	return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);	/* delegate, using the UDP-Lite table and protocol number */
+}
+
+static int udplite_err(struct sk_buff *skb, u32 info)
+{
+	return __udp4_lib_err(skb, info, &udplite_table);	/* delegate, using the UDP-Lite table */
+}
+
+static const struct net_protocol udplite_protocol = {	/* added for IPPROTO_UDPLITE in udplite4_register() */
+	.handler	= udplite_rcv,
+	.err_handler	= udplite_err,
+	.no_policy	= 1,
+};
+
+struct proto 	udplite_prot = {	/* mostly UDP's handlers; init and hash table differ */
+	.name		   = "UDP-Lite",
+	.owner		   = THIS_MODULE,
+	.close		   = udp_lib_close,
+	.connect	   = ip4_datagram_connect,
+	.disconnect	   = udp_disconnect,
+	.ioctl		   = udp_ioctl,
+	.init		   = udplite_sk_init,
+	.destroy	   = udp_destroy_sock,
+	.setsockopt	   = udp_setsockopt,
+	.getsockopt	   = udp_getsockopt,
+	.sendmsg	   = udp_sendmsg,
+	.recvmsg	   = udp_recvmsg,
+	.hash		   = udp_lib_hash,
+	.unhash		   = udp_lib_unhash,
+	.rehash		   = udp_v4_rehash,
+	.get_port	   = udp_v4_get_port,
+	/* Memory accounting points at UDP's counters. */
+	.memory_allocated  = &udp_memory_allocated,
+	.per_cpu_fw_alloc  = &udp_memory_per_cpu_fw_alloc,
+	/* The sysctl limits are UDP's as well. */
+	.sysctl_mem	   = sysctl_udp_mem,
+	.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
+	.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
+	.obj_size	   = sizeof(struct udp_sock),
+	.h.udp_table	   = &udplite_table,
+};
+EXPORT_SYMBOL(udplite_prot);
+
+static struct inet_protosw udplite4_protosw = {	/* SOCK_DGRAM + IPPROTO_UDPLITE socket switch entry */
+	.type		=  SOCK_DGRAM,
+	.protocol	=  IPPROTO_UDPLITE,
+	.prot		=  &udplite_prot,
+	.ops		=  &inet_dgram_ops,
+	.flags		=  INET_PROTOSW_PERMANENT,
+};
+
+#ifdef CONFIG_PROC_FS
+static struct udp_seq_afinfo udplite4_seq_afinfo = {	/* private data for the "udplite" proc file */
+	.family		= AF_INET,
+	.udp_table 	= &udplite_table,
+};
+
+static int __net_init udplite4_proc_init_net(struct net *net)
+{
+	/* Create the "udplite" file (mode 0444) under net->proc_net. */
+	return proc_create_net_data("udplite", 0444, net->proc_net,
+				    &udp_seq_ops, sizeof(struct udp_iter_state),
+				    &udplite4_seq_afinfo) ? 0 : -ENOMEM;
+}
+
+static void __net_exit udplite4_proc_exit_net(struct net *net)
+{
+	remove_proc_entry("udplite", net->proc_net);	/* undoes udplite4_proc_init_net() */
+}
+
+static struct pernet_operations udplite4_net_ops = {	/* per-namespace proc file setup/teardown */
+	.init = udplite4_proc_init_net,
+	.exit = udplite4_proc_exit_net,
+};
+
+static __init int udplite4_proc_init(void)
+{
+	return register_pernet_subsys(&udplite4_net_ops);	/* CONFIG_PROC_FS build: register the per-net ops */
+}
+#else
+static inline int udplite4_proc_init(void)
+{
+	return 0;	/* no CONFIG_PROC_FS: nothing to register, report success */
+}
+#endif
+
+void __init udplite4_register(void)
+{
+	udp_table_init(&udplite_table, "UDP-Lite");
+	if (proto_register(&udplite_prot, 1))
+		goto fail;
+	if (inet_add_protocol(&udplite_protocol, IPPROTO_UDPLITE) < 0) {
+		/* Undo the proto registration before reporting failure. */
+		proto_unregister(&udplite_prot);
+		goto fail;
+	}
+
+	inet_register_protosw(&udplite4_protosw);
+	/* A /proc failure is only logged; the protocol stays registered. */
+	if (udplite4_proc_init())
+		pr_err("%s: Cannot register /proc!\n", __func__);
+	return;
+
+fail:
+	pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__);
+}
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
new file mode 100644
index 0000000000..183f6dc372
--- /dev/null
+++ b/net/ipv4/xfrm4_input.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * xfrm4_input.c
+ *
+ * Changes:
+ *	YOSHIFUJI Hideaki @USAGI
+ *		Split up af-specific portion
+ *	Derek Atkins <derek@ihtfp.com>
+ *		Add Encapsulation support
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk,
+				   struct sk_buff *skb)
+{
+	return dst_input(skb);	/* @net and @sk are unused; callback passed to xfrm_trans_queue() */
+}
+
+static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
+					 struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+
+	/* An skb that carries no dst yet gets an input route lookup first. */
+	if (!skb_dst(skb)) {
+		iph = ip_hdr(skb);
+		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					 iph->tos, skb->dev))
+			goto drop;
+	}
+
+	if (!xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2))
+		return 0;
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+int xfrm4_transport_finish(struct sk_buff *skb, int async)
+{
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	struct iphdr *iph = ip_hdr(skb);
+	/* Write the protocol saved in the skb control block into the header. */
+	iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
+
+#ifndef CONFIG_NETFILTER
+	if (!async)
+		return -iph->protocol;
+#endif
+	/* Push back to the network header; refresh tot_len and the checksum. */
+	__skb_push(skb, skb->data - skb_network_header(skb));
+	iph->tot_len = htons(skb->len);
+	ip_send_check(iph);
+	/* XFRM_GRO packets return here and never reach the NF_HOOK below. */
+	if (xo && (xo->flags & XFRM_GRO)) {
+		skb_mac_header_rebuild(skb);
+		skb_reset_transport_header(skb);
+		return 0;
+	}
+	/* Everything else goes through PRE_ROUTING to xfrm4_rcv_encap_finish. */
+	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+		dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+		xfrm4_rcv_encap_finish);
+	return 0;
+}
+
+/* If it's a keepalive packet, then just eat it.
+ * If it's an encapsulated packet, then pass it to the
+ * IPsec xfrm input.
+ * Returns 0 if skb passed to xfrm or was dropped.
+ * Returns >0 if skb should be passed to UDP.
+ * Returns <0 if skb should be resubmitted (-ret is protocol)
+ */
+int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct udp_sock *up = udp_sk(sk);
+	struct udphdr *uh;
+	struct iphdr *iph;
+	int iphlen, len;
+	__u8 *udpdata;
+	__be32 *udpdata32;
+	u16 encap_type;
+	/* Read the socket's encap type once; all checks below use this copy. */
+	encap_type = READ_ONCE(up->encap_type);
+	/* if this is not encapsulated socket, then just return now */
+	if (!encap_type)
+		return 1;
+
+	/* If this is a paged skb, make sure we pull up
+	 * whatever data we need to look at. */
+	len = skb->len - sizeof(struct udphdr);	/* bytes following the UDP header */
+	if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8)))
+		return 1;
+
+	/* Now we can get the pointers */
+	uh = udp_hdr(skb);
+	udpdata = (__u8 *)uh + sizeof(struct udphdr);
+	udpdata32 = (__be32 *)udpdata;
+	/* Classify the payload; on the ESP paths 'len' becomes the strip length. */
+	switch (encap_type) {
+	default:	/* any other type is handled like UDP_ENCAP_ESPINUDP */
+	case UDP_ENCAP_ESPINUDP:
+		/* Check if this is a keepalive packet.  If so, eat it. */
+		if (len == 1 && udpdata[0] == 0xff) {
+			goto drop;
+		} else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) {
+			/* ESP Packet without Non-ESP header */
+			len = sizeof(struct udphdr);
+		} else
+			/* Must be an IKE packet.. pass it through */
+			return 1;
+		break;
+	case UDP_ENCAP_ESPINUDP_NON_IKE:
+		/* Check if this is a keepalive packet.  If so, eat it. */
+		if (len == 1 && udpdata[0] == 0xff) {
+			goto drop;
+		} else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
+			   udpdata32[0] == 0 && udpdata32[1] == 0) {
+
+			/* ESP Packet with Non-IKE marker */
+			len = sizeof(struct udphdr) + 2 * sizeof(u32);
+		} else
+			/* Must be an IKE packet.. pass it through */
+			return 1;
+		break;
+	}
+
+	/* At this point we are sure that this is an ESPinUDP packet,
+	 * so we need to remove 'len' bytes from the packet (the UDP
+	 * header and optional ESP marker bytes) and then modify the
+	 * protocol to ESP, and then call into the transform receiver.
+	 */
+	if (skb_unclone(skb, GFP_ATOMIC))
+		goto drop;
+
+	/* Now we can update and verify the packet length... */
+	iph = ip_hdr(skb);
+	iphlen = iph->ihl << 2;
+	iph->tot_len = htons(ntohs(iph->tot_len) - len);	/* written before the size check below */
+	if (skb->len < iphlen + len) {
+		/* packet is too small!?! */
+		goto drop;
+	}
+
+	/* pull the data buffer up to the ESP header and set the
+	 * transport header to point to ESP.  Keep UDP on the stack
+	 * for later.
+	 */
+	__skb_pull(skb, len);
+	skb_reset_transport_header(skb);
+
+	/* process ESP */
+	return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, encap_type);
+
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL(xfrm4_udp_encap_rcv);
+
+int xfrm4_rcv(struct sk_buff *skb)
+{
+	return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);	/* protocol from the IP header; SPI argument 0 */
+}
+EXPORT_SYMBOL(xfrm4_rcv);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
new file mode 100644
index 0000000000..3cff51ba72
--- /dev/null
+++ b/net/ipv4/xfrm4_output.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * xfrm4_output.c - Common IPsec encapsulation code for IPv4.
+ * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/if_ether.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+
+static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+#ifdef CONFIG_NETFILTER
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+	/* No xfrm state on this dst: flag the skb rerouted and use dst_output. */
+	if (!x) {
+		IPCB(skb)->flags |= IPSKB_REROUTED;
+		return dst_output(net, sk, skb);
+	}
+#endif
+
+	return xfrm_output(sk, skb);
+}
+
+int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+			    net, sk, skb, skb->dev, skb_dst(skb)->dev,
+			    __xfrm4_output,	/* continuation once the hook is done */
+			    !(IPCB(skb)->flags & IPSKB_REROUTED));	/* condition: skb not flagged rerouted */
+}
+
+void xfrm4_local_error(struct sk_buff *skb, u32 mtu)
+{
+	struct sock *sk = skb->sk;
+	struct iphdr *hdr = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb);
+
+	/* Report EMSGSIZE against the (inner, if encapsulated) destination. */
+	ip_local_error(sk, EMSGSIZE, hdr->daddr, inet_sk(sk)->inet_dport, mtu);
+}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
new file mode 100644
index 0000000000..c33bca2c38
--- /dev/null
+++ b/net/ipv4/xfrm4_policy.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * xfrm4_policy.c
+ *
+ * Changes:
+ *	Kazunori MIYAZAWA @USAGI
+ * 	YOSHIFUJI Hideaki @USAGI
+ *		Split up af-specific portion
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/inetdevice.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/l3mdev.h>
+
+static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
+					    int tos, int oif,
+					    const xfrm_address_t *saddr,
+					    const xfrm_address_t *daddr,
+					    u32 mark)
+{
+	struct rtable *route;
+
+	/* Start from an all-zero key; saddr stays 0 when none is given. */
+	memset(fl4, 0, sizeof(*fl4));
+	if (saddr)
+		fl4->saddr = saddr->a4;
+	fl4->daddr = daddr->a4;
+	fl4->flowi4_mark = mark;
+	fl4->flowi4_tos = tos;
+	fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(net, oif);
+
+	route = __ip_route_output_key(net, fl4);
+	if (IS_ERR(route))
+		return ERR_CAST(route);
+	return &route->dst;
+}
+
+static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif,
+					  const xfrm_address_t *saddr,
+					  const xfrm_address_t *daddr,
+					  u32 mark)
+{
+	struct flowi4 fl4;
+	/* The flow key is local scratch space; only the lookup result is returned. */
+	return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark);
+}
+
+static int xfrm4_get_saddr(struct net *net, int oif,
+			   xfrm_address_t *saddr, xfrm_address_t *daddr,
+			   u32 mark)
+{
+	struct flowi4 key;
+	struct dst_entry *route;
+
+	/* Look up @daddr with no source; the key's saddr is read back below. */
+	route = __xfrm4_dst_lookup(net, &key, 0, oif, NULL, daddr, mark);
+	if (IS_ERR(route))
+		return -EHOSTUNREACH;
+	saddr->a4 = key.saddr;
+	dst_release(route);
+	return 0;
+}
+
+static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
+			  const struct flowi *fl)
+{
+	struct rtable *rt = (struct rtable *)xdst->route;
+	const struct flowi4 *fl4 = &fl->u.ip4;
+	/* Fill xdst's embedded rtable from the flow, @dev and the path route. */
+	xdst->u.rt.rt_iif = fl4->flowi4_iif;
+
+	xdst->u.dst.dev = dev;
+	netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
+
+	/* Sheit... I remember I did this right. Apparently,
+	 * it was magically lost, so this code needs audit */
+	xdst->u.rt.rt_is_input = rt->rt_is_input;
+	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
+					      RTCF_LOCAL);	/* only these three flags are carried over */
+	xdst->u.rt.rt_type = rt->rt_type;
+	xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
+	xdst->u.rt.rt_gw_family = rt->rt_gw_family;
+	if (rt->rt_gw_family == AF_INET)
+		xdst->u.rt.rt_gw4 = rt->rt_gw4;
+	else if (rt->rt_gw_family == AF_INET6)
+		xdst->u.rt.rt_gw6 = rt->rt_gw6;
+	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+	xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
+	rt_add_uncached_list(&xdst->u.rt);	/* undone by rt_del_uncached_list() in xfrm4_dst_destroy() */
+
+	return 0;
+}
+
+static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			      struct sk_buff *skb, u32 mtu,
+			      bool confirm_neigh)
+{
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+	struct dst_entry *path = xdst->route;
+	/* Forward the update to the update_pmtu op of the underlying route. */
+	path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh);
+}
+
+static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
+			   struct sk_buff *skb)
+{
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+	struct dst_entry *path = xdst->route;
+	/* Forward the redirect to the redirect op of the underlying route. */
+	path->ops->redirect(path, sk, skb);
+}
+
+static void xfrm4_dst_destroy(struct dst_entry *dst)
+{
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+	/* Metrics first, then the uncached-list entry, then the xfrm part. */
+	dst_destroy_metrics_generic(dst);
+	rt_del_uncached_list(&xdst->u.rt);
+	xfrm_dst_destroy(xdst);
+}
+
+static struct dst_ops xfrm4_dst_ops_template = {	/* copied into each namespace by xfrm4_net_init() */
+	.family =		AF_INET,
+	.update_pmtu =		xfrm4_update_pmtu,
+	.redirect =		xfrm4_redirect,
+	.cow_metrics =		dst_cow_metrics_generic,
+	.destroy =		xfrm4_dst_destroy,
+	.ifdown =		xfrm_dst_ifdown,
+	.local_out =		__ip_local_out,
+	.gc_thresh =		32768,
+};
+
+static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {	/* registered for AF_INET in xfrm4_policy_init() */
+	.dst_ops =		&xfrm4_dst_ops_template,
+	.dst_lookup =		xfrm4_dst_lookup,
+	.get_saddr =		xfrm4_get_saddr,
+	.fill_dst =		xfrm4_fill_dst,
+	.blackhole_route =	ipv4_blackhole_route,
+};
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table xfrm4_policy_table[] = {	/* used directly by init_net; other namespaces copy it */
+	{
+		.procname       = "xfrm4_gc_thresh",
+		.data           = &init_net.xfrm.xfrm4_dst_ops.gc_thresh,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{ }
+};
+
+static __net_init int xfrm4_net_sysctl_init(struct net *net)
+{
+	struct ctl_table_header *hdr;
+	struct ctl_table *table = xfrm4_policy_table;
+
+	/* init_net uses the static table as-is; every other namespace gets
+	 * a private copy whose entry points at its own gc_thresh.
+	 */
+	if (!net_eq(net, &init_net)) {
+		table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL);
+		if (!table)
+			return -ENOMEM;
+
+		table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh;
+	}
+
+	hdr = register_net_sysctl_sz(net, "net/ipv4", table,
+				     ARRAY_SIZE(xfrm4_policy_table));
+	if (hdr) {
+		net->ipv4.xfrm4_hdr = hdr;
+		return 0;
+	}
+
+	/* Registration failed: free the copy, never the static table. */
+	if (!net_eq(net, &init_net))
+		kfree(table);
+	return -ENOMEM;
+}
+
+static __net_exit void xfrm4_net_sysctl_exit(struct net *net)
+{
+	struct ctl_table *table;
+	/* No header was stored for this namespace: nothing to undo. */
+	if (!net->ipv4.xfrm4_hdr)
+		return;
+	/* Fetch the table pointer before the header is unregistered. */
+	table = net->ipv4.xfrm4_hdr->ctl_table_arg;
+	unregister_net_sysctl_table(net->ipv4.xfrm4_hdr);
+	if (!net_eq(net, &init_net))	/* init_net's table is the static one */
+		kfree(table);
+}
+#else /* CONFIG_SYSCTL */
+static inline int xfrm4_net_sysctl_init(struct net *net)
+{
+	return 0;	/* !CONFIG_SYSCTL: nothing to register */
+}
+
+static inline void xfrm4_net_sysctl_exit(struct net *net)
+{	/* !CONFIG_SYSCTL: nothing to unregister */
+}
+#endif
+
+static int __net_init xfrm4_net_init(struct net *net)
+{
+	struct dst_ops *ops = &net->xfrm.xfrm4_dst_ops;
+	int err;
+
+	/* Each namespace starts from a private copy of the template ops. */
+	memcpy(ops, &xfrm4_dst_ops_template, sizeof(xfrm4_dst_ops_template));
+	err = dst_entries_init(ops);
+	if (err)
+		return err;
+	err = xfrm4_net_sysctl_init(net);
+	if (!err)
+		return 0;
+	dst_entries_destroy(ops);
+	return err;
+}
+
+static void __net_exit xfrm4_net_exit(struct net *net)
+{
+	xfrm4_net_sysctl_exit(net);	/* reverse order of xfrm4_net_init() */
+	dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
+}
+
+static struct pernet_operations __net_initdata xfrm4_net_ops = {	/* registered from xfrm4_init() */
+	.init	= xfrm4_net_init,
+	.exit	= xfrm4_net_exit,
+};
+
+static void __init xfrm4_policy_init(void)
+{
+	xfrm_policy_register_afinfo(&xfrm4_policy_afinfo, AF_INET);	/* return value, if any, is not examined */
+}
+
+void __init xfrm4_init(void)
+{
+	xfrm4_state_init();	/* state, policy and protocol glue first ... */
+	xfrm4_policy_init();
+	xfrm4_protocol_init();
+	register_pernet_subsys(&xfrm4_net_ops);	/* ... then per-net ops; the result is not checked */
+}
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
new file mode 100644
index 0000000000..b146ce88c5
--- /dev/null
+++ b/net/ipv4/xfrm4_protocol.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* xfrm4_protocol.c - Generic xfrm protocol multiplexer.
+ *
+ * Copyright (C) 2013 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Based on:
+ * net/ipv4/tunnel4.c
+ */
+
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly;	/* list heads, one per protocol */
+static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly;
+static DEFINE_MUTEX(xfrm4_protocol_mutex);	/* held by register/deregister while editing the lists */
+
+static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol)
+{
+	/* Map a protocol number to the head of its handler list, if any. */
+	if (protocol == IPPROTO_ESP)
+		return &esp4_handlers;
+	if (protocol == IPPROTO_AH)
+		return &ah4_handlers;
+
+	if (protocol == IPPROTO_COMP)
+		return &ipcomp4_handlers;
+
+	return NULL;
+}
+
+#define for_each_protocol_rcu(head, handler)		/* walk a handler list via rcu_dereference() */ \
+	for (handler = rcu_dereference(head);		\
+	     handler != NULL;				\
+	     handler = rcu_dereference(handler->next))	/* no trailing '\': the macro ends on this line */
+
+static int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+{
+	struct xfrm4_protocol __rcu **head = proto_handlers(protocol);
+	struct xfrm4_protocol *handler;
+	int ret;
+
+	if (!head)
+		return 0;
+	for_each_protocol_rcu(*head, handler) {	/* first result <= 0 ends the walk */
+		ret = handler->cb_handler(skb, err);
+		if (ret <= 0)
+			return ret;
+	}
+	return 0;
+}
+
+int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
+		    int encap_type)
+{
+	int ret;
+	struct xfrm4_protocol *handler;
+	struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr);
+	/* Offer the packet to the input handlers registered for @nexthdr. */
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+	if (!head)
+		goto out;
+
+	if (!skb_dst(skb)) {
+		const struct iphdr *iph = ip_hdr(skb);
+
+		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					 iph->tos, skb->dev))
+			goto drop;
+	}
+	/* The first handler not returning -EINVAL decides the result. */
+	for_each_protocol_rcu(*head, handler)
+		if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
+			return ret;
+
+out:
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+	/* Falls through: the skb is freed on both the 'out' and 'drop' paths. */
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_encap);
+
+static int xfrm4_esp_rcv(struct sk_buff *skb)
+{
+	struct xfrm4_protocol *handler;
+	int ret;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+	/* Any result other than -EINVAL is returned to the caller at once. */
+	for_each_protocol_rcu(esp4_handlers, handler) {
+		ret = handler->handler(skb);
+		if (ret != -EINVAL)
+			return ret;
+	}
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+	kfree_skb(skb);
+	return 0;
+}
+
+static int xfrm4_esp_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm4_protocol *handler;
+	/* Stop at the first ESP handler whose err_handler returns 0. */
+	for_each_protocol_rcu(esp4_handlers, handler)
+		if (!handler->err_handler(skb, info))
+			return 0;
+
+	return -ENOENT;	/* list empty, or every handler returned non-zero */
+}
+
+static int xfrm4_ah_rcv(struct sk_buff *skb)
+{
+	struct xfrm4_protocol *handler;
+	int ret;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+	/* Any result other than -EINVAL is returned to the caller at once. */
+	for_each_protocol_rcu(ah4_handlers, handler) {
+		ret = handler->handler(skb);
+		if (ret != -EINVAL)
+			return ret;
+	}
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+	kfree_skb(skb);
+	return 0;
+}
+
+static int xfrm4_ah_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm4_protocol *handler;
+	/* Stop at the first AH handler whose err_handler returns 0. */
+	for_each_protocol_rcu(ah4_handlers, handler)
+		if (!handler->err_handler(skb, info))
+			return 0;
+
+	return -ENOENT;	/* list empty, or every handler returned non-zero */
+}
+
+static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
+{
+	struct xfrm4_protocol *handler;
+	int ret;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+	/* Any result other than -EINVAL is returned to the caller at once. */
+	for_each_protocol_rcu(ipcomp4_handlers, handler) {
+		ret = handler->handler(skb);
+		if (ret != -EINVAL)
+			return ret;
+	}
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+	kfree_skb(skb);
+	return 0;
+}
+
+static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm4_protocol *handler;
+	/* Stop at the first IPComp handler whose err_handler returns 0. */
+	for_each_protocol_rcu(ipcomp4_handlers, handler)
+		if (!handler->err_handler(skb, info))
+			return 0;
+
+	return -ENOENT;	/* list empty, or every handler returned non-zero */
+}
+
+static const struct net_protocol esp4_protocol = {	/* returned by netproto(IPPROTO_ESP) */
+	.handler	=	xfrm4_esp_rcv,
+	.err_handler	=	xfrm4_esp_err,
+	.no_policy	=	1,
+};
+
+static const struct net_protocol ah4_protocol = {	/* returned by netproto(IPPROTO_AH) */
+	.handler	=	xfrm4_ah_rcv,
+	.err_handler	=	xfrm4_ah_err,
+	.no_policy	=	1,
+};
+
+static const struct net_protocol ipcomp4_protocol = {	/* returned by netproto(IPPROTO_COMP) */
+	.handler	=	xfrm4_ipcomp_rcv,
+	.err_handler	=	xfrm4_ipcomp_err,
+	.no_policy	=	1,
+};
+
+static const struct xfrm_input_afinfo xfrm4_input_afinfo = {	/* registered by xfrm4_protocol_init() */
+	.family		=	AF_INET,
+	.callback	=	xfrm4_rcv_cb,
+};
+
+static inline const struct net_protocol *netproto(unsigned char protocol)
+{
+	/* Map a protocol number to its net_protocol descriptor, if any. */
+	if (protocol == IPPROTO_ESP)
+		return &esp4_protocol;
+	if (protocol == IPPROTO_AH)
+		return &ah4_protocol;
+
+	if (protocol == IPPROTO_COMP)
+		return &ipcomp4_protocol;
+
+	return NULL;
+}
+
+int xfrm4_protocol_register(struct xfrm4_protocol *handler,
+			    unsigned char protocol)
+{
+	struct xfrm4_protocol __rcu **pprev;
+	struct xfrm4_protocol *t;
+	bool add_netproto = false;
+	int ret = -EEXIST;
+	int priority = handler->priority;
+	/* Insert @handler into @protocol's list, ordered by priority. */
+	if (!proto_handlers(protocol) || !netproto(protocol))
+		return -EINVAL;
+
+	mutex_lock(&xfrm4_protocol_mutex);
+	/* Empty list: inet_add_protocol() is called once the mutex is dropped. */
+	if (!rcu_dereference_protected(*proto_handlers(protocol),
+				       lockdep_is_held(&xfrm4_protocol_mutex)))
+		add_netproto = true;
+	/* Stop before the first lower priority; an equal one is -EEXIST. */
+	for (pprev = proto_handlers(protocol);
+	     (t = rcu_dereference_protected(*pprev,
+			lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t->priority < priority)
+			break;
+		if (t->priority == priority)
+			goto err;
+	}
+
+	handler->next = *pprev;
+	rcu_assign_pointer(*pprev, handler);
+
+	ret = 0;
+
+err:
+	mutex_unlock(&xfrm4_protocol_mutex);
+
+	if (add_netproto) {
+		if (inet_add_protocol(netproto(protocol), protocol)) {
+			pr_err("%s: can't add protocol\n", __func__);
+			ret = -EAGAIN;	/* the handler is left linked on this path */
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_register);
+
+int xfrm4_protocol_deregister(struct xfrm4_protocol *handler,
+			      unsigned char protocol)
+{
+	struct xfrm4_protocol __rcu **pprev;
+	struct xfrm4_protocol *t;
+	int ret = -ENOENT;
+	/* Unlink @handler from @protocol's list; -ENOENT if it is not there. */
+	if (!proto_handlers(protocol) || !netproto(protocol))
+		return -EINVAL;
+
+	mutex_lock(&xfrm4_protocol_mutex);
+
+	for (pprev = proto_handlers(protocol);
+	     (t = rcu_dereference_protected(*pprev,
+			lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t == handler) {
+			*pprev = handler->next;
+			ret = 0;
+			break;
+		}
+	}
+	/* List empty (now or already): remove the inet protocol as well. */
+	if (!rcu_dereference_protected(*proto_handlers(protocol),
+				       lockdep_is_held(&xfrm4_protocol_mutex))) {
+		if (inet_del_protocol(netproto(protocol), protocol) < 0) {
+			pr_err("%s: can't remove protocol\n", __func__);
+			ret = -EAGAIN;
+		}
+	}
+
+	mutex_unlock(&xfrm4_protocol_mutex);
+
+	synchronize_net();	/* called on every path past the -EINVAL check */
+
+	return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_deregister);
+
+void __init xfrm4_protocol_init(void)
+{
+	xfrm_input_register_afinfo(&xfrm4_input_afinfo);	/* return value, if any, is not examined */
+}
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
new file mode 100644
index 0000000000..87d4db5914
--- /dev/null
+++ b/net/ipv4/xfrm4_state.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * xfrm4_state.c
+ *
+ * Changes:
+ * 	YOSHIFUJI Hideaki @USAGI
+ * 		Split up af-specific portion
+ *
+ */
+
+#include <net/xfrm.h>
+
+static struct xfrm_state_afinfo xfrm4_state_afinfo = {
+	.family			= AF_INET,	/* IPv4 address family */
+	.proto			= IPPROTO_IPIP,
+	.output			= xfrm4_output,	/* callbacks are not defined in this file */
+	.transport_finish	= xfrm4_transport_finish,
+	.local_error		= xfrm4_local_error,
+};
+
+void __init xfrm4_state_init(void)
+{
+	xfrm_state_register_afinfo(&xfrm4_state_afinfo); /* the AF_INET table above */
+}
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
new file mode 100644
index 0000000000..8489fa1065
--- /dev/null
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* xfrm4_tunnel.c: Generic IP tunnel transformer.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <net/xfrm.h>
+#include <net/protocol.h>
+
+static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	skb_push(skb, -skb_network_offset(skb)); /* push by the negated network offset */
+	return 0;	/* always reports success; x is unused */
+}
+
+static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb)
+{
+	return ip_hdr(skb)->protocol;	/* protocol field of the header from ip_hdr(); x is unused */
+}
+
+static int ipip_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
+{
+	/* Accept only tunnel-mode states that carry no encapsulation. */
+	if (x->props.mode == XFRM_MODE_TUNNEL && !x->encap) {
+		x->props.header_len = sizeof(struct iphdr);
+		return 0;
+	}
+
+	/* Rejected: the mode check takes precedence when reporting why. */
+	if (x->props.mode != XFRM_MODE_TUNNEL)
+		NL_SET_ERR_MSG(extack, "IPv4 tunnel can only be used with tunnel mode");
+	else
+		NL_SET_ERR_MSG(extack, "IPv4 tunnel is not compatible with encapsulation");
+
+	return -EINVAL;
+}
+
+static void ipip_destroy(struct xfrm_state *x)
+{	/* intentionally empty: ipip_init_state() acquires nothing to release */
+}
+
+static const struct xfrm_type ipip_type = {
+	.owner		= THIS_MODULE,
+	.proto		= IPPROTO_IPIP,
+	.init_state	= ipip_init_state,	/* checks mode/encap, sets header_len */
+	.destructor	= ipip_destroy,		/* empty function */
+	.input		= ipip_xfrm_rcv,
+	.output		= ipip_output,
+};
+
+static int xfrm_tunnel_rcv(struct sk_buff *skb)
+{
+	return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); /* last arg: IPv4 source address */
+}
+
+static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
+{
+	return -ENOENT;	/* unconditional: skb and info are ignored */
+}
+
+static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
+	.handler	=	xfrm_tunnel_rcv,
+	.err_handler	=	xfrm_tunnel_err,
+	.priority	=	4,	/* registered for AF_INET in ipip_init() */
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
+	.handler	=	xfrm_tunnel_rcv,	/* same callbacks as xfrm_tunnel_handler */
+	.err_handler	=	xfrm_tunnel_err,
+	.priority	=	3,	/* registered for AF_INET6 in ipip_init() */
+};
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+
+static int __init ipip_init(void)
+{
+	if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
+		pr_info("%s: can't add xfrm type\n", __func__);
+		return -EAGAIN;
+	}
+	if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) {
+		pr_info("%s: can't add xfrm handler for AF_INET\n", __func__);
+		goto drop_type;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) {
+		pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__);
+		xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
+		goto drop_type;
+	}
+#endif
+	return 0;
+drop_type:	/* shared unwind for both handler failures: undo the type registration */
+	xfrm_unregister_type(&ipip_type, AF_INET);
+	return -EAGAIN;
+}
+
+static void __exit ipip_fini(void)
+{
+	/* Undo ipip_init() in reverse order; failures are only logged. */
+#if IS_ENABLED(CONFIG_IPV6)
+	if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6))
+		pr_info("%s: can't remove xfrm handler for AF_INET6\n", __func__);
+#endif
+	if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET))
+		pr_info("%s: can't remove xfrm handler for AF_INET\n", __func__);
+	/* The xfrm type, registered first in ipip_init(), is removed last. */
+	xfrm_unregister_type(&ipip_type, AF_INET);
+}
+
+module_init(ipip_init);	/* entry point: ipip_init() */
+module_exit(ipip_fini);	/* exit point: ipip_fini() */
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_IPIP);	/* presumably enables autoload by xfrm type; confirm */
-- 
cgit v1.2.3