summaryrefslogtreecommitdiffstats
path: root/src/VBox/NetworkServices/NAT/pxping.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/VBox/NetworkServices/NAT/pxping.c')
-rw-r--r--src/VBox/NetworkServices/NAT/pxping.c2001
1 files changed, 2001 insertions, 0 deletions
diff --git a/src/VBox/NetworkServices/NAT/pxping.c b/src/VBox/NetworkServices/NAT/pxping.c
new file mode 100644
index 00000000..86dc284b
--- /dev/null
+++ b/src/VBox/NetworkServices/NAT/pxping.c
@@ -0,0 +1,2001 @@
+/* $Id: pxping.c $ */
+/** @file
+ * NAT Network - ping proxy, raw sockets version.
+ */
+
+/*
+ * Copyright (C) 2013-2019 Oracle Corporation
+ *
+ * This file is part of VirtualBox Open Source Edition (OSE), as
+ * available from http://www.virtualbox.org. This file is free software;
+ * you can redistribute it and/or modify it under the terms of the GNU
+ * General Public License (GPL) as published by the Free Software
+ * Foundation, in version 2 as it comes in the "COPYING" file of the
+ * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+ * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ */
+
+#define LOG_GROUP LOG_GROUP_NAT_SERVICE
+
+#include "winutils.h"
+#include "proxy.h"
+#include "proxy_pollmgr.h"
+#include "pxremap.h"
+
+#include <iprt/string.h>
+
+#ifndef RT_OS_WINDOWS
+#include <sys/types.h>
+#include <sys/socket.h>
+#ifdef RT_OS_DARWIN
+# define __APPLE_USE_RFC_3542
+#endif
+#include <netinet/in.h>
+#include <poll.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#else
+#include <iprt/stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "winpoll.h"
+#endif
+
+#include "lwip/opt.h"
+
+#include "lwip/sys.h"
+#include "lwip/tcpip.h"
+#include "lwip/inet_chksum.h"
+#include "lwip/ip.h"
+#include "lwip/icmp.h"
+
+#if defined(RT_OS_LINUX) && !defined(__USE_GNU)
+#if __GLIBC_PREREQ(2, 8)
+/*
+ * XXX: This is gross. in6_pktinfo is now hidden behind _GNU_SOURCE
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=6775
+ *
+ * But in older glibc versions, e.g. RHEL5, it is not! I don't want
+ * to deal with _GNU_SOURCE now, so as a kludge check for glibc
+ * version. It seems the __USE_GNU guard was introduced in 2.8.
+ */
+struct in6_pktinfo {
+ struct in6_addr ipi6_addr;
+ unsigned int ipi6_ifindex;
+};
+#endif /* __GLIBC_PREREQ */
+#endif /* RT_OS_LINUX && !__USE_GNU */
+
+
+/* forward */
+struct ping_pcb;
+
+
+/**
+ * Global state for ping proxy collected in one entity to minimize
+ * globals. There's only one instance of this structure.
+ *
+ * Raw ICMP sockets are promiscuous, so it doesn't make sense to have
+ * multiple. If this code ever needs to support multiple netifs, the
+ * netif member should be exiled into "pcb".
+ */
+struct pxping {
+ SOCKET sock4;
+
+#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS)
+# define DF_WITH_IP_HDRINCL
+ int hdrincl;
+#else
+ int df;
+#endif
+ int ttl;
+ int tos;
+
+ SOCKET sock6;
+#ifdef RT_OS_WINDOWS
+ LPFN_WSARECVMSG pfWSARecvMsg6;
+#endif
+ int hopl;
+
+ struct pollmgr_handler pmhdl4;
+ struct pollmgr_handler pmhdl6;
+
+ struct netif *netif;
+
+ /**
+ * Protect lwIP and pmgr accesses to the list of pcbs.
+ */
+ sys_mutex_t lock;
+
+ /*
+ * We need to find pcbs both from the guest side and from the host
+ * side. If we need to support industrial grade ping throughput,
+ * we will need two pcb hashes. For now, a short linked list
+ * should be enough. Cf. pxping_pcb_for_request() and
+ * pxping_pcb_for_reply().
+ */
+#define PXPING_MAX_PCBS 8
+ size_t npcbs;
+ struct ping_pcb *pcbs;
+
+#define TIMEOUT 5
+ int timer_active;
+ size_t timeout_slot;
+ struct ping_pcb *timeout_list[TIMEOUT];
+};
+
+
+/**
+ * Quasi PCB for ping.
+ */
+struct ping_pcb {
+ ipX_addr_t src;
+ ipX_addr_t dst;
+
+ u8_t is_ipv6;
+ u8_t is_mapped;
+
+ u16_t guest_id;
+ u16_t host_id;
+
+ /**
+ * Desired slot in pxping::timeout_list. See pxping_timer().
+ */
+ size_t timeout_slot;
+
+ /**
+ * Chaining for pxping::timeout_list
+ */
+ struct ping_pcb **pprev_timeout;
+ struct ping_pcb *next_timeout;
+
+ /**
+ * Chaining for pxping::pcbs
+ */
+ struct ping_pcb *next;
+
+ union {
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ } peer;
+};
+
+
+/**
+ * lwIP thread callback message for IPv4 ping.
+ *
+ * We pass raw IP datagram for ip_output_if() so we only need pbuf and
+ * netif (from pxping).
+ */
+struct ping_msg {
+ struct tcpip_msg msg;
+ struct pxping *pxping;
+ struct pbuf *p;
+};
+
+
+/**
+ * lwIP thread callback message for IPv6 ping.
+ *
+ * We cannot obtain raw IPv6 datagram from host without extra trouble,
+ * so we pass ICMPv6 payload in pbuf and also other parameters to
+ * ip6_output_if().
+ */
+struct ping6_msg {
+ struct tcpip_msg msg;
+ struct pxping *pxping;
+ struct pbuf *p;
+ ip6_addr_t src, dst;
+ int hopl, tclass;
+};
+
+
+#ifdef RT_OS_WINDOWS
+static int pxping_init_windows(struct pxping *pxping);
+#endif
+static void pxping_recv4(void *arg, struct pbuf *p);
+static void pxping_recv6(void *arg, struct pbuf *p);
+
+static void pxping_timer(void *arg);
+static void pxping_timer_needed(struct pxping *pxping);
+
+static struct ping_pcb *pxping_pcb_for_request(struct pxping *pxping,
+ int is_ipv6,
+ ipX_addr_t *src, ipX_addr_t *dst,
+ u16_t guest_id);
+static struct ping_pcb *pxping_pcb_for_reply(struct pxping *pxping, int is_ipv6,
+ ipX_addr_t *dst, u16_t host_id);
+
+static FNRTSTRFORMATTYPE pxping_pcb_rtstrfmt;
+static struct ping_pcb *pxping_pcb_allocate(struct pxping *pxping);
+static void pxping_pcb_register(struct pxping *pxping, struct ping_pcb *pcb);
+static void pxping_pcb_deregister(struct pxping *pxping, struct ping_pcb *pcb);
+static void pxping_pcb_delete(struct pxping *pxping, struct ping_pcb *pcb);
+static void pxping_timeout_add(struct pxping *pxping, struct ping_pcb *pcb);
+static void pxping_timeout_del(struct pxping *pxping, struct ping_pcb *pcb);
+
+static int pxping_pmgr_pump(struct pollmgr_handler *handler, SOCKET fd, int revents);
+
+static void pxping_pmgr_icmp4(struct pxping *pxping);
+static void pxping_pmgr_icmp4_echo(struct pxping *pxping,
+ u16_t iplen, struct sockaddr_in *peer);
+static void pxping_pmgr_icmp4_error(struct pxping *pxping,
+ u16_t iplen, struct sockaddr_in *peer);
+static void pxping_pmgr_icmp6(struct pxping *pxping);
+static void pxping_pmgr_icmp6_echo(struct pxping *pxping,
+ ip6_addr_t *src, ip6_addr_t *dst,
+ int hopl, int tclass, u16_t icmplen);
+static void pxping_pmgr_icmp6_error(struct pxping *pxping,
+ ip6_addr_t *src, ip6_addr_t *dst,
+ int hopl, int tclass, u16_t icmplen);
+
+static void pxping_pmgr_forward_inbound(struct pxping *pxping, u16_t iplen);
+static void pxping_pcb_forward_inbound(void *arg);
+
+static void pxping_pmgr_forward_inbound6(struct pxping *pxping,
+ ip6_addr_t *src, ip6_addr_t *dst,
+ u8_t hopl, u8_t tclass,
+ u16_t icmplen);
+static void pxping_pcb_forward_inbound6(void *arg);
+
+/*
+ * NB: This is not documented except in RTFS.
+ *
+ * If ip_output_if() is passed dest == NULL then it treats p as
+ * complete IP packet with payload pointing to the IP header. It does
+ * not build IP header, ignores all header-related arguments, fetches
+ * real destination from the header in the pbuf and outputs pbuf to
+ * the specified netif.
+ */
+#define ip_raw_output_if(p, netif) \
+ (ip_output_if((p), NULL, NULL, 0, 0, 0, (netif)))
+
+
+
+static struct pxping g_pxping;
+
+
+err_t
+pxping_init(struct netif *netif, SOCKET sock4, SOCKET sock6)
+{
+ const int on = 1;
+ int status;
+
+ if (sock4 == INVALID_SOCKET && sock6 == INVALID_SOCKET) {
+ return ERR_VAL;
+ }
+
+ g_pxping.netif = netif;
+ sys_mutex_new(&g_pxping.lock);
+
+ g_pxping.sock4 = sock4;
+ if (g_pxping.sock4 != INVALID_SOCKET) {
+#ifdef DF_WITH_IP_HDRINCL
+ g_pxping.hdrincl = 0;
+#else
+ g_pxping.df = -1;
+#endif
+ g_pxping.ttl = -1;
+ g_pxping.tos = 0;
+
+#ifdef RT_OS_LINUX
+ {
+ const int dont = IP_PMTUDISC_DONT;
+ status = setsockopt(sock4, IPPROTO_IP, IP_MTU_DISCOVER,
+ &dont, sizeof(dont));
+ if (status != 0) {
+ DPRINTF(("IP_MTU_DISCOVER: %R[sockerr]\n", SOCKERRNO()));
+ }
+ }
+#endif /* RT_OS_LINUX */
+
+ g_pxping.pmhdl4.callback = pxping_pmgr_pump;
+ g_pxping.pmhdl4.data = (void *)&g_pxping;
+ g_pxping.pmhdl4.slot = -1;
+ pollmgr_add(&g_pxping.pmhdl4, g_pxping.sock4, POLLIN);
+
+ ping_proxy_accept(pxping_recv4, &g_pxping);
+ }
+
+ g_pxping.sock6 = sock6;
+#ifdef RT_OS_WINDOWS
+ /* we need recvmsg */
+ if (g_pxping.sock6 != INVALID_SOCKET) {
+ status = pxping_init_windows(&g_pxping);
+ if (status == SOCKET_ERROR) {
+ g_pxping.sock6 = INVALID_SOCKET;
+ /* close(sock6); */
+ }
+ }
+#endif
+ if (g_pxping.sock6 != INVALID_SOCKET) {
+ g_pxping.hopl = -1;
+
+#if !defined(IPV6_RECVPKTINFO)
+#define IPV6_RECVPKTINFO (IPV6_PKTINFO)
+#endif
+ status = setsockopt(sock6, IPPROTO_IPV6, IPV6_RECVPKTINFO,
+ (const char *)&on, sizeof(on));
+ if (status < 0) {
+ DPRINTF(("IPV6_RECVPKTINFO: %R[sockerr]\n", SOCKERRNO()));
+ /* XXX: for now this is fatal */
+ }
+
+#if !defined(IPV6_RECVHOPLIMIT)
+#define IPV6_RECVHOPLIMIT (IPV6_HOPLIMIT)
+#endif
+ status = setsockopt(sock6, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
+ (const char *)&on, sizeof(on));
+ if (status < 0) {
+ DPRINTF(("IPV6_RECVHOPLIMIT: %R[sockerr]\n", SOCKERRNO()));
+ }
+
+#ifdef IPV6_RECVTCLASS /* new in RFC 3542, there's no RFC 2292 counterpart */
+ /** @todo IPV6_RECVTCLASS */
+#endif
+
+ g_pxping.pmhdl6.callback = pxping_pmgr_pump;
+ g_pxping.pmhdl6.data = (void *)&g_pxping;
+ g_pxping.pmhdl6.slot = -1;
+ pollmgr_add(&g_pxping.pmhdl6, g_pxping.sock6, POLLIN);
+
+ ping6_proxy_accept(pxping_recv6, &g_pxping);
+ }
+
+ status = RTStrFormatTypeRegister("ping_pcb", pxping_pcb_rtstrfmt, NULL);
+ AssertRC(status);
+
+ return ERR_OK;
+}
+
+
+#ifdef RT_OS_WINDOWS
+static int
+pxping_init_windows(struct pxping *pxping)
+{
+ GUID WSARecvMsgGUID = WSAID_WSARECVMSG;
+ DWORD nread;
+ int status;
+
+ pxping->pfWSARecvMsg6 = NULL;
+ status = WSAIoctl(pxping->sock6,
+ SIO_GET_EXTENSION_FUNCTION_POINTER,
+ &WSARecvMsgGUID, sizeof(WSARecvMsgGUID),
+ &pxping->pfWSARecvMsg6, sizeof(pxping->pfWSARecvMsg6),
+ &nread,
+ NULL, NULL);
+ return status;
+}
+#endif /* RT_OS_WINDOWS */
+
+
+static u32_t
+chksum_delta_16(u16_t oval, u16_t nval)
+{
+ u32_t sum = (u16_t)~oval;
+ sum += nval;
+ return sum;
+}
+
+
+static u32_t
+chksum_update_16(u16_t *oldp, u16_t nval)
+{
+ u32_t sum = chksum_delta_16(*oldp, nval);
+ *oldp = nval;
+ return sum;
+}
+
+
+static u32_t
+chksum_delta_32(u32_t oval, u32_t nval)
+{
+ u32_t sum = ~oval;
+ sum = FOLD_U32T(sum);
+ sum += FOLD_U32T(nval);
+ return sum;
+}
+
+
+static u32_t
+chksum_update_32(u32_t *oldp, u32_t nval)
+{
+ u32_t sum = chksum_delta_32(*oldp, nval);
+ *oldp = nval;
+ return sum;
+}
+
+
+static u32_t
+chksum_delta_ipv6(const ip6_addr_t *oldp, const ip6_addr_t *newp)
+{
+ u32_t sum;
+
+ sum = chksum_delta_32(oldp->addr[0], newp->addr[0]);
+ sum += chksum_delta_32(oldp->addr[1], newp->addr[1]);
+ sum += chksum_delta_32(oldp->addr[2], newp->addr[2]);
+ sum += chksum_delta_32(oldp->addr[3], newp->addr[3]);
+
+ return sum;
+}
+
+
+static u32_t
+chksum_update_ipv6(ip6_addr_t *oldp, const ip6_addr_t *newp)
+{
+ u32_t sum;
+
+ sum = chksum_update_32(&oldp->addr[0], newp->addr[0]);
+ sum += chksum_update_32(&oldp->addr[1], newp->addr[1]);
+ sum += chksum_update_32(&oldp->addr[2], newp->addr[2]);
+ sum += chksum_update_32(&oldp->addr[3], newp->addr[3]);
+
+ return sum;
+}
+
+
+/**
+ * ICMP Echo Request in pbuf "p" is to be proxied.
+ */
+static void
+pxping_recv4(void *arg, struct pbuf *p)
+{
+ struct pxping *pxping = (struct pxping *)arg;
+ struct ping_pcb *pcb;
+#ifdef DF_WITH_IP_HDRINCL
+ struct ip_hdr iph_orig;
+#endif
+ struct icmp_echo_hdr icmph_orig;
+ struct ip_hdr *iph;
+ struct icmp_echo_hdr *icmph;
+ int df, ttl, tos;
+ u32_t sum;
+ u16_t iphlen;
+ int status;
+
+ iphlen = ip_current_header_tot_len();
+ if (iphlen != IP_HLEN) { /* we don't do options */
+ pbuf_free(p);
+ return;
+ }
+
+ iph = (/* UNCONST */ struct ip_hdr *)ip_current_header();
+ icmph = (struct icmp_echo_hdr *)p->payload;
+
+ pcb = pxping_pcb_for_request(pxping, 0,
+ ipX_current_src_addr(),
+ ipX_current_dest_addr(),
+ icmph->id);
+ if (pcb == NULL) {
+ pbuf_free(p);
+ return;
+ }
+
+ DPRINTF(("ping %p: %R[ping_pcb] seq %d len %u ttl %d\n",
+ pcb, pcb,
+ ntohs(icmph->seqno), (unsigned int)p->tot_len,
+ IPH_TTL(iph)));
+
+ ttl = IPH_TTL(iph);
+ if (!pcb->is_mapped) {
+ if (RT_UNLIKELY(ttl == 1)) {
+ status = pbuf_header(p, iphlen); /* back to IP header */
+ if (RT_LIKELY(status == 0)) {
+ icmp_time_exceeded(p, ICMP_TE_TTL);
+ }
+ pbuf_free(p);
+ return;
+ }
+ --ttl;
+ }
+
+ /*
+ * OS X doesn't provide a socket option to control fragmentation.
+ * Solaris doesn't provide IP_DONTFRAG on all releases we support.
+ * In this case we have to use IP_HDRINCL. We don't want to use
+ * it always since it doesn't handle fragmentation (but that's ok
+ * for DF) and Windows doesn't do automatic source address
+ * selection with IP_HDRINCL.
+ */
+ df = (IPH_OFFSET(iph) & PP_HTONS(IP_DF)) != 0;
+
+#ifdef DF_WITH_IP_HDRINCL
+ if (df != pxping->hdrincl) {
+ status = setsockopt(pxping->sock4, IPPROTO_IP, IP_HDRINCL,
+ &df, sizeof(df));
+ if (RT_LIKELY(status == 0)) {
+ pxping->hdrincl = df;
+ }
+ else {
+ DPRINTF(("IP_HDRINCL: %R[sockerr]\n", SOCKERRNO()));
+ }
+ }
+
+ if (pxping->hdrincl) {
+ status = pbuf_header(p, iphlen); /* back to IP header */
+ if (RT_UNLIKELY(status != 0)) {
+ pbuf_free(p);
+ return;
+ }
+
+ /* we will overwrite IP header, save original for ICMP errors */
+ memcpy(&iph_orig, iph, iphlen);
+
+ if (pcb->is_mapped) {
+ ip4_addr_set_u32(&iph->dest, pcb->peer.sin.sin_addr.s_addr);
+ }
+
+ if (g_proxy_options->src4 != NULL) {
+ ip4_addr_set_u32(&iph->src, g_proxy_options->src4->sin_addr.s_addr);
+ }
+ else {
+ /* let the kernel select suitable source address */
+ ip_addr_set_any(&iph->src);
+ }
+
+ IPH_TTL_SET(iph, ttl); /* already decremented */
+ IPH_ID_SET(iph, 0); /* kernel will set one */
+#ifdef RT_OS_DARWIN
+ /* wants ip_offset and ip_len fields in host order */
+ IPH_OFFSET_SET(iph, ntohs(IPH_OFFSET(iph)));
+ IPH_LEN_SET(iph, ntohs(IPH_LEN(iph)));
+ /* wants checksum of everything (sic!), in host order */
+ sum = inet_chksum_pbuf(p);
+ IPH_CHKSUM_SET(iph, sum);
+#else /* !RT_OS_DARWIN */
+ IPH_CHKSUM_SET(iph, 0); /* kernel will recalculate */
+#endif
+ }
+ else /* !pxping->hdrincl */
+#endif /* DF_WITH_IP_HDRINCL */
+ {
+#if !defined(DF_WITH_IP_HDRINCL)
+ /* control DF flag via setsockopt(2) */
+#define USE_DF_OPTION(_Optname) \
+ const int dfopt = _Optname; \
+ const char * const dfoptname = #_Optname; NOREF(dfoptname)
+#if defined(RT_OS_LINUX)
+ USE_DF_OPTION(IP_MTU_DISCOVER);
+ df = df ? IP_PMTUDISC_DO : IP_PMTUDISC_DONT;
+#elif defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)
+ USE_DF_OPTION(IP_DONTFRAG);
+#elif defined(RT_OS_WINDOWS)
+ USE_DF_OPTION(IP_DONTFRAGMENT);
+#endif
+ if (df != pxping->df) {
+ status = setsockopt(pxping->sock4, IPPROTO_IP, dfopt,
+ (char *)&df, sizeof(df));
+ if (RT_LIKELY(status == 0)) {
+ pxping->df = df;
+ }
+ else {
+ DPRINTF(("%s: %R[sockerr]\n", dfoptname, SOCKERRNO()));
+ }
+ }
+#endif /* !DF_WITH_IP_HDRINCL */
+
+ if (ttl != pxping->ttl) {
+ status = setsockopt(pxping->sock4, IPPROTO_IP, IP_TTL,
+ (char *)&ttl, sizeof(ttl));
+ if (RT_LIKELY(status == 0)) {
+ pxping->ttl = ttl;
+ }
+ else {
+ DPRINTF(("IP_TTL: %R[sockerr]\n", SOCKERRNO()));
+ }
+ }
+
+ tos = IPH_TOS(iph);
+ if (tos != pxping->tos) {
+ status = setsockopt(pxping->sock4, IPPROTO_IP, IP_TOS,
+ (char *)&tos, sizeof(tos));
+ if (RT_LIKELY(status == 0)) {
+ pxping->tos = tos;
+ }
+ else {
+ DPRINTF(("IP_TOS: %R[sockerr]\n", SOCKERRNO()));
+ }
+ }
+ }
+
+ /* rewrite ICMP echo header */
+ memcpy(&icmph_orig, icmph, sizeof(*icmph));
+ sum = (u16_t)~icmph->chksum;
+ sum += chksum_update_16(&icmph->id, pcb->host_id);
+ sum = FOLD_U32T(sum);
+ icmph->chksum = ~sum;
+
+ status = proxy_sendto(pxping->sock4, p,
+ &pcb->peer.sin, sizeof(pcb->peer.sin));
+ if (status != 0) {
+ int error = -status;
+ DPRINTF(("%s: sendto: %R[sockerr]\n", __func__, error));
+
+#ifdef DF_WITH_IP_HDRINCL
+ if (pxping->hdrincl) {
+ /* restore original IP header */
+ memcpy(iph, &iph_orig, iphlen);
+ }
+ else
+#endif
+ {
+ status = pbuf_header(p, iphlen); /* back to IP header */
+ if (RT_UNLIKELY(status != 0)) {
+ pbuf_free(p);
+ return;
+ }
+ }
+
+ /* restore original ICMP header */
+ memcpy(icmph, &icmph_orig, sizeof(*icmph));
+
+ /*
+ * Some ICMP errors may be generated by the kernel and we read
+ * them from the socket and forward them normally, hence the
+ * ifdefs below.
+ */
+ switch (error) {
+
+#if !( defined(RT_OS_SOLARIS) \
+ || (defined(RT_OS_LINUX) && !defined(DF_WITH_IP_HDRINCL)) \
+ )
+ case EMSGSIZE:
+ icmp_dest_unreach(p, ICMP_DUR_FRAG);
+ break;
+#endif
+
+ case ENETDOWN:
+ case ENETUNREACH:
+ icmp_dest_unreach(p, ICMP_DUR_NET);
+ break;
+
+ case EHOSTDOWN:
+ case EHOSTUNREACH:
+ icmp_dest_unreach(p, ICMP_DUR_HOST);
+ break;
+ }
+ }
+
+ pbuf_free(p);
+}
+
+
+/**
+ * ICMPv6 Echo Request in pbuf "p" is to be proxied.
+ */
+static void
+pxping_recv6(void *arg, struct pbuf *p)
+{
+ struct pxping *pxping = (struct pxping *)arg;
+ struct ping_pcb *pcb;
+ struct ip6_hdr *iph;
+ struct icmp6_echo_hdr *icmph;
+ int hopl;
+ u16_t iphlen;
+ u16_t id, seq;
+ int status;
+
+ iph = (/* UNCONST */ struct ip6_hdr *)ip6_current_header();
+ iphlen = ip_current_header_tot_len();
+
+ icmph = (struct icmp6_echo_hdr *)p->payload;
+
+ id = icmph->id;
+ seq = icmph->seqno;
+
+ pcb = pxping_pcb_for_request(pxping, 1,
+ ipX_current_src_addr(),
+ ipX_current_dest_addr(),
+ id);
+ if (pcb == NULL) {
+ pbuf_free(p);
+ return;
+ }
+
+ DPRINTF(("ping %p: %R[ping_pcb] seq %d len %u hopl %d\n",
+ pcb, pcb,
+ ntohs(seq), (unsigned int)p->tot_len,
+ IP6H_HOPLIM(iph)));
+
+ hopl = IP6H_HOPLIM(iph);
+ if (!pcb->is_mapped) {
+ if (hopl == 1) {
+ status = pbuf_header(p, iphlen); /* back to IP header */
+ if (RT_LIKELY(status == 0)) {
+ icmp6_time_exceeded(p, ICMP6_TE_HL);
+ }
+ pbuf_free(p);
+ return;
+ }
+ --hopl;
+ }
+
+ /*
+ * Rewrite ICMPv6 echo header. We don't need to recompute the
+ * checksum since, unlike IPv4, checksum includes pseudo-header.
+ * OS computes checksum for us on send() since it needs to select
+ * source address.
+ */
+ icmph->id = pcb->host_id;
+
+ /** @todo use control messages to save a syscall? */
+ if (hopl != pxping->hopl) {
+ status = setsockopt(pxping->sock6, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
+ (char *)&hopl, sizeof(hopl));
+ if (status == 0) {
+ pxping->hopl = hopl;
+ }
+ else {
+ DPRINTF(("IPV6_HOPLIMIT: %R[sockerr]\n", SOCKERRNO()));
+ }
+ }
+
+ status = proxy_sendto(pxping->sock6, p,
+ &pcb->peer.sin6, sizeof(pcb->peer.sin6));
+ if (status != 0) {
+ int error = -status;
+ DPRINTF(("%s: sendto: %R[sockerr]\n", __func__, error));
+
+ status = pbuf_header(p, iphlen); /* back to IP header */
+ if (RT_UNLIKELY(status != 0)) {
+ pbuf_free(p);
+ return;
+ }
+
+ /* restore original ICMP header */
+ icmph->id = pcb->guest_id;
+
+ switch (error) {
+ case EACCES:
+ icmp6_dest_unreach(p, ICMP6_DUR_PROHIBITED);
+ break;
+
+#ifdef ENONET
+ case ENONET:
+#endif
+ case ENETDOWN:
+ case ENETUNREACH:
+ case EHOSTDOWN:
+ case EHOSTUNREACH:
+ icmp6_dest_unreach(p, ICMP6_DUR_NO_ROUTE);
+ break;
+ }
+ }
+
+ pbuf_free(p);
+}
+
+
+/**
+ * Formatter for %R[ping_pcb].
+ */
+static DECLCALLBACK(size_t)
+pxping_pcb_rtstrfmt(PFNRTSTROUTPUT pfnOutput, void *pvArgOutput,
+ const char *pszType, const void *pvValue,
+ int cchWidth, int cchPrecision, unsigned int fFlags,
+ void *pvUser)
+{
+ const struct ping_pcb *pcb = (const struct ping_pcb *)pvValue;
+ size_t cb = 0;
+
+ NOREF(cchWidth);
+ NOREF(cchPrecision);
+ NOREF(fFlags);
+ NOREF(pvUser);
+
+ AssertReturn(strcmp(pszType, "ping_pcb") == 0, 0);
+
+ if (pcb == NULL) {
+ return RTStrFormat(pfnOutput, pvArgOutput, NULL, NULL, "(null)");
+ }
+
+ /* XXX: %RTnaipv4 takes the value, but %RTnaipv6 takes the pointer */
+ if (pcb->is_ipv6) {
+ cb += RTStrFormat(pfnOutput, pvArgOutput, NULL, NULL,
+ "%RTnaipv6 -> %RTnaipv6", &pcb->src, &pcb->dst);
+ if (pcb->is_mapped) {
+ cb += RTStrFormat(pfnOutput, pvArgOutput, NULL, NULL,
+ " (%RTnaipv6)", &pcb->peer.sin6.sin6_addr);
+ }
+ }
+ else {
+ cb += RTStrFormat(pfnOutput, pvArgOutput, NULL, NULL,
+ "%RTnaipv4 -> %RTnaipv4",
+ ip4_addr_get_u32(ipX_2_ip(&pcb->src)),
+ ip4_addr_get_u32(ipX_2_ip(&pcb->dst)));
+ if (pcb->is_mapped) {
+ cb += RTStrFormat(pfnOutput, pvArgOutput, NULL, NULL,
+ " (%RTnaipv4)", pcb->peer.sin.sin_addr.s_addr);
+ }
+ }
+
+ cb += RTStrFormat(pfnOutput, pvArgOutput, NULL, NULL,
+ " id %04x->%04x", ntohs(pcb->guest_id), ntohs(pcb->host_id));
+
+ return cb;
+}
+
+
+static struct ping_pcb *
+pxping_pcb_allocate(struct pxping *pxping)
+{
+ struct ping_pcb *pcb;
+
+ if (pxping->npcbs >= PXPING_MAX_PCBS) {
+ return NULL;
+ }
+
+ pcb = (struct ping_pcb *)malloc(sizeof(*pcb));
+ if (pcb == NULL) {
+ return NULL;
+ }
+
+ ++pxping->npcbs;
+ return pcb;
+}
+
+
+static void
+pxping_pcb_delete(struct pxping *pxping, struct ping_pcb *pcb)
+{
+ LWIP_ASSERT1(pxping->npcbs > 0);
+ LWIP_ASSERT1(pcb->next == NULL);
+ LWIP_ASSERT1(pcb->pprev_timeout == NULL);
+
+ DPRINTF(("%s: ping %p\n", __func__, (void *)pcb));
+
+ --pxping->npcbs;
+ free(pcb);
+}
+
+
+static void
+pxping_timeout_add(struct pxping *pxping, struct ping_pcb *pcb)
+{
+ struct ping_pcb **chain;
+
+ LWIP_ASSERT1(pcb->pprev_timeout == NULL);
+
+ chain = &pxping->timeout_list[pcb->timeout_slot];
+ if ((pcb->next_timeout = *chain) != NULL) {
+ (*chain)->pprev_timeout = &pcb->next_timeout;
+ }
+ *chain = pcb;
+ pcb->pprev_timeout = chain;
+}
+
+
+static void
+pxping_timeout_del(struct pxping *pxping, struct ping_pcb *pcb)
+{
+ LWIP_UNUSED_ARG(pxping);
+
+ LWIP_ASSERT1(pcb->pprev_timeout != NULL);
+ if (pcb->next_timeout != NULL) {
+ pcb->next_timeout->pprev_timeout = pcb->pprev_timeout;
+ }
+ *pcb->pprev_timeout = pcb->next_timeout;
+ pcb->pprev_timeout = NULL;
+ pcb->next_timeout = NULL;
+}
+
+
+static void
+pxping_pcb_register(struct pxping *pxping, struct ping_pcb *pcb)
+{
+ pcb->next = pxping->pcbs;
+ pxping->pcbs = pcb;
+
+ pxping_timeout_add(pxping, pcb);
+}
+
+
+static void
+pxping_pcb_deregister(struct pxping *pxping, struct ping_pcb *pcb)
+{
+ struct ping_pcb **p;
+
+ for (p = &pxping->pcbs; *p != NULL; p = &(*p)->next) {
+ if (*p == pcb) {
+ *p = pcb->next;
+ pcb->next = NULL;
+ break;
+ }
+ }
+
+ pxping_timeout_del(pxping, pcb);
+}
+
+
+static struct ping_pcb *
+pxping_pcb_for_request(struct pxping *pxping,
+ int is_ipv6, ipX_addr_t *src, ipX_addr_t *dst,
+ u16_t guest_id)
+{
+ struct ping_pcb *pcb;
+
+ /* on lwip thread, so no concurrent updates */
+ for (pcb = pxping->pcbs; pcb != NULL; pcb = pcb->next) {
+ if (pcb->guest_id == guest_id
+ && pcb->is_ipv6 == is_ipv6
+ && ipX_addr_cmp(is_ipv6, &pcb->dst, dst)
+ && ipX_addr_cmp(is_ipv6, &pcb->src, src))
+ {
+ break;
+ }
+ }
+
+ if (pcb == NULL) {
+ int mapped;
+
+ pcb = pxping_pcb_allocate(pxping);
+ if (pcb == NULL) {
+ return NULL;
+ }
+
+ pcb->is_ipv6 = is_ipv6;
+ ipX_addr_copy(is_ipv6, pcb->src, *src);
+ ipX_addr_copy(is_ipv6, pcb->dst, *dst);
+
+ pcb->guest_id = guest_id;
+#ifdef RT_OS_WINDOWS
+# define random() (rand())
+#endif
+ pcb->host_id = random() & 0xffffUL;
+
+ pcb->pprev_timeout = NULL;
+ pcb->next_timeout = NULL;
+
+ if (is_ipv6) {
+ pcb->peer.sin6.sin6_family = AF_INET6;
+#if HAVE_SA_LEN
+ pcb->peer.sin6.sin6_len = sizeof(pcb->peer.sin6);
+#endif
+ pcb->peer.sin6.sin6_port = htons(IPPROTO_ICMPV6);
+ pcb->peer.sin6.sin6_flowinfo = 0;
+ mapped = pxremap_outbound_ip6((ip6_addr_t *)&pcb->peer.sin6.sin6_addr,
+ ipX_2_ip6(&pcb->dst));
+ }
+ else {
+ pcb->peer.sin.sin_family = AF_INET;
+#if HAVE_SA_LEN
+ pcb->peer.sin.sin_len = sizeof(pcb->peer.sin);
+#endif
+ pcb->peer.sin.sin_port = htons(IPPROTO_ICMP);
+ mapped = pxremap_outbound_ip4((ip_addr_t *)&pcb->peer.sin.sin_addr,
+ ipX_2_ip(&pcb->dst));
+ }
+
+ if (mapped == PXREMAP_FAILED) {
+ free(pcb);
+ return NULL;
+ }
+ else {
+ pcb->is_mapped = (mapped == PXREMAP_MAPPED);
+ }
+
+ pcb->timeout_slot = pxping->timeout_slot;
+
+ sys_mutex_lock(&pxping->lock);
+ pxping_pcb_register(pxping, pcb);
+ sys_mutex_unlock(&pxping->lock);
+
+ DPRINTF(("ping %p: %R[ping_pcb] - created\n", pcb, pcb));
+
+ pxping_timer_needed(pxping);
+ }
+ else {
+ /* just bump up expiration timeout lazily */
+ DPRINTF(("ping %p: %R[ping_pcb] - slot %d -> %d\n",
+ pcb, pcb,
+ (unsigned int)pcb->timeout_slot,
+ (unsigned int)pxping->timeout_slot));
+ pcb->timeout_slot = pxping->timeout_slot;
+ }
+
+ return pcb;
+}
+
+
+/**
+ * Called on pollmgr thread. Caller must do the locking since caller
+ * is going to use the returned pcb, which needs to be protected from
+ * being expired by pxping_timer() on lwip thread.
+ */
+static struct ping_pcb *
+pxping_pcb_for_reply(struct pxping *pxping,
+ int is_ipv6, ipX_addr_t *dst, u16_t host_id)
+{
+ struct ping_pcb *pcb;
+
+ for (pcb = pxping->pcbs; pcb != NULL; pcb = pcb->next) {
+ if (pcb->host_id == host_id
+ && pcb->is_ipv6 == is_ipv6
+ /* XXX: allow broadcast pings? */
+ && ipX_addr_cmp(is_ipv6, &pcb->dst, dst))
+ {
+ return pcb;
+ }
+ }
+
+ return NULL;
+}
+
+
+static void
+pxping_timer(void *arg)
+{
+ struct pxping *pxping = (struct pxping *)arg;
+ struct ping_pcb **chain, *pcb;
+
+ pxping->timer_active = 0;
+
+ /*
+ * New slot points to the list of pcbs to check for expiration.
+ */
+ LWIP_ASSERT1(pxping->timeout_slot < TIMEOUT);
+ if (++pxping->timeout_slot == TIMEOUT) {
+ pxping->timeout_slot = 0;
+ }
+
+ chain = &pxping->timeout_list[pxping->timeout_slot];
+ pcb = *chain;
+
+ /* protect from pollmgr concurrent reads */
+ sys_mutex_lock(&pxping->lock);
+
+ while (pcb != NULL) {
+ struct ping_pcb *xpcb = pcb;
+ pcb = pcb->next_timeout;
+
+ if (xpcb->timeout_slot == pxping->timeout_slot) {
+ /* expired */
+ pxping_pcb_deregister(pxping, xpcb);
+ pxping_pcb_delete(pxping, xpcb);
+ }
+ else {
+ /*
+ * If there was another request, we updated timeout_slot
+ * but delayed actually moving the pcb until now.
+ */
+ pxping_timeout_del(pxping, xpcb); /* from current slot */
+ pxping_timeout_add(pxping, xpcb); /* to new slot */
+ }
+ }
+
+ sys_mutex_unlock(&pxping->lock);
+ pxping_timer_needed(pxping);
+}
+
+
+static void
+pxping_timer_needed(struct pxping *pxping)
+{
+ if (!pxping->timer_active && pxping->pcbs != NULL) {
+ pxping->timer_active = 1;
+ sys_timeout(1 * 1000, pxping_timer, pxping);
+ }
+}
+
+
+static int
+pxping_pmgr_pump(struct pollmgr_handler *handler, SOCKET fd, int revents)
+{
+ struct pxping *pxping;
+
+ pxping = (struct pxping *)handler->data;
+ LWIP_ASSERT1(fd == pxping->sock4 || fd == pxping->sock6);
+
+ if (revents & ~(POLLIN|POLLERR)) {
+ DPRINTF0(("%s: unexpected revents 0x%x\n", __func__, revents));
+ return POLLIN;
+ }
+
+ if (revents & POLLERR) {
+ int sockerr = -1;
+ socklen_t optlen = (socklen_t)sizeof(sockerr);
+ int status;
+
+ status = getsockopt(fd, SOL_SOCKET,
+ SO_ERROR, (char *)&sockerr, &optlen);
+ if (status < 0) {
+ DPRINTF(("%s: sock %d: SO_ERROR failed: %R[sockerr]\n",
+ __func__, fd, SOCKERRNO()));
+ }
+ else {
+ DPRINTF(("%s: sock %d: %R[sockerr]\n",
+ __func__, fd, sockerr));
+ }
+ }
+
+ if ((revents & POLLIN) == 0) {
+ return POLLIN;
+ }
+
+ if (fd == pxping->sock4) {
+ pxping_pmgr_icmp4(pxping);
+ }
+ else /* fd == pxping->sock6 */ {
+ pxping_pmgr_icmp6(pxping);
+ }
+
+ return POLLIN;
+}
+
+
+/**
+ * Process incoming ICMP message for the host.
+ * NB: we will get a lot of spam here and have to sift through it.
+ */
+static void
+pxping_pmgr_icmp4(struct pxping *pxping)
+{
+ struct sockaddr_in sin;
+ socklen_t salen = sizeof(sin);
+ ssize_t nread;
+ struct ip_hdr *iph;
+ struct icmp_echo_hdr *icmph;
+ u16_t iplen, ipoff;
+
+ memset(&sin, 0, sizeof(sin));
+
+ /*
+ * Reads from raw IPv4 sockets deliver complete IP datagrams with
+ * IP header included.
+ */
+ nread = recvfrom(pxping->sock4, pollmgr_udpbuf, sizeof(pollmgr_udpbuf), 0,
+ (struct sockaddr *)&sin, &salen);
+ if (nread < 0) {
+ DPRINTF(("%s: %R[sockerr]\n", __func__, SOCKERRNO()));
+ return;
+ }
+
+ if (nread < IP_HLEN) {
+ DPRINTF2(("%s: read %d bytes, IP header truncated\n",
+ __func__, (unsigned int)nread));
+ return;
+ }
+
+ iph = (struct ip_hdr *)pollmgr_udpbuf;
+
+ /* match version */
+ if (IPH_V(iph) != 4) {
+ DPRINTF2(("%s: unexpected IP version %d\n", __func__, IPH_V(iph)));
+ return;
+ }
+
+ /* no fragmentation */
+ ipoff = IPH_OFFSET(iph);
+#if defined(RT_OS_DARWIN)
+ /* darwin reports IPH_OFFSET in host byte order */
+ ipoff = htons(ipoff);
+ IPH_OFFSET_SET(iph, ipoff);
+#endif
+ if ((ipoff & PP_HTONS(IP_OFFMASK | IP_MF)) != 0) {
+ DPRINTF2(("%s: dropping fragmented datagram (0x%04x)\n",
+ __func__, ntohs(ipoff)));
+ return;
+ }
+
+ /* no options */
+ if (IPH_HL(iph) * 4 != IP_HLEN) {
+ DPRINTF2(("%s: dropping datagram with options (IP header length %d)\n",
+ __func__, IPH_HL(iph) * 4));
+ return;
+ }
+
+ if (IPH_PROTO(iph) != IP_PROTO_ICMP) {
+ DPRINTF2(("%s: unexpected protocol %d\n", __func__, IPH_PROTO(iph)));
+ return;
+ }
+
+ iplen = IPH_LEN(iph);
+#if !defined(RT_OS_DARWIN)
+ /* darwin reports IPH_LEN in host byte order */
+ iplen = ntohs(iplen);
+#endif
+#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS)
+ /* darwin and solaris change IPH_LEN to payload length only */
+ iplen += IP_HLEN; /* we verified there are no options */
+ IPH_LEN_SET(iph, htons(iplen));
+#endif
+ if (nread < iplen) {
+ DPRINTF2(("%s: read %d bytes but total length is %d bytes\n",
+ __func__, (unsigned int)nread, (unsigned int)iplen));
+ return;
+ }
+
+ if (iplen < IP_HLEN + ICMP_HLEN) {
+ DPRINTF2(("%s: IP length %d bytes, ICMP header truncated\n",
+ __func__, iplen));
+ return;
+ }
+
+ icmph = (struct icmp_echo_hdr *)(pollmgr_udpbuf + IP_HLEN);
+ if (ICMPH_TYPE(icmph) == ICMP_ER) {
+ pxping_pmgr_icmp4_echo(pxping, iplen, &sin);
+ }
+ else if (ICMPH_TYPE(icmph) == ICMP_DUR || ICMPH_TYPE(icmph) == ICMP_TE) {
+ pxping_pmgr_icmp4_error(pxping, iplen, &sin);
+ }
+#if 1
+ else {
+ DPRINTF2(("%s: ignoring ICMP type %d\n", __func__, ICMPH_TYPE(icmph)));
+ }
+#endif
+}
+
+
+/**
+ * Check if this incoming ICMP echo reply is for one of our pings and
+ * forward it to the guest.
+ */
+static void
+pxping_pmgr_icmp4_echo(struct pxping *pxping,
+ u16_t iplen, struct sockaddr_in *peer)
+{
+ struct ip_hdr *iph;
+ struct icmp_echo_hdr *icmph;
+ u16_t id, seq;
+ ip_addr_t guest_ip, target_ip;
+ int mapped;
+ struct ping_pcb *pcb;
+ u16_t guest_id;
+ u16_t oipsum;
+ u32_t sum;
+ RT_NOREF(peer);
+
+ iph = (struct ip_hdr *)pollmgr_udpbuf;
+ icmph = (struct icmp_echo_hdr *)(pollmgr_udpbuf + IP_HLEN);
+
+ id = icmph->id;
+ seq = icmph->seqno;
+
+ DPRINTF(("<--- PING %RTnaipv4 id 0x%x seq %d\n",
+ peer->sin_addr.s_addr, ntohs(id), ntohs(seq)));
+
+ /*
+ * Is this a reply to one of our pings?
+ */
+
+ ip_addr_copy(target_ip, iph->src);
+ mapped = pxremap_inbound_ip4(&target_ip, &target_ip);
+ if (mapped == PXREMAP_FAILED) {
+ return;
+ }
+ if (mapped == PXREMAP_ASIS && IPH_TTL(iph) == 1) {
+ DPRINTF2(("%s: dropping packet with ttl 1\n", __func__));
+ return;
+ }
+
+ sys_mutex_lock(&pxping->lock);
+ pcb = pxping_pcb_for_reply(pxping, 0, ip_2_ipX(&target_ip), id);
+ if (pcb == NULL) {
+ sys_mutex_unlock(&pxping->lock);
+ DPRINTF2(("%s: no match\n", __func__));
+ return;
+ }
+
+ DPRINTF2(("%s: pcb %p\n", __func__, (void *)pcb));
+
+ /* save info before unlocking since pcb may expire */
+ ip_addr_copy(guest_ip, *ipX_2_ip(&pcb->src));
+ guest_id = pcb->guest_id;
+
+ sys_mutex_unlock(&pxping->lock);
+
+
+ /*
+ * Rewrite headers and forward to guest.
+ */
+
+ /* rewrite ICMP echo header */
+ sum = (u16_t)~icmph->chksum;
+ sum += chksum_update_16(&icmph->id, guest_id);
+ sum = FOLD_U32T(sum);
+ icmph->chksum = ~sum;
+
+ /* rewrite IP header */
+ oipsum = IPH_CHKSUM(iph);
+ if (oipsum == 0) {
+ /* Solaris doesn't compute checksum for local replies */
+ ip_addr_copy(iph->dest, guest_ip);
+ if (mapped == PXREMAP_MAPPED) {
+ ip_addr_copy(iph->src, target_ip);
+ }
+ else {
+ IPH_TTL_SET(iph, IPH_TTL(iph) - 1);
+ }
+ IPH_CHKSUM_SET(iph, inet_chksum(iph, ntohs(IPH_LEN(iph))));
+ }
+ else {
+ sum = (u16_t)~oipsum;
+ sum += chksum_update_32((u32_t *)&iph->dest,
+ ip4_addr_get_u32(&guest_ip));
+ if (mapped == PXREMAP_MAPPED) {
+ sum += chksum_update_32((u32_t *)&iph->src,
+ ip4_addr_get_u32(&target_ip));
+ }
+ else {
+ IPH_TTL_SET(iph, IPH_TTL(iph) - 1);
+ sum += PP_NTOHS(~0x0100);
+ }
+ sum = FOLD_U32T(sum);
+ IPH_CHKSUM_SET(iph, ~sum);
+ }
+
+ pxping_pmgr_forward_inbound(pxping, iplen);
+}
+
+
+/**
+ * Check if this incoming ICMP error (destination unreachable or time
+ * exceeded) is about one of our pings and forward it to the guest.
+ */
+static void
+pxping_pmgr_icmp4_error(struct pxping *pxping,
+ u16_t iplen, struct sockaddr_in *peer)
+{
+ struct ip_hdr *iph, *oiph;
+ struct icmp_echo_hdr *icmph, *oicmph;
+ u16_t oipoff, oiphlen, oiplen;
+ u16_t id, seq;
+ ip_addr_t guest_ip, target_ip, error_ip;
+ int target_mapped, error_mapped;
+ struct ping_pcb *pcb;
+ u16_t guest_id;
+ u32_t sum;
+ RT_NOREF(peer);
+
+ iph = (struct ip_hdr *)pollmgr_udpbuf;
+ icmph = (struct icmp_echo_hdr *)(pollmgr_udpbuf + IP_HLEN);
+
+ /*
+ * Inner IP datagram is not checked by the kernel and may be
+ * anything, possibly malicious.
+ */
+
+ oipoff = IP_HLEN + ICMP_HLEN;
+ oiplen = iplen - oipoff; /* NB: truncated length, not IPH_LEN(oiph) */
+ if (oiplen < IP_HLEN) {
+ DPRINTF2(("%s: original datagram truncated to %d bytes\n",
+ __func__, oiplen));
+ }
+
+ /* IP header of the original message */
+ oiph = (struct ip_hdr *)(pollmgr_udpbuf + oipoff);
+
+ /* match version */
+ if (IPH_V(oiph) != 4) {
+ DPRINTF2(("%s: unexpected IP version %d\n", __func__, IPH_V(oiph)));
+ return;
+ }
+
+ /* can't match fragments except the first one */
+ if ((IPH_OFFSET(oiph) & PP_HTONS(IP_OFFMASK)) != 0) {
+ DPRINTF2(("%s: ignoring fragment with offset %d\n",
+ __func__, ntohs(IPH_OFFSET(oiph) & PP_HTONS(IP_OFFMASK))));
+ return;
+ }
+
+ if (IPH_PROTO(oiph) != IP_PROTO_ICMP) {
+#if 0
+ /* don't spam with every "destination unreachable" in the system */
+ DPRINTF2(("%s: ignoring protocol %d\n", __func__, IPH_PROTO(oiph)));
+#endif
+ return;
+ }
+
+ oiphlen = IPH_HL(oiph) * 4;
+ if (oiplen < oiphlen + ICMP_HLEN) {
+ DPRINTF2(("%s: original datagram truncated to %d bytes\n",
+ __func__, oiplen));
+ return;
+ }
+
+ oicmph = (struct icmp_echo_hdr *)(pollmgr_udpbuf + oipoff + oiphlen);
+ if (ICMPH_TYPE(oicmph) != ICMP_ECHO) {
+ DPRINTF2(("%s: ignoring ICMP error for original ICMP type %d\n",
+ __func__, ICMPH_TYPE(oicmph)));
+ return;
+ }
+
+ id = oicmph->id;
+ seq = oicmph->seqno;
+
+ DPRINTF2(("%s: ping %RTnaipv4 id 0x%x seq %d",
+ __func__, ip4_addr_get_u32(&oiph->dest), ntohs(id), ntohs(seq)));
+ if (ICMPH_TYPE(icmph) == ICMP_DUR) {
+ DPRINTF2((" unreachable (code %d)\n", ICMPH_CODE(icmph)));
+ }
+ else {
+ DPRINTF2((" time exceeded\n"));
+ }
+
+
+ /*
+ * Is the inner (failed) datagram one of our pings?
+ */
+
+ ip_addr_copy(target_ip, oiph->dest); /* inner (failed) */
+ target_mapped = pxremap_inbound_ip4(&target_ip, &target_ip);
+ if (target_mapped == PXREMAP_FAILED) {
+ return;
+ }
+
+ sys_mutex_lock(&pxping->lock);
+ pcb = pxping_pcb_for_reply(pxping, 0, ip_2_ipX(&target_ip), id);
+ if (pcb == NULL) {
+ sys_mutex_unlock(&pxping->lock);
+ DPRINTF2(("%s: no match\n", __func__));
+ return;
+ }
+
+ DPRINTF2(("%s: pcb %p\n", __func__, (void *)pcb));
+
+ /* save info before unlocking since pcb may expire */
+ ip_addr_copy(guest_ip, *ipX_2_ip(&pcb->src));
+ guest_id = pcb->guest_id;
+
+ sys_mutex_unlock(&pxping->lock);
+
+
+ /*
+ * Rewrite both inner and outer headers and forward to guest.
+ * Note that the checksum of the outer ICMP error message is
+ * preserved by the changes we do to inner headers.
+ */
+
+ ip_addr_copy(error_ip, iph->src); /* node that reports the error */
+ error_mapped = pxremap_inbound_ip4(&error_ip, &error_ip);
+ if (error_mapped == PXREMAP_FAILED) {
+ return;
+ }
+ if (error_mapped == PXREMAP_ASIS && IPH_TTL(iph) == 1) {
+ DPRINTF2(("%s: dropping packet with ttl 1\n", __func__));
+ return;
+ }
+
+ /* rewrite inner ICMP echo header */
+ sum = (u16_t)~oicmph->chksum;
+ sum += chksum_update_16(&oicmph->id, guest_id);
+ sum = FOLD_U32T(sum);
+ oicmph->chksum = ~sum;
+
+ /* rewrite inner IP header */
+#if defined(RT_OS_DARWIN)
+ /* darwin converts inner length to host byte order too */
+ IPH_LEN_SET(oiph, htons(IPH_LEN(oiph)));
+#endif
+ sum = (u16_t)~IPH_CHKSUM(oiph);
+ sum += chksum_update_32((u32_t *)&oiph->src, ip4_addr_get_u32(&guest_ip));
+ if (target_mapped == PXREMAP_MAPPED) {
+ sum += chksum_update_32((u32_t *)&oiph->dest, ip4_addr_get_u32(&target_ip));
+ }
+ sum = FOLD_U32T(sum);
+ IPH_CHKSUM_SET(oiph, ~sum);
+
+ /* rewrite outer IP header */
+ sum = (u16_t)~IPH_CHKSUM(iph);
+ sum += chksum_update_32((u32_t *)&iph->dest, ip4_addr_get_u32(&guest_ip));
+ if (error_mapped == PXREMAP_MAPPED) {
+ sum += chksum_update_32((u32_t *)&iph->src, ip4_addr_get_u32(&error_ip));
+ }
+ else {
+ IPH_TTL_SET(iph, IPH_TTL(iph) - 1);
+ sum += PP_NTOHS(~0x0100);
+ }
+ sum = FOLD_U32T(sum);
+ IPH_CHKSUM_SET(iph, ~sum);
+
+ pxping_pmgr_forward_inbound(pxping, iplen);
+}
+
+
+/**
+ * Process incoming ICMPv6 message for the host.
+ * NB: we will get a lot of spam here and have to sift through it.
+ */
+static void
+pxping_pmgr_icmp6(struct pxping *pxping)
+{
+#ifndef RT_OS_WINDOWS
+ struct msghdr mh;
+ ssize_t nread;
+#else
+ WSAMSG mh;
+ DWORD nread;
+#endif
+ IOVEC iov[1];
+ static u8_t cmsgbuf[128];
+ struct cmsghdr *cmh;
+ struct sockaddr_in6 sin6;
+ /* socklen_t salen = sizeof(sin6); - unused */
+ struct icmp6_echo_hdr *icmph;
+ struct in6_pktinfo *pktinfo;
+ int hopl, tclass;
+#ifdef RT_OS_WINDOWS
+ int status;
+#endif
+
+ /*
+ * Reads from raw IPv6 sockets deliver only the payload. Full
+ * headers are available via recvmsg(2)/cmsg(3).
+ */
+ IOVEC_SET_BASE(iov[0], pollmgr_udpbuf);
+ IOVEC_SET_LEN(iov[0], sizeof(pollmgr_udpbuf));
+
+ memset(&mh, 0, sizeof(mh));
+#ifndef RT_OS_WINDOWS
+ mh.msg_name = &sin6;
+ mh.msg_namelen = sizeof(sin6);
+ mh.msg_iov = iov;
+ mh.msg_iovlen = 1;
+ mh.msg_control = cmsgbuf;
+ mh.msg_controllen = sizeof(cmsgbuf);
+ mh.msg_flags = 0;
+
+ nread = recvmsg(pxping->sock6, &mh, 0);
+ if (nread < 0) {
+ DPRINTF(("%s: %R[sockerr]\n", __func__, SOCKERRNO()));
+ return;
+ }
+#else /* RT_OS_WINDOWS */
+ mh.name = (LPSOCKADDR)&sin6;
+ mh.namelen = sizeof(sin6);
+ mh.lpBuffers = iov;
+ mh.dwBufferCount = 1;
+ mh.Control.buf = cmsgbuf;
+ mh.Control.len = sizeof(cmsgbuf);
+ mh.dwFlags = 0;
+
+ status = (*pxping->pfWSARecvMsg6)(pxping->sock6, &mh, &nread, NULL, NULL);
+ if (status == SOCKET_ERROR) {
+ DPRINTF2(("%s: error %d\n", __func__, WSAGetLastError()));
+ return;
+ }
+#endif
+
+ icmph = (struct icmp6_echo_hdr *)pollmgr_udpbuf;
+
+ DPRINTF2(("%s: %RTnaipv6 ICMPv6: ", __func__, &sin6.sin6_addr));
+
+ if (icmph->type == ICMP6_TYPE_EREP) {
+ DPRINTF2(("echo reply %04x %u\n",
+ (unsigned int)icmph->id, (unsigned int)icmph->seqno));
+ }
+ else { /* XXX */
+ if (icmph->type == ICMP6_TYPE_EREQ) {
+ DPRINTF2(("echo request %04x %u\n",
+ (unsigned int)icmph->id, (unsigned int)icmph->seqno));
+ }
+ else if (icmph->type == ICMP6_TYPE_DUR) {
+ DPRINTF2(("destination unreachable\n"));
+ }
+ else if (icmph->type == ICMP6_TYPE_PTB) {
+ DPRINTF2(("packet too big\n"));
+ }
+ else if (icmph->type == ICMP6_TYPE_TE) {
+ DPRINTF2(("time exceeded\n"));
+ }
+ else if (icmph->type == ICMP6_TYPE_PP) {
+ DPRINTF2(("parameter problem\n"));
+ }
+ else {
+ DPRINTF2(("type %d len %u\n", icmph->type, (unsigned int)nread));
+ }
+
+ if (icmph->type >= ICMP6_TYPE_EREQ) {
+ return; /* informational message */
+ }
+ }
+
+ pktinfo = NULL;
+ hopl = -1;
+ tclass = -1;
+ for (cmh = CMSG_FIRSTHDR(&mh); cmh != NULL; cmh = CMSG_NXTHDR(&mh, cmh)) {
+ if (cmh->cmsg_len == 0)
+ break;
+
+ if (cmh->cmsg_level == IPPROTO_IPV6
+ && cmh->cmsg_type == IPV6_HOPLIMIT
+ && cmh->cmsg_len == CMSG_LEN(sizeof(int)))
+ {
+ hopl = *(int *)CMSG_DATA(cmh);
+ DPRINTF2(("hoplimit = %d\n", hopl));
+ }
+
+ if (cmh->cmsg_level == IPPROTO_IPV6
+ && cmh->cmsg_type == IPV6_PKTINFO
+ && cmh->cmsg_len == CMSG_LEN(sizeof(struct in6_pktinfo)))
+ {
+ pktinfo = (struct in6_pktinfo *)CMSG_DATA(cmh);
+ DPRINTF2(("pktinfo found\n"));
+ }
+ }
+
+ if (pktinfo == NULL) {
+ /*
+ * ip6_output_if() doesn't do checksum for us so we need to
+ * manually recompute it - for this we must know the
+ * destination address of the pseudo-header that we will
+ * rewrite with guest's address. (TODO: yeah, yeah, we can
+ * compute it from scratch...)
+ */
+ DPRINTF2(("%s: unable to get pktinfo\n", __func__));
+ return;
+ }
+
+ if (hopl < 0) {
+ hopl = LWIP_ICMP6_HL;
+ }
+
+ if (icmph->type == ICMP6_TYPE_EREP) {
+ pxping_pmgr_icmp6_echo(pxping,
+ (ip6_addr_t *)&sin6.sin6_addr,
+ (ip6_addr_t *)&pktinfo->ipi6_addr,
+ hopl, tclass, (u16_t)nread);
+ }
+ else if (icmph->type < ICMP6_TYPE_EREQ) {
+ pxping_pmgr_icmp6_error(pxping,
+ (ip6_addr_t *)&sin6.sin6_addr,
+ (ip6_addr_t *)&pktinfo->ipi6_addr,
+ hopl, tclass, (u16_t)nread);
+ }
+}
+
+
+/**
+ * Check if this incoming ICMPv6 echo reply is for one of our pings
+ * and forward it to the guest.
+ */
+static void
+pxping_pmgr_icmp6_echo(struct pxping *pxping,
+ ip6_addr_t *src, ip6_addr_t *dst,
+ int hopl, int tclass, u16_t icmplen)
+{
+ struct icmp6_echo_hdr *icmph;
+ ip6_addr_t guest_ip, target_ip;
+ int mapped;
+ struct ping_pcb *pcb;
+ u16_t id, guest_id;
+ u32_t sum;
+
+ ip6_addr_copy(target_ip, *src);
+ mapped = pxremap_inbound_ip6(&target_ip, &target_ip);
+ if (mapped == PXREMAP_FAILED) {
+ return;
+ }
+ else if (mapped == PXREMAP_ASIS) {
+ if (hopl == 1) {
+ DPRINTF2(("%s: dropping packet with ttl 1\n", __func__));
+ return;
+ }
+ --hopl;
+ }
+
+ icmph = (struct icmp6_echo_hdr *)pollmgr_udpbuf;
+ id = icmph->id;
+
+ sys_mutex_lock(&pxping->lock);
+ pcb = pxping_pcb_for_reply(pxping, 1, ip6_2_ipX(&target_ip), id);
+ if (pcb == NULL) {
+ sys_mutex_unlock(&pxping->lock);
+ DPRINTF2(("%s: no match\n", __func__));
+ return;
+ }
+
+ DPRINTF2(("%s: pcb %p\n", __func__, (void *)pcb));
+
+ /* save info before unlocking since pcb may expire */
+ ip6_addr_copy(guest_ip, *ipX_2_ip6(&pcb->src));
+ guest_id = pcb->guest_id;
+
+ sys_mutex_unlock(&pxping->lock);
+
+ /* rewrite ICMPv6 echo header */
+ sum = (u16_t)~icmph->chksum;
+ sum += chksum_update_16(&icmph->id, guest_id);
+ sum += chksum_delta_ipv6(dst, &guest_ip); /* pseudo */
+ if (mapped) {
+ sum += chksum_delta_ipv6(src, &target_ip); /* pseudo */
+ }
+ sum = FOLD_U32T(sum);
+ icmph->chksum = ~sum;
+
+ pxping_pmgr_forward_inbound6(pxping,
+ &target_ip, /* echo reply src */
+ &guest_ip, /* echo reply dst */
+ hopl, tclass, icmplen);
+}
+
+
+/**
+ * Check if this incoming ICMPv6 error is about one of our pings and
+ * forward it to the guest.
+ */
+static void
+pxping_pmgr_icmp6_error(struct pxping *pxping,
+ ip6_addr_t *src, ip6_addr_t *dst,
+ int hopl, int tclass, u16_t icmplen)
+{
+ struct icmp6_hdr *icmph;
+ u8_t *bufptr;
+ size_t buflen, hlen;
+ int proto;
+ struct ip6_hdr *oiph;
+ struct icmp6_echo_hdr *oicmph;
+ struct ping_pcb *pcb;
+ ip6_addr_t guest_ip, target_ip, error_ip;
+ int target_mapped, error_mapped;
+ u16_t guest_id;
+ u32_t sum;
+
+ icmph = (struct icmp6_hdr *)pollmgr_udpbuf;
+
+ /*
+ * Inner IP datagram is not checked by the kernel and may be
+ * anything, possibly malicious.
+ */
+ oiph = NULL;
+ oicmph = NULL;
+
+ bufptr = pollmgr_udpbuf;
+ buflen = icmplen;
+
+ hlen = sizeof(*icmph);
+ proto = IP6_NEXTH_ENCAPS; /* i.e. IPv6, lwIP's name is unfortuate */
+ for (;;) {
+ if (hlen > buflen) {
+ DPRINTF2(("truncated datagram inside ICMPv6 error message is too short\n"));
+ return;
+ }
+ buflen -= hlen;
+ bufptr += hlen;
+
+ if (proto == IP6_NEXTH_ENCAPS && oiph == NULL) { /* outermost IPv6 */
+ oiph = (struct ip6_hdr *)bufptr;
+ if (IP6H_V(oiph) != 6) {
+ DPRINTF2(("%s: unexpected IP version %d\n", __func__, IP6H_V(oiph)));
+ return;
+ }
+
+ proto = IP6H_NEXTH(oiph);
+ hlen = IP6_HLEN;
+ }
+ else if (proto == IP6_NEXTH_ICMP6) {
+ oicmph = (struct icmp6_echo_hdr *)bufptr;
+ break;
+ }
+ else if (proto == IP6_NEXTH_ROUTING
+ || proto == IP6_NEXTH_HOPBYHOP
+ || proto == IP6_NEXTH_DESTOPTS)
+ {
+ proto = bufptr[0];
+ hlen = (bufptr[1] + 1) * 8;
+ }
+ else {
+ DPRINTF2(("%s: stopping at protocol %d\n", __func__, proto));
+ break;
+ }
+ }
+
+ if (oiph == NULL || oicmph == NULL) {
+ return;
+ }
+
+ if (buflen < sizeof(*oicmph)) {
+ DPRINTF2(("%s: original ICMPv6 is truncated too short\n", __func__));
+ return;
+ }
+
+ if (oicmph->type != ICMP6_TYPE_EREQ) {
+ DPRINTF2(("%s: ignoring original ICMPv6 type %d\n", __func__, oicmph->type));
+ return;
+ }
+
+ ip6_addr_copy(target_ip, oiph->dest); /* inner (failed) */
+ target_mapped = pxremap_inbound_ip6(&target_ip, &target_ip);
+ if (target_mapped == PXREMAP_FAILED) {
+ return;
+ }
+
+ sys_mutex_lock(&pxping->lock);
+ pcb = pxping_pcb_for_reply(pxping, 1, ip6_2_ipX(&target_ip), oicmph->id);
+ if (pcb == NULL) {
+ sys_mutex_unlock(&pxping->lock);
+ DPRINTF2(("%s: no match\n", __func__));
+ return;
+ }
+
+ DPRINTF2(("%s: pcb %p\n", __func__, (void *)pcb));
+
+ /* save info before unlocking since pcb may expire */
+ ip6_addr_copy(guest_ip, *ipX_2_ip6(&pcb->src));
+ guest_id = pcb->guest_id;
+
+ sys_mutex_unlock(&pxping->lock);
+
+
+ /*
+ * Rewrite inner and outer headers and forward to guest. Note
+ * that IPv6 has no IP header checksum, but uses pseudo-header for
+ * ICMPv6, so we update both in one go, adjusting ICMPv6 checksum
+ * as we rewrite IP header.
+ */
+
+ ip6_addr_copy(error_ip, *src); /* node that reports the error */
+ error_mapped = pxremap_inbound_ip6(&error_ip, &error_ip);
+ if (error_mapped == PXREMAP_FAILED) {
+ return;
+ }
+ if (error_mapped == PXREMAP_ASIS && hopl == 1) {
+ DPRINTF2(("%s: dropping packet with ttl 1\n", __func__));
+ return;
+ }
+
+ /* rewrite inner ICMPv6 echo header and inner IPv6 header */
+ sum = (u16_t)~oicmph->chksum;
+ sum += chksum_update_16(&oicmph->id, guest_id);
+ sum += chksum_update_ipv6((ip6_addr_t *)&oiph->src, &guest_ip);
+ if (target_mapped) {
+ sum += chksum_delta_ipv6((ip6_addr_t *)&oiph->dest, &target_ip);
+ }
+ sum = FOLD_U32T(sum);
+ oicmph->chksum = ~sum;
+
+ /* rewrite outer ICMPv6 error header */
+ sum = (u16_t)~icmph->chksum;
+ sum += chksum_delta_ipv6(dst, &guest_ip); /* pseudo */
+ if (error_mapped) {
+ sum += chksum_delta_ipv6(src, &error_ip); /* pseudo */
+ }
+ sum = FOLD_U32T(sum);
+ icmph->chksum = ~sum;
+
+ pxping_pmgr_forward_inbound6(pxping,
+ &error_ip, /* error src */
+ &guest_ip, /* error dst */
+ hopl, tclass, icmplen);
+}
+
+
+/**
+ * Hand off ICMP datagram to the lwip thread where it will be
+ * forwarded to the guest.
+ *
+ * We no longer need ping_pcb. The pcb may get expired on the lwip
+ * thread, but we have already patched necessary information into the
+ * datagram.
+ */
+static void
+pxping_pmgr_forward_inbound(struct pxping *pxping, u16_t iplen)
+{
+ struct pbuf *p;
+ struct ping_msg *msg;
+ err_t error;
+
+ p = pbuf_alloc(PBUF_LINK, iplen, PBUF_RAM);
+ if (p == NULL) {
+ DPRINTF(("%s: pbuf_alloc(%d) failed\n",
+ __func__, (unsigned int)iplen));
+ return;
+ }
+
+ error = pbuf_take(p, pollmgr_udpbuf, iplen);
+ if (error != ERR_OK) {
+ DPRINTF(("%s: pbuf_take(%d) failed\n",
+ __func__, (unsigned int)iplen));
+ pbuf_free(p);
+ return;
+ }
+
+ msg = (struct ping_msg *)malloc(sizeof(*msg));
+ if (msg == NULL) {
+ pbuf_free(p);
+ return;
+ }
+
+ msg->msg.type = TCPIP_MSG_CALLBACK_STATIC;
+ msg->msg.sem = NULL;
+ msg->msg.msg.cb.function = pxping_pcb_forward_inbound;
+ msg->msg.msg.cb.ctx = (void *)msg;
+
+ msg->pxping = pxping;
+ msg->p = p;
+
+ proxy_lwip_post(&msg->msg);
+}
+
+
+static void
+pxping_pcb_forward_inbound(void *arg)
+{
+ struct ping_msg *msg = (struct ping_msg *)arg;
+ err_t error;
+
+ LWIP_ASSERT1(msg != NULL);
+ LWIP_ASSERT1(msg->pxping != NULL);
+ LWIP_ASSERT1(msg->p != NULL);
+
+ error = ip_raw_output_if(msg->p, msg->pxping->netif);
+ if (error != ERR_OK) {
+ DPRINTF(("%s: ip_output_if: %s\n",
+ __func__, proxy_lwip_strerr(error)));
+ }
+ pbuf_free(msg->p);
+ free(msg);
+}
+
+
+static void
+pxping_pmgr_forward_inbound6(struct pxping *pxping,
+ ip6_addr_t *src, ip6_addr_t *dst,
+ u8_t hopl, u8_t tclass,
+ u16_t icmplen)
+{
+ struct pbuf *p;
+ struct ping6_msg *msg;
+
+ err_t error;
+
+ p = pbuf_alloc(PBUF_IP, icmplen, PBUF_RAM);
+ if (p == NULL) {
+ DPRINTF(("%s: pbuf_alloc(%d) failed\n",
+ __func__, (unsigned int)icmplen));
+ return;
+ }
+
+ error = pbuf_take(p, pollmgr_udpbuf, icmplen);
+ if (error != ERR_OK) {
+ DPRINTF(("%s: pbuf_take(%d) failed\n",
+ __func__, (unsigned int)icmplen));
+ pbuf_free(p);
+ return;
+ }
+
+ msg = (struct ping6_msg *)malloc(sizeof(*msg));
+ if (msg == NULL) {
+ pbuf_free(p);
+ return;
+ }
+
+ msg->msg.type = TCPIP_MSG_CALLBACK_STATIC;
+ msg->msg.sem = NULL;
+ msg->msg.msg.cb.function = pxping_pcb_forward_inbound6;
+ msg->msg.msg.cb.ctx = (void *)msg;
+
+ msg->pxping = pxping;
+ msg->p = p;
+ ip6_addr_copy(msg->src, *src);
+ ip6_addr_copy(msg->dst, *dst);
+ msg->hopl = hopl;
+ msg->tclass = tclass;
+
+ proxy_lwip_post(&msg->msg);
+}
+
+
+static void
+pxping_pcb_forward_inbound6(void *arg)
+{
+ struct ping6_msg *msg = (struct ping6_msg *)arg;
+ err_t error;
+
+ LWIP_ASSERT1(msg != NULL);
+ LWIP_ASSERT1(msg->pxping != NULL);
+ LWIP_ASSERT1(msg->p != NULL);
+
+ error = ip6_output_if(msg->p,
+ &msg->src, &msg->dst, msg->hopl, msg->tclass,
+ IP6_NEXTH_ICMP6, msg->pxping->netif);
+ if (error != ERR_OK) {
+ DPRINTF(("%s: ip6_output_if: %s\n",
+ __func__, proxy_lwip_strerr(error)));
+ }
+ pbuf_free(msg->p);
+ free(msg);
+}