/* $Id: socket.c $ */ /** @file * NAT - socket handling. */ /* * Copyright (C) 2006-2023 Oracle and/or its affiliates. * * This file is part of VirtualBox base platform packages, as * available from https://www.virtualbox.org. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, in version 3 of the * License. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see . * * SPDX-License-Identifier: GPL-3.0-only */ /* * This code is based on: * * Copyright (c) 1995 Danny Gasparovski. * * Please read the file COPYRIGHT for the * terms and conditions of the copyright. */ #include #include "ip_icmp.h" #include "main.h" #ifdef __sun__ #include #endif #include #if defined (RT_OS_WINDOWS) #include #include #endif #include #if defined(DECLARE_IOVEC) && defined(RT_OS_WINDOWS) AssertCompileMembersSameSizeAndOffset(struct iovec, iov_base, WSABUF, buf); AssertCompileMembersSameSizeAndOffset(struct iovec, iov_len, WSABUF, len); #endif #ifdef VBOX_WITH_NAT_SEND2HOME DECLINLINE(bool) slirpSend2Home(PNATState pData, struct socket *pSo, const void *pvBuf, uint32_t cbBuf, int iFlags) { int idxAddr; int ret = 0; bool fSendDone = false; LogFlowFunc(("Enter pSo:%R[natsock] pvBuf: %p, cbBuf: %d, iFlags: %d\n", pSo, pvBuf, cbBuf, iFlags)); for (idxAddr = 0; idxAddr < pData->cInHomeAddressSize; ++idxAddr) { struct socket *pNewSocket = soCloneUDPSocketWithForegnAddr(pData, pSo, pData->pInSockAddrHomeAddress[idxAddr].sin_addr); AssertReturn((pNewSocket, false)); pData->pInSockAddrHomeAddress[idxAddr].sin_port = pSo->so_fport; /** @todo more verbose on errors, * @note: we shouldn't care if this send fail or not (we're in broadcast). */ LogFunc(("send %d bytes to %RTnaipv4 from %R[natsock]\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr, pNewSocket)); ret = sendto(pNewSocket->s, pvBuf, cbBuf, iFlags, (struct sockaddr *)&pData->pInSockAddrHomeAddress[idxAddr], sizeof(struct sockaddr_in)); if (ret < 0) LogFunc(("Failed to send %d bytes to %RTnaipv4\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr)); fSendDone |= ret > 0; } LogFlowFunc(("Leave %RTbool\n", fSendDone)); return fSendDone; } #endif /* !VBOX_WITH_NAT_SEND2HOME */ #if !defined(RT_OS_WINDOWS) static void send_icmp_to_guest(PNATState, char *, size_t, const struct sockaddr_in *); static void sorecvfrom_icmp_unix(PNATState, struct socket *); #endif /* !RT_OS_WINDOWS */ void so_init(void) { } struct socket * solookup(struct socket *head, struct in_addr laddr, u_int lport, struct in_addr faddr, u_int fport) { struct socket *so; for (so = head->so_next; so != head; so = so->so_next) { if ( so->so_lport == lport && so->so_laddr.s_addr == laddr.s_addr && so->so_faddr.s_addr == faddr.s_addr && so->so_fport == fport) return so; } return (struct socket *)NULL; } /* * Create a new socket, initialise the fields * It is the responsibility of the caller to * insque() it into the correct linked-list */ struct socket * socreate(void) { struct socket *so; so = (struct socket *)RTMemAllocZ(sizeof(struct socket)); if (so) { so->so_state = SS_NOFDREF; so->s = -1; #if !defined(RT_OS_WINDOWS) so->so_poll_index = -1; #endif } return so; } /* * remque and free a socket, clobber cache */ void sofree(PNATState pData, struct socket *so) { LogFlowFunc(("ENTER:%R[natsock]\n", so)); /* * We should not remove socket when polling routine do the polling * instead we mark it for deletion. */ if (so->fUnderPolling) { so->fShouldBeRemoved = 1; LogFlowFunc(("LEAVE:%R[natsock] postponed deletion\n", so)); return; } /** * Check that we don't freeng socket with tcbcb */ Assert(!sototcpcb(so)); /* udp checks */ Assert(!so->so_timeout); Assert(!so->so_timeout_arg); if (so == tcp_last_so) tcp_last_so = &tcb; else if (so == udp_last_so) udp_last_so = &udb; /* check if mbuf haven't been already freed */ if (so->so_m != NULL) { m_freem(pData, so->so_m); so->so_m = NULL; } if (so->so_ohdr != NULL) { RTMemFree(so->so_ohdr); so->so_ohdr = NULL; } if (so->so_next && so->so_prev) { remque(pData, so); /* crashes if so is not in a queue */ NSOCK_DEC(); } RTMemFree(so); LogFlowFuncLeave(); } /* * Worker for sobind() below. */ static int sobindto(struct socket *so, uint32_t addr, uint16_t port) { struct sockaddr_in self; int status; if (addr == INADDR_ANY && port == 0 && so->so_type != IPPROTO_UDP) { /* TCP sockets without constraints don't need to be bound */ Log2(("NAT: sobind: %s guest %RTnaipv4:%d - nothing to do\n", so->so_type == IPPROTO_UDP ? "udp" : "tcp", so->so_laddr.s_addr, ntohs(so->so_lport))); return 0; } RT_ZERO(self); #ifdef RT_OS_DARWIN self.sin_len = sizeof(self); #endif self.sin_family = AF_INET; self.sin_addr.s_addr = addr; self.sin_port = port; status = bind(so->s, (struct sockaddr *)&self, sizeof(self)); if (status == 0) { Log2(("NAT: sobind: %s guest %RTnaipv4:%d to host %RTnaipv4:%d\n", so->so_type == IPPROTO_UDP ? "udp" : "tcp", so->so_laddr.s_addr, ntohs(so->so_lport), addr, ntohs(port))); return 0; } Log2(("NAT: sobind: %s guest %RTnaipv4:%d to host %RTnaipv4:%d error %d%s\n", so->so_type == IPPROTO_UDP ? "udp" : "tcp", so->so_laddr.s_addr, ntohs(so->so_lport), addr, ntohs(port), errno, port ? " (will retry with random port)" : "")); if (port) /* retry without */ status = sobindto(so, addr, 0); if (addr) return status; else return 0; } /* * Bind the socket to specific host address and/or port if necessary. * We also always bind udp sockets to force the local port to be * allocated and known in advance. */ int sobind(PNATState pData, struct socket *so) { uint32_t addr = pData->bindIP.s_addr; /* may be INADDR_ANY */ bool fSamePorts = !!(pData->i32AliasMode & PKT_ALIAS_SAME_PORTS); uint16_t port; int status; if (fSamePorts) { int opt = 1; setsockopt(so->s, SOL_SOCKET, SO_REUSEADDR, (char *)&opt, sizeof(opt)); port = so->so_lport; } else { port = 0; } status = sobindto(so, addr, port); return status; } /* * Read from so's socket into sb_snd, updating all relevant sbuf fields * NOTE: This will only be called if it is select()ed for reading, so * a read() of 0 (or less) means it's disconnected */ int soread(PNATState pData, struct socket *so) { int n, nn, lss, total; struct sbuf *sb = &so->so_snd; u_int len = sb->sb_datalen - sb->sb_cc; struct iovec iov[2]; int mss = so->so_tcpcb->t_maxseg; int sockerr; STAM_PROFILE_START(&pData->StatIOread, a); STAM_COUNTER_RESET(&pData->StatIORead_in_1); STAM_COUNTER_RESET(&pData->StatIORead_in_2); QSOCKET_LOCK(tcb); SOCKET_LOCK(so); QSOCKET_UNLOCK(tcb); LogFlow(("soread: so = %R[natsock]\n", so)); Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, so, sb)); /* * No need to check if there's enough room to read. * soread wouldn't have been called if there weren't */ len = sb->sb_datalen - sb->sb_cc; iov[0].iov_base = sb->sb_wptr; iov[1].iov_base = 0; iov[1].iov_len = 0; if (sb->sb_wptr < sb->sb_rptr) { iov[0].iov_len = sb->sb_rptr - sb->sb_wptr; /* Should never succeed, but... */ if (iov[0].iov_len > len) iov[0].iov_len = len; if (iov[0].iov_len > mss) iov[0].iov_len -= iov[0].iov_len%mss; n = 1; } else { iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_wptr; /* Should never succeed, but... */ if (iov[0].iov_len > len) iov[0].iov_len = len; len -= iov[0].iov_len; if (len) { iov[1].iov_base = sb->sb_data; iov[1].iov_len = sb->sb_rptr - sb->sb_data; if (iov[1].iov_len > len) iov[1].iov_len = len; total = iov[0].iov_len + iov[1].iov_len; if (total > mss) { lss = total % mss; if (iov[1].iov_len > lss) { iov[1].iov_len -= lss; n = 2; } else { lss -= iov[1].iov_len; iov[0].iov_len -= lss; n = 1; } } else n = 2; } else { if (iov[0].iov_len > mss) iov[0].iov_len -= iov[0].iov_len%mss; n = 1; } } #ifdef HAVE_READV nn = readv(so->s, (struct iovec *)iov, n); #else nn = recv(so->s, iov[0].iov_base, iov[0].iov_len, (so->so_tcpcb->t_force? MSG_OOB:0)); #endif if (nn < 0) sockerr = errno; /* save it, as it may be clobbered by logging */ else sockerr = 0; Log2(("%s: read(1) nn = %d bytes\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn)); Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, so, sb)); if (nn <= 0) { #ifdef RT_OS_WINDOWS /* * Windows reports ESHUTDOWN after SHUT_RD (SD_RECEIVE) * instead of just returning EOF indication. */ if (nn < 0 && sockerr == ESHUTDOWN) { nn = 0; sockerr = 0; } #endif if (nn == 0) /* XXX: should this be inside #if defined(RT_OS_WINDOWS)? */ { /* * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that * _could_ mean that the connection is closed. But we will receive an * FD_CLOSE event later if the connection was _really_ closed. With * www.youtube.com I see this very often. Closing the socket too early * would be dangerous. */ int status; unsigned long pending = 0; status = ioctlsocket(so->s, FIONREAD, &pending); if (status < 0) Log(("NAT:%s: error in WSAIoctl: %d\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, errno)); if (pending != 0) { SOCKET_UNLOCK(so); STAM_PROFILE_STOP(&pData->StatIOread, a); return 0; } } if ( nn < 0 && soIgnorableErrorCode(sockerr)) { SOCKET_UNLOCK(so); STAM_PROFILE_STOP(&pData->StatIOread, a); return 0; } else { int fUninitializedTemplate = 0; int shuterr; fUninitializedTemplate = RT_BOOL(( sototcpcb(so) && ( sototcpcb(so)->t_template.ti_src.s_addr == INADDR_ANY || sototcpcb(so)->t_template.ti_dst.s_addr == INADDR_ANY))); /* nn == 0 means peer has performed an orderly shutdown */ Log2(("%s: disconnected, nn = %d, errno = %d (%s)\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn, sockerr, strerror(sockerr))); shuterr = sofcantrcvmore(so); if (!sockerr && !shuterr && !fUninitializedTemplate) tcp_sockclosed(pData, sototcpcb(so)); else { LogRel2(("NAT: sockerr %d, shuterr %d - %R[natsock]\n", sockerr, shuterr, so)); tcp_drop(pData, sototcpcb(so), sockerr); } SOCKET_UNLOCK(so); STAM_PROFILE_STOP(&pData->StatIOread, a); return -1; } } STAM_STATS( if (n == 1) { STAM_COUNTER_INC(&pData->StatIORead_in_1); STAM_COUNTER_ADD(&pData->StatIORead_in_1_bytes, nn); } else { STAM_COUNTER_INC(&pData->StatIORead_in_2); STAM_COUNTER_ADD(&pData->StatIORead_in_2_1st_bytes, nn); } ); #ifndef HAVE_READV /* * If there was no error, try and read the second time round * We read again if n = 2 (ie, there's another part of the buffer) * and we read as much as we could in the first read * We don't test for <= 0 this time, because there legitimately * might not be any more data (since the socket is non-blocking), * a close will be detected on next iteration. * A return of -1 wont (shouldn't) happen, since it didn't happen above */ if (n == 2 && (unsigned)nn == iov[0].iov_len) { int ret; ret = recv(so->s, iov[1].iov_base, iov[1].iov_len, 0); if (ret > 0) nn += ret; STAM_STATS( if (ret > 0) { STAM_COUNTER_INC(&pData->StatIORead_in_2); STAM_COUNTER_ADD(&pData->StatIORead_in_2_2nd_bytes, ret); } ); } Log2(("%s: read(2) nn = %d bytes\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn)); #endif /* Update fields */ sb->sb_cc += nn; sb->sb_wptr += nn; Log2(("%s: update so_snd (readed nn = %d) %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn, sb)); if (sb->sb_wptr >= (sb->sb_data + sb->sb_datalen)) { sb->sb_wptr -= sb->sb_datalen; Log2(("%s: alter sb_wptr so_snd = %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, sb)); } STAM_PROFILE_STOP(&pData->StatIOread, a); SOCKET_UNLOCK(so); return nn; } /* * Get urgent data * * When the socket is created, we set it SO_OOBINLINE, * so when OOB data arrives, we soread() it and everything * in the send buffer is sent as urgent data */ void sorecvoob(PNATState pData, struct socket *so) { struct tcpcb *tp = sototcpcb(so); ssize_t ret; LogFlowFunc(("sorecvoob: so = %R[natsock]\n", so)); /* * We take a guess at how much urgent data has arrived. * In most situations, when urgent data arrives, the next * read() should get all the urgent data. This guess will * be wrong however if more data arrives just after the * urgent data, or the read() doesn't return all the * urgent data. */ ret = soread(pData, so); if (RT_LIKELY(ret > 0)) { /* * @todo for now just scrub the URG pointer. To faithfully * proxy URG we need to read the srteam until SIOCATMARK, and * then mark the first byte of the next read ar urgent. */ #if 0 tp->snd_up = tp->snd_una + SBUF_LEN(&so->so_snd); #endif tp->t_force = 1; tcp_output(pData, tp); tp->t_force = 0; } } /* * Send urgent data * There's a lot duplicated code here, but... */ int sosendoob(struct socket *so) { struct sbuf *sb = &so->so_rcv; char buff[2048]; /* XXX Shouldn't be sending more oob data than this */ int n, len; LogFlowFunc(("sosendoob so = %R[natsock]\n", so)); if (so->so_urgc > sizeof(buff)) so->so_urgc = sizeof(buff); /* XXX */ if (sb->sb_rptr < sb->sb_wptr) { /* We can send it directly */ n = send(so->s, sb->sb_rptr, so->so_urgc, (MSG_OOB)); /* |MSG_DONTWAIT)); */ so->so_urgc -= n; Log2((" --- sent %d bytes urgent data, %d urgent bytes left\n", n, so->so_urgc)); } else { /* * Since there's no sendv or sendtov like writev, * we must copy all data to a linear buffer then * send it all */ len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr; if (len > so->so_urgc) len = so->so_urgc; memcpy(buff, sb->sb_rptr, len); so->so_urgc -= len; if (so->so_urgc) { n = sb->sb_wptr - sb->sb_data; if (n > so->so_urgc) n = so->so_urgc; memcpy(buff + len, sb->sb_data, n); so->so_urgc -= n; len += n; } n = send(so->s, buff, len, (MSG_OOB)); /* |MSG_DONTWAIT)); */ #ifdef DEBUG if (n != len) Log(("Didn't send all data urgently XXXXX\n")); #endif Log2((" ---2 sent %d bytes urgent data, %d urgent bytes left\n", n, so->so_urgc)); } sb->sb_cc -= n; sb->sb_rptr += n; if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen)) sb->sb_rptr -= sb->sb_datalen; return n; } /* * Write data from so_rcv to so's socket, * updating all sbuf field as necessary */ int sowrite(PNATState pData, struct socket *so) { int n, nn; struct sbuf *sb = &so->so_rcv; u_int len = sb->sb_cc; struct iovec iov[2]; STAM_PROFILE_START(&pData->StatIOwrite, a); STAM_COUNTER_RESET(&pData->StatIOWrite_in_1); STAM_COUNTER_RESET(&pData->StatIOWrite_in_1_bytes); STAM_COUNTER_RESET(&pData->StatIOWrite_in_2); STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_1st_bytes); STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_2nd_bytes); STAM_COUNTER_RESET(&pData->StatIOWrite_no_w); STAM_COUNTER_RESET(&pData->StatIOWrite_rest); STAM_COUNTER_RESET(&pData->StatIOWrite_rest_bytes); LogFlowFunc(("so = %R[natsock]\n", so)); Log2(("%s: so = %R[natsock] so->so_rcv = %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, so, sb)); QSOCKET_LOCK(tcb); SOCKET_LOCK(so); QSOCKET_UNLOCK(tcb); if (so->so_urgc) { sosendoob(so); if (sb->sb_cc == 0) { SOCKET_UNLOCK(so); STAM_PROFILE_STOP(&pData->StatIOwrite, a); return 0; } } /* * No need to check if there's something to write, * sowrite wouldn't have been called otherwise */ len = sb->sb_cc; iov[0].iov_base = sb->sb_rptr; iov[1].iov_base = 0; iov[1].iov_len = 0; if (sb->sb_rptr < sb->sb_wptr) { iov[0].iov_len = sb->sb_wptr - sb->sb_rptr; /* Should never succeed, but... */ if (iov[0].iov_len > len) iov[0].iov_len = len; n = 1; } else { iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr; if (iov[0].iov_len > len) iov[0].iov_len = len; len -= iov[0].iov_len; if (len) { iov[1].iov_base = sb->sb_data; iov[1].iov_len = sb->sb_wptr - sb->sb_data; if (iov[1].iov_len > len) iov[1].iov_len = len; n = 2; } else n = 1; } STAM_STATS({ if (n == 1) { STAM_COUNTER_INC(&pData->StatIOWrite_in_1); STAM_COUNTER_ADD(&pData->StatIOWrite_in_1_bytes, iov[0].iov_len); } else { STAM_COUNTER_INC(&pData->StatIOWrite_in_2); STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_1st_bytes, iov[0].iov_len); STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_2nd_bytes, iov[1].iov_len); } }); /* Check if there's urgent data to send, and if so, send it */ #ifdef HAVE_READV nn = writev(so->s, (const struct iovec *)iov, n); #else nn = send(so->s, iov[0].iov_base, iov[0].iov_len, 0); #endif Log2(("%s: wrote(1) nn = %d bytes\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn)); /* This should never happen, but people tell me it does *shrug* */ if ( nn < 0 && soIgnorableErrorCode(errno)) { SOCKET_UNLOCK(so); STAM_PROFILE_STOP(&pData->StatIOwrite, a); return 0; } if (nn < 0 || (nn == 0 && iov[0].iov_len > 0)) { Log2(("%s: disconnected, so->so_state = %x, errno = %d\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, so->so_state, errno)); sofcantsendmore(so); tcp_sockclosed(pData, sototcpcb(so)); SOCKET_UNLOCK(so); STAM_PROFILE_STOP(&pData->StatIOwrite, a); return -1; } #ifndef HAVE_READV if (n == 2 && (unsigned)nn == iov[0].iov_len) { int ret; ret = send(so->s, iov[1].iov_base, iov[1].iov_len, 0); if (ret > 0) nn += ret; # ifdef VBOX_WITH_STATISTICS if (ret > 0 && ret != (ssize_t)iov[1].iov_len) { STAM_COUNTER_INC(&pData->StatIOWrite_rest); STAM_COUNTER_ADD(&pData->StatIOWrite_rest_bytes, (iov[1].iov_len - ret)); } #endif } Log2(("%s: wrote(2) nn = %d bytes\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn)); #endif /* Update sbuf */ sb->sb_cc -= nn; sb->sb_rptr += nn; Log2(("%s: update so_rcv (written nn = %d) %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn, sb)); if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen)) { sb->sb_rptr -= sb->sb_datalen; Log2(("%s: alter sb_rptr of so_rcv %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, sb)); } /* * If in DRAIN mode, and there's no more data, set * it CANTSENDMORE */ if ((so->so_state & SS_FWDRAIN) && sb->sb_cc == 0) sofcantsendmore(so); SOCKET_UNLOCK(so); STAM_PROFILE_STOP(&pData->StatIOwrite, a); return nn; } /* * recvfrom() a UDP socket */ void sorecvfrom(PNATState pData, struct socket *so) { LogFlowFunc(("sorecvfrom: so = %p\n", so)); #ifdef RT_OS_WINDOWS /* ping is handled with ICMP API in ip_icmpwin.c */ Assert(so->so_type == IPPROTO_UDP); #else if (so->so_type == IPPROTO_ICMP) { /* This is a "ping" reply */ sorecvfrom_icmp_unix(pData, so); udp_detach(pData, so); } else #endif /* !RT_OS_WINDOWS */ { static char achBuf[64 * 1024]; /* A "normal" UDP packet */ struct sockaddr_in addr; socklen_t addrlen = sizeof(struct sockaddr_in); struct iovec iov[2]; ssize_t nread; struct mbuf *m; QSOCKET_LOCK(udb); SOCKET_LOCK(so); QSOCKET_UNLOCK(udb); m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, slirp_size(pData)); if (m == NULL) { SOCKET_UNLOCK(so); return; } m->m_data += ETH_HLEN; m->m_pkthdr.header = mtod(m, void *); m->m_data += sizeof(struct udpiphdr); /* small packets will fit without copying */ iov[0].iov_base = mtod(m, char *); iov[0].iov_len = M_TRAILINGSPACE(m); /* large packets will spill into a temp buffer */ iov[1].iov_base = achBuf; iov[1].iov_len = sizeof(achBuf); #if !defined(RT_OS_WINDOWS) { struct msghdr mh; memset(&mh, 0, sizeof(mh)); mh.msg_iov = iov; mh.msg_iovlen = 2; mh.msg_name = &addr; mh.msg_namelen = addrlen; nread = recvmsg(so->s, &mh, 0); } #else /* RT_OS_WINDOWS */ { DWORD nbytes; /* NB: can't use nread b/c of different size */ DWORD flags = 0; int status; AssertCompile(sizeof(WSABUF) == sizeof(struct iovec)); AssertCompileMembersSameSizeAndOffset(WSABUF, len, struct iovec, iov_len); AssertCompileMembersSameSizeAndOffset(WSABUF, buf, struct iovec, iov_base); status = WSARecvFrom(so->s, (WSABUF *)&iov[0], 2, &nbytes, &flags, (struct sockaddr *)&addr, &addrlen, NULL, NULL); if (status != SOCKET_ERROR) nread = nbytes; else nread = -1; } #endif if (nread >= 0) { if (nread <= iov[0].iov_len) m->m_len = nread; else { m->m_len = iov[0].iov_len; m_append(pData, m, nread - iov[0].iov_len, iov[1].iov_base); } Assert(m_length(m, NULL) == (size_t)nread); /* * Hack: domain name lookup will be used the most for UDP, * and since they'll only be used once there's no need * for the 4 minute (or whatever) timeout... So we time them * out much quicker (10 seconds for now...) */ if (so->so_expire) { if (so->so_fport != RT_H2N_U16_C(53)) so->so_expire = curtime + SO_EXPIRE; } /* * DNS proxy requests are forwarded to the real resolver, * but its socket's so_faddr is that of the DNS proxy * itself. * * last argument should be changed if Slirp will inject IP attributes */ if ( pData->fUseDnsProxy && so->so_fport == RT_H2N_U16_C(53) && CTL_CHECK(so->so_faddr.s_addr, CTL_DNS)) dnsproxy_answer(pData, so, m); /* packets definetly will be fragmented, could confuse receiver peer. */ if (nread > if_mtu) m->m_flags |= M_SKIP_FIREWALL; /* * If this packet was destined for CTL_ADDR, * make it look like that's where it came from, done by udp_output */ udp_output(pData, so, m, &addr); } else { m_freem(pData, m); if (!soIgnorableErrorCode(errno)) { u_char code; if (errno == EHOSTUNREACH) code = ICMP_UNREACH_HOST; else if (errno == ENETUNREACH) code = ICMP_UNREACH_NET; else code = ICMP_UNREACH_PORT; Log2((" rx error, tx icmp ICMP_UNREACH:%i\n", code)); icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno)); so->so_m = NULL; } } SOCKET_UNLOCK(so); } } /* * sendto() a socket */ int sosendto(PNATState pData, struct socket *so, struct mbuf *m) { int ret; struct sockaddr_in *paddr; struct sockaddr addr; #if 0 struct sockaddr_in host_addr; #endif caddr_t buf = 0; int mlen; LogFlowFunc(("sosendto: so = %R[natsock], m = %p\n", so, m)); memset(&addr, 0, sizeof(struct sockaddr)); #ifdef RT_OS_DARWIN addr.sa_len = sizeof(struct sockaddr_in); #endif paddr = (struct sockaddr_in *)&addr; paddr->sin_family = AF_INET; if ((so->so_faddr.s_addr & RT_H2N_U32(pData->netmask)) == pData->special_addr.s_addr) { /* It's an alias */ uint32_t last_byte = RT_N2H_U32(so->so_faddr.s_addr) & ~pData->netmask; switch(last_byte) { #if 0 /* handle this case at 'default:' */ case CTL_BROADCAST: addr.sin_addr.s_addr = INADDR_BROADCAST; /* Send the packet to host to fully emulate broadcast */ /** @todo r=klaus: on Linux host this causes the host to receive * the packet twice for some reason. And I cannot find any place * in the man pages which states that sending a broadcast does not * reach the host itself. */ host_addr.sin_family = AF_INET; host_addr.sin_port = so->so_fport; host_addr.sin_addr = our_addr; sendto(so->s, m->m_data, m->m_len, 0, (struct sockaddr *)&host_addr, sizeof (struct sockaddr)); break; #endif case CTL_DNS: case CTL_ALIAS: default: if (last_byte == ~pData->netmask) paddr->sin_addr.s_addr = INADDR_BROADCAST; else paddr->sin_addr = loopback_addr; break; } } else paddr->sin_addr = so->so_faddr; paddr->sin_port = so->so_fport; Log2((" sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n", RT_N2H_U16(paddr->sin_port), inet_ntoa(paddr->sin_addr))); /* Don't care what port we get */ /* * > nmap -sV -T4 -O -A -v -PU3483 255.255.255.255 * generates bodyless messages, annoying memmory management system. */ mlen = m_length(m, NULL); if (mlen > 0) { buf = RTMemAlloc(mlen); if (buf == NULL) { return -1; } m_copydata(m, 0, mlen, buf); } ret = sendto(so->s, buf, mlen, 0, (struct sockaddr *)&addr, sizeof (struct sockaddr)); #ifdef VBOX_WITH_NAT_SEND2HOME if (slirpIsWideCasting(pData, so->so_faddr.s_addr)) { slirpSend2Home(pData, so, buf, mlen, 0); } #endif if (buf) RTMemFree(buf); if (ret < 0) { Log2(("UDP: sendto fails (%s)\n", strerror(errno))); return -1; } /* * Kill the socket if there's no reply in 4 minutes, * but only if it's an expirable socket */ if (so->so_expire) so->so_expire = curtime + SO_EXPIRE; so->so_state = SS_ISFCONNECTED; /* So that it gets select()ed */ return 0; } /* * XXX This should really be tcp_listen */ struct socket * solisten(PNATState pData, u_int32_t bind_addr, u_int port, u_int32_t laddr, u_int lport, int flags) { struct sockaddr_in addr; struct socket *so; socklen_t addrlen = sizeof(addr); int s, opt = 1; int status; LogFlowFunc(("solisten: port = %d, laddr = %x, lport = %d, flags = %x\n", port, laddr, lport, flags)); if ((so = socreate()) == NULL) { /* RTMemFree(so); Not sofree() ??? free(NULL) == NOP */ return NULL; } /* Don't tcp_attach... we don't need so_snd nor so_rcv */ if ((so->so_tcpcb = tcp_newtcpcb(pData, so)) == NULL) { RTMemFree(so); return NULL; } SOCKET_LOCK_CREATE(so); SOCKET_LOCK(so); QSOCKET_LOCK(tcb); insque(pData, so,&tcb); NSOCK_INC(); QSOCKET_UNLOCK(tcb); /* * SS_FACCEPTONCE sockets must time out. */ if (flags & SS_FACCEPTONCE) so->so_tcpcb->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT*2; so->so_state = (SS_FACCEPTCONN|flags); so->so_lport = lport; /* Kept in network format */ so->so_laddr.s_addr = laddr; /* Ditto */ memset(&addr, 0, sizeof(addr)); #ifdef RT_OS_DARWIN addr.sin_len = sizeof(addr); #endif addr.sin_family = AF_INET; addr.sin_addr.s_addr = bind_addr; addr.sin_port = port; /** * changing listen(,1->SOMAXCONN) shouldn't be harmful for NAT's TCP/IP stack, * kernel will choose the optimal value for requests queue length. * @note: MSDN recommends low (2-4) values for bluetooth networking devices. */ if ( ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) || (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,(char *)&opt, sizeof(int)) < 0) || (bind(s,(struct sockaddr *)&addr, sizeof(addr)) < 0) || (listen(s, pData->soMaxConn) < 0)) { #ifdef RT_OS_WINDOWS int tmperrno = WSAGetLastError(); /* Don't clobber the real reason we failed */ closesocket(s); QSOCKET_LOCK(tcb); sofree(pData, so); QSOCKET_UNLOCK(tcb); /* Restore the real errno */ WSASetLastError(tmperrno); #else int tmperrno = errno; /* Don't clobber the real reason we failed */ close(s); if (sototcpcb(so)) tcp_close(pData, sototcpcb(so)); else sofree(pData, so); /* Restore the real errno */ errno = tmperrno; #endif return NULL; } fd_nonblock(s); setsockopt(s, SOL_SOCKET, SO_OOBINLINE,(char *)&opt, sizeof(int)); getsockname(s,(struct sockaddr *)&addr,&addrlen); so->so_fport = addr.sin_port; /* set socket buffers */ opt = pData->socket_rcv; status = setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, sizeof(int)); if (status < 0) { LogRel(("NAT: Error(%d) while setting RCV capacity to (%d)\n", errno, opt)); goto no_sockopt; } opt = pData->socket_snd; status = setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, sizeof(int)); if (status < 0) { LogRel(("NAT: Error(%d) while setting SND capacity to (%d)\n", errno, opt)); goto no_sockopt; } no_sockopt: if (addr.sin_addr.s_addr == 0 || addr.sin_addr.s_addr == loopback_addr.s_addr) so->so_faddr = alias_addr; else so->so_faddr = addr.sin_addr; so->s = s; SOCKET_UNLOCK(so); return so; } /* * Data is available in so_rcv * Just write() the data to the socket * XXX not yet... * @todo do we really need this function, what it's intended to do? */ void sorwakeup(struct socket *so) { NOREF(so); #if 0 sowrite(so); FD_CLR(so->s,&writefds); #endif } /* * Data has been freed in so_snd * We have room for a read() if we want to * For now, don't read, it'll be done in the main loop */ void sowwakeup(struct socket *so) { NOREF(so); } /* * Various session state calls * XXX Should be #define's * The socket state stuff needs work, these often get call 2 or 3 * times each when only 1 was needed */ void soisfconnecting(struct socket *so) { so->so_state &= ~(SS_NOFDREF|SS_ISFCONNECTED|SS_FCANTRCVMORE| SS_FCANTSENDMORE|SS_FWDRAIN); so->so_state |= SS_ISFCONNECTING; /* Clobber other states */ } void soisfconnected(struct socket *so) { LogFlowFunc(("ENTER: so:%R[natsock]\n", so)); so->so_state &= ~(SS_ISFCONNECTING|SS_FWDRAIN|SS_NOFDREF); so->so_state |= SS_ISFCONNECTED; /* Clobber other states */ LogFlowFunc(("LEAVE: so:%R[natsock]\n", so)); } int sofcantrcvmore(struct socket *so) { int err = 0; LogFlowFunc(("ENTER: so:%R[natsock]\n", so)); if ((so->so_state & SS_NOFDREF) == 0) { /* * If remote closes first and then sends an RST, the recv() in * soread() will keep reporting EOF without any error * indication. As far as I can tell the only way to detect * this on Linux is to check if shutdown() succeeds here (but * see below). * * OTOH on OS X shutdown() "helpfully" checks if remote has * already closed and then always returns ENOTCONN * immediately. */ int status = shutdown(so->s, SHUT_RD); #if defined(RT_OS_LINUX) if (status < 0) err = errno; #else RT_NOREF(status); #endif } so->so_state &= ~(SS_ISFCONNECTING); if (so->so_state & SS_FCANTSENDMORE) { #if defined(RT_OS_LINUX) /* * If we have closed first, and remote closes, shutdown will * return ENOTCONN, but this is expected. Don't tell the * caller there was an error. */ if (err == ENOTCONN) err = 0; #endif so->so_state = SS_NOFDREF; /* Don't select it */ /* XXX close() here as well? */ } else so->so_state |= SS_FCANTRCVMORE; LogFlowFunc(("LEAVE: %d\n", err)); return err; } void sofcantsendmore(struct socket *so) { LogFlowFunc(("ENTER: so:%R[natsock]\n", so)); if ((so->so_state & SS_NOFDREF) == 0) shutdown(so->s, 1); /* send FIN to fhost */ so->so_state &= ~(SS_ISFCONNECTING); if (so->so_state & SS_FCANTRCVMORE) so->so_state = SS_NOFDREF; /* as above */ else so->so_state |= SS_FCANTSENDMORE; LogFlowFuncLeave(); } void soisfdisconnected(struct socket *so) { NOREF(so); #if 0 so->so_state &= ~(SS_ISFCONNECTING|SS_ISFCONNECTED); close(so->s); so->so_state = SS_ISFDISCONNECTED; /* * XXX Do nothing ... ? */ #endif } /* * Set write drain mode * Set CANTSENDMORE once all data has been write()n */ void sofwdrain(struct socket *so) { if (SBUF_LEN(&so->so_rcv)) so->so_state |= SS_FWDRAIN; else sofcantsendmore(so); } #if !defined(RT_OS_WINDOWS) static void send_icmp_to_guest(PNATState pData, char *buff, size_t len, const struct sockaddr_in *addr) { struct ip *ip; uint32_t dst, src; char ip_copy[256]; struct icmp *icp; int old_ip_len = 0; int hlen, original_hlen = 0; struct mbuf *m; struct icmp_msg *icm; uint8_t proto; int type = 0; ip = (struct ip *)buff; /* Fix ip->ip_len to contain the total packet length including the header * in _host_ byte order for all OSes. On Darwin, that value already is in * host byte order. Solaris and Darwin report only the payload. */ #ifndef RT_OS_DARWIN ip->ip_len = RT_N2H_U16(ip->ip_len); #endif hlen = (ip->ip_hl << 2); #if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN) ip->ip_len += hlen; #endif if (ip->ip_len < hlen + ICMP_MINLEN) { Log(("send_icmp_to_guest: ICMP header is too small to understand which type/subtype of the datagram\n")); return; } icp = (struct icmp *)((char *)ip + hlen); Log(("ICMP:received msg(t:%d, c:%d)\n", icp->icmp_type, icp->icmp_code)); if ( icp->icmp_type != ICMP_ECHOREPLY && icp->icmp_type != ICMP_TIMXCEED && icp->icmp_type != ICMP_UNREACH) { return; } /* * ICMP_ECHOREPLY, ICMP_TIMXCEED, ICMP_UNREACH minimal header size is * ICMP_ECHOREPLY assuming data 0 * icmp_{type(8), code(8), cksum(16),identifier(16),seqnum(16)} */ if (ip->ip_len < hlen + 8) { Log(("send_icmp_to_guest: NAT accept ICMP_{ECHOREPLY, TIMXCEED, UNREACH} the minimum size is 64 (see rfc792)\n")); return; } type = icp->icmp_type; if ( type == ICMP_TIMXCEED || type == ICMP_UNREACH) { /* * ICMP_TIMXCEED, ICMP_UNREACH minimal header size is * icmp_{type(8), code(8), cksum(16),unused(32)} + IP header + 64 bit of original datagram */ if (ip->ip_len < hlen + 2*8 + sizeof(struct ip)) { Log(("send_icmp_to_guest: NAT accept ICMP_{TIMXCEED, UNREACH} the minimum size of ipheader + 64 bit of data (see rfc792)\n")); return; } ip = &icp->icmp_ip; } icm = icmp_find_original_mbuf(pData, ip); if (icm == NULL) { Log(("NAT: Can't find the corresponding packet for the received ICMP\n")); return; } m = icm->im_m; if (!m) { LogFunc(("%R[natsock] hasn't stored it's mbuf on sent\n", icm->im_so)); goto done; } src = addr->sin_addr.s_addr; if (type == ICMP_ECHOREPLY) { struct ip *ip0 = mtod(m, struct ip *); struct icmp *icp0 = (struct icmp *)((char *)ip0 + (ip0->ip_hl << 2)); if (icp0->icmp_type != ICMP_ECHO) { Log(("NAT: we haven't found echo for this reply\n")); goto done; } /* * while combining buffer to send (see ip_icmp.c) we control ICMP header only, * IP header combined by OS network stack, our local copy of IP header contians values * in host byte order so no byte order conversion is required. IP headers fields are converting * in ip_output0 routine only. */ if ( (ip->ip_len - hlen) != (ip0->ip_len - (ip0->ip_hl << 2))) { Log(("NAT: ECHO(%d) lenght doesn't match ECHOREPLY(%d)\n", (ip->ip_len - hlen), (ip0->ip_len - (ip0->ip_hl << 2)))); goto done; } } /* ip points on origianal ip header */ ip = mtod(m, struct ip *); proto = ip->ip_p; /* Now ip is pointing on header we've sent from guest */ if ( icp->icmp_type == ICMP_TIMXCEED || icp->icmp_type == ICMP_UNREACH) { old_ip_len = (ip->ip_hl << 2) + 64; if (old_ip_len > sizeof(ip_copy)) old_ip_len = sizeof(ip_copy); memcpy(ip_copy, ip, old_ip_len); } /* source address from original IP packet*/ dst = ip->ip_src.s_addr; /* overide ther tail of old packet */ ip = mtod(m, struct ip *); /* ip is from mbuf we've overrided */ original_hlen = ip->ip_hl << 2; /* saves original ip header and options */ m_copyback(pData, m, original_hlen, len - hlen, buff + hlen); ip->ip_len = m_length(m, NULL); ip->ip_p = IPPROTO_ICMP; /* the original package could be whatever, but we're response via ICMP*/ icp = (struct icmp *)((char *)ip + (ip->ip_hl << 2)); type = icp->icmp_type; if ( type == ICMP_TIMXCEED || type == ICMP_UNREACH) { /* according RFC 793 error messages required copy of initial IP header + 64 bit */ memcpy(&icp->icmp_ip, ip_copy, old_ip_len); /* undo byte order conversions done in ip_input() */ HTONS(icp->icmp_ip.ip_len); HTONS(icp->icmp_ip.ip_id); HTONS(icp->icmp_ip.ip_off); ip->ip_tos = ((ip->ip_tos & 0x1E) | 0xC0); /* high priority for errors */ } ip->ip_src.s_addr = src; ip->ip_dst.s_addr = dst; icmp_reflect(pData, m); /* m was freed */ icm->im_m = NULL; done: icmp_msg_delete(pData, icm); } static void sorecvfrom_icmp_unix(PNATState pData, struct socket *so) { struct sockaddr_in addr; socklen_t addrlen = sizeof(struct sockaddr_in); struct ip ip; char *buff; int len = 0; /* 1- step: read the ip header */ len = recvfrom(so->s, &ip, sizeof(struct ip), MSG_PEEK, (struct sockaddr *)&addr, &addrlen); if ( len < 0 && ( soIgnorableErrorCode(errno) || errno == ENOTCONN)) { Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm (would block)\n")); return; } if ( len < sizeof(struct ip) || len < 0 || len == 0) { u_char code; code = ICMP_UNREACH_PORT; if (errno == EHOSTUNREACH) code = ICMP_UNREACH_HOST; else if (errno == ENETUNREACH) code = ICMP_UNREACH_NET; LogRel(("NAT: UDP ICMP rx errno=%d (%s)\n", errno, strerror(errno))); icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno)); so->so_m = NULL; Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm\n")); return; } /* basic check of IP header */ if ( ip.ip_v != IPVERSION # ifndef RT_OS_DARWIN || ip.ip_p != IPPROTO_ICMP # endif ) { Log(("sorecvfrom_icmp_unix: 1 - step IP isn't IPv4\n")); return; } # ifndef RT_OS_DARWIN /* Darwin reports the IP length already in host byte order. */ ip.ip_len = RT_N2H_U16(ip.ip_len); # endif # if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN) /* Solaris and Darwin report the payload only */ ip.ip_len += (ip.ip_hl << 2); # endif /* Note: ip->ip_len in host byte order (all OS) */ len = ip.ip_len; buff = RTMemAlloc(len); if (buff == NULL) { Log(("sorecvfrom_icmp_unix: 1 - step can't allocate enought room for datagram\n")); return; } /* 2 - step: we're reading rest of the datagramm to the buffer */ addrlen = sizeof(struct sockaddr_in); memset(&addr, 0, addrlen); len = recvfrom(so->s, buff, len, 0, (struct sockaddr *)&addr, &addrlen); if ( len < 0 && ( soIgnorableErrorCode(errno) || errno == ENOTCONN)) { Log(("sorecvfrom_icmp_unix: 2 - step can't read IP body (would block expected:%d)\n", ip.ip_len)); RTMemFree(buff); return; } if ( len < 0 || len == 0) { Log(("sorecvfrom_icmp_unix: 2 - step read of the rest of datagramm is fallen (errno:%d, len:%d expected: %d)\n", errno, len, (ip.ip_len - sizeof(struct ip)))); RTMemFree(buff); return; } /* len is modified in 2nd read, when the rest of the datagramm was read */ send_icmp_to_guest(pData, buff, len, &addr); RTMemFree(buff); } #endif /* !RT_OS_WINDOWS */