Diffstat (limited to 'src/c-ares/ares_process.c')
-rw-r--r-- | src/c-ares/ares_process.c | 1445
1 file changed, 1445 insertions, 0 deletions
diff --git a/src/c-ares/ares_process.c b/src/c-ares/ares_process.c new file mode 100644 index 00000000..e9648965 --- /dev/null +++ b/src/c-ares/ares_process.c @@ -0,0 +1,1445 @@ + +/* Copyright 1998 by the Massachusetts Institute of Technology. + * Copyright (C) 2004-2017 by Daniel Stenberg + * + * Permission to use, copy, modify, and distribute this + * software and its documentation for any purpose and without + * fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright + * notice and this permission notice appear in supporting + * documentation, and that the name of M.I.T. not be used in + * advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" + * without express or implied warranty. + */ + +#include "ares_setup.h" + +#ifdef HAVE_SYS_UIO_H +# include <sys/uio.h> +#endif +#ifdef HAVE_NETINET_IN_H +# include <netinet/in.h> +#endif +#ifdef HAVE_NETINET_TCP_H +# include <netinet/tcp.h> +#endif +#ifdef HAVE_NETDB_H +# include <netdb.h> +#endif +#ifdef HAVE_ARPA_NAMESER_H +# include <arpa/nameser.h> +#else +# include "nameser.h" +#endif +#ifdef HAVE_ARPA_NAMESER_COMPAT_H +# include <arpa/nameser_compat.h> +#endif + +#ifdef HAVE_STRINGS_H +# include <strings.h> +#endif +#ifdef HAVE_SYS_IOCTL_H +# include <sys/ioctl.h> +#endif +#ifdef NETWARE +# include <sys/filio.h> +#endif + +#include <assert.h> +#include <fcntl.h> + +#include "ares.h" +#include "ares_dns.h" +#include "ares_nowarn.h" +#include "ares_private.h" + + +static int try_again(int errnum); +static void write_tcp_data(ares_channel channel, fd_set *write_fds, + ares_socket_t write_fd, struct timeval *now); +static void read_tcp_data(ares_channel channel, fd_set *read_fds, + ares_socket_t read_fd, struct timeval *now); +static void read_udp_packets(ares_channel channel, fd_set *read_fds, + ares_socket_t read_fd, struct timeval *now); +static void advance_tcp_send_queue(ares_channel channel, int whichserver, + ssize_t num_bytes); +static void process_timeouts(ares_channel channel, struct timeval *now); +static void process_broken_connections(ares_channel channel, + struct timeval *now); +static void process_answer(ares_channel channel, unsigned char *abuf, + int alen, int whichserver, int tcp, + struct timeval *now); +static void handle_error(ares_channel channel, int whichserver, + struct timeval *now); +static void skip_server(ares_channel channel, struct query *query, + int whichserver); +static void next_server(ares_channel channel, struct query *query, + struct timeval *now); +static int open_tcp_socket(ares_channel channel, struct server_state *server); +static int open_udp_socket(ares_channel channel, struct server_state *server); +static int same_questions(const unsigned char *qbuf, int qlen, + const unsigned char *abuf, int alen); +static int same_address(struct sockaddr *sa, struct ares_addr *aa); +static void end_query(ares_channel channel, struct query *query, int status, + unsigned char *abuf, int alen); + +/* return true if now is exactly check time or later */ +int ares__timedout(struct timeval *now, + struct timeval *check) +{ + long secs = (now->tv_sec - check->tv_sec); + + if(secs > 0) + return 1; /* yes, timed out */ + if(secs < 0) + return 0; /* nope, not timed out */ + + /* if the full seconds were identical, check the sub second parts */ + return (now->tv_usec - check->tv_usec 
>= 0); +} + +/* add the specific number of milliseconds to the time in the first argument */ +static void timeadd(struct timeval *now, int millisecs) +{ + now->tv_sec += millisecs/1000; + now->tv_usec += (millisecs%1000)*1000; + + if(now->tv_usec >= 1000000) { + ++(now->tv_sec); + now->tv_usec -= 1000000; + } +} + +/* + * generic process function + */ +static void processfds(ares_channel channel, + fd_set *read_fds, ares_socket_t read_fd, + fd_set *write_fds, ares_socket_t write_fd) +{ + struct timeval now = ares__tvnow(); + + write_tcp_data(channel, write_fds, write_fd, &now); + read_tcp_data(channel, read_fds, read_fd, &now); + read_udp_packets(channel, read_fds, read_fd, &now); + process_timeouts(channel, &now); + process_broken_connections(channel, &now); +} + +/* Something interesting happened on the wire, or there was a timeout. + * See what's up and respond accordingly. + */ +void ares_process(ares_channel channel, fd_set *read_fds, fd_set *write_fds) +{ + processfds(channel, read_fds, ARES_SOCKET_BAD, write_fds, ARES_SOCKET_BAD); +} + +/* Something interesting happened on the wire, or there was a timeout. + * See what's up and respond accordingly. + */ +void ares_process_fd(ares_channel channel, + ares_socket_t read_fd, /* use ARES_SOCKET_BAD or valid + file descriptors */ + ares_socket_t write_fd) +{ + processfds(channel, NULL, read_fd, NULL, write_fd); +} + + +/* Return 1 if the specified error number describes a readiness error, or 0 + * otherwise. This is mostly for HP-UX, which could return EAGAIN or + * EWOULDBLOCK. See this man page + * + * http://devrsrc1.external.hp.com/STKS/cgi-bin/man2html? + * manpage=/usr/share/man/man2.Z/send.2 + */ +static int try_again(int errnum) +{ +#if !defined EWOULDBLOCK && !defined EAGAIN +#error "Neither EWOULDBLOCK nor EAGAIN defined" +#endif + switch (errnum) + { +#ifdef EWOULDBLOCK + case EWOULDBLOCK: + return 1; +#endif +#if defined EAGAIN && EAGAIN != EWOULDBLOCK + case EAGAIN: + return 1; +#endif + } + return 0; +} + +static ssize_t socket_writev(ares_channel channel, ares_socket_t s, const struct iovec * vec, int len) +{ + if (channel->sock_funcs) + return channel->sock_funcs->asendv(s, vec, len, channel->sock_func_cb_data); + + return writev(s, vec, len); +} + +static ssize_t socket_write(ares_channel channel, ares_socket_t s, const void * data, size_t len) +{ + if (channel->sock_funcs) + { + struct iovec vec; + vec.iov_base = (void*)data; + vec.iov_len = len; + return channel->sock_funcs->asendv(s, &vec, 1, channel->sock_func_cb_data); + } + return swrite(s, data, len); +} + +/* If any TCP sockets select true for writing, write out queued data + * we have for them. + */ +static void write_tcp_data(ares_channel channel, + fd_set *write_fds, + ares_socket_t write_fd, + struct timeval *now) +{ + struct server_state *server; + struct send_request *sendreq; + struct iovec *vec; + int i; + ssize_t scount; + ssize_t wcount; + size_t n; + + if(!write_fds && (write_fd == ARES_SOCKET_BAD)) + /* no possible action */ + return; + + for (i = 0; i < channel->nservers; i++) + { + /* Make sure server has data to send and is selected in write_fds or + write_fd. 
*/ + server = &channel->servers[i]; + if (!server->qhead || server->tcp_socket == ARES_SOCKET_BAD || + server->is_broken) + continue; + + if(write_fds) { + if(!FD_ISSET(server->tcp_socket, write_fds)) + continue; + } + else { + if(server->tcp_socket != write_fd) + continue; + } + + if(write_fds) + /* If there's an error and we close this socket, then open + * another with the same fd to talk to another server, then we + * don't want to think that it was the new socket that was + * ready. This is not disastrous, but is likely to result in + * extra system calls and confusion. */ + FD_CLR(server->tcp_socket, write_fds); + + /* Count the number of send queue items. */ + n = 0; + for (sendreq = server->qhead; sendreq; sendreq = sendreq->next) + n++; + + /* Allocate iovecs so we can send all our data at once. */ + vec = ares_malloc(n * sizeof(struct iovec)); + if (vec) + { + /* Fill in the iovecs and send. */ + n = 0; + for (sendreq = server->qhead; sendreq; sendreq = sendreq->next) + { + vec[n].iov_base = (char *) sendreq->data; + vec[n].iov_len = sendreq->len; + n++; + } + wcount = socket_writev(channel, server->tcp_socket, vec, (int)n); + ares_free(vec); + if (wcount < 0) + { + if (!try_again(SOCKERRNO)) + handle_error(channel, i, now); + continue; + } + + /* Advance the send queue by as many bytes as we sent. */ + advance_tcp_send_queue(channel, i, wcount); + } + else + { + /* Can't allocate iovecs; just send the first request. */ + sendreq = server->qhead; + + scount = socket_write(channel, server->tcp_socket, sendreq->data, sendreq->len); + if (scount < 0) + { + if (!try_again(SOCKERRNO)) + handle_error(channel, i, now); + continue; + } + + /* Advance the send queue by as many bytes as we sent. */ + advance_tcp_send_queue(channel, i, scount); + } + } +} + +/* Consume the given number of bytes from the head of the TCP send queue. */ +static void advance_tcp_send_queue(ares_channel channel, int whichserver, + ssize_t num_bytes) +{ + struct send_request *sendreq; + struct server_state *server = &channel->servers[whichserver]; + while (num_bytes > 0) { + sendreq = server->qhead; + if ((size_t)num_bytes >= sendreq->len) { + num_bytes -= sendreq->len; + server->qhead = sendreq->next; + if (sendreq->data_storage) + ares_free(sendreq->data_storage); + ares_free(sendreq); + if (server->qhead == NULL) { + SOCK_STATE_CALLBACK(channel, server->tcp_socket, 1, 0); + server->qtail = NULL; + + /* qhead is NULL so we cannot continue this loop */ + break; + } + } + else { + sendreq->data += num_bytes; + sendreq->len -= num_bytes; + num_bytes = 0; + } + } +} + +static ssize_t socket_recvfrom(ares_channel channel, + ares_socket_t s, + void * data, + size_t data_len, + int flags, + struct sockaddr *from, + socklen_t *from_len) +{ + if (channel->sock_funcs) + return channel->sock_funcs->arecvfrom(s, data, data_len, + flags, from, from_len, + channel->sock_func_cb_data); + +#ifdef HAVE_RECVFROM + return recvfrom(s, data, data_len, flags, from, from_len); +#else + return sread(s, data, data_len); +#endif +} + +static ssize_t socket_recv(ares_channel channel, + ares_socket_t s, + void * data, + size_t data_len) +{ + if (channel->sock_funcs) + return channel->sock_funcs->arecvfrom(s, data, data_len, 0, 0, 0, + channel->sock_func_cb_data); + + return sread(s, data, data_len); +} + +/* If any TCP socket selects true for reading, read some data, + * allocate a buffer if we finish reading the length word, and process + * a packet if we finish reading one. 
+ */ +static void read_tcp_data(ares_channel channel, fd_set *read_fds, + ares_socket_t read_fd, struct timeval *now) +{ + struct server_state *server; + int i; + ssize_t count; + + if(!read_fds && (read_fd == ARES_SOCKET_BAD)) + /* no possible action */ + return; + + for (i = 0; i < channel->nservers; i++) + { + /* Make sure the server has a socket and is selected in read_fds. */ + server = &channel->servers[i]; + if (server->tcp_socket == ARES_SOCKET_BAD || server->is_broken) + continue; + + if(read_fds) { + if(!FD_ISSET(server->tcp_socket, read_fds)) + continue; + } + else { + if(server->tcp_socket != read_fd) + continue; + } + + if(read_fds) + /* If there's an error and we close this socket, then open another + * with the same fd to talk to another server, then we don't want to + * think that it was the new socket that was ready. This is not + * disastrous, but is likely to result in extra system calls and + * confusion. */ + FD_CLR(server->tcp_socket, read_fds); + + if (server->tcp_lenbuf_pos != 2) + { + /* We haven't yet read a length word, so read that (or + * what's left to read of it). + */ + count = socket_recv(channel, server->tcp_socket, + server->tcp_lenbuf + server->tcp_lenbuf_pos, + 2 - server->tcp_lenbuf_pos); + if (count <= 0) + { + if (!(count == -1 && try_again(SOCKERRNO))) + handle_error(channel, i, now); + continue; + } + + server->tcp_lenbuf_pos += (int)count; + if (server->tcp_lenbuf_pos == 2) + { + /* We finished reading the length word. Decode the + * length and allocate a buffer for the data. + */ + server->tcp_length = server->tcp_lenbuf[0] << 8 + | server->tcp_lenbuf[1]; + server->tcp_buffer = ares_malloc(server->tcp_length); + if (!server->tcp_buffer) { + handle_error(channel, i, now); + return; /* bail out on malloc failure. TODO: make this + function return error codes */ + } + server->tcp_buffer_pos = 0; + } + } + else + { + /* Read data into the allocated buffer. */ + count = socket_recv(channel, server->tcp_socket, + server->tcp_buffer + server->tcp_buffer_pos, + server->tcp_length - server->tcp_buffer_pos); + if (count <= 0) + { + if (!(count == -1 && try_again(SOCKERRNO))) + handle_error(channel, i, now); + continue; + } + + server->tcp_buffer_pos += (int)count; + if (server->tcp_buffer_pos == server->tcp_length) + { + /* We finished reading this answer; process it and + * prepare to read another length word. + */ + process_answer(channel, server->tcp_buffer, server->tcp_length, + i, 1, now); + ares_free(server->tcp_buffer); + server->tcp_buffer = NULL; + server->tcp_lenbuf_pos = 0; + server->tcp_buffer_pos = 0; + } + } + } +} + +/* If any UDP sockets select true for reading, process them. */ +static void read_udp_packets(ares_channel channel, fd_set *read_fds, + ares_socket_t read_fd, struct timeval *now) +{ + struct server_state *server; + int i; + ssize_t count; + unsigned char buf[MAXENDSSZ + 1]; +#ifdef HAVE_RECVFROM + ares_socklen_t fromlen; + union { + struct sockaddr sa; + struct sockaddr_in sa4; + struct sockaddr_in6 sa6; + } from; +#endif + + if(!read_fds && (read_fd == ARES_SOCKET_BAD)) + /* no possible action */ + return; + + for (i = 0; i < channel->nservers; i++) + { + /* Make sure the server has a socket and is selected in read_fds. 
*/ + server = &channel->servers[i]; + + if (server->udp_socket == ARES_SOCKET_BAD || server->is_broken) + continue; + + if(read_fds) { + if(!FD_ISSET(server->udp_socket, read_fds)) + continue; + } + else { + if(server->udp_socket != read_fd) + continue; + } + + if(read_fds) + /* If there's an error and we close this socket, then open + * another with the same fd to talk to another server, then we + * don't want to think that it was the new socket that was + * ready. This is not disastrous, but is likely to result in + * extra system calls and confusion. */ + FD_CLR(server->udp_socket, read_fds); + + /* To reduce event loop overhead, read and process as many + * packets as we can. */ + do { + if (server->udp_socket == ARES_SOCKET_BAD) + count = 0; + + else { + if (server->addr.family == AF_INET) + fromlen = sizeof(from.sa4); + else + fromlen = sizeof(from.sa6); + count = socket_recvfrom(channel, server->udp_socket, (void *)buf, + sizeof(buf), 0, &from.sa, &fromlen); + } + + if (count == -1 && try_again(SOCKERRNO)) + continue; + else if (count <= 0) + handle_error(channel, i, now); +#ifdef HAVE_RECVFROM + else if (!same_address(&from.sa, &server->addr)) + /* The address the response comes from does not match the address we + * sent the request to. Someone may be attempting to perform a cache + * poisoning attack. */ + break; +#endif + else + process_answer(channel, buf, (int)count, i, 0, now); + } while (count > 0); + } +} + +/* If any queries have timed out, note the timeout and move them on. */ +static void process_timeouts(ares_channel channel, struct timeval *now) +{ + time_t t; /* the time of the timeouts we're processing */ + struct query *query; + struct list_node* list_head; + struct list_node* list_node; + + /* Process all the timeouts that have fired since the last time we processed + * timeouts. If things are going well, then we'll have hundreds/thousands of + * queries that fall into future buckets, and only a handful of requests + * that fall into the "now" bucket, so this should be quite quick. + */ + for (t = channel->last_timeout_processed; t <= now->tv_sec; t++) + { + list_head = &(channel->queries_by_timeout[t % ARES_TIMEOUT_TABLE_SIZE]); + for (list_node = list_head->next; list_node != list_head; ) + { + query = list_node->data; + list_node = list_node->next; /* in case the query gets deleted */ + if (query->timeout.tv_sec && ares__timedout(now, &query->timeout)) + { + query->error_status = ARES_ETIMEOUT; + ++query->timeouts; + next_server(channel, query, now); + } + } + } + channel->last_timeout_processed = now->tv_sec; +} + +/* Handle an answer from a server. */ +static void process_answer(ares_channel channel, unsigned char *abuf, + int alen, int whichserver, int tcp, + struct timeval *now) +{ + int tc, rcode, packetsz; + unsigned short id; + struct query *query; + struct list_node* list_head; + struct list_node* list_node; + + /* If there's no room in the answer for a header, we can't do much + * with it. */ + if (alen < HFIXEDSZ) + return; + + /* Grab the query ID, truncate bit, and response code from the packet. */ + id = DNS_HEADER_QID(abuf); + tc = DNS_HEADER_TC(abuf); + rcode = DNS_HEADER_RCODE(abuf); + + /* Find the query corresponding to this packet. The queries are + * hashed/bucketed by query id, so this lookup should be quick. Note that + * both the query id and the questions must be the same; when the query id + * wraps around we can have multiple outstanding queries with the same query + * id, so we need to check both the id and question. 
+ */ + query = NULL; + list_head = &(channel->queries_by_qid[id % ARES_QID_TABLE_SIZE]); + for (list_node = list_head->next; list_node != list_head; + list_node = list_node->next) + { + struct query *q = list_node->data; + if ((q->qid == id) && same_questions(q->qbuf, q->qlen, abuf, alen)) + { + query = q; + break; + } + } + if (!query) + return; + + packetsz = PACKETSZ; + /* If we use EDNS and server answers with one of these RCODES, the protocol + * extension is not understood by the responder. We must retry the query + * without EDNS enabled. + */ + if (channel->flags & ARES_FLAG_EDNS) + { + packetsz = channel->ednspsz; + if (rcode == NOTIMP || rcode == FORMERR || rcode == SERVFAIL) + { + int qlen = (query->tcplen - 2) - EDNSFIXEDSZ; + channel->flags ^= ARES_FLAG_EDNS; + query->tcplen -= EDNSFIXEDSZ; + query->qlen -= EDNSFIXEDSZ; + query->tcpbuf[0] = (unsigned char)((qlen >> 8) & 0xff); + query->tcpbuf[1] = (unsigned char)(qlen & 0xff); + DNS_HEADER_SET_ARCOUNT(query->tcpbuf + 2, 0); + query->tcpbuf = ares_realloc(query->tcpbuf, query->tcplen); + query->qbuf = query->tcpbuf + 2; + ares__send_query(channel, query, now); + return; + } + } + + /* If we got a truncated UDP packet and are not ignoring truncation, + * don't accept the packet, and switch the query to TCP if we hadn't + * done so already. + */ + if ((tc || alen > packetsz) && !tcp && !(channel->flags & ARES_FLAG_IGNTC)) + { + if (!query->using_tcp) + { + query->using_tcp = 1; + ares__send_query(channel, query, now); + } + return; + } + + /* Limit alen to PACKETSZ if we aren't using TCP (only relevant if we + * are ignoring truncation. + */ + if (alen > packetsz && !tcp) + alen = packetsz; + + /* If we aren't passing through all error packets, discard packets + * with SERVFAIL, NOTIMP, or REFUSED response codes. + */ + if (!(channel->flags & ARES_FLAG_NOCHECKRESP)) + { + if (rcode == SERVFAIL || rcode == NOTIMP || rcode == REFUSED) + { + skip_server(channel, query, whichserver); + if (query->server == whichserver) + next_server(channel, query, now); + return; + } + } + + end_query(channel, query, ARES_SUCCESS, abuf, alen); +} + +/* Close all the connections that are no longer usable. */ +static void process_broken_connections(ares_channel channel, + struct timeval *now) +{ + int i; + for (i = 0; i < channel->nservers; i++) + { + struct server_state *server = &channel->servers[i]; + if (server->is_broken) + { + handle_error(channel, i, now); + } + } +} + +/* Swap the contents of two lists */ +static void swap_lists(struct list_node* head_a, + struct list_node* head_b) +{ + int is_a_empty = ares__is_list_empty(head_a); + int is_b_empty = ares__is_list_empty(head_b); + struct list_node old_a = *head_a; + struct list_node old_b = *head_b; + + if (is_a_empty) { + ares__init_list_head(head_b); + } else { + *head_b = old_a; + old_a.next->prev = head_b; + old_a.prev->next = head_b; + } + if (is_b_empty) { + ares__init_list_head(head_a); + } else { + *head_a = old_b; + old_b.next->prev = head_a; + old_b.prev->next = head_a; + } +} + +static void handle_error(ares_channel channel, int whichserver, + struct timeval *now) +{ + struct server_state *server; + struct query *query; + struct list_node list_head; + struct list_node* list_node; + + server = &channel->servers[whichserver]; + + /* Reset communications with this server. */ + ares__close_sockets(channel, server); + + /* Tell all queries talking to this server to move on and not try this + * server again. 
We steal the current list of queries that were in-flight to + * this server, since when we call next_server this can cause the queries to + * be re-sent to this server, which will re-insert these queries in that + * same server->queries_to_server list. + */ + ares__init_list_head(&list_head); + swap_lists(&list_head, &(server->queries_to_server)); + for (list_node = list_head.next; list_node != &list_head; ) + { + query = list_node->data; + list_node = list_node->next; /* in case the query gets deleted */ + assert(query->server == whichserver); + skip_server(channel, query, whichserver); + next_server(channel, query, now); + } + /* Each query should have removed itself from our temporary list as + * it re-sent itself or finished up... + */ + assert(ares__is_list_empty(&list_head)); +} + +static void skip_server(ares_channel channel, struct query *query, + int whichserver) +{ + /* The given server gave us problems with this query, so if we have the + * luxury of using other servers, then let's skip the potentially broken + * server and just use the others. If we only have one server and we need to + * retry then we should just go ahead and re-use that server, since it's our + * only hope; perhaps we just got unlucky, and retrying will work (eg, the + * server timed out our TCP connection just as we were sending another + * request). + */ + if (channel->nservers > 1) + { + query->server_info[whichserver].skip_server = 1; + } +} + +static void next_server(ares_channel channel, struct query *query, + struct timeval *now) +{ + /* We need to try each server channel->tries times. We have channel->nservers + * servers to try. In total, we need to do channel->nservers * channel->tries + * attempts. Use query->try to remember how many times we already attempted + * this query. Use modular arithmetic to find the next server to try. */ + while (++(query->try_count) < (channel->nservers * channel->tries)) + { + struct server_state *server; + + /* Move on to the next server. */ + query->server = (query->server + 1) % channel->nservers; + server = &channel->servers[query->server]; + + /* We don't want to use this server if (1) we decided this connection is + * broken, and thus about to be closed, (2) we've decided to skip this + * server because of earlier errors we encountered, or (3) we already + * sent this query over this exact connection. + */ + if (!server->is_broken && + !query->server_info[query->server].skip_server && + !(query->using_tcp && + (query->server_info[query->server].tcp_connection_generation == + server->tcp_connection_generation))) + { + ares__send_query(channel, query, now); + return; + } + + /* You might think that with TCP we only need one try. However, even + * when using TCP, servers can time-out our connection just as we're + * sending a request, or close our connection because they die, or never + * send us a reply because they get wedged or tickle a bug that drops + * our request. + */ + } + + /* If we are here, all attempts to perform query failed. */ + end_query(channel, query, query->error_status, NULL, 0); +} + +void ares__send_query(ares_channel channel, struct query *query, + struct timeval *now) +{ + struct send_request *sendreq; + struct server_state *server; + int timeplus; + + server = &channel->servers[query->server]; + if (query->using_tcp) + { + /* Make sure the TCP socket for this server is set up and queue + * a send request. 
+ */ + if (server->tcp_socket == ARES_SOCKET_BAD) + { + if (open_tcp_socket(channel, server) == -1) + { + skip_server(channel, query, query->server); + next_server(channel, query, now); + return; + } + } + sendreq = ares_malloc(sizeof(struct send_request)); + if (!sendreq) + { + end_query(channel, query, ARES_ENOMEM, NULL, 0); + return; + } + memset(sendreq, 0, sizeof(struct send_request)); + /* To make the common case fast, we avoid copies by using the query's + * tcpbuf for as long as the query is alive. In the rare case where the + * query ends while it's queued for transmission, then we give the + * sendreq its own copy of the request packet and put it in + * sendreq->data_storage. + */ + sendreq->data_storage = NULL; + sendreq->data = query->tcpbuf; + sendreq->len = query->tcplen; + sendreq->owner_query = query; + sendreq->next = NULL; + if (server->qtail) + server->qtail->next = sendreq; + else + { + SOCK_STATE_CALLBACK(channel, server->tcp_socket, 1, 1); + server->qhead = sendreq; + } + server->qtail = sendreq; + query->server_info[query->server].tcp_connection_generation = + server->tcp_connection_generation; + } + else + { + if (server->udp_socket == ARES_SOCKET_BAD) + { + if (open_udp_socket(channel, server) == -1) + { + skip_server(channel, query, query->server); + next_server(channel, query, now); + return; + } + } + if (socket_write(channel, server->udp_socket, query->qbuf, query->qlen) == -1) + { + /* FIXME: Handle EAGAIN here since it likely can happen. */ + skip_server(channel, query, query->server); + next_server(channel, query, now); + return; + } + } + timeplus = channel->timeout << (query->try_count / channel->nservers); + timeplus = (timeplus * (9 + (rand () & 7))) / 16; + query->timeout = *now; + timeadd(&query->timeout, timeplus); + /* Keep track of queries bucketed by timeout, so we can process + * timeout events quickly. + */ + ares__remove_from_list(&(query->queries_by_timeout)); + ares__insert_in_list( + &(query->queries_by_timeout), + &(channel->queries_by_timeout[query->timeout.tv_sec % + ARES_TIMEOUT_TABLE_SIZE])); + + /* Keep track of queries bucketed by server, so we can process server + * errors quickly. + */ + ares__remove_from_list(&(query->queries_to_server)); + ares__insert_in_list(&(query->queries_to_server), + &(server->queries_to_server)); +} + +/* + * setsocknonblock sets the given socket to either blocking or non-blocking + * mode based on the 'nonblock' boolean argument. This function is highly + * portable. + */ +static int setsocknonblock(ares_socket_t sockfd, /* operate on this */ + int nonblock /* TRUE or FALSE */) +{ +#if defined(USE_BLOCKING_SOCKETS) + + return 0; /* returns success */ + +#elif defined(HAVE_FCNTL_O_NONBLOCK) + + /* most recent unix versions */ + int flags; + flags = fcntl(sockfd, F_GETFL, 0); + if (FALSE != nonblock) + return fcntl(sockfd, F_SETFL, flags | O_NONBLOCK); + else + return fcntl(sockfd, F_SETFL, flags & (~O_NONBLOCK)); /* LCOV_EXCL_LINE */ + +#elif defined(HAVE_IOCTL_FIONBIO) + + /* older unix versions */ + int flags = nonblock ? 1 : 0; + return ioctl(sockfd, FIONBIO, &flags); + +#elif defined(HAVE_IOCTLSOCKET_FIONBIO) + +#ifdef WATT32 + char flags = nonblock ? 1 : 0; +#else + /* Windows */ + unsigned long flags = nonblock ? 1UL : 0UL; +#endif + return ioctlsocket(sockfd, FIONBIO, &flags); + +#elif defined(HAVE_IOCTLSOCKET_CAMEL_FIONBIO) + + /* Amiga */ + long flags = nonblock ? 
1L : 0L; + return IoctlSocket(sockfd, FIONBIO, flags); + +#elif defined(HAVE_SETSOCKOPT_SO_NONBLOCK) + + /* BeOS */ + long b = nonblock ? 1L : 0L; + return setsockopt(sockfd, SOL_SOCKET, SO_NONBLOCK, &b, sizeof(b)); + +#else +# error "no non-blocking method was found/used/set" +#endif +} + +static int configure_socket(ares_socket_t s, int family, ares_channel channel) +{ + union { + struct sockaddr sa; + struct sockaddr_in sa4; + struct sockaddr_in6 sa6; + } local; + + /* do not set options for user-managed sockets */ + if (channel->sock_funcs) + return 0; + + (void)setsocknonblock(s, TRUE); + +#if defined(FD_CLOEXEC) && !defined(MSDOS) + /* Configure the socket fd as close-on-exec. */ + if (fcntl(s, F_SETFD, FD_CLOEXEC) == -1) + return -1; /* LCOV_EXCL_LINE */ +#endif + + /* Set the socket's send and receive buffer sizes. */ + if ((channel->socket_send_buffer_size > 0) && + setsockopt(s, SOL_SOCKET, SO_SNDBUF, + (void *)&channel->socket_send_buffer_size, + sizeof(channel->socket_send_buffer_size)) == -1) + return -1; + + if ((channel->socket_receive_buffer_size > 0) && + setsockopt(s, SOL_SOCKET, SO_RCVBUF, + (void *)&channel->socket_receive_buffer_size, + sizeof(channel->socket_receive_buffer_size)) == -1) + return -1; + +#ifdef SO_BINDTODEVICE + if (channel->local_dev_name[0]) { + if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, + channel->local_dev_name, sizeof(channel->local_dev_name))) { + /* Only root can do this, and usually not fatal if it doesn't work, so */ + /* just continue on. */ + } + } +#endif + + if (family == AF_INET) { + if (channel->local_ip4) { + memset(&local.sa4, 0, sizeof(local.sa4)); + local.sa4.sin_family = AF_INET; + local.sa4.sin_addr.s_addr = htonl(channel->local_ip4); + if (bind(s, &local.sa, sizeof(local.sa4)) < 0) + return -1; + } + } + else if (family == AF_INET6) { + if (memcmp(channel->local_ip6, &ares_in6addr_any, + sizeof(channel->local_ip6)) != 0) { + memset(&local.sa6, 0, sizeof(local.sa6)); + local.sa6.sin6_family = AF_INET6; + memcpy(&local.sa6.sin6_addr, channel->local_ip6, + sizeof(channel->local_ip6)); + if (bind(s, &local.sa, sizeof(local.sa6)) < 0) + return -1; + } + } + + return 0; +} + +static int open_socket(ares_channel channel, int af, int type, int protocol) +{ + if (channel->sock_funcs != 0) + return channel->sock_funcs->asocket(af, + type, + protocol, + channel->sock_func_cb_data); + + return socket(af, type, protocol); +} + +static int connect_socket(ares_channel channel, ares_socket_t sockfd, + const struct sockaddr * addr, + socklen_t addrlen) +{ + if (channel->sock_funcs != 0) + return channel->sock_funcs->aconnect(sockfd, + addr, + addrlen, + channel->sock_func_cb_data); + + return connect(sockfd, addr, addrlen); +} + +static int open_tcp_socket(ares_channel channel, struct server_state *server) +{ + ares_socket_t s; + int opt; + ares_socklen_t salen; + union { + struct sockaddr_in sa4; + struct sockaddr_in6 sa6; + } saddr; + struct sockaddr *sa; + + switch (server->addr.family) + { + case AF_INET: + sa = (void *)&saddr.sa4; + salen = sizeof(saddr.sa4); + memset(sa, 0, salen); + saddr.sa4.sin_family = AF_INET; + if (server->addr.tcp_port) { + saddr.sa4.sin_port = aresx_sitous(server->addr.tcp_port); + } else { + saddr.sa4.sin_port = aresx_sitous(channel->tcp_port); + } + memcpy(&saddr.sa4.sin_addr, &server->addr.addrV4, + sizeof(server->addr.addrV4)); + break; + case AF_INET6: + sa = (void *)&saddr.sa6; + salen = sizeof(saddr.sa6); + memset(sa, 0, salen); + saddr.sa6.sin6_family = AF_INET6; + if (server->addr.tcp_port) { + 
saddr.sa6.sin6_port = aresx_sitous(server->addr.tcp_port); + } else { + saddr.sa6.sin6_port = aresx_sitous(channel->tcp_port); + } + memcpy(&saddr.sa6.sin6_addr, &server->addr.addrV6, + sizeof(server->addr.addrV6)); + break; + default: + return -1; /* LCOV_EXCL_LINE */ + } + + /* Acquire a socket. */ + s = open_socket(channel, server->addr.family, SOCK_STREAM, 0); + if (s == ARES_SOCKET_BAD) + return -1; + + /* Configure it. */ + if (configure_socket(s, server->addr.family, channel) < 0) + { + ares__socket_close(channel, s); + return -1; + } + +#ifdef TCP_NODELAY + /* + * Disable the Nagle algorithm (only relevant for TCP sockets, and thus not + * in configure_socket). In general, in DNS lookups we're pretty much + * interested in firing off a single request and then waiting for a reply, + * so batching isn't very interesting. + */ + opt = 1; + if (channel->sock_funcs == 0 + && + setsockopt(s, IPPROTO_TCP, TCP_NODELAY, + (void *)&opt, sizeof(opt)) == -1) + { + ares__socket_close(channel, s); + return -1; + } +#endif + + if (channel->sock_config_cb) + { + int err = channel->sock_config_cb(s, SOCK_STREAM, + channel->sock_config_cb_data); + if (err < 0) + { + ares__socket_close(channel, s); + return err; + } + } + + /* Connect to the server. */ + if (connect_socket(channel, s, sa, salen) == -1) + { + int err = SOCKERRNO; + + if (err != EINPROGRESS && err != EWOULDBLOCK) + { + ares__socket_close(channel, s); + return -1; + } + } + + if (channel->sock_create_cb) + { + int err = channel->sock_create_cb(s, SOCK_STREAM, + channel->sock_create_cb_data); + if (err < 0) + { + ares__socket_close(channel, s); + return err; + } + } + + SOCK_STATE_CALLBACK(channel, s, 1, 0); + server->tcp_buffer_pos = 0; + server->tcp_socket = s; + server->tcp_connection_generation = ++channel->tcp_connection_generation; + return 0; +} + +static int open_udp_socket(ares_channel channel, struct server_state *server) +{ + ares_socket_t s; + ares_socklen_t salen; + union { + struct sockaddr_in sa4; + struct sockaddr_in6 sa6; + } saddr; + struct sockaddr *sa; + + switch (server->addr.family) + { + case AF_INET: + sa = (void *)&saddr.sa4; + salen = sizeof(saddr.sa4); + memset(sa, 0, salen); + saddr.sa4.sin_family = AF_INET; + if (server->addr.udp_port) { + saddr.sa4.sin_port = aresx_sitous(server->addr.udp_port); + } else { + saddr.sa4.sin_port = aresx_sitous(channel->udp_port); + } + memcpy(&saddr.sa4.sin_addr, &server->addr.addrV4, + sizeof(server->addr.addrV4)); + break; + case AF_INET6: + sa = (void *)&saddr.sa6; + salen = sizeof(saddr.sa6); + memset(sa, 0, salen); + saddr.sa6.sin6_family = AF_INET6; + if (server->addr.udp_port) { + saddr.sa6.sin6_port = aresx_sitous(server->addr.udp_port); + } else { + saddr.sa6.sin6_port = aresx_sitous(channel->udp_port); + } + memcpy(&saddr.sa6.sin6_addr, &server->addr.addrV6, + sizeof(server->addr.addrV6)); + break; + default: + return -1; /* LCOV_EXCL_LINE */ + } + + /* Acquire a socket. */ + s = open_socket(channel, server->addr.family, SOCK_DGRAM, 0); + if (s == ARES_SOCKET_BAD) + return -1; + + /* Set the socket non-blocking. */ + if (configure_socket(s, server->addr.family, channel) < 0) + { + ares__socket_close(channel, s); + return -1; + } + + if (channel->sock_config_cb) + { + int err = channel->sock_config_cb(s, SOCK_DGRAM, + channel->sock_config_cb_data); + if (err < 0) + { + ares__socket_close(channel, s); + return err; + } + } + + /* Connect to the server. 
*/ + if (connect_socket(channel, s, sa, salen) == -1) + { + int err = SOCKERRNO; + + if (err != EINPROGRESS && err != EWOULDBLOCK) + { + ares__socket_close(channel, s); + return -1; + } + } + + if (channel->sock_create_cb) + { + int err = channel->sock_create_cb(s, SOCK_DGRAM, + channel->sock_create_cb_data); + if (err < 0) + { + ares__socket_close(channel, s); + return err; + } + } + + SOCK_STATE_CALLBACK(channel, s, 1, 0); + + server->udp_socket = s; + return 0; +} + +static int same_questions(const unsigned char *qbuf, int qlen, + const unsigned char *abuf, int alen) +{ + struct { + const unsigned char *p; + int qdcount; + char *name; + long namelen; + int type; + int dnsclass; + } q, a; + int i, j; + + if (qlen < HFIXEDSZ || alen < HFIXEDSZ) + return 0; + + /* Extract qdcount from the request and reply buffers and compare them. */ + q.qdcount = DNS_HEADER_QDCOUNT(qbuf); + a.qdcount = DNS_HEADER_QDCOUNT(abuf); + if (q.qdcount != a.qdcount) + return 0; + + /* For each question in qbuf, find it in abuf. */ + q.p = qbuf + HFIXEDSZ; + for (i = 0; i < q.qdcount; i++) + { + /* Decode the question in the query. */ + if (ares_expand_name(q.p, qbuf, qlen, &q.name, &q.namelen) + != ARES_SUCCESS) + return 0; + q.p += q.namelen; + if (q.p + QFIXEDSZ > qbuf + qlen) + { + ares_free(q.name); + return 0; + } + q.type = DNS_QUESTION_TYPE(q.p); + q.dnsclass = DNS_QUESTION_CLASS(q.p); + q.p += QFIXEDSZ; + + /* Search for this question in the answer. */ + a.p = abuf + HFIXEDSZ; + for (j = 0; j < a.qdcount; j++) + { + /* Decode the question in the answer. */ + if (ares_expand_name(a.p, abuf, alen, &a.name, &a.namelen) + != ARES_SUCCESS) + { + ares_free(q.name); + return 0; + } + a.p += a.namelen; + if (a.p + QFIXEDSZ > abuf + alen) + { + ares_free(q.name); + ares_free(a.name); + return 0; + } + a.type = DNS_QUESTION_TYPE(a.p); + a.dnsclass = DNS_QUESTION_CLASS(a.p); + a.p += QFIXEDSZ; + + /* Compare the decoded questions. */ + if (strcasecmp(q.name, a.name) == 0 && q.type == a.type + && q.dnsclass == a.dnsclass) + { + ares_free(a.name); + break; + } + ares_free(a.name); + } + + ares_free(q.name); + if (j == a.qdcount) + return 0; + } + return 1; +} + +static int same_address(struct sockaddr *sa, struct ares_addr *aa) +{ + void *addr1; + void *addr2; + + if (sa->sa_family == aa->family) + { + switch (aa->family) + { + case AF_INET: + addr1 = &aa->addrV4; + addr2 = &((struct sockaddr_in *)sa)->sin_addr; + if (memcmp(addr1, addr2, sizeof(aa->addrV4)) == 0) + return 1; /* match */ + break; + case AF_INET6: + addr1 = &aa->addrV6; + addr2 = &((struct sockaddr_in6 *)sa)->sin6_addr; + if (memcmp(addr1, addr2, sizeof(aa->addrV6)) == 0) + return 1; /* match */ + break; + default: + break; /* LCOV_EXCL_LINE */ + } + } + return 0; /* different */ +} + +static void end_query (ares_channel channel, struct query *query, int status, + unsigned char *abuf, int alen) +{ + int i; + + /* First we check to see if this query ended while one of our send + * queues still has pointers to it. + */ + for (i = 0; i < channel->nservers; i++) + { + struct server_state *server = &channel->servers[i]; + struct send_request *sendreq; + for (sendreq = server->qhead; sendreq; sendreq = sendreq->next) + if (sendreq->owner_query == query) + { + sendreq->owner_query = NULL; + assert(sendreq->data_storage == NULL); + if (status == ARES_SUCCESS) + { + /* We got a reply for this query, but this queued sendreq + * points into this soon-to-be-gone query's tcpbuf. 
Probably + * this means we timed out and queued the query for + * retransmission, then received a response before actually + * retransmitting. This is perfectly fine, so we want to keep + * the connection running smoothly if we can. But in the worst + * case we may have sent only some prefix of the query, with + * some suffix of the query left to send. Also, the buffer may + * be queued on multiple queues. To prevent dangling pointers + * to the query's tcpbuf and handle these cases, we just give + * such sendreqs their own copy of the query packet. + */ + sendreq->data_storage = ares_malloc(sendreq->len); + if (sendreq->data_storage != NULL) + { + memcpy(sendreq->data_storage, sendreq->data, sendreq->len); + sendreq->data = sendreq->data_storage; + } + } + if ((status != ARES_SUCCESS) || (sendreq->data_storage == NULL)) + { + /* We encountered an error (probably a timeout, suggesting the + * DNS server we're talking to is probably unreachable, + * wedged, or severely overloaded) or we couldn't copy the + * request, so mark the connection as broken. When we get to + * process_broken_connections() we'll close the connection and + * try to re-send requests to another server. + */ + server->is_broken = 1; + /* Just to be paranoid, zero out this sendreq... */ + sendreq->data = NULL; + sendreq->len = 0; + } + } + } + + /* Invoke the callback */ + query->callback(query->arg, status, query->timeouts, abuf, alen); + ares__free_query(query); + + /* Simple cleanup policy: if no queries are remaining, close all network + * sockets unless STAYOPEN is set. + */ + if (!(channel->flags & ARES_FLAG_STAYOPEN) && + ares__is_list_empty(&(channel->all_queries))) + { + for (i = 0; i < channel->nservers; i++) + ares__close_sockets(channel, &channel->servers[i]); + } +} + +void ares__free_query(struct query *query) +{ + /* Remove the query from all the lists in which it is linked */ + ares__remove_from_list(&(query->queries_by_qid)); + ares__remove_from_list(&(query->queries_by_timeout)); + ares__remove_from_list(&(query->queries_to_server)); + ares__remove_from_list(&(query->all_queries)); + /* Zero out some important stuff, to help catch bugs */ + query->callback = NULL; + query->arg = NULL; + /* Deallocate the memory associated with the query */ + ares_free(query->tcpbuf); + ares_free(query->server_info); + ares_free(query); +} + +void ares__socket_close(ares_channel channel, ares_socket_t s) +{ + if (channel->sock_funcs) + channel->sock_funcs->aclose(s, channel->sock_func_cb_data); + else + sclose(s); +} |
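
Usage note (not part of the diff): ares_process() above expects the caller to say which descriptors are readable or writable. Below is a minimal sketch of the usual way to drive it from a select() loop with the public ares_fds() and ares_timeout() helpers; the wait_for_queries() name and the bare-bones error handling are illustrative assumptions, not c-ares code.

/* Sketch: drive ares_process() until c-ares has no sockets left open,
 * i.e. all outstanding queries have completed or failed.
 * Assumes an already-initialized ares_channel. */
#include <ares.h>
#include <sys/select.h>

static void wait_for_queries(ares_channel channel)   /* hypothetical helper */
{
  for (;;)
    {
      fd_set read_fds, write_fds;
      struct timeval tv, *tvp;
      int nfds;

      FD_ZERO(&read_fds);
      FD_ZERO(&write_fds);
      nfds = ares_fds(channel, &read_fds, &write_fds);
      if (nfds == 0)
        break;                            /* no sockets: nothing pending */

      tvp = ares_timeout(channel, NULL, &tv);   /* earliest query timeout */
      if (select(nfds, &read_fds, &write_fds, NULL, tvp) < 0)
        break;                            /* real code would inspect errno */

      /* Readiness and/or timeout: read answers, flush the TCP send queues,
       * and expire timed-out queries (process_timeouts above). */
      ares_process(channel, &read_fds, &write_fds);
    }
}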
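
For reference, the per-attempt timeout computed in ares__send_query() doubles once per full pass over the server list and is then scaled down by a random jitter factor. A worked example, assuming channel->timeout = 5000 ms and channel->nservers = 2 (both values are assumptions chosen only for illustration):

/* timeplus = channel->timeout << (query->try_count / channel->nservers)
 *   try_count 0,1 -> 5000 << 0 = 5000 ms
 *   try_count 2,3 -> 5000 << 1 = 10000 ms
 *   try_count 4,5 -> 5000 << 2 = 20000 ms
 * timeplus = (timeplus * (9 + (rand() & 7))) / 16
 *   scales each base by a random factor between 9/16 (~56%) and 16/16 (100%),
 *   spreading retransmissions so many clients do not retry in lockstep. */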