diff options
Diffstat (limited to '')
-rw-r--r-- | src/dns.c | 1350 |
1 files changed, 1350 insertions, 0 deletions
diff --git a/src/dns.c b/src/dns.c new file mode 100644 index 0000000..8bf8dcd --- /dev/null +++ b/src/dns.c @@ -0,0 +1,1350 @@ +/* + * Name server resolution + * + * Copyright 2020 HAProxy Technologies + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/types.h> + +#include <haproxy/action.h> +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/cfgparse.h> +#include <haproxy/channel.h> +#include <haproxy/check.h> +#include <haproxy/cli.h> +#include <haproxy/dgram.h> +#include <haproxy/dns.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/log.h> +#include <haproxy/ring.h> +#include <haproxy/sc_strm.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/tools.h> + +static THREAD_LOCAL char *dns_msg_trash; + +DECLARE_STATIC_POOL(dns_session_pool, "dns_session", sizeof(struct dns_session)); +DECLARE_STATIC_POOL(dns_query_pool, "dns_query", sizeof(struct dns_query)); +DECLARE_STATIC_POOL(dns_msg_buf, "dns_msg_buf", DNS_TCP_MSG_RING_MAX_SIZE); + +/* Opens an UDP socket on the namesaver's IP/Port, if required. Returns 0 on + * success, -1 otherwise. ns->dgram must be defined. + */ +static int dns_connect_nameserver(struct dns_nameserver *ns) +{ + struct dgram_conn *dgram = &ns->dgram->conn; + int fd; + + /* Already connected */ + if (dgram->t.sock.fd != -1) + return 0; + + /* Create an UDP socket and connect it on the nameserver's IP/Port */ + if ((fd = socket(dgram->addr.to.ss_family, SOCK_DGRAM, IPPROTO_UDP)) == -1) { + send_log(NULL, LOG_WARNING, + "DNS : section '%s': can't create socket for nameserver '%s'.\n", + ns->counters->pid, ns->id); + return -1; + } + if (connect(fd, (struct sockaddr*)&dgram->addr.to, get_addr_len(&dgram->addr.to)) == -1) { + send_log(NULL, LOG_WARNING, + "DNS : section '%s': can't connect socket for nameserver '%s'.\n", + ns->counters->id, ns->id); + close(fd); + return -1; + } + + /* Make the socket non blocking */ + fd_set_nonblock(fd); + + /* Add the fd in the fd list and update its parameters */ + dgram->t.sock.fd = fd; + fd_insert(fd, dgram, dgram_fd_handler, MAX_THREADS_MASK); + fd_want_recv(fd); + return 0; +} + +/* Sends a message to a name server + * It returns message length on success + * or -1 in error case + * 0 is returned in case of output ring buffer is full + */ +int dns_send_nameserver(struct dns_nameserver *ns, void *buf, size_t len) +{ + int ret = -1; + + if (ns->dgram) { + struct dgram_conn *dgram = &ns->dgram->conn; + int fd; + + HA_SPIN_LOCK(DNS_LOCK, &dgram->lock); + fd = dgram->t.sock.fd; + if (fd == -1) { + if (dns_connect_nameserver(ns) == -1) { + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + return -1; + } + fd = dgram->t.sock.fd; + } + + ret = send(fd, buf, len, 0); + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + struct ist myist; + + myist = ist2(buf, len); + ret = ring_write(ns->dgram->ring_req, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1); + if (!ret) { + ns->counters->snd_error++; + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + return -1; + } + fd_cant_send(fd); + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + return ret; + } + ns->counters->snd_error++; + fd_delete(fd); + dgram->t.sock.fd = -1; + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + return -1; + } + ns->counters->sent++; + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + } + else if (ns->stream) { + struct ist myist; + + myist = ist2(buf, len); + ret = ring_write(ns->stream->ring_req, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1); + if (!ret) { + ns->counters->snd_error++; + return -1; + } + task_wakeup(ns->stream->task_req, TASK_WOKEN_MSG); + return ret; + } + + return ret; +} + +void dns_session_free(struct dns_session *); + +/* Receives a dns message + * Returns message length + * 0 is returned if no more message available + * -1 in error case + */ +ssize_t dns_recv_nameserver(struct dns_nameserver *ns, void *data, size_t size) +{ + ssize_t ret = -1; + + if (ns->dgram) { + struct dgram_conn *dgram = &ns->dgram->conn; + int fd; + + HA_SPIN_LOCK(DNS_LOCK, &dgram->lock); + fd = dgram->t.sock.fd; + if (fd == -1) { + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + return -1; + } + + if ((ret = recv(fd, data, size, 0)) < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + fd_cant_recv(fd); + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + return 0; + } + fd_delete(fd); + dgram->t.sock.fd = -1; + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + return -1; + } + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + } + else if (ns->stream) { + struct dns_stream_server *dss = ns->stream; + struct dns_session *ds; + + HA_SPIN_LOCK(DNS_LOCK, &dss->lock); + + if (!LIST_ISEMPTY(&dss->wait_sess)) { + ds = LIST_NEXT(&dss->wait_sess, struct dns_session *, waiter); + ret = ds->rx_msg.len < size ? ds->rx_msg.len : size; + memcpy(data, ds->rx_msg.area, ret); + + ds->rx_msg.len = 0; + + /* This barrier is here to ensure that all data is + * stored if the appctx detect the elem is out of the + * list. + */ + __ha_barrier_store(); + + LIST_DEL_INIT(&ds->waiter); + + if (ds->appctx) { + /* This second barrier is here to ensure that + * the waked up appctx won't miss that the elem + * is removed from the list. + */ + __ha_barrier_store(); + + /* awake appctx because it may have other + * message to receive + */ + appctx_wakeup(ds->appctx); + + /* dns_session could already be into free_sess list + * so we firstly remove it */ + LIST_DEL_INIT(&ds->list); + + /* decrease nb_queries to free a slot for a new query on that sess */ + ds->nb_queries--; + if (ds->nb_queries) { + /* it remains pipelined unanswered request + * into this session but we just decrease + * the counter so the session + * can not be full of pipelined requests + * so we can add if to free_sess list + * to receive a new request + */ + LIST_INSERT(&ds->dss->free_sess, &ds->list); + } + else { + /* there is no more pipelined requests + * into this session, so we move it + * to idle_sess list */ + LIST_INSERT(&ds->dss->idle_sess, &ds->list); + + /* update the counter of idle sessions */ + ds->dss->idle_conns++; + + /* Note: this is useless there to update + * the max_active_conns since we increase + * the idle count */ + } + } + else { + /* there is no more appctx for this session + * it means it is ready to die + */ + dns_session_free(ds); + } + + + } + + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); + } + + return ret; +} + +static void dns_resolve_recv(struct dgram_conn *dgram) +{ + struct dns_nameserver *ns; + int fd; + + HA_SPIN_LOCK(DNS_LOCK, &dgram->lock); + + fd = dgram->t.sock.fd; + + /* check if ready for reading */ + if ((fd == -1) || !fd_recv_ready(fd)) { + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + return; + } + + /* no need to go further if we can't retrieve the nameserver */ + if ((ns = dgram->owner) == NULL) { + _HA_ATOMIC_AND(&fdtab[fd].state, ~(FD_POLL_HUP|FD_POLL_ERR)); + fd_stop_recv(fd); + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + return; + } + + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + + ns->process_responses(ns); +} + +/* Called when a dns network socket is ready to send data */ +static void dns_resolve_send(struct dgram_conn *dgram) +{ + int fd; + struct dns_nameserver *ns; + struct ring *ring; + struct buffer *buf; + uint64_t msg_len; + size_t len, cnt, ofs; + + HA_SPIN_LOCK(DNS_LOCK, &dgram->lock); + + fd = dgram->t.sock.fd; + + /* check if ready for sending */ + if ((fd == -1) || !fd_send_ready(fd)) { + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + return; + } + + /* no need to go further if we can't retrieve the nameserver */ + if ((ns = dgram->owner) == NULL) { + _HA_ATOMIC_AND(&fdtab[fd].state, ~(FD_POLL_HUP|FD_POLL_ERR)); + fd_stop_send(fd); + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + return; + } + + ring = ns->dgram->ring_req; + buf = &ring->buf; + + HA_RWLOCK_RDLOCK(DNS_LOCK, &ring->lock); + ofs = ns->dgram->ofs_req; + + /* explanation for the initialization below: it would be better to do + * this in the parsing function but this would occasionally result in + * dropped events because we'd take a reference on the oldest message + * and keep it while being scheduled. Thus instead let's take it the + * first time we enter here so that we have a chance to pass many + * existing messages before grabbing a reference to a location. This + * value cannot be produced after initialization. + */ + if (unlikely(ofs == ~0)) { + ofs = 0; + HA_ATOMIC_INC(b_peek(buf, ofs)); + ofs += ring->ofs; + } + + /* we were already there, adjust the offset to be relative to + * the buffer's head and remove us from the counter. + */ + ofs -= ring->ofs; + BUG_ON(ofs >= buf->size); + HA_ATOMIC_DEC(b_peek(buf, ofs)); + + while (ofs + 1 < b_data(buf)) { + int ret; + + cnt = 1; + len = b_peek_varint(buf, ofs + cnt, &msg_len); + if (!len) + break; + cnt += len; + BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf)); + if (unlikely(msg_len > DNS_TCP_MSG_MAX_SIZE)) { + /* too large a message to ever fit, let's skip it */ + ofs += cnt + msg_len; + continue; + } + + len = b_getblk(buf, dns_msg_trash, msg_len, ofs + cnt); + + ret = send(fd, dns_msg_trash, len, 0); + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + fd_cant_send(fd); + goto out; + } + ns->counters->snd_error++; + fd_delete(fd); + fd = dgram->t.sock.fd = -1; + goto out; + } + ns->counters->sent++; + + ofs += cnt + len; + } + + /* we don't want/need to be waked up any more for sending + * because all ring content is sent */ + fd_stop_send(fd); + +out: + + HA_ATOMIC_INC(b_peek(buf, ofs)); + ofs += ring->ofs; + ns->dgram->ofs_req = ofs; + HA_RWLOCK_RDUNLOCK(DNS_LOCK, &ring->lock); + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); + +} + +/* proto_udp callback functions for a DNS resolution */ +struct dgram_data_cb dns_dgram_cb = { + .recv = dns_resolve_recv, + .send = dns_resolve_send, +}; + +int dns_dgram_init(struct dns_nameserver *ns, struct sockaddr_storage *sk) +{ + struct dns_dgram_server *dgram; + + if ((dgram = calloc(1, sizeof(*dgram))) == NULL) + return -1; + + /* Leave dgram partially initialized, no FD attached for + * now. */ + dgram->conn.owner = ns; + dgram->conn.data = &dns_dgram_cb; + dgram->conn.t.sock.fd = -1; + dgram->conn.addr.to = *sk; + HA_SPIN_INIT(&dgram->conn.lock); + ns->dgram = dgram; + + dgram->ofs_req = ~0; /* init ring offset */ + dgram->ring_req = ring_new(2*DNS_TCP_MSG_RING_MAX_SIZE); + if (!dgram->ring_req) { + ha_alert("memory allocation error initializing the ring for nameserver.\n"); + goto out; + } + + /* attach the task as reader */ + if (!ring_attach(dgram->ring_req)) { + /* mark server attached to the ring */ + ha_alert("nameserver sets too many watchers > 255 on ring. This is a bug and should not happen.\n"); + goto out; + } + return 0; +out: + if (dgram->ring_req) + ring_free(dgram->ring_req); + + free(dgram); + + return -1; +} + +/* + * IO Handler to handle message push to dns tcp server + * It takes its context from appctx->svcctx. + */ +static void dns_session_io_handler(struct appctx *appctx) +{ + struct stconn *sc = appctx_sc(appctx); + struct dns_session *ds = appctx->svcctx; + struct ring *ring = &ds->ring; + struct buffer *buf = &ring->buf; + uint64_t msg_len; + int available_room; + size_t len, cnt, ofs; + int ret = 0; + + /* if stopping was requested, close immediately */ + if (unlikely(stopping)) + goto close; + + /* we want to be sure to not miss that we have been awaked for a shutdown */ + __ha_barrier_load(); + + /* that means the connection was requested to shutdown + * for instance idle expire */ + if (ds->shutdown) + goto close; + + /* an error was detected */ + if (unlikely(sc_ic(sc)->flags & (CF_WRITE_ERROR|CF_SHUTW))) + goto close; + + /* con closed by server side, we will skip data write and drain data from channel */ + if ((sc_oc(sc)->flags & CF_SHUTW)) { + goto read; + } + + /* if the connection is not established, inform the stream that we want + * to be notified whenever the connection completes. + */ + if (sc_opposite(sc)->state < SC_ST_EST) { + applet_need_more_data(appctx); + se_need_remote_conn(appctx->sedesc); + applet_have_more_data(appctx); + return; + } + + + ofs = ds->ofs; + + HA_RWLOCK_WRLOCK(DNS_LOCK, &ring->lock); + LIST_DEL_INIT(&appctx->wait_entry); + HA_RWLOCK_WRUNLOCK(DNS_LOCK, &ring->lock); + + HA_RWLOCK_RDLOCK(DNS_LOCK, &ring->lock); + + /* explanation for the initialization below: it would be better to do + * this in the parsing function but this would occasionally result in + * dropped events because we'd take a reference on the oldest message + * and keep it while being scheduled. Thus instead let's take it the + * first time we enter here so that we have a chance to pass many + * existing messages before grabbing a reference to a location. This + * value cannot be produced after initialization. + */ + if (unlikely(ofs == ~0)) { + ofs = 0; + + HA_ATOMIC_INC(b_peek(buf, ofs)); + ofs += ring->ofs; + } + + /* in this loop, ofs always points to the counter byte that precedes + * the message so that we can take our reference there if we have to + * stop before the end (ret=0). + */ + if (sc_opposite(sc)->state == SC_ST_EST) { + /* we were already there, adjust the offset to be relative to + * the buffer's head and remove us from the counter. + */ + ofs -= ring->ofs; + BUG_ON(ofs >= buf->size); + HA_ATOMIC_DEC(b_peek(buf, ofs)); + + ret = 1; + while (ofs + 1 < b_data(buf)) { + struct dns_query *query; + uint16_t original_qid; + uint16_t new_qid; + + cnt = 1; + len = b_peek_varint(buf, ofs + cnt, &msg_len); + if (!len) + break; + cnt += len; + BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf)); + + /* retrieve available room on output channel */ + available_room = channel_recv_max(sc_ic(sc)); + + /* tx_msg_offset null means we are at the start of a new message */ + if (!ds->tx_msg_offset) { + uint16_t slen; + + /* check if there is enough room to put message len and query id */ + if (available_room < sizeof(slen) + sizeof(new_qid)) { + sc_need_room(sc); + ret = 0; + break; + } + + /* put msg len into then channel */ + slen = (uint16_t)msg_len; + slen = htons(slen); + applet_putblk(appctx, (char *)&slen, sizeof(slen)); + available_room -= sizeof(slen); + + /* backup original query id */ + len = b_getblk(buf, (char *)&original_qid, sizeof(original_qid), ofs + cnt); + if (!len) { + /* should never happen since messages are atomically + * written into ring + */ + ret = 0; + break; + } + + /* generates new query id */ + new_qid = ++ds->query_counter; + new_qid = htons(new_qid); + + /* put new query id into the channel */ + applet_putblk(appctx, (char *)&new_qid, sizeof(new_qid)); + available_room -= sizeof(new_qid); + + /* keep query id mapping */ + + query = pool_alloc(dns_query_pool); + if (query) { + query->qid.key = new_qid; + query->original_qid = original_qid; + query->expire = tick_add(now_ms, 5000); + LIST_INIT(&query->list); + if (LIST_ISEMPTY(&ds->queries)) { + /* enable task to handle expire */ + ds->task_exp->expire = query->expire; + /* ensure this will be executed by the same + * thread than ds_session_release + * to ensure session_release is free + * to destroy the task */ + task_queue(ds->task_exp); + } + LIST_APPEND(&ds->queries, &query->list); + eb32_insert(&ds->query_ids, &query->qid); + ds->onfly_queries++; + } + + /* update the tx_offset to handle output in 16k streams */ + ds->tx_msg_offset = sizeof(original_qid); + + } + + /* check if it remains available room on output chan */ + if (unlikely(!available_room)) { + sc_need_room(sc); + ret = 0; + break; + } + + chunk_reset(&trash); + if ((msg_len - ds->tx_msg_offset) > available_room) { + /* remaining msg data is too large to be written in output channel at one time */ + + len = b_getblk(buf, trash.area, available_room, ofs + cnt + ds->tx_msg_offset); + + /* update offset to complete mesg forwarding later */ + ds->tx_msg_offset += len; + } + else { + /* remaining msg data can be written in output channel at one time */ + len = b_getblk(buf, trash.area, msg_len - ds->tx_msg_offset, ofs + cnt + ds->tx_msg_offset); + + /* reset tx_msg_offset to mark forward fully processed */ + ds->tx_msg_offset = 0; + } + trash.data += len; + + if (applet_putchk(appctx, &trash) == -1) { + /* should never happen since we + * check available_room is large + * enough here. + */ + ret = 0; + break; + } + + if (ds->tx_msg_offset) { + /* msg was not fully processed, we must be awake to drain pending data */ + + sc_need_room(sc); + ret = 0; + break; + } + /* switch to next message */ + ofs += cnt + msg_len; + } + + HA_ATOMIC_INC(b_peek(buf, ofs)); + ofs += ring->ofs; + ds->ofs = ofs; + } + HA_RWLOCK_RDUNLOCK(DNS_LOCK, &ring->lock); + + if (ret) { + /* let's be woken up once new request to write arrived */ + HA_RWLOCK_WRLOCK(DNS_LOCK, &ring->lock); + BUG_ON(LIST_INLIST(&appctx->wait_entry)); + LIST_APPEND(&ring->waiters, &appctx->wait_entry); + HA_RWLOCK_WRUNLOCK(DNS_LOCK, &ring->lock); + applet_have_no_more_data(appctx); + } + +read: + + /* if session is not a waiter it means there is no committed + * message into rx_buf and we are free to use it + * Note: we need a load barrier here to not miss the + * delete from the list + */ + + __ha_barrier_load(); + if (!LIST_INLIST_ATOMIC(&ds->waiter)) { + while (1) { + uint16_t query_id; + struct eb32_node *eb; + struct dns_query *query; + + if (!ds->rx_msg.len) { + /* next message len is not fully available into the channel */ + if (co_data(sc_oc(sc)) < 2) + break; + + /* retrieve message len */ + co_getblk(sc_oc(sc), (char *)&msg_len, 2, 0); + + /* mark as consumed */ + co_skip(sc_oc(sc), 2); + + /* store message len */ + ds->rx_msg.len = ntohs(msg_len); + } + + if (!co_data(sc_oc(sc))) { + /* we need more data but nothing is available */ + break; + } + + if (co_data(sc_oc(sc)) + ds->rx_msg.offset < ds->rx_msg.len) { + /* message only partially available */ + + /* read available data */ + co_getblk(sc_oc(sc), ds->rx_msg.area + ds->rx_msg.offset, co_data(sc_oc(sc)), 0); + + /* update message offset */ + ds->rx_msg.offset += co_data(sc_oc(sc)); + + /* consume all pending data from the channel */ + co_skip(sc_oc(sc), co_data(sc_oc(sc))); + + /* we need to wait for more data */ + break; + } + + /* enough data is available into the channel to read the message until the end */ + + /* read from the channel until the end of the message */ + co_getblk(sc_oc(sc), ds->rx_msg.area + ds->rx_msg.offset, ds->rx_msg.len - ds->rx_msg.offset, 0); + + /* consume all data until the end of the message from the channel */ + co_skip(sc_oc(sc), ds->rx_msg.len - ds->rx_msg.offset); + + /* reset reader offset to 0 for next message reand */ + ds->rx_msg.offset = 0; + + /* try remap query id to original */ + memcpy(&query_id, ds->rx_msg.area, sizeof(query_id)); + eb = eb32_lookup(&ds->query_ids, query_id); + if (!eb) { + /* query id not found means we have an unknown corresponding + * request, perhaps server's bug or or the query reached + * timeout + */ + ds->rx_msg.len = 0; + continue; + } + + /* re-map the original query id set by the requester */ + query = eb32_entry(eb, struct dns_query, qid); + memcpy(ds->rx_msg.area, &query->original_qid, sizeof(query->original_qid)); + + /* remove query ids mapping from pending queries list/tree */ + eb32_delete(&query->qid); + LIST_DELETE(&query->list); + pool_free(dns_query_pool, query); + ds->onfly_queries--; + + /* the dns_session is also added in queue of the + * wait_sess list where the task processing + * response will pop available responses + */ + HA_SPIN_LOCK(DNS_LOCK, &ds->dss->lock); + + BUG_ON(LIST_INLIST(&ds->waiter)); + LIST_APPEND(&ds->dss->wait_sess, &ds->waiter); + + HA_SPIN_UNLOCK(DNS_LOCK, &ds->dss->lock); + + /* awake the task processing the responses */ + task_wakeup(ds->dss->task_rsp, TASK_WOKEN_INIT); + + break; + } + + if (!LIST_INLIST(&ds->waiter)) { + /* there is no more pending data to read and the con was closed by the server side */ + if (!co_data(sc_oc(sc)) && (sc_oc(sc)->flags & CF_SHUTW)) { + goto close; + } + } + + } + + return; +close: + sc_shutw(sc); + sc_shutr(sc); + sc_ic(sc)->flags |= CF_READ_NULL; +} + +void dns_queries_flush(struct dns_session *ds) +{ + struct dns_query *query, *queryb; + + list_for_each_entry_safe(query, queryb, &ds->queries, list) { + eb32_delete(&query->qid); + LIST_DELETE(&query->list); + pool_free(dns_query_pool, query); + } +} + +void dns_session_free(struct dns_session *ds) +{ + if (ds->rx_msg.area) + pool_free(dns_msg_buf, ds->rx_msg.area); + if (ds->tx_ring_area) + pool_free(dns_msg_buf, ds->tx_ring_area); + if (ds->task_exp) + task_destroy(ds->task_exp); + + dns_queries_flush(ds); + + /* Ensure to remove this session from external lists + * Note: we are under the lock of dns_stream_server + * which own the heads of those lists. + */ + LIST_DEL_INIT(&ds->waiter); + LIST_DEL_INIT(&ds->list); + + ds->dss->cur_conns--; + /* Note: this is useless to update + * max_active_conns here because + * we decrease the value + */ + + BUG_ON(!LIST_ISEMPTY(&ds->list)); + BUG_ON(!LIST_ISEMPTY(&ds->waiter)); + BUG_ON(!LIST_ISEMPTY(&ds->queries)); + BUG_ON(!LIST_ISEMPTY(&ds->ring.waiters)); + BUG_ON(!eb_is_empty(&ds->query_ids)); + pool_free(dns_session_pool, ds); +} + +static struct appctx *dns_session_create(struct dns_session *ds); + +static int dns_session_init(struct appctx *appctx) +{ + struct dns_session *ds = appctx->svcctx; + struct stream *s; + struct sockaddr_storage *addr = NULL; + + if (!sockaddr_alloc(&addr, &ds->dss->srv->addr, sizeof(ds->dss->srv->addr))) + goto error; + + if (appctx_finalize_startup(appctx, ds->dss->srv->proxy, &BUF_NULL) == -1) + goto error; + + s = appctx_strm(appctx); + s->scb->dst = addr; + s->scb->flags |= SC_FL_NOLINGER; + s->target = &ds->dss->srv->obj_type; + s->flags = SF_ASSIGNED; + + s->do_log = NULL; + s->uniq_id = 0; + + s->res.flags |= CF_READ_DONTWAIT; + /* for rto and rex to eternity to not expire on idle recv: + * We are using a syslog server. + */ + s->res.rto = TICK_ETERNITY; + s->res.rex = TICK_ETERNITY; + + ds->appctx = appctx; + return 0; + + error: + return -1; +} + +/* + * Function to release a DNS tcp session + */ +static void dns_session_release(struct appctx *appctx) +{ + struct dns_session *ds = appctx->svcctx; + struct dns_stream_server *dss __maybe_unused; + + if (!ds) + return; + + /* We do not call ring_appctx_detach here + * because we want to keep readers counters + * to retry a conn with a different appctx. + */ + HA_RWLOCK_WRLOCK(DNS_LOCK, &ds->ring.lock); + LIST_DEL_INIT(&appctx->wait_entry); + HA_RWLOCK_WRUNLOCK(DNS_LOCK, &ds->ring.lock); + + dss = ds->dss; + + HA_SPIN_LOCK(DNS_LOCK, &dss->lock); + LIST_DEL_INIT(&ds->list); + + if (stopping) { + dns_session_free(ds); + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); + return; + } + + if (!ds->nb_queries) { + /* this is an idle session */ + /* Note: this is useless to update max_active_sess + * here because we decrease idle_conns but + * dns_session_free decrease curconns + */ + + ds->dss->idle_conns--; + dns_session_free(ds); + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); + return; + } + + if (ds->onfly_queries == ds->nb_queries) { + /* the session can be released because + * it means that all queries AND + * responses are in fly */ + dns_session_free(ds); + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); + return; + } + + /* if there is no pending complete response + * message, ensure to reset + * message offsets if the session + * was closed with an incomplete pending response + */ + if (!LIST_INLIST(&ds->waiter)) + ds->rx_msg.len = ds->rx_msg.offset = 0; + + /* we flush pending sent queries because we never + * have responses + */ + ds->nb_queries -= ds->onfly_queries; + dns_queries_flush(ds); + + /* reset offset to be sure to start from message start */ + ds->tx_msg_offset = 0; + + /* here the ofs and the attached counter + * are kept unchanged + */ + + /* Create a new appctx, We hope we can + * create from the release callback! */ + ds->appctx = dns_session_create(ds); + if (!ds->appctx) { + dns_session_free(ds); + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); + return; + } + + if (ds->nb_queries < DNS_STREAM_MAX_PIPELINED_REQ) + LIST_INSERT(&ds->dss->free_sess, &ds->list); + + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); +} + +/* DNS tcp session applet */ +static struct applet dns_session_applet = { + .obj_type = OBJ_TYPE_APPLET, + .name = "<STRMDNS>", /* used for logging */ + .fct = dns_session_io_handler, + .init = dns_session_init, + .release = dns_session_release, +}; + +/* + * Function used to create an appctx for a DNS session + * It sets its context into appctx->svcctx. + */ +static struct appctx *dns_session_create(struct dns_session *ds) +{ + struct appctx *appctx; + + appctx = appctx_new_here(&dns_session_applet, NULL); + if (!appctx) + goto out_close; + appctx->svcctx = (void *)ds; + + if (appctx_init(appctx) == -1) { + ha_alert("out of memory in dns_session_create().\n"); + goto out_free_appctx; + } + + return appctx; + + /* Error unrolling */ + out_free_appctx: + appctx_free_on_early_error(appctx); + out_close: + return NULL; +} + +/* Task processing expiration of unresponded queries, this one is supposed + * to be stuck on the same thread than the appctx handler + */ +static struct task *dns_process_query_exp(struct task *t, void *context, unsigned int state) +{ + struct dns_session *ds = (struct dns_session *)context; + struct dns_query *query, *queryb; + + t->expire = TICK_ETERNITY; + + list_for_each_entry_safe(query, queryb, &ds->queries, list) { + if (tick_is_expired(query->expire, now_ms)) { + eb32_delete(&query->qid); + LIST_DELETE(&query->list); + pool_free(dns_query_pool, query); + ds->onfly_queries--; + } + else { + t->expire = query->expire; + break; + } + } + + return t; +} + +/* Task processing expiration of idle sessions */ +static struct task *dns_process_idle_exp(struct task *t, void *context, unsigned int state) +{ + struct dns_stream_server *dss = (struct dns_stream_server *)context; + struct dns_session *ds, *dsb; + int target = 0; + int cur_active_conns; + + HA_SPIN_LOCK(DNS_LOCK, &dss->lock); + + + cur_active_conns = dss->cur_conns - dss->idle_conns; + if (cur_active_conns > dss->max_active_conns) + dss->max_active_conns = cur_active_conns; + + target = (dss->max_active_conns - cur_active_conns) / 2; + list_for_each_entry_safe(ds, dsb, &dss->idle_sess, list) { + if (!target) + break; + + /* remove conn to pending list to ensure it won't be reused */ + LIST_DEL_INIT(&ds->list); + + /* force session shutdown */ + ds->shutdown = 1; + + /* to be sure that the appctx won't miss shutdown */ + __ha_barrier_store(); + + /* wake appctx to perform the shutdown */ + appctx_wakeup(ds->appctx); + } + + /* reset max to current active conns */ + dss->max_active_conns = cur_active_conns; + + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); + + t->expire = tick_add(now_ms, 5000); + + return t; +} + +struct dns_session *dns_session_new(struct dns_stream_server *dss) +{ + struct dns_session *ds; + + if (dss->maxconn && (dss->maxconn <= dss->cur_conns)) + return NULL; + + ds = pool_zalloc(dns_session_pool); + if (!ds) + return NULL; + + ds->ofs = ~0; + ds->dss = dss; + LIST_INIT(&ds->list); + LIST_INIT(&ds->queries); + LIST_INIT(&ds->waiter); + ds->rx_msg.offset = ds->rx_msg.len = 0; + ds->rx_msg.area = NULL; + ds->tx_ring_area = NULL; + ds->task_exp = NULL; + ds->appctx = NULL; + ds->shutdown = 0; + ds->nb_queries = 0; + ds->query_ids = EB_ROOT_UNIQUE; + ds->rx_msg.area = pool_alloc(dns_msg_buf); + if (!ds->rx_msg.area) + goto error; + + ds->tx_ring_area = pool_alloc(dns_msg_buf); + if (!ds->tx_ring_area) + goto error; + + ring_init(&ds->ring, ds->tx_ring_area, DNS_TCP_MSG_RING_MAX_SIZE); + /* never fail because it is the first watcher attached to the ring */ + DISGUISE(ring_attach(&ds->ring)); + + if ((ds->task_exp = task_new_here()) == NULL) + goto error; + + ds->task_exp->process = dns_process_query_exp; + ds->task_exp->context = ds; + + ds->appctx = dns_session_create(ds); + if (!ds->appctx) + goto error; + + dss->cur_conns++; + + return ds; + +error: + if (ds->task_exp) + task_destroy(ds->task_exp); + if (ds->rx_msg.area) + pool_free(dns_msg_buf, ds->rx_msg.area); + if (ds->tx_ring_area) + pool_free(dns_msg_buf, ds->tx_ring_area); + + pool_free(dns_session_pool, ds); + + return NULL; +} + +/* + * Task used to consume pending messages from nameserver ring + * and forward them to dns_session ring. + * Note: If no slot found a new dns_session is allocated + */ +static struct task *dns_process_req(struct task *t, void *context, unsigned int state) +{ + struct dns_nameserver *ns = (struct dns_nameserver *)context; + struct dns_stream_server *dss = ns->stream; + struct ring *ring = dss->ring_req; + struct buffer *buf = &ring->buf; + uint64_t msg_len; + size_t len, cnt, ofs; + struct dns_session *ds, *ads; + HA_SPIN_LOCK(DNS_LOCK, &dss->lock); + + ofs = dss->ofs_req; + + HA_RWLOCK_RDLOCK(DNS_LOCK, &ring->lock); + + /* explanation for the initialization below: it would be better to do + * this in the parsing function but this would occasionally result in + * dropped events because we'd take a reference on the oldest message + * and keep it while being scheduled. Thus instead let's take it the + * first time we enter here so that we have a chance to pass many + * existing messages before grabbing a reference to a location. This + * value cannot be produced after initialization. + */ + if (unlikely(ofs == ~0)) { + ofs = 0; + HA_ATOMIC_INC(b_peek(buf, ofs)); + ofs += ring->ofs; + } + + /* we were already there, adjust the offset to be relative to + * the buffer's head and remove us from the counter. + */ + ofs -= ring->ofs; + BUG_ON(ofs >= buf->size); + HA_ATOMIC_DEC(b_peek(buf, ofs)); + + while (ofs + 1 < b_data(buf)) { + struct ist myist; + + cnt = 1; + len = b_peek_varint(buf, ofs + cnt, &msg_len); + if (!len) + break; + cnt += len; + BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf)); + if (unlikely(msg_len > DNS_TCP_MSG_MAX_SIZE)) { + /* too large a message to ever fit, let's skip it */ + ofs += cnt + msg_len; + continue; + } + + len = b_getblk(buf, dns_msg_trash, msg_len, ofs + cnt); + + myist = ist2(dns_msg_trash, len); + + ads = NULL; + /* try to push request into active sess with free slot */ + if (!LIST_ISEMPTY(&dss->free_sess)) { + ds = LIST_NEXT(&dss->free_sess, struct dns_session *, list); + + if (ring_write(&ds->ring, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1) > 0) { + ds->nb_queries++; + if (ds->nb_queries >= DNS_STREAM_MAX_PIPELINED_REQ) + LIST_DEL_INIT(&ds->list); + ads = ds; + } + else { + /* it means we were unable to put a request in this slot, + * it may be close to be full so we put it at the end + * of free conn list */ + LIST_DEL_INIT(&ds->list); + LIST_APPEND(&dss->free_sess, &ds->list); + } + } + + if (!ads) { + /* try to push request into idle, this one should have enough free space */ + if (!LIST_ISEMPTY(&dss->idle_sess)) { + ds = LIST_NEXT(&dss->idle_sess, struct dns_session *, list); + + /* ring is empty so this ring_write should never fail */ + ring_write(&ds->ring, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1); + ds->nb_queries++; + LIST_DEL_INIT(&ds->list); + + ds->dss->idle_conns--; + + /* we may have to update the max_active_conns */ + if (ds->dss->max_active_conns < ds->dss->cur_conns - ds->dss->idle_conns) + ds->dss->max_active_conns = ds->dss->cur_conns - ds->dss->idle_conns; + + /* since we may unable to find a free list to handle + * this request, this request may be large and fill + * the ring buffer so we prefer to put at the end of free + * list. */ + LIST_APPEND(&dss->free_sess, &ds->list); + ads = ds; + } + } + + /* we didn't find a session available with large enough room */ + if (!ads) { + /* allocate a new session */ + ads = dns_session_new(dss); + if (ads) { + /* ring is empty so this ring_write should never fail */ + ring_write(&ads->ring, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1); + ads->nb_queries++; + LIST_INSERT(&dss->free_sess, &ads->list); + } + else + ns->counters->snd_error++; + } + + if (ads) + ns->counters->sent++; + + ofs += cnt + len; + } + + HA_ATOMIC_INC(b_peek(buf, ofs)); + ofs += ring->ofs; + dss->ofs_req = ofs; + HA_RWLOCK_RDUNLOCK(DNS_LOCK, &ring->lock); + + + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); + return t; +} + +/* + * Task used to consume response + * Note: upper layer callback is called + */ +static struct task *dns_process_rsp(struct task *t, void *context, unsigned int state) +{ + struct dns_nameserver *ns = (struct dns_nameserver *)context; + + ns->process_responses(ns); + + return t; +} + +/* Function used to initialize an TCP nameserver */ +int dns_stream_init(struct dns_nameserver *ns, struct server *srv) +{ + struct dns_stream_server *dss = NULL; + + dss = calloc(1, sizeof(*dss)); + if (!dss) { + ha_alert("memory allocation error initializing dns tcp server '%s'.\n", srv->id); + goto out; + } + + dss->srv = srv; + dss->maxconn = srv->maxconn; + + dss->ofs_req = ~0; /* init ring offset */ + dss->ring_req = ring_new(2*DNS_TCP_MSG_RING_MAX_SIZE); + if (!dss->ring_req) { + ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id); + goto out; + } + /* Create the task associated to the resolver target handling conns */ + if ((dss->task_req = task_new_anywhere()) == NULL) { + ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id); + goto out; + } + + /* Update task's parameters */ + dss->task_req->process = dns_process_req; + dss->task_req->context = ns; + + /* attach the task as reader */ + if (!ring_attach(dss->ring_req)) { + /* mark server attached to the ring */ + ha_alert("server '%s': too many watchers for ring. this should never happen.\n", srv->id); + goto out; + } + + /* Create the task associated to the resolver target handling conns */ + if ((dss->task_rsp = task_new_anywhere()) == NULL) { + ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id); + goto out; + } + + /* Update task's parameters */ + dss->task_rsp->process = dns_process_rsp; + dss->task_rsp->context = ns; + + /* Create the task associated to the resolver target handling conns */ + if ((dss->task_idle = task_new_anywhere()) == NULL) { + ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id); + goto out; + } + + /* Update task's parameters */ + dss->task_idle->process = dns_process_idle_exp; + dss->task_idle->context = dss; + dss->task_idle->expire = tick_add(now_ms, 5000); + + /* let start the task to free idle conns immediately */ + task_queue(dss->task_idle); + + LIST_INIT(&dss->free_sess); + LIST_INIT(&dss->idle_sess); + LIST_INIT(&dss->wait_sess); + HA_SPIN_INIT(&dss->lock); + ns->stream = dss; + return 0; +out: + if (dss && dss->task_rsp) + task_destroy(dss->task_rsp); + if (dss && dss->task_req) + task_destroy(dss->task_req); + if (dss && dss->ring_req) + ring_free(dss->ring_req); + + free(dss); + return -1; +} + +int init_dns_buffers() +{ + dns_msg_trash = malloc(DNS_TCP_MSG_MAX_SIZE); + if (!dns_msg_trash) + return 0; + + return 1; +} + +void deinit_dns_buffers() +{ + ha_free(&dns_msg_trash); +} + +REGISTER_PER_THREAD_ALLOC(init_dns_buffers); +REGISTER_PER_THREAD_FREE(deinit_dns_buffers); |