diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 15:59:48 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 15:59:48 +0000 |
commit | 3b9b6d0b8e7f798023c9d109c490449d528fde80 (patch) | |
tree | 2e1c188dd7b8d7475cd163de9ae02c428343669b /lib/isc/netmgr | |
parent | Initial commit. (diff) | |
download | bind9-3b9b6d0b8e7f798023c9d109c490449d528fde80.tar.xz bind9-3b9b6d0b8e7f798023c9d109c490449d528fde80.zip |
Adding upstream version 1:9.18.19.upstream/1%9.18.19upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'lib/isc/netmgr')
-rw-r--r-- | lib/isc/netmgr/http.c | 3755 | ||||
-rw-r--r-- | lib/isc/netmgr/netmgr-int.h | 2273 | ||||
-rw-r--r-- | lib/isc/netmgr/netmgr.c | 3991 | ||||
-rw-r--r-- | lib/isc/netmgr/tcp.c | 1456 | ||||
-rw-r--r-- | lib/isc/netmgr/tcpdns.c | 1500 | ||||
-rw-r--r-- | lib/isc/netmgr/timer.c | 120 | ||||
-rw-r--r-- | lib/isc/netmgr/tlsdns.c | 2363 | ||||
-rw-r--r-- | lib/isc/netmgr/tlsstream.c | 1348 | ||||
-rw-r--r-- | lib/isc/netmgr/udp.c | 1405 | ||||
-rw-r--r-- | lib/isc/netmgr/uv-compat.c | 140 | ||||
-rw-r--r-- | lib/isc/netmgr/uv-compat.h | 126 | ||||
-rw-r--r-- | lib/isc/netmgr/uverr2result.c | 105 |
12 files changed, 18582 insertions, 0 deletions
diff --git a/lib/isc/netmgr/http.c b/lib/isc/netmgr/http.c new file mode 100644 index 0000000..f2d3e2d --- /dev/null +++ b/lib/isc/netmgr/http.c @@ -0,0 +1,3755 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#include <ctype.h> +#include <inttypes.h> +#include <limits.h> +#include <nghttp2/nghttp2.h> +#include <signal.h> +#include <string.h> + +#include <isc/base64.h> +#include <isc/log.h> +#include <isc/netmgr.h> +#include <isc/print.h> +#include <isc/sockaddr.h> +#include <isc/tls.h> +#include <isc/url.h> +#include <isc/util.h> + +#include "netmgr-int.h" + +#define AUTHEXTRA 7 + +#define MAX_DNS_MESSAGE_SIZE (UINT16_MAX) + +#define DNS_MEDIA_TYPE "application/dns-message" + +/* + * See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control + * for additional details. Basically it means "avoid caching by any + * means." + */ +#define DEFAULT_CACHE_CONTROL "no-cache, no-store, must-revalidate" + +/* + * If server during request processing surpasses any of the limits + * below, it will just reset the stream without returning any error + * codes in a response. Ideally, these parameters should be + * configurable both globally and per every HTTP endpoint description + * in the configuration file, but for now it should be enough. + */ + +/* + * 128K should be enough to encode 64K of data into base64url inside GET + * request and have extra space for other headers + */ +#define MAX_ALLOWED_DATA_IN_HEADERS (MAX_DNS_MESSAGE_SIZE * 2) + +#define MAX_ALLOWED_DATA_IN_POST \ + (MAX_DNS_MESSAGE_SIZE + MAX_DNS_MESSAGE_SIZE / 2) + +#define HEADER_MATCH(header, name, namelen) \ + (((namelen) == sizeof(header) - 1) && \ + (strncasecmp((header), (const char *)(name), (namelen)) == 0)) + +#define MIN_SUCCESSFUL_HTTP_STATUS (200) +#define MAX_SUCCESSFUL_HTTP_STATUS (299) + +/* This definition sets the upper limit of pending write buffer to an + * adequate enough value. That is done mostly to fight a limitation + * for a max TLS record size in flamethrower (2K). In a perfect world + * this constant should not be required, if we ever move closer to + * that state, the constant, and corresponding code, should be + * removed. For now the limit seems adequate enough to fight + * "tinygrams" problem. */ +#define FLUSH_HTTP_WRITE_BUFFER_AFTER (1536) + +/* This switch is here mostly to test the code interoperability with + * buggy implementations */ +#define ENABLE_HTTP_WRITE_BUFFERING 1 + +#define SUCCESSFUL_HTTP_STATUS(code) \ + ((code) >= MIN_SUCCESSFUL_HTTP_STATUS && \ + (code) <= MAX_SUCCESSFUL_HTTP_STATUS) + +#define INITIAL_DNS_MESSAGE_BUFFER_SIZE (512) + +typedef struct isc_nm_http_response_status { + size_t code; + size_t content_length; + bool content_type_valid; +} isc_nm_http_response_status_t; + +typedef struct http_cstream { + isc_nm_recv_cb_t read_cb; + void *read_cbarg; + isc_nm_cb_t connect_cb; + void *connect_cbarg; + + bool sending; + bool reading; + + char *uri; + isc_url_parser_t up; + + char *authority; + size_t authoritylen; + char *path; + + isc_buffer_t *rbuf; + + size_t pathlen; + int32_t stream_id; + + bool post; /* POST or GET */ + isc_buffer_t *postdata; + char *GET_path; + size_t GET_path_len; + + isc_nm_http_response_status_t response_status; + isc_nmsocket_t *httpsock; + LINK(struct http_cstream) link; +} http_cstream_t; + +#define HTTP2_SESSION_MAGIC ISC_MAGIC('H', '2', 'S', 'S') +#define VALID_HTTP2_SESSION(t) ISC_MAGIC_VALID(t, HTTP2_SESSION_MAGIC) + +typedef ISC_LIST(isc__nm_uvreq_t) isc__nm_http_pending_callbacks_t; + +struct isc_nm_http_session { + unsigned int magic; + isc_refcount_t references; + isc_mem_t *mctx; + + size_t sending; + bool reading; + bool closed; + bool closing; + + nghttp2_session *ngsession; + bool client; + + ISC_LIST(http_cstream_t) cstreams; + ISC_LIST(isc_nmsocket_h2_t) sstreams; + size_t nsstreams; + + isc_nmhandle_t *handle; + isc_nmhandle_t *client_httphandle; + isc_nmsocket_t *serversocket; + + isc_buffer_t *buf; + + isc_tlsctx_t *tlsctx; + uint32_t max_concurrent_streams; + + isc__nm_http_pending_callbacks_t pending_write_callbacks; + isc_buffer_t *pending_write_data; +}; + +typedef enum isc_http_error_responses { + ISC_HTTP_ERROR_SUCCESS, /* 200 */ + ISC_HTTP_ERROR_NOT_FOUND, /* 404 */ + ISC_HTTP_ERROR_PAYLOAD_TOO_LARGE, /* 413 */ + ISC_HTTP_ERROR_URI_TOO_LONG, /* 414 */ + ISC_HTTP_ERROR_UNSUPPORTED_MEDIA_TYPE, /* 415 */ + ISC_HTTP_ERROR_BAD_REQUEST, /* 400 */ + ISC_HTTP_ERROR_NOT_IMPLEMENTED, /* 501 */ + ISC_HTTP_ERROR_GENERIC, /* 500 Internal Server Error */ + ISC_HTTP_ERROR_MAX +} isc_http_error_responses_t; + +typedef struct isc_http_send_req { + isc_nm_http_session_t *session; + isc_nmhandle_t *transphandle; + isc_nmhandle_t *httphandle; + isc_nm_cb_t cb; + void *cbarg; + isc_buffer_t *pending_write_data; + isc__nm_http_pending_callbacks_t pending_write_callbacks; +} isc_http_send_req_t; + +#define HTTP_ENDPOINTS_MAGIC ISC_MAGIC('H', 'T', 'E', 'P') +#define VALID_HTTP_ENDPOINTS(t) ISC_MAGIC_VALID(t, HTTP_ENDPOINTS_MAGIC) + +static bool +http_send_outgoing(isc_nm_http_session_t *session, isc_nmhandle_t *httphandle, + isc_nm_cb_t cb, void *cbarg); + +static void +http_do_bio(isc_nm_http_session_t *session, isc_nmhandle_t *send_httphandle, + isc_nm_cb_t send_cb, void *send_cbarg); + +static void +failed_httpstream_read_cb(isc_nmsocket_t *sock, isc_result_t result, + isc_nm_http_session_t *session); + +static void +client_call_failed_read_cb(isc_result_t result, isc_nm_http_session_t *session); + +static void +server_call_failed_read_cb(isc_result_t result, isc_nm_http_session_t *session); + +static void +failed_read_cb(isc_result_t result, isc_nm_http_session_t *session); + +static isc_result_t +server_send_error_response(const isc_http_error_responses_t error, + nghttp2_session *ngsession, isc_nmsocket_t *socket); + +static isc_result_t +client_send(isc_nmhandle_t *handle, const isc_region_t *region); + +static void +finish_http_session(isc_nm_http_session_t *session); + +static void +http_transpost_tcp_nodelay(isc_nmhandle_t *transphandle); + +static void +call_pending_callbacks(isc__nm_http_pending_callbacks_t pending_callbacks, + isc_result_t result); + +static void +server_call_cb(isc_nmsocket_t *socket, isc_nm_http_session_t *session, + const isc_result_t result, isc_region_t *data); + +static isc_nm_httphandler_t * +http_endpoints_find(const char *request_path, + const isc_nm_http_endpoints_t *restrict eps); + +static void +http_init_listener_endpoints(isc_nmsocket_t *listener, + isc_nm_http_endpoints_t *epset); + +static void +http_cleanup_listener_endpoints(isc_nmsocket_t *listener); + +static isc_nm_http_endpoints_t * +http_get_listener_endpoints(isc_nmsocket_t *listener, const int tid); + +static bool +http_session_active(isc_nm_http_session_t *session) { + REQUIRE(VALID_HTTP2_SESSION(session)); + return (!session->closed && !session->closing); +} + +static void * +http_malloc(size_t sz, isc_mem_t *mctx) { + return (isc_mem_allocate(mctx, sz)); +} + +static void * +http_calloc(size_t n, size_t sz, isc_mem_t *mctx) { + const size_t msize = n * sz; + void *data = isc_mem_allocate(mctx, msize); + + memset(data, 0, msize); + return (data); +} + +static void * +http_realloc(void *p, size_t newsz, isc_mem_t *mctx) { + return (isc_mem_reallocate(mctx, p, newsz)); +} + +static void +http_free(void *p, isc_mem_t *mctx) { + if (p == NULL) { /* as standard free() behaves */ + return; + } + isc_mem_free(mctx, p); +} + +static void +init_nghttp2_mem(isc_mem_t *mctx, nghttp2_mem *mem) { + *mem = (nghttp2_mem){ .malloc = (nghttp2_malloc)http_malloc, + .calloc = (nghttp2_calloc)http_calloc, + .realloc = (nghttp2_realloc)http_realloc, + .free = (nghttp2_free)http_free, + .mem_user_data = mctx }; +} + +static void +new_session(isc_mem_t *mctx, isc_tlsctx_t *tctx, + isc_nm_http_session_t **sessionp) { + isc_nm_http_session_t *session = NULL; + + REQUIRE(sessionp != NULL && *sessionp == NULL); + REQUIRE(mctx != NULL); + + session = isc_mem_get(mctx, sizeof(isc_nm_http_session_t)); + *session = (isc_nm_http_session_t){ .magic = HTTP2_SESSION_MAGIC, + .tlsctx = tctx }; + isc_refcount_init(&session->references, 1); + isc_mem_attach(mctx, &session->mctx); + ISC_LIST_INIT(session->cstreams); + ISC_LIST_INIT(session->sstreams); + ISC_LIST_INIT(session->pending_write_callbacks); + + *sessionp = session; +} + +void +isc__nm_httpsession_attach(isc_nm_http_session_t *source, + isc_nm_http_session_t **targetp) { + REQUIRE(VALID_HTTP2_SESSION(source)); + REQUIRE(targetp != NULL && *targetp == NULL); + + isc_refcount_increment(&source->references); + + *targetp = source; +} + +void +isc__nm_httpsession_detach(isc_nm_http_session_t **sessionp) { + isc_nm_http_session_t *session = NULL; + + REQUIRE(sessionp != NULL); + + session = *sessionp; + *sessionp = NULL; + + REQUIRE(VALID_HTTP2_SESSION(session)); + + if (isc_refcount_decrement(&session->references) > 1) { + return; + } + + finish_http_session(session); + + INSIST(ISC_LIST_EMPTY(session->sstreams)); + INSIST(ISC_LIST_EMPTY(session->cstreams)); + + if (session->ngsession != NULL) { + nghttp2_session_del(session->ngsession); + session->ngsession = NULL; + } + + if (session->buf != NULL) { + isc_buffer_free(&session->buf); + } + + /* We need an acquire memory barrier here */ + (void)isc_refcount_current(&session->references); + + session->magic = 0; + isc_mem_putanddetach(&session->mctx, session, + sizeof(isc_nm_http_session_t)); +} + +static http_cstream_t * +find_http_cstream(int32_t stream_id, isc_nm_http_session_t *session) { + http_cstream_t *cstream = NULL; + REQUIRE(VALID_HTTP2_SESSION(session)); + + if (ISC_LIST_EMPTY(session->cstreams)) { + return (NULL); + } + + for (cstream = ISC_LIST_HEAD(session->cstreams); cstream != NULL; + cstream = ISC_LIST_NEXT(cstream, link)) + { + if (cstream->stream_id == stream_id) { + break; + } + } + + /* LRU-like behaviour */ + if (cstream && ISC_LIST_HEAD(session->cstreams) != cstream) { + ISC_LIST_UNLINK(session->cstreams, cstream, link); + ISC_LIST_PREPEND(session->cstreams, cstream, link); + } + + return (cstream); +} + +static isc_result_t +new_http_cstream(isc_nmsocket_t *sock, http_cstream_t **streamp) { + isc_mem_t *mctx = sock->mgr->mctx; + const char *uri = NULL; + bool post; + http_cstream_t *stream = NULL; + isc_result_t result; + + uri = sock->h2.session->handle->sock->h2.connect.uri; + post = sock->h2.session->handle->sock->h2.connect.post; + + stream = isc_mem_get(mctx, sizeof(http_cstream_t)); + *stream = (http_cstream_t){ .stream_id = -1, + .post = post, + .uri = isc_mem_strdup(mctx, uri) }; + ISC_LINK_INIT(stream, link); + + result = isc_url_parse(stream->uri, strlen(stream->uri), 0, + &stream->up); + if (result != ISC_R_SUCCESS) { + isc_mem_free(mctx, stream->uri); + isc_mem_put(mctx, stream, sizeof(http_cstream_t)); + return (result); + } + + isc__nmsocket_attach(sock, &stream->httpsock); + stream->authoritylen = stream->up.field_data[ISC_UF_HOST].len; + stream->authority = isc_mem_get(mctx, stream->authoritylen + AUTHEXTRA); + memmove(stream->authority, &uri[stream->up.field_data[ISC_UF_HOST].off], + stream->up.field_data[ISC_UF_HOST].len); + + if (stream->up.field_set & (1 << ISC_UF_PORT)) { + stream->authoritylen += (size_t)snprintf( + stream->authority + + stream->up.field_data[ISC_UF_HOST].len, + AUTHEXTRA, ":%u", stream->up.port); + } + + /* If we don't have path in URI, we use "/" as path. */ + stream->pathlen = 1; + if (stream->up.field_set & (1 << ISC_UF_PATH)) { + stream->pathlen = stream->up.field_data[ISC_UF_PATH].len; + } + if (stream->up.field_set & (1 << ISC_UF_QUERY)) { + /* +1 for '?' character */ + stream->pathlen += + (size_t)(stream->up.field_data[ISC_UF_QUERY].len + 1); + } + + stream->path = isc_mem_get(mctx, stream->pathlen); + if (stream->up.field_set & (1 << ISC_UF_PATH)) { + memmove(stream->path, + &uri[stream->up.field_data[ISC_UF_PATH].off], + stream->up.field_data[ISC_UF_PATH].len); + } else { + stream->path[0] = '/'; + } + + if (stream->up.field_set & (1 << ISC_UF_QUERY)) { + stream->path[stream->pathlen - + stream->up.field_data[ISC_UF_QUERY].len - 1] = '?'; + memmove(stream->path + stream->pathlen - + stream->up.field_data[ISC_UF_QUERY].len, + &uri[stream->up.field_data[ISC_UF_QUERY].off], + stream->up.field_data[ISC_UF_QUERY].len); + } + + isc_buffer_allocate(mctx, &stream->rbuf, + INITIAL_DNS_MESSAGE_BUFFER_SIZE); + isc_buffer_setautorealloc(stream->rbuf, true); + + ISC_LIST_PREPEND(sock->h2.session->cstreams, stream, link); + *streamp = stream; + + return (ISC_R_SUCCESS); +} + +static void +put_http_cstream(isc_mem_t *mctx, http_cstream_t *stream) { + isc_mem_put(mctx, stream->path, stream->pathlen); + isc_mem_put(mctx, stream->authority, + stream->up.field_data[ISC_UF_HOST].len + AUTHEXTRA); + isc_mem_free(mctx, stream->uri); + if (stream->GET_path != NULL) { + isc_mem_free(mctx, stream->GET_path); + stream->GET_path = NULL; + stream->GET_path_len = 0; + } + + if (stream->postdata != NULL) { + INSIST(stream->post); + isc_buffer_free(&stream->postdata); + } + + if (stream == stream->httpsock->h2.connect.cstream) { + stream->httpsock->h2.connect.cstream = NULL; + } + if (ISC_LINK_LINKED(stream, link)) { + ISC_LIST_UNLINK(stream->httpsock->h2.session->cstreams, stream, + link); + } + isc__nmsocket_detach(&stream->httpsock); + + isc_buffer_free(&stream->rbuf); + isc_mem_put(mctx, stream, sizeof(http_cstream_t)); +} + +static void +finish_http_session(isc_nm_http_session_t *session) { + if (session->closed) { + return; + } + + if (session->handle != NULL) { + if (!session->closed) { + session->closed = true; + isc_nm_cancelread(session->handle); + } + + if (session->client) { + client_call_failed_read_cb(ISC_R_UNEXPECTED, session); + } else { + server_call_failed_read_cb(ISC_R_UNEXPECTED, session); + } + + call_pending_callbacks(session->pending_write_callbacks, + ISC_R_UNEXPECTED); + ISC_LIST_INIT(session->pending_write_callbacks); + + if (session->pending_write_data != NULL) { + isc_buffer_free(&session->pending_write_data); + } + + isc_nmhandle_detach(&session->handle); + } + + if (session->client_httphandle != NULL) { + isc_nmhandle_detach(&session->client_httphandle); + } + + INSIST(ISC_LIST_EMPTY(session->cstreams)); + + /* detach from server socket */ + if (session->serversocket != NULL) { + isc__nmsocket_detach(&session->serversocket); + } + session->closed = true; +} + +static int +on_client_data_chunk_recv_callback(int32_t stream_id, const uint8_t *data, + size_t len, isc_nm_http_session_t *session) { + http_cstream_t *cstream = find_http_cstream(stream_id, session); + + if (cstream != NULL) { + size_t new_rbufsize = len; + INSIST(cstream->rbuf != NULL); + new_rbufsize += isc_buffer_usedlength(cstream->rbuf); + if (new_rbufsize <= MAX_DNS_MESSAGE_SIZE && + new_rbufsize <= cstream->response_status.content_length) + { + isc_buffer_putmem(cstream->rbuf, data, len); + } else { + return (NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE); + } + } else { + return (NGHTTP2_ERR_CALLBACK_FAILURE); + } + + return (0); +} + +static int +on_server_data_chunk_recv_callback(int32_t stream_id, const uint8_t *data, + size_t len, isc_nm_http_session_t *session) { + isc_nmsocket_h2_t *h2 = ISC_LIST_HEAD(session->sstreams); + while (h2 != NULL) { + if (stream_id == h2->stream_id) { + if (isc_buffer_base(&h2->rbuf) == NULL) { + isc_buffer_init( + &h2->rbuf, + isc_mem_allocate(session->mctx, + h2->content_length), + MAX_DNS_MESSAGE_SIZE); + } + size_t new_bufsize = isc_buffer_usedlength(&h2->rbuf) + + len; + if (new_bufsize <= MAX_DNS_MESSAGE_SIZE && + new_bufsize <= h2->content_length) + { + isc_buffer_putmem(&h2->rbuf, data, len); + break; + } + + return (NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE); + } + h2 = ISC_LIST_NEXT(h2, link); + } + if (h2 == NULL) { + return (NGHTTP2_ERR_CALLBACK_FAILURE); + } + + return (0); +} + +static int +on_data_chunk_recv_callback(nghttp2_session *ngsession, uint8_t flags, + int32_t stream_id, const uint8_t *data, size_t len, + void *user_data) { + isc_nm_http_session_t *session = (isc_nm_http_session_t *)user_data; + int rv; + + UNUSED(ngsession); + UNUSED(flags); + + if (session->client) { + rv = on_client_data_chunk_recv_callback(stream_id, data, len, + session); + } else { + rv = on_server_data_chunk_recv_callback(stream_id, data, len, + session); + } + + return (rv); +} + +static void +call_unlink_cstream_readcb(http_cstream_t *cstream, + isc_nm_http_session_t *session, + isc_result_t result) { + isc_region_t read_data; + REQUIRE(VALID_HTTP2_SESSION(session)); + REQUIRE(cstream != NULL); + ISC_LIST_UNLINK(session->cstreams, cstream, link); + INSIST(VALID_NMHANDLE(session->client_httphandle)); + isc_buffer_usedregion(cstream->rbuf, &read_data); + cstream->read_cb(session->client_httphandle, result, &read_data, + cstream->read_cbarg); + put_http_cstream(session->mctx, cstream); +} + +static int +on_client_stream_close_callback(int32_t stream_id, + isc_nm_http_session_t *session) { + http_cstream_t *cstream = find_http_cstream(stream_id, session); + + if (cstream != NULL) { + isc_result_t result = + SUCCESSFUL_HTTP_STATUS(cstream->response_status.code) + ? ISC_R_SUCCESS + : ISC_R_FAILURE; + call_unlink_cstream_readcb(cstream, session, result); + if (ISC_LIST_EMPTY(session->cstreams)) { + int rv = 0; + rv = nghttp2_session_terminate_session( + session->ngsession, NGHTTP2_NO_ERROR); + if (rv != 0) { + return (rv); + } + /* Mark the session as closing one to finish it on a + * subsequent call to http_do_bio() */ + session->closing = true; + } + } else { + return (NGHTTP2_ERR_CALLBACK_FAILURE); + } + + return (0); +} + +static int +on_server_stream_close_callback(int32_t stream_id, + isc_nm_http_session_t *session) { + isc_nmsocket_t *sock = nghttp2_session_get_stream_user_data( + session->ngsession, stream_id); + int rv = 0; + + ISC_LIST_UNLINK(session->sstreams, &sock->h2, link); + session->nsstreams--; + + /* + * By making a call to isc__nmsocket_prep_destroy(), we ensure that + * the socket gets marked as inactive, allowing the HTTP/2 data + * associated with it to be properly disposed of eventually. + * + * An HTTP/2 stream socket will normally be marked as inactive in + * the normal course of operation. However, when browsers terminate + * HTTP/2 streams prematurely (e.g. by sending RST_STREAM), + * corresponding sockets can remain marked as active, retaining + * references to the HTTP/2 data (most notably the session objects), + * preventing them from being correctly freed and leading to BIND + * hanging on shutdown. Calling isc__nmsocket_prep_destroy() + * ensures that this will not happen. + */ + isc__nmsocket_prep_destroy(sock); + isc__nmsocket_detach(&sock); + return (rv); +} + +static int +on_stream_close_callback(nghttp2_session *ngsession, int32_t stream_id, + uint32_t error_code, void *user_data) { + isc_nm_http_session_t *session = (isc_nm_http_session_t *)user_data; + int rv = 0; + + REQUIRE(VALID_HTTP2_SESSION(session)); + REQUIRE(session->ngsession == ngsession); + + UNUSED(error_code); + + if (session->client) { + rv = on_client_stream_close_callback(stream_id, session); + } else { + rv = on_server_stream_close_callback(stream_id, session); + } + + return (rv); +} + +static bool +client_handle_status_header(http_cstream_t *cstream, const uint8_t *value, + const size_t valuelen) { + char tmp[32] = { 0 }; + const size_t tmplen = sizeof(tmp) - 1; + + strncpy(tmp, (const char *)value, ISC_MIN(tmplen, valuelen)); + cstream->response_status.code = strtoul(tmp, NULL, 10); + + if (SUCCESSFUL_HTTP_STATUS(cstream->response_status.code)) { + return (true); + } + + return (false); +} + +static bool +client_handle_content_length_header(http_cstream_t *cstream, + const uint8_t *value, + const size_t valuelen) { + char tmp[32] = { 0 }; + const size_t tmplen = sizeof(tmp) - 1; + + strncpy(tmp, (const char *)value, ISC_MIN(tmplen, valuelen)); + cstream->response_status.content_length = strtoul(tmp, NULL, 10); + + if (cstream->response_status.content_length == 0 || + cstream->response_status.content_length > MAX_DNS_MESSAGE_SIZE) + { + return (false); + } + + return (true); +} + +static bool +client_handle_content_type_header(http_cstream_t *cstream, const uint8_t *value, + const size_t valuelen) { + const char type_dns_message[] = DNS_MEDIA_TYPE; + const size_t len = sizeof(type_dns_message) - 1; + + UNUSED(valuelen); + + if (strncasecmp((const char *)value, type_dns_message, len) == 0) { + cstream->response_status.content_type_valid = true; + return (true); + } + + return (false); +} + +static int +client_on_header_callback(nghttp2_session *ngsession, + const nghttp2_frame *frame, const uint8_t *name, + size_t namelen, const uint8_t *value, size_t valuelen, + uint8_t flags, void *user_data) { + isc_nm_http_session_t *session = (isc_nm_http_session_t *)user_data; + http_cstream_t *cstream = NULL; + const char status[] = ":status"; + const char content_length[] = "Content-Length"; + const char content_type[] = "Content-Type"; + bool header_ok = true; + + REQUIRE(VALID_HTTP2_SESSION(session)); + REQUIRE(session->client); + + UNUSED(flags); + UNUSED(ngsession); + + cstream = find_http_cstream(frame->hd.stream_id, session); + if (cstream == NULL) { + /* + * This could happen in two cases: + * - the server sent us bad data, or + * - we closed the session prematurely before receiving all + * responses (i.e., because of a belated or partial response). + */ + return (NGHTTP2_ERR_CALLBACK_FAILURE); + } + + INSIST(!ISC_LIST_EMPTY(session->cstreams)); + + switch (frame->hd.type) { + case NGHTTP2_HEADERS: + if (frame->headers.cat != NGHTTP2_HCAT_RESPONSE) { + break; + } + + if (HEADER_MATCH(status, name, namelen)) { + header_ok = client_handle_status_header(cstream, value, + valuelen); + } else if (HEADER_MATCH(content_length, name, namelen)) { + header_ok = client_handle_content_length_header( + cstream, value, valuelen); + } else if (HEADER_MATCH(content_type, name, namelen)) { + header_ok = client_handle_content_type_header( + cstream, value, valuelen); + } + break; + } + + if (!header_ok) { + return (NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE); + } + + return (0); +} + +static void +initialize_nghttp2_client_session(isc_nm_http_session_t *session) { + nghttp2_session_callbacks *callbacks = NULL; + nghttp2_option *option = NULL; + nghttp2_mem mem; + + init_nghttp2_mem(session->mctx, &mem); + RUNTIME_CHECK(nghttp2_session_callbacks_new(&callbacks) == 0); + RUNTIME_CHECK(nghttp2_option_new(&option) == 0); + +#if NGHTTP2_VERSION_NUM >= (0x010c00) + nghttp2_option_set_max_send_header_block_length( + option, MAX_ALLOWED_DATA_IN_HEADERS); +#endif + + nghttp2_session_callbacks_set_on_data_chunk_recv_callback( + callbacks, on_data_chunk_recv_callback); + + nghttp2_session_callbacks_set_on_stream_close_callback( + callbacks, on_stream_close_callback); + + nghttp2_session_callbacks_set_on_header_callback( + callbacks, client_on_header_callback); + + RUNTIME_CHECK(nghttp2_session_client_new3(&session->ngsession, + callbacks, session, option, + &mem) == 0); + + nghttp2_option_del(option); + nghttp2_session_callbacks_del(callbacks); +} + +static bool +send_client_connection_header(isc_nm_http_session_t *session) { + nghttp2_settings_entry iv[] = { { NGHTTP2_SETTINGS_ENABLE_PUSH, 0 } }; + int rv; + + rv = nghttp2_submit_settings(session->ngsession, NGHTTP2_FLAG_NONE, iv, + sizeof(iv) / sizeof(iv[0])); + if (rv != 0) { + return (false); + } + + return (true); +} + +#define MAKE_NV(NAME, VALUE, VALUELEN) \ + { \ + (uint8_t *)(uintptr_t)(NAME), (uint8_t *)(uintptr_t)(VALUE), \ + sizeof(NAME) - 1, VALUELEN, NGHTTP2_NV_FLAG_NONE \ + } + +#define MAKE_NV2(NAME, VALUE) \ + { \ + (uint8_t *)(uintptr_t)(NAME), (uint8_t *)(uintptr_t)(VALUE), \ + sizeof(NAME) - 1, sizeof(VALUE) - 1, \ + NGHTTP2_NV_FLAG_NONE \ + } + +static ssize_t +client_read_callback(nghttp2_session *ngsession, int32_t stream_id, + uint8_t *buf, size_t length, uint32_t *data_flags, + nghttp2_data_source *source, void *user_data) { + isc_nm_http_session_t *session = (isc_nm_http_session_t *)user_data; + http_cstream_t *cstream = NULL; + + REQUIRE(session->client); + REQUIRE(!ISC_LIST_EMPTY(session->cstreams)); + + UNUSED(ngsession); + UNUSED(source); + + cstream = find_http_cstream(stream_id, session); + if (!cstream || cstream->stream_id != stream_id) { + /* We haven't found the stream, so we are not reading */ + return (NGHTTP2_ERR_CALLBACK_FAILURE); + } + + if (cstream->post) { + size_t len = isc_buffer_remaininglength(cstream->postdata); + + if (len > length) { + len = length; + } + + if (len > 0) { + memmove(buf, isc_buffer_current(cstream->postdata), + len); + isc_buffer_forward(cstream->postdata, len); + } + + if (isc_buffer_remaininglength(cstream->postdata) == 0) { + *data_flags |= NGHTTP2_DATA_FLAG_EOF; + } + + return (len); + } else { + *data_flags |= NGHTTP2_DATA_FLAG_EOF; + return (0); + } + + return (0); +} + +/* + * Send HTTP request to the remote peer. + */ +static isc_result_t +client_submit_request(isc_nm_http_session_t *session, http_cstream_t *stream) { + int32_t stream_id; + char *uri = stream->uri; + isc_url_parser_t *up = &stream->up; + nghttp2_data_provider dp; + + if (stream->post) { + char p[64]; + snprintf(p, sizeof(p), "%u", + isc_buffer_usedlength(stream->postdata)); + nghttp2_nv hdrs[] = { + MAKE_NV2(":method", "POST"), + MAKE_NV(":scheme", + &uri[up->field_data[ISC_UF_SCHEMA].off], + up->field_data[ISC_UF_SCHEMA].len), + MAKE_NV(":authority", stream->authority, + stream->authoritylen), + MAKE_NV(":path", stream->path, stream->pathlen), + MAKE_NV2("content-type", DNS_MEDIA_TYPE), + MAKE_NV2("accept", DNS_MEDIA_TYPE), + MAKE_NV("content-length", p, strlen(p)), + MAKE_NV2("cache-control", DEFAULT_CACHE_CONTROL) + }; + + dp = (nghttp2_data_provider){ .read_callback = + client_read_callback }; + stream_id = nghttp2_submit_request( + session->ngsession, NULL, hdrs, + sizeof(hdrs) / sizeof(hdrs[0]), &dp, stream); + } else { + INSIST(stream->GET_path != NULL); + INSIST(stream->GET_path_len != 0); + nghttp2_nv hdrs[] = { + MAKE_NV2(":method", "GET"), + MAKE_NV(":scheme", + &uri[up->field_data[ISC_UF_SCHEMA].off], + up->field_data[ISC_UF_SCHEMA].len), + MAKE_NV(":authority", stream->authority, + stream->authoritylen), + MAKE_NV(":path", stream->GET_path, + stream->GET_path_len), + MAKE_NV2("accept", DNS_MEDIA_TYPE), + MAKE_NV2("cache-control", DEFAULT_CACHE_CONTROL) + }; + + dp = (nghttp2_data_provider){ .read_callback = + client_read_callback }; + stream_id = nghttp2_submit_request( + session->ngsession, NULL, hdrs, + sizeof(hdrs) / sizeof(hdrs[0]), &dp, stream); + } + if (stream_id < 0) { + return (ISC_R_FAILURE); + } + + stream->stream_id = stream_id; + + return (ISC_R_SUCCESS); +} + +/* + * Read callback from TLS socket. + */ +static void +http_readcb(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region, + void *data) { + isc_nm_http_session_t *session = (isc_nm_http_session_t *)data; + ssize_t readlen; + + REQUIRE(VALID_HTTP2_SESSION(session)); + + UNUSED(handle); + + if (result != ISC_R_SUCCESS) { + if (result != ISC_R_TIMEDOUT) { + session->reading = false; + } + failed_read_cb(result, session); + return; + } + + readlen = nghttp2_session_mem_recv(session->ngsession, region->base, + region->length); + if (readlen < 0) { + failed_read_cb(ISC_R_UNEXPECTED, session); + return; + } + + if ((size_t)readlen < region->length) { + size_t unread_size = region->length - readlen; + if (session->buf == NULL) { + isc_buffer_allocate(session->mctx, &session->buf, + unread_size); + isc_buffer_setautorealloc(session->buf, true); + } + isc_buffer_putmem(session->buf, region->base + readlen, + unread_size); + isc_nm_pauseread(session->handle); + } + + /* We might have something to receive or send, do IO */ + http_do_bio(session, NULL, NULL, NULL); +} + +static void +call_pending_callbacks(isc__nm_http_pending_callbacks_t pending_callbacks, + isc_result_t result) { + isc__nm_uvreq_t *cbreq = ISC_LIST_HEAD(pending_callbacks); + while (cbreq != NULL) { + isc__nm_uvreq_t *next = ISC_LIST_NEXT(cbreq, link); + ISC_LIST_UNLINK(pending_callbacks, cbreq, link); + isc__nm_sendcb(cbreq->handle->sock, cbreq, result, false); + cbreq = next; + } +} + +static void +http_writecb(isc_nmhandle_t *handle, isc_result_t result, void *arg) { + isc_http_send_req_t *req = (isc_http_send_req_t *)arg; + isc_nm_http_session_t *session = req->session; + isc_nmhandle_t *transphandle = req->transphandle; + + REQUIRE(VALID_HTTP2_SESSION(session)); + REQUIRE(VALID_NMHANDLE(handle)); + + if (http_session_active(session)) { + INSIST(session->handle == handle); + } + + call_pending_callbacks(req->pending_write_callbacks, result); + + if (req->cb != NULL) { + req->cb(req->httphandle, result, req->cbarg); + isc_nmhandle_detach(&req->httphandle); + } + + isc_buffer_free(&req->pending_write_data); + isc_mem_put(session->mctx, req, sizeof(*req)); + + session->sending--; + http_do_bio(session, NULL, NULL, NULL); + isc_nmhandle_detach(&transphandle); + if (result != ISC_R_SUCCESS && session->sending == 0) { + finish_http_session(session); + } + isc__nm_httpsession_detach(&session); +} + +static void +move_pending_send_callbacks(isc_nm_http_session_t *session, + isc_http_send_req_t *send) { + STATIC_ASSERT( + sizeof(session->pending_write_callbacks) == + sizeof(send->pending_write_callbacks), + "size of pending writes requests callbacks lists differs"); + memmove(&send->pending_write_callbacks, + &session->pending_write_callbacks, + sizeof(session->pending_write_callbacks)); + ISC_LIST_INIT(session->pending_write_callbacks); +} + +static bool +http_send_outgoing(isc_nm_http_session_t *session, isc_nmhandle_t *httphandle, + isc_nm_cb_t cb, void *cbarg) { + isc_http_send_req_t *send = NULL; + size_t total = 0; + isc_region_t send_data = { 0 }; + isc_nmhandle_t *transphandle = NULL; +#ifdef ENABLE_HTTP_WRITE_BUFFERING + size_t max_total_write_size = 0; +#endif /* ENABLE_HTTP_WRITE_BUFFERING */ + + if (!http_session_active(session) || + (!nghttp2_session_want_write(session->ngsession) && + session->pending_write_data == NULL)) + { + return (false); + } + + /* We need to attach to the session->handle earlier because as an + * indirect result of the nghttp2_session_mem_send() the session + * might get closed and the handle detached. However, there is + * still some outgoing data to handle and we need to call it + * anyway if only to get the write callback passed here to get + * called properly. */ + isc_nmhandle_attach(session->handle, &transphandle); + + while (nghttp2_session_want_write(session->ngsession)) { + const uint8_t *data = NULL; + const size_t pending = + nghttp2_session_mem_send(session->ngsession, &data); + const size_t new_total = total + pending; + + /* Sometimes nghttp2_session_mem_send() does not return any + * data to send even though nghttp2_session_want_write() + * returns success. */ + if (pending == 0 || data == NULL) { + break; + } + + /* reallocate buffer if required */ + if (session->pending_write_data == NULL) { + isc_buffer_allocate(session->mctx, + &session->pending_write_data, + INITIAL_DNS_MESSAGE_BUFFER_SIZE); + isc_buffer_setautorealloc(session->pending_write_data, + true); + } + isc_buffer_putmem(session->pending_write_data, data, pending); + total = new_total; + } + +#ifdef ENABLE_HTTP_WRITE_BUFFERING + if (session->pending_write_data != NULL) { + max_total_write_size = + isc_buffer_usedlength(session->pending_write_data); + } + + /* Here we are trying to flush the pending writes buffer earlier + * to avoid hitting unnecessary limitations on a TLS record size + * within some tools (e.g. flamethrower). */ + if (max_total_write_size >= FLUSH_HTTP_WRITE_BUFFER_AFTER) { + /* Case 1: We have equal or more than + * FLUSH_HTTP_WRITE_BUFFER_AFTER bytes to send. Let's flush it. + */ + total = max_total_write_size; + } else if (session->sending > 0 && total > 0) { + /* Case 2: There is one or more write requests in flight and + * we have some new data form nghttp2 to send. Let's put the + * write callback (if any) into the pending write callbacks + * list. Then let's return from the function: as soon as the + * "in-flight" write callback get's called or we have reached + * FLUSH_HTTP_WRITE_BUFFER_AFTER bytes in the write buffer, we + * will flush the buffer. */ + if (cb != NULL) { + isc__nm_uvreq_t *newcb = isc__nm_uvreq_get( + httphandle->sock->mgr, httphandle->sock); + + INSIST(VALID_NMHANDLE(httphandle)); + newcb->cb.send = cb; + newcb->cbarg = cbarg; + isc_nmhandle_attach(httphandle, &newcb->handle); + ISC_LIST_APPEND(session->pending_write_callbacks, newcb, + link); + } + goto nothing_to_send; + } else if (session->sending == 0 && total == 0 && + session->pending_write_data != NULL) + { + /* Case 3: There is no write in flight and we haven't got + * anything new from nghttp2, but there is some data pending + * in the write buffer. Let's flush the buffer. */ + isc_region_t region = { 0 }; + total = isc_buffer_usedlength(session->pending_write_data); + INSIST(total > 0); + isc_buffer_usedregion(session->pending_write_data, ®ion); + INSIST(total == region.length); + } else { + /* The other cases are, uninteresting, fall-through ones. */ + /* In the following cases (4-6) we will just bail out. */ + /* Case 4: There is nothing new to send, nor anything in the + * write buffer. */ + /* Case 5: There is nothing new to send and there is write + * request(s) in flight. */ + /* Case 6: There is nothing new to send nor there are any + * write requests in flight. */ + + /* Case 7: There is some new data to send and there are no any + * write requests in flight: Let's send the data.*/ + INSIST((total == 0 && session->pending_write_data == NULL) || + (total == 0 && session->sending > 0) || + (total == 0 && session->sending == 0) || + (total > 0 && session->sending == 0)); + } +#else + INSIST(ISC_LIST_EMPTY(session->pending_write_callbacks)); +#endif /* ENABLE_HTTP_WRITE_BUFFERING */ + + if (total == 0) { + /* No data returned */ + goto nothing_to_send; + } + + /* If we have reached the point it means that we need to send some + * data and flush the outgoing buffer. The code below does that. */ + send = isc_mem_get(session->mctx, sizeof(*send)); + + *send = (isc_http_send_req_t){ .pending_write_data = + session->pending_write_data, + .cb = cb, + .cbarg = cbarg }; + session->pending_write_data = NULL; + move_pending_send_callbacks(session, send); + + send->transphandle = transphandle; + isc__nm_httpsession_attach(session, &send->session); + + if (cb != NULL) { + INSIST(VALID_NMHANDLE(httphandle)); + isc_nmhandle_attach(httphandle, &send->httphandle); + } + + session->sending++; + isc_buffer_usedregion(send->pending_write_data, &send_data); + isc_nm_send(transphandle, &send_data, http_writecb, send); + return (true); +nothing_to_send: + isc_nmhandle_detach(&transphandle); + return (false); +} + +static void +http_do_bio(isc_nm_http_session_t *session, isc_nmhandle_t *send_httphandle, + isc_nm_cb_t send_cb, void *send_cbarg) { + REQUIRE(VALID_HTTP2_SESSION(session)); + + if (session->closed) { + return; + } else if (session->closing) { + /* + * There might be leftover callbacks waiting to be received + */ + if (session->sending == 0) { + finish_http_session(session); + } + return; + } else if (nghttp2_session_want_read(session->ngsession) == 0 && + nghttp2_session_want_write(session->ngsession) == 0 && + session->pending_write_data == NULL) + { + session->closing = true; + return; + } + + if (nghttp2_session_want_read(session->ngsession) != 0) { + if (!session->reading) { + /* We have not yet started reading from this handle */ + isc_nm_read(session->handle, http_readcb, session); + session->reading = true; + } else if (session->buf != NULL) { + size_t remaining = + isc_buffer_remaininglength(session->buf); + /* Leftover data in the buffer, use it */ + size_t readlen = nghttp2_session_mem_recv( + session->ngsession, + isc_buffer_current(session->buf), remaining); + + if (readlen == remaining) { + isc_buffer_free(&session->buf); + } else { + isc_buffer_forward(session->buf, readlen); + } + + http_do_bio(session, send_httphandle, send_cb, + send_cbarg); + return; + } else { + /* Resume reading, it's idempotent, wait for more */ + isc_nm_resumeread(session->handle); + } + } else { + /* We don't want more data, stop reading for now */ + isc_nm_pauseread(session->handle); + } + + if (send_cb != NULL) { + INSIST(VALID_NMHANDLE(send_httphandle)); + (void)http_send_outgoing(session, send_httphandle, send_cb, + send_cbarg); + } else { + INSIST(send_httphandle == NULL); + INSIST(send_cb == NULL); + INSIST(send_cbarg == NULL); + (void)http_send_outgoing(session, NULL, NULL, NULL); + } + + return; +} + +static isc_result_t +get_http_cstream(isc_nmsocket_t *sock, http_cstream_t **streamp) { + http_cstream_t *cstream = sock->h2.connect.cstream; + isc_result_t result; + + REQUIRE(streamp != NULL && *streamp == NULL); + + sock->h2.connect.cstream = NULL; + + if (cstream == NULL) { + result = new_http_cstream(sock, &cstream); + if (result != ISC_R_SUCCESS) { + INSIST(cstream == NULL); + return (result); + } + } + + *streamp = cstream; + return (ISC_R_SUCCESS); +} + +static void +http_call_connect_cb(isc_nmsocket_t *sock, isc_nm_http_session_t *session, + isc_result_t result) { + isc__nm_uvreq_t *req = NULL; + isc_nmhandle_t *httphandle = isc__nmhandle_get(sock, &sock->peer, + &sock->iface); + + REQUIRE(sock->connect_cb != NULL); + + if (result == ISC_R_SUCCESS) { + req = isc__nm_uvreq_get(sock->mgr, sock); + req->cb.connect = sock->connect_cb; + req->cbarg = sock->connect_cbarg; + if (session != NULL) { + session->client_httphandle = httphandle; + req->handle = NULL; + isc_nmhandle_attach(httphandle, &req->handle); + } else { + req->handle = httphandle; + } + + isc__nmsocket_clearcb(sock); + isc__nm_connectcb(sock, req, result, true); + } else { + void *cbarg = sock->connect_cbarg; + isc_nm_cb_t connect_cb = sock->connect_cb; + + isc__nmsocket_clearcb(sock); + connect_cb(httphandle, result, cbarg); + isc_nmhandle_detach(&httphandle); + } +} + +static void +transport_connect_cb(isc_nmhandle_t *handle, isc_result_t result, void *cbarg) { + isc_nmsocket_t *http_sock = (isc_nmsocket_t *)cbarg; + isc_nmsocket_t *transp_sock = NULL; + isc_nm_http_session_t *session = NULL; + http_cstream_t *cstream = NULL; + isc_mem_t *mctx = NULL; + + REQUIRE(VALID_NMSOCK(http_sock)); + REQUIRE(VALID_NMHANDLE(handle)); + + transp_sock = handle->sock; + + REQUIRE(VALID_NMSOCK(transp_sock)); + + mctx = transp_sock->mgr->mctx; + + INSIST(http_sock->h2.connect.uri != NULL); + + http_sock->tid = transp_sock->tid; + http_sock->h2.connect.tls_peer_verify_string = + isc_nm_verify_tls_peer_result_string(handle); + if (result != ISC_R_SUCCESS) { + goto error; + } + + new_session(mctx, http_sock->h2.connect.tlsctx, &session); + session->client = true; + transp_sock->h2.session = session; + http_sock->h2.connect.tlsctx = NULL; + /* otherwise we will get some garbage output in DIG */ + http_sock->iface = handle->sock->iface; + http_sock->peer = handle->sock->peer; + + transp_sock->h2.connect.post = http_sock->h2.connect.post; + transp_sock->h2.connect.uri = http_sock->h2.connect.uri; + http_sock->h2.connect.uri = NULL; + isc__nm_httpsession_attach(session, &http_sock->h2.session); + + if (session->tlsctx != NULL) { + const unsigned char *alpn = NULL; + unsigned int alpnlen = 0; + + INSIST(transp_sock->type == isc_nm_tlssocket); + + isc_tls_get_selected_alpn(transp_sock->tlsstream.tls, &alpn, + &alpnlen); + if (alpn == NULL || alpnlen != NGHTTP2_PROTO_VERSION_ID_LEN || + memcmp(NGHTTP2_PROTO_VERSION_ID, alpn, + NGHTTP2_PROTO_VERSION_ID_LEN) != 0) + { + /* + * HTTP/2 negotiation error. Any sensible DoH + * client will fail if HTTP/2 cannot be + * negotiated via ALPN. + */ + result = ISC_R_HTTP2ALPNERROR; + goto error; + } + } + + isc_nmhandle_attach(handle, &session->handle); + + initialize_nghttp2_client_session(session); + if (!send_client_connection_header(session)) { + goto error; + } + + result = get_http_cstream(http_sock, &cstream); + http_sock->h2.connect.cstream = cstream; + if (result != ISC_R_SUCCESS) { + goto error; + } + + http_transpost_tcp_nodelay(handle); + + http_call_connect_cb(http_sock, session, result); + + http_do_bio(session, NULL, NULL, NULL); + isc__nmsocket_detach(&http_sock); + return; + +error: + http_call_connect_cb(http_sock, session, result); + + if (http_sock->h2.connect.uri != NULL) { + isc_mem_free(mctx, http_sock->h2.connect.uri); + } + + isc__nmsocket_prep_destroy(http_sock); + isc__nmsocket_detach(&http_sock); +} + +void +isc_nm_httpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, + const char *uri, bool post, isc_nm_cb_t cb, void *cbarg, + isc_tlsctx_t *tlsctx, + isc_tlsctx_client_session_cache_t *client_sess_cache, + unsigned int timeout, size_t extrahandlesize) { + isc_sockaddr_t local_interface; + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NM(mgr)); + REQUIRE(cb != NULL); + REQUIRE(peer != NULL); + REQUIRE(uri != NULL); + REQUIRE(*uri != '\0'); + + if (local == NULL) { + isc_sockaddr_anyofpf(&local_interface, peer->type.sa.sa_family); + local = &local_interface; + } + + sock = isc_mem_get(mgr->mctx, sizeof(*sock)); + isc__nmsocket_init(sock, mgr, isc_nm_httpsocket, local); + + sock->extrahandlesize = extrahandlesize; + sock->connect_timeout = timeout; + sock->result = ISC_R_UNSET; + sock->connect_cb = cb; + sock->connect_cbarg = cbarg; + atomic_init(&sock->client, true); + + if (isc__nm_closing(sock)) { + isc__nm_uvreq_t *req = isc__nm_uvreq_get(mgr, sock); + + req->cb.connect = cb; + req->cbarg = cbarg; + req->peer = *peer; + req->local = *local; + req->handle = isc__nmhandle_get(sock, &req->peer, &sock->iface); + + if (isc__nm_in_netthread()) { + sock->tid = isc_nm_tid(); + } + + isc__nmsocket_clearcb(sock); + isc__nm_connectcb(sock, req, ISC_R_SHUTTINGDOWN, true); + isc__nmsocket_prep_destroy(sock); + isc__nmsocket_detach(&sock); + return; + } + + sock->h2 = (isc_nmsocket_h2_t){ .connect.uri = isc_mem_strdup(mgr->mctx, + uri), + .connect.post = post, + .connect.tlsctx = tlsctx }; + ISC_LINK_INIT(&sock->h2, link); + + /* + * We need to prevent the interface object data from going out of + * scope too early. + */ + if (local == &local_interface) { + sock->h2.connect.local_interface = local_interface; + sock->iface = sock->h2.connect.local_interface; + } + + if (tlsctx != NULL) { + isc_nm_tlsconnect(mgr, local, peer, transport_connect_cb, sock, + tlsctx, client_sess_cache, timeout, 0); + } else { + isc_nm_tcpconnect(mgr, local, peer, transport_connect_cb, sock, + timeout, 0); + } +} + +static isc_result_t +client_send(isc_nmhandle_t *handle, const isc_region_t *region) { + isc_result_t result = ISC_R_SUCCESS; + isc_nmsocket_t *sock = handle->sock; + isc_mem_t *mctx = sock->mgr->mctx; + isc_nm_http_session_t *session = sock->h2.session; + http_cstream_t *cstream = sock->h2.connect.cstream; + + REQUIRE(VALID_HTTP2_SESSION(handle->sock->h2.session)); + REQUIRE(session->client); + REQUIRE(region != NULL); + REQUIRE(region->base != NULL); + REQUIRE(region->length <= MAX_DNS_MESSAGE_SIZE); + + if (session->closed) { + return (ISC_R_CANCELED); + } + + INSIST(cstream != NULL); + + if (cstream->post) { + /* POST */ + isc_buffer_allocate(mctx, &cstream->postdata, region->length); + isc_buffer_putmem(cstream->postdata, region->base, + region->length); + } else { + /* GET */ + size_t path_size = 0; + char *base64url_data = NULL; + size_t base64url_data_len = 0; + isc_buffer_t *buf = NULL; + isc_region_t data = *region; + isc_region_t base64_region; + size_t base64_len = ((4 * data.length / 3) + 3) & ~3; + + isc_buffer_allocate(mctx, &buf, base64_len); + + result = isc_base64_totext(&data, -1, "", buf); + if (result != ISC_R_SUCCESS) { + isc_buffer_free(&buf); + goto error; + } + + isc_buffer_usedregion(buf, &base64_region); + INSIST(base64_region.length == base64_len); + + base64url_data = isc__nm_base64_to_base64url( + mctx, (const char *)base64_region.base, + base64_region.length, &base64url_data_len); + isc_buffer_free(&buf); + if (base64url_data == NULL) { + goto error; + } + + /* len("?dns=") + len(path) + len(base64url) + len("\0") */ + path_size = cstream->pathlen + base64url_data_len + 5 + 1; + cstream->GET_path = isc_mem_allocate(mctx, path_size); + cstream->GET_path_len = (size_t)snprintf( + cstream->GET_path, path_size, "%.*s?dns=%s", + (int)cstream->pathlen, cstream->path, base64url_data); + + INSIST(cstream->GET_path_len == (path_size - 1)); + isc_mem_free(mctx, base64url_data); + } + + cstream->sending = true; + + sock->h2.connect.cstream = NULL; + result = client_submit_request(session, cstream); + if (result != ISC_R_SUCCESS) { + put_http_cstream(session->mctx, cstream); + goto error; + } + +error: + return (result); +} + +isc_result_t +isc__nm_http_request(isc_nmhandle_t *handle, isc_region_t *region, + isc_nm_recv_cb_t cb, void *cbarg) { + isc_result_t result = ISC_R_SUCCESS; + isc_nmsocket_t *sock = NULL; + http_cstream_t *cstream = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(handle->sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&handle->sock->client)); + + REQUIRE(cb != NULL); + + sock = handle->sock; + + isc__nm_http_read(handle, cb, cbarg); + if (!http_session_active(handle->sock->h2.session)) { + /* the callback was called by isc__nm_http_read() */ + return (ISC_R_CANCELED); + } + result = client_send(handle, region); + if (result != ISC_R_SUCCESS) { + goto error; + } + + return (ISC_R_SUCCESS); + +error: + cstream = sock->h2.connect.cstream; + if (cstream->read_cb != NULL) { + cstream->read_cb(handle, result, NULL, cstream->read_cbarg); + } + return (result); +} + +static int +server_on_begin_headers_callback(nghttp2_session *ngsession, + const nghttp2_frame *frame, void *user_data) { + isc_nm_http_session_t *session = (isc_nm_http_session_t *)user_data; + isc_nmsocket_t *socket = NULL; + + if (frame->hd.type != NGHTTP2_HEADERS || + frame->headers.cat != NGHTTP2_HCAT_REQUEST) + { + return (0); + } else if (frame->hd.length > MAX_ALLOWED_DATA_IN_HEADERS) { + return (NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE); + } + + if (session->nsstreams >= session->max_concurrent_streams) { + return (NGHTTP2_ERR_CALLBACK_FAILURE); + } + + socket = isc_mem_get(session->mctx, sizeof(isc_nmsocket_t)); + isc__nmsocket_init(socket, session->serversocket->mgr, + isc_nm_httpsocket, + (isc_sockaddr_t *)&session->handle->sock->iface); + socket->peer = session->handle->sock->peer; + socket->h2 = (isc_nmsocket_h2_t){ + .psock = socket, + .stream_id = frame->hd.stream_id, + .headers_error_code = ISC_HTTP_ERROR_SUCCESS, + .request_type = ISC_HTTP_REQ_UNSUPPORTED, + .request_scheme = ISC_HTTP_SCHEME_UNSUPPORTED + }; + isc_buffer_initnull(&socket->h2.rbuf); + isc_buffer_initnull(&socket->h2.wbuf); + session->nsstreams++; + isc__nm_httpsession_attach(session, &socket->h2.session); + socket->tid = session->handle->sock->tid; + ISC_LINK_INIT(&socket->h2, link); + ISC_LIST_APPEND(session->sstreams, &socket->h2, link); + + nghttp2_session_set_stream_user_data(ngsession, frame->hd.stream_id, + socket); + return (0); +} + +static isc_nm_httphandler_t * +find_server_request_handler(const char *request_path, + isc_nmsocket_t *serversocket, const int tid) { + isc_nm_httphandler_t *handler = NULL; + + REQUIRE(VALID_NMSOCK(serversocket)); + + if (atomic_load(&serversocket->listening)) { + handler = http_endpoints_find( + request_path, + http_get_listener_endpoints(serversocket, tid)); + } + return (handler); +} + +static isc_http_error_responses_t +server_handle_path_header(isc_nmsocket_t *socket, const uint8_t *value, + const size_t valuelen) { + isc_nm_httphandler_t *handler = NULL; + const uint8_t *qstr = NULL; + size_t vlen = valuelen; + + qstr = memchr(value, '?', valuelen); + if (qstr != NULL) { + vlen = qstr - value; + } + + if (socket->h2.request_path != NULL) { + isc_mem_free(socket->mgr->mctx, socket->h2.request_path); + } + socket->h2.request_path = isc_mem_strndup( + socket->mgr->mctx, (const char *)value, vlen + 1); + + if (!isc_nm_http_path_isvalid(socket->h2.request_path)) { + isc_mem_free(socket->mgr->mctx, socket->h2.request_path); + socket->h2.request_path = NULL; + return (ISC_HTTP_ERROR_BAD_REQUEST); + } + + handler = find_server_request_handler(socket->h2.request_path, + socket->h2.session->serversocket, + socket->tid); + if (handler != NULL) { + socket->h2.cb = handler->cb; + socket->h2.cbarg = handler->cbarg; + socket->extrahandlesize = handler->extrahandlesize; + } else { + isc_mem_free(socket->mgr->mctx, socket->h2.request_path); + socket->h2.request_path = NULL; + return (ISC_HTTP_ERROR_NOT_FOUND); + } + + if (qstr != NULL) { + const char *dns_value = NULL; + size_t dns_value_len = 0; + + if (isc__nm_parse_httpquery((const char *)qstr, &dns_value, + &dns_value_len)) + { + const size_t decoded_size = dns_value_len / 4 * 3; + if (decoded_size <= MAX_DNS_MESSAGE_SIZE) { + if (socket->h2.query_data != NULL) { + isc_mem_free(socket->mgr->mctx, + socket->h2.query_data); + } + socket->h2.query_data = + isc__nm_base64url_to_base64( + socket->mgr->mctx, dns_value, + dns_value_len, + &socket->h2.query_data_len); + } else { + socket->h2.query_too_large = true; + return (ISC_HTTP_ERROR_PAYLOAD_TOO_LARGE); + } + } else { + return (ISC_HTTP_ERROR_BAD_REQUEST); + } + } + return (ISC_HTTP_ERROR_SUCCESS); +} + +static isc_http_error_responses_t +server_handle_method_header(isc_nmsocket_t *socket, const uint8_t *value, + const size_t valuelen) { + const char get[] = "GET"; + const char post[] = "POST"; + + if (HEADER_MATCH(get, value, valuelen)) { + socket->h2.request_type = ISC_HTTP_REQ_GET; + } else if (HEADER_MATCH(post, value, valuelen)) { + socket->h2.request_type = ISC_HTTP_REQ_POST; + } else { + return (ISC_HTTP_ERROR_NOT_IMPLEMENTED); + } + return (ISC_HTTP_ERROR_SUCCESS); +} + +static isc_http_error_responses_t +server_handle_scheme_header(isc_nmsocket_t *socket, const uint8_t *value, + const size_t valuelen) { + const char http[] = "http"; + const char http_secure[] = "https"; + + if (HEADER_MATCH(http_secure, value, valuelen)) { + socket->h2.request_scheme = ISC_HTTP_SCHEME_HTTP_SECURE; + } else if (HEADER_MATCH(http, value, valuelen)) { + socket->h2.request_scheme = ISC_HTTP_SCHEME_HTTP; + } else { + return (ISC_HTTP_ERROR_BAD_REQUEST); + } + return (ISC_HTTP_ERROR_SUCCESS); +} + +static isc_http_error_responses_t +server_handle_content_length_header(isc_nmsocket_t *socket, + const uint8_t *value, + const size_t valuelen) { + char tmp[32] = { 0 }; + const size_t tmplen = sizeof(tmp) - 1; + + strncpy(tmp, (const char *)value, + valuelen > tmplen ? tmplen : valuelen); + socket->h2.content_length = strtoul(tmp, NULL, 10); + if (socket->h2.content_length > MAX_DNS_MESSAGE_SIZE) { + return (ISC_HTTP_ERROR_PAYLOAD_TOO_LARGE); + } else if (socket->h2.content_length == 0) { + return (ISC_HTTP_ERROR_BAD_REQUEST); + } + return (ISC_HTTP_ERROR_SUCCESS); +} + +static isc_http_error_responses_t +server_handle_content_type_header(isc_nmsocket_t *socket, const uint8_t *value, + const size_t valuelen) { + const char type_dns_message[] = DNS_MEDIA_TYPE; + isc_http_error_responses_t resp = ISC_HTTP_ERROR_SUCCESS; + + UNUSED(socket); + + if (!HEADER_MATCH(type_dns_message, value, valuelen)) { + resp = ISC_HTTP_ERROR_UNSUPPORTED_MEDIA_TYPE; + } + return (resp); +} + +static isc_http_error_responses_t +server_handle_header(isc_nmsocket_t *socket, const uint8_t *name, + size_t namelen, const uint8_t *value, + const size_t valuelen) { + isc_http_error_responses_t code = ISC_HTTP_ERROR_SUCCESS; + bool was_error; + const char path[] = ":path"; + const char method[] = ":method"; + const char scheme[] = ":scheme"; + const char content_length[] = "Content-Length"; + const char content_type[] = "Content-Type"; + + was_error = socket->h2.headers_error_code != ISC_HTTP_ERROR_SUCCESS; + /* + * process Content-Length even when there was an error, + * to drop the connection earlier if required. + */ + if (HEADER_MATCH(content_length, name, namelen)) { + code = server_handle_content_length_header(socket, value, + valuelen); + } else if (!was_error && HEADER_MATCH(path, name, namelen)) { + code = server_handle_path_header(socket, value, valuelen); + } else if (!was_error && HEADER_MATCH(method, name, namelen)) { + code = server_handle_method_header(socket, value, valuelen); + } else if (!was_error && HEADER_MATCH(scheme, name, namelen)) { + code = server_handle_scheme_header(socket, value, valuelen); + } else if (!was_error && HEADER_MATCH(content_type, name, namelen)) { + code = server_handle_content_type_header(socket, value, + valuelen); + } + + return (code); +} + +static int +server_on_header_callback(nghttp2_session *session, const nghttp2_frame *frame, + const uint8_t *name, size_t namelen, + const uint8_t *value, size_t valuelen, uint8_t flags, + void *user_data) { + isc_nmsocket_t *socket = NULL; + isc_http_error_responses_t code = ISC_HTTP_ERROR_SUCCESS; + + UNUSED(flags); + UNUSED(user_data); + + socket = nghttp2_session_get_stream_user_data(session, + frame->hd.stream_id); + if (socket == NULL) { + return (NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE); + } + + socket->h2.headers_data_processed += (namelen + valuelen); + + switch (frame->hd.type) { + case NGHTTP2_HEADERS: + if (frame->headers.cat != NGHTTP2_HCAT_REQUEST) { + break; + } + code = server_handle_header(socket, name, namelen, value, + valuelen); + break; + } + + INSIST(socket != NULL); + + if (socket->h2.headers_data_processed > MAX_ALLOWED_DATA_IN_HEADERS) { + return (NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE); + } else if (socket->h2.content_length > MAX_ALLOWED_DATA_IN_POST) { + return (NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE); + } + + if (code == ISC_HTTP_ERROR_SUCCESS) { + return (0); + } else { + socket->h2.headers_error_code = code; + } + + return (0); +} + +static ssize_t +server_read_callback(nghttp2_session *ngsession, int32_t stream_id, + uint8_t *buf, size_t length, uint32_t *data_flags, + nghttp2_data_source *source, void *user_data) { + isc_nm_http_session_t *session = (isc_nm_http_session_t *)user_data; + isc_nmsocket_t *socket = (isc_nmsocket_t *)source->ptr; + size_t buflen; + + REQUIRE(socket->h2.stream_id == stream_id); + + UNUSED(ngsession); + UNUSED(session); + + buflen = isc_buffer_remaininglength(&socket->h2.wbuf); + if (buflen > length) { + buflen = length; + } + + if (buflen > 0) { + (void)memmove(buf, isc_buffer_current(&socket->h2.wbuf), + buflen); + isc_buffer_forward(&socket->h2.wbuf, buflen); + } + + if (isc_buffer_remaininglength(&socket->h2.wbuf) == 0) { + *data_flags |= NGHTTP2_DATA_FLAG_EOF; + } + + return (buflen); +} + +static isc_result_t +server_send_response(nghttp2_session *ngsession, int32_t stream_id, + const nghttp2_nv *nva, size_t nvlen, + isc_nmsocket_t *socket) { + nghttp2_data_provider data_prd; + int rv; + + if (socket->h2.response_submitted) { + /* NGHTTP2 will gladly accept new response (write request) + * from us even though we cannot send more than one over the + * same HTTP/2 stream. Thus, we need to handle this case + * manually. We will return failure code so that it will be + * passed to the write callback. */ + return (ISC_R_FAILURE); + } + + data_prd.source.ptr = socket; + data_prd.read_callback = server_read_callback; + + rv = nghttp2_submit_response(ngsession, stream_id, nva, nvlen, + &data_prd); + if (rv != 0) { + return (ISC_R_FAILURE); + } + + socket->h2.response_submitted = true; + return (ISC_R_SUCCESS); +} + +#define MAKE_ERROR_REPLY(tag, code, desc) \ + { \ + tag, MAKE_NV2(":status", #code), desc \ + } + +/* + * Here we use roughly the same error codes that Unbound uses. + * (https://blog.nlnetlabs.nl/dns-over-https-in-unbound/) + */ + +static struct http_error_responses { + const isc_http_error_responses_t type; + const nghttp2_nv header; + const char *desc; +} error_responses[] = { + MAKE_ERROR_REPLY(ISC_HTTP_ERROR_BAD_REQUEST, 400, "Bad Request"), + MAKE_ERROR_REPLY(ISC_HTTP_ERROR_NOT_FOUND, 404, "Not Found"), + MAKE_ERROR_REPLY(ISC_HTTP_ERROR_PAYLOAD_TOO_LARGE, 413, + "Payload Too Large"), + MAKE_ERROR_REPLY(ISC_HTTP_ERROR_URI_TOO_LONG, 414, "URI Too Long"), + MAKE_ERROR_REPLY(ISC_HTTP_ERROR_UNSUPPORTED_MEDIA_TYPE, 415, + "Unsupported Media Type"), + MAKE_ERROR_REPLY(ISC_HTTP_ERROR_GENERIC, 500, "Internal Server Error"), + MAKE_ERROR_REPLY(ISC_HTTP_ERROR_NOT_IMPLEMENTED, 501, "Not Implemented") +}; + +static void +log_server_error_response(const isc_nmsocket_t *socket, + const struct http_error_responses *response) { + const int log_level = ISC_LOG_DEBUG(1); + isc_sockaddr_t client_addr; + isc_sockaddr_t local_addr; + char client_sabuf[ISC_SOCKADDR_FORMATSIZE]; + char local_sabuf[ISC_SOCKADDR_FORMATSIZE]; + + if (!isc_log_wouldlog(isc_lctx, log_level)) { + return; + } + + client_addr = isc_nmhandle_peeraddr(socket->h2.session->handle); + local_addr = isc_nmhandle_localaddr(socket->h2.session->handle); + isc_sockaddr_format(&client_addr, client_sabuf, sizeof(client_sabuf)); + isc_sockaddr_format(&local_addr, local_sabuf, sizeof(local_sabuf)); + isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_NETMGR, + log_level, "HTTP/2 request from %s (on %s) failed: %s %s", + client_sabuf, local_sabuf, response->header.value, + response->desc); +} + +static isc_result_t +server_send_error_response(const isc_http_error_responses_t error, + nghttp2_session *ngsession, isc_nmsocket_t *socket) { + void *base; + + REQUIRE(error != ISC_HTTP_ERROR_SUCCESS); + + base = isc_buffer_base(&socket->h2.rbuf); + if (base != NULL) { + isc_mem_free(socket->h2.session->mctx, base); + isc_buffer_initnull(&socket->h2.rbuf); + } + + /* We do not want the error response to be cached anywhere. */ + socket->h2.min_ttl = 0; + + for (size_t i = 0; + i < sizeof(error_responses) / sizeof(error_responses[0]); i++) + { + if (error_responses[i].type == error) { + log_server_error_response(socket, &error_responses[i]); + return (server_send_response( + ngsession, socket->h2.stream_id, + &error_responses[i].header, 1, socket)); + } + } + + return (server_send_error_response(ISC_HTTP_ERROR_GENERIC, ngsession, + socket)); +} + +static void +server_call_cb(isc_nmsocket_t *socket, isc_nm_http_session_t *session, + const isc_result_t result, isc_region_t *data) { + isc_sockaddr_t addr; + isc_nmhandle_t *handle = NULL; + + REQUIRE(VALID_NMSOCK(socket)); + REQUIRE(VALID_HTTP2_SESSION(session)); + REQUIRE(socket->h2.cb != NULL); + + addr = isc_nmhandle_peeraddr(session->handle); + handle = isc__nmhandle_get(socket, &addr, NULL); + socket->h2.cb(handle, result, data, socket->h2.cbarg); + isc_nmhandle_detach(&handle); +} + +void +isc__nm_http_bad_request(isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + sock = handle->sock; + REQUIRE(sock->type == isc_nm_httpsocket); + REQUIRE(!atomic_load(&sock->client)); + REQUIRE(VALID_HTTP2_SESSION(sock->h2.session)); + + (void)server_send_error_response(ISC_HTTP_ERROR_BAD_REQUEST, + sock->h2.session->ngsession, sock); +} + +static int +server_on_request_recv(nghttp2_session *ngsession, + isc_nm_http_session_t *session, isc_nmsocket_t *socket) { + isc_result_t result; + isc_http_error_responses_t code = ISC_HTTP_ERROR_SUCCESS; + isc_region_t data; + uint8_t tmp_buf[MAX_DNS_MESSAGE_SIZE]; + + code = socket->h2.headers_error_code; + if (code != ISC_HTTP_ERROR_SUCCESS) { + goto error; + } + + if (socket->h2.request_path == NULL || socket->h2.cb == NULL) { + code = ISC_HTTP_ERROR_NOT_FOUND; + } else if (socket->h2.request_type == ISC_HTTP_REQ_POST && + socket->h2.content_length == 0) + { + code = ISC_HTTP_ERROR_BAD_REQUEST; + } else if (socket->h2.request_type == ISC_HTTP_REQ_POST && + isc_buffer_usedlength(&socket->h2.rbuf) > + socket->h2.content_length) + { + code = ISC_HTTP_ERROR_PAYLOAD_TOO_LARGE; + } else if (socket->h2.request_type == ISC_HTTP_REQ_POST && + isc_buffer_usedlength(&socket->h2.rbuf) != + socket->h2.content_length) + { + code = ISC_HTTP_ERROR_BAD_REQUEST; + } else if (socket->h2.request_type == ISC_HTTP_REQ_POST && + socket->h2.query_data != NULL) + { + /* The spec does not mention which value the query string for + * POST should have. For GET we use its value to decode a DNS + * message from it, for POST the message is transferred in the + * body of the request. Taking it into account, it is much safer + * to treat POST + * requests with query strings as malformed ones. */ + code = ISC_HTTP_ERROR_BAD_REQUEST; + } else if (socket->h2.request_type == ISC_HTTP_REQ_GET && + socket->h2.content_length > 0) + { + code = ISC_HTTP_ERROR_BAD_REQUEST; + } else if (socket->h2.request_type == ISC_HTTP_REQ_GET && + socket->h2.query_data == NULL) + { + /* A GET request without any query data - there is nothing to + * decode. */ + INSIST(socket->h2.query_data_len == 0); + code = ISC_HTTP_ERROR_BAD_REQUEST; + } + + if (code != ISC_HTTP_ERROR_SUCCESS) { + goto error; + } + + if (socket->h2.request_type == ISC_HTTP_REQ_GET) { + isc_buffer_t decoded_buf; + isc_buffer_init(&decoded_buf, tmp_buf, sizeof(tmp_buf)); + if (isc_base64_decodestring(socket->h2.query_data, + &decoded_buf) != ISC_R_SUCCESS) + { + code = ISC_HTTP_ERROR_BAD_REQUEST; + goto error; + } + isc_buffer_usedregion(&decoded_buf, &data); + } else if (socket->h2.request_type == ISC_HTTP_REQ_POST) { + INSIST(socket->h2.content_length > 0); + isc_buffer_usedregion(&socket->h2.rbuf, &data); + } else { + UNREACHABLE(); + } + + server_call_cb(socket, session, ISC_R_SUCCESS, &data); + + return (0); + +error: + result = server_send_error_response(code, ngsession, socket); + if (result != ISC_R_SUCCESS) { + return (NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE); + } + return (0); +} + +void +isc__nm_http_send(isc_nmhandle_t *handle, const isc_region_t *region, + isc_nm_cb_t cb, void *cbarg) { + isc_nmsocket_t *sock = NULL; + isc__netievent_httpsend_t *ievent = NULL; + isc__nm_uvreq_t *uvreq = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + + sock = handle->sock; + + REQUIRE(VALID_NMSOCK(sock)); + + uvreq = isc__nm_uvreq_get(sock->mgr, sock); + isc_nmhandle_attach(handle, &uvreq->handle); + uvreq->cb.send = cb; + uvreq->cbarg = cbarg; + + uvreq->uvbuf.base = (char *)region->base; + uvreq->uvbuf.len = region->length; + + ievent = isc__nm_get_netievent_httpsend(sock->mgr, sock, uvreq); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +static void +failed_send_cb(isc_nmsocket_t *sock, isc__nm_uvreq_t *req, + isc_result_t eresult) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(req)); + + if (req->cb.send != NULL) { + isc__nm_sendcb(sock, req, eresult, true); + } else { + isc__nm_uvreq_put(&req, sock); + } +} + +static void +client_httpsend(isc_nmhandle_t *handle, isc_nmsocket_t *sock, + isc__nm_uvreq_t *req) { + isc_result_t result = ISC_R_SUCCESS; + isc_nm_cb_t cb = req->cb.send; + void *cbarg = req->cbarg; + + result = client_send( + handle, + &(isc_region_t){ (uint8_t *)req->uvbuf.base, req->uvbuf.len }); + if (result != ISC_R_SUCCESS) { + failed_send_cb(sock, req, result); + return; + } + + http_do_bio(sock->h2.session, handle, cb, cbarg); + isc__nm_uvreq_put(&req, sock); +} + +static void +server_httpsend(isc_nmhandle_t *handle, isc_nmsocket_t *sock, + isc__nm_uvreq_t *req) { + size_t content_len_buf_len, cache_control_buf_len; + isc_result_t result = ISC_R_SUCCESS; + isc_nm_cb_t cb = req->cb.send; + void *cbarg = req->cbarg; + if (isc__nmsocket_closing(sock) || + !http_session_active(handle->httpsession)) + { + failed_send_cb(sock, req, ISC_R_CANCELED); + return; + } + + INSIST(handle->httpsession->handle->sock->tid == isc_nm_tid()); + INSIST(VALID_NMHANDLE(handle->httpsession->handle)); + INSIST(VALID_NMSOCK(handle->httpsession->handle->sock)); + + isc_buffer_init(&sock->h2.wbuf, req->uvbuf.base, req->uvbuf.len); + isc_buffer_add(&sock->h2.wbuf, req->uvbuf.len); + + content_len_buf_len = snprintf(sock->h2.clenbuf, + sizeof(sock->h2.clenbuf), "%lu", + (unsigned long)req->uvbuf.len); + if (sock->h2.min_ttl == 0) { + cache_control_buf_len = + snprintf(sock->h2.cache_control_buf, + sizeof(sock->h2.cache_control_buf), "%s", + DEFAULT_CACHE_CONTROL); + } else { + cache_control_buf_len = + snprintf(sock->h2.cache_control_buf, + sizeof(sock->h2.cache_control_buf), + "max-age=%" PRIu32, sock->h2.min_ttl); + } + const nghttp2_nv hdrs[] = { MAKE_NV2(":status", "200"), + MAKE_NV2("Content-Type", DNS_MEDIA_TYPE), + MAKE_NV("Content-Length", sock->h2.clenbuf, + content_len_buf_len), + MAKE_NV("Cache-Control", + sock->h2.cache_control_buf, + cache_control_buf_len) }; + + result = server_send_response(handle->httpsession->ngsession, + sock->h2.stream_id, hdrs, + sizeof(hdrs) / sizeof(nghttp2_nv), sock); + + if (result == ISC_R_SUCCESS) { + http_do_bio(handle->httpsession, handle, cb, cbarg); + } else { + cb(handle, result, cbarg); + } + isc__nm_uvreq_put(&req, sock); +} + +void +isc__nm_async_httpsend(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_httpsend_t *ievent = (isc__netievent_httpsend_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *req = ievent->req; + isc_nmhandle_t *handle = NULL; + isc_nm_http_session_t *session = NULL; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(req)); + REQUIRE(VALID_HTTP2_SESSION(sock->h2.session)); + + ievent->req = NULL; + handle = req->handle; + + REQUIRE(VALID_NMHANDLE(handle)); + + session = sock->h2.session; + if (session != NULL && session->client) { + client_httpsend(handle, sock, req); + } else { + server_httpsend(handle, sock, req); + } +} + +void +isc__nm_http_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg) { + isc_result_t result; + http_cstream_t *cstream = NULL; + isc_nm_http_session_t *session = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + + session = handle->sock->h2.session; + if (!http_session_active(session)) { + cb(handle, ISC_R_CANCELED, NULL, cbarg); + return; + } + + result = get_http_cstream(handle->sock, &cstream); + if (result != ISC_R_SUCCESS) { + return; + } + + handle->sock->h2.connect.cstream = cstream; + cstream->read_cb = cb; + cstream->read_cbarg = cbarg; + cstream->reading = true; + + if (cstream->sending) { + result = client_submit_request(session, cstream); + if (result != ISC_R_SUCCESS) { + put_http_cstream(session->mctx, cstream); + return; + } + + http_do_bio(session, NULL, NULL, NULL); + } +} + +static int +server_on_frame_recv_callback(nghttp2_session *ngsession, + const nghttp2_frame *frame, void *user_data) { + isc_nm_http_session_t *session = (isc_nm_http_session_t *)user_data; + isc_nmsocket_t *socket = NULL; + + switch (frame->hd.type) { + case NGHTTP2_DATA: + case NGHTTP2_HEADERS: + /* Check that the client request has finished */ + if (frame->hd.flags & NGHTTP2_FLAG_END_STREAM) { + socket = nghttp2_session_get_stream_user_data( + ngsession, frame->hd.stream_id); + + /* + * For DATA and HEADERS frame, this callback may be + * called after on_stream_close_callback. Check that + * the stream is still alive. + */ + if (socket == NULL) { + return (0); + } + + return (server_on_request_recv(ngsession, session, + socket)); + } + break; + default: + break; + } + return (0); +} + +static void +initialize_nghttp2_server_session(isc_nm_http_session_t *session) { + nghttp2_session_callbacks *callbacks = NULL; + nghttp2_mem mem; + + init_nghttp2_mem(session->mctx, &mem); + + RUNTIME_CHECK(nghttp2_session_callbacks_new(&callbacks) == 0); + + nghttp2_session_callbacks_set_on_data_chunk_recv_callback( + callbacks, on_data_chunk_recv_callback); + + nghttp2_session_callbacks_set_on_stream_close_callback( + callbacks, on_stream_close_callback); + + nghttp2_session_callbacks_set_on_header_callback( + callbacks, server_on_header_callback); + + nghttp2_session_callbacks_set_on_begin_headers_callback( + callbacks, server_on_begin_headers_callback); + + nghttp2_session_callbacks_set_on_frame_recv_callback( + callbacks, server_on_frame_recv_callback); + + RUNTIME_CHECK(nghttp2_session_server_new3(&session->ngsession, + callbacks, session, NULL, + &mem) == 0); + + nghttp2_session_callbacks_del(callbacks); +} + +static int +server_send_connection_header(isc_nm_http_session_t *session) { + nghttp2_settings_entry iv[1] = { + { NGHTTP2_SETTINGS_MAX_CONCURRENT_STREAMS, + session->max_concurrent_streams } + }; + int rv; + + rv = nghttp2_submit_settings(session->ngsession, NGHTTP2_FLAG_NONE, iv, + 1); + if (rv != 0) { + return (-1); + } + return (0); +} + +/* + * It is advisable to disable Nagle's algorithm for HTTP/2 + * connections because multiple HTTP/2 streams could be multiplexed + * over one transport connection. Thus, delays when delivering small + * packets could bring down performance for the whole session. + * HTTP/2 is meant to be used this way. + */ +static void +http_transpost_tcp_nodelay(isc_nmhandle_t *transphandle) { + isc_nmsocket_t *tcpsock = NULL; + uv_os_fd_t tcp_fd = (uv_os_fd_t)-1; + + if (transphandle->sock->type == isc_nm_tlssocket) { + tcpsock = transphandle->sock->outerhandle->sock; + } else { + tcpsock = transphandle->sock; + } + + (void)uv_fileno((uv_handle_t *)&tcpsock->uv_handle.tcp, &tcp_fd); + RUNTIME_CHECK(tcp_fd != (uv_os_fd_t)-1); + (void)isc__nm_socket_tcp_nodelay((uv_os_sock_t)tcp_fd); +} + +static isc_result_t +httplisten_acceptcb(isc_nmhandle_t *handle, isc_result_t result, void *cbarg) { + isc_nmsocket_t *httplistensock = (isc_nmsocket_t *)cbarg; + isc_nm_http_session_t *session = NULL; + isc_nmsocket_t *listener = NULL, *httpserver = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + if (handle->sock->type == isc_nm_tlssocket) { + REQUIRE(VALID_NMSOCK(handle->sock->listener)); + listener = handle->sock->listener; + httpserver = listener->h2.httpserver; + } else { + REQUIRE(VALID_NMSOCK(handle->sock->server)); + listener = handle->sock->server; + REQUIRE(VALID_NMSOCK(listener->parent)); + httpserver = listener->parent->h2.httpserver; + } + + /* + * NOTE: HTTP listener socket might be destroyed by the time this + * function gets invoked, so we need to do extra sanity checks to + * detect this case. + */ + if (isc__nmsocket_closing(handle->sock) || httpserver == NULL) { + return (ISC_R_CANCELED); + } + + if (result != ISC_R_SUCCESS) { + /* XXXWPK do nothing? */ + return (result); + } + + REQUIRE(VALID_NMSOCK(httplistensock)); + INSIST(httplistensock == httpserver); + + if (isc__nmsocket_closing(httplistensock) || + !atomic_load(&httplistensock->listening)) + { + return (ISC_R_CANCELED); + } + + http_transpost_tcp_nodelay(handle); + + new_session(httplistensock->mgr->mctx, NULL, &session); + session->max_concurrent_streams = + atomic_load(&httplistensock->h2.max_concurrent_streams); + initialize_nghttp2_server_session(session); + handle->sock->h2.session = session; + + isc_nmhandle_attach(handle, &session->handle); + isc__nmsocket_attach(httplistensock, &session->serversocket); + server_send_connection_header(session); + + /* TODO H2 */ + http_do_bio(session, NULL, NULL, NULL); + return (ISC_R_SUCCESS); +} + +isc_result_t +isc_nm_listenhttp(isc_nm_t *mgr, isc_sockaddr_t *iface, int backlog, + isc_quota_t *quota, isc_tlsctx_t *ctx, + isc_nm_http_endpoints_t *eps, uint32_t max_concurrent_streams, + isc_nmsocket_t **sockp) { + isc_nmsocket_t *sock = NULL; + isc_result_t result; + + REQUIRE(!ISC_LIST_EMPTY(eps->handlers)); + REQUIRE(!ISC_LIST_EMPTY(eps->handler_cbargs)); + REQUIRE(atomic_load(&eps->in_use) == false); + + sock = isc_mem_get(mgr->mctx, sizeof(*sock)); + isc__nmsocket_init(sock, mgr, isc_nm_httplistener, iface); + atomic_init(&sock->h2.max_concurrent_streams, + NGHTTP2_INITIAL_MAX_CONCURRENT_STREAMS); + + isc_nmsocket_set_max_streams(sock, max_concurrent_streams); + + atomic_store(&eps->in_use, true); + http_init_listener_endpoints(sock, eps); + + if (ctx != NULL) { + result = isc_nm_listentls(mgr, iface, httplisten_acceptcb, sock, + sizeof(isc_nm_http_session_t), + backlog, quota, ctx, &sock->outer); + } else { + result = isc_nm_listentcp(mgr, iface, httplisten_acceptcb, sock, + sizeof(isc_nm_http_session_t), + backlog, quota, &sock->outer); + } + + if (result != ISC_R_SUCCESS) { + atomic_store(&sock->closed, true); + isc__nmsocket_detach(&sock); + return (result); + } + + isc__nmsocket_attach(sock, &sock->outer->h2.httpserver); + + sock->nchildren = sock->outer->nchildren; + sock->result = ISC_R_UNSET; + sock->tid = 0; + sock->fd = (uv_os_sock_t)-1; + + isc__nmsocket_barrier_init(sock); + atomic_init(&sock->rchildren, sock->nchildren); + + atomic_store(&sock->listening, true); + *sockp = sock; + return (ISC_R_SUCCESS); +} + +isc_nm_http_endpoints_t * +isc_nm_http_endpoints_new(isc_mem_t *mctx) { + isc_nm_http_endpoints_t *restrict eps; + REQUIRE(mctx != NULL); + + eps = isc_mem_get(mctx, sizeof(*eps)); + *eps = (isc_nm_http_endpoints_t){ .mctx = NULL }; + + isc_mem_attach(mctx, &eps->mctx); + ISC_LIST_INIT(eps->handler_cbargs); + ISC_LIST_INIT(eps->handlers); + isc_refcount_init(&eps->references, 1); + atomic_init(&eps->in_use, false); + eps->magic = HTTP_ENDPOINTS_MAGIC; + + return eps; +} + +void +isc_nm_http_endpoints_detach(isc_nm_http_endpoints_t **restrict epsp) { + isc_nm_http_endpoints_t *restrict eps; + isc_mem_t *mctx; + isc_nm_httphandler_t *handler = NULL; + isc_nm_httpcbarg_t *httpcbarg = NULL; + + REQUIRE(epsp != NULL); + eps = *epsp; + REQUIRE(VALID_HTTP_ENDPOINTS(eps)); + + if (isc_refcount_decrement(&eps->references) > 1) { + *epsp = NULL; + return; + } + + mctx = eps->mctx; + + /* Delete all handlers */ + handler = ISC_LIST_HEAD(eps->handlers); + while (handler != NULL) { + isc_nm_httphandler_t *next = NULL; + + next = ISC_LIST_NEXT(handler, link); + ISC_LIST_DEQUEUE(eps->handlers, handler, link); + isc_mem_free(mctx, handler->path); + isc_mem_put(mctx, handler, sizeof(*handler)); + handler = next; + } + + httpcbarg = ISC_LIST_HEAD(eps->handler_cbargs); + while (httpcbarg != NULL) { + isc_nm_httpcbarg_t *next = NULL; + + next = ISC_LIST_NEXT(httpcbarg, link); + ISC_LIST_DEQUEUE(eps->handler_cbargs, httpcbarg, link); + isc_mem_put(mctx, httpcbarg, sizeof(isc_nm_httpcbarg_t)); + httpcbarg = next; + } + + eps->magic = 0; + + isc_mem_putanddetach(&mctx, eps, sizeof(*eps)); + *epsp = NULL; +} + +void +isc_nm_http_endpoints_attach(isc_nm_http_endpoints_t *source, + isc_nm_http_endpoints_t **targetp) { + REQUIRE(VALID_HTTP_ENDPOINTS(source)); + REQUIRE(targetp != NULL && *targetp == NULL); + + isc_refcount_increment(&source->references); + + *targetp = source; +} + +static isc_nm_httphandler_t * +http_endpoints_find(const char *request_path, + const isc_nm_http_endpoints_t *restrict eps) { + isc_nm_httphandler_t *handler = NULL; + + REQUIRE(VALID_HTTP_ENDPOINTS(eps)); + + if (request_path == NULL || *request_path == '\0') { + return (NULL); + } + + for (handler = ISC_LIST_HEAD(eps->handlers); handler != NULL; + handler = ISC_LIST_NEXT(handler, link)) + { + if (!strcmp(request_path, handler->path)) { + break; + } + } + + return (handler); +} + +/* + * In DoH we just need to intercept the request - the response can be sent + * to the client code via the nmhandle directly as it's always just the + * http content. + */ +static void +http_callback(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *data, + void *arg) { + isc_nm_httpcbarg_t *httpcbarg = arg; + + REQUIRE(VALID_NMHANDLE(handle)); + + if (result != ISC_R_SUCCESS) { + /* Shut down the client, then ourselves */ + httpcbarg->cb(handle, result, NULL, httpcbarg->cbarg); + /* XXXWPK FREE */ + return; + } + httpcbarg->cb(handle, result, data, httpcbarg->cbarg); +} + +isc_result_t +isc_nm_http_endpoints_add(isc_nm_http_endpoints_t *restrict eps, + const char *uri, const isc_nm_recv_cb_t cb, + void *cbarg, const size_t extrahandlesize) { + isc_mem_t *mctx; + isc_nm_httphandler_t *restrict handler = NULL; + isc_nm_httpcbarg_t *restrict httpcbarg = NULL; + bool newhandler = false; + + REQUIRE(VALID_HTTP_ENDPOINTS(eps)); + REQUIRE(isc_nm_http_path_isvalid(uri)); + REQUIRE(atomic_load(&eps->in_use) == false); + + mctx = eps->mctx; + + httpcbarg = isc_mem_get(mctx, sizeof(isc_nm_httpcbarg_t)); + *httpcbarg = (isc_nm_httpcbarg_t){ .cb = cb, .cbarg = cbarg }; + ISC_LINK_INIT(httpcbarg, link); + + if (http_endpoints_find(uri, eps) == NULL) { + handler = isc_mem_get(mctx, sizeof(*handler)); + *handler = (isc_nm_httphandler_t){ + .cb = http_callback, + .cbarg = httpcbarg, + .extrahandlesize = extrahandlesize, + .path = isc_mem_strdup(mctx, uri) + }; + ISC_LINK_INIT(handler, link); + + newhandler = true; + } + + if (newhandler) { + ISC_LIST_APPEND(eps->handlers, handler, link); + } + ISC_LIST_APPEND(eps->handler_cbargs, httpcbarg, link); + return (ISC_R_SUCCESS); +} + +void +isc__nm_http_stoplistening(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_httplistener); + + isc__nmsocket_stop(sock); +} + +static void +http_close_direct(isc_nmsocket_t *sock) { + isc_nm_http_session_t *session = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + + atomic_store(&sock->closed, true); + atomic_store(&sock->active, false); + session = sock->h2.session; + + if (session != NULL && session->sending == 0 && !session->reading) { + /* + * The socket is going to be closed too early without been + * used even once (might happen in a case of low level + * error). + */ + finish_http_session(session); + } else if (session != NULL && session->handle) { + http_do_bio(session, NULL, NULL, NULL); + } +} + +void +isc__nm_http_close(isc_nmsocket_t *sock) { + bool destroy = false; + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_httpsocket); + REQUIRE(!isc__nmsocket_active(sock)); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + return; + } + + if (sock->h2.session != NULL && sock->h2.session->closed && + sock->tid == isc_nm_tid()) + { + isc__nm_httpsession_detach(&sock->h2.session); + destroy = true; + } else if (sock->h2.session == NULL && sock->tid == isc_nm_tid()) { + destroy = true; + } + + if (destroy) { + http_close_direct(sock); + isc__nmsocket_prep_destroy(sock); + return; + } + + isc__netievent_httpclose_t *ievent = + isc__nm_get_netievent_httpclose(sock->mgr, sock); + + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +void +isc__nm_async_httpclose(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_httpclose_t *ievent = (isc__netievent_httpclose_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + UNUSED(worker); + + http_close_direct(sock); +} + +static void +failed_httpstream_read_cb(isc_nmsocket_t *sock, isc_result_t result, + isc_nm_http_session_t *session) { + isc_region_t data; + REQUIRE(VALID_NMSOCK(sock)); + INSIST(sock->type == isc_nm_httpsocket); + + if (sock->h2.request_path == NULL) { + return; + } + + INSIST(sock->h2.cbarg != NULL); + + (void)nghttp2_submit_rst_stream( + session->ngsession, NGHTTP2_FLAG_END_STREAM, sock->h2.stream_id, + NGHTTP2_REFUSED_STREAM); + isc_buffer_usedregion(&sock->h2.rbuf, &data); + server_call_cb(sock, session, result, &data); +} + +static void +client_call_failed_read_cb(isc_result_t result, + isc_nm_http_session_t *session) { + http_cstream_t *cstream = NULL; + + REQUIRE(VALID_HTTP2_SESSION(session)); + REQUIRE(result != ISC_R_SUCCESS); + + cstream = ISC_LIST_HEAD(session->cstreams); + while (cstream != NULL) { + http_cstream_t *next = ISC_LIST_NEXT(cstream, link); + + /* + * read_cb could be NULL if cstream was allocated and added + * to the tracking list, but was not properly initialized due + * to a low-level error. It is safe to get rid of the object + * in such a case. + */ + if (cstream->read_cb != NULL) { + isc_region_t read_data; + isc_buffer_usedregion(cstream->rbuf, &read_data); + cstream->read_cb(session->client_httphandle, result, + &read_data, cstream->read_cbarg); + } + + if (result != ISC_R_TIMEDOUT || cstream->read_cb == NULL || + !isc__nmsocket_timer_running(session->handle->sock)) + { + ISC_LIST_DEQUEUE(session->cstreams, cstream, link); + put_http_cstream(session->mctx, cstream); + } + + cstream = next; + } +} + +static void +server_call_failed_read_cb(isc_result_t result, + isc_nm_http_session_t *session) { + isc_nmsocket_h2_t *h2data = NULL; /* stream socket */ + + REQUIRE(VALID_HTTP2_SESSION(session)); + REQUIRE(result != ISC_R_SUCCESS); + + for (h2data = ISC_LIST_HEAD(session->sstreams); h2data != NULL; + h2data = ISC_LIST_NEXT(h2data, link)) + { + failed_httpstream_read_cb(h2data->psock, result, session); + } + + h2data = ISC_LIST_HEAD(session->sstreams); + while (h2data != NULL) { + isc_nmsocket_h2_t *next = ISC_LIST_NEXT(h2data, link); + ISC_LIST_DEQUEUE(session->sstreams, h2data, link); + /* Cleanup socket in place */ + atomic_store(&h2data->psock->active, false); + atomic_store(&h2data->psock->closed, true); + isc__nmsocket_detach(&h2data->psock); + + h2data = next; + } +} + +static void +failed_read_cb(isc_result_t result, isc_nm_http_session_t *session) { + if (session->client) { + client_call_failed_read_cb(result, session); + /* + * If result was ISC_R_TIMEDOUT and the timer was reset, + * then we still have active streams and should not close + * the session. + */ + if (ISC_LIST_EMPTY(session->cstreams)) { + finish_http_session(session); + } + } else { + server_call_failed_read_cb(result, session); + /* + * All streams are now destroyed; close the session. + */ + finish_http_session(session); + } +} + +void +isc__nm_http_set_maxage(isc_nmhandle_t *handle, const uint32_t ttl) { + isc_nm_http_session_t *session; + isc_nmsocket_t *sock; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + sock = handle->sock; + session = sock->h2.session; + + INSIST(VALID_HTTP2_SESSION(session)); + INSIST(!session->client); + + sock->h2.min_ttl = ttl; +} + +bool +isc__nm_http_has_encryption(const isc_nmhandle_t *handle) { + isc_nm_http_session_t *session; + isc_nmsocket_t *sock; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + sock = handle->sock; + session = sock->h2.session; + + INSIST(VALID_HTTP2_SESSION(session)); + + return (isc_nm_socket_type(session->handle) == isc_nm_tlssocket); +} + +const char * +isc__nm_http_verify_tls_peer_result_string(const isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + isc_nm_http_session_t *session; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(handle->sock->type == isc_nm_httpsocket); + + sock = handle->sock; + session = sock->h2.session; + + /* + * In the case of a low-level error the session->handle is not + * attached nor session object is created. + */ + if (session == NULL && sock->h2.connect.tls_peer_verify_string != NULL) + { + return (sock->h2.connect.tls_peer_verify_string); + } + + if (session == NULL) { + return (NULL); + } + + INSIST(VALID_HTTP2_SESSION(session)); + + return (isc_nm_verify_tls_peer_result_string(session->handle)); +} + +void +isc__nm_http_set_tlsctx(isc_nmsocket_t *listener, isc_tlsctx_t *tlsctx) { + REQUIRE(VALID_NMSOCK(listener)); + REQUIRE(listener->type == isc_nm_httplistener); + + isc_nmsocket_set_tlsctx(listener->outer, tlsctx); +} + +void +isc__nm_http_set_max_streams(isc_nmsocket_t *listener, + const uint32_t max_concurrent_streams) { + uint32_t max_streams = NGHTTP2_INITIAL_MAX_CONCURRENT_STREAMS; + + REQUIRE(VALID_NMSOCK(listener)); + REQUIRE(listener->type == isc_nm_httplistener); + + if (max_concurrent_streams > 0 && + max_concurrent_streams < NGHTTP2_INITIAL_MAX_CONCURRENT_STREAMS) + { + max_streams = max_concurrent_streams; + } + + atomic_store(&listener->h2.max_concurrent_streams, max_streams); +} + +void +isc_nm_http_set_endpoints(isc_nmsocket_t *listener, + isc_nm_http_endpoints_t *eps) { + size_t nworkers; + + REQUIRE(VALID_NMSOCK(listener)); + REQUIRE(listener->type == isc_nm_httplistener); + REQUIRE(VALID_HTTP_ENDPOINTS(eps)); + + atomic_store(&eps->in_use, true); + + nworkers = (size_t)listener->mgr->nworkers; + for (size_t i = 0; i < nworkers; i++) { + isc__netievent__http_eps_t *ievent = + isc__nm_get_netievent_httpendpoints(listener->mgr, + listener, eps); + isc__nm_enqueue_ievent(&listener->mgr->workers[i], + (isc__netievent_t *)ievent); + } +} + +void +isc__nm_async_httpendpoints(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent__http_eps_t *ievent = (isc__netievent__http_eps_t *)ev0; + const int tid = isc_nm_tid(); + isc_nmsocket_t *listener = ievent->sock; + isc_nm_http_endpoints_t *eps = ievent->endpoints; + UNUSED(worker); + + isc_nm_http_endpoints_detach(&listener->h2.listener_endpoints[tid]); + isc_nm_http_endpoints_attach(eps, + &listener->h2.listener_endpoints[tid]); +} + +static void +http_init_listener_endpoints(isc_nmsocket_t *listener, + isc_nm_http_endpoints_t *epset) { + size_t nworkers; + + REQUIRE(VALID_NMSOCK(listener)); + REQUIRE(VALID_NM(listener->mgr)); + REQUIRE(VALID_HTTP_ENDPOINTS(epset)); + + nworkers = (size_t)listener->mgr->nworkers; + INSIST(nworkers > 0); + + listener->h2.listener_endpoints = + isc_mem_get(listener->mgr->mctx, + sizeof(isc_nm_http_endpoints_t *) * nworkers); + listener->h2.n_listener_endpoints = nworkers; + for (size_t i = 0; i < nworkers; i++) { + listener->h2.listener_endpoints[i] = NULL; + isc_nm_http_endpoints_attach( + epset, &listener->h2.listener_endpoints[i]); + } +} + +static void +http_cleanup_listener_endpoints(isc_nmsocket_t *listener) { + REQUIRE(VALID_NM(listener->mgr)); + + if (listener->h2.listener_endpoints == NULL) { + return; + } + + for (size_t i = 0; i < listener->h2.n_listener_endpoints; i++) { + isc_nm_http_endpoints_detach( + &listener->h2.listener_endpoints[i]); + } + isc_mem_put(listener->mgr->mctx, listener->h2.listener_endpoints, + sizeof(isc_nm_http_endpoints_t *) * + listener->h2.n_listener_endpoints); + listener->h2.n_listener_endpoints = 0; +} + +static isc_nm_http_endpoints_t * +http_get_listener_endpoints(isc_nmsocket_t *listener, const int tid) { + isc_nm_http_endpoints_t *eps; + REQUIRE(VALID_NMSOCK(listener)); + REQUIRE(tid >= 0); + REQUIRE((size_t)tid < listener->h2.n_listener_endpoints); + + eps = listener->h2.listener_endpoints[tid]; + INSIST(eps != NULL); + return (eps); +} + +static const bool base64url_validation_table[256] = { + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, true, false, false, true, true, + true, true, true, true, true, true, true, true, false, false, + false, false, false, false, false, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, + true, false, false, false, false, true, false, true, true, true, + true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, + true, true, true, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false +}; + +char * +isc__nm_base64url_to_base64(isc_mem_t *mem, const char *base64url, + const size_t base64url_len, size_t *res_len) { + char *res = NULL; + size_t i, k, len; + + if (mem == NULL || base64url == NULL || base64url_len == 0) { + return (NULL); + } + + len = base64url_len % 4 ? base64url_len + (4 - base64url_len % 4) + : base64url_len; + res = isc_mem_allocate(mem, len + 1); /* '\0' */ + + for (i = 0; i < base64url_len; i++) { + switch (base64url[i]) { + case '-': + res[i] = '+'; + break; + case '_': + res[i] = '/'; + break; + default: + if (base64url_validation_table[(size_t)base64url[i]]) { + res[i] = base64url[i]; + } else { + isc_mem_free(mem, res); + return (NULL); + } + break; + } + } + + if (base64url_len % 4 != 0) { + for (k = 0; k < (4 - base64url_len % 4); k++, i++) { + res[i] = '='; + } + } + + INSIST(i == len); + + if (res_len != NULL) { + *res_len = len; + } + + res[len] = '\0'; + + return (res); +} + +char * +isc__nm_base64_to_base64url(isc_mem_t *mem, const char *base64, + const size_t base64_len, size_t *res_len) { + char *res = NULL; + size_t i; + + if (mem == NULL || base64 == NULL || base64_len == 0) { + return (NULL); + } + + res = isc_mem_allocate(mem, base64_len + 1); /* '\0' */ + + for (i = 0; i < base64_len; i++) { + switch (base64[i]) { + case '+': + res[i] = '-'; + break; + case '/': + res[i] = '_'; + break; + case '=': + goto end; + break; + default: + /* + * All other characters from the alphabet are the same + * for both base64 and base64url, so we can reuse the + * validation table for the rest of the characters. + */ + if (base64[i] != '-' && base64[i] != '_' && + base64url_validation_table[(size_t)base64[i]]) + { + res[i] = base64[i]; + } else { + isc_mem_free(mem, res); + return (NULL); + } + break; + } + } +end: + if (res_len) { + *res_len = i; + } + + res[i] = '\0'; + + return (res); +} + +void +isc__nm_http_initsocket(isc_nmsocket_t *sock) { + REQUIRE(sock != NULL); + + sock->h2 = (isc_nmsocket_h2_t){ + .request_type = ISC_HTTP_REQ_UNSUPPORTED, + .request_scheme = ISC_HTTP_SCHEME_UNSUPPORTED, + }; +} + +void +isc__nm_http_cleanup_data(isc_nmsocket_t *sock) { + if ((sock->type == isc_nm_tcplistener || + sock->type == isc_nm_tlslistener) && + sock->h2.httpserver != NULL) + { + isc__nmsocket_detach(&sock->h2.httpserver); + } + + if (sock->type == isc_nm_httplistener || + sock->type == isc_nm_httpsocket) + { + if (sock->type == isc_nm_httplistener && + sock->h2.listener_endpoints != NULL) + { + /* Delete all handlers */ + http_cleanup_listener_endpoints(sock); + } + + if (sock->h2.request_path != NULL) { + isc_mem_free(sock->mgr->mctx, sock->h2.request_path); + sock->h2.request_path = NULL; + } + + if (sock->h2.query_data != NULL) { + isc_mem_free(sock->mgr->mctx, sock->h2.query_data); + sock->h2.query_data = NULL; + } + + INSIST(sock->h2.connect.cstream == NULL); + + if (isc_buffer_base(&sock->h2.rbuf) != NULL) { + void *base = isc_buffer_base(&sock->h2.rbuf); + isc_mem_free(sock->mgr->mctx, base); + isc_buffer_initnull(&sock->h2.rbuf); + } + } + + if ((sock->type == isc_nm_httplistener || + sock->type == isc_nm_httpsocket || + sock->type == isc_nm_tcpsocket || + sock->type == isc_nm_tlssocket) && + sock->h2.session != NULL) + { + if (sock->h2.connect.uri != NULL) { + isc_mem_free(sock->mgr->mctx, sock->h2.connect.uri); + sock->h2.connect.uri = NULL; + } + isc__nm_httpsession_detach(&sock->h2.session); + } +} + +void +isc__nm_http_cleartimeout(isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(handle->sock->type == isc_nm_httpsocket); + + sock = handle->sock; + if (sock->h2.session != NULL && sock->h2.session->handle != NULL) { + INSIST(VALID_HTTP2_SESSION(sock->h2.session)); + INSIST(VALID_NMHANDLE(sock->h2.session->handle)); + isc_nmhandle_cleartimeout(sock->h2.session->handle); + } +} + +void +isc__nm_http_settimeout(isc_nmhandle_t *handle, uint32_t timeout) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(handle->sock->type == isc_nm_httpsocket); + + sock = handle->sock; + if (sock->h2.session != NULL && sock->h2.session->handle != NULL) { + INSIST(VALID_HTTP2_SESSION(sock->h2.session)); + INSIST(VALID_NMHANDLE(sock->h2.session->handle)); + isc_nmhandle_settimeout(sock->h2.session->handle, timeout); + } +} + +void +isc__nmhandle_http_keepalive(isc_nmhandle_t *handle, bool value) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(handle->sock->type == isc_nm_httpsocket); + + sock = handle->sock; + if (sock->h2.session != NULL && sock->h2.session->handle) { + INSIST(VALID_HTTP2_SESSION(sock->h2.session)); + INSIST(VALID_NMHANDLE(sock->h2.session->handle)); + + isc_nmhandle_keepalive(sock->h2.session->handle, value); + } +} + +void +isc_nm_http_makeuri(const bool https, const isc_sockaddr_t *sa, + const char *hostname, const uint16_t http_port, + const char *abs_path, char *outbuf, + const size_t outbuf_len) { + char saddr[INET6_ADDRSTRLEN] = { 0 }; + int family; + bool ipv6_addr = false; + struct sockaddr_in6 sa6; + uint16_t host_port = http_port; + const char *host = NULL; + + REQUIRE(outbuf != NULL); + REQUIRE(outbuf_len != 0); + REQUIRE(isc_nm_http_path_isvalid(abs_path)); + + /* If hostname is specified, use that. */ + if (hostname != NULL && hostname[0] != '\0') { + /* + * The host name could be an IPv6 address. If so, + * wrap it between [ and ]. + */ + if (inet_pton(AF_INET6, hostname, &sa6) == 1 && + hostname[0] != '[') + { + ipv6_addr = true; + } + host = hostname; + } else { + /* + * A hostname was not specified; build one from + * the given IP address. + */ + INSIST(sa != NULL); + family = ((const struct sockaddr *)&sa->type.sa)->sa_family; + host_port = ntohs(family == AF_INET ? sa->type.sin.sin_port + : sa->type.sin6.sin6_port); + ipv6_addr = family == AF_INET6; + (void)inet_ntop( + family, + family == AF_INET + ? (const struct sockaddr *)&sa->type.sin.sin_addr + : (const struct sockaddr *)&sa->type.sin6 + .sin6_addr, + saddr, sizeof(saddr)); + host = saddr; + } + + /* + * If the port number was not specified, the default + * depends on whether we're using encryption or not. + */ + if (host_port == 0) { + host_port = https ? 443 : 80; + } + + (void)snprintf(outbuf, outbuf_len, "%s://%s%s%s:%u%s", + https ? "https" : "http", ipv6_addr ? "[" : "", host, + ipv6_addr ? "]" : "", host_port, abs_path); +} + +/* + * DoH GET Query String Scanner-less Recursive Descent Parser/Verifier + * + * It is based on the following grammar (using WSN/EBNF): + * + * S = query-string. + * query-string = ['?'] { key-value-pair } EOF. + * key-value-pair = key '=' value [ '&' ]. + * key = ('_' | alpha) { '_' | alnum}. + * value = value-char {value-char}. + * value-char = unreserved-char | percent-charcode. + * unreserved-char = alnum |'_' | '.' | '-' | '~'. (* RFC3986, Section 2.3 *) + * percent-charcode = '%' hexdigit hexdigit. + * ... + * + * Should be good enough. + */ +typedef struct isc_httpparser_state { + const char *str; + + const char *last_key; + size_t last_key_len; + + const char *last_value; + size_t last_value_len; + + bool query_found; + const char *query; + size_t query_len; +} isc_httpparser_state_t; + +#define MATCH(ch) (st->str[0] == (ch)) +#define MATCH_ALPHA() isalpha((unsigned char)(st->str[0])) +#define MATCH_DIGIT() isdigit((unsigned char)(st->str[0])) +#define MATCH_ALNUM() isalnum((unsigned char)(st->str[0])) +#define MATCH_XDIGIT() isxdigit((unsigned char)(st->str[0])) +#define ADVANCE() st->str++ +#define GETP() (st->str) + +static bool +rule_query_string(isc_httpparser_state_t *st); + +bool +isc__nm_parse_httpquery(const char *query_string, const char **start, + size_t *len) { + isc_httpparser_state_t state; + + REQUIRE(start != NULL); + REQUIRE(len != NULL); + + if (query_string == NULL || query_string[0] == '\0') { + return (false); + } + + state = (isc_httpparser_state_t){ .str = query_string }; + if (!rule_query_string(&state)) { + return (false); + } + + if (!state.query_found) { + return (false); + } + + *start = state.query; + *len = state.query_len; + + return (true); +} + +static bool +rule_key_value_pair(isc_httpparser_state_t *st); + +static bool +rule_key(isc_httpparser_state_t *st); + +static bool +rule_value(isc_httpparser_state_t *st); + +static bool +rule_value_char(isc_httpparser_state_t *st); + +static bool +rule_percent_charcode(isc_httpparser_state_t *st); + +static bool +rule_unreserved_char(isc_httpparser_state_t *st); + +static bool +rule_query_string(isc_httpparser_state_t *st) { + if (MATCH('?')) { + ADVANCE(); + } + + while (rule_key_value_pair(st)) { + /* skip */; + } + + if (!MATCH('\0')) { + return (false); + } + + ADVANCE(); + return (true); +} + +static bool +rule_key_value_pair(isc_httpparser_state_t *st) { + if (!rule_key(st)) { + return (false); + } + + if (MATCH('=')) { + ADVANCE(); + } else { + return (false); + } + + if (rule_value(st)) { + const char dns[] = "dns"; + if (st->last_key_len == sizeof(dns) - 1 && + memcmp(st->last_key, dns, sizeof(dns) - 1) == 0) + { + st->query_found = true; + st->query = st->last_value; + st->query_len = st->last_value_len; + } + } else { + return (false); + } + + if (MATCH('&')) { + ADVANCE(); + } + + return (true); +} + +static bool +rule_key(isc_httpparser_state_t *st) { + if (MATCH('_') || MATCH_ALPHA()) { + st->last_key = GETP(); + ADVANCE(); + } else { + return (false); + } + + while (MATCH('_') || MATCH_ALNUM()) { + ADVANCE(); + } + + st->last_key_len = GETP() - st->last_key; + return (true); +} + +static bool +rule_value(isc_httpparser_state_t *st) { + const char *s = GETP(); + if (!rule_value_char(st)) { + return (false); + } + + st->last_value = s; + while (rule_value_char(st)) { + /* skip */; + } + st->last_value_len = GETP() - st->last_value; + return (true); +} + +static bool +rule_value_char(isc_httpparser_state_t *st) { + if (rule_unreserved_char(st)) { + return (true); + } + + return (rule_percent_charcode(st)); +} + +static bool +rule_unreserved_char(isc_httpparser_state_t *st) { + if (MATCH_ALNUM() || MATCH('_') || MATCH('.') || MATCH('-') || + MATCH('~')) + { + ADVANCE(); + return (true); + } + return (false); +} + +static bool +rule_percent_charcode(isc_httpparser_state_t *st) { + if (MATCH('%')) { + ADVANCE(); + } else { + return (false); + } + + if (!MATCH_XDIGIT()) { + return (false); + } + ADVANCE(); + + if (!MATCH_XDIGIT()) { + return (false); + } + ADVANCE(); + + return (true); +} + +/* + * DoH URL Location Verifier. Based on the following grammar (EBNF/WSN + * notation): + * + * S = path_absolute. + * path_absolute = '/' [ segments ] '\0'. + * segments = segment_nz { slash_segment }. + * slash_segment = '/' segment. + * segment = { pchar }. + * segment_nz = pchar { pchar }. + * pchar = unreserved | pct_encoded | sub_delims | ':' | '@'. + * unreserved = ALPHA | DIGIT | '-' | '.' | '_' | '~'. + * pct_encoded = '%' XDIGIT XDIGIT. + * sub_delims = '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | + * ',' | ';' | '='. + * + * The grammar is extracted from RFC 3986. It is slightly modified to + * aid in parser creation, but the end result is the same + * (path_absolute is defined slightly differently - split into + * multiple productions). + * + * https://datatracker.ietf.org/doc/html/rfc3986#appendix-A + */ + +typedef struct isc_http_location_parser_state { + const char *str; +} isc_http_location_parser_state_t; + +static bool +rule_loc_path_absolute(isc_http_location_parser_state_t *); + +static bool +rule_loc_segments(isc_http_location_parser_state_t *); + +static bool +rule_loc_slash_segment(isc_http_location_parser_state_t *); + +static bool +rule_loc_segment(isc_http_location_parser_state_t *); + +static bool +rule_loc_segment_nz(isc_http_location_parser_state_t *); + +static bool +rule_loc_pchar(isc_http_location_parser_state_t *); + +static bool +rule_loc_unreserved(isc_http_location_parser_state_t *); + +static bool +rule_loc_pct_encoded(isc_http_location_parser_state_t *); + +static bool +rule_loc_sub_delims(isc_http_location_parser_state_t *); + +static bool +rule_loc_path_absolute(isc_http_location_parser_state_t *st) { + if (MATCH('/')) { + ADVANCE(); + } else { + return (false); + } + + (void)rule_loc_segments(st); + + if (MATCH('\0')) { + ADVANCE(); + } else { + return (false); + } + + return (true); +} + +static bool +rule_loc_segments(isc_http_location_parser_state_t *st) { + if (!rule_loc_segment_nz(st)) { + return (false); + } + + while (rule_loc_slash_segment(st)) { + /* zero or more */; + } + + return (true); +} + +static bool +rule_loc_slash_segment(isc_http_location_parser_state_t *st) { + if (MATCH('/')) { + ADVANCE(); + } else { + return (false); + } + + return (rule_loc_segment(st)); +} + +static bool +rule_loc_segment(isc_http_location_parser_state_t *st) { + while (rule_loc_pchar(st)) { + /* zero or more */; + } + + return (true); +} + +static bool +rule_loc_segment_nz(isc_http_location_parser_state_t *st) { + if (!rule_loc_pchar(st)) { + return (false); + } + + while (rule_loc_pchar(st)) { + /* zero or more */; + } + + return (true); +} + +static bool +rule_loc_pchar(isc_http_location_parser_state_t *st) { + if (rule_loc_unreserved(st)) { + return (true); + } else if (rule_loc_pct_encoded(st)) { + return (true); + } else if (rule_loc_sub_delims(st)) { + return (true); + } else if (MATCH(':') || MATCH('@')) { + ADVANCE(); + return (true); + } + + return (false); +} + +static bool +rule_loc_unreserved(isc_http_location_parser_state_t *st) { + if (MATCH_ALPHA() | MATCH_DIGIT() | MATCH('-') | MATCH('.') | + MATCH('_') | MATCH('~')) + { + ADVANCE(); + return (true); + } + return (false); +} + +static bool +rule_loc_pct_encoded(isc_http_location_parser_state_t *st) { + if (!MATCH('%')) { + return (false); + } + ADVANCE(); + + if (!MATCH_XDIGIT()) { + return (false); + } + ADVANCE(); + + if (!MATCH_XDIGIT()) { + return (false); + } + ADVANCE(); + + return (true); +} + +static bool +rule_loc_sub_delims(isc_http_location_parser_state_t *st) { + if (MATCH('!') | MATCH('$') | MATCH('&') | MATCH('\'') | MATCH('(') | + MATCH(')') | MATCH('*') | MATCH('+') | MATCH(',') | MATCH(';') | + MATCH('=')) + { + ADVANCE(); + return (true); + } + + return (false); +} + +bool +isc_nm_http_path_isvalid(const char *path) { + isc_http_location_parser_state_t state = { 0 }; + + REQUIRE(path != NULL); + + state.str = path; + + return (rule_loc_path_absolute(&state)); +} diff --git a/lib/isc/netmgr/netmgr-int.h b/lib/isc/netmgr/netmgr-int.h new file mode 100644 index 0000000..364a933 --- /dev/null +++ b/lib/isc/netmgr/netmgr-int.h @@ -0,0 +1,2273 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#pragma once + +#include <unistd.h> +#include <uv.h> + +#include <openssl/err.h> +#include <openssl/ssl.h> + +#include <isc/astack.h> +#include <isc/atomic.h> +#include <isc/barrier.h> +#include <isc/buffer.h> +#include <isc/condition.h> +#include <isc/magic.h> +#include <isc/mem.h> +#include <isc/netmgr.h> +#include <isc/quota.h> +#include <isc/random.h> +#include <isc/refcount.h> +#include <isc/region.h> +#include <isc/result.h> +#include <isc/sockaddr.h> +#include <isc/stats.h> +#include <isc/thread.h> +#include <isc/tls.h> +#include <isc/util.h> + +#include "uv-compat.h" + +#define ISC_NETMGR_TID_UNKNOWN -1 + +/* Must be different from ISC_NETMGR_TID_UNKNOWN */ +#define ISC_NETMGR_NON_INTERLOCKED -2 + +/* + * Receive buffers + */ +#if HAVE_DECL_UV_UDP_MMSG_CHUNK +/* + * The value 20 here is UV__MMSG_MAXWIDTH taken from the current libuv source, + * libuv will not receive more that 20 datagrams in a single recvmmsg call. + */ +#define ISC_NETMGR_UDP_RECVBUF_SIZE (20 * UINT16_MAX) +#else +/* + * A single DNS message size + */ +#define ISC_NETMGR_UDP_RECVBUF_SIZE UINT16_MAX +#endif + +/* + * The TCP receive buffer can fit one maximum sized DNS message plus its size, + * the receive buffer here affects TCP, DoT and DoH. + */ +#define ISC_NETMGR_TCP_RECVBUF_SIZE (sizeof(uint16_t) + UINT16_MAX) + +/* Pick the larger buffer */ +#define ISC_NETMGR_RECVBUF_SIZE \ + (ISC_NETMGR_UDP_RECVBUF_SIZE >= ISC_NETMGR_TCP_RECVBUF_SIZE \ + ? ISC_NETMGR_UDP_RECVBUF_SIZE \ + : ISC_NETMGR_TCP_RECVBUF_SIZE) + +/* + * Send buffer + */ +#define ISC_NETMGR_SENDBUF_SIZE (sizeof(uint16_t) + UINT16_MAX) + +/* + * Make sure our RECVBUF size is large enough + */ + +STATIC_ASSERT(ISC_NETMGR_UDP_RECVBUF_SIZE <= ISC_NETMGR_RECVBUF_SIZE, + "UDP receive buffer size must be smaller or equal than worker " + "receive buffer size"); + +STATIC_ASSERT(ISC_NETMGR_TCP_RECVBUF_SIZE <= ISC_NETMGR_RECVBUF_SIZE, + "TCP receive buffer size must be smaller or equal than worker " + "receive buffer size"); + +/*% + * Regular TCP buffer size. + */ +#define NM_REG_BUF 4096 + +/*% + * Larger buffer for when the regular one isn't enough; this will + * hold two full DNS packets with lengths. netmgr receives 64k at + * most in TCPDNS or TLSDNS connections, so there's no risk of overrun + * when using a buffer this size. + */ +#define NM_BIG_BUF ISC_NETMGR_TCP_RECVBUF_SIZE * 2 + +/*% + * Maximum segment size (MSS) of TCP socket on which the server responds to + * queries. Value lower than common MSS on Ethernet (1220, that is 1280 (IPv6 + * minimum link MTU) - 40 (IPv6 fixed header) - 20 (TCP fixed header)) will + * address path MTU problem. + */ +#define NM_MAXSEG (1280 - 20 - 40) + +/* + * Define NETMGR_TRACE to activate tracing of handles and sockets. + * This will impair performance but enables us to quickly determine, + * if netmgr resources haven't been cleaned up on shutdown, which ones + * are still in use. + */ +#ifdef NETMGR_TRACE +#define TRACE_SIZE 8 + +void +isc__nm_dump_active(isc_nm_t *nm); + +#if defined(__linux__) +#include <syscall.h> +#define gettid() (uint32_t) syscall(SYS_gettid) +#else +#define gettid() (uint32_t) pthread_self() +#endif + +#ifdef NETMGR_TRACE_VERBOSE +#define NETMGR_TRACE_LOG(format, ...) \ + fprintf(stderr, "%" PRIu32 ":%d:%s:%u:%s:" format, gettid(), \ + isc_nm_tid(), file, line, func, __VA_ARGS__) +#else +#define NETMGR_TRACE_LOG(format, ...) \ + (void)file; \ + (void)line; \ + (void)func; +#endif + +#define FLARG_PASS , file, line, func +#define FLARG \ + , const char *file __attribute__((unused)), \ + unsigned int line __attribute__((unused)), \ + const char *func __attribute__((unused)) +#define FLARG_IEVENT(ievent) \ + const char *file = ievent->file; \ + unsigned int line = ievent->line; \ + const char *func = ievent->func; +#define FLARG_IEVENT_PASS(ievent) \ + ievent->file = file; \ + ievent->line = line; \ + ievent->func = func; +#define isc__nm_uvreq_get(req, sock) \ + isc___nm_uvreq_get(req, sock, __FILE__, __LINE__, __func__) +#define isc__nm_uvreq_put(req, sock) \ + isc___nm_uvreq_put(req, sock, __FILE__, __LINE__, __func__) +#define isc__nmsocket_init(sock, mgr, type, iface) \ + isc___nmsocket_init(sock, mgr, type, iface, __FILE__, __LINE__, \ + __func__) +#define isc__nmsocket_put(sockp) \ + isc___nmsocket_put(sockp, __FILE__, __LINE__, __func__) +#define isc__nmsocket_attach(sock, target) \ + isc___nmsocket_attach(sock, target, __FILE__, __LINE__, __func__) +#define isc__nmsocket_detach(socketp) \ + isc___nmsocket_detach(socketp, __FILE__, __LINE__, __func__) +#define isc__nmsocket_close(socketp) \ + isc___nmsocket_close(socketp, __FILE__, __LINE__, __func__) +#define isc__nmhandle_get(sock, peer, local) \ + isc___nmhandle_get(sock, peer, local, __FILE__, __LINE__, __func__) +#define isc__nmsocket_prep_destroy(sock) \ + isc___nmsocket_prep_destroy(sock, __FILE__, __LINE__, __func__) +#else +#define NETMGR_TRACE_LOG(format, ...) + +#define FLARG_PASS +#define FLARG +#define FLARG_IEVENT(ievent) +#define FLARG_IEVENT_PASS(ievent) +#define isc__nm_uvreq_get(req, sock) isc___nm_uvreq_get(req, sock) +#define isc__nm_uvreq_put(req, sock) isc___nm_uvreq_put(req, sock) +#define isc__nmsocket_init(sock, mgr, type, iface) \ + isc___nmsocket_init(sock, mgr, type, iface) +#define isc__nmsocket_put(sockp) isc___nmsocket_put(sockp) +#define isc__nmsocket_attach(sock, target) isc___nmsocket_attach(sock, target) +#define isc__nmsocket_detach(socketp) isc___nmsocket_detach(socketp) +#define isc__nmsocket_close(socketp) isc___nmsocket_close(socketp) +#define isc__nmhandle_get(sock, peer, local) \ + isc___nmhandle_get(sock, peer, local) +#define isc__nmsocket_prep_destroy(sock) isc___nmsocket_prep_destroy(sock) +#endif + +/* + * Queue types in the order of processing priority. + */ +typedef enum { + NETIEVENT_PRIORITY = 0, + NETIEVENT_PRIVILEGED = 1, + NETIEVENT_TASK = 2, + NETIEVENT_NORMAL = 3, + NETIEVENT_MAX = 4, +} netievent_type_t; + +typedef struct isc__nm_uvreq isc__nm_uvreq_t; +typedef struct isc__netievent isc__netievent_t; + +typedef ISC_LIST(isc__netievent_t) isc__netievent_list_t; + +typedef struct ievent { + isc_mutex_t lock; + isc_condition_t cond; + isc__netievent_list_t list; +} ievent_t; + +/* + * Single network event loop worker. + */ +typedef struct isc__networker { + isc_nm_t *mgr; + int id; /* thread id */ + uv_loop_t loop; /* libuv loop structure */ + uv_async_t async; /* async channel to send + * data to this networker */ + bool paused; + bool finished; + isc_thread_t thread; + ievent_t ievents[NETIEVENT_MAX]; + + isc_refcount_t references; + atomic_int_fast64_t pktcount; + char *recvbuf; + char *sendbuf; + bool recvbuf_inuse; +} isc__networker_t; + +/* + * A general handle for a connection bound to a networker. For UDP + * connections we have peer address here, so both TCP and UDP can be + * handled with a simple send-like function + */ +#define NMHANDLE_MAGIC ISC_MAGIC('N', 'M', 'H', 'D') +#define VALID_NMHANDLE(t) \ + (ISC_MAGIC_VALID(t, NMHANDLE_MAGIC) && \ + atomic_load(&(t)->references) > 0) + +typedef void (*isc__nm_closecb)(isc_nmhandle_t *); +typedef struct isc_nm_http_session isc_nm_http_session_t; + +struct isc_nmhandle { + int magic; + isc_refcount_t references; + + /* + * The socket is not 'attached' in the traditional + * reference-counting sense. Instead, we keep all handles in an + * array in the socket object. This way, we don't have circular + * dependencies and we can close all handles when we're destroying + * the socket. + */ + isc_nmsocket_t *sock; + + isc_nm_http_session_t *httpsession; + + isc_sockaddr_t peer; + isc_sockaddr_t local; + isc_nm_opaquecb_t doreset; /* reset extra callback, external */ + isc_nm_opaquecb_t dofree; /* free extra callback, external */ +#ifdef NETMGR_TRACE + void *backtrace[TRACE_SIZE]; + int backtrace_size; + LINK(isc_nmhandle_t) active_link; +#endif + void *opaque; + char extra[]; +}; + +typedef enum isc__netievent_type { + netievent_udpconnect, + netievent_udpclose, + netievent_udpsend, + netievent_udpread, + netievent_udpcancel, + + netievent_routeconnect, + + netievent_tcpconnect, + netievent_tcpclose, + netievent_tcpsend, + netievent_tcpstartread, + netievent_tcppauseread, + netievent_tcpaccept, + netievent_tcpcancel, + + netievent_tcpdnsaccept, + netievent_tcpdnsconnect, + netievent_tcpdnsclose, + netievent_tcpdnssend, + netievent_tcpdnsread, + netievent_tcpdnscancel, + + netievent_tlsclose, + netievent_tlssend, + netievent_tlsstartread, + netievent_tlsconnect, + netievent_tlsdobio, + netievent_tlscancel, + + netievent_tlsdnsaccept, + netievent_tlsdnsconnect, + netievent_tlsdnsclose, + netievent_tlsdnssend, + netievent_tlsdnsread, + netievent_tlsdnscancel, + netievent_tlsdnscycle, + netievent_tlsdnsshutdown, + + netievent_httpclose, + netievent_httpsend, + netievent_httpendpoints, + + netievent_shutdown, + netievent_stop, + netievent_pause, + + netievent_connectcb, + netievent_readcb, + netievent_sendcb, + + netievent_detach, + netievent_close, + + netievent_task, + netievent_privilegedtask, + + netievent_settlsctx, + + /* + * event type values higher than this will be treated + * as high-priority events, which can be processed + * while the netmgr is pausing or paused. + */ + netievent_prio = 0xff, + + netievent_udplisten, + netievent_udpstop, + netievent_tcplisten, + netievent_tcpstop, + netievent_tcpdnslisten, + netievent_tcpdnsstop, + netievent_tlsdnslisten, + netievent_tlsdnsstop, + netievent_sockstop, /* for multilayer sockets */ + + netievent_resume, +} isc__netievent_type; + +typedef union { + isc_nm_recv_cb_t recv; + isc_nm_cb_t send; + isc_nm_cb_t connect; + isc_nm_accept_cb_t accept; +} isc__nm_cb_t; + +/* + * Wrapper around uv_req_t with 'our' fields in it. req->data should + * always point to its parent. Note that we always allocate more than + * sizeof(struct) because we make room for different req types; + */ +#define UVREQ_MAGIC ISC_MAGIC('N', 'M', 'U', 'R') +#define VALID_UVREQ(t) ISC_MAGIC_VALID(t, UVREQ_MAGIC) + +typedef struct isc__nm_uvreq isc__nm_uvreq_t; +struct isc__nm_uvreq { + int magic; + isc_nmsocket_t *sock; + isc_nmhandle_t *handle; + char tcplen[2]; /* The TCP DNS message length */ + uv_buf_t uvbuf; /* translated isc_region_t, to be + * sent or received */ + isc_sockaddr_t local; /* local address */ + isc_sockaddr_t peer; /* peer address */ + isc__nm_cb_t cb; /* callback */ + void *cbarg; /* callback argument */ + isc_nm_timer_t *timer; /* TCP write timer */ + int connect_tries; /* connect retries */ + + union { + uv_handle_t handle; + uv_req_t req; + uv_getaddrinfo_t getaddrinfo; + uv_getnameinfo_t getnameinfo; + uv_shutdown_t shutdown; + uv_write_t write; + uv_connect_t connect; + uv_udp_send_t udp_send; + uv_fs_t fs; + uv_work_t work; + } uv_req; + ISC_LINK(isc__nm_uvreq_t) link; +}; + +void * +isc__nm_get_netievent(isc_nm_t *mgr, isc__netievent_type type); +/*%< + * Allocate an ievent and set the type. + */ +void +isc__nm_put_netievent(isc_nm_t *mgr, void *ievent); + +/* + * The macros here are used to simulate the "inheritance" in C, there's the base + * netievent structure that contains just its own type and socket, and there are + * extended netievent types that also have handles or requests or other data. + * + * The macros here ensure that: + * + * 1. every netievent type has matching definition, declaration and + * implementation + * + * 2. we handle all the netievent types of same subclass the same, e.g. if the + * extended netievent contains handle, we always attach to the handle in + * the ctor and detach from the handle in dtor. + * + * There are three macros here for each netievent subclass: + * + * 1. NETIEVENT_*_TYPE(type) creates the typedef for each type; used below in + * this header + * + * 2. NETIEVENT_*_DECL(type) generates the declaration of the get and put + * functions (isc__nm_get_netievent_* and isc__nm_put_netievent_*); used + * below in this header + * + * 3. NETIEVENT_*_DEF(type) generates the definition of the functions; used + * either in netmgr.c or matching protocol file (e.g. udp.c, tcp.c, etc.) + */ + +#define NETIEVENT__SOCKET \ + isc__netievent_type type; \ + ISC_LINK(isc__netievent_t) link; \ + isc_nmsocket_t *sock; \ + const char *file; \ + unsigned int line; \ + const char *func; + +typedef struct isc__netievent__socket { + NETIEVENT__SOCKET; +} isc__netievent__socket_t; + +#define NETIEVENT_SOCKET_TYPE(type) \ + typedef isc__netievent__socket_t isc__netievent_##type##_t; + +#define NETIEVENT_SOCKET_DECL(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock); \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent); + +#define NETIEVENT_SOCKET_DEF(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock) { \ + isc__netievent_##type##_t *ievent = \ + isc__nm_get_netievent(nm, netievent_##type); \ + isc__nmsocket_attach(sock, &ievent->sock); \ + \ + return (ievent); \ + } \ + \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent) { \ + isc__nmsocket_detach(&ievent->sock); \ + isc__nm_put_netievent(nm, ievent); \ + } + +typedef struct isc__netievent__socket_req { + NETIEVENT__SOCKET; + isc__nm_uvreq_t *req; +} isc__netievent__socket_req_t; + +#define NETIEVENT_SOCKET_REQ_TYPE(type) \ + typedef isc__netievent__socket_req_t isc__netievent_##type##_t; + +#define NETIEVENT_SOCKET_REQ_DECL(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock, isc__nm_uvreq_t *req); \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent); + +#define NETIEVENT_SOCKET_REQ_DEF(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { \ + isc__netievent_##type##_t *ievent = \ + isc__nm_get_netievent(nm, netievent_##type); \ + isc__nmsocket_attach(sock, &ievent->sock); \ + ievent->req = req; \ + \ + return (ievent); \ + } \ + \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent) { \ + isc__nmsocket_detach(&ievent->sock); \ + isc__nm_put_netievent(nm, ievent); \ + } + +typedef struct isc__netievent__socket_req_result { + NETIEVENT__SOCKET; + isc__nm_uvreq_t *req; + isc_result_t result; +} isc__netievent__socket_req_result_t; + +#define NETIEVENT_SOCKET_REQ_RESULT_TYPE(type) \ + typedef isc__netievent__socket_req_result_t isc__netievent_##type##_t; + +#define NETIEVENT_SOCKET_REQ_RESULT_DECL(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock, isc__nm_uvreq_t *req, \ + isc_result_t result); \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent); + +#define NETIEVENT_SOCKET_REQ_RESULT_DEF(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock, isc__nm_uvreq_t *req, \ + isc_result_t result) { \ + isc__netievent_##type##_t *ievent = \ + isc__nm_get_netievent(nm, netievent_##type); \ + isc__nmsocket_attach(sock, &ievent->sock); \ + ievent->req = req; \ + ievent->result = result; \ + \ + return (ievent); \ + } \ + \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent) { \ + isc__nmsocket_detach(&ievent->sock); \ + isc__nm_put_netievent(nm, ievent); \ + } + +typedef struct isc__netievent__socket_handle { + NETIEVENT__SOCKET; + isc_nmhandle_t *handle; +} isc__netievent__socket_handle_t; + +#define NETIEVENT_SOCKET_HANDLE_TYPE(type) \ + typedef isc__netievent__socket_handle_t isc__netievent_##type##_t; + +#define NETIEVENT_SOCKET_HANDLE_DECL(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock, isc_nmhandle_t *handle); \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent); + +#define NETIEVENT_SOCKET_HANDLE_DEF(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock, isc_nmhandle_t *handle) { \ + isc__netievent_##type##_t *ievent = \ + isc__nm_get_netievent(nm, netievent_##type); \ + isc__nmsocket_attach(sock, &ievent->sock); \ + isc_nmhandle_attach(handle, &ievent->handle); \ + \ + return (ievent); \ + } \ + \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent) { \ + isc__nmsocket_detach(&ievent->sock); \ + isc_nmhandle_detach(&ievent->handle); \ + isc__nm_put_netievent(nm, ievent); \ + } + +typedef struct isc__netievent__socket_quota { + NETIEVENT__SOCKET; + isc_quota_t *quota; +} isc__netievent__socket_quota_t; + +#define NETIEVENT_SOCKET_QUOTA_TYPE(type) \ + typedef isc__netievent__socket_quota_t isc__netievent_##type##_t; + +#define NETIEVENT_SOCKET_QUOTA_DECL(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock, isc_quota_t *quota); \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent); + +#define NETIEVENT_SOCKET_QUOTA_DEF(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock, isc_quota_t *quota) { \ + isc__netievent_##type##_t *ievent = \ + isc__nm_get_netievent(nm, netievent_##type); \ + isc__nmsocket_attach(sock, &ievent->sock); \ + ievent->quota = quota; \ + \ + return (ievent); \ + } \ + \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent) { \ + isc__nmsocket_detach(&ievent->sock); \ + isc__nm_put_netievent(nm, ievent); \ + } + +typedef struct isc__netievent__task { + isc__netievent_type type; + ISC_LINK(isc__netievent_t) link; + isc_task_t *task; +} isc__netievent__task_t; + +#define NETIEVENT_TASK_TYPE(type) \ + typedef isc__netievent__task_t isc__netievent_##type##_t; + +#define NETIEVENT_TASK_DECL(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_task_t *task); \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent); + +#define NETIEVENT_TASK_DEF(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_task_t *task) { \ + isc__netievent_##type##_t *ievent = \ + isc__nm_get_netievent(nm, netievent_##type); \ + ievent->task = task; \ + \ + return (ievent); \ + } \ + \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent) { \ + ievent->task = NULL; \ + isc__nm_put_netievent(nm, ievent); \ + } + +typedef struct isc__netievent_udpsend { + NETIEVENT__SOCKET; + isc_sockaddr_t peer; + isc__nm_uvreq_t *req; +} isc__netievent_udpsend_t; + +typedef struct isc__netievent_tlsconnect { + NETIEVENT__SOCKET; + SSL_CTX *ctx; + isc_sockaddr_t local; /* local address */ + isc_sockaddr_t peer; /* peer address */ +} isc__netievent_tlsconnect_t; + +typedef struct isc__netievent { + isc__netievent_type type; + ISC_LINK(isc__netievent_t) link; +} isc__netievent_t; + +#define NETIEVENT_TYPE(type) typedef isc__netievent_t isc__netievent_##type##_t; + +#define NETIEVENT_DECL(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type(isc_nm_t *nm); \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent); + +#define NETIEVENT_DEF(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm) { \ + isc__netievent_##type##_t *ievent = \ + isc__nm_get_netievent(nm, netievent_##type); \ + \ + return (ievent); \ + } \ + \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent) { \ + isc__nm_put_netievent(nm, ievent); \ + } + +typedef struct isc__netievent__tlsctx { + NETIEVENT__SOCKET; + isc_tlsctx_t *tlsctx; +} isc__netievent__tlsctx_t; + +#define NETIEVENT_SOCKET_TLSCTX_TYPE(type) \ + typedef isc__netievent__tlsctx_t isc__netievent_##type##_t; + +#define NETIEVENT_SOCKET_TLSCTX_DECL(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock, isc_tlsctx_t *tlsctx); \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent); + +#define NETIEVENT_SOCKET_TLSCTX_DEF(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock, isc_tlsctx_t *tlsctx) { \ + isc__netievent_##type##_t *ievent = \ + isc__nm_get_netievent(nm, netievent_##type); \ + isc__nmsocket_attach(sock, &ievent->sock); \ + isc_tlsctx_attach(tlsctx, &ievent->tlsctx); \ + \ + return (ievent); \ + } \ + \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent) { \ + isc_tlsctx_free(&ievent->tlsctx); \ + isc__nmsocket_detach(&ievent->sock); \ + isc__nm_put_netievent(nm, ievent); \ + } + +#ifdef HAVE_LIBNGHTTP2 +typedef struct isc__netievent__http_eps { + NETIEVENT__SOCKET; + isc_nm_http_endpoints_t *endpoints; +} isc__netievent__http_eps_t; + +#define NETIEVENT_SOCKET_HTTP_EPS_TYPE(type) \ + typedef isc__netievent__http_eps_t isc__netievent_##type##_t; + +#define NETIEVENT_SOCKET_HTTP_EPS_DECL(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock, \ + isc_nm_http_endpoints_t *endpoints); \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent); + +#define NETIEVENT_SOCKET_HTTP_EPS_DEF(type) \ + isc__netievent_##type##_t *isc__nm_get_netievent_##type( \ + isc_nm_t *nm, isc_nmsocket_t *sock, \ + isc_nm_http_endpoints_t *endpoints) { \ + isc__netievent_##type##_t *ievent = \ + isc__nm_get_netievent(nm, netievent_##type); \ + isc__nmsocket_attach(sock, &ievent->sock); \ + isc_nm_http_endpoints_attach(endpoints, &ievent->endpoints); \ + \ + return (ievent); \ + } \ + \ + void isc__nm_put_netievent_##type(isc_nm_t *nm, \ + isc__netievent_##type##_t *ievent) { \ + isc_nm_http_endpoints_detach(&ievent->endpoints); \ + isc__nmsocket_detach(&ievent->sock); \ + isc__nm_put_netievent(nm, ievent); \ + } +#endif /* HAVE_LIBNGHTTP2 */ + +typedef union { + isc__netievent_t ni; + isc__netievent__socket_t nis; + isc__netievent__socket_req_t nisr; + isc__netievent_udpsend_t nius; + isc__netievent__socket_quota_t nisq; + isc__netievent_tlsconnect_t nitc; + isc__netievent__tlsctx_t nitls; +#ifdef HAVE_LIBNGHTTP2 + isc__netievent__http_eps_t nihttpeps; +#endif /* HAVE_LIBNGHTTP2 */ +} isc__netievent_storage_t; + +/* + * Work item for a uv_work threadpool. + */ +typedef struct isc__nm_work { + isc_nm_t *netmgr; + uv_work_t req; + isc_nm_workcb_t cb; + isc_nm_after_workcb_t after_cb; + void *data; +} isc__nm_work_t; + +/* + * Network manager + */ +#define NM_MAGIC ISC_MAGIC('N', 'E', 'T', 'M') +#define VALID_NM(t) ISC_MAGIC_VALID(t, NM_MAGIC) + +struct isc_nm { + int magic; + isc_refcount_t references; + isc_mem_t *mctx; + int nworkers; + isc_mutex_t lock; + isc_condition_t wkstatecond; + isc_condition_t wkpausecond; + isc__networker_t *workers; + + isc_stats_t *stats; + + uint_fast32_t workers_running; + atomic_uint_fast32_t workers_paused; + atomic_uint_fast32_t maxudp; + + bool load_balance_sockets; + + atomic_bool paused; + + /* + * Active connections are being closed and new connections are + * no longer allowed. + */ + atomic_bool closing; + + /* + * A worker is actively waiting for other workers, for example to + * stop listening; that means no other thread can do the same thing + * or pause, or we'll deadlock. We have to either re-enqueue our + * event or wait for the other one to finish if we want to pause. + */ + atomic_int interlocked; + + /* + * Timeout values for TCP connections, corresponding to + * tcp-intiial-timeout, tcp-idle-timeout, tcp-keepalive-timeout, + * and tcp-advertised-timeout. Note that these are stored in + * milliseconds so they can be used directly with the libuv timer, + * but they are configured in tenths of seconds. + */ + atomic_uint_fast32_t init; + atomic_uint_fast32_t idle; + atomic_uint_fast32_t keepalive; + atomic_uint_fast32_t advertised; + + isc_barrier_t pausing; + isc_barrier_t resuming; + + /* + * Socket SO_RCVBUF and SO_SNDBUF values + */ + atomic_int_fast32_t recv_udp_buffer_size; + atomic_int_fast32_t send_udp_buffer_size; + atomic_int_fast32_t recv_tcp_buffer_size; + atomic_int_fast32_t send_tcp_buffer_size; + +#ifdef NETMGR_TRACE + ISC_LIST(isc_nmsocket_t) active_sockets; +#endif +}; + +/*% + * A universal structure for either a single socket or a group of + * dup'd/SO_REUSE_PORT-using sockets listening on the same interface. + */ +#define NMSOCK_MAGIC ISC_MAGIC('N', 'M', 'S', 'K') +#define VALID_NMSOCK(t) ISC_MAGIC_VALID(t, NMSOCK_MAGIC) + +/*% + * Index into socket stat counter arrays. + */ +typedef enum { + STATID_OPEN = 0, + STATID_OPENFAIL = 1, + STATID_CLOSE = 2, + STATID_BINDFAIL = 3, + STATID_CONNECTFAIL = 4, + STATID_CONNECT = 5, + STATID_ACCEPTFAIL = 6, + STATID_ACCEPT = 7, + STATID_SENDFAIL = 8, + STATID_RECVFAIL = 9, + STATID_ACTIVE = 10, + STATID_MAX = 11, +} isc__nm_statid_t; + +#if HAVE_LIBNGHTTP2 +typedef struct isc_nmsocket_tls_send_req { + isc_nmsocket_t *tlssock; + isc_region_t data; + isc_nm_cb_t cb; + void *cbarg; + isc_nmhandle_t *handle; + bool finish; + uint8_t smallbuf[512]; +} isc_nmsocket_tls_send_req_t; + +typedef enum isc_http_request_type { + ISC_HTTP_REQ_GET, + ISC_HTTP_REQ_POST, + ISC_HTTP_REQ_UNSUPPORTED +} isc_http_request_type_t; + +typedef enum isc_http_scheme_type { + ISC_HTTP_SCHEME_HTTP, + ISC_HTTP_SCHEME_HTTP_SECURE, + ISC_HTTP_SCHEME_UNSUPPORTED +} isc_http_scheme_type_t; + +typedef struct isc_nm_httpcbarg { + isc_nm_recv_cb_t cb; + void *cbarg; + LINK(struct isc_nm_httpcbarg) link; +} isc_nm_httpcbarg_t; + +typedef struct isc_nm_httphandler { + char *path; + isc_nm_recv_cb_t cb; + void *cbarg; + size_t extrahandlesize; + LINK(struct isc_nm_httphandler) link; +} isc_nm_httphandler_t; + +struct isc_nm_http_endpoints { + uint32_t magic; + isc_mem_t *mctx; + + ISC_LIST(isc_nm_httphandler_t) handlers; + ISC_LIST(isc_nm_httpcbarg_t) handler_cbargs; + + isc_refcount_t references; + atomic_bool in_use; +}; + +typedef struct isc_nmsocket_h2 { + isc_nmsocket_t *psock; /* owner of the structure */ + char *request_path; + char *query_data; + size_t query_data_len; + bool query_too_large; + isc_nm_httphandler_t *handler; + + isc_buffer_t rbuf; + isc_buffer_t wbuf; + + int32_t stream_id; + isc_nm_http_session_t *session; + + isc_nmsocket_t *httpserver; + + /* maximum concurrent streams (server-side) */ + atomic_uint_fast32_t max_concurrent_streams; + + uint32_t min_ttl; /* used to set "max-age" in responses */ + + isc_http_request_type_t request_type; + isc_http_scheme_type_t request_scheme; + + size_t content_length; + char clenbuf[128]; + + char cache_control_buf[128]; + + int headers_error_code; + size_t headers_data_processed; + + isc_nm_recv_cb_t cb; + void *cbarg; + LINK(struct isc_nmsocket_h2) link; + + isc_nm_http_endpoints_t **listener_endpoints; + size_t n_listener_endpoints; + + bool response_submitted; + struct { + char *uri; + bool post; + isc_tlsctx_t *tlsctx; + isc_sockaddr_t local_interface; + void *cstream; + const char *tls_peer_verify_string; + } connect; +} isc_nmsocket_h2_t; +#endif /* HAVE_LIBNGHTTP2 */ + +typedef void (*isc_nm_closehandlecb_t)(void *arg); +/*%< + * Opaque callback function, used for isc_nmhandle 'reset' and 'free' + * callbacks. + */ + +struct isc_nmsocket { + /*% Unlocked, RO */ + int magic; + int tid; + isc_nmsocket_type type; + isc_nm_t *mgr; + + /*% Parent socket for multithreaded listeners */ + isc_nmsocket_t *parent; + /*% Listener socket this connection was accepted on */ + isc_nmsocket_t *listener; + /*% Self socket */ + isc_nmsocket_t *self; + + isc_barrier_t startlistening; + isc_barrier_t stoplistening; + + /*% TLS stuff */ + struct tls { + isc_tls_t *tls; + isc_tlsctx_t *ctx; + isc_tlsctx_client_session_cache_t *client_sess_cache; + bool client_session_saved; + BIO *app_rbio; + BIO *app_wbio; + BIO *ssl_rbio; + BIO *ssl_wbio; + enum { + TLS_STATE_NONE, + TLS_STATE_HANDSHAKE, + TLS_STATE_IO, + TLS_STATE_ERROR, + TLS_STATE_CLOSING + } state; + isc_region_t senddata; + ISC_LIST(isc__nm_uvreq_t) sendreqs; + bool cycle; + isc_result_t pending_error; + /* List of active send requests. */ + isc__nm_uvreq_t *pending_req; + bool alpn_negotiated; + const char *tls_verify_errmsg; + } tls; + +#if HAVE_LIBNGHTTP2 + /*% TLS stuff */ + struct tlsstream { + bool server; + BIO *bio_in; + BIO *bio_out; + isc_tls_t *tls; + isc_tlsctx_t *ctx; + isc_tlsctx_t **listener_tls_ctx; /*%< A context reference per + worker */ + size_t n_listener_tls_ctx; + isc_tlsctx_client_session_cache_t *client_sess_cache; + bool client_session_saved; + isc_nmsocket_t *tlslistener; + isc_nmsocket_t *tlssocket; + atomic_bool result_updated; + enum { + TLS_INIT, + TLS_HANDSHAKE, + TLS_IO, + TLS_CLOSED + } state; /*%< The order of these is significant */ + size_t nsending; + bool reading; + } tlsstream; + + isc_nmsocket_h2_t h2; +#endif /* HAVE_LIBNGHTTP2 */ + /*% + * quota is the TCP client, attached when a TCP connection + * is established. pquota is a non-attached pointer to the + * TCP client quota, stored in listening sockets but only + * attached in connected sockets. + */ + isc_quota_t *quota; + isc_quota_t *pquota; + isc_quota_cb_t quotacb; + + /*% + * Socket statistics + */ + const isc_statscounter_t *statsindex; + + /*% + * TCP read/connect timeout timers. + */ + uv_timer_t read_timer; + uint64_t read_timeout; + uint64_t connect_timeout; + + /*% + * TCP write timeout timer. + */ + uint64_t write_timeout; + + /*% outer socket is for 'wrapped' sockets - e.g. tcpdns in tcp */ + isc_nmsocket_t *outer; + + /*% server socket for connections */ + isc_nmsocket_t *server; + + /*% Child sockets for multi-socket setups */ + isc_nmsocket_t *children; + uint_fast32_t nchildren; + isc_sockaddr_t iface; + isc_nmhandle_t *statichandle; + isc_nmhandle_t *outerhandle; + + /*% Extra data allocated at the end of each isc_nmhandle_t */ + size_t extrahandlesize; + + /*% TCP backlog */ + int backlog; + + /*% libuv data */ + uv_os_sock_t fd; + union uv_any_handle uv_handle; + + /*% Peer address */ + isc_sockaddr_t peer; + + /* Atomic */ + /*% Number of running (e.g. listening) child sockets */ + atomic_uint_fast32_t rchildren; + + /*% + * Socket is active if it's listening, working, etc. If it's + * closing, then it doesn't make a sense, for example, to + * push handles or reqs for reuse. + */ + atomic_bool active; + atomic_bool destroying; + + bool route_sock; + + /*% + * Socket is closed if it's not active and all the possible + * callbacks were fired, there are no active handles, etc. + * If active==false but closed==false, that means the socket + * is closing. + */ + atomic_bool closing; + atomic_bool closed; + atomic_bool listening; + atomic_bool connecting; + atomic_bool connected; + atomic_bool accepting; + atomic_bool reading; + atomic_bool timedout; + isc_refcount_t references; + + /*% + * Established an outgoing connection, as client not server. + */ + atomic_bool client; + + /*% + * TCPDNS socket has been set not to pipeline. + */ + atomic_bool sequential; + + /*% + * The socket is processing read callback, this is guard to not read + * data before the readcb is back. + */ + bool processing; + + /*% + * A TCP socket has had isc_nm_pauseread() called. + */ + atomic_bool readpaused; + + /*% + * A TCP or TCPDNS socket has been set to use the keepalive + * timeout instead of the default idle timeout. + */ + atomic_bool keepalive; + + /*% + * 'spare' handles for that can be reused to avoid allocations, + * for UDP. + */ + isc_astack_t *inactivehandles; + isc_astack_t *inactivereqs; + + /*% + * Used to wait for TCP listening events to complete, and + * for the number of running children to reach zero during + * shutdown. + * + * We use two condition variables to prevent the race where the netmgr + * threads would be able to finish and destroy the socket before it's + * unlocked by the isc_nm_listen<proto>() function. So, the flow is as + * follows: + * + * 1. parent thread creates all children sockets and passes then to + * netthreads, looks at the signaling variable and WAIT(cond) until + * the childrens are done initializing + * + * 2. the events get picked by netthreads, calls the libuv API (and + * either succeeds or fails) and WAIT(scond) until all other + * children sockets in netthreads are initialized and the listening + * socket lock is unlocked + * + * 3. the control is given back to the parent thread which now either + * returns success or shutdowns the listener if an error has + * occured in the children netthread + * + * NOTE: The other approach would be doing an extra attach to the parent + * listening socket, and then detach it in the parent thread, but that + * breaks the promise that once the libuv socket is initialized on the + * nmsocket, the nmsocket needs to be handled only by matching + * netthread, so in fact that would add a complexity in a way that + * isc__nmsocket_detach would have to be converted to use an + * asynchrounous netievent. + */ + isc_mutex_t lock; + isc_condition_t cond; + isc_condition_t scond; + + /*% + * Used to pass a result back from listen or connect events. + */ + isc_result_t result; + + /*% + * Current number of active handles. + */ + atomic_int_fast32_t ah; + + /*% Buffer for TCPDNS processing */ + size_t buf_size; + size_t buf_len; + unsigned char *buf; + + /*% + * This function will be called with handle->sock + * as the argument whenever a handle's references drop + * to zero, after its reset callback has been called. + */ + isc_nm_closehandlecb_t closehandle_cb; + + isc_nmhandle_t *recv_handle; + isc_nm_recv_cb_t recv_cb; + void *recv_cbarg; + bool recv_read; + + isc_nm_cb_t connect_cb; + void *connect_cbarg; + + isc_nm_accept_cb_t accept_cb; + void *accept_cbarg; + + atomic_int_fast32_t active_child_connections; + + isc_barrier_t barrier; + bool barrier_initialised; +#ifdef NETMGR_TRACE + void *backtrace[TRACE_SIZE]; + int backtrace_size; + LINK(isc_nmsocket_t) active_link; + ISC_LIST(isc_nmhandle_t) active_handles; +#endif +}; + +bool +isc__nm_in_netthread(void); +/*%< + * Returns 'true' if we're in the network thread. + */ + +void +isc__nm_maybe_enqueue_ievent(isc__networker_t *worker, isc__netievent_t *event); +/*%< + * If the caller is already in the matching nmthread, process the netievent + * directly, if not enqueue using isc__nm_enqueue_ievent(). + */ + +void +isc__nm_enqueue_ievent(isc__networker_t *worker, isc__netievent_t *event); +/*%< + * Enqueue an ievent onto a specific worker queue. (This the only safe + * way to use an isc__networker_t from another thread.) + */ + +void +isc__nm_free_uvbuf(isc_nmsocket_t *sock, const uv_buf_t *buf); +/*%< + * Free a buffer allocated for a receive operation. + * + * Note that as currently implemented, this doesn't actually + * free anything, marks the isc__networker's UDP receive buffer + * as "not in use". + */ + +isc_nmhandle_t * +isc___nmhandle_get(isc_nmsocket_t *sock, isc_sockaddr_t *peer, + isc_sockaddr_t *local FLARG); +/*%< + * Get a handle for the socket 'sock', allocating a new one + * if there isn't one available in 'sock->inactivehandles'. + * + * If 'peer' is not NULL, set the handle's peer address to 'peer', + * otherwise set it to 'sock->peer'. + * + * If 'local' is not NULL, set the handle's local address to 'local', + * otherwise set it to 'sock->iface->addr'. + * + * 'sock' will be attached to 'handle->sock'. The caller may need + * to detach the socket afterward. + */ + +isc__nm_uvreq_t * +isc___nm_uvreq_get(isc_nm_t *mgr, isc_nmsocket_t *sock FLARG); +/*%< + * Get a UV request structure for the socket 'sock', allocating a + * new one if there isn't one available in 'sock->inactivereqs'. + */ + +void +isc___nm_uvreq_put(isc__nm_uvreq_t **req, isc_nmsocket_t *sock FLARG); +/*%< + * Completes the use of a UV request structure, setting '*req' to NULL. + * + * The UV request is pushed onto the 'sock->inactivereqs' stack or, + * if that doesn't work, freed. + */ + +void +isc___nmsocket_init(isc_nmsocket_t *sock, isc_nm_t *mgr, isc_nmsocket_type type, + isc_sockaddr_t *iface FLARG); +/*%< + * Initialize socket 'sock', attach it to 'mgr', and set it to type 'type' + * and its interface to 'iface'. + */ + +void +isc___nmsocket_attach(isc_nmsocket_t *sock, isc_nmsocket_t **target FLARG); +/*%< + * Attach to a socket, increasing refcount + */ + +void +isc___nmsocket_detach(isc_nmsocket_t **socketp FLARG); +/*%< + * Detach from socket, decreasing refcount and possibly destroying the + * socket if it's no longer referenced. + */ + +void +isc___nmsocket_prep_destroy(isc_nmsocket_t *sock FLARG); +/*%< + * Market 'sock' as inactive, close it if necessary, and destroy it + * if there are no remaining references or active handles. + */ + +void +isc__nmsocket_shutdown(isc_nmsocket_t *sock); +/*%< + * Initiate the socket shutdown which actively calls the active + * callbacks. + */ + +void +isc__nmsocket_reset(isc_nmsocket_t *sock); +/*%< + * Reset and close the socket. + */ + +bool +isc__nmsocket_active(isc_nmsocket_t *sock); +/*%< + * Determine whether 'sock' is active by checking 'sock->active' + * or, for child sockets, 'sock->parent->active'. + */ + +bool +isc__nmsocket_deactivate(isc_nmsocket_t *sock); +/*%< + * @brief Deactivate active socket + * + * Atomically deactive the socket by setting @p sock->active or, for child + * sockets, @p sock->parent->active to @c false + * + * @param[in] sock - valid nmsocket + * @return @c false if the socket was already inactive, @c true otherwise + */ + +void +isc__nmsocket_clearcb(isc_nmsocket_t *sock); +/*%< + * Clear the recv and accept callbacks in 'sock'. + */ + +void +isc__nmsocket_timer_stop(isc_nmsocket_t *sock); +void +isc__nmsocket_timer_start(isc_nmsocket_t *sock); +void +isc__nmsocket_timer_restart(isc_nmsocket_t *sock); +bool +isc__nmsocket_timer_running(isc_nmsocket_t *sock); +/*%< + * Start/stop/restart/check the timeout on the socket + */ + +void +isc__nm_connectcb(isc_nmsocket_t *sock, isc__nm_uvreq_t *uvreq, + isc_result_t eresult, bool async); + +void +isc__nm_async_connectcb(isc__networker_t *worker, isc__netievent_t *ev0); +/*%< + * Issue a connect callback on the socket, used to call the callback + */ + +void +isc__nm_readcb(isc_nmsocket_t *sock, isc__nm_uvreq_t *uvreq, + isc_result_t eresult); +void +isc__nm_async_readcb(isc__networker_t *worker, isc__netievent_t *ev0); + +/*%< + * Issue a read callback on the socket, used to call the callback + * on failed conditions when the event can't be scheduled on the uv loop. + * + */ + +void +isc__nm_sendcb(isc_nmsocket_t *sock, isc__nm_uvreq_t *uvreq, + isc_result_t eresult, bool async); +void +isc__nm_async_sendcb(isc__networker_t *worker, isc__netievent_t *ev0); +/*%< + * Issue a write callback on the socket, used to call the callback + * on failed conditions when the event can't be scheduled on the uv loop. + */ + +void +isc__nm_async_shutdown(isc__networker_t *worker, isc__netievent_t *ev0); +/*%< + * Walk through all uv handles, get the underlying sockets and issue + * close on them. + */ + +void +isc__nm_udp_send(isc_nmhandle_t *handle, const isc_region_t *region, + isc_nm_cb_t cb, void *cbarg); +/*%< + * Back-end implementation of isc_nm_send() for UDP handles. + */ + +void +isc__nm_udp_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg); +/* + * Back-end implementation of isc_nm_read() for UDP handles. + */ + +void +isc__nm_udp_close(isc_nmsocket_t *sock); +/*%< + * Close a UDP socket. + */ + +void +isc__nm_udp_cancelread(isc_nmhandle_t *handle); +/*%< + * Stop reading on a connected UDP handle. + */ + +void +isc__nm_udp_shutdown(isc_nmsocket_t *sock); +/*%< + * Called during the shutdown process to close and clean up connected + * sockets. + */ + +void +isc__nm_udp_stoplistening(isc_nmsocket_t *sock); +/*%< + * Stop listening on 'sock'. + */ + +void +isc__nm_udp_settimeout(isc_nmhandle_t *handle, uint32_t timeout); +/*%< + * Set or clear the recv timeout for the UDP socket associated with 'handle'. + */ + +void +isc__nm_async_udplisten(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_udpconnect(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_udpstop(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_udpsend(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_udpread(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_udpcancel(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_udpclose(isc__networker_t *worker, isc__netievent_t *ev0); +/*%< + * Callback handlers for asynchronous UDP events (listen, stoplisten, send). + */ + +void +isc__nm_async_routeconnect(isc__networker_t *worker, isc__netievent_t *ev0); +/*%< + * Callback handler for route socket events. + */ + +void +isc__nm_tcp_send(isc_nmhandle_t *handle, const isc_region_t *region, + isc_nm_cb_t cb, void *cbarg); +/*%< + * Back-end implementation of isc_nm_send() for TCP handles. + */ + +void +isc__nm_tcp_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg); +/* + * Back-end implementation of isc_nm_read() for TCP handles. + */ + +void +isc__nm_tcp_close(isc_nmsocket_t *sock); +/*%< + * Close a TCP socket. + */ +void +isc__nm_tcp_pauseread(isc_nmhandle_t *handle); +/*%< + * Pause reading on this handle, while still remembering the callback. + */ + +void +isc__nm_tcp_resumeread(isc_nmhandle_t *handle); +/*%< + * Resume reading from socket. + * + */ + +void +isc__nm_tcp_shutdown(isc_nmsocket_t *sock); +/*%< + * Called during the shutdown process to close and clean up connected + * sockets. + */ + +void +isc__nm_tcp_cancelread(isc_nmhandle_t *handle); +/*%< + * Stop reading on a connected TCP handle. + */ + +void +isc__nm_tcp_stoplistening(isc_nmsocket_t *sock); +/*%< + * Stop listening on 'sock'. + */ + +int_fast32_t +isc__nm_tcp_listener_nactive(isc_nmsocket_t *sock); +/*%< + * Returns the number of active connections for the TCP listener socket. + */ + +void +isc__nm_tcp_settimeout(isc_nmhandle_t *handle, uint32_t timeout); +/*%< + * Set the read timeout for the TCP socket associated with 'handle'. + */ + +void +isc__nm_async_tcpconnect(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpaccept(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpstop(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpsend(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_startread(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_pauseread(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpstartread(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcppauseread(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpcancel(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpclose(isc__networker_t *worker, isc__netievent_t *ev0); +/*%< + * Callback handlers for asynchronous TCP events (connect, listen, + * stoplisten, send, read, pause, close). + */ + +void +isc__nm_async_tlsclose(isc__networker_t *worker, isc__netievent_t *ev0); + +void +isc__nm_async_tlssend(isc__networker_t *worker, isc__netievent_t *ev0); + +void +isc__nm_async_tlsstartread(isc__networker_t *worker, isc__netievent_t *ev0); + +void +isc__nm_async_tlsdobio(isc__networker_t *worker, isc__netievent_t *ev0); + +void +isc__nm_async_tlscancel(isc__networker_t *worker, isc__netievent_t *ev0); +/*%< + * Callback handlers for asynchronous TLS events. + */ + +void +isc__nm_tcpdns_send(isc_nmhandle_t *handle, isc_region_t *region, + isc_nm_cb_t cb, void *cbarg); +/*%< + * Back-end implementation of isc_nm_send() for TCPDNS handles. + */ + +void +isc__nm_tcpdns_shutdown(isc_nmsocket_t *sock); + +void +isc__nm_tcpdns_close(isc_nmsocket_t *sock); +/*%< + * Close a TCPDNS socket. + */ + +void +isc__nm_tcpdns_stoplistening(isc_nmsocket_t *sock); +/*%< + * Stop listening on 'sock'. + */ + +void +isc__nm_tcpdns_settimeout(isc_nmhandle_t *handle, uint32_t timeout); +/*%< + * Set the read timeout and reset the timer for the TCPDNS socket + * associated with 'handle', and the TCP socket it wraps around. + */ + +void +isc__nm_async_tcpdnsaccept(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpdnsconnect(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpdnslisten(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpdnscancel(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpdnsclose(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpdnssend(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpdnsstop(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tcpdnsread(isc__networker_t *worker, isc__netievent_t *ev0); +/*%< + * Callback handlers for asynchronous TCPDNS events. + */ + +void +isc__nm_tcpdns_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg); +/* + * Back-end implementation of isc_nm_read() for TCPDNS handles. + */ + +void +isc__nm_tcpdns_cancelread(isc_nmhandle_t *handle); +/*%< + * Stop reading on a connected TCPDNS handle. + */ + +void +isc__nm_tlsdns_send(isc_nmhandle_t *handle, isc_region_t *region, + isc_nm_cb_t cb, void *cbarg); + +void +isc__nm_tlsdns_shutdown(isc_nmsocket_t *sock); + +void +isc__nm_tlsdns_close(isc_nmsocket_t *sock); +/*%< + * Close a TLSDNS socket. + */ + +void +isc__nm_tlsdns_stoplistening(isc_nmsocket_t *sock); +/*%< + * Stop listening on 'sock'. + */ + +void +isc__nm_tlsdns_settimeout(isc_nmhandle_t *handle, uint32_t timeout); +/*%< + * Set the read timeout and reset the timer for the TLSDNS socket + * associated with 'handle', and the TCP socket it wraps around. + */ + +void +isc__nm_tlsdns_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg); +/* + * Back-end implementation of isc_nm_read() for TLSDNS handles. + */ + +void +isc__nm_tlsdns_cancelread(isc_nmhandle_t *handle); +/*%< + * Stop reading on a connected TLSDNS handle. + */ + +const char * +isc__nm_tlsdns_verify_tls_peer_result_string(const isc_nmhandle_t *handle); + +void +isc__nm_async_tlsdnscycle(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tlsdnsaccept(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tlsdnsconnect(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tlsdnslisten(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tlsdnscancel(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tlsdnsclose(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tlsdnssend(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tlsdnsstop(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tlsdnsshutdown(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tlsdnsread(isc__networker_t *worker, isc__netievent_t *ev0); +void +isc__nm_async_tlsdns_set_tlsctx(isc_nmsocket_t *listener, isc_tlsctx_t *tlsctx, + const int tid); +/*%< + * Callback handlers for asynchronous TLSDNS events. + */ + +isc_result_t +isc__nm_tlsdns_xfr_checkperm(isc_nmsocket_t *sock); +/*%< + * Check if it is permitted to do a zone transfer over the given TLSDNS + * socket. + * + * Returns: + * \li #ISC_R_SUCCESS Success, permission check passed successfully + * \li #ISC_R_DOTALPNERROR No permission because of ALPN tag mismatch + * \li any other result indicates failure (i.e. no permission) + * + * Requires: + * \li 'sock' is a valid TLSDNS socket. + */ + +void +isc__nm_tlsdns_cleanup_data(isc_nmsocket_t *sock); + +#if HAVE_LIBNGHTTP2 +void +isc__nm_tls_send(isc_nmhandle_t *handle, const isc_region_t *region, + isc_nm_cb_t cb, void *cbarg); + +void +isc__nm_tls_cancelread(isc_nmhandle_t *handle); + +/*%< + * Back-end implementation of isc_nm_send() for TLSDNS handles. + */ + +void +isc__nm_tls_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg); + +void +isc__nm_tls_close(isc_nmsocket_t *sock); +/*%< + * Close a TLS socket. + */ + +void +isc__nm_tls_pauseread(isc_nmhandle_t *handle); +/*%< + * Pause reading on this handle, while still remembering the callback. + */ + +void +isc__nm_tls_resumeread(isc_nmhandle_t *handle); +/*%< + * Resume reading from the handle. + * + */ + +void +isc__nm_tls_cleanup_data(isc_nmsocket_t *sock); + +void +isc__nm_tls_stoplistening(isc_nmsocket_t *sock); + +void +isc__nm_tls_settimeout(isc_nmhandle_t *handle, uint32_t timeout); +void +isc__nm_tls_cleartimeout(isc_nmhandle_t *handle); +/*%< + * Set the read timeout and reset the timer for the socket + * associated with 'handle', and the TCP socket it wraps + * around. + */ + +const char * +isc__nm_tls_verify_tls_peer_result_string(const isc_nmhandle_t *handle); + +void +isc__nmhandle_tls_keepalive(isc_nmhandle_t *handle, bool value); +/*%< + * Set the keepalive value on the underlying TCP handle. + */ + +void +isc__nm_async_tls_set_tlsctx(isc_nmsocket_t *listener, isc_tlsctx_t *tlsctx, + const int tid); + +void +isc__nmhandle_tls_setwritetimeout(isc_nmhandle_t *handle, + uint64_t write_timeout); + +void +isc__nm_http_stoplistening(isc_nmsocket_t *sock); + +void +isc__nm_http_settimeout(isc_nmhandle_t *handle, uint32_t timeout); +void +isc__nm_http_cleartimeout(isc_nmhandle_t *handle); +/*%< + * Set the read timeout and reset the timer for the socket + * associated with 'handle', and the TLS/TCP socket it wraps + * around. + */ + +void +isc__nmhandle_http_keepalive(isc_nmhandle_t *handle, bool value); +/*%< + * Set the keepalive value on the underlying session handle + */ + +void +isc__nm_http_initsocket(isc_nmsocket_t *sock); + +void +isc__nm_http_cleanup_data(isc_nmsocket_t *sock); + +isc_result_t +isc__nm_http_request(isc_nmhandle_t *handle, isc_region_t *region, + isc_nm_recv_cb_t reply_cb, void *cbarg); + +void +isc__nm_http_send(isc_nmhandle_t *handle, const isc_region_t *region, + isc_nm_cb_t cb, void *cbarg); + +void +isc__nm_http_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg); + +void +isc__nm_http_close(isc_nmsocket_t *sock); + +void +isc__nm_http_bad_request(isc_nmhandle_t *handle); +/*%< + * Respond to the request with 400 "Bad Request" status. + * + * Requires: + * \li 'handle' is a valid HTTP netmgr handle object, referencing a server-side + * socket + */ + +bool +isc__nm_http_has_encryption(const isc_nmhandle_t *handle); + +void +isc__nm_http_set_maxage(isc_nmhandle_t *handle, const uint32_t ttl); + +const char * +isc__nm_http_verify_tls_peer_result_string(const isc_nmhandle_t *handle); + +void +isc__nm_async_httpsend(isc__networker_t *worker, isc__netievent_t *ev0); + +void +isc__nm_async_httpclose(isc__networker_t *worker, isc__netievent_t *ev0); + +void +isc__nm_async_httpendpoints(isc__networker_t *worker, isc__netievent_t *ev0); + +bool +isc__nm_parse_httpquery(const char *query_string, const char **start, + size_t *len); + +char * +isc__nm_base64url_to_base64(isc_mem_t *mem, const char *base64url, + const size_t base64url_len, size_t *res_len); + +char * +isc__nm_base64_to_base64url(isc_mem_t *mem, const char *base64, + const size_t base64_len, size_t *res_len); + +void +isc__nm_httpsession_attach(isc_nm_http_session_t *source, + isc_nm_http_session_t **targetp); +void +isc__nm_httpsession_detach(isc_nm_http_session_t **sessionp); + +void +isc__nm_http_set_tlsctx(isc_nmsocket_t *sock, isc_tlsctx_t *tlsctx); + +void +isc__nm_http_set_max_streams(isc_nmsocket_t *listener, + const uint32_t max_concurrent_streams); + +#endif + +void +isc__nm_async_settlsctx(isc__networker_t *worker, isc__netievent_t *ev0); + +#define isc__nm_uverr2result(x) \ + isc___nm_uverr2result(x, true, __FILE__, __LINE__, __func__) +isc_result_t +isc___nm_uverr2result(int uverr, bool dolog, const char *file, + unsigned int line, const char *func); +/*%< + * Convert a libuv error value into an isc_result_t. The + * list of supported error values is not complete; new users + * of this function should add any expected errors that are + * not already there. + */ + +bool +isc__nm_acquire_interlocked(isc_nm_t *mgr); +/*%< + * Try to acquire interlocked state; return true if successful. + */ + +void +isc__nm_drop_interlocked(isc_nm_t *mgr); +/*%< + * Drop interlocked state; signal waiters. + */ + +void +isc__nm_acquire_interlocked_force(isc_nm_t *mgr); +/*%< + * Actively wait for interlocked state. + */ + +void +isc__nm_async_sockstop(isc__networker_t *worker, isc__netievent_t *ev0); + +void +isc__nm_incstats(isc_nmsocket_t *sock, isc__nm_statid_t id); +/*%< + * Increment socket-related statistics counters. + */ + +void +isc__nm_decstats(isc_nmsocket_t *sock, isc__nm_statid_t id); +/*%< + * Decrement socket-related statistics counters. + */ + +isc_result_t +isc__nm_socket(int domain, int type, int protocol, uv_os_sock_t *sockp); +/*%< + * Platform independent socket() version + */ + +void +isc__nm_closesocket(uv_os_sock_t sock); +/*%< + * Platform independent closesocket() version + */ + +isc_result_t +isc__nm_socket_freebind(uv_os_sock_t fd, sa_family_t sa_family); +/*%< + * Set the IP_FREEBIND (or equivalent) socket option on the uv_handle + */ + +isc_result_t +isc__nm_socket_reuse(uv_os_sock_t fd); +/*%< + * Set the SO_REUSEADDR or SO_REUSEPORT (or equivalent) socket option on the fd + */ + +isc_result_t +isc__nm_socket_reuse_lb(uv_os_sock_t fd); +/*%< + * Set the SO_REUSEPORT_LB (or equivalent) socket option on the fd + */ + +isc_result_t +isc__nm_socket_incoming_cpu(uv_os_sock_t fd); +/*%< + * Set the SO_INCOMING_CPU socket option on the fd if available + */ + +isc_result_t +isc__nm_socket_disable_pmtud(uv_os_sock_t fd, sa_family_t sa_family); +/*%< + * Disable the Path MTU Discovery, either by disabling IP(V6)_DONTFRAG socket + * option, or setting the IP(V6)_MTU_DISCOVER socket option to IP_PMTUDISC_OMIT + */ + +isc_result_t +isc__nm_socket_v6only(uv_os_sock_t fd, sa_family_t sa_family); +/*%< + * Restrict the socket to sending and receiving IPv6 packets only + */ + +isc_result_t +isc__nm_socket_connectiontimeout(uv_os_sock_t fd, int timeout_ms); +/*%< + * Set the connection timeout in milliseconds, on non-Linux platforms, + * the minimum value must be at least 1000 (1 second). + */ + +isc_result_t +isc__nm_socket_tcp_nodelay(uv_os_sock_t fd); +/*%< + * Disables Nagle's algorithm on a TCP socket (sets TCP_NODELAY). + */ + +isc_result_t +isc__nm_socket_tcp_maxseg(uv_os_sock_t fd, int size); +/*%< + * Set the TCP maximum segment size + */ + +isc_result_t +isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family); +/*%< + * Use minimum MTU on IPv6 sockets + */ + +void +isc__nm_set_network_buffers(isc_nm_t *nm, uv_handle_t *handle); +/*%> + * Sets the pre-configured network buffers size on the handle. + */ + +void +isc__nmsocket_barrier_init(isc_nmsocket_t *listener); +/*%> + * Initialise the socket synchronisation barrier according to the + * number of children. + */ + +void +isc__nmsocket_stop(isc_nmsocket_t *listener); +/*%> + * Broadcast "stop" event for a listener socket across all workers and + * wait its processing completion - then, stop and close the underlying + * transport listener socket. + * + * The primitive is used in multi-layer transport listener sockets to + * implement shutdown properly: after the broadcasted events has been + * processed it is safe to destroy the shared data within the listener + * socket (including shutting down the underlying transport listener + * socket). + */ + +/* + * typedef all the netievent types + */ + +NETIEVENT_SOCKET_TYPE(close); +NETIEVENT_SOCKET_TYPE(tcpclose); +NETIEVENT_SOCKET_TYPE(tcplisten); +NETIEVENT_SOCKET_TYPE(tcppauseread); +NETIEVENT_SOCKET_TYPE(tcpstop); +NETIEVENT_SOCKET_TYPE(tlsclose); +/* NETIEVENT_SOCKET_TYPE(tlsconnect); */ /* unique type, defined independently + */ +NETIEVENT_SOCKET_TYPE(tlsdobio); +NETIEVENT_SOCKET_TYPE(tlsstartread); +NETIEVENT_SOCKET_HANDLE_TYPE(tlscancel); +NETIEVENT_SOCKET_TYPE(udpclose); +NETIEVENT_SOCKET_TYPE(udplisten); +NETIEVENT_SOCKET_TYPE(udpread); +/* NETIEVENT_SOCKET_TYPE(udpsend); */ /* unique type, defined independently */ +NETIEVENT_SOCKET_TYPE(udpstop); + +NETIEVENT_SOCKET_TYPE(tcpdnsclose); +NETIEVENT_SOCKET_TYPE(tcpdnsread); +NETIEVENT_SOCKET_TYPE(tcpdnsstop); +NETIEVENT_SOCKET_TYPE(tcpdnslisten); +NETIEVENT_SOCKET_REQ_TYPE(tcpdnsconnect); +NETIEVENT_SOCKET_REQ_TYPE(tcpdnssend); +NETIEVENT_SOCKET_HANDLE_TYPE(tcpdnscancel); +NETIEVENT_SOCKET_QUOTA_TYPE(tcpdnsaccept); + +NETIEVENT_SOCKET_TYPE(tlsdnsclose); +NETIEVENT_SOCKET_TYPE(tlsdnsread); +NETIEVENT_SOCKET_TYPE(tlsdnsstop); +NETIEVENT_SOCKET_TYPE(tlsdnsshutdown); +NETIEVENT_SOCKET_TYPE(tlsdnslisten); +NETIEVENT_SOCKET_REQ_TYPE(tlsdnsconnect); +NETIEVENT_SOCKET_REQ_TYPE(tlsdnssend); +NETIEVENT_SOCKET_HANDLE_TYPE(tlsdnscancel); +NETIEVENT_SOCKET_QUOTA_TYPE(tlsdnsaccept); +NETIEVENT_SOCKET_TYPE(tlsdnscycle); + +#ifdef HAVE_LIBNGHTTP2 +NETIEVENT_SOCKET_REQ_TYPE(httpsend); +NETIEVENT_SOCKET_TYPE(httpclose); +NETIEVENT_SOCKET_HTTP_EPS_TYPE(httpendpoints); +#endif /* HAVE_LIBNGHTTP2 */ + +NETIEVENT_SOCKET_REQ_TYPE(tcpconnect); +NETIEVENT_SOCKET_REQ_TYPE(tcpsend); +NETIEVENT_SOCKET_TYPE(tcpstartread); +NETIEVENT_SOCKET_REQ_TYPE(tlssend); +NETIEVENT_SOCKET_REQ_TYPE(udpconnect); + +NETIEVENT_SOCKET_REQ_TYPE(routeconnect); + +NETIEVENT_SOCKET_REQ_RESULT_TYPE(connectcb); +NETIEVENT_SOCKET_REQ_RESULT_TYPE(readcb); +NETIEVENT_SOCKET_REQ_RESULT_TYPE(sendcb); + +NETIEVENT_SOCKET_HANDLE_TYPE(detach); +NETIEVENT_SOCKET_HANDLE_TYPE(tcpcancel); +NETIEVENT_SOCKET_HANDLE_TYPE(udpcancel); + +NETIEVENT_SOCKET_QUOTA_TYPE(tcpaccept); + +NETIEVENT_TYPE(pause); +NETIEVENT_TYPE(resume); +NETIEVENT_TYPE(shutdown); +NETIEVENT_TYPE(stop); + +NETIEVENT_TASK_TYPE(task); +NETIEVENT_TASK_TYPE(privilegedtask); + +NETIEVENT_SOCKET_TLSCTX_TYPE(settlsctx); +NETIEVENT_SOCKET_TYPE(sockstop); + +/* Now declared the helper functions */ + +NETIEVENT_SOCKET_DECL(close); +NETIEVENT_SOCKET_DECL(tcpclose); +NETIEVENT_SOCKET_DECL(tcplisten); +NETIEVENT_SOCKET_DECL(tcppauseread); +NETIEVENT_SOCKET_DECL(tcpstartread); +NETIEVENT_SOCKET_DECL(tcpstop); +NETIEVENT_SOCKET_DECL(tlsclose); +NETIEVENT_SOCKET_DECL(tlsconnect); +NETIEVENT_SOCKET_DECL(tlsdobio); +NETIEVENT_SOCKET_DECL(tlsstartread); +NETIEVENT_SOCKET_HANDLE_DECL(tlscancel); +NETIEVENT_SOCKET_DECL(udpclose); +NETIEVENT_SOCKET_DECL(udplisten); +NETIEVENT_SOCKET_DECL(udpread); +NETIEVENT_SOCKET_DECL(udpsend); +NETIEVENT_SOCKET_DECL(udpstop); + +NETIEVENT_SOCKET_DECL(tcpdnsclose); +NETIEVENT_SOCKET_DECL(tcpdnsread); +NETIEVENT_SOCKET_DECL(tcpdnsstop); +NETIEVENT_SOCKET_DECL(tcpdnslisten); +NETIEVENT_SOCKET_REQ_DECL(tcpdnsconnect); +NETIEVENT_SOCKET_REQ_DECL(tcpdnssend); +NETIEVENT_SOCKET_HANDLE_DECL(tcpdnscancel); +NETIEVENT_SOCKET_QUOTA_DECL(tcpdnsaccept); + +NETIEVENT_SOCKET_DECL(tlsdnsclose); +NETIEVENT_SOCKET_DECL(tlsdnsread); +NETIEVENT_SOCKET_DECL(tlsdnsstop); +NETIEVENT_SOCKET_DECL(tlsdnsshutdown); +NETIEVENT_SOCKET_DECL(tlsdnslisten); +NETIEVENT_SOCKET_REQ_DECL(tlsdnsconnect); +NETIEVENT_SOCKET_REQ_DECL(tlsdnssend); +NETIEVENT_SOCKET_HANDLE_DECL(tlsdnscancel); +NETIEVENT_SOCKET_QUOTA_DECL(tlsdnsaccept); +NETIEVENT_SOCKET_DECL(tlsdnscycle); + +#ifdef HAVE_LIBNGHTTP2 +NETIEVENT_SOCKET_REQ_DECL(httpsend); +NETIEVENT_SOCKET_DECL(httpclose); +NETIEVENT_SOCKET_HTTP_EPS_DECL(httpendpoints); +#endif /* HAVE_LIBNGHTTP2 */ + +NETIEVENT_SOCKET_REQ_DECL(tcpconnect); +NETIEVENT_SOCKET_REQ_DECL(tcpsend); +NETIEVENT_SOCKET_REQ_DECL(tlssend); +NETIEVENT_SOCKET_REQ_DECL(udpconnect); + +NETIEVENT_SOCKET_REQ_DECL(routeconnect); + +NETIEVENT_SOCKET_REQ_RESULT_DECL(connectcb); +NETIEVENT_SOCKET_REQ_RESULT_DECL(readcb); +NETIEVENT_SOCKET_REQ_RESULT_DECL(sendcb); + +NETIEVENT_SOCKET_HANDLE_DECL(udpcancel); +NETIEVENT_SOCKET_HANDLE_DECL(tcpcancel); +NETIEVENT_SOCKET_DECL(detach); + +NETIEVENT_SOCKET_QUOTA_DECL(tcpaccept); + +NETIEVENT_DECL(pause); +NETIEVENT_DECL(resume); +NETIEVENT_DECL(shutdown); +NETIEVENT_DECL(stop); + +NETIEVENT_TASK_DECL(task); +NETIEVENT_TASK_DECL(privilegedtask); + +NETIEVENT_SOCKET_TLSCTX_DECL(settlsctx); +NETIEVENT_SOCKET_DECL(sockstop); + +void +isc__nm_udp_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result); +void +isc__nm_tcp_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result); +void +isc__nm_tcpdns_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result); +void +isc__nm_tlsdns_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result, + bool async); + +isc_result_t +isc__nm_tcpdns_processbuffer(isc_nmsocket_t *sock); +isc_result_t +isc__nm_tlsdns_processbuffer(isc_nmsocket_t *sock); + +isc__nm_uvreq_t * +isc__nm_get_read_req(isc_nmsocket_t *sock, isc_sockaddr_t *sockaddr); + +void +isc__nm_alloc_cb(uv_handle_t *handle, size_t size, uv_buf_t *buf); + +void +isc__nm_udp_read_cb(uv_udp_t *handle, ssize_t nrecv, const uv_buf_t *buf, + const struct sockaddr *addr, unsigned flags); +void +isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf); +void +isc__nm_tcpdns_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf); +void +isc__nm_tlsdns_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf); + +isc_result_t +isc__nm_start_reading(isc_nmsocket_t *sock); +void +isc__nm_stop_reading(isc_nmsocket_t *sock); +isc_result_t +isc__nm_process_sock_buffer(isc_nmsocket_t *sock); +void +isc__nm_resume_processing(void *arg); +bool +isc__nmsocket_closing(isc_nmsocket_t *sock); +bool +isc__nm_closing(isc_nmsocket_t *sock); + +void +isc__nm_alloc_dnsbuf(isc_nmsocket_t *sock, size_t len); + +void +isc__nm_failed_send_cb(isc_nmsocket_t *sock, isc__nm_uvreq_t *req, + isc_result_t eresult); +void +isc__nm_failed_accept_cb(isc_nmsocket_t *sock, isc_result_t eresult); +void +isc__nm_failed_connect_cb(isc_nmsocket_t *sock, isc__nm_uvreq_t *req, + isc_result_t eresult, bool async); +void +isc__nm_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result, bool async); + +void +isc__nm_accept_connection_log(isc_result_t result, bool can_log_quota); + +/* + * Timeout callbacks + */ +void +isc__nmsocket_connecttimeout_cb(uv_timer_t *timer); +void +isc__nmsocket_readtimeout_cb(uv_timer_t *timer); +void +isc__nmsocket_writetimeout_cb(void *data, isc_result_t eresult); + +#define UV_RUNTIME_CHECK(func, ret) \ + if (ret != 0) { \ + FATAL_ERROR("%s failed: %s\n", #func, uv_strerror(ret)); \ + } + +void +isc__nmsocket_log_tls_session_reuse(isc_nmsocket_t *sock, isc_tls_t *tls); diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c new file mode 100644 index 0000000..b19d468 --- /dev/null +++ b/lib/isc/netmgr/netmgr.c @@ -0,0 +1,3991 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#include <inttypes.h> +#include <unistd.h> +#include <uv.h> + +#include <isc/atomic.h> +#include <isc/backtrace.h> +#include <isc/barrier.h> +#include <isc/buffer.h> +#include <isc/condition.h> +#include <isc/errno.h> +#include <isc/list.h> +#include <isc/log.h> +#include <isc/magic.h> +#include <isc/mem.h> +#include <isc/netmgr.h> +#include <isc/print.h> +#include <isc/quota.h> +#include <isc/random.h> +#include <isc/refcount.h> +#include <isc/region.h> +#include <isc/result.h> +#include <isc/sockaddr.h> +#include <isc/stats.h> +#include <isc/task.h> +#include <isc/thread.h> +#include <isc/tls.h> +#include <isc/util.h> + +#include "netmgr-int.h" +#include "netmgr_p.h" +#include "openssl_shim.h" +#include "trampoline_p.h" +#include "uv-compat.h" + +/*% + * How many isc_nmhandles and isc_nm_uvreqs will we be + * caching for reuse in a socket. + */ +#define ISC_NM_HANDLES_STACK_SIZE 600 +#define ISC_NM_REQS_STACK_SIZE 600 + +/*% + * Shortcut index arrays to get access to statistics counters. + */ + +static const isc_statscounter_t udp4statsindex[] = { + isc_sockstatscounter_udp4open, + isc_sockstatscounter_udp4openfail, + isc_sockstatscounter_udp4close, + isc_sockstatscounter_udp4bindfail, + isc_sockstatscounter_udp4connectfail, + isc_sockstatscounter_udp4connect, + -1, + -1, + isc_sockstatscounter_udp4sendfail, + isc_sockstatscounter_udp4recvfail, + isc_sockstatscounter_udp4active +}; + +static const isc_statscounter_t udp6statsindex[] = { + isc_sockstatscounter_udp6open, + isc_sockstatscounter_udp6openfail, + isc_sockstatscounter_udp6close, + isc_sockstatscounter_udp6bindfail, + isc_sockstatscounter_udp6connectfail, + isc_sockstatscounter_udp6connect, + -1, + -1, + isc_sockstatscounter_udp6sendfail, + isc_sockstatscounter_udp6recvfail, + isc_sockstatscounter_udp6active +}; + +static const isc_statscounter_t tcp4statsindex[] = { + isc_sockstatscounter_tcp4open, isc_sockstatscounter_tcp4openfail, + isc_sockstatscounter_tcp4close, isc_sockstatscounter_tcp4bindfail, + isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect, + isc_sockstatscounter_tcp4acceptfail, isc_sockstatscounter_tcp4accept, + isc_sockstatscounter_tcp4sendfail, isc_sockstatscounter_tcp4recvfail, + isc_sockstatscounter_tcp4active +}; + +static const isc_statscounter_t tcp6statsindex[] = { + isc_sockstatscounter_tcp6open, isc_sockstatscounter_tcp6openfail, + isc_sockstatscounter_tcp6close, isc_sockstatscounter_tcp6bindfail, + isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect, + isc_sockstatscounter_tcp6acceptfail, isc_sockstatscounter_tcp6accept, + isc_sockstatscounter_tcp6sendfail, isc_sockstatscounter_tcp6recvfail, + isc_sockstatscounter_tcp6active +}; + +#if 0 +/* XXX: not currently used */ +static const isc_statscounter_t unixstatsindex[] = { + isc_sockstatscounter_unixopen, + isc_sockstatscounter_unixopenfail, + isc_sockstatscounter_unixclose, + isc_sockstatscounter_unixbindfail, + isc_sockstatscounter_unixconnectfail, + isc_sockstatscounter_unixconnect, + isc_sockstatscounter_unixacceptfail, + isc_sockstatscounter_unixaccept, + isc_sockstatscounter_unixsendfail, + isc_sockstatscounter_unixrecvfail, + isc_sockstatscounter_unixactive +}; +#endif /* if 0 */ + +/* + * libuv is not thread safe, but has mechanisms to pass messages + * between threads. Each socket is owned by a thread. For UDP + * sockets we have a set of sockets for each interface and we can + * choose a sibling and send the message directly. For TCP, or if + * we're calling from a non-networking thread, we need to pass the + * request using async_cb. + */ + +static thread_local int isc__nm_tid_v = ISC_NETMGR_TID_UNKNOWN; + +static void +nmsocket_maybe_destroy(isc_nmsocket_t *sock FLARG); +static void +nmhandle_free(isc_nmsocket_t *sock, isc_nmhandle_t *handle); +static isc_threadresult_t +nm_thread(isc_threadarg_t worker0); +static void +async_cb(uv_async_t *handle); + +static bool +process_netievent(isc__networker_t *worker, isc__netievent_t *ievent); +static isc_result_t +process_queue(isc__networker_t *worker, netievent_type_t type); +static void +wait_for_priority_queue(isc__networker_t *worker); +static void +drain_queue(isc__networker_t *worker, netievent_type_t type); + +static void +isc__nm_async_stop(isc__networker_t *worker, isc__netievent_t *ev0); +static void +isc__nm_async_pause(isc__networker_t *worker, isc__netievent_t *ev0); +static void +isc__nm_async_resume(isc__networker_t *worker, isc__netievent_t *ev0); +static void +isc__nm_async_detach(isc__networker_t *worker, isc__netievent_t *ev0); +static void +isc__nm_async_close(isc__networker_t *worker, isc__netievent_t *ev0); + +static void +isc__nm_threadpool_initialize(uint32_t workers); +static void +isc__nm_work_cb(uv_work_t *req); +static void +isc__nm_after_work_cb(uv_work_t *req, int status); + +/*%< + * Issue a 'handle closed' callback on the socket. + */ + +static void +nmhandle_detach_cb(isc_nmhandle_t **handlep FLARG); + +int +isc_nm_tid(void) { + return (isc__nm_tid_v); +} + +bool +isc__nm_in_netthread(void) { + return (isc__nm_tid_v >= 0); +} + +void +isc__nm_force_tid(int tid) { + isc__nm_tid_v = tid; +} + +static void +isc__nm_threadpool_initialize(uint32_t workers) { + char buf[11]; + int r = uv_os_getenv("UV_THREADPOOL_SIZE", buf, + &(size_t){ sizeof(buf) }); + if (r == UV_ENOENT) { + snprintf(buf, sizeof(buf), "%" PRIu32, workers); + uv_os_setenv("UV_THREADPOOL_SIZE", buf); + } +} + +#if HAVE_DECL_UV_UDP_LINUX_RECVERR +#define MINIMAL_UV_VERSION UV_VERSION(1, 42, 0) +#elif HAVE_DECL_UV_UDP_MMSG_FREE +#define MINIMAL_UV_VERSION UV_VERSION(1, 40, 0) +#elif HAVE_DECL_UV_UDP_RECVMMSG +#define MAXIMAL_UV_VERSION UV_VERSION(1, 39, 99) +#define MINIMAL_UV_VERSION UV_VERSION(1, 37, 0) +#else +#define MAXIMAL_UV_VERSION UV_VERSION(1, 34, 99) +#define MINIMAL_UV_VERSION UV_VERSION(1, 0, 0) +#endif + +void +isc__netmgr_create(isc_mem_t *mctx, uint32_t workers, isc_nm_t **netmgrp) { + isc_nm_t *mgr = NULL; + char name[32]; + + REQUIRE(workers > 0); + +#ifdef MAXIMAL_UV_VERSION + if (uv_version() > MAXIMAL_UV_VERSION) { + FATAL_ERROR("libuv version too new: running with libuv %s " + "when compiled with libuv %s will lead to " + "libuv failures", + uv_version_string(), UV_VERSION_STRING); + } +#endif /* MAXIMAL_UV_VERSION */ + + if (uv_version() < MINIMAL_UV_VERSION) { + FATAL_ERROR("libuv version too old: running with libuv %s " + "when compiled with libuv %s will lead to " + "libuv failures", + uv_version_string(), UV_VERSION_STRING); + } + + isc__nm_threadpool_initialize(workers); + + mgr = isc_mem_get(mctx, sizeof(*mgr)); + *mgr = (isc_nm_t){ .nworkers = workers }; + + isc_mem_attach(mctx, &mgr->mctx); + isc_mutex_init(&mgr->lock); + isc_condition_init(&mgr->wkstatecond); + isc_condition_init(&mgr->wkpausecond); + isc_refcount_init(&mgr->references, 1); + atomic_init(&mgr->maxudp, 0); + atomic_init(&mgr->interlocked, ISC_NETMGR_NON_INTERLOCKED); + atomic_init(&mgr->workers_paused, 0); + atomic_init(&mgr->paused, false); + atomic_init(&mgr->closing, false); + atomic_init(&mgr->recv_tcp_buffer_size, 0); + atomic_init(&mgr->send_tcp_buffer_size, 0); + atomic_init(&mgr->recv_udp_buffer_size, 0); + atomic_init(&mgr->send_udp_buffer_size, 0); +#if HAVE_SO_REUSEPORT_LB + mgr->load_balance_sockets = true; +#else + mgr->load_balance_sockets = false; +#endif + +#ifdef NETMGR_TRACE + ISC_LIST_INIT(mgr->active_sockets); +#endif + + /* + * Default TCP timeout values. + * May be updated by isc_nm_tcptimeouts(). + */ + atomic_init(&mgr->init, 30000); + atomic_init(&mgr->idle, 30000); + atomic_init(&mgr->keepalive, 30000); + atomic_init(&mgr->advertised, 30000); + + isc_barrier_init(&mgr->pausing, workers); + isc_barrier_init(&mgr->resuming, workers); + + mgr->workers = isc_mem_get(mctx, workers * sizeof(isc__networker_t)); + for (size_t i = 0; i < workers; i++) { + isc__networker_t *worker = &mgr->workers[i]; + int r; + + *worker = (isc__networker_t){ + .mgr = mgr, + .id = i, + }; + + r = uv_loop_init(&worker->loop); + UV_RUNTIME_CHECK(uv_loop_init, r); + + worker->loop.data = &mgr->workers[i]; + + r = uv_async_init(&worker->loop, &worker->async, async_cb); + UV_RUNTIME_CHECK(uv_async_init, r); + + for (size_t type = 0; type < NETIEVENT_MAX; type++) { + isc_mutex_init(&worker->ievents[type].lock); + isc_condition_init(&worker->ievents[type].cond); + ISC_LIST_INIT(worker->ievents[type].list); + } + + worker->recvbuf = isc_mem_get(mctx, ISC_NETMGR_RECVBUF_SIZE); + worker->sendbuf = isc_mem_get(mctx, ISC_NETMGR_SENDBUF_SIZE); + + /* + * We need to do this here and not in nm_thread to avoid a + * race - we could exit isc_nm_start, launch nm_destroy, + * and nm_thread would still not be up. + */ + mgr->workers_running++; + isc_thread_create(nm_thread, &mgr->workers[i], &worker->thread); + + snprintf(name, sizeof(name), "isc-net-%04zu", i); + isc_thread_setname(worker->thread, name); + } + + mgr->magic = NM_MAGIC; + *netmgrp = mgr; +} + +/* + * Free the resources of the network manager. + */ +static void +nm_destroy(isc_nm_t **mgr0) { + REQUIRE(VALID_NM(*mgr0)); + REQUIRE(!isc__nm_in_netthread()); + + isc_nm_t *mgr = *mgr0; + *mgr0 = NULL; + + isc_refcount_destroy(&mgr->references); + + mgr->magic = 0; + + for (int i = 0; i < mgr->nworkers; i++) { + isc__networker_t *worker = &mgr->workers[i]; + isc__netievent_t *event = isc__nm_get_netievent_stop(mgr); + isc__nm_enqueue_ievent(worker, event); + } + + LOCK(&mgr->lock); + while (mgr->workers_running > 0) { + WAIT(&mgr->wkstatecond, &mgr->lock); + } + UNLOCK(&mgr->lock); + + for (int i = 0; i < mgr->nworkers; i++) { + isc__networker_t *worker = &mgr->workers[i]; + int r; + + r = uv_loop_close(&worker->loop); + UV_RUNTIME_CHECK(uv_loop_close, r); + + for (size_t type = 0; type < NETIEVENT_MAX; type++) { + INSIST(ISC_LIST_EMPTY(worker->ievents[type].list)); + isc_condition_destroy(&worker->ievents[type].cond); + isc_mutex_destroy(&worker->ievents[type].lock); + } + + isc_mem_put(mgr->mctx, worker->sendbuf, + ISC_NETMGR_SENDBUF_SIZE); + isc_mem_put(mgr->mctx, worker->recvbuf, + ISC_NETMGR_RECVBUF_SIZE); + isc_thread_join(worker->thread, NULL); + } + + if (mgr->stats != NULL) { + isc_stats_detach(&mgr->stats); + } + + isc_barrier_destroy(&mgr->resuming); + isc_barrier_destroy(&mgr->pausing); + + isc_condition_destroy(&mgr->wkstatecond); + isc_condition_destroy(&mgr->wkpausecond); + isc_mutex_destroy(&mgr->lock); + + isc_mem_put(mgr->mctx, mgr->workers, + mgr->nworkers * sizeof(isc__networker_t)); + isc_mem_putanddetach(&mgr->mctx, mgr, sizeof(*mgr)); +} + +static void +enqueue_pause(isc__networker_t *worker) { + isc__netievent_pause_t *event = + isc__nm_get_netievent_pause(worker->mgr); + isc__nm_enqueue_ievent(worker, (isc__netievent_t *)event); +} + +static void +isc__nm_async_pause(isc__networker_t *worker, isc__netievent_t *ev0) { + UNUSED(ev0); + REQUIRE(worker->paused == false); + + worker->paused = true; + uv_stop(&worker->loop); +} + +void +isc_nm_pause(isc_nm_t *mgr) { + REQUIRE(VALID_NM(mgr)); + REQUIRE(!atomic_load(&mgr->paused)); + + isc__nm_acquire_interlocked_force(mgr); + + if (isc__nm_in_netthread()) { + REQUIRE(isc_nm_tid() == 0); + } + + for (int i = 0; i < mgr->nworkers; i++) { + isc__networker_t *worker = &mgr->workers[i]; + if (i == isc_nm_tid()) { + isc__nm_async_pause(worker, NULL); + } else { + enqueue_pause(worker); + } + } + + if (isc__nm_in_netthread()) { + atomic_fetch_add(&mgr->workers_paused, 1); + isc_barrier_wait(&mgr->pausing); + } + + LOCK(&mgr->lock); + while (atomic_load(&mgr->workers_paused) != mgr->workers_running) { + WAIT(&mgr->wkstatecond, &mgr->lock); + } + UNLOCK(&mgr->lock); + + atomic_compare_exchange_enforced(&mgr->paused, &(bool){ false }, true); +} + +static void +enqueue_resume(isc__networker_t *worker) { + isc__netievent_resume_t *event = + isc__nm_get_netievent_resume(worker->mgr); + isc__nm_enqueue_ievent(worker, (isc__netievent_t *)event); +} + +static void +isc__nm_async_resume(isc__networker_t *worker, isc__netievent_t *ev0) { + UNUSED(ev0); + REQUIRE(worker->paused == true); + + worker->paused = false; +} + +void +isc_nm_resume(isc_nm_t *mgr) { + REQUIRE(VALID_NM(mgr)); + REQUIRE(atomic_load(&mgr->paused)); + + if (isc__nm_in_netthread()) { + REQUIRE(isc_nm_tid() == 0); + drain_queue(&mgr->workers[isc_nm_tid()], NETIEVENT_PRIORITY); + } + + for (int i = 0; i < mgr->nworkers; i++) { + isc__networker_t *worker = &mgr->workers[i]; + if (i == isc_nm_tid()) { + isc__nm_async_resume(worker, NULL); + } else { + enqueue_resume(worker); + } + } + + if (isc__nm_in_netthread()) { + drain_queue(&mgr->workers[isc_nm_tid()], NETIEVENT_PRIVILEGED); + + atomic_fetch_sub(&mgr->workers_paused, 1); + isc_barrier_wait(&mgr->resuming); + } + + LOCK(&mgr->lock); + while (atomic_load(&mgr->workers_paused) != 0) { + WAIT(&mgr->wkstatecond, &mgr->lock); + } + UNLOCK(&mgr->lock); + + atomic_compare_exchange_enforced(&mgr->paused, &(bool){ true }, false); + + isc__nm_drop_interlocked(mgr); +} + +void +isc_nm_attach(isc_nm_t *mgr, isc_nm_t **dst) { + REQUIRE(VALID_NM(mgr)); + REQUIRE(dst != NULL && *dst == NULL); + + isc_refcount_increment(&mgr->references); + + *dst = mgr; +} + +void +isc_nm_detach(isc_nm_t **mgr0) { + isc_nm_t *mgr = NULL; + + REQUIRE(mgr0 != NULL); + REQUIRE(VALID_NM(*mgr0)); + + mgr = *mgr0; + *mgr0 = NULL; + + if (isc_refcount_decrement(&mgr->references) == 1) { + nm_destroy(&mgr); + } +} + +void +isc__netmgr_shutdown(isc_nm_t *mgr) { + REQUIRE(VALID_NM(mgr)); + + atomic_store(&mgr->closing, true); + for (int i = 0; i < mgr->nworkers; i++) { + isc__netievent_t *event = NULL; + event = isc__nm_get_netievent_shutdown(mgr); + isc__nm_enqueue_ievent(&mgr->workers[i], event); + } +} + +void +isc__netmgr_destroy(isc_nm_t **netmgrp) { + isc_nm_t *mgr = NULL; + int counter = 0; + + REQUIRE(VALID_NM(*netmgrp)); + + mgr = *netmgrp; + + /* + * Close active connections. + */ + isc__netmgr_shutdown(mgr); + + /* + * Wait for the manager to be dereferenced elsewhere. + */ + while (isc_refcount_current(&mgr->references) > 1 && counter++ < 1000) { + uv_sleep(10); + } + +#ifdef NETMGR_TRACE + if (isc_refcount_current(&mgr->references) > 1) { + isc__nm_dump_active(mgr); + UNREACHABLE(); + } +#endif + + /* + * Now just patiently wait + */ + while (isc_refcount_current(&mgr->references) > 1) { + uv_sleep(10); + } + + /* + * Detach final reference. + */ + isc_nm_detach(netmgrp); +} + +void +isc_nm_maxudp(isc_nm_t *mgr, uint32_t maxudp) { + REQUIRE(VALID_NM(mgr)); + + atomic_store(&mgr->maxudp, maxudp); +} + +void +isc_nmhandle_setwritetimeout(isc_nmhandle_t *handle, uint64_t write_timeout) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(handle->sock->tid == isc_nm_tid()); + + switch (handle->sock->type) { + case isc_nm_tcpsocket: + case isc_nm_udpsocket: + case isc_nm_tcpdnssocket: + case isc_nm_tlsdnssocket: + handle->sock->write_timeout = write_timeout; + break; +#ifdef HAVE_LIBNGHTTP2 + case isc_nm_tlssocket: + isc__nmhandle_tls_setwritetimeout(handle, write_timeout); + break; +#endif /* HAVE_LIBNGHTTP2 */ + default: + UNREACHABLE(); + break; + } +} + +void +isc_nm_settimeouts(isc_nm_t *mgr, uint32_t init, uint32_t idle, + uint32_t keepalive, uint32_t advertised) { + REQUIRE(VALID_NM(mgr)); + + atomic_store(&mgr->init, init); + atomic_store(&mgr->idle, idle); + atomic_store(&mgr->keepalive, keepalive); + atomic_store(&mgr->advertised, advertised); +} + +void +isc_nm_setnetbuffers(isc_nm_t *mgr, int32_t recv_tcp, int32_t send_tcp, + int32_t recv_udp, int32_t send_udp) { + REQUIRE(VALID_NM(mgr)); + + atomic_store(&mgr->recv_tcp_buffer_size, recv_tcp); + atomic_store(&mgr->send_tcp_buffer_size, send_tcp); + atomic_store(&mgr->recv_udp_buffer_size, recv_udp); + atomic_store(&mgr->send_udp_buffer_size, send_udp); +} + +bool +isc_nm_getloadbalancesockets(isc_nm_t *mgr) { + REQUIRE(VALID_NM(mgr)); + + return (mgr->load_balance_sockets); +} + +void +isc_nm_setloadbalancesockets(isc_nm_t *mgr, bool enabled) { + REQUIRE(VALID_NM(mgr)); + +#if HAVE_SO_REUSEPORT_LB + mgr->load_balance_sockets = enabled; +#else + UNUSED(enabled); +#endif +} + +void +isc_nm_gettimeouts(isc_nm_t *mgr, uint32_t *initial, uint32_t *idle, + uint32_t *keepalive, uint32_t *advertised) { + REQUIRE(VALID_NM(mgr)); + + if (initial != NULL) { + *initial = atomic_load(&mgr->init); + } + + if (idle != NULL) { + *idle = atomic_load(&mgr->idle); + } + + if (keepalive != NULL) { + *keepalive = atomic_load(&mgr->keepalive); + } + + if (advertised != NULL) { + *advertised = atomic_load(&mgr->advertised); + } +} + +/* + * nm_thread is a single worker thread, that runs uv_run event loop + * until asked to stop. + * + * There are four queues for asynchronous events: + * + * 1. priority queue - netievents on the priority queue are run even when + * the taskmgr enters exclusive mode and the netmgr is paused. This + * is needed to properly start listening on the interfaces, free + * resources on shutdown, or resume from a pause. + * + * 2. privileged task queue - only privileged tasks are queued here and + * this is the first queue that gets processed when network manager + * is unpaused using isc_nm_resume(). All netmgr workers need to + * clean the privileged task queue before they all proceed to normal + * operation. Both task queues are processed when the workers are + * shutting down. + * + * 3. task queue - only (traditional) tasks are scheduled here, and this + * queue and the privileged task queue are both processed when the + * netmgr workers are finishing. This is needed to process the task + * shutdown events. + * + * 4. normal queue - this is the queue with netmgr events, e.g. reading, + * sending, callbacks, etc. + */ + +static isc_threadresult_t +nm_thread(isc_threadarg_t worker0) { + isc__networker_t *worker = (isc__networker_t *)worker0; + isc_nm_t *mgr = worker->mgr; + + isc__nm_tid_v = worker->id; + + while (true) { + /* + * uv_run() runs async_cb() in a loop, which processes + * all four event queues until a "pause" or "stop" event + * is encountered. On pause, we process only priority and + * privileged events until resuming. + */ + int r = uv_run(&worker->loop, UV_RUN_DEFAULT); + INSIST(r > 0 || worker->finished); + + if (worker->paused) { + INSIST(atomic_load(&mgr->interlocked) != isc_nm_tid()); + + atomic_fetch_add(&mgr->workers_paused, 1); + if (isc_barrier_wait(&mgr->pausing) != 0) { + LOCK(&mgr->lock); + SIGNAL(&mgr->wkstatecond); + UNLOCK(&mgr->lock); + } + + while (worker->paused) { + wait_for_priority_queue(worker); + } + + /* + * All workers must drain the privileged event + * queue before we resume from pause. + */ + drain_queue(worker, NETIEVENT_PRIVILEGED); + + atomic_fetch_sub(&mgr->workers_paused, 1); + if (isc_barrier_wait(&mgr->resuming) != 0) { + LOCK(&mgr->lock); + SIGNAL(&mgr->wkstatecond); + UNLOCK(&mgr->lock); + } + } + + if (r == 0) { + INSIST(worker->finished); + break; + } + + INSIST(!worker->finished); + } + + /* + * We are shutting down. Drain the queues. + */ + drain_queue(worker, NETIEVENT_PRIVILEGED); + drain_queue(worker, NETIEVENT_TASK); + + for (size_t type = 0; type < NETIEVENT_MAX; type++) { + LOCK(&worker->ievents[type].lock); + INSIST(ISC_LIST_EMPTY(worker->ievents[type].list)); + UNLOCK(&worker->ievents[type].lock); + } + + LOCK(&mgr->lock); + mgr->workers_running--; + SIGNAL(&mgr->wkstatecond); + UNLOCK(&mgr->lock); + + return ((isc_threadresult_t)0); +} + +static bool +process_all_queues(isc__networker_t *worker) { + bool reschedule = false; + /* + * The queue processing functions will return false when the + * system is pausing or stopping and we don't want to process + * the other queues in such case, but we need the async event + * to be rescheduled in the next uv_run(). + */ + for (size_t type = 0; type < NETIEVENT_MAX; type++) { + isc_result_t result = process_queue(worker, type); + switch (result) { + case ISC_R_SUSPEND: + reschedule = true; + break; + case ISC_R_EMPTY: + /* empty queue */ + break; + case ISC_R_SUCCESS: + reschedule = true; + break; + default: + UNREACHABLE(); + } + } + + return (reschedule); +} + +/* + * async_cb() is a universal callback for 'async' events sent to event loop. + * It's the only way to safely pass data to the libuv event loop. We use a + * single async event and a set of lockless queues of 'isc__netievent_t' + * structures passed from other threads. + */ +static void +async_cb(uv_async_t *handle) { + isc__networker_t *worker = (isc__networker_t *)handle->loop->data; + + if (process_all_queues(worker)) { + /* + * If we didn't process all the events, we need to enqueue + * async_cb to be run in the next iteration of the uv_loop + */ + uv_async_send(handle); + } +} + +static void +isc__nm_async_stop(isc__networker_t *worker, isc__netievent_t *ev0) { + UNUSED(ev0); + worker->finished = true; + /* Close the async handler */ + uv_close((uv_handle_t *)&worker->async, NULL); +} + +void +isc_nm_task_enqueue(isc_nm_t *nm, isc_task_t *task, int threadid) { + isc__netievent_t *event = NULL; + int tid; + isc__networker_t *worker = NULL; + + if (threadid == -1) { + tid = (int)isc_random_uniform(nm->nworkers); + } else { + tid = threadid % nm->nworkers; + } + + worker = &nm->workers[tid]; + + if (isc_task_privileged(task)) { + event = (isc__netievent_t *) + isc__nm_get_netievent_privilegedtask(nm, task); + } else { + event = (isc__netievent_t *)isc__nm_get_netievent_task(nm, + task); + } + + isc__nm_enqueue_ievent(worker, event); +} + +#define isc__nm_async_privilegedtask(worker, ev0) \ + isc__nm_async_task(worker, ev0) + +static void +isc__nm_async_task(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_task_t *ievent = (isc__netievent_task_t *)ev0; + isc_result_t result; + + UNUSED(worker); + + result = isc_task_run(ievent->task); + + /* + * Tasks can block for a long time, especially when used by tools in + * interactive mode. Update the event loop's time to avoid unexpected + * errors when processing later events during the same callback. + * For example, newly started timers can fire too early, because the + * current time was stale. See the note about uv_update_time() in the + * https://docs.libuv.org/en/v1.x/timer.html#c.uv_timer_start page. + */ + uv_update_time(&worker->loop); + + switch (result) { + case ISC_R_QUOTA: + isc_task_ready(ievent->task); + return; + case ISC_R_SUCCESS: + return; + default: + UNREACHABLE(); + } +} + +static void +wait_for_priority_queue(isc__networker_t *worker) { + isc_condition_t *cond = &worker->ievents[NETIEVENT_PRIORITY].cond; + isc_mutex_t *lock = &worker->ievents[NETIEVENT_PRIORITY].lock; + isc__netievent_list_t *list = + &(worker->ievents[NETIEVENT_PRIORITY].list); + + LOCK(lock); + while (ISC_LIST_EMPTY(*list)) { + WAIT(cond, lock); + } + UNLOCK(lock); + + drain_queue(worker, NETIEVENT_PRIORITY); +} + +static void +drain_queue(isc__networker_t *worker, netievent_type_t type) { + bool empty = false; + while (!empty) { + if (process_queue(worker, type) == ISC_R_EMPTY) { + LOCK(&worker->ievents[type].lock); + empty = ISC_LIST_EMPTY(worker->ievents[type].list); + UNLOCK(&worker->ievents[type].lock); + } + } +} + +/* + * The two macros here generate the individual cases for the process_netievent() + * function. The NETIEVENT_CASE(type) macro is the common case, and + * NETIEVENT_CASE_NOMORE(type) is a macro that causes the loop in the + * process_queue() to stop, e.g. it's only used for the netievent that + * stops/pauses processing the enqueued netievents. + */ +#define NETIEVENT_CASE(type) \ + case netievent_##type: { \ + isc__nm_async_##type(worker, ievent); \ + isc__nm_put_netievent_##type( \ + worker->mgr, (isc__netievent_##type##_t *)ievent); \ + return (true); \ + } + +#define NETIEVENT_CASE_NOMORE(type) \ + case netievent_##type: { \ + isc__nm_async_##type(worker, ievent); \ + isc__nm_put_netievent_##type(worker->mgr, ievent); \ + return (false); \ + } + +static bool +process_netievent(isc__networker_t *worker, isc__netievent_t *ievent) { + REQUIRE(worker->id == isc_nm_tid()); + + switch (ievent->type) { + /* Don't process more ievents when we are stopping */ + NETIEVENT_CASE_NOMORE(stop); + + NETIEVENT_CASE(privilegedtask); + NETIEVENT_CASE(task); + + NETIEVENT_CASE(udpconnect); + NETIEVENT_CASE(udplisten); + NETIEVENT_CASE(udpstop); + NETIEVENT_CASE(udpsend); + NETIEVENT_CASE(udpread); + NETIEVENT_CASE(udpcancel); + NETIEVENT_CASE(udpclose); + + NETIEVENT_CASE(routeconnect); + + NETIEVENT_CASE(tcpaccept); + NETIEVENT_CASE(tcpconnect); + NETIEVENT_CASE(tcplisten); + NETIEVENT_CASE(tcpstartread); + NETIEVENT_CASE(tcppauseread); + NETIEVENT_CASE(tcpsend); + NETIEVENT_CASE(tcpstop); + NETIEVENT_CASE(tcpcancel); + NETIEVENT_CASE(tcpclose); + + NETIEVENT_CASE(tcpdnsaccept); + NETIEVENT_CASE(tcpdnslisten); + NETIEVENT_CASE(tcpdnsconnect); + NETIEVENT_CASE(tcpdnssend); + NETIEVENT_CASE(tcpdnscancel); + NETIEVENT_CASE(tcpdnsclose); + NETIEVENT_CASE(tcpdnsread); + NETIEVENT_CASE(tcpdnsstop); + + NETIEVENT_CASE(tlsdnscycle); + NETIEVENT_CASE(tlsdnsaccept); + NETIEVENT_CASE(tlsdnslisten); + NETIEVENT_CASE(tlsdnsconnect); + NETIEVENT_CASE(tlsdnssend); + NETIEVENT_CASE(tlsdnscancel); + NETIEVENT_CASE(tlsdnsclose); + NETIEVENT_CASE(tlsdnsread); + NETIEVENT_CASE(tlsdnsstop); + NETIEVENT_CASE(tlsdnsshutdown); + +#if HAVE_LIBNGHTTP2 + NETIEVENT_CASE(tlsstartread); + NETIEVENT_CASE(tlssend); + NETIEVENT_CASE(tlsclose); + NETIEVENT_CASE(tlsdobio); + NETIEVENT_CASE(tlscancel); + + NETIEVENT_CASE(httpsend); + NETIEVENT_CASE(httpclose); + NETIEVENT_CASE(httpendpoints); +#endif + NETIEVENT_CASE(settlsctx); + NETIEVENT_CASE(sockstop); + + NETIEVENT_CASE(connectcb); + NETIEVENT_CASE(readcb); + NETIEVENT_CASE(sendcb); + + NETIEVENT_CASE(close); + NETIEVENT_CASE(detach); + + NETIEVENT_CASE(shutdown); + NETIEVENT_CASE(resume); + NETIEVENT_CASE_NOMORE(pause); + default: + UNREACHABLE(); + } + return (true); +} + +static isc_result_t +process_queue(isc__networker_t *worker, netievent_type_t type) { + isc__netievent_t *ievent = NULL; + isc__netievent_list_t list; + + ISC_LIST_INIT(list); + + LOCK(&worker->ievents[type].lock); + ISC_LIST_MOVE(list, worker->ievents[type].list); + UNLOCK(&worker->ievents[type].lock); + + ievent = ISC_LIST_HEAD(list); + if (ievent == NULL) { + /* There's nothing scheduled */ + return (ISC_R_EMPTY); + } + + while (ievent != NULL) { + isc__netievent_t *next = ISC_LIST_NEXT(ievent, link); + ISC_LIST_DEQUEUE(list, ievent, link); + + if (!process_netievent(worker, ievent)) { + /* The netievent told us to stop */ + if (!ISC_LIST_EMPTY(list)) { + /* + * Reschedule the rest of the unprocessed + * events. + */ + LOCK(&worker->ievents[type].lock); + ISC_LIST_PREPENDLIST(worker->ievents[type].list, + list, link); + UNLOCK(&worker->ievents[type].lock); + } + return (ISC_R_SUSPEND); + } + + ievent = next; + } + + /* We processed at least one */ + return (ISC_R_SUCCESS); +} + +void * +isc__nm_get_netievent(isc_nm_t *mgr, isc__netievent_type type) { + isc__netievent_storage_t *event = isc_mem_get(mgr->mctx, + sizeof(*event)); + + *event = (isc__netievent_storage_t){ .ni.type = type }; + ISC_LINK_INIT(&(event->ni), link); + return (event); +} + +void +isc__nm_put_netievent(isc_nm_t *mgr, void *ievent) { + isc_mem_put(mgr->mctx, ievent, sizeof(isc__netievent_storage_t)); +} + +NETIEVENT_SOCKET_DEF(tcpclose); +NETIEVENT_SOCKET_DEF(tcplisten); +NETIEVENT_SOCKET_DEF(tcppauseread); +NETIEVENT_SOCKET_DEF(tcpstartread); +NETIEVENT_SOCKET_DEF(tcpstop); +NETIEVENT_SOCKET_DEF(tlsclose); +NETIEVENT_SOCKET_DEF(tlsconnect); +NETIEVENT_SOCKET_DEF(tlsdobio); +NETIEVENT_SOCKET_DEF(tlsstartread); +NETIEVENT_SOCKET_HANDLE_DEF(tlscancel); +NETIEVENT_SOCKET_DEF(udpclose); +NETIEVENT_SOCKET_DEF(udplisten); +NETIEVENT_SOCKET_DEF(udpread); +NETIEVENT_SOCKET_DEF(udpsend); +NETIEVENT_SOCKET_DEF(udpstop); + +NETIEVENT_SOCKET_DEF(tcpdnsclose); +NETIEVENT_SOCKET_DEF(tcpdnsread); +NETIEVENT_SOCKET_DEF(tcpdnsstop); +NETIEVENT_SOCKET_DEF(tcpdnslisten); +NETIEVENT_SOCKET_REQ_DEF(tcpdnsconnect); +NETIEVENT_SOCKET_REQ_DEF(tcpdnssend); +NETIEVENT_SOCKET_HANDLE_DEF(tcpdnscancel); +NETIEVENT_SOCKET_QUOTA_DEF(tcpdnsaccept); + +NETIEVENT_SOCKET_DEF(tlsdnsclose); +NETIEVENT_SOCKET_DEF(tlsdnsread); +NETIEVENT_SOCKET_DEF(tlsdnsstop); +NETIEVENT_SOCKET_DEF(tlsdnslisten); +NETIEVENT_SOCKET_REQ_DEF(tlsdnsconnect); +NETIEVENT_SOCKET_REQ_DEF(tlsdnssend); +NETIEVENT_SOCKET_HANDLE_DEF(tlsdnscancel); +NETIEVENT_SOCKET_QUOTA_DEF(tlsdnsaccept); +NETIEVENT_SOCKET_DEF(tlsdnscycle); +NETIEVENT_SOCKET_DEF(tlsdnsshutdown); + +#ifdef HAVE_LIBNGHTTP2 +NETIEVENT_SOCKET_REQ_DEF(httpsend); +NETIEVENT_SOCKET_DEF(httpclose); +NETIEVENT_SOCKET_HTTP_EPS_DEF(httpendpoints); +#endif /* HAVE_LIBNGHTTP2 */ + +NETIEVENT_SOCKET_REQ_DEF(tcpconnect); +NETIEVENT_SOCKET_REQ_DEF(tcpsend); +NETIEVENT_SOCKET_REQ_DEF(tlssend); +NETIEVENT_SOCKET_REQ_DEF(udpconnect); +NETIEVENT_SOCKET_REQ_DEF(routeconnect); +NETIEVENT_SOCKET_REQ_RESULT_DEF(connectcb); +NETIEVENT_SOCKET_REQ_RESULT_DEF(readcb); +NETIEVENT_SOCKET_REQ_RESULT_DEF(sendcb); + +NETIEVENT_SOCKET_DEF(detach); +NETIEVENT_SOCKET_HANDLE_DEF(tcpcancel); +NETIEVENT_SOCKET_HANDLE_DEF(udpcancel); + +NETIEVENT_SOCKET_QUOTA_DEF(tcpaccept); + +NETIEVENT_SOCKET_DEF(close); +NETIEVENT_DEF(pause); +NETIEVENT_DEF(resume); +NETIEVENT_DEF(shutdown); +NETIEVENT_DEF(stop); + +NETIEVENT_TASK_DEF(task); +NETIEVENT_TASK_DEF(privilegedtask); + +NETIEVENT_SOCKET_TLSCTX_DEF(settlsctx); +NETIEVENT_SOCKET_DEF(sockstop); + +void +isc__nm_maybe_enqueue_ievent(isc__networker_t *worker, + isc__netievent_t *event) { + /* + * If we are already in the matching nmthread, process the ievent + * directly. + */ + if (worker->id == isc_nm_tid()) { + process_netievent(worker, event); + return; + } + + isc__nm_enqueue_ievent(worker, event); +} + +void +isc__nm_enqueue_ievent(isc__networker_t *worker, isc__netievent_t *event) { + netievent_type_t type; + + if (event->type > netievent_prio) { + type = NETIEVENT_PRIORITY; + } else { + switch (event->type) { + case netievent_prio: + UNREACHABLE(); + break; + case netievent_privilegedtask: + type = NETIEVENT_PRIVILEGED; + break; + case netievent_task: + type = NETIEVENT_TASK; + break; + default: + type = NETIEVENT_NORMAL; + break; + } + } + + /* + * We need to make sure this signal will be delivered and + * the queue will be processed. + */ + LOCK(&worker->ievents[type].lock); + ISC_LIST_ENQUEUE(worker->ievents[type].list, event, link); + if (type == NETIEVENT_PRIORITY) { + SIGNAL(&worker->ievents[type].cond); + } + UNLOCK(&worker->ievents[type].lock); + + uv_async_send(&worker->async); +} + +bool +isc__nmsocket_active(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + if (sock->parent != NULL) { + return (atomic_load(&sock->parent->active)); + } + + return (atomic_load(&sock->active)); +} + +bool +isc__nmsocket_deactivate(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + + if (sock->parent != NULL) { + return (atomic_compare_exchange_strong(&sock->parent->active, + &(bool){ true }, false)); + } + + return (atomic_compare_exchange_strong(&sock->active, &(bool){ true }, + false)); +} + +void +isc___nmsocket_attach(isc_nmsocket_t *sock, isc_nmsocket_t **target FLARG) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(target != NULL && *target == NULL); + + isc_nmsocket_t *rsock = NULL; + + if (sock->parent != NULL) { + rsock = sock->parent; + INSIST(rsock->parent == NULL); /* sanity check */ + } else { + rsock = sock; + } + + NETMGR_TRACE_LOG("isc__nmsocket_attach():%p->references = %" PRIuFAST32 + "\n", + rsock, isc_refcount_current(&rsock->references) + 1); + + isc_refcount_increment0(&rsock->references); + + *target = sock; +} + +/* + * Free all resources inside a socket (including its children if any). + */ +static void +nmsocket_cleanup(isc_nmsocket_t *sock, bool dofree FLARG) { + isc_nmhandle_t *handle = NULL; + isc__nm_uvreq_t *uvreq = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(!isc__nmsocket_active(sock)); + + NETMGR_TRACE_LOG("nmsocket_cleanup():%p->references = %" PRIuFAST32 + "\n", + sock, isc_refcount_current(&sock->references)); + + isc__nm_decstats(sock, STATID_ACTIVE); + + atomic_store(&sock->destroying, true); + + if (sock->parent == NULL && sock->children != NULL) { + /* + * We shouldn't be here unless there are no active handles, + * so we can clean up and free the children. + */ + for (size_t i = 0; i < sock->nchildren; i++) { + if (!atomic_load(&sock->children[i].destroying)) { + nmsocket_cleanup(&sock->children[i], + false FLARG_PASS); + } + } + + /* + * This was a parent socket: destroy the listening + * barriers that synchronized the children. + */ + isc_barrier_destroy(&sock->startlistening); + isc_barrier_destroy(&sock->stoplistening); + + /* + * Now free them. + */ + isc_mem_put(sock->mgr->mctx, sock->children, + sock->nchildren * sizeof(*sock)); + sock->children = NULL; + sock->nchildren = 0; + } + + sock->statichandle = NULL; + + if (sock->outerhandle != NULL) { + isc__nmhandle_detach(&sock->outerhandle FLARG_PASS); + } + + if (sock->outer != NULL) { + isc___nmsocket_detach(&sock->outer FLARG_PASS); + } + + while ((handle = isc_astack_pop(sock->inactivehandles)) != NULL) { + nmhandle_free(sock, handle); + } + + if (sock->buf != NULL) { + isc_mem_put(sock->mgr->mctx, sock->buf, sock->buf_size); + } + + if (sock->quota != NULL) { + isc_quota_detach(&sock->quota); + } + + sock->pquota = NULL; + + isc_astack_destroy(sock->inactivehandles); + + while ((uvreq = isc_astack_pop(sock->inactivereqs)) != NULL) { + isc_mem_put(sock->mgr->mctx, uvreq, sizeof(*uvreq)); + } + + isc_astack_destroy(sock->inactivereqs); + sock->magic = 0; + + isc_condition_destroy(&sock->scond); + isc_condition_destroy(&sock->cond); + isc_mutex_destroy(&sock->lock); + isc__nm_tlsdns_cleanup_data(sock); +#if HAVE_LIBNGHTTP2 + isc__nm_tls_cleanup_data(sock); + isc__nm_http_cleanup_data(sock); +#endif + + INSIST(ISC_LIST_EMPTY(sock->tls.sendreqs)); + + if (sock->barrier_initialised) { + isc_barrier_destroy(&sock->barrier); + } + +#ifdef NETMGR_TRACE + LOCK(&sock->mgr->lock); + ISC_LIST_UNLINK(sock->mgr->active_sockets, sock, active_link); + UNLOCK(&sock->mgr->lock); +#endif + if (dofree) { + isc_nm_t *mgr = sock->mgr; + isc_mem_put(mgr->mctx, sock, sizeof(*sock)); + isc_nm_detach(&mgr); + } else { + isc_nm_detach(&sock->mgr); + } +} + +static void +nmsocket_maybe_destroy(isc_nmsocket_t *sock FLARG) { + int active_handles; + bool destroy = false; + + NETMGR_TRACE_LOG("%s():%p->references = %" PRIuFAST32 "\n", __func__, + sock, isc_refcount_current(&sock->references)); + + if (sock->parent != NULL) { + /* + * This is a child socket and cannot be destroyed except + * as a side effect of destroying the parent, so let's go + * see if the parent is ready to be destroyed. + */ + nmsocket_maybe_destroy(sock->parent FLARG_PASS); + return; + } + + /* + * This is a parent socket (or a standalone). See whether the + * children have active handles before deciding whether to + * accept destruction. + */ + LOCK(&sock->lock); + if (atomic_load(&sock->active) || atomic_load(&sock->destroying) || + !atomic_load(&sock->closed) || atomic_load(&sock->references) != 0) + { + UNLOCK(&sock->lock); + return; + } + + active_handles = atomic_load(&sock->ah); + if (sock->children != NULL) { + for (size_t i = 0; i < sock->nchildren; i++) { + LOCK(&sock->children[i].lock); + active_handles += atomic_load(&sock->children[i].ah); + UNLOCK(&sock->children[i].lock); + } + } + + if (active_handles == 0 || sock->statichandle != NULL) { + destroy = true; + } + + NETMGR_TRACE_LOG("%s:%p->active_handles = %d, .statichandle = %p\n", + __func__, sock, active_handles, sock->statichandle); + + if (destroy) { + atomic_store(&sock->destroying, true); + UNLOCK(&sock->lock); + nmsocket_cleanup(sock, true FLARG_PASS); + } else { + UNLOCK(&sock->lock); + } +} + +void +isc___nmsocket_prep_destroy(isc_nmsocket_t *sock FLARG) { + REQUIRE(sock->parent == NULL); + + NETMGR_TRACE_LOG("isc___nmsocket_prep_destroy():%p->references = " + "%" PRIuFAST32 "\n", + sock, isc_refcount_current(&sock->references)); + + /* + * The final external reference to the socket is gone. We can try + * destroying the socket, but we have to wait for all the inflight + * handles to finish first. + */ + atomic_store(&sock->active, false); + + /* + * If the socket has children, they'll need to be marked inactive + * so they can be cleaned up too. + */ + if (sock->children != NULL) { + for (size_t i = 0; i < sock->nchildren; i++) { + atomic_store(&sock->children[i].active, false); + } + } + + /* + * If we're here then we already stopped listening; otherwise + * we'd have a hanging reference from the listening process. + * + * If it's a regular socket we may need to close it. + */ + if (!atomic_load(&sock->closed)) { + switch (sock->type) { + case isc_nm_udpsocket: + isc__nm_udp_close(sock); + return; + case isc_nm_tcpsocket: + isc__nm_tcp_close(sock); + return; + case isc_nm_tcpdnssocket: + isc__nm_tcpdns_close(sock); + return; + case isc_nm_tlsdnssocket: + isc__nm_tlsdns_close(sock); + return; +#if HAVE_LIBNGHTTP2 + case isc_nm_tlssocket: + isc__nm_tls_close(sock); + break; + case isc_nm_httpsocket: + isc__nm_http_close(sock); + return; +#endif + default: + break; + } + } + + nmsocket_maybe_destroy(sock FLARG_PASS); +} + +void +isc___nmsocket_detach(isc_nmsocket_t **sockp FLARG) { + REQUIRE(sockp != NULL && *sockp != NULL); + REQUIRE(VALID_NMSOCK(*sockp)); + + isc_nmsocket_t *sock = *sockp, *rsock = NULL; + *sockp = NULL; + + /* + * If the socket is a part of a set (a child socket) we are + * counting references for the whole set at the parent. + */ + if (sock->parent != NULL) { + rsock = sock->parent; + INSIST(rsock->parent == NULL); /* Sanity check */ + } else { + rsock = sock; + } + + NETMGR_TRACE_LOG("isc__nmsocket_detach():%p->references = %" PRIuFAST32 + "\n", + rsock, isc_refcount_current(&rsock->references) - 1); + + if (isc_refcount_decrement(&rsock->references) == 1) { + isc___nmsocket_prep_destroy(rsock FLARG_PASS); + } +} + +void +isc_nmsocket_close(isc_nmsocket_t **sockp) { + REQUIRE(sockp != NULL); + REQUIRE(VALID_NMSOCK(*sockp)); + REQUIRE((*sockp)->type == isc_nm_udplistener || + (*sockp)->type == isc_nm_tcplistener || + (*sockp)->type == isc_nm_tcpdnslistener || + (*sockp)->type == isc_nm_tlsdnslistener || + (*sockp)->type == isc_nm_tlslistener || + (*sockp)->type == isc_nm_httplistener); + + isc__nmsocket_detach(sockp); +} + +void +isc___nmsocket_init(isc_nmsocket_t *sock, isc_nm_t *mgr, isc_nmsocket_type type, + isc_sockaddr_t *iface FLARG) { + uint16_t family; + + REQUIRE(sock != NULL); + REQUIRE(mgr != NULL); + + *sock = (isc_nmsocket_t){ .type = type, + .fd = -1, + .inactivehandles = isc_astack_new( + mgr->mctx, ISC_NM_HANDLES_STACK_SIZE), + .inactivereqs = isc_astack_new( + mgr->mctx, ISC_NM_REQS_STACK_SIZE) }; + + ISC_LIST_INIT(sock->tls.sendreqs); + + if (iface != NULL) { + family = iface->type.sa.sa_family; + sock->iface = *iface; + } else { + family = AF_UNSPEC; + } + +#if NETMGR_TRACE + sock->backtrace_size = isc_backtrace(sock->backtrace, TRACE_SIZE); + ISC_LINK_INIT(sock, active_link); + ISC_LIST_INIT(sock->active_handles); + LOCK(&mgr->lock); + ISC_LIST_APPEND(mgr->active_sockets, sock, active_link); + UNLOCK(&mgr->lock); +#endif + + isc_nm_attach(mgr, &sock->mgr); + sock->uv_handle.handle.data = sock; + + ISC_LINK_INIT(&sock->quotacb, link); + + switch (type) { + case isc_nm_udpsocket: + case isc_nm_udplistener: + switch (family) { + case AF_INET: + sock->statsindex = udp4statsindex; + break; + case AF_INET6: + sock->statsindex = udp6statsindex; + break; + case AF_UNSPEC: + /* + * Route sockets are AF_UNSPEC, and don't + * have stats counters. + */ + break; + default: + UNREACHABLE(); + } + break; + case isc_nm_tcpsocket: + case isc_nm_tcplistener: + case isc_nm_tcpdnssocket: + case isc_nm_tcpdnslistener: + case isc_nm_tlsdnssocket: + case isc_nm_tlsdnslistener: + case isc_nm_httpsocket: + case isc_nm_httplistener: + switch (family) { + case AF_INET: + sock->statsindex = tcp4statsindex; + break; + case AF_INET6: + sock->statsindex = tcp6statsindex; + break; + default: + UNREACHABLE(); + } + break; + default: + break; + } + + isc_mutex_init(&sock->lock); + isc_condition_init(&sock->cond); + isc_condition_init(&sock->scond); + isc_refcount_init(&sock->references, 1); + +#if HAVE_LIBNGHTTP2 + memset(&sock->tlsstream, 0, sizeof(sock->tlsstream)); +#endif /* HAVE_LIBNGHTTP2 */ + + NETMGR_TRACE_LOG("isc__nmsocket_init():%p->references = %" PRIuFAST32 + "\n", + sock, isc_refcount_current(&sock->references)); + + atomic_init(&sock->active, true); + atomic_init(&sock->sequential, false); + atomic_init(&sock->readpaused, false); + atomic_init(&sock->closing, false); + atomic_init(&sock->listening, 0); + atomic_init(&sock->closed, 0); + atomic_init(&sock->destroying, 0); + atomic_init(&sock->ah, 0); + atomic_init(&sock->client, 0); + atomic_init(&sock->connecting, false); + atomic_init(&sock->keepalive, false); + atomic_init(&sock->connected, false); + atomic_init(&sock->timedout, false); + + atomic_init(&sock->active_child_connections, 0); + +#if HAVE_LIBNGHTTP2 + isc__nm_http_initsocket(sock); +#endif + + sock->magic = NMSOCK_MAGIC; + + isc__nm_incstats(sock, STATID_ACTIVE); +} + +void +isc__nmsocket_clearcb(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(!isc__nm_in_netthread() || sock->tid == isc_nm_tid()); + + sock->recv_cb = NULL; + sock->recv_cbarg = NULL; + sock->accept_cb = NULL; + sock->accept_cbarg = NULL; + sock->connect_cb = NULL; + sock->connect_cbarg = NULL; +} + +void +isc__nm_free_uvbuf(isc_nmsocket_t *sock, const uv_buf_t *buf) { + isc__networker_t *worker = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + + worker = &sock->mgr->workers[sock->tid]; + REQUIRE(buf->base == worker->recvbuf); + + worker->recvbuf_inuse = false; +} + +static isc_nmhandle_t * +alloc_handle(isc_nmsocket_t *sock) { + isc_nmhandle_t *handle = + isc_mem_get(sock->mgr->mctx, + sizeof(isc_nmhandle_t) + sock->extrahandlesize); + + *handle = (isc_nmhandle_t){ .magic = NMHANDLE_MAGIC }; +#ifdef NETMGR_TRACE + ISC_LINK_INIT(handle, active_link); +#endif + isc_refcount_init(&handle->references, 1); + + return (handle); +} + +isc_nmhandle_t * +isc___nmhandle_get(isc_nmsocket_t *sock, isc_sockaddr_t *peer, + isc_sockaddr_t *local FLARG) { + isc_nmhandle_t *handle = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + + handle = isc_astack_pop(sock->inactivehandles); + + if (handle == NULL) { + handle = alloc_handle(sock); + } else { + isc_refcount_init(&handle->references, 1); + INSIST(VALID_NMHANDLE(handle)); + } + + NETMGR_TRACE_LOG( + "isc__nmhandle_get():handle %p->references = %" PRIuFAST32 "\n", + handle, isc_refcount_current(&handle->references)); + + isc___nmsocket_attach(sock, &handle->sock FLARG_PASS); + +#if NETMGR_TRACE + handle->backtrace_size = isc_backtrace(handle->backtrace, TRACE_SIZE); +#endif + + if (peer != NULL) { + handle->peer = *peer; + } else { + handle->peer = sock->peer; + } + + if (local != NULL) { + handle->local = *local; + } else { + handle->local = sock->iface; + } + + (void)atomic_fetch_add(&sock->ah, 1); + +#ifdef NETMGR_TRACE + LOCK(&sock->lock); + ISC_LIST_APPEND(sock->active_handles, handle, active_link); + UNLOCK(&sock->lock); +#endif + + switch (sock->type) { + case isc_nm_udpsocket: + case isc_nm_tcpdnssocket: + case isc_nm_tlsdnssocket: + if (!atomic_load(&sock->client)) { + break; + } + FALLTHROUGH; + case isc_nm_tcpsocket: + case isc_nm_tlssocket: + INSIST(sock->statichandle == NULL); + + /* + * statichandle must be assigned, not attached; + * otherwise, if a handle was detached elsewhere + * it could never reach 0 references, and the + * handle and socket would never be freed. + */ + sock->statichandle = handle; + break; + default: + break; + } + +#if HAVE_LIBNGHTTP2 + if (sock->type == isc_nm_httpsocket && sock->h2.session) { + isc__nm_httpsession_attach(sock->h2.session, + &handle->httpsession); + } +#endif + + return (handle); +} + +void +isc__nmhandle_attach(isc_nmhandle_t *handle, isc_nmhandle_t **handlep FLARG) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(handlep != NULL && *handlep == NULL); + + NETMGR_TRACE_LOG("isc__nmhandle_attach():handle %p->references = " + "%" PRIuFAST32 "\n", + handle, isc_refcount_current(&handle->references) + 1); + + isc_refcount_increment(&handle->references); + *handlep = handle; +} + +bool +isc_nmhandle_is_stream(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + + return (handle->sock->type == isc_nm_tcpsocket || + handle->sock->type == isc_nm_tcpdnssocket || + handle->sock->type == isc_nm_tlssocket || + handle->sock->type == isc_nm_tlsdnssocket || + handle->sock->type == isc_nm_httpsocket); +} + +static void +nmhandle_free(isc_nmsocket_t *sock, isc_nmhandle_t *handle) { + size_t extra = sock->extrahandlesize; + + isc_refcount_destroy(&handle->references); + + if (handle->dofree != NULL) { + handle->dofree(handle->opaque); + } + + *handle = (isc_nmhandle_t){ .magic = 0 }; + + isc_mem_put(sock->mgr->mctx, handle, sizeof(isc_nmhandle_t) + extra); +} + +static void +nmhandle_deactivate(isc_nmsocket_t *sock, isc_nmhandle_t *handle) { + bool reuse = false; + uint_fast32_t ah; + + /* + * We do all of this under lock to avoid races with socket + * destruction. We have to do this now, because at this point the + * socket is either unused or still attached to event->sock. + */ + LOCK(&sock->lock); + +#ifdef NETMGR_TRACE + ISC_LIST_UNLINK(sock->active_handles, handle, active_link); +#endif + + ah = atomic_fetch_sub(&sock->ah, 1); + INSIST(ah > 0); + +#if !__SANITIZE_ADDRESS__ && !__SANITIZE_THREAD__ + if (atomic_load(&sock->active)) { + reuse = isc_astack_trypush(sock->inactivehandles, handle); + } +#endif /* !__SANITIZE_ADDRESS__ && !__SANITIZE_THREAD__ */ + if (!reuse) { + nmhandle_free(sock, handle); + } + UNLOCK(&sock->lock); +} + +void +isc__nmhandle_detach(isc_nmhandle_t **handlep FLARG) { + isc_nmsocket_t *sock = NULL; + isc_nmhandle_t *handle = NULL; + + REQUIRE(handlep != NULL); + REQUIRE(VALID_NMHANDLE(*handlep)); + + handle = *handlep; + *handlep = NULL; + + /* + * If the closehandle_cb is set, it needs to run asynchronously to + * ensure correct ordering of the isc__nm_process_sock_buffer(). + */ + sock = handle->sock; + if (sock->tid == isc_nm_tid() && sock->closehandle_cb == NULL) { + nmhandle_detach_cb(&handle FLARG_PASS); + } else { + isc__netievent_detach_t *event = + isc__nm_get_netievent_detach(sock->mgr, sock); + /* + * we are using implicit "attach" as the last reference + * need to be destroyed explicitly in the async callback + */ + event->handle = handle; + FLARG_IEVENT_PASS(event); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)event); + } +} + +static void +nmhandle_detach_cb(isc_nmhandle_t **handlep FLARG) { + isc_nmsocket_t *sock = NULL; + isc_nmhandle_t *handle = NULL; + + REQUIRE(handlep != NULL); + REQUIRE(VALID_NMHANDLE(*handlep)); + + handle = *handlep; + *handlep = NULL; + + NETMGR_TRACE_LOG("isc__nmhandle_detach():%p->references = %" PRIuFAST32 + "\n", + handle, isc_refcount_current(&handle->references) - 1); + + if (isc_refcount_decrement(&handle->references) > 1) { + return; + } + + /* We need an acquire memory barrier here */ + (void)isc_refcount_current(&handle->references); + + sock = handle->sock; + handle->sock = NULL; + + if (handle->doreset != NULL) { + handle->doreset(handle->opaque); + } + +#if HAVE_LIBNGHTTP2 + if (sock->type == isc_nm_httpsocket && handle->httpsession != NULL) { + isc__nm_httpsession_detach(&handle->httpsession); + } +#endif + + nmhandle_deactivate(sock, handle); + + /* + * The handle is gone now. If the socket has a callback configured + * for that (e.g., to perform cleanup after request processing), + * call it now, or schedule it to run asynchronously. + */ + if (sock->closehandle_cb != NULL) { + if (sock->tid == isc_nm_tid()) { + sock->closehandle_cb(sock); + } else { + isc__netievent_close_t *event = + isc__nm_get_netievent_close(sock->mgr, sock); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)event); + } + } + + if (handle == sock->statichandle) { + /* statichandle is assigned, not attached. */ + sock->statichandle = NULL; + } + + isc___nmsocket_detach(&sock FLARG_PASS); +} + +void * +isc_nmhandle_getdata(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + + return (handle->opaque); +} + +void +isc_nmhandle_setdata(isc_nmhandle_t *handle, void *arg, + isc_nm_opaquecb_t doreset, isc_nm_opaquecb_t dofree) { + REQUIRE(VALID_NMHANDLE(handle)); + + handle->opaque = arg; + handle->doreset = doreset; + handle->dofree = dofree; +} + +void +isc__nm_alloc_dnsbuf(isc_nmsocket_t *sock, size_t len) { + REQUIRE(len <= NM_BIG_BUF); + + if (sock->buf == NULL) { + /* We don't have the buffer at all */ + size_t alloc_len = len < NM_REG_BUF ? NM_REG_BUF : NM_BIG_BUF; + sock->buf = isc_mem_get(sock->mgr->mctx, alloc_len); + sock->buf_size = alloc_len; + } else { + /* We have the buffer but it's too small */ + sock->buf = isc_mem_reget(sock->mgr->mctx, sock->buf, + sock->buf_size, NM_BIG_BUF); + sock->buf_size = NM_BIG_BUF; + } +} + +void +isc__nm_failed_send_cb(isc_nmsocket_t *sock, isc__nm_uvreq_t *req, + isc_result_t eresult) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(req)); + + if (req->cb.send != NULL) { + isc__nm_sendcb(sock, req, eresult, true); + } else { + isc__nm_uvreq_put(&req, sock); + } +} + +void +isc__nm_failed_accept_cb(isc_nmsocket_t *sock, isc_result_t eresult) { + REQUIRE(atomic_load(&sock->accepting)); + REQUIRE(sock->server); + + /* + * Detach the quota early to make room for other connections; + * otherwise it'd be detached later asynchronously, and clog + * the quota unnecessarily. + */ + if (sock->quota != NULL) { + isc_quota_detach(&sock->quota); + } + + isc__nmsocket_detach(&sock->server); + + atomic_store(&sock->accepting, false); + + switch (eresult) { + case ISC_R_NOTCONNECTED: + /* IGNORE: The client disconnected before we could accept */ + break; + default: + isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_NETMGR, ISC_LOG_ERROR, + "Accepting TCP connection failed: %s", + isc_result_totext(eresult)); + } +} + +void +isc__nm_failed_connect_cb(isc_nmsocket_t *sock, isc__nm_uvreq_t *req, + isc_result_t eresult, bool async) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(req)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(req->cb.connect != NULL); + + isc__nm_incstats(sock, STATID_CONNECTFAIL); + + isc__nmsocket_timer_stop(sock); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + + atomic_compare_exchange_enforced(&sock->connecting, &(bool){ true }, + false); + + isc__nmsocket_clearcb(sock); + isc__nm_connectcb(sock, req, eresult, async); + + isc__nmsocket_prep_destroy(sock); +} + +void +isc__nm_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result, bool async) { + REQUIRE(VALID_NMSOCK(sock)); + switch (sock->type) { + case isc_nm_udpsocket: + isc__nm_udp_failed_read_cb(sock, result); + return; + case isc_nm_tcpsocket: + isc__nm_tcp_failed_read_cb(sock, result); + return; + case isc_nm_tcpdnssocket: + isc__nm_tcpdns_failed_read_cb(sock, result); + return; + case isc_nm_tlsdnssocket: + isc__nm_tlsdns_failed_read_cb(sock, result, async); + return; + default: + UNREACHABLE(); + } +} + +void +isc__nmsocket_connecttimeout_cb(uv_timer_t *timer) { + uv_connect_t *uvreq = uv_handle_get_data((uv_handle_t *)timer); + isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)uvreq->handle); + isc__nm_uvreq_t *req = uv_handle_get_data((uv_handle_t *)uvreq); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->connecting)); + REQUIRE(VALID_UVREQ(req)); + REQUIRE(VALID_NMHANDLE(req->handle)); + + isc__nmsocket_timer_stop(sock); + + /* + * Mark the connection as timed out and shutdown the socket. + */ + atomic_compare_exchange_enforced(&sock->timedout, &(bool){ false }, + true); + isc__nmsocket_clearcb(sock); + isc__nmsocket_shutdown(sock); +} + +void +isc__nm_accept_connection_log(isc_result_t result, bool can_log_quota) { + int level; + + switch (result) { + case ISC_R_SUCCESS: + case ISC_R_NOCONN: + return; + case ISC_R_QUOTA: + case ISC_R_SOFTQUOTA: + if (!can_log_quota) { + return; + } + level = ISC_LOG_INFO; + break; + case ISC_R_NOTCONNECTED: + level = ISC_LOG_INFO; + break; + default: + level = ISC_LOG_ERROR; + } + + isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_NETMGR, + level, "Accepting TCP connection failed: %s", + isc_result_totext(result)); +} + +void +isc__nmsocket_writetimeout_cb(void *data, isc_result_t eresult) { + isc__nm_uvreq_t *req = data; + isc_nmsocket_t *sock = NULL; + + REQUIRE(eresult == ISC_R_TIMEDOUT); + REQUIRE(VALID_UVREQ(req)); + REQUIRE(VALID_NMSOCK(req->sock)); + + sock = req->sock; + + isc__nmsocket_reset(sock); +} + +void +isc__nmsocket_readtimeout_cb(uv_timer_t *timer) { + isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)timer); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->reading)); + + if (atomic_load(&sock->client)) { + uv_timer_stop(timer); + + sock->recv_read = false; + + if (sock->recv_cb != NULL) { + isc__nm_uvreq_t *req = isc__nm_get_read_req(sock, NULL); + isc__nm_readcb(sock, req, ISC_R_TIMEDOUT); + } + + if (!isc__nmsocket_timer_running(sock)) { + isc__nmsocket_clearcb(sock); + isc__nm_failed_read_cb(sock, ISC_R_CANCELED, false); + } + } else { + isc__nm_failed_read_cb(sock, ISC_R_TIMEDOUT, false); + } +} + +void +isc__nmsocket_timer_restart(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + + if (uv_is_closing((uv_handle_t *)&sock->read_timer)) { + return; + } + + if (atomic_load(&sock->connecting)) { + int r; + + if (sock->connect_timeout == 0) { + return; + } + + r = uv_timer_start(&sock->read_timer, + isc__nmsocket_connecttimeout_cb, + sock->connect_timeout + 10, 0); + UV_RUNTIME_CHECK(uv_timer_start, r); + + } else { + int r; + + if (sock->read_timeout == 0) { + return; + } + + r = uv_timer_start(&sock->read_timer, + isc__nmsocket_readtimeout_cb, + sock->read_timeout, 0); + UV_RUNTIME_CHECK(uv_timer_start, r); + } +} + +bool +isc__nmsocket_timer_running(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + + return (uv_is_active((uv_handle_t *)&sock->read_timer)); +} + +void +isc__nmsocket_timer_start(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + + if (isc__nmsocket_timer_running(sock)) { + return; + } + + isc__nmsocket_timer_restart(sock); +} + +void +isc__nmsocket_timer_stop(isc_nmsocket_t *sock) { + int r; + + REQUIRE(VALID_NMSOCK(sock)); + + /* uv_timer_stop() is idempotent, no need to check if running */ + + r = uv_timer_stop(&sock->read_timer); + UV_RUNTIME_CHECK(uv_timer_stop, r); +} + +isc__nm_uvreq_t * +isc__nm_get_read_req(isc_nmsocket_t *sock, isc_sockaddr_t *sockaddr) { + isc__nm_uvreq_t *req = NULL; + + req = isc__nm_uvreq_get(sock->mgr, sock); + req->cb.recv = sock->recv_cb; + req->cbarg = sock->recv_cbarg; + + switch (sock->type) { + case isc_nm_tcpsocket: + case isc_nm_tlssocket: + isc_nmhandle_attach(sock->statichandle, &req->handle); + break; + default: + if (atomic_load(&sock->client) && sock->statichandle != NULL) { + isc_nmhandle_attach(sock->statichandle, &req->handle); + } else { + req->handle = isc__nmhandle_get(sock, sockaddr, NULL); + } + break; + } + + return (req); +} + +/*%< + * Allocator callback for read operations. + * + * Note this doesn't actually allocate anything, it just assigns the + * worker's receive buffer to a socket, and marks it as "in use". + */ +void +isc__nm_alloc_cb(uv_handle_t *handle, size_t size, uv_buf_t *buf) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + isc__networker_t *worker = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(isc__nm_in_netthread()); + /* + * The size provided by libuv is only suggested size, and it always + * defaults to 64 * 1024 in the current versions of libuv (see + * src/unix/udp.c and src/unix/stream.c). + */ + UNUSED(size); + + worker = &sock->mgr->workers[sock->tid]; + INSIST(!worker->recvbuf_inuse); + INSIST(worker->recvbuf != NULL); + + switch (sock->type) { + case isc_nm_udpsocket: + buf->len = ISC_NETMGR_UDP_RECVBUF_SIZE; + break; + case isc_nm_tcpsocket: + case isc_nm_tcpdnssocket: + case isc_nm_tlsdnssocket: + buf->len = ISC_NETMGR_TCP_RECVBUF_SIZE; + break; + default: + UNREACHABLE(); + } + + REQUIRE(buf->len <= ISC_NETMGR_RECVBUF_SIZE); + buf->base = worker->recvbuf; + + worker->recvbuf_inuse = true; +} + +isc_result_t +isc__nm_start_reading(isc_nmsocket_t *sock) { + isc_result_t result = ISC_R_SUCCESS; + int r; + + if (atomic_load(&sock->reading)) { + return (ISC_R_SUCCESS); + } + + switch (sock->type) { + case isc_nm_udpsocket: + r = uv_udp_recv_start(&sock->uv_handle.udp, isc__nm_alloc_cb, + isc__nm_udp_read_cb); + break; + case isc_nm_tcpsocket: + r = uv_read_start(&sock->uv_handle.stream, isc__nm_alloc_cb, + isc__nm_tcp_read_cb); + break; + case isc_nm_tcpdnssocket: + r = uv_read_start(&sock->uv_handle.stream, isc__nm_alloc_cb, + isc__nm_tcpdns_read_cb); + break; + case isc_nm_tlsdnssocket: + r = uv_read_start(&sock->uv_handle.stream, isc__nm_alloc_cb, + isc__nm_tlsdns_read_cb); + break; + default: + UNREACHABLE(); + } + if (r != 0) { + result = isc__nm_uverr2result(r); + } else { + atomic_store(&sock->reading, true); + } + + return (result); +} + +void +isc__nm_stop_reading(isc_nmsocket_t *sock) { + int r; + + if (!atomic_load(&sock->reading)) { + return; + } + + switch (sock->type) { + case isc_nm_udpsocket: + r = uv_udp_recv_stop(&sock->uv_handle.udp); + UV_RUNTIME_CHECK(uv_udp_recv_stop, r); + break; + case isc_nm_tcpsocket: + case isc_nm_tcpdnssocket: + case isc_nm_tlsdnssocket: + r = uv_read_stop(&sock->uv_handle.stream); + UV_RUNTIME_CHECK(uv_read_stop, r); + break; + default: + UNREACHABLE(); + } + atomic_store(&sock->reading, false); +} + +bool +isc__nm_closing(isc_nmsocket_t *sock) { + return (atomic_load(&sock->mgr->closing)); +} + +bool +isc__nmsocket_closing(isc_nmsocket_t *sock) { + return (!isc__nmsocket_active(sock) || atomic_load(&sock->closing) || + isc__nm_closing(sock) || + (sock->server != NULL && !isc__nmsocket_active(sock->server))); +} + +static isc_result_t +processbuffer(isc_nmsocket_t *sock) { + switch (sock->type) { + case isc_nm_tcpdnssocket: + return (isc__nm_tcpdns_processbuffer(sock)); + case isc_nm_tlsdnssocket: + return (isc__nm_tlsdns_processbuffer(sock)); + default: + UNREACHABLE(); + } +} + +/* + * Process a DNS message. + * + * If we only have an incomplete DNS message, we don't touch any + * timers. If we do have a full message, reset the timer. + * + * Stop reading if this is a client socket, or if the server socket + * has been set to sequential mode. In this case we'll be called again + * later by isc__nm_resume_processing(). + */ +isc_result_t +isc__nm_process_sock_buffer(isc_nmsocket_t *sock) { + for (;;) { + int_fast32_t ah = atomic_load(&sock->ah); + isc_result_t result = processbuffer(sock); + switch (result) { + case ISC_R_NOMORE: + /* + * Don't reset the timer until we have a + * full DNS message. + */ + result = isc__nm_start_reading(sock); + if (result != ISC_R_SUCCESS) { + return (result); + } + /* + * Start the timer only if there are no externally used + * active handles, there's always one active handle + * attached internally to sock->recv_handle in + * accept_connection() + */ + if (ah == 1) { + isc__nmsocket_timer_start(sock); + } + goto done; + case ISC_R_CANCELED: + isc__nmsocket_timer_stop(sock); + isc__nm_stop_reading(sock); + goto done; + case ISC_R_SUCCESS: + /* + * Stop the timer on the successful message read, this + * also allows to restart the timer when we have no more + * data. + */ + isc__nmsocket_timer_stop(sock); + + if (atomic_load(&sock->client) || + atomic_load(&sock->sequential)) + { + isc__nm_stop_reading(sock); + goto done; + } + break; + default: + UNREACHABLE(); + } + } +done: + return (ISC_R_SUCCESS); +} + +void +isc__nm_resume_processing(void *arg) { + isc_nmsocket_t *sock = (isc_nmsocket_t *)arg; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(!atomic_load(&sock->client)); + + if (isc__nmsocket_closing(sock)) { + return; + } + + isc__nm_process_sock_buffer(sock); +} + +void +isc_nmhandle_cleartimeout(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + switch (handle->sock->type) { +#if HAVE_LIBNGHTTP2 + case isc_nm_httpsocket: + isc__nm_http_cleartimeout(handle); + return; + case isc_nm_tlssocket: + isc__nm_tls_cleartimeout(handle); + return; +#endif + default: + handle->sock->read_timeout = 0; + + if (uv_is_active((uv_handle_t *)&handle->sock->read_timer)) { + isc__nmsocket_timer_stop(handle->sock); + } + } +} + +void +isc_nmhandle_settimeout(isc_nmhandle_t *handle, uint32_t timeout) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + switch (handle->sock->type) { +#if HAVE_LIBNGHTTP2 + case isc_nm_httpsocket: + isc__nm_http_settimeout(handle, timeout); + return; + case isc_nm_tlssocket: + isc__nm_tls_settimeout(handle, timeout); + return; +#endif + default: + handle->sock->read_timeout = timeout; + isc__nmsocket_timer_restart(handle->sock); + } +} + +void +isc_nmhandle_keepalive(isc_nmhandle_t *handle, bool value) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + sock = handle->sock; + + switch (sock->type) { + case isc_nm_tcpsocket: + case isc_nm_tcpdnssocket: + case isc_nm_tlsdnssocket: + atomic_store(&sock->keepalive, value); + sock->read_timeout = value ? atomic_load(&sock->mgr->keepalive) + : atomic_load(&sock->mgr->idle); + sock->write_timeout = value ? atomic_load(&sock->mgr->keepalive) + : atomic_load(&sock->mgr->idle); + break; +#if HAVE_LIBNGHTTP2 + case isc_nm_tlssocket: + isc__nmhandle_tls_keepalive(handle, value); + break; + case isc_nm_httpsocket: + isc__nmhandle_http_keepalive(handle, value); + break; +#endif /* HAVE_LIBNGHTTP2 */ + default: + /* + * For any other protocol, this is a no-op. + */ + return; + } +} + +bool +isc_nmhandle_timer_running(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + return (isc__nmsocket_timer_running(handle->sock)); +} + +void * +isc_nmhandle_getextra(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + + return (handle->extra); +} + +isc_sockaddr_t +isc_nmhandle_peeraddr(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + + return (handle->peer); +} + +isc_sockaddr_t +isc_nmhandle_localaddr(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + + return (handle->local); +} + +isc_nm_t * +isc_nmhandle_netmgr(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + return (handle->sock->mgr); +} + +isc__nm_uvreq_t * +isc___nm_uvreq_get(isc_nm_t *mgr, isc_nmsocket_t *sock FLARG) { + isc__nm_uvreq_t *req = NULL; + + REQUIRE(VALID_NM(mgr)); + REQUIRE(VALID_NMSOCK(sock)); + + if (sock != NULL && isc__nmsocket_active(sock)) { + /* Try to reuse one */ + req = isc_astack_pop(sock->inactivereqs); + } + + if (req == NULL) { + req = isc_mem_get(mgr->mctx, sizeof(*req)); + } + + *req = (isc__nm_uvreq_t){ + .magic = 0, + .connect_tries = 3, + }; + ISC_LINK_INIT(req, link); + req->uv_req.req.data = req; + isc___nmsocket_attach(sock, &req->sock FLARG_PASS); + req->magic = UVREQ_MAGIC; + + return (req); +} + +void +isc___nm_uvreq_put(isc__nm_uvreq_t **req0, isc_nmsocket_t *sock FLARG) { + isc__nm_uvreq_t *req = NULL; + isc_nmhandle_t *handle = NULL; + + REQUIRE(req0 != NULL); + REQUIRE(VALID_UVREQ(*req0)); + + req = *req0; + *req0 = NULL; + + INSIST(sock == req->sock); + + req->magic = 0; + + /* + * We need to save this first to make sure that handle, + * sock, and the netmgr won't all disappear. + */ + handle = req->handle; + req->handle = NULL; + +#if !__SANITIZE_ADDRESS__ && !__SANITIZE_THREAD__ + if (!isc__nmsocket_active(sock) || + !isc_astack_trypush(sock->inactivereqs, req)) + { + isc_mem_put(sock->mgr->mctx, req, sizeof(*req)); + } +#else /* !__SANITIZE_ADDRESS__ && !__SANITIZE_THREAD__ */ + isc_mem_put(sock->mgr->mctx, req, sizeof(*req)); +#endif /* !__SANITIZE_ADDRESS__ && !__SANITIZE_THREAD__ */ + + if (handle != NULL) { + isc__nmhandle_detach(&handle FLARG_PASS); + } + + isc___nmsocket_detach(&sock FLARG_PASS); +} + +void +isc_nm_send(isc_nmhandle_t *handle, isc_region_t *region, isc_nm_cb_t cb, + void *cbarg) { + REQUIRE(VALID_NMHANDLE(handle)); + + switch (handle->sock->type) { + case isc_nm_udpsocket: + case isc_nm_udplistener: + isc__nm_udp_send(handle, region, cb, cbarg); + break; + case isc_nm_tcpsocket: + isc__nm_tcp_send(handle, region, cb, cbarg); + break; + case isc_nm_tcpdnssocket: + isc__nm_tcpdns_send(handle, region, cb, cbarg); + break; + case isc_nm_tlsdnssocket: + isc__nm_tlsdns_send(handle, region, cb, cbarg); + break; +#if HAVE_LIBNGHTTP2 + case isc_nm_tlssocket: + isc__nm_tls_send(handle, region, cb, cbarg); + break; + case isc_nm_httpsocket: + isc__nm_http_send(handle, region, cb, cbarg); + break; +#endif + default: + UNREACHABLE(); + } +} + +void +isc_nm_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg) { + REQUIRE(VALID_NMHANDLE(handle)); + + switch (handle->sock->type) { + case isc_nm_udpsocket: + isc__nm_udp_read(handle, cb, cbarg); + break; + case isc_nm_tcpsocket: + isc__nm_tcp_read(handle, cb, cbarg); + break; + case isc_nm_tcpdnssocket: + isc__nm_tcpdns_read(handle, cb, cbarg); + break; + case isc_nm_tlsdnssocket: + isc__nm_tlsdns_read(handle, cb, cbarg); + break; +#if HAVE_LIBNGHTTP2 + case isc_nm_tlssocket: + isc__nm_tls_read(handle, cb, cbarg); + break; + case isc_nm_httpsocket: + isc__nm_http_read(handle, cb, cbarg); + break; +#endif + default: + UNREACHABLE(); + } +} + +void +isc_nm_cancelread(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + + switch (handle->sock->type) { + case isc_nm_udpsocket: + isc__nm_udp_cancelread(handle); + break; + case isc_nm_tcpsocket: + isc__nm_tcp_cancelread(handle); + break; + case isc_nm_tcpdnssocket: + isc__nm_tcpdns_cancelread(handle); + break; + case isc_nm_tlsdnssocket: + isc__nm_tlsdns_cancelread(handle); + break; +#if HAVE_LIBNGHTTP2 + case isc_nm_tlssocket: + isc__nm_tls_cancelread(handle); + break; +#endif + default: + UNREACHABLE(); + } +} + +void +isc_nm_pauseread(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + + isc_nmsocket_t *sock = handle->sock; + + switch (sock->type) { + case isc_nm_tcpsocket: + isc__nm_tcp_pauseread(handle); + break; +#if HAVE_LIBNGHTTP2 + case isc_nm_tlssocket: + isc__nm_tls_pauseread(handle); + break; +#endif + default: + UNREACHABLE(); + } +} + +void +isc_nm_resumeread(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + + isc_nmsocket_t *sock = handle->sock; + + switch (sock->type) { + case isc_nm_tcpsocket: + isc__nm_tcp_resumeread(handle); + break; +#if HAVE_LIBNGHTTP2 + case isc_nm_tlssocket: + isc__nm_tls_resumeread(handle); + break; +#endif + default: + UNREACHABLE(); + } +} + +void +isc_nm_stoplistening(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + + switch (sock->type) { + case isc_nm_udplistener: + isc__nm_udp_stoplistening(sock); + break; + case isc_nm_tcpdnslistener: + isc__nm_tcpdns_stoplistening(sock); + break; + case isc_nm_tcplistener: + isc__nm_tcp_stoplistening(sock); + break; + case isc_nm_tlsdnslistener: + isc__nm_tlsdns_stoplistening(sock); + break; +#if HAVE_LIBNGHTTP2 + case isc_nm_tlslistener: + isc__nm_tls_stoplistening(sock); + break; + case isc_nm_httplistener: + isc__nm_http_stoplistening(sock); + break; +#endif + default: + UNREACHABLE(); + } +} + +void +isc__nmsocket_stop(isc_nmsocket_t *listener) { + isc__netievent_sockstop_t ievent = { .sock = listener }; + + REQUIRE(VALID_NMSOCK(listener)); + + if (!atomic_compare_exchange_strong(&listener->closing, + &(bool){ false }, true)) + { + UNREACHABLE(); + } + + for (size_t i = 0; i < listener->nchildren; i++) { + isc__networker_t *worker = &listener->mgr->workers[i]; + isc__netievent_sockstop_t *ev; + + if (isc__nm_in_netthread() && i == (size_t)isc_nm_tid()) { + continue; + } + + ev = isc__nm_get_netievent_sockstop(listener->mgr, listener); + isc__nm_enqueue_ievent(worker, (isc__netievent_t *)ev); + } + + if (isc__nm_in_netthread()) { + isc__nm_async_sockstop(&listener->mgr->workers[isc_nm_tid()], + (isc__netievent_t *)&ievent); + } +} + +void +isc__nmsocket_barrier_init(isc_nmsocket_t *listener) { + REQUIRE(listener->nchildren > 0); + isc_barrier_init(&listener->barrier, listener->nchildren); + listener->barrier_initialised = true; +} + +void +isc__nm_async_sockstop(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_sockstop_t *ievent = (isc__netievent_sockstop_t *)ev0; + isc_nmsocket_t *listener = ievent->sock; + UNUSED(worker); + + (void)atomic_fetch_sub(&listener->rchildren, 1); + isc_barrier_wait(&listener->barrier); + + if (listener->tid != isc_nm_tid()) { + return; + } + + if (!atomic_compare_exchange_strong(&listener->listening, + &(bool){ true }, false)) + { + UNREACHABLE(); + } + + INSIST(atomic_load(&listener->rchildren) == 0); + + listener->accept_cb = NULL; + listener->accept_cbarg = NULL; + listener->recv_cb = NULL; + listener->recv_cbarg = NULL; + + if (listener->outer != NULL) { + isc_nm_stoplistening(listener->outer); + isc__nmsocket_detach(&listener->outer); + } + + atomic_store(&listener->closed, true); +} + +void +isc__nm_connectcb(isc_nmsocket_t *sock, isc__nm_uvreq_t *uvreq, + isc_result_t eresult, bool async) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(uvreq)); + REQUIRE(VALID_NMHANDLE(uvreq->handle)); + + if (!async) { + isc__netievent_connectcb_t ievent = { .sock = sock, + .req = uvreq, + .result = eresult }; + isc__nm_async_connectcb(NULL, (isc__netievent_t *)&ievent); + } else { + isc__netievent_connectcb_t *ievent = + isc__nm_get_netievent_connectcb(sock->mgr, sock, uvreq, + eresult); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + } +} + +void +isc__nm_async_connectcb(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_connectcb_t *ievent = (isc__netievent_connectcb_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *uvreq = ievent->req; + isc_result_t eresult = ievent->result; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(uvreq)); + REQUIRE(VALID_NMHANDLE(uvreq->handle)); + REQUIRE(ievent->sock->tid == isc_nm_tid()); + REQUIRE(uvreq->cb.connect != NULL); + + uvreq->cb.connect(uvreq->handle, eresult, uvreq->cbarg); + + isc__nm_uvreq_put(&uvreq, sock); +} + +void +isc__nm_readcb(isc_nmsocket_t *sock, isc__nm_uvreq_t *uvreq, + isc_result_t eresult) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(uvreq)); + REQUIRE(VALID_NMHANDLE(uvreq->handle)); + + if (eresult == ISC_R_SUCCESS || eresult == ISC_R_TIMEDOUT) { + isc__netievent_readcb_t ievent = { .sock = sock, + .req = uvreq, + .result = eresult }; + + isc__nm_async_readcb(NULL, (isc__netievent_t *)&ievent); + } else { + isc__netievent_readcb_t *ievent = isc__nm_get_netievent_readcb( + sock->mgr, sock, uvreq, eresult); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + } +} + +void +isc__nm_async_readcb(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_readcb_t *ievent = (isc__netievent_readcb_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *uvreq = ievent->req; + isc_result_t eresult = ievent->result; + isc_region_t region; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(uvreq)); + REQUIRE(VALID_NMHANDLE(uvreq->handle)); + REQUIRE(sock->tid == isc_nm_tid()); + + region.base = (unsigned char *)uvreq->uvbuf.base; + region.length = uvreq->uvbuf.len; + + uvreq->cb.recv(uvreq->handle, eresult, ®ion, uvreq->cbarg); + + isc__nm_uvreq_put(&uvreq, sock); +} + +void +isc__nm_sendcb(isc_nmsocket_t *sock, isc__nm_uvreq_t *uvreq, + isc_result_t eresult, bool async) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(uvreq)); + REQUIRE(VALID_NMHANDLE(uvreq->handle)); + + if (!async) { + isc__netievent_sendcb_t ievent = { .sock = sock, + .req = uvreq, + .result = eresult }; + isc__nm_async_sendcb(NULL, (isc__netievent_t *)&ievent); + return; + } + + isc__netievent_sendcb_t *ievent = + isc__nm_get_netievent_sendcb(sock->mgr, sock, uvreq, eresult); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +void +isc__nm_async_sendcb(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_sendcb_t *ievent = (isc__netievent_sendcb_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *uvreq = ievent->req; + isc_result_t eresult = ievent->result; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(uvreq)); + REQUIRE(VALID_NMHANDLE(uvreq->handle)); + REQUIRE(sock->tid == isc_nm_tid()); + + uvreq->cb.send(uvreq->handle, eresult, uvreq->cbarg); + + isc__nm_uvreq_put(&uvreq, sock); +} + +static void +isc__nm_async_close(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_close_t *ievent = (isc__netievent_close_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + REQUIRE(VALID_NMSOCK(ievent->sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->closehandle_cb != NULL); + + UNUSED(worker); + + ievent->sock->closehandle_cb(sock); +} + +void +isc__nm_async_detach(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_detach_t *ievent = (isc__netievent_detach_t *)ev0; + FLARG_IEVENT(ievent); + + REQUIRE(VALID_NMSOCK(ievent->sock)); + REQUIRE(VALID_NMHANDLE(ievent->handle)); + REQUIRE(ievent->sock->tid == isc_nm_tid()); + + UNUSED(worker); + + nmhandle_detach_cb(&ievent->handle FLARG_PASS); +} + +static void +reset_shutdown(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + + isc__nmsocket_shutdown(sock); + isc__nmsocket_detach(&sock); +} + +void +isc__nmsocket_reset(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + + switch (sock->type) { + case isc_nm_tcpsocket: + case isc_nm_tcpdnssocket: + case isc_nm_tlsdnssocket: + /* + * This can be called from the TCP write timeout, or + * from the TCPDNS or TLSDNS branches of isc_nm_bad_request(). + */ + REQUIRE(sock->parent == NULL); + break; + default: + UNREACHABLE(); + break; + } + + if (!uv_is_closing(&sock->uv_handle.handle) && + uv_is_active(&sock->uv_handle.handle)) + { + /* + * The real shutdown will be handled in the respective + * close functions. + */ + isc__nmsocket_attach(sock, &(isc_nmsocket_t *){ NULL }); + int r = uv_tcp_close_reset(&sock->uv_handle.tcp, + reset_shutdown); + UV_RUNTIME_CHECK(uv_tcp_close_reset, r); + } else { + isc__nmsocket_shutdown(sock); + } +} + +void +isc__nmsocket_shutdown(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + switch (sock->type) { + case isc_nm_udpsocket: + isc__nm_udp_shutdown(sock); + break; + case isc_nm_tcpsocket: + isc__nm_tcp_shutdown(sock); + break; + case isc_nm_tcpdnssocket: + isc__nm_tcpdns_shutdown(sock); + break; + case isc_nm_tlsdnssocket: + isc__nm_tlsdns_shutdown(sock); + break; + case isc_nm_udplistener: + case isc_nm_tcplistener: + case isc_nm_tcpdnslistener: + case isc_nm_tlsdnslistener: + return; + default: + UNREACHABLE(); + } +} + +static void +shutdown_walk_cb(uv_handle_t *handle, void *arg) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + UNUSED(arg); + + if (uv_is_closing(handle)) { + return; + } + + switch (handle->type) { + case UV_UDP: + isc__nmsocket_shutdown(sock); + return; + case UV_TCP: + switch (sock->type) { + case isc_nm_tcpsocket: + case isc_nm_tcpdnssocket: + case isc_nm_tlsdnssocket: + if (sock->parent == NULL) { + /* Reset the TCP connections on shutdown */ + isc__nmsocket_reset(sock); + return; + } + FALLTHROUGH; + default: + isc__nmsocket_shutdown(sock); + } + + return; + default: + return; + } +} + +void +isc__nm_async_shutdown(isc__networker_t *worker, isc__netievent_t *ev0) { + UNUSED(ev0); + + uv_walk(&worker->loop, shutdown_walk_cb, NULL); +} + +bool +isc__nm_acquire_interlocked(isc_nm_t *mgr) { + if (!isc__nm_in_netthread()) { + return (false); + } + + LOCK(&mgr->lock); + bool success = atomic_compare_exchange_strong( + &mgr->interlocked, &(int){ ISC_NETMGR_NON_INTERLOCKED }, + isc_nm_tid()); + + UNLOCK(&mgr->lock); + return (success); +} + +void +isc__nm_drop_interlocked(isc_nm_t *mgr) { + if (!isc__nm_in_netthread()) { + return; + } + + LOCK(&mgr->lock); + int tid = atomic_exchange(&mgr->interlocked, + ISC_NETMGR_NON_INTERLOCKED); + INSIST(tid != ISC_NETMGR_NON_INTERLOCKED); + BROADCAST(&mgr->wkstatecond); + UNLOCK(&mgr->lock); +} + +void +isc__nm_acquire_interlocked_force(isc_nm_t *mgr) { + if (!isc__nm_in_netthread()) { + return; + } + + LOCK(&mgr->lock); + while (!atomic_compare_exchange_strong( + &mgr->interlocked, &(int){ ISC_NETMGR_NON_INTERLOCKED }, + isc_nm_tid())) + { + WAIT(&mgr->wkstatecond, &mgr->lock); + } + UNLOCK(&mgr->lock); +} + +void +isc_nm_setstats(isc_nm_t *mgr, isc_stats_t *stats) { + REQUIRE(VALID_NM(mgr)); + REQUIRE(mgr->stats == NULL); + REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max); + + isc_stats_attach(stats, &mgr->stats); +} + +void +isc__nm_incstats(isc_nmsocket_t *sock, isc__nm_statid_t id) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(id < STATID_MAX); + + if (sock->statsindex != NULL && sock->mgr->stats != NULL) { + isc_stats_increment(sock->mgr->stats, sock->statsindex[id]); + } +} + +void +isc__nm_decstats(isc_nmsocket_t *sock, isc__nm_statid_t id) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(id < STATID_MAX); + + if (sock->statsindex != NULL && sock->mgr->stats != NULL) { + isc_stats_decrement(sock->mgr->stats, sock->statsindex[id]); + } +} + +isc_result_t +isc__nm_socket(int domain, int type, int protocol, uv_os_sock_t *sockp) { + int sock = socket(domain, type, protocol); + if (sock < 0) { + return (isc_errno_toresult(errno)); + } + + *sockp = (uv_os_sock_t)sock; + return (ISC_R_SUCCESS); +} + +void +isc__nm_closesocket(uv_os_sock_t sock) { + close(sock); +} + +#define setsockopt_on(socket, level, name) \ + setsockopt(socket, level, name, &(int){ 1 }, sizeof(int)) + +#define setsockopt_off(socket, level, name) \ + setsockopt(socket, level, name, &(int){ 0 }, sizeof(int)) + +isc_result_t +isc__nm_socket_freebind(uv_os_sock_t fd, sa_family_t sa_family) { + /* + * Set the IP_FREEBIND (or equivalent option) on the uv_handle. + */ +#ifdef IP_FREEBIND + UNUSED(sa_family); + if (setsockopt_on(fd, IPPROTO_IP, IP_FREEBIND) == -1) { + return (ISC_R_FAILURE); + } + return (ISC_R_SUCCESS); +#elif defined(IP_BINDANY) || defined(IPV6_BINDANY) + if (sa_family == AF_INET) { +#if defined(IP_BINDANY) + if (setsockopt_on(fd, IPPROTO_IP, IP_BINDANY) == -1) { + return (ISC_R_FAILURE); + } + return (ISC_R_SUCCESS); +#endif + } else if (sa_family == AF_INET6) { +#if defined(IPV6_BINDANY) + if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_BINDANY) == -1) { + return (ISC_R_FAILURE); + } + return (ISC_R_SUCCESS); +#endif + } + return (ISC_R_NOTIMPLEMENTED); +#elif defined(SO_BINDANY) + UNUSED(sa_family); + if (setsockopt_on(fd, SOL_SOCKET, SO_BINDANY) == -1) { + return (ISC_R_FAILURE); + } + return (ISC_R_SUCCESS); +#else + UNUSED(fd); + UNUSED(sa_family); + return (ISC_R_NOTIMPLEMENTED); +#endif +} + +isc_result_t +isc__nm_socket_reuse(uv_os_sock_t fd) { + /* + * Generally, the SO_REUSEADDR socket option allows reuse of + * local addresses. + * + * On the BSDs, SO_REUSEPORT implies SO_REUSEADDR but with some + * additional refinements for programs that use multicast. + * + * On Linux, SO_REUSEPORT has different semantics: it _shares_ the port + * rather than steal it from the current listener, so we don't use it + * here, but rather in isc__nm_socket_reuse_lb(). + * + * On Windows, it also allows a socket to forcibly bind to a port in use + * by another socket. + */ + +#if defined(SO_REUSEPORT) && !defined(__linux__) + if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT) == -1) { + return (ISC_R_FAILURE); + } + return (ISC_R_SUCCESS); +#elif defined(SO_REUSEADDR) + if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEADDR) == -1) { + return (ISC_R_FAILURE); + } + return (ISC_R_SUCCESS); +#else + UNUSED(fd); + return (ISC_R_NOTIMPLEMENTED); +#endif +} + +isc_result_t +isc__nm_socket_reuse_lb(uv_os_sock_t fd) { + /* + * On FreeBSD 12+, SO_REUSEPORT_LB socket option allows sockets to be + * bound to an identical socket address. For UDP sockets, the use of + * this option can provide better distribution of incoming datagrams to + * multiple processes (or threads) as compared to the traditional + * technique of having multiple processes compete to receive datagrams + * on the same socket. + * + * On Linux, the same thing is achieved simply with SO_REUSEPORT. + */ +#if defined(SO_REUSEPORT_LB) + if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT_LB) == -1) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#elif defined(SO_REUSEPORT) && defined(__linux__) + if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT) == -1) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#else + UNUSED(fd); + return (ISC_R_NOTIMPLEMENTED); +#endif +} + +isc_result_t +isc__nm_socket_incoming_cpu(uv_os_sock_t fd) { +#ifdef SO_INCOMING_CPU + if (setsockopt_on(fd, SOL_SOCKET, SO_INCOMING_CPU) == -1) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#else + UNUSED(fd); +#endif + return (ISC_R_NOTIMPLEMENTED); +} + +isc_result_t +isc__nm_socket_disable_pmtud(uv_os_sock_t fd, sa_family_t sa_family) { + /* + * Disable the Path MTU Discovery on IP packets + */ + if (sa_family == AF_INET6) { +#if defined(IPV6_DONTFRAG) + if (setsockopt_off(fd, IPPROTO_IPV6, IPV6_DONTFRAG) == -1) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#elif defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT) + if (setsockopt(fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER, + &(int){ IP_PMTUDISC_OMIT }, sizeof(int)) == -1) + { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#else + UNUSED(fd); +#endif + } else if (sa_family == AF_INET) { +#if defined(IP_DONTFRAG) + if (setsockopt_off(fd, IPPROTO_IP, IP_DONTFRAG) == -1) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#elif defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT) + if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, + &(int){ IP_PMTUDISC_OMIT }, sizeof(int)) == -1) + { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#else + UNUSED(fd); +#endif + } else { + return (ISC_R_FAMILYNOSUPPORT); + } + + return (ISC_R_NOTIMPLEMENTED); +} + +isc_result_t +isc__nm_socket_v6only(uv_os_sock_t fd, sa_family_t sa_family) { + /* + * Enable the IPv6-only option on IPv6 sockets + */ + if (sa_family == AF_INET6) { +#if defined(IPV6_V6ONLY) + if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_V6ONLY) == -1) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#else + UNUSED(fd); +#endif + } + return (ISC_R_NOTIMPLEMENTED); +} + +isc_result_t +isc_nm_checkaddr(const isc_sockaddr_t *addr, isc_socktype_t type) { + int proto, pf, addrlen, fd, r; + + REQUIRE(addr != NULL); + + switch (type) { + case isc_socktype_tcp: + proto = SOCK_STREAM; + break; + case isc_socktype_udp: + proto = SOCK_DGRAM; + break; + default: + return (ISC_R_NOTIMPLEMENTED); + } + + pf = isc_sockaddr_pf(addr); + if (pf == AF_INET) { + addrlen = sizeof(struct sockaddr_in); + } else { + addrlen = sizeof(struct sockaddr_in6); + } + + fd = socket(pf, proto, 0); + if (fd < 0) { + return (isc_errno_toresult(errno)); + } + + r = bind(fd, (const struct sockaddr *)&addr->type.sa, addrlen); + if (r < 0) { + close(fd); + return (isc_errno_toresult(errno)); + } + + close(fd); + return (ISC_R_SUCCESS); +} + +#if defined(TCP_CONNECTIONTIMEOUT) +#define TIMEOUT_TYPE int +#define TIMEOUT_DIV 1000 +#define TIMEOUT_OPTNAME TCP_CONNECTIONTIMEOUT +#elif defined(TCP_RXT_CONNDROPTIME) +#define TIMEOUT_TYPE int +#define TIMEOUT_DIV 1000 +#define TIMEOUT_OPTNAME TCP_RXT_CONNDROPTIME +#elif defined(TCP_USER_TIMEOUT) +#define TIMEOUT_TYPE unsigned int +#define TIMEOUT_DIV 1 +#define TIMEOUT_OPTNAME TCP_USER_TIMEOUT +#elif defined(TCP_KEEPINIT) +#define TIMEOUT_TYPE int +#define TIMEOUT_DIV 1000 +#define TIMEOUT_OPTNAME TCP_KEEPINIT +#endif + +isc_result_t +isc__nm_socket_connectiontimeout(uv_os_sock_t fd, int timeout_ms) { +#if defined(TIMEOUT_OPTNAME) + TIMEOUT_TYPE timeout = timeout_ms / TIMEOUT_DIV; + + if (timeout == 0) { + timeout = 1; + } + + if (setsockopt(fd, IPPROTO_TCP, TIMEOUT_OPTNAME, &timeout, + sizeof(timeout)) == -1) + { + return (ISC_R_FAILURE); + } + + return (ISC_R_SUCCESS); +#else + UNUSED(fd); + UNUSED(timeout_ms); + + return (ISC_R_SUCCESS); +#endif +} + +isc_result_t +isc__nm_socket_tcp_nodelay(uv_os_sock_t fd) { +#ifdef TCP_NODELAY + if (setsockopt_on(fd, IPPROTO_TCP, TCP_NODELAY) == -1) { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#else + UNUSED(fd); + return (ISC_R_SUCCESS); +#endif +} + +isc_result_t +isc__nm_socket_tcp_maxseg(uv_os_sock_t fd, int size) { +#ifdef TCP_MAXSEG + if (setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, (void *)&size, + sizeof(size))) + { + return (ISC_R_FAILURE); + } else { + return (ISC_R_SUCCESS); + } +#else + UNUSED(fd); + UNUSED(size); + return (ISC_R_SUCCESS); +#endif +} + +isc_result_t +isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) { + if (sa_family != AF_INET6) { + return (ISC_R_SUCCESS); + } +#ifdef IPV6_USE_MIN_MTU + if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU) == -1) { + return (ISC_R_FAILURE); + } +#elif defined(IPV6_MTU) + if (setsockopt(fd, IPPROTO_IPV6, IPV6_MTU, &(int){ 1280 }, + sizeof(int)) == -1) + { + return (ISC_R_FAILURE); + } +#else + UNUSED(fd); +#endif + + return (ISC_R_SUCCESS); +} + +void +isc__nm_set_network_buffers(isc_nm_t *nm, uv_handle_t *handle) { + int32_t recv_buffer_size = 0; + int32_t send_buffer_size = 0; + + switch (handle->type) { + case UV_TCP: + recv_buffer_size = + atomic_load_relaxed(&nm->recv_tcp_buffer_size); + send_buffer_size = + atomic_load_relaxed(&nm->send_tcp_buffer_size); + break; + case UV_UDP: + recv_buffer_size = + atomic_load_relaxed(&nm->recv_udp_buffer_size); + send_buffer_size = + atomic_load_relaxed(&nm->send_udp_buffer_size); + break; + default: + UNREACHABLE(); + } + + if (recv_buffer_size > 0) { + int r = uv_recv_buffer_size(handle, &recv_buffer_size); + UV_RUNTIME_CHECK(uv_recv_buffer_size, r); + } + + if (send_buffer_size > 0) { + int r = uv_send_buffer_size(handle, &send_buffer_size); + UV_RUNTIME_CHECK(uv_send_buffer_size, r); + } +} + +static isc_threadresult_t +isc__nm_work_run(isc_threadarg_t arg) { + isc__nm_work_t *work = (isc__nm_work_t *)arg; + + work->cb(work->data); + + return ((isc_threadresult_t)0); +} + +static void +isc__nm_work_cb(uv_work_t *req) { + isc__nm_work_t *work = uv_req_get_data((uv_req_t *)req); + + if (isc_tid_v == SIZE_MAX) { + isc__trampoline_t *trampoline_arg = + isc__trampoline_get(isc__nm_work_run, work); + (void)isc__trampoline_run(trampoline_arg); + } else { + (void)isc__nm_work_run((isc_threadarg_t)work); + } +} + +static void +isc__nm_after_work_cb(uv_work_t *req, int status) { + isc_result_t result = ISC_R_SUCCESS; + isc__nm_work_t *work = uv_req_get_data((uv_req_t *)req); + isc_nm_t *netmgr = work->netmgr; + + if (status != 0) { + result = isc__nm_uverr2result(status); + } + + work->after_cb(work->data, result); + + isc_mem_put(netmgr->mctx, work, sizeof(*work)); + + isc_nm_detach(&netmgr); +} + +void +isc_nm_work_offload(isc_nm_t *netmgr, isc_nm_workcb_t work_cb, + isc_nm_after_workcb_t after_work_cb, void *data) { + isc__networker_t *worker = NULL; + isc__nm_work_t *work = NULL; + int r; + + REQUIRE(isc__nm_in_netthread()); + REQUIRE(VALID_NM(netmgr)); + + worker = &netmgr->workers[isc_nm_tid()]; + + work = isc_mem_get(netmgr->mctx, sizeof(*work)); + *work = (isc__nm_work_t){ + .cb = work_cb, + .after_cb = after_work_cb, + .data = data, + }; + + isc_nm_attach(netmgr, &work->netmgr); + + uv_req_set_data((uv_req_t *)&work->req, work); + + r = uv_queue_work(&worker->loop, &work->req, isc__nm_work_cb, + isc__nm_after_work_cb); + UV_RUNTIME_CHECK(uv_queue_work, r); +} + +void +isc_nm_sequential(isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + sock = handle->sock; + + switch (sock->type) { + case isc_nm_tcpdnssocket: + case isc_nm_tlsdnssocket: + break; + case isc_nm_httpsocket: + return; + default: + UNREACHABLE(); + } + + /* + * We don't want pipelining on this connection. That means + * that we need to pause after reading each request, and + * resume only after the request has been processed. This + * is done in isc__nm_resume_processing(), which is the + * socket's closehandle_cb callback, called whenever a handle + * is released. + */ + isc__nmsocket_timer_stop(sock); + isc__nm_stop_reading(sock); + atomic_store(&sock->sequential, true); +} + +void +isc_nm_bad_request(isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + sock = handle->sock; + + switch (sock->type) { + case isc_nm_udpsocket: + return; + case isc_nm_tcpdnssocket: + case isc_nm_tlsdnssocket: + REQUIRE(sock->parent == NULL); + isc__nmsocket_reset(sock); + return; +#if HAVE_LIBNGHTTP2 + case isc_nm_httpsocket: + isc__nm_http_bad_request(handle); + return; +#endif /* HAVE_LIBNGHTTP2 */ + case isc_nm_tcpsocket: +#if HAVE_LIBNGHTTP2 + case isc_nm_tlssocket: +#endif /* HAVE_LIBNGHTTP2 */ + default: + UNREACHABLE(); + break; + } +} + +isc_result_t +isc_nm_xfr_checkperm(isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + isc_result_t result = ISC_R_NOPERM; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + sock = handle->sock; + + switch (sock->type) { + case isc_nm_tcpdnssocket: + result = ISC_R_SUCCESS; + break; + case isc_nm_tlsdnssocket: + result = isc__nm_tlsdns_xfr_checkperm(sock); + break; + default: + break; + } + + return (result); +} + +bool +isc_nm_is_http_handle(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + return (handle->sock->type == isc_nm_httpsocket); +} + +void +isc_nm_set_maxage(isc_nmhandle_t *handle, const uint32_t ttl) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(!atomic_load(&handle->sock->client)); + +#if !HAVE_LIBNGHTTP2 + UNUSED(ttl); +#endif + + sock = handle->sock; + switch (sock->type) { +#if HAVE_LIBNGHTTP2 + case isc_nm_httpsocket: + isc__nm_http_set_maxage(handle, ttl); + break; +#endif /* HAVE_LIBNGHTTP2 */ + case isc_nm_udpsocket: + case isc_nm_tcpdnssocket: + case isc_nm_tlsdnssocket: + return; + break; + + case isc_nm_tcpsocket: +#if HAVE_LIBNGHTTP2 + case isc_nm_tlssocket: +#endif /* HAVE_LIBNGHTTP2 */ + default: + UNREACHABLE(); + break; + } +} + +isc_nmsocket_type +isc_nm_socket_type(const isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + return (handle->sock->type); +} + +bool +isc_nm_has_encryption(const isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + switch (handle->sock->type) { + case isc_nm_tlsdnssocket: +#if HAVE_LIBNGHTTP2 + case isc_nm_tlssocket: +#endif /* HAVE_LIBNGHTTP2 */ + return (true); +#if HAVE_LIBNGHTTP2 + case isc_nm_httpsocket: + return (isc__nm_http_has_encryption(handle)); +#endif /* HAVE_LIBNGHTTP2 */ + default: + return (false); + }; + + return (false); +} + +void +isc__nm_async_settlsctx(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent__tlsctx_t *ev_tlsctx = (isc__netievent__tlsctx_t *)ev0; + const int tid = isc_nm_tid(); + isc_nmsocket_t *listener = ev_tlsctx->sock; + isc_tlsctx_t *tlsctx = ev_tlsctx->tlsctx; + + UNUSED(worker); + + switch (listener->type) { + case isc_nm_tlsdnslistener: + isc__nm_async_tlsdns_set_tlsctx(listener, tlsctx, tid); + break; +#if HAVE_LIBNGHTTP2 + case isc_nm_tlslistener: + isc__nm_async_tls_set_tlsctx(listener, tlsctx, tid); + break; +#endif /* HAVE_LIBNGHTTP2 */ + default: + UNREACHABLE(); + break; + }; +} + +static void +set_tlsctx_workers(isc_nmsocket_t *listener, isc_tlsctx_t *tlsctx) { + /* Update the TLS context reference for every worker thread. */ + for (size_t i = 0; i < (size_t)listener->mgr->nworkers; i++) { + isc__netievent__tlsctx_t *ievent = + isc__nm_get_netievent_settlsctx(listener->mgr, listener, + tlsctx); + isc__nm_enqueue_ievent(&listener->mgr->workers[i], + (isc__netievent_t *)ievent); + } +} + +void +isc_nmsocket_set_tlsctx(isc_nmsocket_t *listener, isc_tlsctx_t *tlsctx) { + REQUIRE(VALID_NMSOCK(listener)); + REQUIRE(tlsctx != NULL); + + switch (listener->type) { +#if HAVE_LIBNGHTTP2 + case isc_nm_httplistener: + /* + * We handle HTTP listener sockets differently, as they rely + * on underlying TLS sockets for networking. The TLS context + * will get passed to these underlying sockets via the call to + * isc__nm_http_set_tlsctx(). + */ + isc__nm_http_set_tlsctx(listener, tlsctx); + break; + case isc_nm_tlslistener: + set_tlsctx_workers(listener, tlsctx); + break; +#endif /* HAVE_LIBNGHTTP2 */ + case isc_nm_tlsdnslistener: + set_tlsctx_workers(listener, tlsctx); + break; + default: + UNREACHABLE(); + break; + }; +} + +const char * +isc_nm_verify_tls_peer_result_string(const isc_nmhandle_t *handle) { + isc_nmsocket_t *sock; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + sock = handle->sock; + switch (sock->type) { + case isc_nm_tlsdnssocket: + return (isc__nm_tlsdns_verify_tls_peer_result_string(handle)); + break; +#if HAVE_LIBNGHTTP2 + case isc_nm_tlssocket: + return (isc__nm_tls_verify_tls_peer_result_string(handle)); + break; + case isc_nm_httpsocket: + return (isc__nm_http_verify_tls_peer_result_string(handle)); + break; +#endif /* HAVE_LIBNGHTTP2 */ + default: + break; + } + + return (NULL); +} + +void +isc_nmsocket_set_max_streams(isc_nmsocket_t *listener, + const uint32_t max_streams) { + REQUIRE(VALID_NMSOCK(listener)); + switch (listener->type) { +#if HAVE_LIBNGHTTP2 + case isc_nm_httplistener: + isc__nm_http_set_max_streams(listener, max_streams); + break; +#endif /* HAVE_LIBNGHTTP2 */ + default: + UNUSED(max_streams); + break; + }; + return; +} + +void +isc__nmsocket_log_tls_session_reuse(isc_nmsocket_t *sock, isc_tls_t *tls) { + const int log_level = ISC_LOG_DEBUG(1); + char client_sabuf[ISC_SOCKADDR_FORMATSIZE]; + char local_sabuf[ISC_SOCKADDR_FORMATSIZE]; + + REQUIRE(tls != NULL); + + if (!isc_log_wouldlog(isc_lctx, log_level)) { + return; + }; + + isc_sockaddr_format(&sock->peer, client_sabuf, sizeof(client_sabuf)); + isc_sockaddr_format(&sock->iface, local_sabuf, sizeof(local_sabuf)); + isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_NETMGR, + log_level, "TLS %s session %s for %s on %s", + SSL_is_server(tls) ? "server" : "client", + SSL_session_reused(tls) ? "resumed" : "created", + client_sabuf, local_sabuf); +} + +#ifdef NETMGR_TRACE +/* + * Dump all active sockets in netmgr. We output to stderr + * as the logger might be already shut down. + */ + +static const char * +nmsocket_type_totext(isc_nmsocket_type type) { + switch (type) { + case isc_nm_udpsocket: + return ("isc_nm_udpsocket"); + case isc_nm_udplistener: + return ("isc_nm_udplistener"); + case isc_nm_tcpsocket: + return ("isc_nm_tcpsocket"); + case isc_nm_tcplistener: + return ("isc_nm_tcplistener"); + case isc_nm_tcpdnslistener: + return ("isc_nm_tcpdnslistener"); + case isc_nm_tcpdnssocket: + return ("isc_nm_tcpdnssocket"); + case isc_nm_tlssocket: + return ("isc_nm_tlssocket"); + case isc_nm_tlslistener: + return ("isc_nm_tlslistener"); + case isc_nm_tlsdnslistener: + return ("isc_nm_tlsdnslistener"); + case isc_nm_tlsdnssocket: + return ("isc_nm_tlsdnssocket"); + case isc_nm_httplistener: + return ("isc_nm_httplistener"); + case isc_nm_httpsocket: + return ("isc_nm_httpsocket"); + default: + UNREACHABLE(); + } +} + +static void +nmhandle_dump(isc_nmhandle_t *handle) { + fprintf(stderr, "Active handle %p, refs %" PRIuFAST32 "\n", handle, + isc_refcount_current(&handle->references)); + fprintf(stderr, "Created by:\n"); + isc_backtrace_symbols_fd(handle->backtrace, handle->backtrace_size, + STDERR_FILENO); + fprintf(stderr, "\n\n"); +} + +static void +nmsocket_dump(isc_nmsocket_t *sock) { + isc_nmhandle_t *handle = NULL; + + LOCK(&sock->lock); + fprintf(stderr, "\n=================\n"); + fprintf(stderr, "Active %s socket %p, type %s, refs %" PRIuFAST32 "\n", + atomic_load(&sock->client) ? "client" : "server", sock, + nmsocket_type_totext(sock->type), + isc_refcount_current(&sock->references)); + fprintf(stderr, + "Parent %p, listener %p, server %p, statichandle = " + "%p\n", + sock->parent, sock->listener, sock->server, sock->statichandle); + fprintf(stderr, "Flags:%s%s%s%s%s\n", + atomic_load(&sock->active) ? " active" : "", + atomic_load(&sock->closing) ? " closing" : "", + atomic_load(&sock->destroying) ? " destroying" : "", + atomic_load(&sock->connecting) ? " connecting" : "", + atomic_load(&sock->accepting) ? " accepting" : ""); + fprintf(stderr, "Created by:\n"); + isc_backtrace_symbols_fd(sock->backtrace, sock->backtrace_size, + STDERR_FILENO); + fprintf(stderr, "\n"); + + for (handle = ISC_LIST_HEAD(sock->active_handles); handle != NULL; + handle = ISC_LIST_NEXT(handle, active_link)) + { + static bool first = true; + if (first) { + fprintf(stderr, "Active handles:\n"); + first = false; + } + nmhandle_dump(handle); + } + + fprintf(stderr, "\n"); + UNLOCK(&sock->lock); +} + +void +isc__nm_dump_active(isc_nm_t *nm) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NM(nm)); + + LOCK(&nm->lock); + for (sock = ISC_LIST_HEAD(nm->active_sockets); sock != NULL; + sock = ISC_LIST_NEXT(sock, active_link)) + { + static bool first = true; + if (first) { + fprintf(stderr, "Outstanding sockets\n"); + first = false; + } + nmsocket_dump(sock); + } + UNLOCK(&nm->lock); +} +#endif diff --git a/lib/isc/netmgr/tcp.c b/lib/isc/netmgr/tcp.c new file mode 100644 index 0000000..2a644fe --- /dev/null +++ b/lib/isc/netmgr/tcp.c @@ -0,0 +1,1456 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#include <libgen.h> +#include <unistd.h> +#include <uv.h> + +#include <isc/atomic.h> +#include <isc/barrier.h> +#include <isc/buffer.h> +#include <isc/condition.h> +#include <isc/errno.h> +#include <isc/log.h> +#include <isc/magic.h> +#include <isc/mem.h> +#include <isc/netmgr.h> +#include <isc/quota.h> +#include <isc/random.h> +#include <isc/refcount.h> +#include <isc/region.h> +#include <isc/result.h> +#include <isc/sockaddr.h> +#include <isc/stdtime.h> +#include <isc/thread.h> +#include <isc/util.h> + +#include "netmgr-int.h" +#include "uv-compat.h" + +static atomic_uint_fast32_t last_tcpquota_log = 0; + +static bool +can_log_tcp_quota(void) { + isc_stdtime_t now, last; + + isc_stdtime_get(&now); + last = atomic_exchange_relaxed(&last_tcpquota_log, now); + if (now != last) { + return (true); + } + + return (false); +} + +static isc_result_t +tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req); + +static void +tcp_close_direct(isc_nmsocket_t *sock); + +static isc_result_t +tcp_send_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req); +static void +tcp_connect_cb(uv_connect_t *uvreq, int status); + +static void +tcp_connection_cb(uv_stream_t *server, int status); + +static void +tcp_close_cb(uv_handle_t *uvhandle); + +static isc_result_t +accept_connection(isc_nmsocket_t *ssock, isc_quota_t *quota); + +static void +quota_accept_cb(isc_quota_t *quota, void *sock0); + +static void +failed_accept_cb(isc_nmsocket_t *sock, isc_result_t eresult); + +static void +stop_tcp_parent(isc_nmsocket_t *sock); +static void +stop_tcp_child(isc_nmsocket_t *sock); + +static void +failed_accept_cb(isc_nmsocket_t *sock, isc_result_t eresult) { + REQUIRE(atomic_load(&sock->accepting)); + REQUIRE(sock->server); + + /* + * Detach the quota early to make room for other connections; + * otherwise it'd be detached later asynchronously, and clog + * the quota unnecessarily. + */ + if (sock->quota != NULL) { + isc_quota_detach(&sock->quota); + } + + isc__nmsocket_detach(&sock->server); + + atomic_store(&sock->accepting, false); + + switch (eresult) { + case ISC_R_NOTCONNECTED: + /* IGNORE: The client disconnected before we could accept */ + break; + default: + isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_NETMGR, ISC_LOG_ERROR, + "Accepting TCP connection failed: %s", + isc_result_totext(eresult)); + } +} + +static isc_result_t +tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { + isc__networker_t *worker = NULL; + isc_result_t result = ISC_R_UNSET; + int r; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(req)); + + REQUIRE(isc__nm_in_netthread()); + REQUIRE(sock->tid == isc_nm_tid()); + + worker = &sock->mgr->workers[sock->tid]; + + atomic_store(&sock->connecting, true); + + /* 2 minute timeout */ + result = isc__nm_socket_connectiontimeout(sock->fd, 120 * 1000); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); + UV_RUNTIME_CHECK(uv_tcp_init, r); + uv_handle_set_data(&sock->uv_handle.handle, sock); + + r = uv_timer_init(&worker->loop, &sock->read_timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + + r = uv_tcp_open(&sock->uv_handle.tcp, sock->fd); + if (r != 0) { + isc__nm_closesocket(sock->fd); + isc__nm_incstats(sock, STATID_OPENFAIL); + goto done; + } + isc__nm_incstats(sock, STATID_OPEN); + + if (req->local.length != 0) { + r = uv_tcp_bind(&sock->uv_handle.tcp, &req->local.type.sa, 0); + if (r != 0) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + } + + isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle); + + uv_handle_set_data(&req->uv_req.handle, req); + r = uv_tcp_connect(&req->uv_req.connect, &sock->uv_handle.tcp, + &req->peer.type.sa, tcp_connect_cb); + if (r != 0) { + isc__nm_incstats(sock, STATID_CONNECTFAIL); + goto done; + } + + uv_handle_set_data((uv_handle_t *)&sock->read_timer, + &req->uv_req.connect); + isc__nmsocket_timer_start(sock); + + atomic_store(&sock->connected, true); + +done: + result = isc__nm_uverr2result(r); + LOCK(&sock->lock); + sock->result = result; + SIGNAL(&sock->cond); + if (!atomic_load(&sock->active)) { + WAIT(&sock->scond, &sock->lock); + } + INSIST(atomic_load(&sock->active)); + UNLOCK(&sock->lock); + + return (result); +} + +void +isc__nm_async_tcpconnect(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpconnect_t *ievent = + (isc__netievent_tcpconnect_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *req = ievent->req; + isc_result_t result = ISC_R_SUCCESS; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tcpsocket); + REQUIRE(sock->parent == NULL); + REQUIRE(sock->tid == isc_nm_tid()); + + result = tcp_connect_direct(sock, req); + if (result != ISC_R_SUCCESS) { + atomic_store(&sock->active, false); + if (sock->fd != (uv_os_sock_t)(-1)) { + isc__nm_tcp_close(sock); + } + isc__nm_connectcb(sock, req, result, true); + } + + /* + * The sock is now attached to the handle. + */ + isc__nmsocket_detach(&sock); +} + +static void +tcp_connect_cb(uv_connect_t *uvreq, int status) { + isc_result_t result = ISC_R_UNSET; + isc__nm_uvreq_t *req = NULL; + isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)uvreq->handle); + struct sockaddr_storage ss; + int r; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + req = uv_handle_get_data((uv_handle_t *)uvreq); + + REQUIRE(VALID_UVREQ(req)); + REQUIRE(VALID_NMHANDLE(req->handle)); + + if (atomic_load(&sock->timedout)) { + result = ISC_R_TIMEDOUT; + goto error; + } else if (!atomic_load(&sock->connecting)) { + /* + * The connect was cancelled from timeout; just clean up + * the req. + */ + isc__nm_uvreq_put(&req, sock); + return; + } else if (isc__nm_closing(sock)) { + /* Network manager shutting down */ + result = ISC_R_SHUTTINGDOWN; + goto error; + } else if (isc__nmsocket_closing(sock)) { + /* Connection canceled */ + result = ISC_R_CANCELED; + goto error; + } else if (status == UV_ETIMEDOUT) { + /* Timeout status code here indicates hard error */ + result = ISC_R_TIMEDOUT; + goto error; + } else if (status == UV_EADDRINUSE) { + /* + * On FreeBSD the TCP connect() call sometimes results in a + * spurious transient EADDRINUSE. Try a few more times before + * giving up. + */ + if (--req->connect_tries > 0) { + r = uv_tcp_connect(&req->uv_req.connect, + &sock->uv_handle.tcp, + &req->peer.type.sa, tcp_connect_cb); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto error; + } + return; + } + result = isc__nm_uverr2result(status); + goto error; + } else if (status != 0) { + result = isc__nm_uverr2result(status); + goto error; + } + + isc__nmsocket_timer_stop(sock); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + + isc__nm_incstats(sock, STATID_CONNECT); + r = uv_tcp_getpeername(&sock->uv_handle.tcp, (struct sockaddr *)&ss, + &(int){ sizeof(ss) }); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto error; + } + + atomic_store(&sock->connecting, false); + + result = isc_sockaddr_fromsockaddr(&sock->peer, (struct sockaddr *)&ss); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + isc__nm_connectcb(sock, req, ISC_R_SUCCESS, false); + + return; +error: + isc__nm_failed_connect_cb(sock, req, result, false); +} + +void +isc_nm_tcpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, + isc_nm_cb_t cb, void *cbarg, unsigned int timeout, + size_t extrahandlesize) { + isc_result_t result = ISC_R_SUCCESS; + isc_nmsocket_t *sock = NULL; + isc__netievent_tcpconnect_t *ievent = NULL; + isc__nm_uvreq_t *req = NULL; + sa_family_t sa_family; + + REQUIRE(VALID_NM(mgr)); + REQUIRE(local != NULL); + REQUIRE(peer != NULL); + + sa_family = peer->type.sa.sa_family; + + sock = isc_mem_get(mgr->mctx, sizeof(*sock)); + isc__nmsocket_init(sock, mgr, isc_nm_tcpsocket, local); + + sock->extrahandlesize = extrahandlesize; + sock->connect_timeout = timeout; + sock->result = ISC_R_UNSET; + sock->fd = (uv_os_sock_t)-1; + atomic_init(&sock->client, true); + + req = isc__nm_uvreq_get(mgr, sock); + req->cb.connect = cb; + req->cbarg = cbarg; + req->peer = *peer; + req->local = *local; + req->handle = isc__nmhandle_get(sock, &req->peer, &sock->iface); + + result = isc__nm_socket(sa_family, SOCK_STREAM, 0, &sock->fd); + if (result != ISC_R_SUCCESS) { + if (isc__nm_in_netthread()) { + sock->tid = isc_nm_tid(); + isc__nmsocket_clearcb(sock); + isc__nm_connectcb(sock, req, result, false); + } else { + isc__nmsocket_clearcb(sock); + sock->tid = isc_random_uniform(mgr->nworkers); + isc__nm_connectcb(sock, req, result, true); + } + atomic_store(&sock->closed, true); + isc__nmsocket_detach(&sock); + return; + } + + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); + + ievent = isc__nm_get_netievent_tcpconnect(mgr, sock, req); + + if (isc__nm_in_netthread()) { + atomic_store(&sock->active, true); + sock->tid = isc_nm_tid(); + isc__nm_async_tcpconnect(&mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + isc__nm_put_netievent_tcpconnect(mgr, ievent); + } else { + atomic_init(&sock->active, false); + sock->tid = isc_random_uniform(mgr->nworkers); + isc__nm_enqueue_ievent(&mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + } + LOCK(&sock->lock); + while (sock->result == ISC_R_UNSET) { + WAIT(&sock->cond, &sock->lock); + } + atomic_store(&sock->active, true); + BROADCAST(&sock->scond); + UNLOCK(&sock->lock); +} + +static uv_os_sock_t +isc__nm_tcp_lb_socket(isc_nm_t *mgr, sa_family_t sa_family) { + isc_result_t result; + uv_os_sock_t sock; + + result = isc__nm_socket(sa_family, SOCK_STREAM, 0, &sock); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + (void)isc__nm_socket_incoming_cpu(sock); + (void)isc__nm_socket_v6only(sock, sa_family); + + /* FIXME: set mss */ + + result = isc__nm_socket_reuse(sock); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + if (mgr->load_balance_sockets) { + result = isc__nm_socket_reuse_lb(sock); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + } + + return (sock); +} + +static void +start_tcp_child(isc_nm_t *mgr, isc_sockaddr_t *iface, isc_nmsocket_t *sock, + uv_os_sock_t fd, int tid) { + isc__netievent_tcplisten_t *ievent = NULL; + isc_nmsocket_t *csock = &sock->children[tid]; + + isc__nmsocket_init(csock, mgr, isc_nm_tcpsocket, iface); + csock->parent = sock; + csock->accept_cb = sock->accept_cb; + csock->accept_cbarg = sock->accept_cbarg; + csock->extrahandlesize = sock->extrahandlesize; + csock->backlog = sock->backlog; + csock->tid = tid; + /* + * We don't attach to quota, just assign - to avoid + * increasing quota unnecessarily. + */ + csock->pquota = sock->pquota; + isc_quota_cb_init(&csock->quotacb, quota_accept_cb, csock); + + if (mgr->load_balance_sockets) { + UNUSED(fd); + csock->fd = isc__nm_tcp_lb_socket(mgr, + iface->type.sa.sa_family); + } else { + csock->fd = dup(fd); + } + REQUIRE(csock->fd >= 0); + + ievent = isc__nm_get_netievent_tcplisten(mgr, csock); + isc__nm_maybe_enqueue_ievent(&mgr->workers[tid], + (isc__netievent_t *)ievent); +} + +static void +enqueue_stoplistening(isc_nmsocket_t *sock) { + isc__netievent_tcpstop_t *ievent = + isc__nm_get_netievent_tcpstop(sock->mgr, sock); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +isc_result_t +isc_nm_listentcp(isc_nm_t *mgr, isc_sockaddr_t *iface, + isc_nm_accept_cb_t accept_cb, void *accept_cbarg, + size_t extrahandlesize, int backlog, isc_quota_t *quota, + isc_nmsocket_t **sockp) { + isc_result_t result = ISC_R_SUCCESS; + isc_nmsocket_t *sock = NULL; + size_t children_size = 0; + uv_os_sock_t fd = -1; + + REQUIRE(VALID_NM(mgr)); + + sock = isc_mem_get(mgr->mctx, sizeof(*sock)); + isc__nmsocket_init(sock, mgr, isc_nm_tcplistener, iface); + + atomic_init(&sock->rchildren, 0); + sock->nchildren = mgr->nworkers; + children_size = sock->nchildren * sizeof(sock->children[0]); + sock->children = isc_mem_get(mgr->mctx, children_size); + memset(sock->children, 0, children_size); + + sock->result = ISC_R_UNSET; + + sock->accept_cb = accept_cb; + sock->accept_cbarg = accept_cbarg; + sock->extrahandlesize = extrahandlesize; + sock->backlog = backlog; + sock->pquota = quota; + + sock->tid = 0; + sock->fd = -1; + + if (!mgr->load_balance_sockets) { + fd = isc__nm_tcp_lb_socket(mgr, iface->type.sa.sa_family); + } + + isc_barrier_init(&sock->startlistening, sock->nchildren); + + for (size_t i = 0; i < sock->nchildren; i++) { + if ((int)i == isc_nm_tid()) { + continue; + } + start_tcp_child(mgr, iface, sock, fd, i); + } + + if (isc__nm_in_netthread()) { + start_tcp_child(mgr, iface, sock, fd, isc_nm_tid()); + } + + if (!mgr->load_balance_sockets) { + isc__nm_closesocket(fd); + } + + LOCK(&sock->lock); + while (atomic_load(&sock->rchildren) != sock->nchildren) { + WAIT(&sock->cond, &sock->lock); + } + result = sock->result; + atomic_store(&sock->active, true); + UNLOCK(&sock->lock); + + INSIST(result != ISC_R_UNSET); + + if (result == ISC_R_SUCCESS) { + REQUIRE(atomic_load(&sock->rchildren) == sock->nchildren); + *sockp = sock; + } else { + atomic_store(&sock->active, false); + enqueue_stoplistening(sock); + isc_nmsocket_close(&sock); + } + + return (result); +} + +void +isc__nm_async_tcplisten(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcplisten_t *ievent = (isc__netievent_tcplisten_t *)ev0; + sa_family_t sa_family; + int r; + int flags = 0; + isc_nmsocket_t *sock = NULL; + isc_result_t result; + isc_nm_t *mgr; + + REQUIRE(VALID_NMSOCK(ievent->sock)); + REQUIRE(ievent->sock->tid == isc_nm_tid()); + REQUIRE(VALID_NMSOCK(ievent->sock->parent)); + + sock = ievent->sock; + sa_family = sock->iface.type.sa.sa_family; + mgr = sock->mgr; + + REQUIRE(sock->type == isc_nm_tcpsocket); + REQUIRE(sock->parent != NULL); + REQUIRE(sock->tid == isc_nm_tid()); + + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); + + r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); + UV_RUNTIME_CHECK(uv_tcp_init, r); + + uv_handle_set_data(&sock->uv_handle.handle, sock); + /* This keeps the socket alive after everything else is gone */ + isc__nmsocket_attach(sock, &(isc_nmsocket_t *){ NULL }); + + r = uv_timer_init(&worker->loop, &sock->read_timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + + LOCK(&sock->parent->lock); + + r = uv_tcp_open(&sock->uv_handle.tcp, sock->fd); + if (r < 0) { + isc__nm_closesocket(sock->fd); + isc__nm_incstats(sock, STATID_OPENFAIL); + goto done; + } + isc__nm_incstats(sock, STATID_OPEN); + + if (sa_family == AF_INET6) { + flags = UV_TCP_IPV6ONLY; + } + + if (mgr->load_balance_sockets) { + r = isc_uv_tcp_freebind(&sock->uv_handle.tcp, + &sock->iface.type.sa, flags); + if (r < 0) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + } else { + if (sock->parent->fd == -1) { + r = isc_uv_tcp_freebind(&sock->uv_handle.tcp, + &sock->iface.type.sa, flags); + if (r < 0) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + sock->parent->uv_handle.tcp.flags = + sock->uv_handle.tcp.flags; + sock->parent->fd = sock->fd; + } else { + /* The socket is already bound, just copy the flags */ + sock->uv_handle.tcp.flags = + sock->parent->uv_handle.tcp.flags; + } + } + + isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle); + + /* + * The callback will run in the same thread uv_listen() was called + * from, so a race with tcp_connection_cb() isn't possible. + */ + r = uv_listen((uv_stream_t *)&sock->uv_handle.tcp, sock->backlog, + tcp_connection_cb); + if (r != 0) { + isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_NETMGR, ISC_LOG_ERROR, + "uv_listen failed: %s", + isc_result_totext(isc__nm_uverr2result(r))); + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + + atomic_store(&sock->listening, true); + +done: + result = isc__nm_uverr2result(r); + if (result != ISC_R_SUCCESS) { + sock->pquota = NULL; + } + + atomic_fetch_add(&sock->parent->rchildren, 1); + if (sock->parent->result == ISC_R_UNSET) { + sock->parent->result = result; + } + SIGNAL(&sock->parent->cond); + UNLOCK(&sock->parent->lock); + + isc_barrier_wait(&sock->parent->startlistening); +} + +static void +tcp_connection_cb(uv_stream_t *server, int status) { + isc_nmsocket_t *ssock = uv_handle_get_data((uv_handle_t *)server); + isc_result_t result; + isc_quota_t *quota = NULL; + + if (status != 0) { + result = isc__nm_uverr2result(status); + goto done; + } + + REQUIRE(VALID_NMSOCK(ssock)); + REQUIRE(ssock->tid == isc_nm_tid()); + + if (isc__nmsocket_closing(ssock)) { + result = ISC_R_CANCELED; + goto done; + } + + if (ssock->pquota != NULL) { + result = isc_quota_attach_cb(ssock->pquota, "a, + &ssock->quotacb); + if (result == ISC_R_QUOTA) { + isc__nm_incstats(ssock, STATID_ACCEPTFAIL); + goto done; + } + } + + result = accept_connection(ssock, quota); +done: + isc__nm_accept_connection_log(result, can_log_tcp_quota()); +} + +void +isc__nm_tcp_stoplistening(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tcplistener); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + UNREACHABLE(); + } + + if (!isc__nm_in_netthread()) { + enqueue_stoplistening(sock); + } else { + stop_tcp_parent(sock); + } +} + +void +isc__nm_async_tcpstop(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpstop_t *ievent = (isc__netievent_tcpstop_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + if (sock->parent != NULL) { + stop_tcp_child(sock); + return; + } + + stop_tcp_parent(sock); +} + +void +isc__nm_tcp_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(result != ISC_R_SUCCESS); + + isc__nmsocket_timer_stop(sock); + isc__nm_stop_reading(sock); + + if (!sock->recv_read) { + goto destroy; + } + sock->recv_read = false; + + if (sock->recv_cb != NULL) { + isc__nm_uvreq_t *req = isc__nm_get_read_req(sock, NULL); + isc__nmsocket_clearcb(sock); + isc__nm_readcb(sock, req, result); + } + +destroy: + isc__nmsocket_prep_destroy(sock); + + /* + * We need to detach from quota after the read callback function had a + * chance to be executed. + */ + if (sock->quota != NULL) { + isc_quota_detach(&sock->quota); + } +} + +void +isc__nm_tcp_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + isc_nmsocket_t *sock = handle->sock; + isc__netievent_tcpstartread_t *ievent = NULL; + + REQUIRE(sock->type == isc_nm_tcpsocket); + REQUIRE(sock->statichandle == handle); + + sock->recv_cb = cb; + sock->recv_cbarg = cbarg; + sock->recv_read = true; + if (sock->read_timeout == 0) { + sock->read_timeout = + (atomic_load(&sock->keepalive) + ? atomic_load(&sock->mgr->keepalive) + : atomic_load(&sock->mgr->idle)); + } + + ievent = isc__nm_get_netievent_tcpstartread(sock->mgr, sock); + + /* + * This MUST be done asynchronously, no matter which thread we're + * in. The callback function for isc_nm_read() often calls + * isc_nm_read() again; if we tried to do that synchronously + * we'd clash in processbuffer() and grow the stack indefinitely. + */ + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + + return; +} + +void +isc__nm_async_tcpstartread(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpstartread_t *ievent = + (isc__netievent_tcpstartread_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc_result_t result; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + UNUSED(worker); + + if (isc__nmsocket_closing(sock)) { + result = ISC_R_CANCELED; + } else { + result = isc__nm_start_reading(sock); + } + + if (result != ISC_R_SUCCESS) { + atomic_store(&sock->reading, true); + isc__nm_tcp_failed_read_cb(sock, result); + return; + } + + isc__nmsocket_timer_start(sock); +} + +void +isc__nm_tcp_pauseread(isc_nmhandle_t *handle) { + isc__netievent_tcppauseread_t *ievent = NULL; + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + + sock = handle->sock; + + REQUIRE(VALID_NMSOCK(sock)); + + if (!atomic_compare_exchange_strong(&sock->readpaused, &(bool){ false }, + true)) + { + return; + } + + ievent = isc__nm_get_netievent_tcppauseread(sock->mgr, sock); + + isc__nm_maybe_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + + return; +} + +void +isc__nm_async_tcppauseread(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcppauseread_t *ievent = + (isc__netievent_tcppauseread_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + UNUSED(worker); + + isc__nmsocket_timer_stop(sock); + isc__nm_stop_reading(sock); +} + +void +isc__nm_tcp_resumeread(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + isc__netievent_tcpstartread_t *ievent = NULL; + isc_nmsocket_t *sock = handle->sock; + + REQUIRE(sock->tid == isc_nm_tid()); + + if (sock->recv_cb == NULL) { + /* We are no longer reading */ + return; + } + + if (!isc__nmsocket_active(sock)) { + atomic_store(&sock->reading, true); + isc__nm_tcp_failed_read_cb(sock, ISC_R_CANCELED); + return; + } + + if (!atomic_compare_exchange_strong(&sock->readpaused, &(bool){ true }, + false)) + { + return; + } + + ievent = isc__nm_get_netievent_tcpstartread(sock->mgr, sock); + + isc__nm_maybe_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +void +isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { + isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)stream); + isc__nm_uvreq_t *req = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->reading)); + REQUIRE(buf != NULL); + + if (isc__nmsocket_closing(sock)) { + isc__nm_tcp_failed_read_cb(sock, ISC_R_CANCELED); + goto free; + } + + if (nread < 0) { + if (nread != UV_EOF) { + isc__nm_incstats(sock, STATID_RECVFAIL); + } + + isc__nm_tcp_failed_read_cb(sock, isc__nm_uverr2result(nread)); + + goto free; + } + + req = isc__nm_get_read_req(sock, NULL); + + /* + * The callback will be called synchronously because the + * result is ISC_R_SUCCESS, so we don't need to retain + * the buffer + */ + req->uvbuf.base = buf->base; + req->uvbuf.len = nread; + + if (!atomic_load(&sock->client)) { + sock->read_timeout = + (atomic_load(&sock->keepalive) + ? atomic_load(&sock->mgr->keepalive) + : atomic_load(&sock->mgr->idle)); + } + + isc__nm_readcb(sock, req, ISC_R_SUCCESS); + + /* The readcb could have paused the reading */ + if (atomic_load(&sock->reading)) { + /* The timer will be updated */ + isc__nmsocket_timer_restart(sock); + } + +free: + if (nread < 0) { + /* + * The buffer may be a null buffer on error. + */ + if (buf->base == NULL && buf->len == 0) { + return; + } + } + + isc__nm_free_uvbuf(sock, buf); +} + +static void +quota_accept_cb(isc_quota_t *quota, void *sock0) { + isc_nmsocket_t *sock = (isc_nmsocket_t *)sock0; + isc__netievent_tcpaccept_t *ievent = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + + /* + * Create a tcpaccept event and pass it using the async channel. + */ + ievent = isc__nm_get_netievent_tcpaccept(sock->mgr, sock, quota); + isc__nm_maybe_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +/* + * This is called after we get a quota_accept_cb() callback. + */ +void +isc__nm_async_tcpaccept(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpaccept_t *ievent = (isc__netievent_tcpaccept_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc_result_t result; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + result = accept_connection(sock, ievent->quota); + isc__nm_accept_connection_log(result, can_log_tcp_quota()); +} + +static isc_result_t +accept_connection(isc_nmsocket_t *ssock, isc_quota_t *quota) { + isc_nmsocket_t *csock = NULL; + isc__networker_t *worker = NULL; + int r; + isc_result_t result; + struct sockaddr_storage ss; + isc_sockaddr_t local; + isc_nmhandle_t *handle = NULL; + + REQUIRE(VALID_NMSOCK(ssock)); + REQUIRE(ssock->tid == isc_nm_tid()); + + if (isc__nmsocket_closing(ssock)) { + if (quota != NULL) { + isc_quota_detach("a); + } + return (ISC_R_CANCELED); + } + + csock = isc_mem_get(ssock->mgr->mctx, sizeof(isc_nmsocket_t)); + isc__nmsocket_init(csock, ssock->mgr, isc_nm_tcpsocket, &ssock->iface); + csock->tid = ssock->tid; + csock->extrahandlesize = ssock->extrahandlesize; + isc__nmsocket_attach(ssock, &csock->server); + csock->recv_cb = ssock->recv_cb; + csock->recv_cbarg = ssock->recv_cbarg; + csock->quota = quota; + atomic_init(&csock->accepting, true); + + worker = &csock->mgr->workers[isc_nm_tid()]; + + r = uv_tcp_init(&worker->loop, &csock->uv_handle.tcp); + UV_RUNTIME_CHECK(uv_tcp_init, r); + uv_handle_set_data(&csock->uv_handle.handle, csock); + + r = uv_timer_init(&worker->loop, &csock->read_timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + uv_handle_set_data((uv_handle_t *)&csock->read_timer, csock); + + r = uv_accept(&ssock->uv_handle.stream, &csock->uv_handle.stream); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto failure; + } + + r = uv_tcp_getpeername(&csock->uv_handle.tcp, (struct sockaddr *)&ss, + &(int){ sizeof(ss) }); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto failure; + } + + result = isc_sockaddr_fromsockaddr(&csock->peer, + (struct sockaddr *)&ss); + if (result != ISC_R_SUCCESS) { + goto failure; + } + + r = uv_tcp_getsockname(&csock->uv_handle.tcp, (struct sockaddr *)&ss, + &(int){ sizeof(ss) }); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto failure; + } + + result = isc_sockaddr_fromsockaddr(&local, (struct sockaddr *)&ss); + if (result != ISC_R_SUCCESS) { + goto failure; + } + + handle = isc__nmhandle_get(csock, NULL, &local); + + result = ssock->accept_cb(handle, ISC_R_SUCCESS, ssock->accept_cbarg); + if (result != ISC_R_SUCCESS) { + isc_nmhandle_detach(&handle); + goto failure; + } + + atomic_store(&csock->accepting, false); + + isc__nm_incstats(csock, STATID_ACCEPT); + + csock->read_timeout = atomic_load(&csock->mgr->init); + + atomic_fetch_add(&ssock->parent->active_child_connections, 1); + + /* + * The acceptcb needs to attach to the handle if it wants to keep the + * connection alive + */ + isc_nmhandle_detach(&handle); + + /* + * sock is now attached to the handle. + */ + isc__nmsocket_detach(&csock); + + return (ISC_R_SUCCESS); + +failure: + atomic_store(&csock->active, false); + + failed_accept_cb(csock, result); + + isc__nmsocket_prep_destroy(csock); + + isc__nmsocket_detach(&csock); + + return (result); +} + +void +isc__nm_tcp_send(isc_nmhandle_t *handle, const isc_region_t *region, + isc_nm_cb_t cb, void *cbarg) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + isc_nmsocket_t *sock = handle->sock; + isc__netievent_tcpsend_t *ievent = NULL; + isc__nm_uvreq_t *uvreq = NULL; + + REQUIRE(sock->type == isc_nm_tcpsocket); + + uvreq = isc__nm_uvreq_get(sock->mgr, sock); + uvreq->uvbuf.base = (char *)region->base; + uvreq->uvbuf.len = region->length; + + isc_nmhandle_attach(handle, &uvreq->handle); + + uvreq->cb.send = cb; + uvreq->cbarg = cbarg; + + ievent = isc__nm_get_netievent_tcpsend(sock->mgr, sock, uvreq); + isc__nm_maybe_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + + return; +} + +static void +tcp_send_cb(uv_write_t *req, int status) { + isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data; + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_UVREQ(uvreq)); + REQUIRE(VALID_NMSOCK(uvreq->sock)); + + sock = uvreq->sock; + + isc_nm_timer_stop(uvreq->timer); + isc_nm_timer_detach(&uvreq->timer); + + if (status < 0) { + isc__nm_incstats(sock, STATID_SENDFAIL); + isc__nm_failed_send_cb(sock, uvreq, + isc__nm_uverr2result(status)); + return; + } + + isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false); +} + +/* + * Handle 'tcpsend' async event - send a packet on the socket + */ +void +isc__nm_async_tcpsend(isc__networker_t *worker, isc__netievent_t *ev0) { + isc_result_t result; + isc__netievent_tcpsend_t *ievent = (isc__netievent_tcpsend_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *uvreq = ievent->req; + + REQUIRE(sock->type == isc_nm_tcpsocket); + REQUIRE(sock->tid == isc_nm_tid()); + UNUSED(worker); + + if (sock->write_timeout == 0) { + sock->write_timeout = + (atomic_load(&sock->keepalive) + ? atomic_load(&sock->mgr->keepalive) + : atomic_load(&sock->mgr->idle)); + } + + result = tcp_send_direct(sock, uvreq); + if (result != ISC_R_SUCCESS) { + isc__nm_incstats(sock, STATID_SENDFAIL); + isc__nm_failed_send_cb(sock, uvreq, result); + } +} + +static isc_result_t +tcp_send_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(req)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->type == isc_nm_tcpsocket); + + int r; + + if (isc__nmsocket_closing(sock)) { + return (ISC_R_CANCELED); + } + + r = uv_write(&req->uv_req.write, &sock->uv_handle.stream, &req->uvbuf, + 1, tcp_send_cb); + if (r < 0) { + return (isc__nm_uverr2result(r)); + } + + isc_nm_timer_create(req->handle, isc__nmsocket_writetimeout_cb, req, + &req->timer); + if (sock->write_timeout > 0) { + isc_nm_timer_start(req->timer, sock->write_timeout); + } + + return (ISC_R_SUCCESS); +} + +static void +tcp_stop_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + uv_handle_set_data(handle, NULL); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->closing)); + + if (!atomic_compare_exchange_strong(&sock->closed, &(bool){ false }, + true)) + { + UNREACHABLE(); + } + + isc__nm_incstats(sock, STATID_CLOSE); + + atomic_store(&sock->listening, false); + + isc__nmsocket_detach(&sock); +} + +static void +tcp_close_sock(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->closing)); + + if (!atomic_compare_exchange_strong(&sock->closed, &(bool){ false }, + true)) + { + UNREACHABLE(); + } + + isc__nm_incstats(sock, STATID_CLOSE); + + if (sock->server != NULL) { + isc__nmsocket_detach(&sock->server); + } + + atomic_store(&sock->connected, false); + + isc__nmsocket_prep_destroy(sock); +} + +static void +tcp_close_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + uv_handle_set_data(handle, NULL); + + tcp_close_sock(sock); +} + +static void +read_timer_close_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + uv_handle_set_data(handle, NULL); + + if (sock->parent) { + uv_close(&sock->uv_handle.handle, tcp_stop_cb); + } else if (uv_is_closing(&sock->uv_handle.handle)) { + tcp_close_sock(sock); + } else { + uv_close(&sock->uv_handle.handle, tcp_close_cb); + } +} + +static void +stop_tcp_child(isc_nmsocket_t *sock) { + REQUIRE(sock->type == isc_nm_tcpsocket); + REQUIRE(sock->tid == isc_nm_tid()); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + return; + } + + tcp_close_direct(sock); + + atomic_fetch_sub(&sock->parent->rchildren, 1); + + isc_barrier_wait(&sock->parent->stoplistening); +} + +static void +stop_tcp_parent(isc_nmsocket_t *sock) { + isc_nmsocket_t *csock = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->type == isc_nm_tcplistener); + + isc_barrier_init(&sock->stoplistening, sock->nchildren); + + for (size_t i = 0; i < sock->nchildren; i++) { + csock = &sock->children[i]; + REQUIRE(VALID_NMSOCK(csock)); + + if ((int)i == isc_nm_tid()) { + /* + * We need to schedule closing the other sockets first + */ + continue; + } + + atomic_store(&csock->active, false); + enqueue_stoplistening(csock); + } + + csock = &sock->children[isc_nm_tid()]; + atomic_store(&csock->active, false); + stop_tcp_child(csock); + + atomic_store(&sock->closed, true); + isc__nmsocket_prep_destroy(sock); +} + +static void +tcp_close_direct(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->closing)); + + if (sock->server != NULL) { + REQUIRE(VALID_NMSOCK(sock->server)); + REQUIRE(VALID_NMSOCK(sock->server->parent)); + if (sock->server->parent != NULL) { + atomic_fetch_sub( + &sock->server->parent->active_child_connections, + 1); + } + } + + if (sock->quota != NULL) { + isc_quota_detach(&sock->quota); + } + + isc__nmsocket_timer_stop(sock); + isc__nm_stop_reading(sock); + + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + uv_close((uv_handle_t *)&sock->read_timer, read_timer_close_cb); +} + +void +isc__nm_tcp_close(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tcpsocket); + REQUIRE(!isc__nmsocket_active(sock)); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + return; + } + + if (sock->tid == isc_nm_tid()) { + tcp_close_direct(sock); + } else { + /* + * We need to create an event and pass it using async channel + */ + isc__netievent_tcpclose_t *ievent = + isc__nm_get_netievent_tcpclose(sock->mgr, sock); + + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + } +} + +void +isc__nm_async_tcpclose(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpclose_t *ievent = (isc__netievent_tcpclose_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + UNUSED(worker); + + tcp_close_direct(sock); +} + +static void +tcp_close_connect_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + + REQUIRE(VALID_NMSOCK(sock)); + + REQUIRE(isc__nm_in_netthread()); + REQUIRE(sock->tid == isc_nm_tid()); + + isc__nmsocket_prep_destroy(sock); + isc__nmsocket_detach(&sock); +} + +void +isc__nm_tcp_shutdown(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->type == isc_nm_tcpsocket); + + /* + * If the socket is active, mark it inactive and + * continue. If it isn't active, stop now. + */ + if (!isc__nmsocket_deactivate(sock)) { + return; + } + + if (atomic_load(&sock->accepting)) { + return; + } + + if (atomic_load(&sock->connecting)) { + isc_nmsocket_t *tsock = NULL; + isc__nmsocket_attach(sock, &tsock); + uv_close(&sock->uv_handle.handle, tcp_close_connect_cb); + return; + } + + if (sock->statichandle != NULL) { + if (isc__nm_closing(sock)) { + isc__nm_failed_read_cb(sock, ISC_R_SHUTTINGDOWN, false); + } else { + isc__nm_failed_read_cb(sock, ISC_R_CANCELED, false); + } + return; + } + + /* + * Otherwise, we just send the socket to abyss... + */ + if (sock->parent == NULL) { + isc__nmsocket_prep_destroy(sock); + } +} + +void +isc__nm_tcp_cancelread(isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + isc__netievent_tcpcancel_t *ievent = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + + sock = handle->sock; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tcpsocket); + + ievent = isc__nm_get_netievent_tcpcancel(sock->mgr, sock, handle); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +void +isc__nm_async_tcpcancel(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpcancel_t *ievent = (isc__netievent_tcpcancel_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + UNUSED(worker); + + uv_timer_stop(&sock->read_timer); + + isc__nm_tcp_failed_read_cb(sock, ISC_R_EOF); +} + +int_fast32_t +isc__nm_tcp_listener_nactive(isc_nmsocket_t *listener) { + int_fast32_t nactive; + + REQUIRE(VALID_NMSOCK(listener)); + + nactive = atomic_load(&listener->active_child_connections); + INSIST(nactive >= 0); + return (nactive); +} diff --git a/lib/isc/netmgr/tcpdns.c b/lib/isc/netmgr/tcpdns.c new file mode 100644 index 0000000..eda6aa6 --- /dev/null +++ b/lib/isc/netmgr/tcpdns.c @@ -0,0 +1,1500 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#include <libgen.h> +#include <unistd.h> +#include <uv.h> + +#include <isc/atomic.h> +#include <isc/barrier.h> +#include <isc/buffer.h> +#include <isc/condition.h> +#include <isc/errno.h> +#include <isc/log.h> +#include <isc/magic.h> +#include <isc/mem.h> +#include <isc/netmgr.h> +#include <isc/quota.h> +#include <isc/random.h> +#include <isc/refcount.h> +#include <isc/region.h> +#include <isc/result.h> +#include <isc/sockaddr.h> +#include <isc/stdtime.h> +#include <isc/thread.h> +#include <isc/util.h> + +#include "netmgr-int.h" +#include "uv-compat.h" + +static atomic_uint_fast32_t last_tcpdnsquota_log = 0; + +static bool +can_log_tcpdns_quota(void) { + isc_stdtime_t now, last; + + isc_stdtime_get(&now); + last = atomic_exchange_relaxed(&last_tcpdnsquota_log, now); + if (now != last) { + return (true); + } + + return (false); +} + +static isc_result_t +tcpdns_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req); + +static void +tcpdns_close_direct(isc_nmsocket_t *sock); + +static void +tcpdns_connect_cb(uv_connect_t *uvreq, int status); + +static void +tcpdns_connection_cb(uv_stream_t *server, int status); + +static void +tcpdns_close_cb(uv_handle_t *uvhandle); + +static isc_result_t +accept_connection(isc_nmsocket_t *ssock, isc_quota_t *quota); + +static void +quota_accept_cb(isc_quota_t *quota, void *sock0); + +static void +stop_tcpdns_parent(isc_nmsocket_t *sock); +static void +stop_tcpdns_child(isc_nmsocket_t *sock); + +static isc_result_t +tcpdns_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { + isc__networker_t *worker = NULL; + isc_result_t result = ISC_R_UNSET; + int r; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(req)); + + REQUIRE(isc__nm_in_netthread()); + REQUIRE(sock->tid == isc_nm_tid()); + + worker = &sock->mgr->workers[sock->tid]; + + atomic_store(&sock->connecting, true); + + r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); + UV_RUNTIME_CHECK(uv_tcp_init, r); + uv_handle_set_data(&sock->uv_handle.handle, sock); + + r = uv_timer_init(&worker->loop, &sock->read_timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + + if (isc__nm_closing(sock)) { + result = ISC_R_SHUTTINGDOWN; + goto error; + } + + r = uv_tcp_open(&sock->uv_handle.tcp, sock->fd); + if (r != 0) { + isc__nm_closesocket(sock->fd); + isc__nm_incstats(sock, STATID_OPENFAIL); + goto done; + } + isc__nm_incstats(sock, STATID_OPEN); + + if (req->local.length != 0) { + r = uv_tcp_bind(&sock->uv_handle.tcp, &req->local.type.sa, 0); + /* + * In case of shared socket UV_EINVAL will be returned and needs + * to be ignored + */ + if (r != 0 && r != UV_EINVAL) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + } + + isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle); + + uv_handle_set_data(&req->uv_req.handle, req); + r = uv_tcp_connect(&req->uv_req.connect, &sock->uv_handle.tcp, + &req->peer.type.sa, tcpdns_connect_cb); + if (r != 0) { + isc__nm_incstats(sock, STATID_CONNECTFAIL); + goto done; + } + + uv_handle_set_data((uv_handle_t *)&sock->read_timer, + &req->uv_req.connect); + isc__nmsocket_timer_start(sock); + + atomic_store(&sock->connected, true); + +done: + result = isc__nm_uverr2result(r); +error: + LOCK(&sock->lock); + sock->result = result; + SIGNAL(&sock->cond); + if (!atomic_load(&sock->active)) { + WAIT(&sock->scond, &sock->lock); + } + INSIST(atomic_load(&sock->active)); + UNLOCK(&sock->lock); + + return (result); +} + +void +isc__nm_async_tcpdnsconnect(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpdnsconnect_t *ievent = + (isc__netievent_tcpdnsconnect_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *req = ievent->req; + isc_result_t result = ISC_R_SUCCESS; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tcpdnssocket); + REQUIRE(sock->parent == NULL); + REQUIRE(sock->tid == isc_nm_tid()); + + result = tcpdns_connect_direct(sock, req); + if (result != ISC_R_SUCCESS) { + isc__nmsocket_clearcb(sock); + isc__nm_connectcb(sock, req, result, true); + atomic_store(&sock->active, false); + isc__nm_tcpdns_close(sock); + } + + /* + * The sock is now attached to the handle. + */ + isc__nmsocket_detach(&sock); +} + +static void +tcpdns_connect_cb(uv_connect_t *uvreq, int status) { + isc_result_t result = ISC_R_UNSET; + isc__nm_uvreq_t *req = NULL; + isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)uvreq->handle); + struct sockaddr_storage ss; + int r; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + req = uv_handle_get_data((uv_handle_t *)uvreq); + + REQUIRE(VALID_UVREQ(req)); + REQUIRE(VALID_NMHANDLE(req->handle)); + + if (atomic_load(&sock->timedout)) { + result = ISC_R_TIMEDOUT; + goto error; + } else if (isc__nm_closing(sock)) { + /* Network manager shutting down */ + result = ISC_R_SHUTTINGDOWN; + goto error; + } else if (isc__nmsocket_closing(sock)) { + /* Connection canceled */ + result = ISC_R_CANCELED; + goto error; + } else if (status == UV_ETIMEDOUT) { + /* Timeout status code here indicates hard error */ + result = ISC_R_TIMEDOUT; + goto error; + } else if (status == UV_EADDRINUSE) { + /* + * On FreeBSD the TCP connect() call sometimes results in a + * spurious transient EADDRINUSE. Try a few more times before + * giving up. + */ + if (--req->connect_tries > 0) { + r = uv_tcp_connect( + &req->uv_req.connect, &sock->uv_handle.tcp, + &req->peer.type.sa, tcpdns_connect_cb); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto error; + } + return; + } + result = isc__nm_uverr2result(status); + goto error; + } else if (status != 0) { + result = isc__nm_uverr2result(status); + goto error; + } + + isc__nmsocket_timer_stop(sock); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + + isc__nm_incstats(sock, STATID_CONNECT); + r = uv_tcp_getpeername(&sock->uv_handle.tcp, (struct sockaddr *)&ss, + &(int){ sizeof(ss) }); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto error; + } + + atomic_store(&sock->connecting, false); + + result = isc_sockaddr_fromsockaddr(&sock->peer, (struct sockaddr *)&ss); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + isc__nm_connectcb(sock, req, ISC_R_SUCCESS, false); + + return; +error: + isc__nm_failed_connect_cb(sock, req, result, false); +} + +void +isc_nm_tcpdnsconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, + isc_nm_cb_t cb, void *cbarg, unsigned int timeout, + size_t extrahandlesize) { + isc_result_t result = ISC_R_SUCCESS; + isc_nmsocket_t *sock = NULL; + isc__netievent_tcpdnsconnect_t *ievent = NULL; + isc__nm_uvreq_t *req = NULL; + sa_family_t sa_family; + + REQUIRE(VALID_NM(mgr)); + REQUIRE(local != NULL); + REQUIRE(peer != NULL); + + sa_family = peer->type.sa.sa_family; + + sock = isc_mem_get(mgr->mctx, sizeof(*sock)); + isc__nmsocket_init(sock, mgr, isc_nm_tcpdnssocket, local); + + sock->extrahandlesize = extrahandlesize; + sock->connect_timeout = timeout; + sock->result = ISC_R_UNSET; + atomic_init(&sock->client, true); + + req = isc__nm_uvreq_get(mgr, sock); + req->cb.connect = cb; + req->cbarg = cbarg; + req->peer = *peer; + req->local = *local; + req->handle = isc__nmhandle_get(sock, &req->peer, &sock->iface); + + result = isc__nm_socket(sa_family, SOCK_STREAM, 0, &sock->fd); + if (result != ISC_R_SUCCESS) { + if (isc__nm_in_netthread()) { + sock->tid = isc_nm_tid(); + } + isc__nmsocket_clearcb(sock); + isc__nm_connectcb(sock, req, result, true); + atomic_store(&sock->closed, true); + isc__nmsocket_detach(&sock); + return; + } + + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); + + /* 2 minute timeout */ + result = isc__nm_socket_connectiontimeout(sock->fd, 120 * 1000); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + ievent = isc__nm_get_netievent_tcpdnsconnect(mgr, sock, req); + + if (isc__nm_in_netthread()) { + atomic_store(&sock->active, true); + sock->tid = isc_nm_tid(); + isc__nm_async_tcpdnsconnect(&mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + isc__nm_put_netievent_tcpdnsconnect(mgr, ievent); + } else { + atomic_init(&sock->active, false); + sock->tid = isc_random_uniform(mgr->nworkers); + isc__nm_enqueue_ievent(&mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + } + + LOCK(&sock->lock); + while (sock->result == ISC_R_UNSET) { + WAIT(&sock->cond, &sock->lock); + } + atomic_store(&sock->active, true); + BROADCAST(&sock->scond); + UNLOCK(&sock->lock); +} + +static uv_os_sock_t +isc__nm_tcpdns_lb_socket(isc_nm_t *mgr, sa_family_t sa_family) { + isc_result_t result; + uv_os_sock_t sock; + + result = isc__nm_socket(sa_family, SOCK_STREAM, 0, &sock); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + (void)isc__nm_socket_incoming_cpu(sock); + (void)isc__nm_socket_v6only(sock, sa_family); + + /* FIXME: set mss */ + + result = isc__nm_socket_reuse(sock); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + if (mgr->load_balance_sockets) { + result = isc__nm_socket_reuse_lb(sock); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + } + + return (sock); +} + +static void +enqueue_stoplistening(isc_nmsocket_t *sock) { + isc__netievent_tcpdnsstop_t *ievent = + isc__nm_get_netievent_tcpdnsstop(sock->mgr, sock); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +static void +start_tcpdns_child(isc_nm_t *mgr, isc_sockaddr_t *iface, isc_nmsocket_t *sock, + uv_os_sock_t fd, int tid) { + isc__netievent_tcpdnslisten_t *ievent = NULL; + isc_nmsocket_t *csock = &sock->children[tid]; + + isc__nmsocket_init(csock, mgr, isc_nm_tcpdnssocket, iface); + csock->parent = sock; + csock->accept_cb = sock->accept_cb; + csock->accept_cbarg = sock->accept_cbarg; + csock->recv_cb = sock->recv_cb; + csock->recv_cbarg = sock->recv_cbarg; + csock->extrahandlesize = sock->extrahandlesize; + csock->backlog = sock->backlog; + csock->tid = tid; + /* + * We don't attach to quota, just assign - to avoid + * increasing quota unnecessarily. + */ + csock->pquota = sock->pquota; + isc_quota_cb_init(&csock->quotacb, quota_accept_cb, csock); + + if (mgr->load_balance_sockets) { + UNUSED(fd); + csock->fd = isc__nm_tcpdns_lb_socket(mgr, + iface->type.sa.sa_family); + } else { + csock->fd = dup(fd); + } + REQUIRE(csock->fd >= 0); + + ievent = isc__nm_get_netievent_tcpdnslisten(mgr, csock); + isc__nm_maybe_enqueue_ievent(&mgr->workers[tid], + (isc__netievent_t *)ievent); +} +isc_result_t +isc_nm_listentcpdns(isc_nm_t *mgr, isc_sockaddr_t *iface, + isc_nm_recv_cb_t recv_cb, void *recv_cbarg, + isc_nm_accept_cb_t accept_cb, void *accept_cbarg, + size_t extrahandlesize, int backlog, isc_quota_t *quota, + isc_nmsocket_t **sockp) { + isc_result_t result = ISC_R_SUCCESS; + isc_nmsocket_t *sock = NULL; + size_t children_size = 0; + uv_os_sock_t fd = -1; + + REQUIRE(VALID_NM(mgr)); + + sock = isc_mem_get(mgr->mctx, sizeof(*sock)); + isc__nmsocket_init(sock, mgr, isc_nm_tcpdnslistener, iface); + + atomic_init(&sock->rchildren, 0); + sock->nchildren = mgr->nworkers; + children_size = sock->nchildren * sizeof(sock->children[0]); + sock->children = isc_mem_get(mgr->mctx, children_size); + memset(sock->children, 0, children_size); + + sock->result = ISC_R_UNSET; + sock->accept_cb = accept_cb; + sock->accept_cbarg = accept_cbarg; + sock->recv_cb = recv_cb; + sock->recv_cbarg = recv_cbarg; + sock->extrahandlesize = extrahandlesize; + sock->backlog = backlog; + sock->pquota = quota; + + sock->tid = 0; + sock->fd = -1; + + if (!mgr->load_balance_sockets) { + fd = isc__nm_tcpdns_lb_socket(mgr, iface->type.sa.sa_family); + } + + isc_barrier_init(&sock->startlistening, sock->nchildren); + + for (size_t i = 0; i < sock->nchildren; i++) { + if ((int)i == isc_nm_tid()) { + continue; + } + start_tcpdns_child(mgr, iface, sock, fd, i); + } + + if (isc__nm_in_netthread()) { + start_tcpdns_child(mgr, iface, sock, fd, isc_nm_tid()); + } + + if (!mgr->load_balance_sockets) { + isc__nm_closesocket(fd); + } + + LOCK(&sock->lock); + while (atomic_load(&sock->rchildren) != sock->nchildren) { + WAIT(&sock->cond, &sock->lock); + } + result = sock->result; + atomic_store(&sock->active, true); + UNLOCK(&sock->lock); + + INSIST(result != ISC_R_UNSET); + + if (result == ISC_R_SUCCESS) { + REQUIRE(atomic_load(&sock->rchildren) == sock->nchildren); + *sockp = sock; + } else { + atomic_store(&sock->active, false); + enqueue_stoplistening(sock); + isc_nmsocket_close(&sock); + } + + return (result); +} + +void +isc__nm_async_tcpdnslisten(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpdnslisten_t *ievent = + (isc__netievent_tcpdnslisten_t *)ev0; + sa_family_t sa_family; + int r; + int flags = 0; + isc_nmsocket_t *sock = NULL; + isc_result_t result = ISC_R_UNSET; + isc_nm_t *mgr = NULL; + + REQUIRE(VALID_NMSOCK(ievent->sock)); + REQUIRE(ievent->sock->tid == isc_nm_tid()); + REQUIRE(VALID_NMSOCK(ievent->sock->parent)); + + sock = ievent->sock; + sa_family = sock->iface.type.sa.sa_family; + mgr = sock->mgr; + + REQUIRE(sock->type == isc_nm_tcpdnssocket); + REQUIRE(sock->parent != NULL); + REQUIRE(sock->tid == isc_nm_tid()); + + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); + + r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); + UV_RUNTIME_CHECK(uv_tcp_init, r); + uv_handle_set_data(&sock->uv_handle.handle, sock); + /* This keeps the socket alive after everything else is gone */ + isc__nmsocket_attach(sock, &(isc_nmsocket_t *){ NULL }); + + r = uv_timer_init(&worker->loop, &sock->read_timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + + LOCK(&sock->parent->lock); + + r = uv_tcp_open(&sock->uv_handle.tcp, sock->fd); + if (r < 0) { + isc__nm_closesocket(sock->fd); + isc__nm_incstats(sock, STATID_OPENFAIL); + goto done; + } + isc__nm_incstats(sock, STATID_OPEN); + + if (sa_family == AF_INET6) { + flags = UV_TCP_IPV6ONLY; + } + + if (mgr->load_balance_sockets) { + r = isc_uv_tcp_freebind(&sock->uv_handle.tcp, + &sock->iface.type.sa, flags); + if (r < 0) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + } else { + if (sock->parent->fd == -1) { + r = isc_uv_tcp_freebind(&sock->uv_handle.tcp, + &sock->iface.type.sa, flags); + if (r < 0) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + sock->parent->uv_handle.tcp.flags = + sock->uv_handle.tcp.flags; + sock->parent->fd = sock->fd; + } else { + /* The socket is already bound, just copy the flags */ + sock->uv_handle.tcp.flags = + sock->parent->uv_handle.tcp.flags; + } + } + + isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle); + + /* + * The callback will run in the same thread uv_listen() was called + * from, so a race with tcpdns_connection_cb() isn't possible. + */ + r = uv_listen((uv_stream_t *)&sock->uv_handle.tcp, sock->backlog, + tcpdns_connection_cb); + if (r != 0) { + isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_NETMGR, ISC_LOG_ERROR, + "uv_listen failed: %s", + isc_result_totext(isc__nm_uverr2result(r))); + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + + atomic_store(&sock->listening, true); + +done: + result = isc__nm_uverr2result(r); + if (result != ISC_R_SUCCESS) { + sock->pquota = NULL; + } + + atomic_fetch_add(&sock->parent->rchildren, 1); + if (sock->parent->result == ISC_R_UNSET) { + sock->parent->result = result; + } + SIGNAL(&sock->parent->cond); + UNLOCK(&sock->parent->lock); + + isc_barrier_wait(&sock->parent->startlistening); +} + +static void +tcpdns_connection_cb(uv_stream_t *server, int status) { + isc_nmsocket_t *ssock = uv_handle_get_data((uv_handle_t *)server); + isc_result_t result; + isc_quota_t *quota = NULL; + + if (status != 0) { + result = isc__nm_uverr2result(status); + goto done; + } + + REQUIRE(VALID_NMSOCK(ssock)); + REQUIRE(ssock->tid == isc_nm_tid()); + + if (isc__nmsocket_closing(ssock)) { + result = ISC_R_CANCELED; + goto done; + } + + if (ssock->pquota != NULL) { + result = isc_quota_attach_cb(ssock->pquota, "a, + &ssock->quotacb); + if (result == ISC_R_QUOTA) { + isc__nm_incstats(ssock, STATID_ACCEPTFAIL); + goto done; + } + } + + result = accept_connection(ssock, quota); +done: + isc__nm_accept_connection_log(result, can_log_tcpdns_quota()); +} + +void +isc__nm_tcpdns_stoplistening(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tcpdnslistener); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + UNREACHABLE(); + } + + if (!isc__nm_in_netthread()) { + enqueue_stoplistening(sock); + } else { + stop_tcpdns_parent(sock); + } +} + +void +isc__nm_async_tcpdnsstop(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpdnsstop_t *ievent = + (isc__netievent_tcpdnsstop_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + if (sock->parent != NULL) { + stop_tcpdns_child(sock); + return; + } + + stop_tcpdns_parent(sock); +} + +void +isc__nm_tcpdns_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(result != ISC_R_SUCCESS); + + isc__nmsocket_timer_stop(sock); + isc__nm_stop_reading(sock); + + if (!sock->recv_read) { + goto destroy; + } + sock->recv_read = false; + + if (sock->recv_cb != NULL) { + isc__nm_uvreq_t *req = isc__nm_get_read_req(sock, NULL); + isc__nmsocket_clearcb(sock); + isc__nm_readcb(sock, req, result); + } + +destroy: + isc__nmsocket_prep_destroy(sock); + + /* + * We need to detach from quota after the read callback function had a + * chance to be executed. + */ + if (sock->quota != NULL) { + isc_quota_detach(&sock->quota); + } +} + +void +isc__nm_tcpdns_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + isc_nmsocket_t *sock = handle->sock; + isc__netievent_tcpdnsread_t *ievent = NULL; + + REQUIRE(sock->type == isc_nm_tcpdnssocket); + REQUIRE(sock->statichandle == handle); + + sock->recv_cb = cb; + sock->recv_cbarg = cbarg; + sock->recv_read = true; + if (sock->read_timeout == 0) { + sock->read_timeout = + (atomic_load(&sock->keepalive) + ? atomic_load(&sock->mgr->keepalive) + : atomic_load(&sock->mgr->idle)); + } + + ievent = isc__nm_get_netievent_tcpdnsread(sock->mgr, sock); + + /* + * This MUST be done asynchronously, no matter which thread we're + * in. The callback function for isc_nm_read() often calls + * isc_nm_read() again; if we tried to do that synchronously + * we'd clash in processbuffer() and grow the stack indefinitely. + */ + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + + return; +} + +void +isc__nm_async_tcpdnsread(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpdnsread_t *ievent = + (isc__netievent_tcpdnsread_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc_result_t result; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + if (isc__nmsocket_closing(sock)) { + result = ISC_R_CANCELED; + } else { + result = isc__nm_process_sock_buffer(sock); + } + + if (result != ISC_R_SUCCESS) { + atomic_store(&sock->reading, true); + isc__nm_failed_read_cb(sock, result, false); + } +} + +/* + * Process a single packet from the incoming buffer. + * + * Return ISC_R_SUCCESS and attach 'handlep' to a handle if something + * was processed; return ISC_R_NOMORE if there isn't a full message + * to be processed. + * + * The caller will need to unreference the handle. + */ +isc_result_t +isc__nm_tcpdns_processbuffer(isc_nmsocket_t *sock) { + size_t len; + isc__nm_uvreq_t *req = NULL; + isc_nmhandle_t *handle = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + if (isc__nmsocket_closing(sock)) { + return (ISC_R_CANCELED); + } + + /* + * If we don't even have the length yet, we can't do + * anything. + */ + if (sock->buf_len < 2) { + return (ISC_R_NOMORE); + } + + /* + * Process the first packet from the buffer, leaving + * the rest (if any) for later. + */ + len = ntohs(*(uint16_t *)sock->buf); + if (len > sock->buf_len - 2) { + return (ISC_R_NOMORE); + } + + if (sock->recv_cb == NULL) { + /* + * recv_cb has been cleared - there is + * nothing to do + */ + return (ISC_R_CANCELED); + } else if (sock->statichandle == NULL && + atomic_load(&sock->connected) && + !atomic_load(&sock->connecting)) + { + /* + * It seems that some unexpected data (a DNS message) has + * arrived while we are wrapping up. + */ + return (ISC_R_CANCELED); + } + + req = isc__nm_get_read_req(sock, NULL); + REQUIRE(VALID_UVREQ(req)); + + /* + * We need to launch isc__nm_resume_processing() after the buffer + * has been consumed, thus we must delay detaching the handle. + */ + isc_nmhandle_attach(req->handle, &handle); + + /* + * The callback will be called synchronously because the + * result is ISC_R_SUCCESS, so we don't need to have + * the buffer on the heap + */ + req->uvbuf.base = (char *)sock->buf + 2; + req->uvbuf.len = len; + + /* + * If isc__nm_tcpdns_read() was called, it will be satisfied by single + * DNS message in the next call. + */ + sock->recv_read = false; + + /* + * An assertion failure here means that there's an erroneous + * extra nmhandle detach happening in the callback and + * isc__nm_resume_processing() is called while we're + * processing the buffer. + */ + REQUIRE(sock->processing == false); + sock->processing = true; + isc__nm_readcb(sock, req, ISC_R_SUCCESS); + sock->processing = false; + + len += 2; + sock->buf_len -= len; + if (sock->buf_len > 0) { + memmove(sock->buf, sock->buf + len, sock->buf_len); + } + + isc_nmhandle_detach(&handle); + + return (ISC_R_SUCCESS); +} + +void +isc__nm_tcpdns_read_cb(uv_stream_t *stream, ssize_t nread, + const uv_buf_t *buf) { + isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)stream); + uint8_t *base = NULL; + size_t len; + isc_result_t result; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->reading)); + REQUIRE(buf != NULL); + + if (isc__nmsocket_closing(sock)) { + isc__nm_failed_read_cb(sock, ISC_R_CANCELED, true); + goto free; + } + + if (nread < 0) { + if (nread != UV_EOF) { + isc__nm_incstats(sock, STATID_RECVFAIL); + } + + isc__nm_failed_read_cb(sock, isc__nm_uverr2result(nread), true); + goto free; + } + + base = (uint8_t *)buf->base; + len = nread; + + /* + * FIXME: We can avoid the memmove here if we know we have received full + * packet; e.g. we should be smarter, a.s. there are just few situations + * + * The tcp_alloc_buf should be smarter and point the uv_read_start to + * the position where previous read has ended in the sock->buf, that way + * the data could be read directly into sock->buf. + */ + + if (sock->buf_len + len > sock->buf_size) { + isc__nm_alloc_dnsbuf(sock, sock->buf_len + len); + } + memmove(sock->buf + sock->buf_len, base, len); + sock->buf_len += len; + + if (!atomic_load(&sock->client)) { + sock->read_timeout = atomic_load(&sock->mgr->idle); + } + + result = isc__nm_process_sock_buffer(sock); + if (result != ISC_R_SUCCESS) { + isc__nm_failed_read_cb(sock, result, true); + } +free: + if (nread < 0) { + /* + * The buffer may be a null buffer on error. + */ + if (buf->base == NULL && buf->len == 0) { + return; + } + } + + isc__nm_free_uvbuf(sock, buf); +} + +static void +quota_accept_cb(isc_quota_t *quota, void *sock0) { + isc_nmsocket_t *sock = (isc_nmsocket_t *)sock0; + + REQUIRE(VALID_NMSOCK(sock)); + + /* + * Create a tcpdnsaccept event and pass it using the async channel. + */ + + isc__netievent_tcpdnsaccept_t *ievent = + isc__nm_get_netievent_tcpdnsaccept(sock->mgr, sock, quota); + isc__nm_maybe_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +/* + * This is called after we get a quota_accept_cb() callback. + */ +void +isc__nm_async_tcpdnsaccept(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpdnsaccept_t *ievent = + (isc__netievent_tcpdnsaccept_t *)ev0; + isc_result_t result; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(ievent->sock)); + REQUIRE(ievent->sock->tid == isc_nm_tid()); + + result = accept_connection(ievent->sock, ievent->quota); + isc__nm_accept_connection_log(result, can_log_tcpdns_quota()); +} + +static isc_result_t +accept_connection(isc_nmsocket_t *ssock, isc_quota_t *quota) { + isc_nmsocket_t *csock = NULL; + isc__networker_t *worker = NULL; + int r; + isc_result_t result; + struct sockaddr_storage peer_ss; + struct sockaddr_storage local_ss; + isc_sockaddr_t local; + isc_nmhandle_t *handle = NULL; + + REQUIRE(VALID_NMSOCK(ssock)); + REQUIRE(ssock->tid == isc_nm_tid()); + + if (isc__nmsocket_closing(ssock)) { + if (quota != NULL) { + isc_quota_detach("a); + } + return (ISC_R_CANCELED); + } + + REQUIRE(ssock->accept_cb != NULL); + + csock = isc_mem_get(ssock->mgr->mctx, sizeof(isc_nmsocket_t)); + isc__nmsocket_init(csock, ssock->mgr, isc_nm_tcpdnssocket, + &ssock->iface); + csock->tid = ssock->tid; + csock->extrahandlesize = ssock->extrahandlesize; + isc__nmsocket_attach(ssock, &csock->server); + csock->recv_cb = ssock->recv_cb; + csock->recv_cbarg = ssock->recv_cbarg; + csock->quota = quota; + atomic_init(&csock->accepting, true); + + worker = &csock->mgr->workers[csock->tid]; + + r = uv_tcp_init(&worker->loop, &csock->uv_handle.tcp); + UV_RUNTIME_CHECK(uv_tcp_init, r); + uv_handle_set_data(&csock->uv_handle.handle, csock); + + r = uv_timer_init(&worker->loop, &csock->read_timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + uv_handle_set_data((uv_handle_t *)&csock->read_timer, csock); + + r = uv_accept(&ssock->uv_handle.stream, &csock->uv_handle.stream); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto failure; + } + + r = uv_tcp_getpeername(&csock->uv_handle.tcp, + (struct sockaddr *)&peer_ss, + &(int){ sizeof(peer_ss) }); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto failure; + } + + result = isc_sockaddr_fromsockaddr(&csock->peer, + (struct sockaddr *)&peer_ss); + if (result != ISC_R_SUCCESS) { + goto failure; + } + + r = uv_tcp_getsockname(&csock->uv_handle.tcp, + (struct sockaddr *)&local_ss, + &(int){ sizeof(local_ss) }); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto failure; + } + + result = isc_sockaddr_fromsockaddr(&local, + (struct sockaddr *)&local_ss); + if (result != ISC_R_SUCCESS) { + goto failure; + } + + /* + * The handle will be either detached on acceptcb failure or in the + * readcb. + */ + handle = isc__nmhandle_get(csock, NULL, &local); + + result = ssock->accept_cb(handle, ISC_R_SUCCESS, ssock->accept_cbarg); + if (result != ISC_R_SUCCESS) { + isc_nmhandle_detach(&handle); + goto failure; + } + + atomic_store(&csock->accepting, false); + + isc__nm_incstats(csock, STATID_ACCEPT); + + csock->read_timeout = atomic_load(&csock->mgr->init); + + csock->closehandle_cb = isc__nm_resume_processing; + + /* + * We need to keep the handle alive until we fail to read or connection + * is closed by the other side, it will be detached via + * prep_destroy()->tcpdns_close_direct(). + */ + isc_nmhandle_attach(handle, &csock->recv_handle); + result = isc__nm_process_sock_buffer(csock); + if (result != ISC_R_SUCCESS) { + isc_nmhandle_detach(&csock->recv_handle); + isc_nmhandle_detach(&handle); + goto failure; + } + + /* + * The initial timer has been set, update the read timeout for the next + * reads. + */ + csock->read_timeout = (atomic_load(&csock->keepalive) + ? atomic_load(&csock->mgr->keepalive) + : atomic_load(&csock->mgr->idle)); + + isc_nmhandle_detach(&handle); + + /* + * sock is now attached to the handle. + */ + isc__nmsocket_detach(&csock); + + return (ISC_R_SUCCESS); + +failure: + + atomic_store(&csock->active, false); + + isc__nm_failed_accept_cb(csock, result); + + isc__nmsocket_prep_destroy(csock); + + isc__nmsocket_detach(&csock); + + return (result); +} + +void +isc__nm_tcpdns_send(isc_nmhandle_t *handle, isc_region_t *region, + isc_nm_cb_t cb, void *cbarg) { + isc__netievent_tcpdnssend_t *ievent = NULL; + isc__nm_uvreq_t *uvreq = NULL; + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + + sock = handle->sock; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tcpdnssocket); + + uvreq = isc__nm_uvreq_get(sock->mgr, sock); + *(uint16_t *)uvreq->tcplen = htons(region->length); + uvreq->uvbuf.base = (char *)region->base; + uvreq->uvbuf.len = region->length; + + isc_nmhandle_attach(handle, &uvreq->handle); + + uvreq->cb.send = cb; + uvreq->cbarg = cbarg; + + ievent = isc__nm_get_netievent_tcpdnssend(sock->mgr, sock, uvreq); + isc__nm_maybe_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + + return; +} + +static void +tcpdns_send_cb(uv_write_t *req, int status) { + isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data; + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_UVREQ(uvreq)); + REQUIRE(VALID_NMSOCK(uvreq->sock)); + + sock = uvreq->sock; + + isc_nm_timer_stop(uvreq->timer); + isc_nm_timer_detach(&uvreq->timer); + + if (status < 0) { + isc__nm_incstats(sock, STATID_SENDFAIL); + isc__nm_failed_send_cb(sock, uvreq, + isc__nm_uverr2result(status)); + return; + } + + isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false); +} + +/* + * Handle 'tcpsend' async event - send a packet on the socket + */ +void +isc__nm_async_tcpdnssend(isc__networker_t *worker, isc__netievent_t *ev0) { + isc_result_t result; + isc__netievent_tcpdnssend_t *ievent = + (isc__netievent_tcpdnssend_t *)ev0; + isc_nmsocket_t *sock = NULL; + isc__nm_uvreq_t *uvreq = NULL; + int r, nbufs = 2; + + UNUSED(worker); + + REQUIRE(VALID_UVREQ(ievent->req)); + REQUIRE(VALID_NMSOCK(ievent->sock)); + REQUIRE(ievent->sock->type == isc_nm_tcpdnssocket); + REQUIRE(ievent->sock->tid == isc_nm_tid()); + + sock = ievent->sock; + uvreq = ievent->req; + + if (sock->write_timeout == 0) { + sock->write_timeout = + (atomic_load(&sock->keepalive) + ? atomic_load(&sock->mgr->keepalive) + : atomic_load(&sock->mgr->idle)); + } + + uv_buf_t bufs[2] = { { .base = uvreq->tcplen, .len = 2 }, + { .base = uvreq->uvbuf.base, + .len = uvreq->uvbuf.len } }; + + if (isc__nmsocket_closing(sock)) { + result = ISC_R_CANCELED; + goto fail; + } + + r = uv_try_write(&sock->uv_handle.stream, bufs, nbufs); + + if (r == (int)(bufs[0].len + bufs[1].len)) { + /* Wrote everything */ + isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, true); + return; + } + + if (r == 1) { + /* Partial write of DNSMSG length */ + bufs[0].base = uvreq->tcplen + 1; + bufs[0].len = 1; + } else if (r > 0) { + /* Partial write of DNSMSG */ + nbufs = 1; + bufs[0].base = uvreq->uvbuf.base + (r - 2); + bufs[0].len = uvreq->uvbuf.len - (r - 2); + } else if (r == UV_ENOSYS || r == UV_EAGAIN) { + /* uv_try_write not supported, send asynchronously */ + } else { + /* error sending data */ + result = isc__nm_uverr2result(r); + goto fail; + } + + r = uv_write(&uvreq->uv_req.write, &sock->uv_handle.stream, bufs, nbufs, + tcpdns_send_cb); + if (r < 0) { + result = isc__nm_uverr2result(r); + goto fail; + } + + isc_nm_timer_create(uvreq->handle, isc__nmsocket_writetimeout_cb, uvreq, + &uvreq->timer); + if (sock->write_timeout > 0) { + isc_nm_timer_start(uvreq->timer, sock->write_timeout); + } + + return; +fail: + isc__nm_incstats(sock, STATID_SENDFAIL); + isc__nm_failed_send_cb(sock, uvreq, result); +} + +static void +tcpdns_stop_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->closing)); + + uv_handle_set_data(handle, NULL); + + if (!atomic_compare_exchange_strong(&sock->closed, &(bool){ false }, + true)) + { + UNREACHABLE(); + } + + isc__nm_incstats(sock, STATID_CLOSE); + + atomic_store(&sock->listening, false); + + isc__nmsocket_detach(&sock); +} + +static void +tcpdns_close_sock(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->closing)); + + if (!atomic_compare_exchange_strong(&sock->closed, &(bool){ false }, + true)) + { + UNREACHABLE(); + } + + isc__nm_incstats(sock, STATID_CLOSE); + + if (sock->server != NULL) { + isc__nmsocket_detach(&sock->server); + } + + atomic_store(&sock->connected, false); + + isc__nmsocket_prep_destroy(sock); +} + +static void +tcpdns_close_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + + uv_handle_set_data(handle, NULL); + + tcpdns_close_sock(sock); +} + +static void +read_timer_close_cb(uv_handle_t *timer) { + isc_nmsocket_t *sock = uv_handle_get_data(timer); + uv_handle_set_data(timer, NULL); + + REQUIRE(VALID_NMSOCK(sock)); + + if (sock->parent) { + uv_close(&sock->uv_handle.handle, tcpdns_stop_cb); + } else if (uv_is_closing(&sock->uv_handle.handle)) { + tcpdns_close_sock(sock); + } else { + uv_close(&sock->uv_handle.handle, tcpdns_close_cb); + } +} + +static void +stop_tcpdns_child(isc_nmsocket_t *sock) { + REQUIRE(sock->type == isc_nm_tcpdnssocket); + REQUIRE(sock->tid == isc_nm_tid()); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + return; + } + + tcpdns_close_direct(sock); + + atomic_fetch_sub(&sock->parent->rchildren, 1); + + isc_barrier_wait(&sock->parent->stoplistening); +} + +static void +stop_tcpdns_parent(isc_nmsocket_t *sock) { + isc_nmsocket_t *csock = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->type == isc_nm_tcpdnslistener); + + isc_barrier_init(&sock->stoplistening, sock->nchildren); + + for (size_t i = 0; i < sock->nchildren; i++) { + csock = &sock->children[i]; + REQUIRE(VALID_NMSOCK(csock)); + + if ((int)i == isc_nm_tid()) { + /* + * We need to schedule closing the other sockets first + */ + continue; + } + + atomic_store(&csock->active, false); + enqueue_stoplistening(csock); + } + + csock = &sock->children[isc_nm_tid()]; + atomic_store(&csock->active, false); + stop_tcpdns_child(csock); + + atomic_store(&sock->closed, true); + isc__nmsocket_prep_destroy(sock); +} + +static void +tcpdns_close_direct(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->closing)); + + if (sock->quota != NULL) { + isc_quota_detach(&sock->quota); + } + + if (sock->recv_handle != NULL) { + isc_nmhandle_detach(&sock->recv_handle); + } + + isc__nmsocket_timer_stop(sock); + isc__nm_stop_reading(sock); + + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + uv_close((uv_handle_t *)&sock->read_timer, read_timer_close_cb); +} + +void +isc__nm_tcpdns_close(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tcpdnssocket); + REQUIRE(!isc__nmsocket_active(sock)); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + return; + } + + if (sock->tid == isc_nm_tid()) { + tcpdns_close_direct(sock); + } else { + /* + * We need to create an event and pass it using async channel + */ + isc__netievent_tcpdnsclose_t *ievent = + isc__nm_get_netievent_tcpdnsclose(sock->mgr, sock); + + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + } +} + +void +isc__nm_async_tcpdnsclose(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpdnsclose_t *ievent = + (isc__netievent_tcpdnsclose_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + tcpdns_close_direct(sock); +} + +static void +tcpdns_close_connect_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + + REQUIRE(VALID_NMSOCK(sock)); + + REQUIRE(isc__nm_in_netthread()); + REQUIRE(sock->tid == isc_nm_tid()); + + isc__nmsocket_prep_destroy(sock); + isc__nmsocket_detach(&sock); +} + +void +isc__nm_tcpdns_shutdown(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->type == isc_nm_tcpdnssocket); + + /* + * If the socket is active, mark it inactive and + * continue. If it isn't active, stop now. + */ + if (!isc__nmsocket_deactivate(sock)) { + return; + } + + if (atomic_load(&sock->accepting)) { + return; + } + + if (atomic_load(&sock->connecting)) { + isc_nmsocket_t *tsock = NULL; + isc__nmsocket_attach(sock, &tsock); + uv_close(&sock->uv_handle.handle, tcpdns_close_connect_cb); + return; + } + + if (sock->statichandle != NULL) { + if (isc__nm_closing(sock)) { + isc__nm_failed_read_cb(sock, ISC_R_SHUTTINGDOWN, false); + } else { + isc__nm_failed_read_cb(sock, ISC_R_CANCELED, false); + } + return; + } + + /* + * Otherwise, we just send the socket to abyss... + */ + if (sock->parent == NULL) { + isc__nmsocket_prep_destroy(sock); + } +} + +void +isc__nm_tcpdns_cancelread(isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + isc__netievent_tcpdnscancel_t *ievent = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + + sock = handle->sock; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tcpdnssocket); + + ievent = isc__nm_get_netievent_tcpdnscancel(sock->mgr, sock, handle); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +void +isc__nm_async_tcpdnscancel(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tcpdnscancel_t *ievent = + (isc__netievent_tcpdnscancel_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + isc__nm_failed_read_cb(sock, ISC_R_EOF, false); +} diff --git a/lib/isc/netmgr/timer.c b/lib/isc/netmgr/timer.c new file mode 100644 index 0000000..8328775 --- /dev/null +++ b/lib/isc/netmgr/timer.c @@ -0,0 +1,120 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#include <uv.h> + +#include <isc/netmgr.h> +#include <isc/util.h> + +#include "netmgr-int.h" + +struct isc_nm_timer { + isc_refcount_t references; + uv_timer_t timer; + isc_nmhandle_t *handle; + isc_nm_timer_cb cb; + void *cbarg; +}; + +void +isc_nm_timer_create(isc_nmhandle_t *handle, isc_nm_timer_cb cb, void *cbarg, + isc_nm_timer_t **timerp) { + isc__networker_t *worker = NULL; + isc_nmsocket_t *sock = NULL; + isc_nm_timer_t *timer = NULL; + int r; + + REQUIRE(isc__nm_in_netthread()); + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + sock = handle->sock; + worker = &sock->mgr->workers[isc_nm_tid()]; + + /* TODO: per-loop object cache */ + timer = isc_mem_get(sock->mgr->mctx, sizeof(*timer)); + *timer = (isc_nm_timer_t){ .cb = cb, .cbarg = cbarg }; + isc_refcount_init(&timer->references, 1); + isc_nmhandle_attach(handle, &timer->handle); + + r = uv_timer_init(&worker->loop, &timer->timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + + uv_handle_set_data((uv_handle_t *)&timer->timer, timer); + + *timerp = timer; +} + +void +isc_nm_timer_attach(isc_nm_timer_t *timer, isc_nm_timer_t **timerp) { + REQUIRE(timer != NULL); + REQUIRE(timerp != NULL && *timerp == NULL); + + isc_refcount_increment(&timer->references); + *timerp = timer; +} + +static void +timer_destroy(uv_handle_t *uvhandle) { + isc_nm_timer_t *timer = uv_handle_get_data(uvhandle); + isc_nmhandle_t *handle = timer->handle; + isc_mem_t *mctx = timer->handle->sock->mgr->mctx; + + isc_mem_put(mctx, timer, sizeof(*timer)); + + isc_nmhandle_detach(&handle); +} + +void +isc_nm_timer_detach(isc_nm_timer_t **timerp) { + isc_nm_timer_t *timer = NULL; + isc_nmhandle_t *handle = NULL; + + REQUIRE(timerp != NULL && *timerp != NULL); + + timer = *timerp; + *timerp = NULL; + + handle = timer->handle; + + REQUIRE(isc__nm_in_netthread()); + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + if (isc_refcount_decrement(&timer->references) == 1) { + int r = uv_timer_stop(&timer->timer); + UV_RUNTIME_CHECK(uv_timer_stop, r); + uv_close((uv_handle_t *)&timer->timer, timer_destroy); + } +} + +static void +timer_cb(uv_timer_t *uvtimer) { + isc_nm_timer_t *timer = uv_handle_get_data((uv_handle_t *)uvtimer); + + REQUIRE(timer->cb != NULL); + + timer->cb(timer->cbarg, ISC_R_TIMEDOUT); +} + +void +isc_nm_timer_start(isc_nm_timer_t *timer, uint64_t timeout) { + int r = uv_timer_start(&timer->timer, timer_cb, timeout, 0); + UV_RUNTIME_CHECK(uv_timer_start, r); +} + +void +isc_nm_timer_stop(isc_nm_timer_t *timer) { + int r = uv_timer_stop(&timer->timer); + UV_RUNTIME_CHECK(uv_timer_stop, r); +} diff --git a/lib/isc/netmgr/tlsdns.c b/lib/isc/netmgr/tlsdns.c new file mode 100644 index 0000000..d30e33f --- /dev/null +++ b/lib/isc/netmgr/tlsdns.c @@ -0,0 +1,2363 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#include <libgen.h> +#include <unistd.h> +#include <uv.h> + +#include <isc/atomic.h> +#include <isc/barrier.h> +#include <isc/buffer.h> +#include <isc/condition.h> +#include <isc/errno.h> +#include <isc/log.h> +#include <isc/magic.h> +#include <isc/mem.h> +#include <isc/netmgr.h> +#include <isc/quota.h> +#include <isc/random.h> +#include <isc/refcount.h> +#include <isc/region.h> +#include <isc/result.h> +#include <isc/sockaddr.h> +#include <isc/stdtime.h> +#include <isc/thread.h> +#include <isc/util.h> + +#include "netmgr-int.h" +#include "openssl_shim.h" +#include "uv-compat.h" + +static atomic_uint_fast32_t last_tlsdnsquota_log = 0; + +static void +tls_error(isc_nmsocket_t *sock, isc_result_t result); + +static isc_result_t +tlsdns_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req); + +static void +tlsdns_close_direct(isc_nmsocket_t *sock); + +static isc_result_t +tlsdns_send_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req); +static void +tlsdns_connect_cb(uv_connect_t *uvreq, int status); + +static void +tlsdns_connection_cb(uv_stream_t *server, int status); + +static void +tlsdns_close_cb(uv_handle_t *uvhandle); + +static isc_result_t +accept_connection(isc_nmsocket_t *ssock, isc_quota_t *quota); + +static void +quota_accept_cb(isc_quota_t *quota, void *sock0); + +static void +stop_tlsdns_parent(isc_nmsocket_t *sock); +static void +stop_tlsdns_child(isc_nmsocket_t *sock); + +static void +async_tlsdns_cycle(isc_nmsocket_t *sock) __attribute__((unused)); + +static isc_result_t +tls_cycle(isc_nmsocket_t *sock); + +static void +call_pending_send_callbacks(isc_nmsocket_t *sock, const isc_result_t result); + +static void +tlsdns_keep_client_tls_session(isc_nmsocket_t *sock); + +static void +tlsdns_set_tls_shutdown(isc_tls_t *tls) { + (void)SSL_set_shutdown(tls, SSL_SENT_SHUTDOWN); +} + +static bool +peer_verification_has_failed(isc_nmsocket_t *sock) { + if (sock->tls.tls != NULL && sock->tls.state == TLS_STATE_HANDSHAKE && + SSL_get_verify_result(sock->tls.tls) != X509_V_OK) + { + return (true); + } + + return (false); +} + +static bool +can_log_tlsdns_quota(void) { + isc_stdtime_t now, last; + + isc_stdtime_get(&now); + last = atomic_exchange_relaxed(&last_tlsdnsquota_log, now); + if (now != last) { + return (true); + } + + return (false); +} + +static isc_result_t +tlsdns_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { + isc__networker_t *worker = NULL; + isc_result_t result = ISC_R_UNSET; + int r; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(req)); + + REQUIRE(isc__nm_in_netthread()); + REQUIRE(sock->tid == isc_nm_tid()); + + worker = &sock->mgr->workers[sock->tid]; + + atomic_store(&sock->connecting, true); + + /* 2 minute timeout */ + result = isc__nm_socket_connectiontimeout(sock->fd, 120 * 1000); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); + UV_RUNTIME_CHECK(uv_tcp_init, r); + uv_handle_set_data(&sock->uv_handle.handle, sock); + + r = uv_timer_init(&worker->loop, &sock->read_timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + + if (isc__nm_closing(sock)) { + result = ISC_R_SHUTTINGDOWN; + goto error; + } + + r = uv_tcp_open(&sock->uv_handle.tcp, sock->fd); + if (r != 0) { + isc__nm_closesocket(sock->fd); + isc__nm_incstats(sock, STATID_OPENFAIL); + goto done; + } + isc__nm_incstats(sock, STATID_OPEN); + + if (req->local.length != 0) { + r = uv_tcp_bind(&sock->uv_handle.tcp, &req->local.type.sa, 0); + /* + * In case of shared socket UV_EINVAL will be returned and needs + * to be ignored + */ + if (r != 0 && r != UV_EINVAL) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + } + + isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle); + + uv_handle_set_data(&req->uv_req.handle, req); + r = uv_tcp_connect(&req->uv_req.connect, &sock->uv_handle.tcp, + &req->peer.type.sa, tlsdns_connect_cb); + if (r != 0) { + isc__nm_incstats(sock, STATID_CONNECTFAIL); + goto done; + } + + uv_handle_set_data((uv_handle_t *)&sock->read_timer, + &req->uv_req.connect); + isc__nmsocket_timer_start(sock); + + atomic_store(&sock->connected, true); + +done: + result = isc__nm_uverr2result(r); +error: + LOCK(&sock->lock); + sock->result = result; + SIGNAL(&sock->cond); + if (!atomic_load(&sock->active)) { + WAIT(&sock->scond, &sock->lock); + } + INSIST(atomic_load(&sock->active)); + UNLOCK(&sock->lock); + + return (result); +} + +void +isc__nm_async_tlsdnsconnect(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlsdnsconnect_t *ievent = + (isc__netievent_tlsdnsconnect_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *req = ievent->req; + isc_result_t result = ISC_R_SUCCESS; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tlsdnssocket); + REQUIRE(sock->parent == NULL); + REQUIRE(sock->tid == isc_nm_tid()); + + result = tlsdns_connect_direct(sock, req); + if (result != ISC_R_SUCCESS) { + atomic_compare_exchange_enforced(&sock->connecting, + &(bool){ true }, false); + isc__nmsocket_clearcb(sock); + isc__nm_connectcb(sock, req, result, true); + atomic_store(&sock->active, false); + isc__nm_tlsdns_close(sock); + } + + /* + * The sock is now attached to the handle. + */ + isc__nmsocket_detach(&sock); +} + +static void +tlsdns_connect_cb(uv_connect_t *uvreq, int status) { + isc_result_t result = ISC_R_UNSET; + isc__nm_uvreq_t *req = NULL; + isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)uvreq->handle); + struct sockaddr_storage ss; + int r; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + req = uv_handle_get_data((uv_handle_t *)uvreq); + + REQUIRE(VALID_UVREQ(req)); + REQUIRE(VALID_NMHANDLE(req->handle)); + + if (atomic_load(&sock->timedout)) { + result = ISC_R_TIMEDOUT; + goto error; + } else if (isc__nm_closing(sock)) { + /* Network manager shutting down */ + result = ISC_R_SHUTTINGDOWN; + goto error; + } else if (isc__nmsocket_closing(sock)) { + /* Connection canceled */ + result = ISC_R_CANCELED; + goto error; + } else if (status == UV_ETIMEDOUT) { + /* Timeout status code here indicates hard error */ + result = ISC_R_TIMEDOUT; + goto error; + } else if (status == UV_EADDRINUSE) { + /* + * On FreeBSD the TCP connect() call sometimes results in a + * spurious transient EADDRINUSE. Try a few more times before + * giving up. + */ + if (--req->connect_tries > 0) { + r = uv_tcp_connect( + &req->uv_req.connect, &sock->uv_handle.tcp, + &req->peer.type.sa, tlsdns_connect_cb); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto error; + } + return; + } + result = isc__nm_uverr2result(status); + goto error; + } else if (status != 0) { + result = isc__nm_uverr2result(status); + goto error; + } + + isc__nm_incstats(sock, STATID_CONNECT); + r = uv_tcp_getpeername(&sock->uv_handle.tcp, (struct sockaddr *)&ss, + &(int){ sizeof(ss) }); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto error; + } + + sock->tls.state = TLS_STATE_NONE; + sock->tls.tls = isc_tls_create(sock->tls.ctx); + RUNTIME_CHECK(sock->tls.tls != NULL); + + /* + * + */ + r = BIO_new_bio_pair(&sock->tls.ssl_wbio, ISC_NETMGR_TCP_RECVBUF_SIZE, + &sock->tls.app_rbio, ISC_NETMGR_TCP_RECVBUF_SIZE); + RUNTIME_CHECK(r == 1); + + r = BIO_new_bio_pair(&sock->tls.ssl_rbio, ISC_NETMGR_TCP_RECVBUF_SIZE, + &sock->tls.app_wbio, ISC_NETMGR_TCP_RECVBUF_SIZE); + RUNTIME_CHECK(r == 1); + +#if HAVE_SSL_SET0_RBIO && HAVE_SSL_SET0_WBIO + /* + * Note that if the rbio and wbio are the same then + * SSL_set0_rbio() and SSL_set0_wbio() each take ownership of + * one reference. Therefore it may be necessary to increment the + * number of references available using BIO_up_ref(3) before + * calling the set0 functions. + */ + SSL_set0_rbio(sock->tls.tls, sock->tls.ssl_rbio); + SSL_set0_wbio(sock->tls.tls, sock->tls.ssl_wbio); +#else + SSL_set_bio(sock->tls.tls, sock->tls.ssl_rbio, sock->tls.ssl_wbio); +#endif + + result = isc_sockaddr_fromsockaddr(&sock->peer, (struct sockaddr *)&ss); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + if (sock->tls.client_sess_cache != NULL) { + isc_tlsctx_client_session_cache_reuse_sockaddr( + sock->tls.client_sess_cache, &sock->peer, + sock->tls.tls); + } + + SSL_set_connect_state(sock->tls.tls); + + /* Setting pending req */ + sock->tls.pending_req = req; + + result = isc__nm_process_sock_buffer(sock); + if (result != ISC_R_SUCCESS) { + sock->tls.pending_req = NULL; + goto error; + } + + result = tls_cycle(sock); + if (result != ISC_R_SUCCESS) { + sock->tls.pending_req = NULL; + goto error; + } + + return; + +error: + isc__nm_failed_connect_cb(sock, req, result, false); +} + +void +isc_nm_tlsdnsconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, + isc_nm_cb_t cb, void *cbarg, unsigned int timeout, + size_t extrahandlesize, isc_tlsctx_t *sslctx, + isc_tlsctx_client_session_cache_t *client_sess_cache) { + isc_result_t result = ISC_R_SUCCESS; + isc_nmsocket_t *sock = NULL; + isc__netievent_tlsdnsconnect_t *ievent = NULL; + isc__nm_uvreq_t *req = NULL; + sa_family_t sa_family; + + REQUIRE(VALID_NM(mgr)); + REQUIRE(local != NULL); + REQUIRE(peer != NULL); + REQUIRE(sslctx != NULL); + + sa_family = peer->type.sa.sa_family; + + sock = isc_mem_get(mgr->mctx, sizeof(*sock)); + isc__nmsocket_init(sock, mgr, isc_nm_tlsdnssocket, local); + + sock->extrahandlesize = extrahandlesize; + sock->connect_timeout = timeout; + sock->result = ISC_R_UNSET; + isc_tlsctx_attach(sslctx, &sock->tls.ctx); + atomic_init(&sock->client, true); + atomic_init(&sock->connecting, true); + + req = isc__nm_uvreq_get(mgr, sock); + req->cb.connect = cb; + req->cbarg = cbarg; + req->peer = *peer; + req->local = *local; + req->handle = isc__nmhandle_get(sock, &req->peer, &sock->iface); + + if (client_sess_cache != NULL) { + INSIST(isc_tlsctx_client_session_cache_getctx( + client_sess_cache) == sslctx); + isc_tlsctx_client_session_cache_attach( + client_sess_cache, &sock->tls.client_sess_cache); + } + + result = isc__nm_socket(sa_family, SOCK_STREAM, 0, &sock->fd); + if (result != ISC_R_SUCCESS) { + goto failure; + } + + if (isc__nm_closing(sock)) { + result = ISC_R_SHUTTINGDOWN; + goto failure; + } + + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); + + /* 2 minute timeout */ + result = isc__nm_socket_connectiontimeout(sock->fd, 120 * 1000); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + ievent = isc__nm_get_netievent_tlsdnsconnect(mgr, sock, req); + + if (isc__nm_in_netthread()) { + atomic_store(&sock->active, true); + sock->tid = isc_nm_tid(); + isc__nm_async_tlsdnsconnect(&mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + isc__nm_put_netievent_tlsdnsconnect(mgr, ievent); + } else { + atomic_init(&sock->active, false); + sock->tid = isc_random_uniform(mgr->nworkers); + isc__nm_enqueue_ievent(&mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + } + LOCK(&sock->lock); + while (sock->result == ISC_R_UNSET) { + WAIT(&sock->cond, &sock->lock); + } + atomic_store(&sock->active, true); + BROADCAST(&sock->scond); + UNLOCK(&sock->lock); + return; + +failure: + if (isc__nm_in_netthread()) { + sock->tid = isc_nm_tid(); + } + + atomic_compare_exchange_enforced(&sock->connecting, &(bool){ true }, + false); + isc__nmsocket_clearcb(sock); + isc__nm_connectcb(sock, req, result, true); + atomic_store(&sock->closed, true); + isc__nmsocket_detach(&sock); +} + +static uv_os_sock_t +isc__nm_tlsdns_lb_socket(isc_nm_t *mgr, sa_family_t sa_family) { + isc_result_t result; + uv_os_sock_t sock; + + result = isc__nm_socket(sa_family, SOCK_STREAM, 0, &sock); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + (void)isc__nm_socket_incoming_cpu(sock); + (void)isc__nm_socket_v6only(sock, sa_family); + + /* FIXME: set mss */ + + result = isc__nm_socket_reuse(sock); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + if (mgr->load_balance_sockets) { + result = isc__nm_socket_reuse_lb(sock); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + } + + return (sock); +} + +static void +start_tlsdns_child(isc_nm_t *mgr, isc_sockaddr_t *iface, isc_nmsocket_t *sock, + uv_os_sock_t fd, int tid) { + isc__netievent_tlsdnslisten_t *ievent = NULL; + isc_nmsocket_t *csock = &sock->children[tid]; + + isc__nmsocket_init(csock, mgr, isc_nm_tlsdnssocket, iface); + csock->parent = sock; + csock->accept_cb = sock->accept_cb; + csock->accept_cbarg = sock->accept_cbarg; + csock->recv_cb = sock->recv_cb; + csock->recv_cbarg = sock->recv_cbarg; + csock->extrahandlesize = sock->extrahandlesize; + csock->backlog = sock->backlog; + csock->tid = tid; + isc_tlsctx_attach(sock->tls.ctx, &csock->tls.ctx); + + /* + * We don't attach to quota, just assign - to avoid + * increasing quota unnecessarily. + */ + csock->pquota = sock->pquota; + isc_quota_cb_init(&csock->quotacb, quota_accept_cb, csock); + + if (mgr->load_balance_sockets) { + UNUSED(fd); + csock->fd = isc__nm_tlsdns_lb_socket(mgr, + iface->type.sa.sa_family); + } else { + csock->fd = dup(fd); + } + REQUIRE(csock->fd >= 0); + + ievent = isc__nm_get_netievent_tlsdnslisten(mgr, csock); + isc__nm_maybe_enqueue_ievent(&mgr->workers[tid], + (isc__netievent_t *)ievent); +} + +static void +enqueue_stoplistening(isc_nmsocket_t *sock) { + isc__netievent_tlsdnsstop_t *ievent = + isc__nm_get_netievent_tlsdnsstop(sock->mgr, sock); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +isc_result_t +isc_nm_listentlsdns(isc_nm_t *mgr, isc_sockaddr_t *iface, + isc_nm_recv_cb_t recv_cb, void *recv_cbarg, + isc_nm_accept_cb_t accept_cb, void *accept_cbarg, + size_t extrahandlesize, int backlog, isc_quota_t *quota, + isc_tlsctx_t *sslctx, isc_nmsocket_t **sockp) { + isc_result_t result = ISC_R_SUCCESS; + isc_nmsocket_t *sock = NULL; + size_t children_size = 0; + uv_os_sock_t fd = -1; + + REQUIRE(VALID_NM(mgr)); + + sock = isc_mem_get(mgr->mctx, sizeof(*sock)); + isc__nmsocket_init(sock, mgr, isc_nm_tlsdnslistener, iface); + + atomic_init(&sock->rchildren, 0); + sock->nchildren = mgr->nworkers; + children_size = sock->nchildren * sizeof(sock->children[0]); + sock->children = isc_mem_get(mgr->mctx, children_size); + memset(sock->children, 0, children_size); + + sock->result = ISC_R_UNSET; + sock->accept_cb = accept_cb; + sock->accept_cbarg = accept_cbarg; + sock->recv_cb = recv_cb; + sock->recv_cbarg = recv_cbarg; + sock->extrahandlesize = extrahandlesize; + sock->backlog = backlog; + sock->pquota = quota; + + isc_tlsctx_attach(sslctx, &sock->tls.ctx); + + sock->tid = 0; + sock->fd = -1; + + if (!mgr->load_balance_sockets) { + fd = isc__nm_tlsdns_lb_socket(mgr, iface->type.sa.sa_family); + } + + isc_barrier_init(&sock->startlistening, sock->nchildren); + + for (size_t i = 0; i < sock->nchildren; i++) { + if ((int)i == isc_nm_tid()) { + continue; + } + start_tlsdns_child(mgr, iface, sock, fd, i); + } + + if (isc__nm_in_netthread()) { + start_tlsdns_child(mgr, iface, sock, fd, isc_nm_tid()); + } + + if (!mgr->load_balance_sockets) { + isc__nm_closesocket(fd); + } + + LOCK(&sock->lock); + while (atomic_load(&sock->rchildren) != sock->nchildren) { + WAIT(&sock->cond, &sock->lock); + } + result = sock->result; + atomic_store(&sock->active, true); + UNLOCK(&sock->lock); + + INSIST(result != ISC_R_UNSET); + + if (result == ISC_R_SUCCESS) { + REQUIRE(atomic_load(&sock->rchildren) == sock->nchildren); + *sockp = sock; + } else { + atomic_store(&sock->active, false); + enqueue_stoplistening(sock); + isc_nmsocket_close(&sock); + } + + return (result); +} + +void +isc__nm_async_tlsdnslisten(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlsdnslisten_t *ievent = + (isc__netievent_tlsdnslisten_t *)ev0; + sa_family_t sa_family; + int r; + int flags = 0; + isc_nmsocket_t *sock = NULL; + isc_result_t result = ISC_R_UNSET; + isc_nm_t *mgr; + + REQUIRE(VALID_NMSOCK(ievent->sock)); + REQUIRE(ievent->sock->tid == isc_nm_tid()); + REQUIRE(VALID_NMSOCK(ievent->sock->parent)); + + sock = ievent->sock; + sa_family = sock->iface.type.sa.sa_family; + mgr = sock->mgr; + + REQUIRE(sock->type == isc_nm_tlsdnssocket); + REQUIRE(sock->parent != NULL); + REQUIRE(sock->tid == isc_nm_tid()); + + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); + + r = uv_tcp_init(&worker->loop, &sock->uv_handle.tcp); + UV_RUNTIME_CHECK(uv_tcp_init, r); + uv_handle_set_data(&sock->uv_handle.handle, sock); + /* This keeps the socket alive after everything else is gone */ + isc__nmsocket_attach(sock, &(isc_nmsocket_t *){ NULL }); + + r = uv_timer_init(&worker->loop, &sock->read_timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + + LOCK(&sock->parent->lock); + + r = uv_tcp_open(&sock->uv_handle.tcp, sock->fd); + if (r < 0) { + isc__nm_closesocket(sock->fd); + isc__nm_incstats(sock, STATID_OPENFAIL); + goto done; + } + isc__nm_incstats(sock, STATID_OPEN); + + if (sa_family == AF_INET6) { + flags = UV_TCP_IPV6ONLY; + } + + if (mgr->load_balance_sockets) { + r = isc_uv_tcp_freebind(&sock->uv_handle.tcp, + &sock->iface.type.sa, flags); + if (r < 0) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + } else { + if (sock->parent->fd == -1) { + r = isc_uv_tcp_freebind(&sock->uv_handle.tcp, + &sock->iface.type.sa, flags); + if (r < 0) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + sock->parent->uv_handle.tcp.flags = + sock->uv_handle.tcp.flags; + sock->parent->fd = sock->fd; + } else { + /* The socket is already bound, just copy the flags */ + sock->uv_handle.tcp.flags = + sock->parent->uv_handle.tcp.flags; + } + } + + isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle); + + /* + * The callback will run in the same thread uv_listen() was + * called from, so a race with tlsdns_connection_cb() isn't + * possible. + */ + r = uv_listen((uv_stream_t *)&sock->uv_handle.tcp, sock->backlog, + tlsdns_connection_cb); + if (r != 0) { + isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_NETMGR, ISC_LOG_ERROR, + "uv_listen failed: %s", + isc_result_totext(isc__nm_uverr2result(r))); + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + + atomic_store(&sock->listening, true); + +done: + result = isc__nm_uverr2result(r); + if (result != ISC_R_SUCCESS) { + sock->pquota = NULL; + } + + atomic_fetch_add(&sock->parent->rchildren, 1); + if (sock->parent->result == ISC_R_UNSET) { + sock->parent->result = result; + } + SIGNAL(&sock->parent->cond); + UNLOCK(&sock->parent->lock); + + isc_barrier_wait(&sock->parent->startlistening); +} + +static void +tlsdns_connection_cb(uv_stream_t *server, int status) { + isc_nmsocket_t *ssock = uv_handle_get_data((uv_handle_t *)server); + isc_result_t result; + isc_quota_t *quota = NULL; + + if (status != 0) { + result = isc__nm_uverr2result(status); + goto done; + } + + REQUIRE(VALID_NMSOCK(ssock)); + REQUIRE(ssock->tid == isc_nm_tid()); + + if (isc__nmsocket_closing(ssock)) { + result = ISC_R_CANCELED; + goto done; + } + + if (ssock->pquota != NULL) { + result = isc_quota_attach_cb(ssock->pquota, "a, + &ssock->quotacb); + if (result == ISC_R_QUOTA) { + isc__nm_incstats(ssock, STATID_ACCEPTFAIL); + goto done; + } + } + + result = accept_connection(ssock, quota); +done: + isc__nm_accept_connection_log(result, can_log_tlsdns_quota()); +} + +void +isc__nm_tlsdns_stoplistening(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tlsdnslistener); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + UNREACHABLE(); + } + + if (!isc__nm_in_netthread()) { + enqueue_stoplistening(sock); + } else { + stop_tlsdns_parent(sock); + } +} + +static void +tls_shutdown(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + + isc__netievent_tlsdnsshutdown_t *ievent = + isc__nm_get_netievent_tlsdnsshutdown(sock->mgr, sock); + isc__nm_maybe_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +void +isc__nm_async_tlsdnsshutdown(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlsdnsshutdown_t *ievent = + (isc__netievent_tlsdnsshutdown_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + int rv; + int err; + isc_result_t result; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(ievent->sock)); + + if (sock->tls.state != TLS_STATE_IO) { + /* Nothing to do */ + return; + } + + rv = SSL_shutdown(sock->tls.tls); + + if (rv == 1) { + sock->tls.state = TLS_STATE_NONE; + /* FIXME: continue closing the socket */ + return; + } + + if (rv == 0) { + result = tls_cycle(sock); + if (result != ISC_R_SUCCESS) { + tls_error(sock, result); + return; + } + + /* Reschedule closing the socket */ + tls_shutdown(sock); + return; + } + + err = SSL_get_error(sock->tls.tls, rv); + + switch (err) { + case SSL_ERROR_WANT_READ: + case SSL_ERROR_WANT_WRITE: + case SSL_ERROR_WANT_X509_LOOKUP: + result = tls_cycle(sock); + if (result != ISC_R_SUCCESS) { + tls_error(sock, result); + return; + } + + /* Reschedule closing the socket */ + tls_shutdown(sock); + return; + case 0: + UNREACHABLE(); + case SSL_ERROR_ZERO_RETURN: + tls_error(sock, ISC_R_EOF); + break; + default: + tls_error(sock, ISC_R_TLSERROR); + } + return; +} + +void +isc__nm_async_tlsdnsstop(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlsdnsstop_t *ievent = + (isc__netievent_tlsdnsstop_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + if (sock->parent != NULL) { + stop_tlsdns_child(sock); + return; + } + + stop_tlsdns_parent(sock); +} + +void +isc__nm_tlsdns_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result, + bool async) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(result != ISC_R_SUCCESS); + + isc__nmsocket_timer_stop(sock); + isc__nm_stop_reading(sock); + + if (sock->tls.pending_req != NULL) { + isc_result_t failure_result = ISC_R_CANCELED; + isc__nm_uvreq_t *req = sock->tls.pending_req; + sock->tls.pending_req = NULL; + + if (peer_verification_has_failed(sock)) { + /* + * Save error message as 'sock->tls' will get detached. + */ + sock->tls.tls_verify_errmsg = + isc_tls_verify_peer_result_string( + sock->tls.tls); + failure_result = ISC_R_TLSBADPEERCERT; + } + isc__nm_failed_connect_cb(sock, req, failure_result, async); + } + + if (!sock->recv_read) { + goto destroy; + } + sock->recv_read = false; + + if (sock->recv_cb != NULL) { + isc__nm_uvreq_t *req = isc__nm_get_read_req(sock, NULL); + isc__nmsocket_clearcb(sock); + isc__nm_readcb(sock, req, result); + } + +destroy: + call_pending_send_callbacks(sock, result); + isc__nmsocket_prep_destroy(sock); + + /* + * We need to detach from quota after the read callback function + * had a chance to be executed. + */ + if (sock->quota != NULL) { + isc_quota_detach(&sock->quota); + } +} + +void +isc__nm_tlsdns_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + isc_nmsocket_t *sock = handle->sock; + isc__netievent_tlsdnsread_t *ievent = NULL; + + REQUIRE(sock->type == isc_nm_tlsdnssocket); + REQUIRE(sock->statichandle == handle); + + sock->recv_cb = cb; + sock->recv_cbarg = cbarg; + sock->recv_read = true; + if (sock->read_timeout == 0) { + sock->read_timeout = + (atomic_load(&sock->keepalive) + ? atomic_load(&sock->mgr->keepalive) + : atomic_load(&sock->mgr->idle)); + } + + ievent = isc__nm_get_netievent_tlsdnsread(sock->mgr, sock); + + /* + * This MUST be done asynchronously, no matter which thread + * we're in. The callback function for isc_nm_read() often calls + * isc_nm_read() again; if we tried to do that synchronously + * we'd clash in processbuffer() and grow the stack + * indefinitely. + */ + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + + return; +} + +void +isc__nm_async_tlsdnsread(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlsdnsread_t *ievent = + (isc__netievent_tlsdnsread_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc_result_t result = ISC_R_SUCCESS; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + if (isc__nmsocket_closing(sock)) { + atomic_store(&sock->reading, true); + isc__nm_failed_read_cb(sock, ISC_R_CANCELED, false); + return; + } + + result = tls_cycle(sock); + if (result != ISC_R_SUCCESS) { + isc__nm_failed_read_cb(sock, result, false); + } +} + +/* + * Process a single packet from the incoming buffer. + * + * Return ISC_R_SUCCESS and attach 'handlep' to a handle if something + * was processed; return ISC_R_NOMORE if there isn't a full message + * to be processed. + * + * The caller will need to unreference the handle. + */ +isc_result_t +isc__nm_tlsdns_processbuffer(isc_nmsocket_t *sock) { + size_t len; + isc__nm_uvreq_t *req = NULL; + isc_nmhandle_t *handle = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + if (isc__nmsocket_closing(sock)) { + return (ISC_R_CANCELED); + } + + /* + * If we don't even have the length yet, we can't do + * anything. + */ + if (sock->buf_len < 2) { + return (ISC_R_NOMORE); + } + + /* + * Process the first packet from the buffer, leaving + * the rest (if any) for later. + */ + len = ntohs(*(uint16_t *)sock->buf); + if (len > sock->buf_len - 2) { + return (ISC_R_NOMORE); + } + + if (sock->recv_cb == NULL) { + /* + * recv_cb has been cleared - there is + * nothing to do + */ + return (ISC_R_CANCELED); + } else if (sock->statichandle == NULL && + sock->tls.state == TLS_STATE_IO && + atomic_load(&sock->connected) && + !atomic_load(&sock->connecting)) + { + /* + * It seems that some unexpected data (a DNS message) has + * arrived while we are wrapping up. + */ + return (ISC_R_CANCELED); + } + + req = isc__nm_get_read_req(sock, NULL); + REQUIRE(VALID_UVREQ(req)); + + /* + * We need to launch isc__nm_resume_processing() after the buffer + * has been consumed, thus we must delay detaching the handle. + */ + isc_nmhandle_attach(req->handle, &handle); + + /* + * The callback will be called synchronously because the + * result is ISC_R_SUCCESS, so we don't need to have + * the buffer on the heap + */ + req->uvbuf.base = (char *)sock->buf + 2; + req->uvbuf.len = len; + + /* + * If isc__nm_tlsdns_read() was called, it will be satisfied by + * single DNS message in the next call. + */ + sock->recv_read = false; + + /* + * An assertion failure here means that there's an erroneous + * extra nmhandle detach happening in the callback and + * isc__nm_resume_processing() is called while we're + * processing the buffer. + */ + REQUIRE(sock->processing == false); + sock->processing = true; + isc__nm_readcb(sock, req, ISC_R_SUCCESS); + sock->processing = false; + + len += 2; + sock->buf_len -= len; + if (sock->buf_len > 0) { + memmove(sock->buf, sock->buf + len, sock->buf_len); + } + + isc_nmhandle_detach(&handle); + + if (isc__nmsocket_closing(sock)) { + tlsdns_set_tls_shutdown(sock->tls.tls); + tlsdns_keep_client_tls_session(sock); + } + + return (ISC_R_SUCCESS); +} + +static isc_result_t +tls_cycle_input(isc_nmsocket_t *sock) { + isc_result_t result = ISC_R_SUCCESS; + int err = 0; + int rv = 1; + + if (sock->tls.state == TLS_STATE_IO) { + size_t len; + + for (;;) { + (void)SSL_peek(sock->tls.tls, &(char){ '\0' }, 0); + + int pending = SSL_pending(sock->tls.tls); + if (pending > (int)ISC_NETMGR_TCP_RECVBUF_SIZE) { + pending = (int)ISC_NETMGR_TCP_RECVBUF_SIZE; + } + + if (pending != 0) { + if ((sock->buf_len + pending) > sock->buf_size) + { + isc__nm_alloc_dnsbuf( + sock, sock->buf_len + pending); + } + + len = 0; + rv = SSL_read_ex(sock->tls.tls, + sock->buf + sock->buf_len, + sock->buf_size - sock->buf_len, + &len); + if (rv != 1) { + /* + * Process what's in the buffer so far + */ + result = isc__nm_process_sock_buffer( + sock); + if (result != ISC_R_SUCCESS) { + goto failure; + } + /* + * FIXME: Should we call + * isc__nm_failed_read_cb()? + */ + break; + } + + INSIST((size_t)pending == len); + + sock->buf_len += len; + } + result = isc__nm_process_sock_buffer(sock); + if (result != ISC_R_SUCCESS) { + goto failure; + } + + if (pending == 0) { + break; + } + } + } else if (!SSL_is_init_finished(sock->tls.tls)) { + if (SSL_is_server(sock->tls.tls)) { + rv = SSL_accept(sock->tls.tls); + } else { + rv = SSL_connect(sock->tls.tls); + } + + } else { + rv = 1; + } + + if (rv <= 0) { + err = SSL_get_error(sock->tls.tls, rv); + } + + switch (err) { + case SSL_ERROR_WANT_READ: + if (sock->tls.state == TLS_STATE_NONE && + !SSL_is_init_finished(sock->tls.tls)) + { + sock->tls.state = TLS_STATE_HANDSHAKE; + result = isc__nm_process_sock_buffer(sock); + if (result != ISC_R_SUCCESS) { + goto failure; + } + } + /* else continue reading */ + break; + case SSL_ERROR_WANT_WRITE: + async_tlsdns_cycle(sock); + break; + case SSL_ERROR_WANT_X509_LOOKUP: + /* Continue reading/writing */ + break; + case 0: + /* Everything is ok, continue */ + break; + case SSL_ERROR_ZERO_RETURN: + return (ISC_R_EOF); + default: + return (ISC_R_TLSERROR); + } + + /* Stop state after handshake */ + if (sock->tls.state == TLS_STATE_HANDSHAKE && + SSL_is_init_finished(sock->tls.tls)) + { + const unsigned char *alpn = NULL; + unsigned int alpnlen = 0; + + isc__nmsocket_log_tls_session_reuse(sock, sock->tls.tls); + + isc_tls_get_selected_alpn(sock->tls.tls, &alpn, &alpnlen); + if (alpn != NULL && alpnlen == ISC_TLS_DOT_PROTO_ALPN_ID_LEN && + memcmp(ISC_TLS_DOT_PROTO_ALPN_ID, alpn, + ISC_TLS_DOT_PROTO_ALPN_ID_LEN) == 0) + { + sock->tls.alpn_negotiated = true; + } + + sock->tls.state = TLS_STATE_IO; + + if (SSL_is_server(sock->tls.tls)) { + REQUIRE(sock->recv_handle != NULL); + result = sock->accept_cb(sock->recv_handle, + ISC_R_SUCCESS, + sock->accept_cbarg); + + if (result != ISC_R_SUCCESS) { + isc_nmhandle_detach(&sock->recv_handle); + goto failure; + } + } else { + isc__nm_uvreq_t *req = sock->tls.pending_req; + sock->tls.pending_req = NULL; + + isc__nmsocket_timer_stop(sock); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, + sock); + + atomic_compare_exchange_enforced( + &sock->connecting, &(bool){ true }, false); + isc__nm_connectcb(sock, req, ISC_R_SUCCESS, true); + } + async_tlsdns_cycle(sock); + } +failure: + return (result); +} + +static void +tls_error(isc_nmsocket_t *sock, isc_result_t result) { + switch (sock->tls.state) { + case TLS_STATE_HANDSHAKE: + case TLS_STATE_IO: + if (atomic_load(&sock->connecting)) { + isc__nm_uvreq_t *req = sock->tls.pending_req; + sock->tls.pending_req = NULL; + + isc__nm_failed_connect_cb(sock, req, result, false); + } else { + isc__nm_tlsdns_failed_read_cb(sock, result, false); + } + break; + case TLS_STATE_ERROR: + return; + default: + break; + } + + sock->tls.state = TLS_STATE_ERROR; + sock->tls.pending_error = result; + + isc__nmsocket_shutdown(sock); +} + +static void +call_pending_send_callbacks(isc_nmsocket_t *sock, const isc_result_t result) { + isc__nm_uvreq_t *cbreq = ISC_LIST_HEAD(sock->tls.sendreqs); + while (cbreq != NULL) { + isc__nm_uvreq_t *next = ISC_LIST_NEXT(cbreq, link); + ISC_LIST_UNLINK(sock->tls.sendreqs, cbreq, link); + INSIST(sock == cbreq->handle->sock); + isc__nm_sendcb(sock, cbreq, result, false); + cbreq = next; + } +} + +static void +free_senddata(isc_nmsocket_t *sock, const isc_result_t result) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tls.senddata.base != NULL); + REQUIRE(sock->tls.senddata.length > 0); + + isc_mem_put(sock->mgr->mctx, sock->tls.senddata.base, + sock->tls.senddata.length); + sock->tls.senddata.base = NULL; + sock->tls.senddata.length = 0; + + call_pending_send_callbacks(sock, result); +} + +static void +tls_write_cb(uv_write_t *req, int status) { + isc_result_t result = status != 0 ? isc__nm_uverr2result(status) + : ISC_R_SUCCESS; + isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data; + isc_nmsocket_t *sock = uvreq->sock; + + isc_nm_timer_stop(uvreq->timer); + isc_nm_timer_detach(&uvreq->timer); + + free_senddata(sock, result); + + isc__nm_uvreq_put(&uvreq, sock); + + if (status != 0) { + tls_error(sock, result); + return; + } + + result = tls_cycle(sock); + if (result != ISC_R_SUCCESS) { + tls_error(sock, result); + return; + } +} + +static isc_result_t +tls_cycle_output(isc_nmsocket_t *sock) { + isc_result_t result = ISC_R_SUCCESS; + int pending; + + while ((pending = BIO_pending(sock->tls.app_rbio)) > 0) { + isc__nm_uvreq_t *req = NULL; + size_t bytes; + int rv; + int r; + + if (sock->tls.senddata.base != NULL || + sock->tls.senddata.length > 0) + { + break; + } + + if (pending > (int)ISC_NETMGR_TCP_RECVBUF_SIZE) { + pending = (int)ISC_NETMGR_TCP_RECVBUF_SIZE; + } + + sock->tls.senddata.base = isc_mem_get(sock->mgr->mctx, pending); + sock->tls.senddata.length = pending; + + /* It's a bit misnomer here, but it does the right thing */ + req = isc__nm_get_read_req(sock, NULL); + req->uvbuf.base = (char *)sock->tls.senddata.base; + req->uvbuf.len = sock->tls.senddata.length; + + rv = BIO_read_ex(sock->tls.app_rbio, req->uvbuf.base, + req->uvbuf.len, &bytes); + + RUNTIME_CHECK(rv == 1); + INSIST((size_t)pending == bytes); + + r = uv_try_write(&sock->uv_handle.stream, &req->uvbuf, 1); + + if (r == pending) { + /* Wrote everything, restart */ + isc__nm_uvreq_put(&req, sock); + free_senddata(sock, ISC_R_SUCCESS); + continue; + } + + if (r > 0) { + /* Partial write, send rest asynchronously */ + memmove(req->uvbuf.base, req->uvbuf.base + r, + req->uvbuf.len - r); + req->uvbuf.len = req->uvbuf.len - r; + } else if (r == UV_ENOSYS || r == UV_EAGAIN) { + /* uv_try_write is not supported, send + * asynchronously */ + } else { + result = isc__nm_uverr2result(r); + isc__nm_uvreq_put(&req, sock); + free_senddata(sock, result); + break; + } + + r = uv_write(&req->uv_req.write, &sock->uv_handle.stream, + &req->uvbuf, 1, tls_write_cb); + if (r < 0) { + result = isc__nm_uverr2result(r); + isc__nm_uvreq_put(&req, sock); + free_senddata(sock, result); + break; + } + + isc_nm_timer_create(req->handle, isc__nmsocket_writetimeout_cb, + req, &req->timer); + if (sock->write_timeout > 0) { + isc_nm_timer_start(req->timer, sock->write_timeout); + } + + break; + } + + return (result); +} + +static isc_result_t +tls_pop_error(isc_nmsocket_t *sock) { + isc_result_t result; + + if (sock->tls.state != TLS_STATE_ERROR) { + return (ISC_R_SUCCESS); + } + + if (sock->tls.pending_error == ISC_R_SUCCESS) { + return (ISC_R_TLSERROR); + } + + result = sock->tls.pending_error; + sock->tls.pending_error = ISC_R_SUCCESS; + + return (result); +} + +static isc_result_t +tls_cycle(isc_nmsocket_t *sock) { + isc_result_t result; + + /* + * Clear the TLS error queue so that SSL_get_error() and SSL I/O + * routine calls will not get affected by prior error statuses. + * + * See here: + * https://www.openssl.org/docs/man3.0/man3/SSL_get_error.html + * + * In particular, it mentions the following: + * + * The current thread's error queue must be empty before the + * TLS/SSL I/O operation is attempted, or SSL_get_error() will not + * work reliably. + * + * As we use the result of SSL_get_error() to decide on I/O + * operations, we need to ensure that it works reliably by + * cleaning the error queue. + * + * The sum of details: https://stackoverflow.com/a/37980911 + */ + ERR_clear_error(); + + if (isc__nmsocket_closing(sock)) { + return (ISC_R_CANCELED); + } + + result = tls_pop_error(sock); + if (result != ISC_R_SUCCESS) { + goto done; + } + + if (sock->tls.cycle) { + return (ISC_R_SUCCESS); + } + + sock->tls.cycle = true; + result = tls_cycle_input(sock); + if (result != ISC_R_SUCCESS) { + goto done; + } + + result = tls_cycle_output(sock); + if (result != ISC_R_SUCCESS) { + goto done; + } +done: + sock->tls.cycle = false; + + return (result); +} + +static void +async_tlsdns_cycle(isc_nmsocket_t *sock) { + isc__netievent_tlsdnscycle_t *ievent = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + + /* Socket was closed midflight by isc__nm_tlsdns_shutdown() */ + if (isc__nmsocket_closing(sock)) { + return; + } + + ievent = isc__nm_get_netievent_tlsdnscycle(sock->mgr, sock); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +void +isc__nm_async_tlsdnscycle(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlsdnscycle_t *ievent = + (isc__netievent_tlsdnscycle_t *)ev0; + isc_result_t result; + isc_nmsocket_t *sock; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(ievent->sock)); + REQUIRE(ievent->sock->tid == isc_nm_tid()); + + sock = ievent->sock; + + result = tls_cycle(sock); + if (result != ISC_R_SUCCESS) { + tls_error(sock, result); + } +} + +void +isc__nm_tlsdns_read_cb(uv_stream_t *stream, ssize_t nread, + const uv_buf_t *buf) { + isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)stream); + size_t len; + isc_result_t result; + int rv; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->reading)); + REQUIRE(buf != NULL); + + if (isc__nmsocket_closing(sock)) { + isc__nm_failed_read_cb(sock, ISC_R_CANCELED, true); + goto free; + } + + if (nread < 0) { + if (nread != UV_EOF) { + isc__nm_incstats(sock, STATID_RECVFAIL); + } + + isc__nm_failed_read_cb(sock, isc__nm_uverr2result(nread), true); + + goto free; + } + + if (!atomic_load(&sock->client)) { + sock->read_timeout = atomic_load(&sock->mgr->idle); + } + + /* + * The input has to be fed into BIO + */ + rv = BIO_write_ex(sock->tls.app_wbio, buf->base, (size_t)nread, &len); + + if (rv <= 0 || (size_t)nread != len) { + isc__nm_failed_read_cb(sock, ISC_R_TLSERROR, true); + goto free; + } + + result = tls_cycle(sock); + if (result != ISC_R_SUCCESS) { + isc__nm_failed_read_cb(sock, result, true); + } +free: + async_tlsdns_cycle(sock); + + if (nread < 0) { + /* + * The buffer may be a null buffer on error. + */ + if (buf->base == NULL && buf->len == 0) { + return; + } + } + + isc__nm_free_uvbuf(sock, buf); +} + +static void +quota_accept_cb(isc_quota_t *quota, void *sock0) { + isc_nmsocket_t *sock = (isc_nmsocket_t *)sock0; + + REQUIRE(VALID_NMSOCK(sock)); + + /* + * Create a tlsdnsaccept event and pass it using the async + * channel. + */ + + isc__netievent_tlsdnsaccept_t *ievent = + isc__nm_get_netievent_tlsdnsaccept(sock->mgr, sock, quota); + isc__nm_maybe_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +/* + * This is called after we get a quota_accept_cb() callback. + */ +void +isc__nm_async_tlsdnsaccept(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlsdnsaccept_t *ievent = + (isc__netievent_tlsdnsaccept_t *)ev0; + isc_result_t result; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(ievent->sock)); + REQUIRE(ievent->sock->tid == isc_nm_tid()); + + result = accept_connection(ievent->sock, ievent->quota); + isc__nm_accept_connection_log(result, can_log_tlsdns_quota()); +} + +static isc_result_t +accept_connection(isc_nmsocket_t *ssock, isc_quota_t *quota) { + isc_nmsocket_t *csock = NULL; + isc__networker_t *worker = NULL; + int r; + isc_result_t result; + struct sockaddr_storage peer_ss; + struct sockaddr_storage local_ss; + isc_sockaddr_t local; + + REQUIRE(VALID_NMSOCK(ssock)); + REQUIRE(ssock->tid == isc_nm_tid()); + + if (isc__nmsocket_closing(ssock)) { + if (quota != NULL) { + isc_quota_detach("a); + } + return (ISC_R_CANCELED); + } + + REQUIRE(ssock->accept_cb != NULL); + + csock = isc_mem_get(ssock->mgr->mctx, sizeof(isc_nmsocket_t)); + isc__nmsocket_init(csock, ssock->mgr, isc_nm_tlsdnssocket, + &ssock->iface); + csock->tid = ssock->tid; + csock->extrahandlesize = ssock->extrahandlesize; + isc__nmsocket_attach(ssock, &csock->server); + csock->accept_cb = ssock->accept_cb; + csock->accept_cbarg = ssock->accept_cbarg; + csock->recv_cb = ssock->recv_cb; + csock->recv_cbarg = ssock->recv_cbarg; + csock->quota = quota; + atomic_init(&csock->accepting, true); + + worker = &csock->mgr->workers[csock->tid]; + + r = uv_tcp_init(&worker->loop, &csock->uv_handle.tcp); + UV_RUNTIME_CHECK(uv_tcp_init, r); + uv_handle_set_data(&csock->uv_handle.handle, csock); + + r = uv_timer_init(&worker->loop, &csock->read_timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + uv_handle_set_data((uv_handle_t *)&csock->read_timer, csock); + + r = uv_accept(&ssock->uv_handle.stream, &csock->uv_handle.stream); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto failure; + } + + r = uv_tcp_getpeername(&csock->uv_handle.tcp, + (struct sockaddr *)&peer_ss, + &(int){ sizeof(peer_ss) }); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto failure; + } + + result = isc_sockaddr_fromsockaddr(&csock->peer, + (struct sockaddr *)&peer_ss); + if (result != ISC_R_SUCCESS) { + goto failure; + } + + r = uv_tcp_getsockname(&csock->uv_handle.tcp, + (struct sockaddr *)&local_ss, + &(int){ sizeof(local_ss) }); + if (r != 0) { + result = isc__nm_uverr2result(r); + goto failure; + } + + result = isc_sockaddr_fromsockaddr(&local, + (struct sockaddr *)&local_ss); + if (result != ISC_R_SUCCESS) { + goto failure; + } + + csock->tls.state = TLS_STATE_NONE; + + csock->tls.tls = isc_tls_create(ssock->tls.ctx); + RUNTIME_CHECK(csock->tls.tls != NULL); + + r = BIO_new_bio_pair(&csock->tls.ssl_wbio, ISC_NETMGR_TCP_RECVBUF_SIZE, + &csock->tls.app_rbio, ISC_NETMGR_TCP_RECVBUF_SIZE); + RUNTIME_CHECK(r == 1); + + r = BIO_new_bio_pair(&csock->tls.ssl_rbio, ISC_NETMGR_TCP_RECVBUF_SIZE, + &csock->tls.app_wbio, ISC_NETMGR_TCP_RECVBUF_SIZE); + RUNTIME_CHECK(r == 1); + +#if HAVE_SSL_SET0_RBIO && HAVE_SSL_SET0_WBIO + /* + * Note that if the rbio and wbio are the same then + * SSL_set0_rbio() and SSL_set0_wbio() each take ownership of + * one reference. Therefore it may be necessary to increment the + * number of references available using BIO_up_ref(3) before + * calling the set0 functions. + */ + SSL_set0_rbio(csock->tls.tls, csock->tls.ssl_rbio); + SSL_set0_wbio(csock->tls.tls, csock->tls.ssl_wbio); +#else + SSL_set_bio(csock->tls.tls, csock->tls.ssl_rbio, csock->tls.ssl_wbio); +#endif + + SSL_set_accept_state(csock->tls.tls); + + /* FIXME: Set SSL_MODE_RELEASE_BUFFERS */ + + atomic_store(&csock->accepting, false); + + isc__nm_incstats(csock, STATID_ACCEPT); + + csock->read_timeout = atomic_load(&csock->mgr->init); + + csock->closehandle_cb = isc__nm_resume_processing; + + /* + * We need to keep the handle alive until we fail to read or + * connection is closed by the other side, it will be detached + * via prep_destroy()->tlsdns_close_direct(). + * + * The handle will be either detached on acceptcb failure or in + * the readcb. + */ + csock->recv_handle = isc__nmhandle_get(csock, NULL, &local); + + /* + * The initial timer has been set, update the read timeout for + * the next reads. + */ + csock->read_timeout = (atomic_load(&csock->keepalive) + ? atomic_load(&csock->mgr->keepalive) + : atomic_load(&csock->mgr->idle)); + + result = isc__nm_process_sock_buffer(csock); + if (result != ISC_R_SUCCESS) { + goto failure; + } + + /* + * sock is now attached to the handle. + */ + isc__nmsocket_detach(&csock); + + return (ISC_R_SUCCESS); + +failure: + atomic_store(&csock->active, false); + + isc__nm_failed_accept_cb(csock, result); + + isc__nmsocket_prep_destroy(csock); + + isc__nmsocket_detach(&csock); + + return (result); +} + +void +isc__nm_tlsdns_send(isc_nmhandle_t *handle, isc_region_t *region, + isc_nm_cb_t cb, void *cbarg) { + isc__netievent_tlsdnssend_t *ievent = NULL; + isc__nm_uvreq_t *uvreq = NULL; + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + + sock = handle->sock; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tlsdnssocket); + + uvreq = isc__nm_uvreq_get(sock->mgr, sock); + *(uint16_t *)uvreq->tcplen = htons(region->length); + uvreq->uvbuf.base = (char *)region->base; + uvreq->uvbuf.len = region->length; + + isc_nmhandle_attach(handle, &uvreq->handle); + + uvreq->cb.send = cb; + uvreq->cbarg = cbarg; + + ievent = isc__nm_get_netievent_tlsdnssend(sock->mgr, sock, uvreq); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + return; +} + +/* + * Handle 'tcpsend' async event - send a packet on the socket + */ +void +isc__nm_async_tlsdnssend(isc__networker_t *worker, isc__netievent_t *ev0) { + isc_result_t result; + isc__netievent_tlsdnssend_t *ievent = + (isc__netievent_tlsdnssend_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *uvreq = ievent->req; + + UNUSED(worker); + + REQUIRE(sock->type == isc_nm_tlsdnssocket); + REQUIRE(sock->tid == isc_nm_tid()); + + if (sock->write_timeout == 0) { + sock->write_timeout = + (atomic_load(&sock->keepalive) + ? atomic_load(&sock->mgr->keepalive) + : atomic_load(&sock->mgr->idle)); + } + + result = tlsdns_send_direct(sock, uvreq); + if (result != ISC_R_SUCCESS) { + isc__nm_incstats(sock, STATID_SENDFAIL); + isc__nm_failed_send_cb(sock, uvreq, result); + } +} + +static void +tlsdns_send_enqueue(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { + isc__netievent_tlsdnssend_t *ievent = + isc__nm_get_netievent_tlsdnssend(sock->mgr, sock, req); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +static isc_result_t +tlsdns_send_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { + isc_result_t result; + int err = 0; + int rv; + size_t bytes = 0; + size_t sendlen; + isc__networker_t *worker = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(req)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->type == isc_nm_tlsdnssocket); + + result = tls_pop_error(sock); + if (result != ISC_R_SUCCESS) { + return (result); + } + + if (isc__nmsocket_closing(sock)) { + return (ISC_R_CANCELED); + } + + /* Writes won't succeed until handshake end */ + if (!SSL_is_init_finished(sock->tls.tls)) { + goto requeue; + } + + /* + * Try to send any pending data before trying to call SSL_write_ex(). + * Otherwise, it could fail with SSL_ERROR_WANT_WRITE error. + * + * It is important to stress that we want to avoid this happening + * due to how SSL_write_ex() works - mainly to avoid partial + * writes. + * + * Although the documentation for these functions is vague, it is + * not stated that partial writes are not possible. On the + * contrary, one can deduce that it is possible and recovering + * from this situation is complicated and unreasonably hard to + * implement to the point when it is better to avoid this + * situation altogether. + * + * In particular, the following can be found in the documentation: + * + * "The write functions will only return with success when the + * complete contents of buf of length num has been written. This + * default behaviour can be changed with the + * SSL_MODE_ENABLE_PARTIAL_WRITE option of + * SSL_CTX_set_mode(3). When this flag is set, the write functions + * will also return with success when a partial write has been + * successfully completed. In this case, the write function + * operation is considered completed. The bytes are sent, and a + * new write call with a new buffer (with the already sent bytes + * removed) must be started. A partial write is performed with the + * size of a message block, which is 16kB." + + * That is, it is said that success is returned only when the + * complete chunk of data is written (encrypted), but it does not + * mention that partial writes are not possible (the behaviour can + * be changed using SSL_MODE_ENABLE_PARTIAL_WRITE). Another + * important aspect of this passage is that a partial write of up + * to 16 kilobytes can happen, and the call still can + * fail. Needless to say, this amount of data may include more + * than one DNS message. + * + * One could expect that SSL_write_ex() should return the number + * of bytes written, but no, that is not guaranteed (emphasis is + * mine): "*On success* SSL_write_ex() will store the number of + * bytes written in *written." + * + * Moreover, we can find the following guidance on how to handle + * the SSL_ERROR_WANT_WRITE error in the "Warnings" section of the + * documentation: + + * "When a write function call has to be repeated because + * SSL_get_error(3) returned SSL_ERROR_WANT_READ or + * SSL_ERROR_WANT_WRITE, it must be repeated with the same + * arguments. The data that was passed might have been partially + * processed. When SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER was set + * using SSL_CTX_set_mode(3) the pointer can be different, but the + * data and length should still be the same." + * + * That is, when a call to SSL_write_ex() fails with + * SSL_ERROR_WANT_WRITE, we must attempt to make the next call to + * the function exactly with the same arguments. Of course, the + * code is structured in such a way that we cannot guarantee that + * (and keeping track of that would be unreasonably complicated to + * implement). The best we can do to avoid this error is to get + * (and send) the outgoing data from the SSL buffer ASAP before + * processing the subsequent write request. We can achieve that by + * calling tls_cycle() and rescheduling the write request for + * being processed later. + */ + if (BIO_pending(sock->tls.app_rbio) > 0) { + /* Handle any pending data and requeue the write request. */ + goto cycle; + } + + /* + * There's no SSL_writev(), so we need to use a local buffer to + * assemble the whole message + */ + worker = &sock->mgr->workers[sock->tid]; + sendlen = req->uvbuf.len + sizeof(uint16_t); + memmove(worker->sendbuf, req->tcplen, sizeof(uint16_t)); + memmove(worker->sendbuf + sizeof(uint16_t), req->uvbuf.base, + req->uvbuf.len); + + rv = SSL_write_ex(sock->tls.tls, worker->sendbuf, sendlen, &bytes); + if (rv > 0) { + INSIST(sendlen == bytes); + + ISC_LIST_APPEND(sock->tls.sendreqs, req, link); + async_tlsdns_cycle(sock); + return (ISC_R_SUCCESS); + } + + /* Nothing was written, maybe enqueue? */ + err = SSL_get_error(sock->tls.tls, rv); + + switch (err) { + case SSL_ERROR_WANT_WRITE: + case SSL_ERROR_WANT_READ: + break; + case 0: + UNREACHABLE(); + default: + return (ISC_R_TLSERROR); + } + +cycle: + result = tls_cycle(sock); + if (result != ISC_R_SUCCESS) { + return (result); + } + +requeue: + tlsdns_send_enqueue(sock, req); + + return (result); +} + +static void +tlsdns_stop_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->closing)); + + uv_handle_set_data(handle, NULL); + + if (!atomic_compare_exchange_strong(&sock->closed, &(bool){ false }, + true)) + { + UNREACHABLE(); + } + + isc__nm_incstats(sock, STATID_CLOSE); + + atomic_store(&sock->listening, false); + + BIO_free_all(sock->tls.app_rbio); + BIO_free_all(sock->tls.app_wbio); + + if (sock->tls.ctx != NULL) { + isc_tlsctx_free(&sock->tls.ctx); + } + + isc__nmsocket_detach(&sock); +} + +static void +tlsdns_close_sock(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->closing)); + + if (!atomic_compare_exchange_strong(&sock->closed, &(bool){ false }, + true)) + { + UNREACHABLE(); + } + + isc__nm_incstats(sock, STATID_CLOSE); + + if (sock->server != NULL) { + isc__nmsocket_detach(&sock->server); + } + + atomic_store(&sock->connected, false); + + if (sock->tls.tls != NULL) { + /* + * Let's shutdown the TLS session properly so that the session + * will remain resumable, if required. + */ + tlsdns_set_tls_shutdown(sock->tls.tls); + tlsdns_keep_client_tls_session(sock); + isc_tls_free(&sock->tls.tls); + } + + BIO_free_all(sock->tls.app_rbio); + BIO_free_all(sock->tls.app_wbio); + + if (sock->tls.ctx != NULL) { + isc_tlsctx_free(&sock->tls.ctx); + } + + isc__nmsocket_prep_destroy(sock); +} + +static void +tlsdns_close_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + uv_handle_set_data(handle, NULL); + + tlsdns_close_sock(sock); +} + +static void +read_timer_close_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + uv_handle_set_data(handle, NULL); + + REQUIRE(VALID_NMSOCK(sock)); + + if (sock->parent) { + uv_close(&sock->uv_handle.handle, tlsdns_stop_cb); + } else if (uv_is_closing(&sock->uv_handle.handle)) { + tlsdns_close_sock(sock); + } else { + uv_close(&sock->uv_handle.handle, tlsdns_close_cb); + } +} + +static void +stop_tlsdns_child(isc_nmsocket_t *sock) { + REQUIRE(sock->type == isc_nm_tlsdnssocket); + REQUIRE(sock->tid == isc_nm_tid()); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + return; + } + + tlsdns_close_direct(sock); + + atomic_fetch_sub(&sock->parent->rchildren, 1); + + isc_barrier_wait(&sock->parent->stoplistening); +} + +static void +stop_tlsdns_parent(isc_nmsocket_t *sock) { + isc_nmsocket_t *csock = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->type == isc_nm_tlsdnslistener); + + isc_barrier_init(&sock->stoplistening, sock->nchildren); + + for (size_t i = 0; i < sock->nchildren; i++) { + csock = &sock->children[i]; + + REQUIRE(VALID_NMSOCK(csock)); + + if ((int)i == isc_nm_tid()) { + /* + * We need to schedule closing the other sockets first + */ + continue; + } + + atomic_store(&csock->active, false); + enqueue_stoplistening(csock); + } + + csock = &sock->children[isc_nm_tid()]; + atomic_store(&csock->active, false); + stop_tlsdns_child(csock); + + atomic_store(&sock->closed, true); + isc__nmsocket_prep_destroy(sock); +} + +static void +tlsdns_close_direct(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->closing)); + + REQUIRE(sock->tls.pending_req == NULL); + + if (sock->quota != NULL) { + isc_quota_detach(&sock->quota); + } + + if (sock->recv_handle != NULL) { + isc_nmhandle_detach(&sock->recv_handle); + } + + isc__nmsocket_timer_stop(sock); + isc__nm_stop_reading(sock); + + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + uv_close((uv_handle_t *)&sock->read_timer, read_timer_close_cb); +} + +void +isc__nm_tlsdns_close(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tlsdnssocket); + REQUIRE(!isc__nmsocket_active(sock)); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + return; + } + + if (sock->tid == isc_nm_tid()) { + tlsdns_close_direct(sock); + } else { + /* + * We need to create an event and pass it using async + * channel + */ + isc__netievent_tlsdnsclose_t *ievent = + isc__nm_get_netievent_tlsdnsclose(sock->mgr, sock); + + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + } +} + +void +isc__nm_async_tlsdnsclose(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlsdnsclose_t *ievent = + (isc__netievent_tlsdnsclose_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + tlsdns_close_direct(sock); +} + +static void +tlsdns_close_connect_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + + REQUIRE(VALID_NMSOCK(sock)); + + REQUIRE(isc__nm_in_netthread()); + REQUIRE(sock->tid == isc_nm_tid()); + + isc__nmsocket_prep_destroy(sock); + isc__nmsocket_detach(&sock); +} + +void +isc__nm_tlsdns_shutdown(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->type == isc_nm_tlsdnssocket); + + /* + * If the socket is active, mark it inactive and + * continue. If it isn't active, stop now. + */ + if (!isc__nmsocket_deactivate(sock)) { + return; + } + + if (sock->tls.tls) { + /* Shutdown any active TLS connections */ + tlsdns_set_tls_shutdown(sock->tls.tls); + } + + if (atomic_load(&sock->accepting)) { + return; + } + + /* TLS handshake hasn't been completed yet */ + if (atomic_load(&sock->connecting)) { + isc_nmsocket_t *tsock = NULL; + + /* + * TCP connection has been established, now waiting on + * TLS handshake to complete + */ + if (sock->tls.pending_req != NULL) { + isc_result_t result = ISC_R_CANCELED; + isc__nm_uvreq_t *req = sock->tls.pending_req; + sock->tls.pending_req = NULL; + + if (peer_verification_has_failed(sock)) { + /* + * Save error message as 'sock->tls' will get + * detached. + */ + sock->tls.tls_verify_errmsg = + isc_tls_verify_peer_result_string( + sock->tls.tls); + result = ISC_R_TLSBADPEERCERT; + } + isc__nm_failed_connect_cb(sock, req, result, false); + return; + } + + /* The TCP connection hasn't been established yet */ + isc__nmsocket_attach(sock, &tsock); + uv_close(&sock->uv_handle.handle, tlsdns_close_connect_cb); + return; + } + + if (sock->statichandle != NULL) { + if (isc__nm_closing(sock)) { + isc__nm_failed_read_cb(sock, ISC_R_SHUTTINGDOWN, false); + } else { + isc__nm_failed_read_cb(sock, ISC_R_CANCELED, false); + } + return; + } + + /* + * Otherwise, we just send the socket to abyss... + */ + if (sock->parent == NULL) { + isc__nmsocket_prep_destroy(sock); + } +} + +void +isc__nm_tlsdns_cancelread(isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + isc__netievent_tlsdnscancel_t *ievent = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + + sock = handle->sock; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tlsdnssocket); + + ievent = isc__nm_get_netievent_tlsdnscancel(sock->mgr, sock, handle); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +void +isc__nm_async_tlsdnscancel(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlsdnscancel_t *ievent = + (isc__netievent_tlsdnscancel_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + isc__nm_failed_read_cb(sock, ISC_R_EOF, false); +} + +/* Zone transfers/updates over TLS are allowed only when "dot" ALPN + * was negotiated. + * + * Per the XoT spec, we must also check that the TLS version is >= + * 1.3. The check could be added here. However, we still need to + * support platforms where no cryptographic library with TLSv1.3 + * support is available. As a result of this we cannot be too strict + * regarding the minimal TLS protocol version in order to make it + * possible to do encrypted zone transfers over TLSv1.2, as it would + * not be right to leave users on these platforms without means for + * encrypted zone transfers using BIND only. + * + * The ones requiring strict compatibility with the specification + * could disable TLSv1.2 in the configuration file. + */ +isc_result_t +isc__nm_tlsdns_xfr_checkperm(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tlsdnssocket); + + if (!sock->tls.alpn_negotiated) { + return (ISC_R_DOTALPNERROR); + } + + return (ISC_R_SUCCESS); +} + +const char * +isc__nm_tlsdns_verify_tls_peer_result_string(const isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(handle->sock->type == isc_nm_tlsdnssocket); + + sock = handle->sock; + if (sock->tls.tls == NULL) { + return (sock->tls.tls_verify_errmsg); + } + + return (isc_tls_verify_peer_result_string(sock->tls.tls)); +} + +void +isc__nm_async_tlsdns_set_tlsctx(isc_nmsocket_t *listener, isc_tlsctx_t *tlsctx, + const int tid) { + REQUIRE(tid >= 0); + + isc_tlsctx_free(&listener->children[tid].tls.ctx); + isc_tlsctx_attach(tlsctx, &listener->children[tid].tls.ctx); +} + +void +isc__nm_tlsdns_cleanup_data(isc_nmsocket_t *sock) { + if (sock->type == isc_nm_tlsdnslistener || + sock->type == isc_nm_tlsdnssocket) + { + if (sock->tls.client_sess_cache != NULL) { + INSIST(atomic_load(&sock->client)); + INSIST(sock->type == isc_nm_tlsdnssocket); + isc_tlsctx_client_session_cache_detach( + &sock->tls.client_sess_cache); + } + if (sock->tls.ctx != NULL) { + INSIST(ISC_LIST_EMPTY(sock->tls.sendreqs)); + isc_tlsctx_free(&sock->tls.ctx); + } + } +} + +static void +tlsdns_keep_client_tls_session(isc_nmsocket_t *sock) { + /* + * Ensure that the isc_tls_t is being accessed from + * within the worker thread the socket is bound to. + */ + REQUIRE(sock->tid == isc_nm_tid()); + if (sock->tls.client_sess_cache != NULL && + sock->tls.client_session_saved == false) + { + INSIST(atomic_load(&sock->client)); + isc_tlsctx_client_session_cache_keep_sockaddr( + sock->tls.client_sess_cache, &sock->peer, + sock->tls.tls); + sock->tls.client_session_saved = true; + } +} diff --git a/lib/isc/netmgr/tlsstream.c b/lib/isc/netmgr/tlsstream.c new file mode 100644 index 0000000..7b49071 --- /dev/null +++ b/lib/isc/netmgr/tlsstream.c @@ -0,0 +1,1348 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#include <errno.h> +#include <libgen.h> +#include <unistd.h> +#include <uv.h> + +#include <openssl/err.h> +#include <openssl/ssl.h> + +#include <isc/atomic.h> +#include <isc/buffer.h> +#include <isc/condition.h> +#include <isc/log.h> +#include <isc/magic.h> +#include <isc/mem.h> +#include <isc/netmgr.h> +#include <isc/once.h> +#include <isc/quota.h> +#include <isc/random.h> +#include <isc/refcount.h> +#include <isc/region.h> +#include <isc/result.h> +#include <isc/sockaddr.h> +#include <isc/stdtime.h> +#include <isc/thread.h> +#include <isc/util.h> + +#include "../openssl_shim.h" +#include "netmgr-int.h" +#include "uv-compat.h" + +#define TLS_BUF_SIZE (UINT16_MAX) + +static isc_result_t +tls_error_to_result(const int tls_err, const int tls_state, isc_tls_t *tls) { + switch (tls_err) { + case SSL_ERROR_ZERO_RETURN: + return (ISC_R_EOF); + case SSL_ERROR_SSL: + if (tls != NULL && tls_state < TLS_IO && + SSL_get_verify_result(tls) != X509_V_OK) + { + return (ISC_R_TLSBADPEERCERT); + } + return (ISC_R_TLSERROR); + default: + return (ISC_R_UNEXPECTED); + } +} + +static void +tls_failed_read_cb(isc_nmsocket_t *sock, const isc_result_t result); + +static void +tls_do_bio(isc_nmsocket_t *sock, isc_region_t *received_data, + isc__nm_uvreq_t *send_data, bool finish); + +static void +tls_readcb(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region, + void *cbarg); + +static void +tls_close_direct(isc_nmsocket_t *sock); + +static void +async_tls_do_bio(isc_nmsocket_t *sock); + +static void +tls_init_listener_tlsctx(isc_nmsocket_t *listener, isc_tlsctx_t *ctx); + +static void +tls_cleanup_listener_tlsctx(isc_nmsocket_t *listener); + +static isc_tlsctx_t * +tls_get_listener_tlsctx(isc_nmsocket_t *listener, const int tid); + +static void +tls_keep_client_tls_session(isc_nmsocket_t *sock); + +static void +tls_try_shutdown(isc_tls_t *tls, const bool quite); + +/* + * The socket is closing, outerhandle has been detached, listener is + * inactive, or the netmgr is closing: any operation on it should abort + * with ISC_R_CANCELED. + */ +static bool +inactive(isc_nmsocket_t *sock) { + return (!isc__nmsocket_active(sock) || atomic_load(&sock->closing) || + sock->outerhandle == NULL || + !isc__nmsocket_active(sock->outerhandle->sock) || + atomic_load(&sock->outerhandle->sock->closing) || + (sock->listener != NULL && + !isc__nmsocket_active(sock->listener)) || + isc__nm_closing(sock)); +} + +static void +tls_call_connect_cb(isc_nmsocket_t *sock, isc_nmhandle_t *handle, + const isc_result_t result) { + if (sock->connect_cb == NULL) { + return; + } + sock->connect_cb(handle, result, sock->connect_cbarg); + if (result != ISC_R_SUCCESS) { + isc__nmsocket_clearcb(handle->sock); + } +} + +static void +tls_senddone(isc_nmhandle_t *handle, isc_result_t eresult, void *cbarg) { + isc_nmsocket_tls_send_req_t *send_req = + (isc_nmsocket_tls_send_req_t *)cbarg; + isc_nmsocket_t *tlssock = NULL; + bool finish = send_req->finish; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(VALID_NMSOCK(send_req->tlssock)); + + tlssock = send_req->tlssock; + send_req->tlssock = NULL; + + if (finish) { + tls_try_shutdown(tlssock->tlsstream.tls, true); + } + + if (send_req->cb != NULL) { + INSIST(VALID_NMHANDLE(tlssock->statichandle)); + send_req->cb(send_req->handle, eresult, send_req->cbarg); + isc_nmhandle_detach(&send_req->handle); + /* The last handle has been just detached: close the underlying + * socket. */ + if (tlssock->statichandle == NULL) { + finish = true; + } + } + + /* We are tying to avoid a memory allocation for small write + * requests. See the mirroring code in the tls_send_outgoing() + * function. */ + if (send_req->data.length > sizeof(send_req->smallbuf)) { + isc_mem_put(handle->sock->mgr->mctx, send_req->data.base, + send_req->data.length); + } else { + INSIST(&send_req->smallbuf[0] == send_req->data.base); + } + isc_mem_put(handle->sock->mgr->mctx, send_req, sizeof(*send_req)); + tlssock->tlsstream.nsending--; + + if (finish && eresult == ISC_R_SUCCESS) { + tlssock->tlsstream.reading = false; + isc_nm_cancelread(handle); + } else if (eresult == ISC_R_SUCCESS) { + tls_do_bio(tlssock, NULL, NULL, false); + } else if (eresult != ISC_R_SUCCESS && + tlssock->tlsstream.state <= TLS_HANDSHAKE && + !tlssock->tlsstream.server) + { + /* + * We are still waiting for the handshake to complete, but + * it isn't going to happen. Call the connect callback, + * passing the error code there. + * + * (Note: tls_failed_read_cb() calls the connect + * rather than the read callback in this case. + * XXX: clarify?) + */ + tls_failed_read_cb(tlssock, eresult); + } + + isc__nmsocket_detach(&tlssock); +} + +static void +tls_failed_read_cb(isc_nmsocket_t *sock, const isc_result_t result) { + bool destroy = true; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(result != ISC_R_SUCCESS); + + if (!sock->tlsstream.server && + (sock->tlsstream.state == TLS_INIT || + sock->tlsstream.state == TLS_HANDSHAKE) && + sock->connect_cb != NULL) + { + isc_nmhandle_t *handle = NULL; + INSIST(sock->statichandle == NULL); + handle = isc__nmhandle_get(sock, &sock->peer, &sock->iface); + tls_call_connect_cb(sock, handle, result); + isc__nmsocket_clearcb(sock); + isc_nmhandle_detach(&handle); + } else if (sock->recv_cb != NULL && sock->statichandle != NULL && + (sock->recv_read || result == ISC_R_TIMEDOUT)) + { + isc__nm_uvreq_t *req = NULL; + INSIST(VALID_NMHANDLE(sock->statichandle)); + sock->recv_read = false; + req = isc__nm_uvreq_get(sock->mgr, sock); + req->cb.recv = sock->recv_cb; + req->cbarg = sock->recv_cbarg; + isc_nmhandle_attach(sock->statichandle, &req->handle); + if (result != ISC_R_TIMEDOUT) { + isc__nmsocket_clearcb(sock); + } + isc__nm_readcb(sock, req, result); + if (result == ISC_R_TIMEDOUT && + (sock->outerhandle == NULL || + isc__nmsocket_timer_running(sock->outerhandle->sock))) + { + destroy = false; + } + } + + if (destroy) { + isc__nmsocket_prep_destroy(sock); + } +} + +static void +async_tls_do_bio(isc_nmsocket_t *sock) { + isc__netievent_tlsdobio_t *ievent = + isc__nm_get_netievent_tlsdobio(sock->mgr, sock); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +static int +tls_send_outgoing(isc_nmsocket_t *sock, bool finish, isc_nmhandle_t *tlshandle, + isc_nm_cb_t cb, void *cbarg) { + isc_nmsocket_tls_send_req_t *send_req = NULL; + int pending; + int rv; + size_t len = 0; + + if (inactive(sock)) { + if (cb != NULL) { + INSIST(VALID_NMHANDLE(tlshandle)); + cb(tlshandle, ISC_R_CANCELED, cbarg); + } + return (0); + } + + if (finish) { + tls_try_shutdown(sock->tlsstream.tls, false); + tls_keep_client_tls_session(sock); + } + + pending = BIO_pending(sock->tlsstream.bio_out); + if (pending <= 0) { + return (pending); + } + + /* TODO Should we keep track of these requests in a list? */ + if ((unsigned int)pending > TLS_BUF_SIZE) { + pending = TLS_BUF_SIZE; + } + + send_req = isc_mem_get(sock->mgr->mctx, sizeof(*send_req)); + *send_req = (isc_nmsocket_tls_send_req_t){ .finish = finish, + .data.length = pending }; + + /* Let's try to avoid a memory allocation for small write requests */ + if ((size_t)pending > sizeof(send_req->smallbuf)) { + send_req->data.base = isc_mem_get(sock->mgr->mctx, pending); + } else { + send_req->data.base = &send_req->smallbuf[0]; + } + + isc__nmsocket_attach(sock, &send_req->tlssock); + if (cb != NULL) { + send_req->cb = cb; + send_req->cbarg = cbarg; + isc_nmhandle_attach(tlshandle, &send_req->handle); + } + + rv = BIO_read_ex(sock->tlsstream.bio_out, send_req->data.base, pending, + &len); + /* There's something pending, read must succeed */ + RUNTIME_CHECK(rv == 1); + + INSIST(VALID_NMHANDLE(sock->outerhandle)); + + sock->tlsstream.nsending++; + isc_nm_send(sock->outerhandle, &send_req->data, tls_senddone, send_req); + + return (pending); +} + +static int +tls_process_outgoing(isc_nmsocket_t *sock, bool finish, + isc__nm_uvreq_t *send_data) { + int pending; + + bool received_shutdown = ((SSL_get_shutdown(sock->tlsstream.tls) & + SSL_RECEIVED_SHUTDOWN) != 0); + bool sent_shutdown = ((SSL_get_shutdown(sock->tlsstream.tls) & + SSL_SENT_SHUTDOWN) != 0); + + if (received_shutdown && !sent_shutdown) { + finish = true; + } + + /* Data from TLS to network */ + if (send_data != NULL) { + pending = tls_send_outgoing(sock, finish, send_data->handle, + send_data->cb.send, + send_data->cbarg); + } else { + pending = tls_send_outgoing(sock, finish, NULL, NULL, NULL); + } + + return (pending); +} + +static int +tls_try_handshake(isc_nmsocket_t *sock, isc_result_t *presult) { + int rv = 0; + isc_nmhandle_t *tlshandle = NULL; + + REQUIRE(sock->tlsstream.state == TLS_HANDSHAKE); + + if (SSL_is_init_finished(sock->tlsstream.tls) == 1) { + return (0); + } + + rv = SSL_do_handshake(sock->tlsstream.tls); + if (rv == 1) { + isc_result_t result = ISC_R_SUCCESS; + INSIST(SSL_is_init_finished(sock->tlsstream.tls) == 1); + INSIST(sock->statichandle == NULL); + isc__nmsocket_log_tls_session_reuse(sock, sock->tlsstream.tls); + tlshandle = isc__nmhandle_get(sock, &sock->peer, &sock->iface); + + if (isc__nm_closing(sock)) { + result = ISC_R_SHUTTINGDOWN; + } + + if (sock->tlsstream.server) { + if (isc__nmsocket_closing(sock->listener)) { + result = ISC_R_CANCELED; + } else if (result == ISC_R_SUCCESS) { + result = sock->listener->accept_cb( + tlshandle, result, + sock->listener->accept_cbarg); + } + } else { + tls_call_connect_cb(sock, tlshandle, result); + } + isc_nmhandle_detach(&tlshandle); + sock->tlsstream.state = TLS_IO; + + if (presult != NULL) { + *presult = result; + } + } + + return (rv); +} + +static bool +tls_try_to_close_unused_socket(isc_nmsocket_t *sock) { + if (sock->tlsstream.state > TLS_HANDSHAKE && + sock->statichandle == NULL && sock->tlsstream.nsending == 0) + { + /* + * It seems that no action on the socket has been + * scheduled on some point after the handshake, let's + * close the connection. + */ + isc__nmsocket_prep_destroy(sock); + return (true); + } + + return (false); +} + +static void +tls_do_bio(isc_nmsocket_t *sock, isc_region_t *received_data, + isc__nm_uvreq_t *send_data, bool finish) { + isc_result_t result = ISC_R_SUCCESS; + int pending, tls_status = SSL_ERROR_NONE; + int rv = 0; + size_t len = 0; + int saved_errno = 0; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + /* We will resume read if TLS layer wants us to */ + if (sock->tlsstream.reading && sock->outerhandle) { + REQUIRE(VALID_NMHANDLE(sock->outerhandle)); + isc_nm_pauseread(sock->outerhandle); + } + + /* + * Clear the TLS error queue so that SSL_get_error() and SSL I/O + * routine calls will not get affected by prior error statuses. + * + * See here: + * https://www.openssl.org/docs/man3.0/man3/SSL_get_error.html + * + * In particular, it mentions the following: + * + * The current thread's error queue must be empty before the + * TLS/SSL I/O operation is attempted, or SSL_get_error() will not + * work reliably. + * + * As we use the result of SSL_get_error() to decide on I/O + * operations, we need to ensure that it works reliably by + * cleaning the error queue. + * + * The sum of details: https://stackoverflow.com/a/37980911 + */ + ERR_clear_error(); + + if (sock->tlsstream.state == TLS_INIT) { + INSIST(received_data == NULL && send_data == NULL); + if (sock->tlsstream.server) { + SSL_set_accept_state(sock->tlsstream.tls); + } else { + SSL_set_connect_state(sock->tlsstream.tls); + } + sock->tlsstream.state = TLS_HANDSHAKE; + rv = tls_try_handshake(sock, NULL); + INSIST(SSL_is_init_finished(sock->tlsstream.tls) == 0); + } else if (sock->tlsstream.state == TLS_CLOSED) { + return; + } else { /* initialised and doing I/O */ + if (received_data != NULL) { + INSIST(send_data == NULL); + rv = BIO_write_ex(sock->tlsstream.bio_in, + received_data->base, + received_data->length, &len); + if (rv <= 0 || len != received_data->length) { + result = ISC_R_TLSERROR; +#if defined(NETMGR_TRACE) && defined(NETMGR_TRACE_VERBOSE) + saved_errno = errno; +#endif + goto error; + } + + /* + * Only after doing the IO we can check whether SSL + * handshake is done. + */ + if (sock->tlsstream.state == TLS_HANDSHAKE) { + isc_result_t hs_result = ISC_R_UNSET; + rv = tls_try_handshake(sock, &hs_result); + if (sock->tlsstream.state == TLS_IO && + hs_result != ISC_R_SUCCESS) + { + /* + * The accept callback has been called + * unsuccessfully. Let's try to shut + * down the TLS connection gracefully. + */ + INSIST(SSL_is_init_finished( + sock->tlsstream.tls) == + 1); + finish = true; + } + } + } else if (send_data != NULL) { + INSIST(received_data == NULL); + INSIST(sock->tlsstream.state > TLS_HANDSHAKE); + bool received_shutdown = + ((SSL_get_shutdown(sock->tlsstream.tls) & + SSL_RECEIVED_SHUTDOWN) != 0); + bool sent_shutdown = + ((SSL_get_shutdown(sock->tlsstream.tls) & + SSL_SENT_SHUTDOWN) != 0); + rv = SSL_write_ex(sock->tlsstream.tls, + send_data->uvbuf.base, + send_data->uvbuf.len, &len); + if (rv != 1 || len != send_data->uvbuf.len) { + result = received_shutdown || sent_shutdown + ? ISC_R_CANCELED + : ISC_R_TLSERROR; + send_data->cb.send(send_data->handle, result, + send_data->cbarg); + send_data = NULL; + return; + } + } + + /* Decrypt and pass data from network to client */ + if (sock->tlsstream.state >= TLS_IO && sock->recv_cb != NULL && + !atomic_load(&sock->readpaused) && + sock->statichandle != NULL && !finish) + { + uint8_t recv_buf[TLS_BUF_SIZE]; + INSIST(sock->tlsstream.state > TLS_HANDSHAKE); + while ((rv = SSL_read_ex(sock->tlsstream.tls, recv_buf, + TLS_BUF_SIZE, &len)) == 1) + { + isc_region_t region; + region = (isc_region_t){ .base = &recv_buf[0], + .length = len }; + + INSIST(VALID_NMHANDLE(sock->statichandle)); + sock->recv_cb(sock->statichandle, ISC_R_SUCCESS, + ®ion, sock->recv_cbarg); + /* The handle could have been detached in + * sock->recv_cb, making the sock->statichandle + * nullified (it happens in netmgr.c). If it is + * the case, then it means that we are not + * interested in keeping the connection alive + * anymore. Let's shut down the SSL session, + * send what we have in the SSL buffers, + * and close the connection. + */ + if (sock->statichandle == NULL) { + finish = true; + break; + } else if (sock->recv_cb == NULL) { + /* + * The 'sock->recv_cb' might have been + * nullified during the call to + * 'sock->recv_cb'. That could happen, + * indirectly when wrapping up. + * + * In this case, let's close the TLS + * connection. + */ + finish = true; + break; + } else if (atomic_load(&sock->readpaused)) { + /* + * Reading has been paused from withing + * the context of read callback - stop + * processing incoming data. + */ + break; + } + } + } + } + errno = 0; + tls_status = SSL_get_error(sock->tlsstream.tls, rv); + saved_errno = errno; + + /* See "BUGS" section at: + * https://www.openssl.org/docs/man1.1.1/man3/SSL_get_error.html + * + * It is mentioned there that when TLS status equals + * SSL_ERROR_SYSCALL AND errno == 0 it means that underlying + * transport layer returned EOF prematurely. However, we are + * managing the transport ourselves, so we should just resume + * reading from the TCP socket. + * + * It seems that this case has been handled properly on modern + * versions of OpenSSL. That being said, the situation goes in + * line with the manual: it is briefly mentioned there that + * SSL_ERROR_SYSCALL might be returned not only in a case of + * low-level errors (like system call failures). + */ + if (tls_status == SSL_ERROR_SYSCALL && saved_errno == 0 && + received_data == NULL && send_data == NULL && finish == false) + { + tls_status = SSL_ERROR_WANT_READ; + } + + pending = tls_process_outgoing(sock, finish, send_data); + if (pending > 0 && tls_status != SSL_ERROR_SSL) { + /* We'll continue in tls_senddone */ + return; + } + + switch (tls_status) { + case SSL_ERROR_NONE: + case SSL_ERROR_ZERO_RETURN: + (void)tls_try_to_close_unused_socket(sock); + return; + case SSL_ERROR_WANT_WRITE: + if (sock->tlsstream.nsending == 0) { + /* + * Launch tls_do_bio asynchronously. If we're sending + * already the send callback will call it. + */ + async_tls_do_bio(sock); + } + return; + case SSL_ERROR_WANT_READ: + if (tls_try_to_close_unused_socket(sock) || + sock->outerhandle == NULL || atomic_load(&sock->readpaused)) + { + return; + } + + INSIST(VALID_NMHANDLE(sock->outerhandle)); + + if (sock->tlsstream.reading) { + isc_nm_resumeread(sock->outerhandle); + } else if (sock->tlsstream.state == TLS_HANDSHAKE) { + sock->tlsstream.reading = true; + isc_nm_read(sock->outerhandle, tls_readcb, sock); + } + return; + default: + result = tls_error_to_result(tls_status, sock->tlsstream.state, + sock->tlsstream.tls); + break; + } + +error: +#if defined(NETMGR_TRACE) && defined(NETMGR_TRACE_VERBOSE) + isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_NETMGR, + ISC_LOG_NOTICE, + "SSL error in BIO: %d %s (errno: %d). Arguments: " + "received_data: %p, " + "send_data: %p, finish: %s", + tls_status, isc_result_totext(result), saved_errno, + received_data, send_data, finish ? "true" : "false"); +#endif + tls_failed_read_cb(sock, result); +} + +static void +tls_readcb(isc_nmhandle_t *handle, isc_result_t result, isc_region_t *region, + void *cbarg) { + isc_nmsocket_t *tlssock = (isc_nmsocket_t *)cbarg; + + REQUIRE(VALID_NMSOCK(tlssock)); + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(tlssock->tid == isc_nm_tid()); + + if (result != ISC_R_SUCCESS) { + tls_failed_read_cb(tlssock, result); + return; + } + + tls_do_bio(tlssock, region, NULL, false); +} + +static isc_result_t +initialize_tls(isc_nmsocket_t *sock, bool server) { + REQUIRE(sock->tid == isc_nm_tid()); + + sock->tlsstream.bio_in = BIO_new(BIO_s_mem()); + if (sock->tlsstream.bio_in == NULL) { + isc_tls_free(&sock->tlsstream.tls); + return (ISC_R_TLSERROR); + } + sock->tlsstream.bio_out = BIO_new(BIO_s_mem()); + if (sock->tlsstream.bio_out == NULL) { + BIO_free_all(sock->tlsstream.bio_in); + sock->tlsstream.bio_in = NULL; + isc_tls_free(&sock->tlsstream.tls); + return (ISC_R_TLSERROR); + } + + if (BIO_set_mem_eof_return(sock->tlsstream.bio_in, EOF) != 1 || + BIO_set_mem_eof_return(sock->tlsstream.bio_out, EOF) != 1) + { + goto error; + } + + SSL_set_bio(sock->tlsstream.tls, sock->tlsstream.bio_in, + sock->tlsstream.bio_out); + sock->tlsstream.server = server; + sock->tlsstream.nsending = 0; + sock->tlsstream.state = TLS_INIT; + return (ISC_R_SUCCESS); +error: + isc_tls_free(&sock->tlsstream.tls); + sock->tlsstream.bio_out = sock->tlsstream.bio_in = NULL; + return (ISC_R_TLSERROR); +} + +static isc_result_t +tlslisten_acceptcb(isc_nmhandle_t *handle, isc_result_t result, void *cbarg) { + isc_nmsocket_t *tlslistensock = (isc_nmsocket_t *)cbarg; + isc_nmsocket_t *tlssock = NULL; + isc_tlsctx_t *tlsctx = NULL; + int tid; + + /* If accept() was unsuccessful we can't do anything */ + if (result != ISC_R_SUCCESS) { + return (result); + } + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(VALID_NMSOCK(tlslistensock)); + REQUIRE(tlslistensock->type == isc_nm_tlslistener); + + if (isc__nmsocket_closing(handle->sock) || + isc__nmsocket_closing(tlslistensock) || + !atomic_load(&tlslistensock->listening)) + { + return (ISC_R_CANCELED); + } + + /* + * We need to create a 'wrapper' tlssocket for this connection. + */ + tlssock = isc_mem_get(handle->sock->mgr->mctx, sizeof(*tlssock)); + isc__nmsocket_init(tlssock, handle->sock->mgr, isc_nm_tlssocket, + &handle->sock->iface); + + tid = isc_nm_tid(); + /* We need to initialize SSL now to reference SSL_CTX properly */ + tlsctx = tls_get_listener_tlsctx(tlslistensock, tid); + RUNTIME_CHECK(tlsctx != NULL); + isc_tlsctx_attach(tlsctx, &tlssock->tlsstream.ctx); + tlssock->tlsstream.tls = isc_tls_create(tlssock->tlsstream.ctx); + if (tlssock->tlsstream.tls == NULL) { + atomic_store(&tlssock->closed, true); + isc_tlsctx_free(&tlssock->tlsstream.ctx); + isc__nmsocket_detach(&tlssock); + return (ISC_R_TLSERROR); + } + + tlssock->extrahandlesize = tlslistensock->extrahandlesize; + isc__nmsocket_attach(tlslistensock, &tlssock->listener); + isc_nmhandle_attach(handle, &tlssock->outerhandle); + tlssock->peer = handle->sock->peer; + tlssock->read_timeout = atomic_load(&handle->sock->mgr->init); + tlssock->tid = tid; + + /* + * Hold a reference to tlssock in the TCP socket: it will + * detached in isc__nm_tls_cleanup_data(). + */ + handle->sock->tlsstream.tlssocket = tlssock; + + result = initialize_tls(tlssock, true); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + /* TODO: catch failure code, detach tlssock, and log the error */ + + tls_do_bio(tlssock, NULL, NULL, false); + return (result); +} + +isc_result_t +isc_nm_listentls(isc_nm_t *mgr, isc_sockaddr_t *iface, + isc_nm_accept_cb_t accept_cb, void *accept_cbarg, + size_t extrahandlesize, int backlog, isc_quota_t *quota, + SSL_CTX *sslctx, isc_nmsocket_t **sockp) { + isc_result_t result; + isc_nmsocket_t *tlssock = NULL; + isc_nmsocket_t *tsock = NULL; + + REQUIRE(VALID_NM(mgr)); + if (atomic_load(&mgr->closing)) { + return (ISC_R_SHUTTINGDOWN); + } + + tlssock = isc_mem_get(mgr->mctx, sizeof(*tlssock)); + + isc__nmsocket_init(tlssock, mgr, isc_nm_tlslistener, iface); + tlssock->result = ISC_R_UNSET; + tlssock->accept_cb = accept_cb; + tlssock->accept_cbarg = accept_cbarg; + tlssock->extrahandlesize = extrahandlesize; + tls_init_listener_tlsctx(tlssock, sslctx); + tlssock->tlsstream.tls = NULL; + + /* + * tlssock will be a TLS 'wrapper' around an unencrypted stream. + * We set tlssock->outer to a socket listening for a TCP connection. + */ + result = isc_nm_listentcp(mgr, iface, tlslisten_acceptcb, tlssock, + extrahandlesize, backlog, quota, + &tlssock->outer); + if (result != ISC_R_SUCCESS) { + atomic_store(&tlssock->closed, true); + isc__nmsocket_detach(&tlssock); + return (result); + } + + /* wait for listen result */ + isc__nmsocket_attach(tlssock->outer, &tsock); + tlssock->result = result; + atomic_store(&tlssock->active, true); + INSIST(tlssock->outer->tlsstream.tlslistener == NULL); + isc__nmsocket_attach(tlssock, &tlssock->outer->tlsstream.tlslistener); + isc__nmsocket_detach(&tsock); + INSIST(result != ISC_R_UNSET); + tlssock->nchildren = tlssock->outer->nchildren; + + isc__nmsocket_barrier_init(tlssock); + atomic_init(&tlssock->rchildren, tlssock->nchildren); + + if (result == ISC_R_SUCCESS) { + atomic_store(&tlssock->listening, true); + *sockp = tlssock; + } + + return (result); +} + +void +isc__nm_async_tlssend(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlssend_t *ievent = (isc__netievent_tlssend_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *req = ievent->req; + + REQUIRE(VALID_UVREQ(req)); + REQUIRE(sock->tid == isc_nm_tid()); + + UNUSED(worker); + + ievent->req = NULL; + + if (inactive(sock)) { + req->cb.send(req->handle, ISC_R_CANCELED, req->cbarg); + goto done; + } + + tls_do_bio(sock, NULL, req, false); +done: + isc__nm_uvreq_put(&req, sock); + return; +} + +void +isc__nm_tls_send(isc_nmhandle_t *handle, const isc_region_t *region, + isc_nm_cb_t cb, void *cbarg) { + isc__netievent_tlssend_t *ievent = NULL; + isc__nm_uvreq_t *uvreq = NULL; + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + sock = handle->sock; + + REQUIRE(sock->type == isc_nm_tlssocket); + + uvreq = isc__nm_uvreq_get(sock->mgr, sock); + isc_nmhandle_attach(handle, &uvreq->handle); + uvreq->cb.send = cb; + uvreq->cbarg = cbarg; + uvreq->uvbuf.base = (char *)region->base; + uvreq->uvbuf.len = region->length; + + /* + * We need to create an event and pass it using async channel + */ + ievent = isc__nm_get_netievent_tlssend(sock->mgr, sock, uvreq); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +void +isc__nm_async_tlsstartread(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlsstartread_t *ievent = + (isc__netievent_tlsstartread_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + REQUIRE(sock->tid == isc_nm_tid()); + + UNUSED(worker); + + tls_do_bio(sock, NULL, NULL, false); +} + +void +isc__nm_tls_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg) { + isc__netievent_tlsstartread_t *ievent = NULL; + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + + sock = handle->sock; + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->statichandle == handle); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->recv_cb == NULL); + + if (inactive(sock)) { + cb(handle, ISC_R_CANCELED, NULL, cbarg); + return; + } + + sock->recv_cb = cb; + sock->recv_cbarg = cbarg; + sock->recv_read = true; + + ievent = isc__nm_get_netievent_tlsstartread(sock->mgr, sock); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +void +isc__nm_tls_pauseread(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + if (atomic_compare_exchange_strong(&handle->sock->readpaused, + &(bool){ false }, true)) + { + if (handle->sock->outerhandle != NULL) { + isc_nm_pauseread(handle->sock->outerhandle); + } + } +} + +void +isc__nm_tls_resumeread(isc_nmhandle_t *handle) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + if (!atomic_compare_exchange_strong(&handle->sock->readpaused, + &(bool){ true }, false)) + { + if (inactive(handle->sock)) { + return; + } + + async_tls_do_bio(handle->sock); + } +} + +static void +tls_close_direct(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + /* + * At this point we're certain that there are no + * external references, we can close everything. + */ + if (sock->outerhandle != NULL) { + isc_nm_pauseread(sock->outerhandle); + isc__nmsocket_clearcb(sock->outerhandle->sock); + isc_nmhandle_detach(&sock->outerhandle); + } + + if (sock->listener != NULL) { + isc__nmsocket_detach(&sock->listener); + } + + /* Further cleanup performed in isc__nm_tls_cleanup_data() */ + atomic_store(&sock->closed, true); + atomic_store(&sock->active, false); + sock->tlsstream.state = TLS_CLOSED; +} + +void +isc__nm_tls_close(isc_nmsocket_t *sock) { + isc__netievent_tlsclose_t *ievent = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tlssocket); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + return; + } + + ievent = isc__nm_get_netievent_tlsclose(sock->mgr, sock); + isc__nm_maybe_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +void +isc__nm_async_tlsclose(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlsclose_t *ievent = (isc__netievent_tlsclose_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + REQUIRE(ievent->sock->tid == isc_nm_tid()); + + UNUSED(worker); + + tls_close_direct(sock); +} + +void +isc__nm_tls_stoplistening(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_tlslistener); + + isc__nmsocket_stop(sock); +} + +static void +tcp_connected(isc_nmhandle_t *handle, isc_result_t result, void *cbarg); + +void +isc_nm_tlsconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, + isc_nm_cb_t cb, void *cbarg, isc_tlsctx_t *ctx, + isc_tlsctx_client_session_cache_t *client_sess_cache, + unsigned int timeout, size_t extrahandlesize) { + isc_nmsocket_t *nsock = NULL; +#if defined(NETMGR_TRACE) && defined(NETMGR_TRACE_VERBOSE) + fprintf(stderr, "TLS: isc_nm_tlsconnect(): in net thread: %s\n", + isc__nm_in_netthread() ? "yes" : "no"); +#endif /* NETMGR_TRACE */ + + REQUIRE(VALID_NM(mgr)); + + if (atomic_load(&mgr->closing)) { + cb(NULL, ISC_R_SHUTTINGDOWN, cbarg); + return; + } + + nsock = isc_mem_get(mgr->mctx, sizeof(*nsock)); + isc__nmsocket_init(nsock, mgr, isc_nm_tlssocket, local); + nsock->extrahandlesize = extrahandlesize; + nsock->result = ISC_R_UNSET; + nsock->connect_cb = cb; + nsock->connect_cbarg = cbarg; + nsock->connect_timeout = timeout; + isc_tlsctx_attach(ctx, &nsock->tlsstream.ctx); + atomic_init(&nsock->client, true); + if (client_sess_cache != NULL) { + INSIST(isc_tlsctx_client_session_cache_getctx( + client_sess_cache) == ctx); + isc_tlsctx_client_session_cache_attach( + client_sess_cache, &nsock->tlsstream.client_sess_cache); + } + + isc_nm_tcpconnect(mgr, local, peer, tcp_connected, nsock, + nsock->connect_timeout, 0); +} + +static void +tcp_connected(isc_nmhandle_t *handle, isc_result_t result, void *cbarg) { + isc_nmsocket_t *tlssock = (isc_nmsocket_t *)cbarg; + isc_nmhandle_t *tlshandle = NULL; + + REQUIRE(VALID_NMSOCK(tlssock)); + + tlssock->tid = isc_nm_tid(); + if (result != ISC_R_SUCCESS) { + goto error; + } + + INSIST(VALID_NMHANDLE(handle)); + + tlssock->iface = handle->sock->iface; + tlssock->peer = handle->sock->peer; + if (isc__nm_closing(tlssock)) { + result = ISC_R_SHUTTINGDOWN; + goto error; + } + + /* + * We need to initialize SSL now to reference SSL_CTX properly. + */ + tlssock->tlsstream.tls = isc_tls_create(tlssock->tlsstream.ctx); + if (tlssock->tlsstream.tls == NULL) { + result = ISC_R_TLSERROR; + goto error; + } + + result = initialize_tls(tlssock, false); + if (result != ISC_R_SUCCESS) { + goto error; + } + tlssock->peer = isc_nmhandle_peeraddr(handle); + isc_nmhandle_attach(handle, &tlssock->outerhandle); + atomic_store(&tlssock->active, true); + + if (tlssock->tlsstream.client_sess_cache != NULL) { + isc_tlsctx_client_session_cache_reuse_sockaddr( + tlssock->tlsstream.client_sess_cache, &tlssock->peer, + tlssock->tlsstream.tls); + } + + /* + * Hold a reference to tlssock in the TCP socket: it will + * detached in isc__nm_tls_cleanup_data(). + */ + handle->sock->tlsstream.tlssocket = tlssock; + + tls_do_bio(tlssock, NULL, NULL, false); + return; +error: + tlshandle = isc__nmhandle_get(tlssock, NULL, NULL); + atomic_store(&tlssock->closed, true); + tls_call_connect_cb(tlssock, tlshandle, result); + isc_nmhandle_detach(&tlshandle); + isc__nmsocket_detach(&tlssock); +} + +static void +tls_cancelread(isc_nmsocket_t *sock) { + if (!inactive(sock) && sock->tlsstream.state == TLS_IO) { + tls_do_bio(sock, NULL, NULL, true); + } else if (sock->outerhandle != NULL) { + sock->tlsstream.reading = false; + isc_nm_cancelread(sock->outerhandle); + } +} + +void +isc__nm_tls_cancelread(isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + isc__netievent_tlscancel_t *ievent = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + + sock = handle->sock; + + REQUIRE(sock->type == isc_nm_tlssocket); + + if (sock->tid == isc_nm_tid()) { + tls_cancelread(sock); + } else { + ievent = isc__nm_get_netievent_tlscancel(sock->mgr, sock, + handle); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + } +} + +void +isc__nm_async_tlscancel(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlscancel_t *ievent = (isc__netievent_tlscancel_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(worker->id == sock->tid); + REQUIRE(sock->tid == isc_nm_tid()); + + UNUSED(worker); + tls_cancelread(sock); +} + +void +isc__nm_async_tlsdobio(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_tlsdobio_t *ievent = (isc__netievent_tlsdobio_t *)ev0; + + UNUSED(worker); + + tls_do_bio(ievent->sock, NULL, NULL, false); +} + +void +isc__nm_tls_cleanup_data(isc_nmsocket_t *sock) { + if (sock->type == isc_nm_tcplistener && + sock->tlsstream.tlslistener != NULL) + { + isc__nmsocket_detach(&sock->tlsstream.tlslistener); + } else if (sock->type == isc_nm_tlslistener) { + tls_cleanup_listener_tlsctx(sock); + } else if (sock->type == isc_nm_tlssocket) { + if (sock->tlsstream.tls != NULL) { + /* + * Let's shut down the TLS session properly so that + * the session will remain resumable, if required. + */ + tls_try_shutdown(sock->tlsstream.tls, true); + tls_keep_client_tls_session(sock); + isc_tls_free(&sock->tlsstream.tls); + /* These are destroyed when we free SSL */ + sock->tlsstream.bio_out = NULL; + sock->tlsstream.bio_in = NULL; + } + if (sock->tlsstream.ctx != NULL) { + isc_tlsctx_free(&sock->tlsstream.ctx); + } + if (sock->tlsstream.client_sess_cache != NULL) { + INSIST(atomic_load(&sock->client)); + isc_tlsctx_client_session_cache_detach( + &sock->tlsstream.client_sess_cache); + } + } else if (sock->type == isc_nm_tcpsocket && + sock->tlsstream.tlssocket != NULL) + { + /* + * The TLS socket can't be destroyed until its underlying TCP + * socket is, to avoid possible use-after-free errors. + */ + isc__nmsocket_detach(&sock->tlsstream.tlssocket); + } +} + +void +isc__nm_tls_cleartimeout(isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(handle->sock->type == isc_nm_tlssocket); + + sock = handle->sock; + if (sock->outerhandle != NULL) { + INSIST(VALID_NMHANDLE(sock->outerhandle)); + isc_nmhandle_cleartimeout(sock->outerhandle); + } +} + +void +isc__nm_tls_settimeout(isc_nmhandle_t *handle, uint32_t timeout) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(handle->sock->type == isc_nm_tlssocket); + + sock = handle->sock; + if (sock->outerhandle != NULL) { + INSIST(VALID_NMHANDLE(sock->outerhandle)); + isc_nmhandle_settimeout(sock->outerhandle, timeout); + } +} + +void +isc__nmhandle_tls_keepalive(isc_nmhandle_t *handle, bool value) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(handle->sock->type == isc_nm_tlssocket); + + sock = handle->sock; + if (sock->outerhandle != NULL) { + INSIST(VALID_NMHANDLE(sock->outerhandle)); + + isc_nmhandle_keepalive(sock->outerhandle, value); + } +} + +void +isc__nmhandle_tls_setwritetimeout(isc_nmhandle_t *handle, + uint64_t write_timeout) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(handle->sock->type == isc_nm_tlssocket); + + sock = handle->sock; + if (sock->outerhandle != NULL) { + INSIST(VALID_NMHANDLE(sock->outerhandle)); + + isc_nmhandle_setwritetimeout(sock->outerhandle, write_timeout); + } +} + +const char * +isc__nm_tls_verify_tls_peer_result_string(const isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + REQUIRE(handle->sock->type == isc_nm_tlssocket); + + sock = handle->sock; + if (sock->tlsstream.tls == NULL) { + return (NULL); + } + + return (isc_tls_verify_peer_result_string(sock->tlsstream.tls)); +} + +static void +tls_init_listener_tlsctx(isc_nmsocket_t *listener, isc_tlsctx_t *ctx) { + size_t nworkers; + + REQUIRE(VALID_NM(listener->mgr)); + REQUIRE(ctx != NULL); + + nworkers = (size_t)listener->mgr->nworkers; + INSIST(nworkers > 0); + + listener->tlsstream.listener_tls_ctx = isc_mem_get( + listener->mgr->mctx, sizeof(isc_tlsctx_t *) * nworkers); + listener->tlsstream.n_listener_tls_ctx = nworkers; + for (size_t i = 0; i < nworkers; i++) { + listener->tlsstream.listener_tls_ctx[i] = NULL; + isc_tlsctx_attach(ctx, + &listener->tlsstream.listener_tls_ctx[i]); + } +} + +static void +tls_cleanup_listener_tlsctx(isc_nmsocket_t *listener) { + REQUIRE(VALID_NM(listener->mgr)); + + if (listener->tlsstream.listener_tls_ctx == NULL) { + return; + } + + for (size_t i = 0; i < listener->tlsstream.n_listener_tls_ctx; i++) { + isc_tlsctx_free(&listener->tlsstream.listener_tls_ctx[i]); + } + isc_mem_put(listener->mgr->mctx, listener->tlsstream.listener_tls_ctx, + sizeof(isc_tlsctx_t *) * + listener->tlsstream.n_listener_tls_ctx); + listener->tlsstream.n_listener_tls_ctx = 0; +} + +static isc_tlsctx_t * +tls_get_listener_tlsctx(isc_nmsocket_t *listener, const int tid) { + REQUIRE(VALID_NM(listener->mgr)); + REQUIRE(tid >= 0); + + if (listener->tlsstream.listener_tls_ctx == NULL) { + return (NULL); + } + + return (listener->tlsstream.listener_tls_ctx[tid]); +} + +void +isc__nm_async_tls_set_tlsctx(isc_nmsocket_t *listener, isc_tlsctx_t *tlsctx, + const int tid) { + REQUIRE(tid >= 0); + + isc_tlsctx_free(&listener->tlsstream.listener_tls_ctx[tid]); + isc_tlsctx_attach(tlsctx, &listener->tlsstream.listener_tls_ctx[tid]); +} + +static void +tls_keep_client_tls_session(isc_nmsocket_t *sock) { + /* + * Ensure that the isc_tls_t is being accessed from + * within the worker thread the socket is bound to. + */ + REQUIRE(sock->tid == isc_nm_tid()); + if (sock->tlsstream.client_sess_cache != NULL && + sock->tlsstream.client_session_saved == false) + { + INSIST(atomic_load(&sock->client)); + isc_tlsctx_client_session_cache_keep_sockaddr( + sock->tlsstream.client_sess_cache, &sock->peer, + sock->tlsstream.tls); + sock->tlsstream.client_session_saved = true; + } +} + +static void +tls_try_shutdown(isc_tls_t *tls, const bool force) { + if (force) { + (void)SSL_set_shutdown(tls, SSL_SENT_SHUTDOWN); + } else if ((SSL_get_shutdown(tls) & SSL_SENT_SHUTDOWN) == 0) { + (void)SSL_shutdown(tls); + } +} diff --git a/lib/isc/netmgr/udp.c b/lib/isc/netmgr/udp.c new file mode 100644 index 0000000..1a0ee16 --- /dev/null +++ b/lib/isc/netmgr/udp.c @@ -0,0 +1,1405 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#include <unistd.h> +#include <uv.h> + +#include <isc/atomic.h> +#include <isc/barrier.h> +#include <isc/buffer.h> +#include <isc/condition.h> +#include <isc/errno.h> +#include <isc/magic.h> +#include <isc/mem.h> +#include <isc/netmgr.h> +#include <isc/random.h> +#include <isc/refcount.h> +#include <isc/region.h> +#include <isc/result.h> +#include <isc/sockaddr.h> +#include <isc/thread.h> +#include <isc/util.h> + +#include "netmgr-int.h" +#include "uv-compat.h" + +#ifdef HAVE_NET_ROUTE_H +#include <net/route.h> +#if defined(RTM_VERSION) && defined(RTM_NEWADDR) && defined(RTM_DELADDR) +#define USE_ROUTE_SOCKET 1 +#define ROUTE_SOCKET_PF PF_ROUTE +#define ROUTE_SOCKET_PROTOCOL 0 +#define MSGHDR rt_msghdr +#define MSGTYPE rtm_type +#endif /* if defined(RTM_VERSION) && defined(RTM_NEWADDR) && \ + * defined(RTM_DELADDR) */ +#endif /* ifdef HAVE_NET_ROUTE_H */ + +#if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#if defined(RTM_NEWADDR) && defined(RTM_DELADDR) +#define USE_ROUTE_SOCKET 1 +#define USE_NETLINK 1 +#define ROUTE_SOCKET_PF PF_NETLINK +#define ROUTE_SOCKET_PROTOCOL NETLINK_ROUTE +#define MSGHDR nlmsghdr +#define MSGTYPE nlmsg_type +#endif /* if defined(RTM_NEWADDR) && defined(RTM_DELADDR) */ +#endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \ + */ + +static isc_result_t +udp_send_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req, + isc_sockaddr_t *peer); + +static void +udp_recv_cb(uv_udp_t *handle, ssize_t nrecv, const uv_buf_t *buf, + const struct sockaddr *addr, unsigned flags); + +static void +udp_send_cb(uv_udp_send_t *req, int status); + +static void +udp_close_cb(uv_handle_t *handle); + +static void +read_timer_close_cb(uv_handle_t *handle); + +static void +udp_close_direct(isc_nmsocket_t *sock); + +static void +stop_udp_parent(isc_nmsocket_t *sock); +static void +stop_udp_child(isc_nmsocket_t *sock); + +static uv_os_sock_t +isc__nm_udp_lb_socket(isc_nm_t *mgr, sa_family_t sa_family) { + isc_result_t result; + uv_os_sock_t sock; + + result = isc__nm_socket(sa_family, SOCK_DGRAM, 0, &sock); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + (void)isc__nm_socket_incoming_cpu(sock); + (void)isc__nm_socket_disable_pmtud(sock, sa_family); + (void)isc__nm_socket_v6only(sock, sa_family); + + result = isc__nm_socket_reuse(sock); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + + if (mgr->load_balance_sockets) { + result = isc__nm_socket_reuse_lb(sock); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + } + + return (sock); +} + +static void +start_udp_child(isc_nm_t *mgr, isc_sockaddr_t *iface, isc_nmsocket_t *sock, + uv_os_sock_t fd, int tid) { + isc_nmsocket_t *csock; + isc__netievent_udplisten_t *ievent = NULL; + + csock = &sock->children[tid]; + + isc__nmsocket_init(csock, mgr, isc_nm_udpsocket, iface); + csock->parent = sock; + csock->iface = sock->iface; + atomic_init(&csock->reading, true); + csock->recv_cb = sock->recv_cb; + csock->recv_cbarg = sock->recv_cbarg; + csock->extrahandlesize = sock->extrahandlesize; + csock->tid = tid; + + if (mgr->load_balance_sockets) { + UNUSED(fd); + csock->fd = isc__nm_udp_lb_socket(mgr, + iface->type.sa.sa_family); + } else { + csock->fd = dup(fd); + } + REQUIRE(csock->fd >= 0); + + ievent = isc__nm_get_netievent_udplisten(mgr, csock); + isc__nm_maybe_enqueue_ievent(&mgr->workers[tid], + (isc__netievent_t *)ievent); +} + +static void +enqueue_stoplistening(isc_nmsocket_t *sock) { + isc__netievent_udpstop_t *ievent = + isc__nm_get_netievent_udpstop(sock->mgr, sock); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +isc_result_t +isc_nm_listenudp(isc_nm_t *mgr, isc_sockaddr_t *iface, isc_nm_recv_cb_t cb, + void *cbarg, size_t extrahandlesize, isc_nmsocket_t **sockp) { + isc_result_t result = ISC_R_SUCCESS; + isc_nmsocket_t *sock = NULL; + size_t children_size = 0; + REQUIRE(VALID_NM(mgr)); + uv_os_sock_t fd = -1; + + /* + * We are creating mgr->nworkers duplicated sockets, one + * socket for each worker thread. + */ + sock = isc_mem_get(mgr->mctx, sizeof(isc_nmsocket_t)); + isc__nmsocket_init(sock, mgr, isc_nm_udplistener, iface); + + atomic_init(&sock->rchildren, 0); + sock->nchildren = mgr->nworkers; + children_size = sock->nchildren * sizeof(sock->children[0]); + sock->children = isc_mem_get(mgr->mctx, children_size); + memset(sock->children, 0, children_size); + + sock->recv_cb = cb; + sock->recv_cbarg = cbarg; + sock->extrahandlesize = extrahandlesize; + sock->result = ISC_R_UNSET; + + sock->tid = 0; + sock->fd = -1; + + if (!mgr->load_balance_sockets) { + fd = isc__nm_udp_lb_socket(mgr, iface->type.sa.sa_family); + } + + isc_barrier_init(&sock->startlistening, sock->nchildren); + + for (size_t i = 0; i < sock->nchildren; i++) { + if ((int)i == isc_nm_tid()) { + continue; + } + start_udp_child(mgr, iface, sock, fd, i); + } + + if (isc__nm_in_netthread()) { + start_udp_child(mgr, iface, sock, fd, isc_nm_tid()); + } + + if (!mgr->load_balance_sockets) { + isc__nm_closesocket(fd); + } + + LOCK(&sock->lock); + while (atomic_load(&sock->rchildren) != sock->nchildren) { + WAIT(&sock->cond, &sock->lock); + } + result = sock->result; + atomic_store(&sock->active, true); + UNLOCK(&sock->lock); + + INSIST(result != ISC_R_UNSET); + + if (result == ISC_R_SUCCESS) { + REQUIRE(atomic_load(&sock->rchildren) == sock->nchildren); + *sockp = sock; + } else { + atomic_store(&sock->active, false); + enqueue_stoplistening(sock); + isc_nmsocket_close(&sock); + } + + return (result); +} + +#ifdef USE_ROUTE_SOCKET +static isc_result_t +route_socket(uv_os_sock_t *fdp) { + isc_result_t result; + uv_os_sock_t fd; +#ifdef USE_NETLINK + struct sockaddr_nl sa; + int r; +#endif + + result = isc__nm_socket(ROUTE_SOCKET_PF, SOCK_RAW, + ROUTE_SOCKET_PROTOCOL, &fd); + if (result != ISC_R_SUCCESS) { + return (result); + } + +#ifdef USE_NETLINK + sa.nl_family = PF_NETLINK; + sa.nl_groups = RTMGRP_LINK | RTMGRP_IPV4_IFADDR | RTMGRP_IPV6_IFADDR; + r = bind(fd, (struct sockaddr *)&sa, sizeof(sa)); + if (r < 0) { + isc__nm_closesocket(fd); + return (isc_errno_toresult(r)); + } +#endif + + *fdp = fd; + return (ISC_R_SUCCESS); +} + +static isc_result_t +route_connect_direct(isc_nmsocket_t *sock) { + isc__networker_t *worker = NULL; + isc_result_t result = ISC_R_UNSET; + int r; + + REQUIRE(isc__nm_in_netthread()); + REQUIRE(sock->tid == isc_nm_tid()); + + worker = &sock->mgr->workers[isc_nm_tid()]; + + atomic_store(&sock->connecting, true); + + r = uv_udp_init(&worker->loop, &sock->uv_handle.udp); + UV_RUNTIME_CHECK(uv_udp_init, r); + uv_handle_set_data(&sock->uv_handle.handle, sock); + + r = uv_timer_init(&worker->loop, &sock->read_timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + + if (isc__nm_closing(sock)) { + result = ISC_R_SHUTTINGDOWN; + goto error; + } + + r = uv_udp_open(&sock->uv_handle.udp, sock->fd); + if (r != 0) { + goto done; + } + + isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle); + + atomic_store(&sock->connecting, false); + atomic_store(&sock->connected, true); + +done: + result = isc__nm_uverr2result(r); +error: + + LOCK(&sock->lock); + sock->result = result; + SIGNAL(&sock->cond); + if (!atomic_load(&sock->active)) { + WAIT(&sock->scond, &sock->lock); + } + INSIST(atomic_load(&sock->active)); + UNLOCK(&sock->lock); + + return (result); +} + +/* + * Asynchronous 'udpconnect' call handler: open a new UDP socket and + * call the 'open' callback with a handle. + */ +void +isc__nm_async_routeconnect(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_routeconnect_t *ievent = + (isc__netievent_routeconnect_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *req = ievent->req; + isc_result_t result; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_udpsocket); + REQUIRE(sock->parent == NULL); + REQUIRE(sock->tid == isc_nm_tid()); + + result = route_connect_direct(sock); + if (result != ISC_R_SUCCESS) { + atomic_store(&sock->active, false); + isc__nm_udp_close(sock); + isc__nm_connectcb(sock, req, result, true); + } else { + /* + * The callback has to be called after the socket has been + * initialized + */ + isc__nm_connectcb(sock, req, ISC_R_SUCCESS, true); + } + + /* + * The sock is now attached to the handle. + */ + isc__nmsocket_detach(&sock); +} +#endif /* USE_ROUTE_SOCKET */ + +isc_result_t +isc_nm_routeconnect(isc_nm_t *mgr, isc_nm_cb_t cb, void *cbarg, + size_t extrahandlesize) { +#ifdef USE_ROUTE_SOCKET + isc_result_t result = ISC_R_SUCCESS; + isc_nmsocket_t *sock = NULL; + isc__netievent_udpconnect_t *event = NULL; + isc__nm_uvreq_t *req = NULL; + + REQUIRE(VALID_NM(mgr)); + + sock = isc_mem_get(mgr->mctx, sizeof(*sock)); + isc__nmsocket_init(sock, mgr, isc_nm_udpsocket, NULL); + + sock->connect_cb = cb; + sock->connect_cbarg = cbarg; + sock->extrahandlesize = extrahandlesize; + sock->result = ISC_R_UNSET; + atomic_init(&sock->client, true); + sock->route_sock = true; + + req = isc__nm_uvreq_get(mgr, sock); + req->cb.connect = cb; + req->cbarg = cbarg; + req->handle = isc__nmhandle_get(sock, NULL, NULL); + + result = route_socket(&sock->fd); + if (result != ISC_R_SUCCESS) { + if (isc__nm_in_netthread()) { + sock->tid = isc_nm_tid(); + } + isc__nmsocket_clearcb(sock); + isc__nm_connectcb(sock, req, result, true); + atomic_store(&sock->closed, true); + isc__nmsocket_detach(&sock); + return (result); + } + + event = isc__nm_get_netievent_routeconnect(mgr, sock, req); + + if (isc__nm_in_netthread()) { + atomic_store(&sock->active, true); + sock->tid = isc_nm_tid(); + isc__nm_async_routeconnect(&mgr->workers[sock->tid], + (isc__netievent_t *)event); + isc__nm_put_netievent_routeconnect(mgr, event); + } else { + atomic_init(&sock->active, false); + sock->tid = 0; + isc__nm_enqueue_ievent(&mgr->workers[sock->tid], + (isc__netievent_t *)event); + } + LOCK(&sock->lock); + while (sock->result == ISC_R_UNSET) { + WAIT(&sock->cond, &sock->lock); + } + atomic_store(&sock->active, true); + BROADCAST(&sock->scond); + UNLOCK(&sock->lock); + + return (sock->result); +#else /* USE_ROUTE_SOCKET */ + UNUSED(mgr); + UNUSED(cb); + UNUSED(cbarg); + UNUSED(extrahandlesize); + return (ISC_R_NOTIMPLEMENTED); +#endif /* USE_ROUTE_SOCKET */ +} + +/* + * Asynchronous 'udplisten' call handler: start listening on a UDP socket. + */ +void +isc__nm_async_udplisten(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_udplisten_t *ievent = (isc__netievent_udplisten_t *)ev0; + isc_nmsocket_t *sock = NULL; + int r, uv_bind_flags = 0; + int uv_init_flags = 0; + sa_family_t sa_family; + isc_result_t result = ISC_R_UNSET; + isc_nm_t *mgr = NULL; + + REQUIRE(VALID_NMSOCK(ievent->sock)); + REQUIRE(ievent->sock->tid == isc_nm_tid()); + REQUIRE(VALID_NMSOCK(ievent->sock->parent)); + + sock = ievent->sock; + sa_family = sock->iface.type.sa.sa_family; + mgr = sock->mgr; + + REQUIRE(sock->type == isc_nm_udpsocket); + REQUIRE(sock->parent != NULL); + REQUIRE(sock->tid == isc_nm_tid()); + + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + +#if HAVE_DECL_UV_UDP_RECVMMSG + uv_init_flags |= UV_UDP_RECVMMSG; +#endif + r = uv_udp_init_ex(&worker->loop, &sock->uv_handle.udp, uv_init_flags); + UV_RUNTIME_CHECK(uv_udp_init_ex, r); + uv_handle_set_data(&sock->uv_handle.handle, sock); + /* This keeps the socket alive after everything else is gone */ + isc__nmsocket_attach(sock, &(isc_nmsocket_t *){ NULL }); + + r = uv_timer_init(&worker->loop, &sock->read_timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + + LOCK(&sock->parent->lock); + + r = uv_udp_open(&sock->uv_handle.udp, sock->fd); + if (r < 0) { + isc__nm_closesocket(sock->fd); + isc__nm_incstats(sock, STATID_OPENFAIL); + goto done; + } + isc__nm_incstats(sock, STATID_OPEN); + + if (sa_family == AF_INET6) { + uv_bind_flags |= UV_UDP_IPV6ONLY; + } + + if (mgr->load_balance_sockets) { + r = isc_uv_udp_freebind(&sock->uv_handle.udp, + &sock->parent->iface.type.sa, + uv_bind_flags); + if (r < 0) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + } else { + if (sock->parent->fd == -1) { + /* This thread is first, bind the socket */ + r = isc_uv_udp_freebind(&sock->uv_handle.udp, + &sock->parent->iface.type.sa, + uv_bind_flags); + if (r < 0) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + sock->parent->uv_handle.udp.flags = + sock->uv_handle.udp.flags; + sock->parent->fd = sock->fd; + } else { + /* The socket is already bound, just copy the flags */ + sock->uv_handle.udp.flags = + sock->parent->uv_handle.udp.flags; + } + } + + isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle); + + r = uv_udp_recv_start(&sock->uv_handle.udp, isc__nm_alloc_cb, + udp_recv_cb); + if (r != 0) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + + atomic_store(&sock->listening, true); + +done: + result = isc__nm_uverr2result(r); + atomic_fetch_add(&sock->parent->rchildren, 1); + if (sock->parent->result == ISC_R_UNSET) { + sock->parent->result = result; + } + SIGNAL(&sock->parent->cond); + UNLOCK(&sock->parent->lock); + + isc_barrier_wait(&sock->parent->startlistening); +} + +void +isc__nm_udp_stoplistening(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_udplistener); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + UNREACHABLE(); + } + + if (!isc__nm_in_netthread()) { + enqueue_stoplistening(sock); + } else { + stop_udp_parent(sock); + } +} + +/* + * Asynchronous 'udpstop' call handler: stop listening on a UDP socket. + */ +void +isc__nm_async_udpstop(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_udpstop_t *ievent = (isc__netievent_udpstop_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + if (sock->parent != NULL) { + stop_udp_child(sock); + return; + } + + stop_udp_parent(sock); +} + +/* + * udp_recv_cb handles incoming UDP packet from uv. The buffer here is + * reused for a series of packets, so we need to allocate a new one. + * This new one can be reused to send the response then. + */ +static void +udp_recv_cb(uv_udp_t *handle, ssize_t nrecv, const uv_buf_t *buf, + const struct sockaddr *addr, unsigned flags) { + isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)handle); + isc__nm_uvreq_t *req = NULL; + uint32_t maxudp; + isc_result_t result; + isc_sockaddr_t sockaddr, *sa = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->reading)); + + /* + * When using recvmmsg(2), if no errors occur, there will be a final + * callback with nrecv set to 0, addr set to NULL and the buffer + * pointing at the initially allocated data with the UV_UDP_MMSG_CHUNK + * flag cleared and the UV_UDP_MMSG_FREE flag set. + */ +#if HAVE_DECL_UV_UDP_MMSG_FREE + if ((flags & UV_UDP_MMSG_FREE) == UV_UDP_MMSG_FREE) { + INSIST(nrecv == 0); + INSIST(addr == NULL); + goto free; + } +#else + UNUSED(flags); +#endif + + /* + * - If we're simulating a firewall blocking UDP packets + * bigger than 'maxudp' bytes for testing purposes. + */ + maxudp = atomic_load(&sock->mgr->maxudp); + if ((maxudp != 0 && (uint32_t)nrecv > maxudp)) { + /* + * We need to keep the read_cb intact in case, so the + * readtimeout_cb can trigger and not crash because of + * missing read_req. + */ + goto free; + } + + /* + * - If there was a networking error. + */ + if (nrecv < 0) { + isc__nm_failed_read_cb(sock, isc__nm_uverr2result(nrecv), + false); + goto free; + } + + /* + * - If addr == NULL, in which case it's the end of stream; + * we can free the buffer and bail. + */ + if (addr == NULL) { + isc__nm_failed_read_cb(sock, ISC_R_EOF, false); + goto free; + } + + /* + * - If the socket is no longer active. + */ + if (!isc__nmsocket_active(sock)) { + isc__nm_failed_read_cb(sock, ISC_R_CANCELED, false); + goto free; + } + + if (!sock->route_sock) { + result = isc_sockaddr_fromsockaddr(&sockaddr, addr); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + sa = &sockaddr; + } + + req = isc__nm_get_read_req(sock, sa); + + /* + * The callback will be called synchronously, because result is + * ISC_R_SUCCESS, so we are ok of passing the buf directly. + */ + req->uvbuf.base = buf->base; + req->uvbuf.len = nrecv; + + sock->recv_read = false; + + REQUIRE(!sock->processing); + sock->processing = true; + isc__nm_readcb(sock, req, ISC_R_SUCCESS); + sock->processing = false; + +free: +#if HAVE_DECL_UV_UDP_MMSG_CHUNK + /* + * When using recvmmsg(2), chunks will have the UV_UDP_MMSG_CHUNK flag + * set, those must not be freed. + */ + if ((flags & UV_UDP_MMSG_CHUNK) == UV_UDP_MMSG_CHUNK) { + return; + } +#endif + + /* + * When using recvmmsg(2), if a UDP socket error occurs, nrecv will be < + * 0. In either scenario, the callee can now safely free the provided + * buffer. + */ + if (nrecv < 0) { + /* + * The buffer may be a null buffer on error. + */ + if (buf->base == NULL && buf->len == 0) { + return; + } + } + + isc__nm_free_uvbuf(sock, buf); +} + +/* + * Send the data in 'region' to a peer via a UDP socket. We try to find + * a proper sibling/child socket so that we won't have to jump to + * another thread. + */ +void +isc__nm_udp_send(isc_nmhandle_t *handle, const isc_region_t *region, + isc_nm_cb_t cb, void *cbarg) { + isc_nmsocket_t *sock = handle->sock; + isc_nmsocket_t *rsock = NULL; + isc_sockaddr_t *peer = &handle->peer; + isc__nm_uvreq_t *uvreq = NULL; + uint32_t maxudp = atomic_load(&sock->mgr->maxudp); + int ntid; + + INSIST(sock->type == isc_nm_udpsocket); + + /* + * We're simulating a firewall blocking UDP packets bigger than + * 'maxudp' bytes, for testing purposes. + * + * The client would ordinarily have unreferenced the handle + * in the callback, but that won't happen in this case, so + * we need to do so here. + */ + if (maxudp != 0 && region->length > maxudp) { + isc_nmhandle_detach(&handle); + return; + } + + if (atomic_load(&sock->client)) { + /* + * When we are sending from the client socket, we directly use + * the socket provided. + */ + rsock = sock; + goto send; + } else { + /* + * When we are sending from the server socket, we either use the + * socket associated with the network thread we are in, or we + * use the thread from the socket associated with the handle. + */ + INSIST(sock->parent != NULL); + + if (isc__nm_in_netthread()) { + ntid = isc_nm_tid(); + } else { + ntid = sock->tid; + } + rsock = &sock->parent->children[ntid]; + } + +send: + uvreq = isc__nm_uvreq_get(rsock->mgr, rsock); + uvreq->uvbuf.base = (char *)region->base; + uvreq->uvbuf.len = region->length; + + isc_nmhandle_attach(handle, &uvreq->handle); + + uvreq->cb.send = cb; + uvreq->cbarg = cbarg; + + if (isc_nm_tid() == rsock->tid) { + REQUIRE(rsock->tid == isc_nm_tid()); + isc__netievent_udpsend_t ievent = { .sock = rsock, + .req = uvreq, + .peer = *peer }; + + isc__nm_async_udpsend(NULL, (isc__netievent_t *)&ievent); + } else { + isc__netievent_udpsend_t *ievent = + isc__nm_get_netievent_udpsend(sock->mgr, rsock); + ievent->peer = *peer; + ievent->req = uvreq; + + isc__nm_enqueue_ievent(&sock->mgr->workers[rsock->tid], + (isc__netievent_t *)ievent); + } +} + +/* + * Asynchronous 'udpsend' event handler: send a packet on a UDP socket. + */ +void +isc__nm_async_udpsend(isc__networker_t *worker, isc__netievent_t *ev0) { + isc_result_t result; + isc__netievent_udpsend_t *ievent = (isc__netievent_udpsend_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *uvreq = ievent->req; + + REQUIRE(sock->type == isc_nm_udpsocket); + REQUIRE(sock->tid == isc_nm_tid()); + UNUSED(worker); + + if (isc__nmsocket_closing(sock)) { + isc__nm_failed_send_cb(sock, uvreq, ISC_R_CANCELED); + return; + } + + result = udp_send_direct(sock, uvreq, &ievent->peer); + if (result != ISC_R_SUCCESS) { + isc__nm_incstats(sock, STATID_SENDFAIL); + isc__nm_failed_send_cb(sock, uvreq, result); + } +} + +static void +udp_send_cb(uv_udp_send_t *req, int status) { + isc_result_t result = ISC_R_SUCCESS; + isc__nm_uvreq_t *uvreq = uv_handle_get_data((uv_handle_t *)req); + isc_nmsocket_t *sock = NULL; + + REQUIRE(VALID_UVREQ(uvreq)); + REQUIRE(VALID_NMHANDLE(uvreq->handle)); + + sock = uvreq->sock; + + REQUIRE(sock->tid == isc_nm_tid()); + + if (status < 0) { + result = isc__nm_uverr2result(status); + isc__nm_incstats(sock, STATID_SENDFAIL); + } + + isc__nm_sendcb(sock, uvreq, result, false); +} + +/* + * udp_send_direct sends buf to a peer on a socket. Sock has to be in + * the same thread as the callee. + */ +static isc_result_t +udp_send_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req, + isc_sockaddr_t *peer) { + const struct sockaddr *sa = &peer->type.sa; + int r; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(VALID_UVREQ(req)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->type == isc_nm_udpsocket); + + if (isc__nmsocket_closing(sock)) { + return (ISC_R_CANCELED); + } + +#if UV_VERSION_HEX >= UV_VERSION(1, 27, 0) + /* + * If we used uv_udp_connect() (and not the shim version for + * older versions of libuv), then the peer address has to be + * set to NULL or else uv_udp_send() could fail or assert, + * depending on the libuv version. + */ + if (atomic_load(&sock->connected)) { + sa = NULL; + } +#endif + + r = uv_udp_send(&req->uv_req.udp_send, &sock->uv_handle.udp, + &req->uvbuf, 1, sa, udp_send_cb); + if (r < 0) { + return (isc__nm_uverr2result(r)); + } + + return (ISC_R_SUCCESS); +} + +static isc_result_t +udp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { + isc__networker_t *worker = NULL; + int uv_bind_flags = UV_UDP_REUSEADDR; + isc_result_t result = ISC_R_UNSET; + int r; + + REQUIRE(isc__nm_in_netthread()); + REQUIRE(sock->tid == isc_nm_tid()); + + worker = &sock->mgr->workers[isc_nm_tid()]; + + atomic_store(&sock->connecting, true); + + r = uv_udp_init(&worker->loop, &sock->uv_handle.udp); + UV_RUNTIME_CHECK(uv_udp_init, r); + uv_handle_set_data(&sock->uv_handle.handle, sock); + + r = uv_timer_init(&worker->loop, &sock->read_timer); + UV_RUNTIME_CHECK(uv_timer_init, r); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + + if (isc__nm_closing(sock)) { + result = ISC_R_SHUTTINGDOWN; + goto error; + } + + r = uv_udp_open(&sock->uv_handle.udp, sock->fd); + if (r != 0) { + isc__nm_incstats(sock, STATID_OPENFAIL); + goto done; + } + isc__nm_incstats(sock, STATID_OPEN); + + if (sock->iface.type.sa.sa_family == AF_INET6) { + uv_bind_flags |= UV_UDP_IPV6ONLY; + } + + r = uv_udp_bind(&sock->uv_handle.udp, &sock->iface.type.sa, + uv_bind_flags); + if (r != 0) { + isc__nm_incstats(sock, STATID_BINDFAIL); + goto done; + } + + isc__nm_set_network_buffers(sock->mgr, &sock->uv_handle.handle); + + /* + * On FreeBSD the UDP connect() call sometimes results in a + * spurious transient EADDRINUSE. Try a few more times before + * giving up. + */ + do { + r = isc_uv_udp_connect(&sock->uv_handle.udp, + &req->peer.type.sa); + } while (r == UV_EADDRINUSE && --req->connect_tries > 0); + if (r != 0) { + isc__nm_incstats(sock, STATID_CONNECTFAIL); + goto done; + } + isc__nm_incstats(sock, STATID_CONNECT); + + atomic_store(&sock->connecting, false); + atomic_store(&sock->connected, true); + +done: + result = isc__nm_uverr2result(r); +error: + + LOCK(&sock->lock); + sock->result = result; + SIGNAL(&sock->cond); + if (!atomic_load(&sock->active)) { + WAIT(&sock->scond, &sock->lock); + } + INSIST(atomic_load(&sock->active)); + UNLOCK(&sock->lock); + + return (result); +} + +/* + * Asynchronous 'udpconnect' call handler: open a new UDP socket and + * call the 'open' callback with a handle. + */ +void +isc__nm_async_udpconnect(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_udpconnect_t *ievent = + (isc__netievent_udpconnect_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc__nm_uvreq_t *req = ievent->req; + isc_result_t result; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_udpsocket); + REQUIRE(sock->parent == NULL); + REQUIRE(sock->tid == isc_nm_tid()); + + result = udp_connect_direct(sock, req); + if (result != ISC_R_SUCCESS) { + atomic_store(&sock->active, false); + isc__nm_udp_close(sock); + isc__nm_connectcb(sock, req, result, true); + } else { + /* + * The callback has to be called after the socket has been + * initialized + */ + isc__nm_connectcb(sock, req, ISC_R_SUCCESS, true); + } + + /* + * The sock is now attached to the handle. + */ + isc__nmsocket_detach(&sock); +} + +void +isc_nm_udpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, + isc_nm_cb_t cb, void *cbarg, unsigned int timeout, + size_t extrahandlesize) { + isc_result_t result = ISC_R_SUCCESS; + isc_nmsocket_t *sock = NULL; + isc__netievent_udpconnect_t *event = NULL; + isc__nm_uvreq_t *req = NULL; + sa_family_t sa_family; + + REQUIRE(VALID_NM(mgr)); + REQUIRE(local != NULL); + REQUIRE(peer != NULL); + + sa_family = peer->type.sa.sa_family; + + sock = isc_mem_get(mgr->mctx, sizeof(isc_nmsocket_t)); + isc__nmsocket_init(sock, mgr, isc_nm_udpsocket, local); + + sock->connect_cb = cb; + sock->connect_cbarg = cbarg; + sock->read_timeout = timeout; + sock->extrahandlesize = extrahandlesize; + sock->peer = *peer; + sock->result = ISC_R_UNSET; + atomic_init(&sock->client, true); + + req = isc__nm_uvreq_get(mgr, sock); + req->cb.connect = cb; + req->cbarg = cbarg; + req->peer = *peer; + req->local = *local; + req->handle = isc__nmhandle_get(sock, &req->peer, &sock->iface); + + result = isc__nm_socket(sa_family, SOCK_DGRAM, 0, &sock->fd); + if (result != ISC_R_SUCCESS) { + if (isc__nm_in_netthread()) { + sock->tid = isc_nm_tid(); + } + isc__nmsocket_clearcb(sock); + isc__nm_connectcb(sock, req, result, true); + atomic_store(&sock->closed, true); + isc__nmsocket_detach(&sock); + return; + } + + result = isc__nm_socket_reuse(sock->fd); + RUNTIME_CHECK(result == ISC_R_SUCCESS || + result == ISC_R_NOTIMPLEMENTED); + + result = isc__nm_socket_reuse_lb(sock->fd); + RUNTIME_CHECK(result == ISC_R_SUCCESS || + result == ISC_R_NOTIMPLEMENTED); + + (void)isc__nm_socket_incoming_cpu(sock->fd); + + (void)isc__nm_socket_disable_pmtud(sock->fd, sa_family); + + (void)isc__nm_socket_min_mtu(sock->fd, sa_family); + + event = isc__nm_get_netievent_udpconnect(mgr, sock, req); + + if (isc__nm_in_netthread()) { + atomic_store(&sock->active, true); + sock->tid = isc_nm_tid(); + isc__nm_async_udpconnect(&mgr->workers[sock->tid], + (isc__netievent_t *)event); + isc__nm_put_netievent_udpconnect(mgr, event); + } else { + atomic_init(&sock->active, false); + sock->tid = isc_random_uniform(mgr->nworkers); + isc__nm_enqueue_ievent(&mgr->workers[sock->tid], + (isc__netievent_t *)event); + } + LOCK(&sock->lock); + while (sock->result == ISC_R_UNSET) { + WAIT(&sock->cond, &sock->lock); + } + atomic_store(&sock->active, true); + BROADCAST(&sock->scond); + UNLOCK(&sock->lock); +} + +void +isc__nm_udp_read_cb(uv_udp_t *handle, ssize_t nrecv, const uv_buf_t *buf, + const struct sockaddr *addr, unsigned flags) { + isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)handle); + REQUIRE(VALID_NMSOCK(sock)); + + udp_recv_cb(handle, nrecv, buf, addr, flags); + /* + * If a caller calls isc_nm_read() on a listening socket, we can + * get here, but we MUST NOT stop reading from the listener + * socket. The only difference between listener and connected + * sockets is that the former has sock->parent set and later + * does not. + */ + if (!sock->parent) { + isc__nmsocket_timer_stop(sock); + isc__nm_stop_reading(sock); + } +} + +void +isc__nm_udp_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(result != ISC_R_SUCCESS); + + if (atomic_load(&sock->client)) { + isc__nmsocket_timer_stop(sock); + isc__nm_stop_reading(sock); + + if (!sock->recv_read) { + goto destroy; + } + sock->recv_read = false; + + if (sock->recv_cb != NULL) { + isc__nm_uvreq_t *req = isc__nm_get_read_req(sock, NULL); + isc__nmsocket_clearcb(sock); + isc__nm_readcb(sock, req, result); + } + + destroy: + isc__nmsocket_prep_destroy(sock); + return; + } + + /* + * For UDP server socket, we don't have child socket via + * "accept", so we: + * - we continue to read + * - we don't clear the callbacks + * - we don't destroy it (only stoplistening could do that) + */ + if (!sock->recv_read) { + return; + } + sock->recv_read = false; + + if (sock->recv_cb != NULL) { + isc__nm_uvreq_t *req = isc__nm_get_read_req(sock, NULL); + isc__nm_readcb(sock, req, result); + } +} + +/* + * Asynchronous 'udpread' call handler: start or resume reading on a + * socket; pause reading and call the 'recv' callback after each + * datagram. + */ +void +isc__nm_async_udpread(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_udpread_t *ievent = (isc__netievent_udpread_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + isc_result_t result; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + if (isc__nm_closing(sock)) { + result = ISC_R_SHUTTINGDOWN; + } else if (isc__nmsocket_closing(sock)) { + result = ISC_R_CANCELED; + } else { + result = isc__nm_start_reading(sock); + } + + if (result != ISC_R_SUCCESS) { + atomic_store(&sock->reading, true); + isc__nm_failed_read_cb(sock, result, false); + return; + } + + isc__nmsocket_timer_start(sock); +} + +void +isc__nm_udp_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg) { + REQUIRE(VALID_NMHANDLE(handle)); + REQUIRE(VALID_NMSOCK(handle->sock)); + + isc_nmsocket_t *sock = handle->sock; + + REQUIRE(sock->type == isc_nm_udpsocket); + REQUIRE(sock->statichandle == handle); + REQUIRE(!sock->recv_read); + + sock->recv_cb = cb; + sock->recv_cbarg = cbarg; + sock->recv_read = true; + + if (!atomic_load(&sock->reading) && sock->tid == isc_nm_tid()) { + isc__netievent_udpread_t ievent = { .sock = sock }; + isc__nm_async_udpread(NULL, (isc__netievent_t *)&ievent); + } else { + isc__netievent_udpread_t *ievent = + isc__nm_get_netievent_udpread(sock->mgr, sock); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + } +} + +static void +udp_stop_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + uv_handle_set_data(handle, NULL); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->closing)); + + if (!atomic_compare_exchange_strong(&sock->closed, &(bool){ false }, + true)) + { + UNREACHABLE(); + } + + isc__nm_incstats(sock, STATID_CLOSE); + + atomic_store(&sock->listening, false); + + isc__nmsocket_detach(&sock); +} + +static void +udp_close_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + uv_handle_set_data(handle, NULL); + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->closing)); + + if (!atomic_compare_exchange_strong(&sock->closed, &(bool){ false }, + true)) + { + UNREACHABLE(); + } + + isc__nm_incstats(sock, STATID_CLOSE); + + if (sock->server != NULL) { + isc__nmsocket_detach(&sock->server); + } + + atomic_store(&sock->connected, false); + atomic_store(&sock->listening, false); + + isc__nmsocket_prep_destroy(sock); +} + +static void +read_timer_close_cb(uv_handle_t *handle) { + isc_nmsocket_t *sock = uv_handle_get_data(handle); + uv_handle_set_data(handle, NULL); + + if (sock->parent) { + uv_close(&sock->uv_handle.handle, udp_stop_cb); + } else { + uv_close(&sock->uv_handle.handle, udp_close_cb); + } +} + +static void +stop_udp_child(isc_nmsocket_t *sock) { + REQUIRE(sock->type == isc_nm_udpsocket); + REQUIRE(sock->tid == isc_nm_tid()); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + return; + } + + udp_close_direct(sock); + + atomic_fetch_sub(&sock->parent->rchildren, 1); + + isc_barrier_wait(&sock->parent->stoplistening); +} + +static void +stop_udp_parent(isc_nmsocket_t *sock) { + isc_nmsocket_t *csock = NULL; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->type == isc_nm_udplistener); + + isc_barrier_init(&sock->stoplistening, sock->nchildren); + + for (size_t i = 0; i < sock->nchildren; i++) { + csock = &sock->children[i]; + REQUIRE(VALID_NMSOCK(csock)); + + if ((int)i == isc_nm_tid()) { + /* + * We need to schedule closing the other sockets first + */ + continue; + } + + atomic_store(&csock->active, false); + enqueue_stoplistening(csock); + } + + csock = &sock->children[isc_nm_tid()]; + atomic_store(&csock->active, false); + stop_udp_child(csock); + + atomic_store(&sock->closed, true); + isc__nmsocket_prep_destroy(sock); +} + +static void +udp_close_direct(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + uv_close((uv_handle_t *)&sock->read_timer, read_timer_close_cb); +} + +void +isc__nm_async_udpclose(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_udpclose_t *ievent = (isc__netievent_udpclose_t *)ev0; + isc_nmsocket_t *sock = ievent->sock; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + UNUSED(worker); + + udp_close_direct(sock); +} + +void +isc__nm_udp_close(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_udpsocket); + REQUIRE(!isc__nmsocket_active(sock)); + + if (!atomic_compare_exchange_strong(&sock->closing, &(bool){ false }, + true)) + { + return; + } + + if (sock->tid == isc_nm_tid()) { + udp_close_direct(sock); + } else { + isc__netievent_udpclose_t *ievent = + isc__nm_get_netievent_udpclose(sock->mgr, sock); + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); + } +} + +void +isc__nm_udp_shutdown(isc_nmsocket_t *sock) { + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(sock->type == isc_nm_udpsocket); + + /* + * If the socket is active, mark it inactive and + * continue. If it isn't active, stop now. + */ + if (!isc__nmsocket_deactivate(sock)) { + return; + } + + /* + * If the socket is connecting, the cancel will happen in the + * async_udpconnect() due socket being inactive now. + */ + if (atomic_load(&sock->connecting)) { + return; + } + + /* + * When the client detaches the last handle, the + * sock->statichandle would be NULL, in that case, nobody is + * interested in the callback. + */ + if (sock->statichandle != NULL) { + if (isc__nm_closing(sock)) { + isc__nm_failed_read_cb(sock, ISC_R_SHUTTINGDOWN, false); + } else { + isc__nm_failed_read_cb(sock, ISC_R_CANCELED, false); + } + return; + } + + /* + * Otherwise, we just send the socket to abyss... + */ + if (sock->parent == NULL) { + isc__nmsocket_prep_destroy(sock); + } +} + +void +isc__nm_udp_cancelread(isc_nmhandle_t *handle) { + isc_nmsocket_t *sock = NULL; + isc__netievent_udpcancel_t *ievent = NULL; + + REQUIRE(VALID_NMHANDLE(handle)); + + sock = handle->sock; + + REQUIRE(VALID_NMSOCK(sock)); + REQUIRE(sock->type == isc_nm_udpsocket); + + ievent = isc__nm_get_netievent_udpcancel(sock->mgr, sock, handle); + + isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid], + (isc__netievent_t *)ievent); +} + +void +isc__nm_async_udpcancel(isc__networker_t *worker, isc__netievent_t *ev0) { + isc__netievent_udpcancel_t *ievent = (isc__netievent_udpcancel_t *)ev0; + isc_nmsocket_t *sock = NULL; + + UNUSED(worker); + + REQUIRE(VALID_NMSOCK(ievent->sock)); + + sock = ievent->sock; + + REQUIRE(sock->tid == isc_nm_tid()); + REQUIRE(atomic_load(&sock->client)); + + isc__nm_failed_read_cb(sock, ISC_R_EOF, false); +} diff --git a/lib/isc/netmgr/uv-compat.c b/lib/isc/netmgr/uv-compat.c new file mode 100644 index 0000000..b7c0f7b --- /dev/null +++ b/lib/isc/netmgr/uv-compat.c @@ -0,0 +1,140 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#include "uv-compat.h" +#include <unistd.h> + +#include <isc/util.h> + +#include "netmgr-int.h" + +#if UV_VERSION_HEX < UV_VERSION(1, 27, 0) +int +isc_uv_udp_connect(uv_udp_t *handle, const struct sockaddr *addr) { + int err = 0; + + do { + int addrlen = (addr->sa_family == AF_INET) + ? sizeof(struct sockaddr_in) + : sizeof(struct sockaddr_in6); + err = connect(handle->io_watcher.fd, addr, addrlen); + } while (err == -1 && errno == EINTR); + + if (err) { +#if UV_VERSION_HEX >= UV_VERSION(1, 10, 0) + return (uv_translate_sys_error(errno)); +#else + return (-errno); +#endif /* UV_VERSION_HEX >= UV_VERSION(1, 10, 0) */ + } + + return (0); +} +#endif /* UV_VERSION_HEX < UV_VERSION(1, 27, 0) */ + +#if UV_VERSION_HEX < UV_VERSION(1, 32, 0) +int +uv_tcp_close_reset(uv_tcp_t *handle, uv_close_cb close_cb) { + if (setsockopt(handle->io_watcher.fd, SOL_SOCKET, SO_LINGER, + &(struct linger){ 1, 0 }, sizeof(struct linger)) == -1) + { +#if UV_VERSION_HEX >= UV_VERSION(1, 10, 0) + return (uv_translate_sys_error(errno)); +#else + return (-errno); +#endif /* UV_VERSION_HEX >= UV_VERSION(1, 10, 0) */ + } + + uv_close((uv_handle_t *)handle, close_cb); + return (0); +} +#endif /* UV_VERSION_HEX < UV_VERSION(1, 32, 0) */ + +int +isc_uv_udp_freebind(uv_udp_t *handle, const struct sockaddr *addr, + unsigned int flags) { + int r; + uv_os_sock_t fd; + + r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd); + if (r < 0) { + return (r); + } + + r = uv_udp_bind(handle, addr, flags); + if (r == UV_EADDRNOTAVAIL && + isc__nm_socket_freebind(fd, addr->sa_family) == ISC_R_SUCCESS) + { + /* + * Retry binding with IP_FREEBIND (or equivalent option) if the + * address is not available. This helps with IPv6 tentative + * addresses which are reported by the route socket, although + * named is not yet able to properly bind to them. + */ + r = uv_udp_bind(handle, addr, flags); + } + + return (r); +} + +static int +isc__uv_tcp_bind_now(uv_tcp_t *handle, const struct sockaddr *addr, + unsigned int flags) { + int r; + struct sockaddr_storage sname; + int snamelen = sizeof(sname); + + r = uv_tcp_bind(handle, addr, flags); + if (r < 0) { + return (r); + } + + /* + * uv_tcp_bind() uses a delayed error, initially returning + * success even if bind() fails. By calling uv_tcp_getsockname() + * here we can find out whether the bind() call was successful. + */ + r = uv_tcp_getsockname(handle, (struct sockaddr *)&sname, &snamelen); + if (r < 0) { + return (r); + } + + return (0); +} + +int +isc_uv_tcp_freebind(uv_tcp_t *handle, const struct sockaddr *addr, + unsigned int flags) { + int r; + uv_os_sock_t fd; + + r = uv_fileno((const uv_handle_t *)handle, (uv_os_fd_t *)&fd); + if (r < 0) { + return (r); + } + + r = isc__uv_tcp_bind_now(handle, addr, flags); + if (r == UV_EADDRNOTAVAIL && + isc__nm_socket_freebind(fd, addr->sa_family) == ISC_R_SUCCESS) + { + /* + * Retry binding with IP_FREEBIND (or equivalent option) if the + * address is not available. This helps with IPv6 tentative + * addresses which are reported by the route socket, although + * named is not yet able to properly bind to them. + */ + r = isc__uv_tcp_bind_now(handle, addr, flags); + } + + return (r); +} diff --git a/lib/isc/netmgr/uv-compat.h b/lib/isc/netmgr/uv-compat.h new file mode 100644 index 0000000..3a10387 --- /dev/null +++ b/lib/isc/netmgr/uv-compat.h @@ -0,0 +1,126 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#pragma once +#include <uv.h> + +/* + * These functions were introduced in newer libuv, but we still + * want BIND9 compile on older ones so we emulate them. + * They're inline to avoid conflicts when running with a newer + * library version. + */ + +#define UV_VERSION(major, minor, patch) ((major << 16) | (minor << 8) | (patch)) + +/* + * Copied verbatim from libuv/src/version.c + */ + +#define UV_STRINGIFY(v) UV_STRINGIFY_HELPER(v) +#define UV_STRINGIFY_HELPER(v) #v + +#define UV_VERSION_STRING_BASE \ + UV_STRINGIFY(UV_VERSION_MAJOR) \ + "." UV_STRINGIFY(UV_VERSION_MINOR) "." UV_STRINGIFY(UV_VERSION_PATCH) + +#if UV_VERSION_IS_RELEASE +#define UV_VERSION_STRING UV_VERSION_STRING_BASE +#else +#define UV_VERSION_STRING UV_VERSION_STRING_BASE "-" UV_VERSION_SUFFIX +#endif + +#if !defined(UV__ERR) +#define UV__ERR(x) (-(x)) +#endif + +#if UV_VERSION_HEX < UV_VERSION(1, 19, 0) +static inline void * +uv_handle_get_data(const uv_handle_t *handle) { + return (handle->data); +} + +static inline void +uv_handle_set_data(uv_handle_t *handle, void *data) { + handle->data = data; +} + +static inline void * +uv_req_get_data(const uv_req_t *req) { + return (req->data); +} + +static inline void +uv_req_set_data(uv_req_t *req, void *data) { + req->data = data; +} +#endif /* UV_VERSION_HEX < UV_VERSION(1, 19, 0) */ + +#if UV_VERSION_HEX < UV_VERSION(1, 32, 0) +int +uv_tcp_close_reset(uv_tcp_t *handle, uv_close_cb close_cb); +#endif + +#if UV_VERSION_HEX < UV_VERSION(1, 34, 0) +#define uv_sleep(msec) usleep(msec * 1000) +#endif /* UV_VERSION_HEX < UV_VERSION(1, 34, 0) */ + +#if UV_VERSION_HEX < UV_VERSION(1, 27, 0) +int +isc_uv_udp_connect(uv_udp_t *handle, const struct sockaddr *addr); +/*%< + * Associate the UDP handle to a remote address and port, so every message sent + * by this handle is automatically sent to that destination. + * + * NOTE: This is just a limited shim for uv_udp_connect() as it requires the + * handle to be bound. + */ +#else /* UV_VERSION_HEX < UV_VERSION(1, 27, 0) */ +#define isc_uv_udp_connect uv_udp_connect +#endif /* UV_VERSION_HEX < UV_VERSION(1, 27, 0) */ + +#if UV_VERSION_HEX < UV_VERSION(1, 12, 0) +#include <stdlib.h> +#include <string.h> + +static inline int +uv_os_getenv(const char *name, char *buffer, size_t *size) { + size_t len; + char *buf = getenv(name); + + if (buf == NULL) { + return (UV_ENOENT); + } + + len = strlen(buf) + 1; + if (len > *size) { + *size = len; + return (UV_ENOBUFS); + } + + *size = len; + memmove(buffer, buf, len); + + return (0); +} + +#define uv_os_setenv(name, value) setenv(name, value, 0) +#endif /* UV_VERSION_HEX < UV_VERSION(1, 12, 0) */ + +int +isc_uv_udp_freebind(uv_udp_t *handle, const struct sockaddr *addr, + unsigned int flags); + +int +isc_uv_tcp_freebind(uv_tcp_t *handle, const struct sockaddr *addr, + unsigned int flags); diff --git a/lib/isc/netmgr/uverr2result.c b/lib/isc/netmgr/uverr2result.c new file mode 100644 index 0000000..9f16ea8 --- /dev/null +++ b/lib/isc/netmgr/uverr2result.c @@ -0,0 +1,105 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#include <stdbool.h> +#include <uv.h> + +#include <isc/result.h> +#include <isc/string.h> +#include <isc/util.h> + +#include "netmgr-int.h" + +/*% + * Convert a libuv error value into an isc_result_t. The + * list of supported error values is not complete; new users + * of this function should add any expected errors that are + * not already there. + */ +isc_result_t +isc___nm_uverr2result(int uverr, bool dolog, const char *file, + unsigned int line, const char *func) { + switch (uverr) { + case 0: + return (ISC_R_SUCCESS); + case UV_ENOTDIR: + case UV_ELOOP: + case UV_EINVAL: /* XXX sometimes this is not for files */ + case UV_ENAMETOOLONG: + case UV_EBADF: + return (ISC_R_INVALIDFILE); + case UV_ENOENT: + return (ISC_R_FILENOTFOUND); + case UV_EAGAIN: + return (ISC_R_NOCONN); + case UV_EACCES: + case UV_EPERM: + return (ISC_R_NOPERM); + case UV_EEXIST: + return (ISC_R_FILEEXISTS); + case UV_EIO: + return (ISC_R_IOERROR); + case UV_ENOMEM: + return (ISC_R_NOMEMORY); + case UV_ENFILE: + case UV_EMFILE: + return (ISC_R_TOOMANYOPENFILES); + case UV_ENOSPC: + return (ISC_R_DISCFULL); + case UV_EPIPE: + case UV_ECONNRESET: + case UV_ECONNABORTED: + return (ISC_R_CONNECTIONRESET); + case UV_ENOTCONN: + return (ISC_R_NOTCONNECTED); + case UV_ETIMEDOUT: + return (ISC_R_TIMEDOUT); + case UV_ENOBUFS: + return (ISC_R_NORESOURCES); + case UV_EAFNOSUPPORT: + return (ISC_R_FAMILYNOSUPPORT); + case UV_ENETDOWN: + return (ISC_R_NETDOWN); + case UV_EHOSTDOWN: + return (ISC_R_HOSTDOWN); + case UV_ENETUNREACH: + return (ISC_R_NETUNREACH); + case UV_EHOSTUNREACH: + return (ISC_R_HOSTUNREACH); + case UV_EADDRINUSE: + return (ISC_R_ADDRINUSE); + case UV_EADDRNOTAVAIL: + return (ISC_R_ADDRNOTAVAIL); + case UV_ECONNREFUSED: + return (ISC_R_CONNREFUSED); + case UV_ECANCELED: + return (ISC_R_CANCELED); + case UV_EOF: + return (ISC_R_EOF); + case UV_EMSGSIZE: + return (ISC_R_MAXSIZE); + case UV_ENOTSUP: + return (ISC_R_FAMILYNOSUPPORT); + case UV_ENOPROTOOPT: + case UV_EPROTONOSUPPORT: + return (ISC_R_INVALIDPROTO); + default: + if (dolog) { + UNEXPECTED_ERROR("unable to convert libuv error code " + "in %s (%s:%d) to isc_result: %d: %s", + func, file, line, uverr, + uv_strerror(uverr)); + } + return (ISC_R_UNEXPECTED); + } +} |