1 files changed, 387 insertions, 0 deletions
diff --git a/src/sock_unix.c b/src/sock_unix.c
new file mode 100644
index 0000000..ef749a5
--- /dev/null
+++ b/src/sock_unix.c
@@ -0,0 +1,387 @@
+/*
+ * SOCK_UNIX socket management
+ *
+ * Copyright 2000-2020 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <ctype.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <haproxy/api.h>
+#include <haproxy/errors.h>
+#include <haproxy/fd.h>
+#include <haproxy/global.h>
+#include <haproxy/listener.h>
+#include <haproxy/receiver-t.h>
+#include <haproxy/namespace.h>
+#include <haproxy/sock.h>
+#include <haproxy/sock_unix.h>
+#include <haproxy/tools.h>
+
+/* AF_UNIX address family descriptor: identity, address sizes, then callbacks */
+struct proto_fam proto_fam_unix = {
+	.name         = "unix",
+	.sock_family  = AF_UNIX,
+	.sock_domain  = PF_UNIX,
+	.sock_addrlen = sizeof(struct sockaddr_un),
+	.l3_addrlen   = sizeof(((struct sockaddr_un *)0)->sun_path),
+	.bind         = sock_unix_bind_receiver,
+	.addrcmp      = sock_unix_addrcmp,
+	.get_src      = sock_get_src,
+	.get_dst      = sock_get_dst,
+};
+
+/* PLEASE NOTE for functions below:
+ *
+ * The address family SHOULD always be checked. In some cases a function will
+ * be used in a situation where the address family is guaranteed (e.g. protocol
+ * definitions), so the test may be avoided. This special case must then be
+ * mentioned in the comment before the function definition.
+ */
+
+
+/* Compares two AF_UNIX sockaddr addresses and returns 0 when they match or
+ * non-zero when they do not. Abstract (ABNS) addresses, i.e. those whose path
+ * starts with \0, are compared over the whole sun_path area. Regular paths are
+ * also reported as matching when they only differ by a trailing .XXXXX.tmp
+ * suffix, which designates a newly bound socket about to replace the final
+ * one. Note that our UNIX socket paths are always zero-terminated.
+ */
+int sock_unix_addrcmp(const struct sockaddr_storage *a, const struct sockaddr_storage *b)
+{
+	const char *pa = ((const struct sockaddr_un *)a)->sun_path;
+	const char *pb = ((const struct sockaddr_un *)b)->sun_path;
+	int pos, last_dot, scan;
+
+	if (a->ss_family != b->ss_family || a->ss_family != AF_UNIX)
+		return -1;
+
+	if (pa[0] != pb[0])
+		return -1;
+
+	if (!pa[0])
+		return memcmp(pa, pb, sizeof(((const struct sockaddr_un *)a)->sun_path));
+
+	/* walk the common prefix, remembering the last dot met in it */
+	last_dot = 0;
+	for (pos = 1; pa[pos] == pb[pos]; pos++) {
+		if (!pa[pos])
+			return 0;
+		if (pa[pos] == '.')
+			last_dot = pos;
+	}
+
+	/* The paths diverge at <pos>. This is still acceptable when the
+	 * divergence lies within or right after a sequence of digits which
+	 * follows a dot, and which is itself followed by ".tmp".
+	 *
+	 * When one of the paths ends here, it is in "final" format (it does
+	 * not end with ".XXXX.tmp"), so the other one has to be checked as a
+	 * temporary name whose suffix starts with a dot at this very position.
+	 *
+	 * Examples (XXXX standing for digits):
+	 *     /tmp/test matches with /tmp/test.1822.tmp
+	 *     /tmp/test.1822.tmp matches with /tmp/test.XXXX.tmp
+	 */
+	if (!pa[pos] || !pb[pos]) {
+		if (pa[pos] != '.' && pb[pos] != '.')
+			return -1; /* invalid temp path */
+		last_dot = pos; /* try to match against temp path */
+	}
+
+	if (!last_dot)
+		return -1;
+
+	/* path "a" goes on past <pos>: it must end with <digits>.tmp */
+	if (pa[pos]) {
+		scan = last_dot + 1;
+		while (isdigit((unsigned char)pa[scan]))
+			scan++;
+		if (strcmp(pa + scan, ".tmp") != 0)
+			return -1;
+	}
+
+	/* same verification for path "b" */
+	if (pb[pos]) {
+		scan = last_dot + 1;
+		while (isdigit((unsigned char)pb[scan]))
+			scan++;
+		if (strcmp(pb + scan, ".tmp") != 0)
+			return -1;
+	}
+
+	/* OK that's a match */
+	return 0;
+}
+
+/* Binds receiver <rx>, reusing an inherited/duplicated FD when there is one, or
+ * binding a new socket to <path>.<pid>.tmp then renaming it over <path> (with a
+ * <path>.<pid>.bak link for rollback), and registers it with rx->iocb/rx->owner.
+ * Returns ERR_NONE or ERR_* bits; a message may be passed into <errmsg>.
+ */
+int sock_unix_bind_receiver(struct receiver *rx, char **errmsg)
+{
+	char tempname[MAXPATHLEN];
+	char backname[MAXPATHLEN];
+	struct sockaddr_un addr;
+	const char *path;
+	int maxpathlen;
+	int fd, err, ext, ret;
+
+	/* ensure we never return garbage */
+	if (errmsg)
+		*errmsg = 0;
+
+	err = ERR_NONE;
+
+	if (rx->flags & RX_F_BOUND)
+		return ERR_NONE;
+
+	if (rx->flags & RX_F_MUST_DUP) {
+		/* this is a secondary receiver that is an exact copy of a
+		 * reference which must already be bound (or has failed).
+		 * We'll try to dup() the other one's FD and take it. We
+		 * try hard not to reconfigure the socket since it's shared.
+		 */
+		BUG_ON(!rx->shard_info);
+		if (!(rx->shard_info->ref->flags & RX_F_BOUND)) {
+			/* it's assumed that the first one has already reported
+			 * the error, let's not spam with another one, and do
+			 * not set ERR_ALERT.
+			 */
+			err |= ERR_RETRYABLE;
+			goto bind_ret_err;
+		}
+		/* taking the other one's FD will result in it being marked
+		 * extern and being dup()ed. Let's mark the receiver as
+		 * inherited so that it properly bypasses all second-stage
+		 * setup and avoids being passed to new processes.
+		 */
+		rx->flags |= RX_F_INHERITED;
+		rx->fd = rx->shard_info->ref->fd;
+	}
+
+	/* if no FD was assigned yet, we'll have to either find a compatible
+	 * one or create a new one.
+	 */
+	if (rx->fd == -1)
+		rx->fd = sock_find_compatible_fd(rx);
+
+	path = ((struct sockaddr_un *)&rx->addr)->sun_path;
+	maxpathlen = MIN(MAXPATHLEN, sizeof(addr.sun_path));
+
+	/* if the listener already has an fd assigned, then we were offered the
+	 * fd by an external process (most likely the parent), and we don't want
+	 * to create a new socket. However we still want to set a few flags on
+	 * the socket.
+	 */
+	fd = rx->fd;
+	ext = (fd >= 0);
+	if (ext)
+		goto fd_ready;
+
+	if (path[0]) {
+		ret = snprintf(tempname, maxpathlen, "%s.%d.tmp", path, pid);
+		if (ret < 0 || ret >= sizeof(addr.sun_path)) {
+			err |= ERR_FATAL | ERR_ALERT;
+			memprintf(errmsg, "name too long for UNIX socket (limit usually 97)");
+			goto bind_return;
+		}
+
+		ret = snprintf(backname, maxpathlen, "%s.%d.bak", path, pid);
+		if (ret < 0 || ret >= maxpathlen) {
+			err |= ERR_FATAL | ERR_ALERT;
+			memprintf(errmsg, "name too long for UNIX socket (limit usually 97)");
+			goto bind_return;
+		}
+
+		/* 2. clean existing orphaned entries */
+		if (unlink(tempname) < 0 && errno != ENOENT) {
+			err |= ERR_FATAL | ERR_ALERT;
+			memprintf(errmsg, "error when trying to unlink previous UNIX socket (%s)", strerror(errno));
+			goto bind_return;
+		}
+
+		if (unlink(backname) < 0 && errno != ENOENT) {
+			err |= ERR_FATAL | ERR_ALERT;
+			memprintf(errmsg, "error when trying to unlink previous UNIX socket (%s)", strerror(errno));
+			goto bind_return;
+		}
+
+		/* 3. backup existing socket */
+		if (link(path, backname) < 0 && errno != ENOENT) {
+			err |= ERR_FATAL | ERR_ALERT;
+			memprintf(errmsg, "error when trying to preserve previous UNIX socket (%s)", strerror(errno));
+			goto bind_return;
+		}
+
+		/* Note: this test is redundant with the snprintf one above and
+		 * will never trigger, it's just added as the only way to shut
+		 * gcc's painfully dumb warning about possibly truncated output
+		 * during strncpy(). Don't move it above or smart gcc will not
+		 * see it!
+		 */
+		if (strlen(tempname) >= sizeof(addr.sun_path)) {
+			err |= ERR_FATAL | ERR_ALERT;
+			memprintf(errmsg, "name too long for UNIX socket (limit usually 97)");
+			goto bind_return;
+		}
+
+		strncpy(addr.sun_path, tempname, sizeof(addr.sun_path) - 1);
+		addr.sun_path[sizeof(addr.sun_path) - 1] = 0;
+	}
+	else {
+		/* first char is zero, it's an abstract socket whose address
+		 * is defined by all the bytes past this zero.
+		 */
+		memcpy(addr.sun_path, path, sizeof(addr.sun_path));
+	}
+	addr.sun_family = AF_UNIX;
+
+	/* WT: shouldn't we use my_socketat(rx->netns) here instead ? */
+	fd = socket(rx->proto->fam->sock_domain, rx->proto->sock_type, rx->proto->sock_prot);
+	if (fd < 0) {
+		err |= ERR_FATAL | ERR_ALERT;
+		memprintf(errmsg, "cannot create receiving socket (%s)", strerror(errno));
+		goto err_unlink_back; /* a backup link may exist by now, drop it */
+	}
+
+ fd_ready:
+	if (ext && fd < global.maxsock && fdtab[fd].owner) {
+		/* This FD was already bound so this means that it was already
+		 * known and registered before parsing, hence it's an inherited
+		 * FD. The only reason why it's already known here is that it
+		 * has been registered multiple times (multiple listeners on the
+		 * same, or a "shards" directive on the line). There cannot be
+		 * multiple listeners on one FD but at least we can create a
+		 * new one from the original one. We won't reconfigure it,
+		 * however, as this was already done for the first one.
+		 */
+		fd = dup(fd);
+		if (fd == -1) {
+			err |= ERR_RETRYABLE | ERR_ALERT;
+			memprintf(errmsg, "cannot dup() receiving socket (%s)", strerror(errno));
+			goto bind_return;
+		}
+	}
+
+	if (fd >= global.maxsock) {
+		err |= ERR_FATAL | ERR_ABORT | ERR_ALERT;
+		memprintf(errmsg, "not enough free sockets (raise '-n' parameter)");
+		goto bind_close_return;
+	}
+
+	if (fd_set_nonblock(fd) == -1) {
+		err |= ERR_FATAL | ERR_ALERT;
+		memprintf(errmsg, "cannot make socket non-blocking");
+		goto bind_close_return;
+	}
+
+	if (!ext && bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		/* note that bind() creates the socket <tempname> on the file system */
+		if (errno == EADDRINUSE) {
+			/* the old process might still own it, let's retry */
+			err |= ERR_RETRYABLE | ERR_ALERT;
+			memprintf(errmsg, "cannot bind UNIX socket (already in use)");
+			goto bind_close_return;
+		}
+		else {
+			err |= ERR_FATAL | ERR_ALERT;
+			memprintf(errmsg, "cannot bind UNIX socket (%s)", strerror(errno));
+			goto bind_close_return;
+		}
+	}
+
+	/* <uid> and <gid> different of -1 will be used to change the socket owner.
+	 * If <mode> is not 0, it will be used to restrict access to the socket.
+	 * While it is known not to be portable on every OS, it's still useful
+	 * where it works. We also don't change permissions on abstract sockets.
+	 */
+	if (!ext && path[0] &&
+	    (((rx->settings->ux.uid != -1 || rx->settings->ux.gid != -1) &&
+	      (chown(tempname, rx->settings->ux.uid, rx->settings->ux.gid) == -1)) ||
+	     (rx->settings->ux.mode != 0 && chmod(tempname, rx->settings->ux.mode) == -1))) {
+		err |= ERR_FATAL | ERR_ALERT;
+		memprintf(errmsg, "cannot change UNIX socket ownership (%s)", strerror(errno));
+		goto err_unlink_temp;
+	}
+
+	/* Point of no return: we are ready, we'll switch the sockets. We don't
+	 * fear losing the socket <path> because we have a copy of it in
+	 * backname. Abstract sockets are not renamed.
+	 */
+	if (!ext && path[0] && rename(tempname, path) < 0) {
+		err |= ERR_FATAL | ERR_ALERT;
+		memprintf(errmsg, "cannot switch final and temporary UNIX sockets (%s)", strerror(errno));
+		goto err_rename;
+	}
+
+	/* Cleanup: only unlink if we didn't inherit the fd from the parent */
+	if (!ext && path[0])
+		unlink(backname);
+
+	rx->fd = fd;
+	rx->flags |= RX_F_BOUND;
+
+	if (!path[0]) {
+		/* ABNS sockets do not support suspend, and they conflict with
+		 * other ones (no reuseport), so they must always be unbound.
+		 */
+		rx->flags |= RX_F_NON_SUSPENDABLE;
+	}
+
+	fd_insert(fd, rx->owner, rx->iocb, rx->bind_tgroup, rx->bind_thread);
+
+	/* for now, all regularly bound (non-inherited) UNIX receivers are exportable */
+	if (!(rx->flags & RX_F_INHERITED))
+		HA_ATOMIC_OR(&fdtab[fd].state, FD_EXPORTED);
+
+	return err;
+
+ err_rename: /* put the previous socket back in place from its backup link */
+	ret = rename(backname, path);
+	if (ret < 0 && errno == ENOENT)
+		unlink(path);
+ err_unlink_temp: /* drop our temporary socket and close the FD */
+	if (!ext && path[0])
+		unlink(tempname);
+	close(fd);
+ err_unlink_back: /* drop the backup link; no-op for inherited/abstract sockets */
+	if (!ext && path[0])
+		unlink(backname);
+ bind_return:
+	if (errmsg && *errmsg) {
+		if (!ext) {
+			char *path_str;
+
+			path_str = sa2str((struct sockaddr_storage *)&rx->addr, 0, 0);
+			memprintf(errmsg, "%s [%s]", *errmsg, ((path_str) ? path_str : ""));
+			ha_free(&path_str);
+		}
+		else
+			memprintf(errmsg, "%s [fd %d]", *errmsg, fd);
+	}
+ bind_ret_err:
+	return err;
+
+ bind_close_return: /* failure before the switch: close, then drop the backup */
+	close(fd);
+	goto err_unlink_back;
+}