Diffstat (limited to 'src/master/service-monitor.c')
-rw-r--r-- | src/master/service-monitor.c | 766
1 files changed, 766 insertions, 0 deletions
diff --git a/src/master/service-monitor.c b/src/master/service-monitor.c
new file mode 100644
index 0000000..c10e99c
--- /dev/null
+++ b/src/master/service-monitor.c
@@ -0,0 +1,766 @@
+/* Copyright (c) 2005-2018 Dovecot authors, see the included COPYING file */
+
+#include "common.h"
+#include "array.h"
+#include "ioloop.h"
+#include "hash.h"
+#include "str.h"
+#include "safe-mkstemp.h"
+#include "time-util.h"
+#include "sleep.h"
+#include "master-client.h"
+#include "service.h"
+#include "service-process.h"
+#include "service-process-notify.h"
+#include "service-anvil.h"
+#include "service-log.h"
+#include "service-monitor.h"
+
+#include <unistd.h>
+#include <sys/wait.h>
+#include <syslog.h>
+#include <signal.h>
+
+#define SERVICE_DROP_WARN_INTERVAL_SECS 1
+#define SERVICE_DROP_TIMEOUT_MSECS (10*1000)
+#define SERVICE_LOG_DROP_WARNING_DELAY_MSECS 500
+#define MAX_DIE_WAIT_MSECS 5000
+#define SERVICE_MAX_EXIT_FAILURES_IN_SEC 10
+#define SERVICE_PREFORK_MAX_AT_ONCE 10
+
+static void service_monitor_start_extra_avail(struct service *service);
+static void service_status_more(struct service_process *process,
+				const struct master_status *status);
+static void service_monitor_listen_start_force(struct service *service);
+
+static void service_process_kill_idle(struct service_process *process)
+{
+	struct service *service = process->service;
+	struct master_status status;
+
+	i_assert(process->available_count == service->client_limit);
+
+	if (service->process_avail <= service->set->process_min_avail) {
+		/* we don't have any extra idling processes anymore. */
+		timeout_remove(&process->to_idle);
+	} else if (process->last_kill_sent > process->last_status_update+1) {
+		service_error(service, "Process %s is ignoring idle SIGINT",
+			      dec2str(process->pid));
+
+		/* assume this process is busy */
+		i_zero(&status);
+		service_status_more(process, &status);
+		process->available_count = 0;
+	} else {
+		if (kill(process->pid, SIGINT) < 0 && errno != ESRCH) {
+			service_error(service, "kill(%s, SIGINT) failed: %m",
+				      dec2str(process->pid));
+		}
+		process->last_kill_sent = ioloop_time;
+	}
+}
+
+static void service_status_more(struct service_process *process,
+				const struct master_status *status)
+{
+	struct service *service = process->service;
+
+	process->total_count +=
+		process->available_count - status->available_count;
+	process->idle_start = 0;
+
+	timeout_remove(&process->to_idle);
+
+	if (status->available_count != 0)
+		return;
+
+	/* process used up all of its clients */
+	i_assert(service->process_avail > 0);
+	service->process_avail--;
+
+	if (service->type == SERVICE_TYPE_LOGIN &&
+	    service->process_avail == 0 &&
+	    service->process_count == service->process_limit)
+		service_login_notify(service, TRUE);
+
+	/* we may need to start more */
+	service_monitor_start_extra_avail(service);
+	service_monitor_listen_start(service);
+}
+
+static void service_check_idle(struct service_process *process)
+{
+	struct service *service = process->service;
+
+	if (process->available_count != service->client_limit)
+		return;
+	process->idle_start = ioloop_time;
+	if (service->process_avail > service->set->process_min_avail &&
+	    process->to_idle == NULL &&
+	    service->idle_kill != UINT_MAX) {
+		/* we have more processes than we really need.
+		   add a bit of randomness so that we don't send the
+		   signal to all of them at once */
+		process->to_idle =
+			timeout_add((service->idle_kill * 1000) +
+				    i_rand_limit(100) * 10,
+				    service_process_kill_idle,
+				    process);
+	}
+}
+
+static void service_status_less(struct service_process *process)
+{
+	struct service *service = process->service;
+
+	/* some process got more connections - remove the delayed warning */
+	timeout_remove(&service->to_drop_warning);
+
+	if (process->available_count == 0) {
+		/* process can accept more clients again */
+		if (service->process_avail++ == 0)
+			service_monitor_listen_stop(service);
+		i_assert(service->process_avail <= service->process_count);
+	}
+	if (service->type == SERVICE_TYPE_LOGIN)
+		service_login_notify(service, FALSE);
+}
+
+static void
+service_status_input_one(struct service *service,
+			 const struct master_status *status)
+{
+	struct service_process *process;
+
+	process = hash_table_lookup(service_pids, POINTER_CAST(status->pid));
+	if (process == NULL) {
+		/* we've probably wait()ed it away already. ignore */
+		return;
+	}
+
+	if (process->uid != status->uid || process->service != service) {
+		/* a) Process was closed and another process was created with
+		   the same PID, but we're still receiving status update from
+		   the old process.
+
+		   b) Some process is trying to corrupt our internal state by
+		   trying to pretend to be someone else. We could use stronger
+		   randomness here, but the worst they can do is DoS and there
+		   are already more serious problems if someone is able to do
+		   this.. */
+		service_error(service, "Ignoring invalid update from child %s "
+			      "(UID=%u)", dec2str(status->pid), status->uid);
+		return;
+	}
+	process->last_status_update = ioloop_time;
+
+	/* first status notification */
+	timeout_remove(&process->to_status);
+
+	if (process->available_count != status->available_count) {
+		if (process->available_count > status->available_count) {
+			/* process started servicing some more clients */
+			service_status_more(process, status);
+		} else {
+			/* process finished servicing some clients */
+			service_status_less(process);
+		}
+		process->available_count = status->available_count;
+	}
+	service_check_idle(process);
+}
+
+static void service_status_input(struct service *service)
+{
+	struct master_status status[1024/sizeof(struct master_status)];
+	unsigned int i, count;
+	ssize_t ret;
+
+	ret = read(service->status_fd[0], &status, sizeof(status));
+	if (ret <= 0) {
+		if (ret == 0)
+			service_error(service, "read(status) failed: EOF");
+		else if (errno != EAGAIN)
+			service_error(service, "read(status) failed: %m");
+		else
+			return;
+		service_monitor_stop(service);
+		return;
+	}
+
+	if ((ret % sizeof(struct master_status)) != 0) {
+		service_error(service, "service sent partial status update "
+			      "(%d bytes)", (int)ret);
+		return;
+	}
+
+	count = ret / sizeof(struct master_status);
+	for (i = 0; i < count; i++)
+		service_status_input_one(service, &status[i]);
+}
+
+static void service_log_drop_warning(struct service *service)
+{
+	const char *limit_name;
+	unsigned int limit;
+
+	if (service->last_drop_warning +
+	    SERVICE_DROP_WARN_INTERVAL_SECS <= ioloop_time) {
+		service->last_drop_warning = ioloop_time;
+		if (service->process_limit > 1) {
+			limit_name = "process_limit";
+			limit = service->process_limit;
+		} else if (service->set->service_count == 1) {
+			i_assert(service->client_limit == 1);
+			limit_name = "client_limit/service_count";
+			limit = 1;
+		} else {
+			limit_name = "client_limit";
+			limit = service->client_limit;
+		}
+		i_warning("service(%s): %s (%u) reached, "
+			  "client connections are being dropped",
+			  service->set->name, limit_name, limit);
+	}
+}
+
+static void service_monitor_throttle(struct service *service)
+{
+	if (service->to_throttle != NULL || service->list->destroying)
+		return;
+
+	i_assert(service->throttle_msecs > 0);
+
+	service_error(service,
+		      "command startup failed, throttling for %u.%03u secs",
+		      service->throttle_msecs / 1000,
+		      service->throttle_msecs % 1000);
+	service_throttle(service, service->throttle_msecs);
+	service->throttle_msecs *= 2;
+	if (service->throttle_msecs >
+	    SERVICE_STARTUP_FAILURE_THROTTLE_MAX_MSECS) {
+		service->throttle_msecs =
+			SERVICE_STARTUP_FAILURE_THROTTLE_MAX_MSECS;
+	}
+}
+
+static void service_drop_timeout(struct service *service)
+{
+	struct service_listener *lp;
+	int fd;
+
+	i_assert(service->process_avail == 0);
+
+	/* drop all pending connections */
+	array_foreach_elem(&service->listeners, lp) {
+		while ((fd = net_accept(lp->fd, NULL, NULL)) > 0)
+			net_disconnect(fd);
+	}
+
+	service_monitor_listen_start_force(service);
+	service->listen_pending = TRUE;
+}
+
+static void service_monitor_listen_pending(struct service *service)
+{
+	i_assert(service->process_avail == 0);
+
+	service_monitor_listen_stop(service);
+	service->listen_pending = TRUE;
+
+	service->to_drop = timeout_add(SERVICE_DROP_TIMEOUT_MSECS,
+				       service_drop_timeout, service);
+}
+
+static void service_drop_connections(struct service_listener *l)
+{
+	struct service *service = l->service;
+	int fd;
+
+	if (service->type != SERVICE_TYPE_WORKER)
+		service_log_drop_warning(service);
+
+	if (service->type == SERVICE_TYPE_LOGIN) {
+		/* reached process limit, notify processes that they
+		   need to start killing existing connections if they
+		   reach connection limit */
+		service_login_notify(service, TRUE);
+
+		service_monitor_listen_pending(service);
+	} else if (!service->listen_pending) {
+		/* maybe this is a temporary peak, stop for a while and
+		   see if it goes away */
+		service_monitor_listen_pending(service);
+		if (service->to_drop_warning == NULL &&
+		    service->type == SERVICE_TYPE_WORKER) {
+			service->to_drop_warning =
+				timeout_add_short(SERVICE_LOG_DROP_WARNING_DELAY_MSECS,
+						  service_log_drop_warning, service);
+		}
+	} else {
+		/* this has been happening for a while now. just accept and
+		   close the connection, so it's clear that this is happening
+		   because of the limit, rather than because the service
+		   processes aren't answering fast enough */
+		fd = net_accept(l->fd, NULL, NULL);
+		if (fd > 0)
+			net_disconnect(fd);
+	}
+}
+
+static void service_accept(struct service_listener *l)
+{
+	struct service *service = l->service;
+
+	i_assert(service->process_avail == 0);
+
+	if (service->process_count == service->process_limit) {
+		/* we've reached our limits, new clients will have to
+		   wait until there are more processes available */
+		service_drop_connections(l);
+		return;
+	}
+
+	/* create a child process and let it accept() this connection */
+	if (service_process_create(service) == NULL)
+		service_monitor_throttle(service);
+	else
+		service_monitor_listen_stop(service);
+}
+
+static bool
+service_monitor_start_count(struct service *service, unsigned int limit)
+{
+	unsigned int i, count;
+
+	i_assert(service->set->process_min_avail >= service->process_avail);
+
+	count = service->set->process_min_avail - service->process_avail;
+	if (service->process_count + count > service->process_limit)
+		count = service->process_limit - service->process_count;
+	if (count > limit)
+		count = limit;
+
+	for (i = 0; i < count; i++) {
+		if (service_process_create(service) == NULL) {
+			service_monitor_throttle(service);
+			break;
+		}
+	}
+	if (i > 0) {
+		/* we created some processes, they'll do the listening now */
+		service_monitor_listen_stop(service);
+	}
+	return i >= limit;
+}
+
+static void service_monitor_prefork_timeout(struct service *service)
+{
+	/* don't prefork more processes if other more important processes had
+	   been forked while we were waiting for this timeout (= master seems
+	   busy) */
+	if (service->list->fork_counter != service->prefork_counter) {
+		service->prefork_counter = service->list->fork_counter;
+		return;
+	}
+	if (service->process_avail < service->set->process_min_avail) {
+		if (service_monitor_start_count(service, SERVICE_PREFORK_MAX_AT_ONCE) &&
+		    service->process_avail < service->set->process_min_avail) {
+			/* All SERVICE_PREFORK_MAX_AT_ONCE were created, but
+			   it still wasn't enough. Launch more in the next
+			   timeout. */
+			return;
+		}
+	}
+	timeout_remove(&service->to_prefork);
+}
+
+static void service_monitor_start_extra_avail(struct service *service)
+{
+	if (service->process_avail >= service->set->process_min_avail ||
+	    service->process_count >= service->process_limit ||
+	    service->list->destroying)
+		return;
+
+	if (service->process_avail == 0) {
+		/* quickly start one process now */
+		if (!service_monitor_start_count(service, 1))
+			return;
+		if (service->process_avail >= service->set->process_min_avail)
+			return;
+	}
+	if (service->to_prefork == NULL) {
+		/* ioloop handles timeouts before fds (= SIGCHLD callback),
+		   so let the first timeout handler call simply update the fork
+		   counter and the second one check if we're busy or not. */
+		service->to_prefork =
+			timeout_add_short(0, service_monitor_prefork_timeout, service);
+	}
+}
+
+static void service_monitor_listen_start_force(struct service *service)
+{
+	struct service_listener *l;
+
+	service->listening = TRUE;
+	service->listen_pending = FALSE;
+	timeout_remove(&service->to_drop);
+	timeout_remove(&service->to_drop_warning);
+
+	array_foreach_elem(&service->listeners, l) {
+		if (l->io == NULL && l->fd != -1)
+			l->io = io_add(l->fd, IO_READ, service_accept, l);
+	}
+}
+
+void service_monitor_listen_start(struct service *service)
+{
+	if (service->process_avail > 0 || service->to_throttle != NULL ||
+	    (service->process_count == service->process_limit &&
+	     service->listen_pending))
+		return;
+
+	service_monitor_listen_start_force(service);
+}
+
+void service_monitor_listen_stop(struct service *service)
+{
+	struct service_listener *l;
+
+	array_foreach_elem(&service->listeners, l)
+		io_remove(&l->io);
+	service->listening = FALSE;
+	service->listen_pending = FALSE;
+	timeout_remove(&service->to_drop);
+	timeout_remove(&service->to_drop_warning);
+}
+
+static int service_login_create_notify_fd(struct service *service)
+{
+	int fd, ret;
+
+	if (service->login_notify_fd != -1)
+		return 0;
+
+	T_BEGIN {
+		string_t *prefix = t_str_new(128);
+		const char *path;
+
+		str_append(prefix, service->set->master_set->base_dir);
+		str_append(prefix, "/login-master-notify");
+
+		fd = safe_mkstemp(prefix, 0600, (uid_t)-1, (gid_t)-1);
+		path = str_c(prefix);
+
+		if (fd == -1) {
+			service_error(service, "safe_mkstemp(%s) failed: %m",
+				      path);
+		} else if (unlink(path) < 0) {
+			service_error(service, "unlink(%s) failed: %m", path);
+		} else {
+			fd_close_on_exec(fd, TRUE);
+			service->login_notify_fd = fd;
+		}
+	} T_END;
+
+	ret = fd == -1 ? -1 : 0;
+	if (fd != service->login_notify_fd)
+		i_close_fd(&fd);
+	return ret;
+}
+
+void services_monitor_start(struct service_list *service_list)
+{
+	ARRAY(struct service *) listener_services;
+	struct service *service;
+
+	if (services_log_init(service_list) < 0)
+		return;
+	service_anvil_monitor_start(service_list);
+
+	if (service_list->io_master == NULL &&
+	    service_list->master_fd != -1) {
+		service_list->io_master =
+			io_add(service_list->master_fd, IO_READ,
+			       master_client_connected, service_list);
+	}
+
+	t_array_init(&listener_services, array_count(&service_list->services));
+	array_foreach_elem(&service_list->services, service) {
+		if (service->type == SERVICE_TYPE_LOGIN) {
+			if (service_login_create_notify_fd(service) < 0)
+				continue;
+		}
+		if (service->master_dead_pipe_fd[0] == -1) {
+			if (pipe(service->master_dead_pipe_fd) < 0) {
+				service_error(service, "pipe() failed: %m");
+				continue;
+			}
+			fd_close_on_exec(service->master_dead_pipe_fd[0], TRUE);
+			fd_close_on_exec(service->master_dead_pipe_fd[1], TRUE);
+		}
+		if (service->status_fd[0] == -1) {
+			/* we haven't yet created status pipe */
+			if (pipe(service->status_fd) < 0) {
+				service_error(service, "pipe() failed: %m");
+				continue;
+			}
+
+			net_set_nonblock(service->status_fd[0], TRUE);
+			fd_close_on_exec(service->status_fd[0], TRUE);
+			net_set_nonblock(service->status_fd[1], TRUE);
+			fd_close_on_exec(service->status_fd[1], TRUE);
+		}
+		if (service->io_status == NULL) {
+			service->io_status =
+				io_add(service->status_fd[0], IO_READ,
+				       service_status_input, service);
+		}
+		service_monitor_listen_start(service);
+		array_push_back(&listener_services, &service);
+	}
+
+	/* create processes only after adding all listeners */
+	array_foreach_elem(&listener_services, service)
+		service_monitor_start_extra_avail(service);
+
+	if (service_list->log->status_fd[0] != -1) {
+		if (service_process_create(service_list->log) != NULL)
+			service_monitor_listen_stop(service_list->log);
+	}
+
+	/* start up a process for startup-services */
+	array_foreach_elem(&service_list->services, service) {
+		if (service->type == SERVICE_TYPE_STARTUP &&
+		    service->status_fd[0] != -1) {
+			if (service_process_create(service) != NULL)
+				service_monitor_listen_stop(service);
+		}
+	}
+}
+
+static void service_monitor_close_dead_pipe(struct service *service)
+{
+	if (service->master_dead_pipe_fd[0] != -1) {
+		i_close_fd(&service->master_dead_pipe_fd[0]);
+		i_close_fd(&service->master_dead_pipe_fd[1]);
+	}
+}
+
+void service_monitor_stop(struct service *service)
+{
+	int i;
+
+	io_remove(&service->io_status);
+
+	if (service->status_fd[0] != -1 &&
+	    service->type != SERVICE_TYPE_ANVIL) {
+		for (i = 0; i < 2; i++) {
+			if (close(service->status_fd[i]) < 0) {
+				service_error(service,
+					      "close(status fd) failed: %m");
+			}
+			service->status_fd[i] = -1;
+		}
+	}
+	service_monitor_close_dead_pipe(service);
+	if (service->login_notify_fd != -1) {
+		if (close(service->login_notify_fd) < 0) {
+			service_error(service,
+				      "close(login notify fd) failed: %m");
+		}
+		service->login_notify_fd = -1;
+	}
+	timeout_remove(&service->to_login_notify);
+	service_monitor_listen_stop(service);
+
+	timeout_remove(&service->to_throttle);
+	timeout_remove(&service->to_prefork);
+}
+
+void service_monitor_stop_close(struct service *service)
+{
+	struct service_listener *l;
+
+	service_monitor_stop(service);
+
+	array_foreach_elem(&service->listeners, l)
+		i_close_fd(&l->fd);
+}
+
+static void services_monitor_wait(struct service_list *service_list)
+{
+	struct service *service;
+	struct timeval tv_start;
+	bool finished;
+
+	io_loop_time_refresh();
+	tv_start = ioloop_timeval;
+
+	for (;;) {
+		finished = TRUE;
+		services_monitor_reap_children();
+		array_foreach_elem(&service_list->services, service) {
+			if (service->status_fd[0] != -1)
+				service_status_input(service);
+			if (service->process_avail > 0)
+				finished = FALSE;
+		}
+		io_loop_time_refresh();
+		if (finished ||
+		    timeval_diff_msecs(&ioloop_timeval, &tv_start) > MAX_DIE_WAIT_MSECS)
+			break;
+		i_sleep_msecs(100);
+	}
+}
+
+static bool service_processes_close_listeners(struct service *service)
+{
+	struct service_process *process = service->processes;
+	bool ret = FALSE;
+
+	for (; process != NULL; process = process->next) {
+		if (kill(process->pid, SIGQUIT) == 0)
+			ret = TRUE;
+		else if (errno != ESRCH) {
+			service_error(service, "kill(%s, SIGQUIT) failed: %m",
+				      dec2str(process->pid));
+		}
+	}
+	return ret;
+}
+
+static bool
+service_list_processes_close_listeners(struct service_list *service_list)
+{
+	struct service *service;
+	bool ret = FALSE;
+
+	array_foreach_elem(&service_list->services, service) {
+		if (service_processes_close_listeners(service))
+			ret = TRUE;
+	}
+	return ret;
+}
+
+static void services_monitor_wait_and_kill(struct service_list *service_list)
+{
+	/* we've notified all children that the master is dead.
+	   now wait for the children to either die or to tell that
+	   they're no longer listening for new connections. */
+	services_monitor_wait(service_list);
+
+	/* Even if the waiting stopped early because all the process_avail==0,
+	   it can mean that there are processes that have the listener socket
+	   open (just not actively being listened to). We'll need to make sure
+	   that those sockets are closed before we exit, so that a restart
+	   won't fail. Do this by sending SIGQUIT to all the child processes
+	   that are left, which are handled by lib-master to immediately close
+	   the listener in the signal handler itself. */
+	if (service_list_processes_close_listeners(service_list)) {
+		/* SIGQUITs were sent. wait a little bit to make sure they're
+		   also processed before quitting. */
+		i_sleep_msecs(1000);
+	}
+}
+
+void services_monitor_stop(struct service_list *service_list, bool wait)
+{
+	struct service *service;
+
+	array_foreach_elem(&service_list->services, service)
+		service_monitor_close_dead_pipe(service);
+
+	if (wait)
+		services_monitor_wait_and_kill(service_list);
+
+	io_remove(&service_list->io_master);
+
+	array_foreach_elem(&service_list->services, service)
+		service_monitor_stop(service);
+
+	services_log_deinit(service_list);
+}
+
+static bool
+service_process_failure(struct service_process *process, int status)
+{
+	struct service *service = process->service;
+	bool throttle;
+
+	service_process_log_status_error(process, status);
+	throttle = process->to_status != NULL;
+	if (!throttle && !service->have_successful_exits) {
+		/* this service has seen no successful exits yet.
+		   try to avoid failure storms by throttling the service if it
+		   only keeps failing rapidly. this is no longer done after
+		   one success to avoid intentional DoSing, in case attacker
+		   finds a way to quickly crash his own session. */
+		if (service->exit_failure_last != ioloop_time) {
+			service->exit_failure_last = ioloop_time;
+			service->exit_failures_in_sec = 0;
+		}
+		if (++service->exit_failures_in_sec > SERVICE_MAX_EXIT_FAILURES_IN_SEC)
+			throttle = TRUE;
+	}
+	service_process_notify_add(service_anvil_global->kills, process);
+	return throttle;
+}
+
+void services_monitor_reap_children(void)
+{
+	struct service_process *process;
+	struct service *service;
+	pid_t pid;
+	int status;
+	bool service_stopped, throttle;
+
+	while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {
+		process = hash_table_lookup(service_pids, POINTER_CAST(pid));
+		if (process == NULL) {
+			i_error("waitpid() returned unknown PID %s",
+				dec2str(pid));
+			continue;
+		}
+
+		service = process->service;
+		if (status == 0) {
+			/* success - one success resets all failures */
+			service->have_successful_exits = TRUE;
+			service->exit_failures_in_sec = 0;
+			service->throttle_msecs =
+				SERVICE_STARTUP_FAILURE_THROTTLE_MIN_MSECS;
+			throttle = FALSE;
+		} else {
+			throttle = service_process_failure(process, status);
+		}
+		if (service->type == SERVICE_TYPE_ANVIL)
+			service_anvil_process_destroyed(process);
+
+		/* if we're reloading, we may get here with a service list
+		   that's going to be destroyed after this process is
+		   destroyed. keep the list referenced until we're done. */
+		service_list_ref(service->list);
+		service_process_destroy(process);
+
+		if (throttle)
+			service_monitor_throttle(service);
+		service_stopped = service->status_fd[0] == -1;
+		if (!service_stopped && !service->list->destroying) {
+			service_monitor_start_extra_avail(service);
+			/* if there are no longer listening processes,
+			   start listening for more */
+			if (service->to_throttle != NULL) {
+				/* throttling */
+			} else if (service == service->list->log &&
+				   service->process_count == 0) {
+				/* log service must always be running */
+				if (service_process_create(service) == NULL)
+					service_monitor_throttle(service);
+			} else {
+				service_monitor_listen_start(service);
+			}
+		}
+		service_list_unref(service->list);
+	}
+}