/*++
/* NAME
/*      events 3
/* SUMMARY
/*      event manager
/* SYNOPSIS
/*      #include <events.h>
/*
/*      time_t  event_time()
/*
/*      void    event_loop(delay)
/*      int     delay;
/*
/*      time_t  event_request_timer(callback, context, delay)
/*      void    (*callback)(int event, void *context);
/*      void    *context;
/*      int     delay;
/*
/*      int     event_cancel_timer(callback, context)
/*      void    (*callback)(int event, void *context);
/*      void    *context;
/*
/*      void    event_enable_read(fd, callback, context)
/*      int     fd;
/*      void    (*callback)(int event, void *context);
/*      void    *context;
/*
/*      void    event_enable_write(fd, callback, context)
/*      int     fd;
/*      void    (*callback)(int event, void *context);
/*      void    *context;
/*
/*      void    event_disable_readwrite(fd)
/*      int     fd;
/*
/*      void    event_drain(time_limit)
/*      int     time_limit;
/*
/*      void    event_fork(void)
/* DESCRIPTION
/*      This module delivers I/O and timer events.
/*      Multiple I/O streams and timers can be monitored simultaneously.
/*      Events are delivered via callback routines provided by the
/*      application. When requesting an event, the application can provide
/*      private context that is passed back when the callback routine is
/*      executed.
/*
/*      event_time() returns a cached value of the current time.
/*
/*      event_loop() monitors all I/O channels for which the application has
/*      expressed interest, and monitors the timer request queue.
/*      It notifies the application whenever events of interest happen.
/*      A negative delay value causes the function to pause until something
/*      happens; a positive delay value causes event_loop() to return when
/*      the next event happens or when the delay time in seconds is over,
/*      whichever happens first. A zero delay effectuates a poll.
/*
/*      Note: in order to avoid race conditions, event_loop() must not
/*      be called recursively.
/*
/*      event_request_timer() causes the specified callback function to
/*      be called with the specified context argument after \fIdelay\fR
/*      seconds, or as soon as possible thereafter. The delay should
/*      not be negative (the manifest EVENT_NULL_DELAY provides for
/*      convenient zero-delay notification).
/*      The event argument is equal to EVENT_TIME.
/*      Only one timer request can be active per (callback, context) pair.
/*      Calling event_request_timer() with an existing (callback, context)
/*      pair does not schedule a new event, but updates the time of event
/*      delivery. The result is the absolute time at which the timer is
/*      scheduled to go off.
/*
/*      event_cancel_timer() cancels the specified (callback, context)
/*      request. The application is allowed to cancel non-existing requests.
/*      The result value is the amount of time left before the timer would
/*      have gone off, or -1 in case of no pending timer.
/*
/*      event_enable_read() (event_enable_write()) enables read (write)
/*      events on the named I/O channel. It is up to the application to
/*      assemble partial reads or writes.
/*      An I/O channel cannot handle more than one request at the
/*      same time. The application is allowed to enable an event that
/*      is already enabled (same channel, same read or write operation,
/*      but perhaps a different callback or context). On systems with
/*      kernel-based event filters this is preferred usage, because
/*      each disable and enable request would cost a system call.
/*
/*      The manifest constants EVENT_NULL_CONTEXT and EVENT_NULL_TYPE
/*      provide convenient null values.
/*
/*      The callback routine has the following arguments:
/* .IP fd
/*      The stream on which the event happened.
/* .IP event
/*      An indication of the event type:
/* .RS
/* .IP EVENT_READ
/*      read event,
/* .IP EVENT_WRITE
/*      write event,
/* .IP EVENT_XCPT
/*      exception (actually, any event other than read or write).
/* .RE
/* .IP context
/*      Application context given to event_enable_read() (event_enable_write()).
/* .PP
/*      event_disable_readwrite() disables further I/O events on the specified
/*      I/O channel. The application is allowed to cancel non-existing
/*      I/O event requests.
/*
/*      event_drain() repeatedly calls event_loop() until no more timer
/*      events or I/O events are pending or until the time limit is reached.
/*      This routine must not be called from an event_whatever() callback
/*      routine. Note: this function assumes that no new I/O events
/*      will be registered.
/*
/*      event_fork() must be called by a child process after it is
/*      created with fork(), to re-initialize event processing.
/* DIAGNOSTICS
/*      Panics: interface violations. Fatal errors: out of memory,
/*      system call failure. Warnings: the number of available
/*      file descriptors is much less than FD_SETSIZE.
/* BUGS
/*      This module is based on event selection. It assumes that the
/*      event_loop() routine is called frequently. This approach is
/*      not suitable for applications with compute-bound loops that
/*      take a significant amount of time.
/* LICENSE
/* .ad
/* .fi
/*      The Secure Mailer license must be distributed with this software.
/* AUTHOR(S)
/*      Wietse Venema
/*      IBM T.J. Watson Research
/*      P.O. Box 704
/*      Yorktown Heights, NY 10598, USA
/*--*/

/* System libraries. */

#include "sys_defs.h"
#include <sys/time.h>                   /* XXX: 44BSD uses bzero() */
#include <time.h>
#include <errno.h>
#include <unistd.h>
#include <stddef.h>                     /* offsetof() */
#include <string.h>                     /* bzero() prototype for 44BSD */
#include <limits.h>                     /* INT_MAX */

#ifdef USE_SYS_SELECT_H
#include <sys/select.h>
#endif

/* Application-specific. */

#include "mymalloc.h"
#include "msg.h"
#include "iostuff.h"
#include "ring.h"
#include "events.h"

#if !defined(EVENTS_STYLE)
#error "must define EVENTS_STYLE"
#endif
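 /*
  * Editor's illustration (not part of the original module): a minimal
  * sketch of the (callback, context) timer semantics documented in the
  * header above. Requesting a timer twice with the same pair does not
  * create a second event; it moves the existing deadline. The guard macro
  * EVENTS_EXAMPLE_TIMER_RESET is hypothetical; compile with it defined
  * (and without TEST) to try the sketch.
  */
#ifdef EVENTS_EXAMPLE_TIMER_RESET
#include <stdio.h>

static char example_context[] = "hello";

static void example_alarm(int unused_event, void *context)
{
    printf("fired once at %ld: %s\n", (long) event_time(), (char *) context);
}

int     main(void)
{
    /* Schedule for 10 seconds from now... */
    event_request_timer(example_alarm, (void *) example_context, 10);
    /* ...then move the same (callback, context) request up to 1 second. */
    event_request_timer(example_alarm, (void *) example_context, 1);
    /* Only one "fired once" line appears. */
    event_drain(5);
    return (0);
}
#endif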
 /*
  * Traditional BSD-style select(2). Works everywhere, but has a built-in
  * upper bound on the number of file descriptors, and that limit is hard to
  * change on Linux. Is sometimes emulated with SYSV-style poll(2) which
  * doesn't have the file descriptor limit, but unfortunately does not help
  * to improve the performance of servers with lots of connections.
  */
#define EVENT_ALLOC_INCR                10

#if (EVENTS_STYLE == EVENTS_STYLE_SELECT)
typedef fd_set EVENT_MASK;

#define EVENT_MASK_BYTE_COUNT(mask)     sizeof(*(mask))
#define EVENT_MASK_ZERO(mask)           FD_ZERO(mask)
#define EVENT_MASK_SET(fd, mask)        FD_SET((fd), (mask))
#define EVENT_MASK_ISSET(fd, mask)      FD_ISSET((fd), (mask))
#define EVENT_MASK_CLR(fd, mask)        FD_CLR((fd), (mask))
#define EVENT_MASK_CMP(m1, m2) \
        memcmp((m1), (m2), EVENT_MASK_BYTE_COUNT(m1))
#else

 /*
  * Kernel-based event filters (kqueue, /dev/poll, epoll). We use the
  * following file descriptor mask structure which is expanded on the fly.
  */
typedef struct {
    char   *data;                       /* bit mask */
    size_t  data_len;                   /* data byte count */
} EVENT_MASK;

 /* Bits per byte, byte in vector, bit offset in byte, bytes per set. */
#define EVENT_MASK_NBBY         (8)
#define EVENT_MASK_FD_BYTE(fd, mask) \
        (((unsigned char *) (mask)->data)[(fd) / EVENT_MASK_NBBY])
#define EVENT_MASK_FD_BIT(fd)   (1 << ((fd) % EVENT_MASK_NBBY))
#define EVENT_MASK_BYTES_NEEDED(len) \
        (((len) + (EVENT_MASK_NBBY - 1)) / EVENT_MASK_NBBY)
#define EVENT_MASK_BYTE_COUNT(mask)     ((mask)->data_len)

 /* Memory management. */
#define EVENT_MASK_ALLOC(mask, bit_len) do { \
        size_t _byte_len = EVENT_MASK_BYTES_NEEDED(bit_len); \
        (mask)->data = mymalloc(_byte_len); \
        memset((mask)->data, 0, _byte_len); \
        (mask)->data_len = _byte_len; \
    } while (0)
#define EVENT_MASK_REALLOC(mask, bit_len) do { \
        size_t _byte_len = EVENT_MASK_BYTES_NEEDED(bit_len); \
        size_t _old_len = (mask)->data_len; \
        (mask)->data = myrealloc((mask)->data, _byte_len); \
        if (_byte_len > _old_len) \
            memset((mask)->data + _old_len, 0, _byte_len - _old_len); \
        (mask)->data_len = _byte_len; \
    } while (0)
#define EVENT_MASK_FREE(mask)   myfree((mask)->data)

 /* Set operations, modeled after FD_ZERO/SET/ISSET/CLR. */
#define EVENT_MASK_ZERO(mask) \
        memset((mask)->data, 0, (mask)->data_len)
#define EVENT_MASK_SET(fd, mask) \
        (EVENT_MASK_FD_BYTE((fd), (mask)) |= EVENT_MASK_FD_BIT(fd))
#define EVENT_MASK_ISSET(fd, mask) \
        (EVENT_MASK_FD_BYTE((fd), (mask)) & EVENT_MASK_FD_BIT(fd))
#define EVENT_MASK_CLR(fd, mask) \
        (EVENT_MASK_FD_BYTE((fd), (mask)) &= ~EVENT_MASK_FD_BIT(fd))
#define EVENT_MASK_CMP(m1, m2) \
        memcmp((m1)->data, (m2)->data, EVENT_MASK_BYTE_COUNT(m1))
#endif
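 /*
  * Editor's illustration (not part of the original module): a minimal
  * exercise of the dynamically-sized EVENT_MASK macros defined above.
  * These macros exist only in the kernel-filter builds, hence the extra
  * condition in the guard. The guard macro EVENTS_EXAMPLE_MASK is
  * hypothetical.
  */
#if defined(EVENTS_EXAMPLE_MASK) && (EVENTS_STYLE != EVENTS_STYLE_SELECT)
#include <stdio.h>

int     main(void)
{
    EVENT_MASK mask;

    EVENT_MASK_ALLOC(&mask, 64);                /* room for fds 0..63 */
    EVENT_MASK_SET(42, &mask);
    printf("fd 42 set: %s\n", EVENT_MASK_ISSET(42, &mask) ? "yes" : "no");
    EVENT_MASK_REALLOC(&mask, 1024);            /* grow; old bits survive */
    printf("fd 42 still set: %s\n", EVENT_MASK_ISSET(42, &mask) ? "yes" : "no");
    EVENT_MASK_CLR(42, &mask);
    EVENT_MASK_FREE(&mask);
    return (0);
}
#endif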
 /*
  * I/O events.
  */
typedef struct EVENT_FDTABLE EVENT_FDTABLE;

struct EVENT_FDTABLE {
    EVENT_NOTIFY_RDWR_FN callback;
    char   *context;
};
static EVENT_MASK event_rmask;          /* enabled read events */
static EVENT_MASK event_wmask;          /* enabled write events */
static EVENT_MASK event_xmask;          /* for bad news mostly */
static int event_fdlimit;               /* per-process open file limit */
static EVENT_FDTABLE *event_fdtable;    /* one slot per file descriptor */
static int event_fdslots;               /* number of file descriptor slots */
static int event_max_fd = -1;           /* highest fd number seen */

 /*
  * FreeBSD kqueue supports no system call to find out what descriptors are
  * registered in the kernel-based filter. To implement our own sanity checks
  * we maintain our own descriptor bitmask.
  *
  * FreeBSD kqueue does support application context pointers. Unfortunately,
  * changing that information would cost a system call, and some of the
  * competitors don't support application context. To keep the implementation
  * simple we maintain our own table with call-back information.
  *
  * FreeBSD kqueue silently unregisters a descriptor from its filter when the
  * descriptor is closed, so our information could get out of sync with the
  * kernel. But that will never happen, because we have to meticulously
  * unregister a file descriptor before it is closed, to avoid errors on
  * systems that are built with EVENTS_STYLE == EVENTS_STYLE_SELECT.
  */
#if (EVENTS_STYLE == EVENTS_STYLE_KQUEUE)
#include <sys/event.h>

 /*
  * Some early FreeBSD implementations don't have the EV_SET macro.
  */
#ifndef EV_SET
#define EV_SET(kp, id, fi, fl, ffl, da, ud) do { \
        (kp)->ident = (id); \
        (kp)->filter = (fi); \
        (kp)->flags = (fl); \
        (kp)->fflags = (ffl); \
        (kp)->data = (da); \
        (kp)->udata = (ud); \
    } while(0)
#endif

 /*
  * Macros to initialize the kernel-based filter; see event_init().
  */
static int event_kq;                    /* handle to event filter */

#define EVENT_REG_INIT_HANDLE(er, n) do { \
        er = event_kq = kqueue(); \
    } while (0)
#define EVENT_REG_INIT_TEXT     "kqueue"

#define EVENT_REG_FORK_HANDLE(er, n) do { \
        (void) close(event_kq); \
        EVENT_REG_INIT_HANDLE(er, (n)); \
    } while (0)

 /*
  * Macros to update the kernel-based filter; see event_enable_read(),
  * event_enable_write() and event_disable_readwrite().
  */
#define EVENT_REG_FD_OP(er, fh, ev, op) do { \
        struct kevent dummy; \
        EV_SET(&dummy, (fh), (ev), (op), 0, 0, 0); \
        (er) = kevent(event_kq, &dummy, 1, 0, 0, 0); \
    } while (0)

#define EVENT_REG_ADD_OP(e, f, ev) EVENT_REG_FD_OP((e), (f), (ev), EV_ADD)
#define EVENT_REG_ADD_READ(e, f)   EVENT_REG_ADD_OP((e), (f), EVFILT_READ)
#define EVENT_REG_ADD_WRITE(e, f)  EVENT_REG_ADD_OP((e), (f), EVFILT_WRITE)
#define EVENT_REG_ADD_TEXT         "kevent EV_ADD"

#define EVENT_REG_DEL_OP(e, f, ev) EVENT_REG_FD_OP((e), (f), (ev), EV_DELETE)
#define EVENT_REG_DEL_READ(e, f)   EVENT_REG_DEL_OP((e), (f), EVFILT_READ)
#define EVENT_REG_DEL_WRITE(e, f)  EVENT_REG_DEL_OP((e), (f), EVFILT_WRITE)
#define EVENT_REG_DEL_TEXT         "kevent EV_DELETE"

 /*
  * Macros to retrieve event buffers from the kernel; see event_loop().
  */
typedef struct kevent EVENT_BUFFER;

#define EVENT_BUFFER_READ(event_count, event_buf, buflen, delay) do { \
        struct timespec ts; \
        struct timespec *tsp; \
        if ((delay) < 0) { \
            tsp = 0; \
        } else { \
            tsp = &ts; \
            ts.tv_nsec = 0; \
            ts.tv_sec = (delay); \
        } \
        (event_count) = kevent(event_kq, (struct kevent *) 0, 0, (event_buf), \
                               (buflen), (tsp)); \
    } while (0)
#define EVENT_BUFFER_READ_TEXT  "kevent"

 /*
  * Macros to process event buffers from the kernel; see event_loop().
  */
#define EVENT_GET_FD(bp)        ((bp)->ident)
#define EVENT_GET_TYPE(bp)      ((bp)->filter)
#define EVENT_TEST_READ(bp)     (EVENT_GET_TYPE(bp) == EVFILT_READ)
#define EVENT_TEST_WRITE(bp)    (EVENT_GET_TYPE(bp) == EVFILT_WRITE)

#endif
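 /*
  * Editor's illustration (not part of the original module): how the kqueue
  * macro layer above maps onto the kevent(2) API. One kevent() call
  * registers a filter; a second kevent() call fills an EVENT_BUFFER array
  * with pending events. The guard macro EVENTS_EXAMPLE_KQUEUE is
  * hypothetical.
  */
#if defined(EVENTS_EXAMPLE_KQUEUE) && (EVENTS_STYLE == EVENTS_STYLE_KQUEUE)
#include <stdio.h>

int     main(void)
{
    EVENT_BUFFER buf[10];
    int     err;
    int     count;

    EVENT_REG_INIT_HANDLE(err, 10);             /* event_kq = kqueue() */
    if (err < 0)
        msg_fatal("%s: %m", EVENT_REG_INIT_TEXT);
    EVENT_REG_ADD_READ(err, 0);                 /* watch stdin for reading */
    if (err < 0)
        msg_fatal("%s: %m", EVENT_REG_ADD_TEXT);
    EVENT_BUFFER_READ(count, buf, 10, 5);       /* wait at most 5 seconds */
    if (count > 0)
        printf("fd %d readable: %s\n", (int) EVENT_GET_FD(buf),
               EVENT_TEST_READ(buf) ? "yes" : "no");
    return (0);
}
#endif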
 /*
  * Solaris /dev/poll does not support application context, so we have to
  * maintain our own. This has the benefit of avoiding an expensive system
  * call just to change a call-back function or argument.
  *
  * Solaris /dev/poll does have a way to query if a specific descriptor is
  * registered. However, we maintain a descriptor mask anyway because a) it
  * avoids having to make an expensive system call to find out if something
  * is registered, b) some EVENTS_STYLE_MUMBLE implementations need a
  * descriptor bitmask anyway and c) we use the bitmask already to implement
  * sanity checks.
  */
#if (EVENTS_STYLE == EVENTS_STYLE_DEVPOLL)
#include <sys/devpoll.h>
#include <fcntl.h>

 /*
  * Macros to initialize the kernel-based filter; see event_init().
  */
static int event_pollfd;                /* handle to file descriptor set */

#define EVENT_REG_INIT_HANDLE(er, n) do { \
        er = event_pollfd = open("/dev/poll", O_RDWR); \
        if (event_pollfd >= 0) close_on_exec(event_pollfd, CLOSE_ON_EXEC); \
    } while (0)
#define EVENT_REG_INIT_TEXT     "open /dev/poll"

#define EVENT_REG_FORK_HANDLE(er, n) do { \
        (void) close(event_pollfd); \
        EVENT_REG_INIT_HANDLE(er, (n)); \
    } while (0)

 /*
  * Macros to update the kernel-based filter; see event_enable_read(),
  * event_enable_write() and event_disable_readwrite().
  */
#define EVENT_REG_FD_OP(er, fh, ev) do { \
        struct pollfd dummy; \
        dummy.fd = (fh); \
        dummy.events = (ev); \
        (er) = write(event_pollfd, (void *) &dummy, \
                     sizeof(dummy)) != sizeof(dummy) ? -1 : 0; \
    } while (0)

#define EVENT_REG_ADD_READ(e, f)  EVENT_REG_FD_OP((e), (f), POLLIN)
#define EVENT_REG_ADD_WRITE(e, f) EVENT_REG_FD_OP((e), (f), POLLOUT)
#define EVENT_REG_ADD_TEXT        "write /dev/poll"

#define EVENT_REG_DEL_BOTH(e, f)  EVENT_REG_FD_OP((e), (f), POLLREMOVE)
#define EVENT_REG_DEL_TEXT        "write /dev/poll"

 /*
  * Macros to retrieve event buffers from the kernel; see event_loop().
  */
typedef struct pollfd EVENT_BUFFER;

#define EVENT_BUFFER_READ(event_count, event_buf, buflen, delay) do { \
        struct dvpoll dvpoll; \
        dvpoll.dp_fds = (event_buf); \
        dvpoll.dp_nfds = (buflen); \
        dvpoll.dp_timeout = (delay) < 0 ? -1 : (delay) * 1000; \
        (event_count) = ioctl(event_pollfd, DP_POLL, &dvpoll); \
    } while (0)
#define EVENT_BUFFER_READ_TEXT  "ioctl DP_POLL"

 /*
  * Macros to process event buffers from the kernel; see event_loop().
  */
#define EVENT_GET_FD(bp)        ((bp)->fd)
#define EVENT_GET_TYPE(bp)      ((bp)->revents)
#define EVENT_TEST_READ(bp)     (EVENT_GET_TYPE(bp) & POLLIN)
#define EVENT_TEST_WRITE(bp)    (EVENT_GET_TYPE(bp) & POLLOUT)

#endif

 /*
  * Linux epoll supports no system call to find out what descriptors are
  * registered in the kernel-based filter. To implement our own sanity checks
  * we maintain our own descriptor bitmask.
  *
  * Linux epoll does support application context pointers. Unfortunately,
  * changing that information would cost a system call, and some of the
  * competitors don't support application context. To keep the implementation
  * simple we maintain our own table with call-back information.
  *
  * Linux epoll silently unregisters a descriptor from its filter when the
  * descriptor is closed, so our information could get out of sync with the
  * kernel. But that will never happen, because we have to meticulously
  * unregister a file descriptor before it is closed, to avoid errors on
  * systems that are built with EVENTS_STYLE == EVENTS_STYLE_SELECT.
  */
#if (EVENTS_STYLE == EVENTS_STYLE_EPOLL)
#include <sys/epoll.h>

 /*
  * Macros to initialize the kernel-based filter; see event_init().
  */
static int event_epollfd;               /* epoll handle */

#define EVENT_REG_INIT_HANDLE(er, n) do { \
        er = event_epollfd = epoll_create(n); \
        if (event_epollfd >= 0) close_on_exec(event_epollfd, CLOSE_ON_EXEC); \
    } while (0)
#define EVENT_REG_INIT_TEXT     "epoll_create"

#define EVENT_REG_FORK_HANDLE(er, n) do { \
        (void) close(event_epollfd); \
        EVENT_REG_INIT_HANDLE(er, (n)); \
    } while (0)

 /*
  * Macros to update the kernel-based filter; see event_enable_read(),
  * event_enable_write() and event_disable_readwrite().
  */
#define EVENT_REG_FD_OP(er, fh, ev, op) do { \
        struct epoll_event dummy; \
        dummy.events = (ev); \
        dummy.data.fd = (fh); \
        (er) = epoll_ctl(event_epollfd, (op), (fh), &dummy); \
    } while (0)

#define EVENT_REG_ADD_OP(e, f, ev) EVENT_REG_FD_OP((e), (f), (ev), EPOLL_CTL_ADD)
#define EVENT_REG_ADD_READ(e, f)   EVENT_REG_ADD_OP((e), (f), EPOLLIN)
#define EVENT_REG_ADD_WRITE(e, f)  EVENT_REG_ADD_OP((e), (f), EPOLLOUT)
#define EVENT_REG_ADD_TEXT         "epoll_ctl EPOLL_CTL_ADD"

#define EVENT_REG_DEL_OP(e, f, ev) EVENT_REG_FD_OP((e), (f), (ev), EPOLL_CTL_DEL)
#define EVENT_REG_DEL_READ(e, f)   EVENT_REG_DEL_OP((e), (f), EPOLLIN)
#define EVENT_REG_DEL_WRITE(e, f)  EVENT_REG_DEL_OP((e), (f), EPOLLOUT)
#define EVENT_REG_DEL_TEXT         "epoll_ctl EPOLL_CTL_DEL"

 /*
  * Macros to retrieve event buffers from the kernel; see event_loop().
  */
typedef struct epoll_event EVENT_BUFFER;

#define EVENT_BUFFER_READ(event_count, event_buf, buflen, delay) do { \
        (event_count) = epoll_wait(event_epollfd, (event_buf), (buflen), \
                                   (delay) < 0 ? -1 : (delay) * 1000); \
    } while (0)
#define EVENT_BUFFER_READ_TEXT  "epoll_wait"

 /*
  * Macros to process event buffers from the kernel; see event_loop().
  */
#define EVENT_GET_FD(bp)        ((bp)->data.fd)
#define EVENT_GET_TYPE(bp)      ((bp)->events)
#define EVENT_TEST_READ(bp)     (EVENT_GET_TYPE(bp) & EPOLLIN)
#define EVENT_TEST_WRITE(bp)    (EVENT_GET_TYPE(bp) & EPOLLOUT)

#endif
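 /*
  * Editor's illustration (not part of the original module): the same macro
  * layer instantiated for epoll. Registration and unregistration are single
  * epoll_ctl() calls; retrieval converts this module's seconds-based delay
  * to the milliseconds that epoll_wait() expects. The guard macro
  * EVENTS_EXAMPLE_EPOLL is hypothetical.
  */
#if defined(EVENTS_EXAMPLE_EPOLL) && (EVENTS_STYLE == EVENTS_STYLE_EPOLL)
#include <stdio.h>

int     main(void)
{
    EVENT_BUFFER buf[10];
    int     err;
    int     count;

    EVENT_REG_INIT_HANDLE(err, 10);     /* event_epollfd = epoll_create(10) */
    if (err < 0)
        msg_fatal("%s: %m", EVENT_REG_INIT_TEXT);
    EVENT_REG_ADD_WRITE(err, 1);        /* stdout is almost always writable */
    if (err < 0)
        msg_fatal("%s: %m", EVENT_REG_ADD_TEXT);
    EVENT_BUFFER_READ(count, buf, 10, 1);       /* 1 second -> 1000 ms */
    if (count > 0 && EVENT_TEST_WRITE(buf))
        printf("fd %d is writable\n", EVENT_GET_FD(buf));
    EVENT_REG_DEL_WRITE(err, 1);
    return (err < 0 ? 1 : 0);
}
#endif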
 /*
  * Timer events. Timer requests are kept sorted, in a circular list. We use
  * the RING abstraction, so we get to use a couple of ugly macros.
  *
  * When a call-back function adds a timer request, we label the request with
  * the event_loop() call instance that invoked the call-back. We use this to
  * prevent zero-delay timer requests from running in a tight loop and
  * starving I/O events.
  */
typedef struct EVENT_TIMER EVENT_TIMER;

struct EVENT_TIMER {
    time_t  when;                       /* when event is wanted */
    EVENT_NOTIFY_TIME_FN callback;      /* callback function */
    char   *context;                    /* callback context */
    long    loop_instance;              /* event_loop() call instance */
    RING    ring;                       /* linkage */
};

static RING event_timer_head;           /* timer queue head */
static long event_loop_instance;        /* event_loop() call instance */

#define RING_TO_TIMER(r) \
        ((EVENT_TIMER *) ((char *) (r) - offsetof(EVENT_TIMER, ring)))

#define FOREACH_QUEUE_ENTRY(entry, head) \
        for (entry = ring_succ(head); entry != (head); entry = ring_succ(entry))

#define FIRST_TIMER(head) \
        (ring_succ(head) != (head) ? RING_TO_TIMER(ring_succ(head)) : 0)

 /*
  * Other private data structures.
  */
static time_t event_present;            /* cached time of day */

#define EVENT_INIT_NEEDED()     (event_present == 0)
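 /*
  * Editor's illustration (not part of the original module): the offsetof()
  * trick behind RING_TO_TIMER(). A RING element is embedded in the timer
  * structure; subtracting the member offset from the RING pointer recovers
  * the enclosing EVENT_TIMER. The guard macro EVENTS_EXAMPLE_RING is
  * hypothetical.
  */
#ifdef EVENTS_EXAMPLE_RING
#include <stdio.h>

int     main(void)
{
    EVENT_TIMER timer;
    RING   *rp = &timer.ring;

    /* Same address arithmetic that RING_TO_TIMER() performs. */
    EVENT_TIMER *tp =
        (EVENT_TIMER *) ((char *) rp - offsetof(EVENT_TIMER, ring));

    printf("recovered enclosing struct: %s\n", tp == &timer ? "yes" : "no");
    return (0);
}
#endif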
/* event_init - set up tables and such */

static void event_init(void)
{
    EVENT_FDTABLE *fdp;
    int     err;

    if (!EVENT_INIT_NEEDED())
        msg_panic("event_init: repeated call");

    /*
     * Initialize the file descriptor masks and the call-back table. Where
     * possible we extend these data structures on the fly. With select(2)
     * based implementations we can only handle FD_SETSIZE open files.
     */
#if (EVENTS_STYLE == EVENTS_STYLE_SELECT)
    if ((event_fdlimit = open_limit(FD_SETSIZE)) < 0)
        msg_fatal("unable to determine open file limit");
#else
    if ((event_fdlimit = open_limit(INT_MAX)) < 0)
        msg_fatal("unable to determine open file limit");
#endif
    if (event_fdlimit < FD_SETSIZE / 2 && event_fdlimit < 256)
        msg_warn("could allocate space for only %d open files", event_fdlimit);
    event_fdslots = EVENT_ALLOC_INCR;
    event_fdtable = (EVENT_FDTABLE *)
        mymalloc(sizeof(EVENT_FDTABLE) * event_fdslots);
    for (fdp = event_fdtable; fdp < event_fdtable + event_fdslots; fdp++) {
        fdp->callback = 0;
        fdp->context = 0;
    }

    /*
     * Initialize the I/O event request masks.
     */
#if (EVENTS_STYLE == EVENTS_STYLE_SELECT)
    EVENT_MASK_ZERO(&event_rmask);
    EVENT_MASK_ZERO(&event_wmask);
    EVENT_MASK_ZERO(&event_xmask);
#else
    EVENT_MASK_ALLOC(&event_rmask, event_fdslots);
    EVENT_MASK_ALLOC(&event_wmask, event_fdslots);
    EVENT_MASK_ALLOC(&event_xmask, event_fdslots);

    /*
     * Initialize the kernel-based filter.
     */
    EVENT_REG_INIT_HANDLE(err, event_fdslots);
    if (err < 0)
        msg_fatal("%s: %m", EVENT_REG_INIT_TEXT);
#endif

    /*
     * Initialize timer stuff.
     */
    ring_init(&event_timer_head);
    (void) time(&event_present);

    /*
     * Avoid an infinite initialization loop.
     */
    if (EVENT_INIT_NEEDED())
        msg_panic("event_init: unable to initialize");
}

/* event_extend - make room for more descriptor slots */

static void event_extend(int fd)
{
    const char *myname = "event_extend";
    int     old_slots = event_fdslots;
    int     new_slots = (event_fdslots > fd / 2 ?
                         2 * old_slots : fd + EVENT_ALLOC_INCR);
    EVENT_FDTABLE *fdp;

#ifdef EVENT_REG_UPD_HANDLE
    int     err;

#endif

    if (msg_verbose > 2)
        msg_info("%s: fd %d", myname, fd);
    event_fdtable = (EVENT_FDTABLE *)
        myrealloc((void *) event_fdtable, sizeof(EVENT_FDTABLE) * new_slots);
    event_fdslots = new_slots;
    for (fdp = event_fdtable + old_slots;
         fdp < event_fdtable + new_slots; fdp++) {
        fdp->callback = 0;
        fdp->context = 0;
    }

    /*
     * Initialize the I/O event request masks.
     */
#if (EVENTS_STYLE != EVENTS_STYLE_SELECT)
    EVENT_MASK_REALLOC(&event_rmask, new_slots);
    EVENT_MASK_REALLOC(&event_wmask, new_slots);
    EVENT_MASK_REALLOC(&event_xmask, new_slots);
#endif
#ifdef EVENT_REG_UPD_HANDLE
    EVENT_REG_UPD_HANDLE(err, new_slots);
    if (err < 0)
        msg_fatal("%s: %s: %m", myname, EVENT_REG_UPD_TEXT);
#endif
}

/* event_time - look up cached time of day */

time_t  event_time(void)
{
    if (EVENT_INIT_NEEDED())
        event_init();

    return (event_present);
}
/* event_drain - loop until all pending events are done */

void    event_drain(int time_limit)
{
    EVENT_MASK zero_mask;
    time_t  max_time;

    if (EVENT_INIT_NEEDED())
        return;

#if (EVENTS_STYLE == EVENTS_STYLE_SELECT)
    EVENT_MASK_ZERO(&zero_mask);
#else
    EVENT_MASK_ALLOC(&zero_mask, event_fdslots);
#endif
    (void) time(&event_present);
    max_time = event_present + time_limit;
    while (event_present < max_time
           && (event_timer_head.pred != &event_timer_head
               || EVENT_MASK_CMP(&zero_mask, &event_xmask) != 0)) {
        event_loop(1);
#if (EVENTS_STYLE != EVENTS_STYLE_SELECT)
        if (EVENT_MASK_BYTE_COUNT(&zero_mask)
            != EVENT_MASK_BYTES_NEEDED(event_fdslots))
            EVENT_MASK_REALLOC(&zero_mask, event_fdslots);
#endif
    }
#if (EVENTS_STYLE != EVENTS_STYLE_SELECT)
    EVENT_MASK_FREE(&zero_mask);
#endif
}

/* event_fork - resume event processing after fork() */

void    event_fork(void)
{
#if (EVENTS_STYLE != EVENTS_STYLE_SELECT)
    EVENT_FDTABLE *fdp;
    int     err;
    int     fd;

    /*
     * No event was ever registered, so there's nothing to be done.
     */
    if (EVENT_INIT_NEEDED())
        return;

    /*
     * Close the existing filter handle and open a new kernel-based filter.
     */
    EVENT_REG_FORK_HANDLE(err, event_fdslots);
    if (err < 0)
        msg_fatal("%s: %m", EVENT_REG_INIT_TEXT);

    /*
     * Populate the new kernel-based filter with events that were registered
     * in the parent process.
     */
    for (fd = 0; fd <= event_max_fd; fd++) {
        if (EVENT_MASK_ISSET(fd, &event_wmask)) {
            EVENT_MASK_CLR(fd, &event_wmask);
            fdp = event_fdtable + fd;
            event_enable_write(fd, fdp->callback, fdp->context);
        } else if (EVENT_MASK_ISSET(fd, &event_rmask)) {
            EVENT_MASK_CLR(fd, &event_rmask);
            fdp = event_fdtable + fd;
            event_enable_read(fd, fdp->callback, fdp->context);
        }
    }
#endif
}

/* event_enable_read - enable read events */

void    event_enable_read(int fd, EVENT_NOTIFY_RDWR_FN callback, void *context)
{
    const char *myname = "event_enable_read";
    EVENT_FDTABLE *fdp;
    int     err;

    if (EVENT_INIT_NEEDED())
        event_init();

    /*
     * Sanity checks.
     */
    if (fd < 0 || fd >= event_fdlimit)
        msg_panic("%s: bad file descriptor: %d", myname, fd);

    if (msg_verbose > 2)
        msg_info("%s: fd %d", myname, fd);

    if (fd >= event_fdslots)
        event_extend(fd);

    /*
     * Disallow mixed (i.e. read and write) requests on the same descriptor.
     */
    if (EVENT_MASK_ISSET(fd, &event_wmask))
        msg_panic("%s: fd %d: read/write I/O request", myname, fd);

    /*
     * Postfix 2.4 allows multiple event_enable_read() calls on the same
     * descriptor without requiring event_disable_readwrite() calls between
     * them. With kernel-based filters (kqueue, /dev/poll, epoll) it's
     * wasteful to make system calls when we change only application
     * call-back information. It has a noticeable effect on smtp-source
     * performance.
     */
    if (EVENT_MASK_ISSET(fd, &event_rmask) == 0) {
        EVENT_MASK_SET(fd, &event_xmask);
        EVENT_MASK_SET(fd, &event_rmask);
        if (event_max_fd < fd)
            event_max_fd = fd;
#if (EVENTS_STYLE != EVENTS_STYLE_SELECT)
        EVENT_REG_ADD_READ(err, fd);
        if (err < 0)
            msg_fatal("%s: %s: %m", myname, EVENT_REG_ADD_TEXT);
#endif
    }
    fdp = event_fdtable + fd;
    if (fdp->callback != callback || fdp->context != context) {
        fdp->callback = callback;
        fdp->context = context;
    }
}

/* event_enable_write - enable write events */

void    event_enable_write(int fd, EVENT_NOTIFY_RDWR_FN callback, void *context)
{
    const char *myname = "event_enable_write";
    EVENT_FDTABLE *fdp;
    int     err;

    if (EVENT_INIT_NEEDED())
        event_init();

    /*
     * Sanity checks.
     */
    if (fd < 0 || fd >= event_fdlimit)
        msg_panic("%s: bad file descriptor: %d", myname, fd);

    if (msg_verbose > 2)
        msg_info("%s: fd %d", myname, fd);

    if (fd >= event_fdslots)
        event_extend(fd);

    /*
     * Disallow mixed (i.e. read and write) requests on the same descriptor.
     */
    if (EVENT_MASK_ISSET(fd, &event_rmask))
        msg_panic("%s: fd %d: read/write I/O request", myname, fd);

    /*
     * Postfix 2.4 allows multiple event_enable_write() calls on the same
     * descriptor without requiring event_disable_readwrite() calls between
     * them. With kernel-based filters (kqueue, /dev/poll, epoll) it's
     * incredibly wasteful to make unregister and register system calls when
     * we change only application call-back information. It has a noticeable
     * effect on smtp-source performance.
     */
    if (EVENT_MASK_ISSET(fd, &event_wmask) == 0) {
        EVENT_MASK_SET(fd, &event_xmask);
        EVENT_MASK_SET(fd, &event_wmask);
        if (event_max_fd < fd)
            event_max_fd = fd;
#if (EVENTS_STYLE != EVENTS_STYLE_SELECT)
        EVENT_REG_ADD_WRITE(err, fd);
        if (err < 0)
            msg_fatal("%s: %s: %m", myname, EVENT_REG_ADD_TEXT);
#endif
    }
    fdp = event_fdtable + fd;
    if (fdp->callback != callback || fdp->context != context) {
        fdp->callback = callback;
        fdp->context = context;
    }
}

/* event_disable_readwrite - disable request for read or write events */

void    event_disable_readwrite(int fd)
{
    const char *myname = "event_disable_readwrite";
    EVENT_FDTABLE *fdp;
    int     err;

    if (EVENT_INIT_NEEDED())
        event_init();

    /*
     * Sanity checks.
     */
    if (fd < 0 || fd >= event_fdlimit)
        msg_panic("%s: bad file descriptor: %d", myname, fd);

    if (msg_verbose > 2)
        msg_info("%s: fd %d", myname, fd);

    /*
     * Don't complain when there is nothing to cancel. The request may have
     * been canceled from another thread.
     */
    if (fd >= event_fdslots)
        return;
#if (EVENTS_STYLE != EVENTS_STYLE_SELECT)
#ifdef EVENT_REG_DEL_BOTH
    /* XXX Can't seem to disable READ and WRITE events selectively. */
    if (EVENT_MASK_ISSET(fd, &event_rmask)
        || EVENT_MASK_ISSET(fd, &event_wmask)) {
        EVENT_REG_DEL_BOTH(err, fd);
        if (err < 0)
            msg_fatal("%s: %s: %m", myname, EVENT_REG_DEL_TEXT);
    }
#else
    if (EVENT_MASK_ISSET(fd, &event_rmask)) {
        EVENT_REG_DEL_READ(err, fd);
        if (err < 0)
            msg_fatal("%s: %s: %m", myname, EVENT_REG_DEL_TEXT);
    } else if (EVENT_MASK_ISSET(fd, &event_wmask)) {
        EVENT_REG_DEL_WRITE(err, fd);
        if (err < 0)
            msg_fatal("%s: %s: %m", myname, EVENT_REG_DEL_TEXT);
    }
#endif                                  /* EVENT_REG_DEL_BOTH */
#endif                                  /* != EVENTS_STYLE_SELECT */
    EVENT_MASK_CLR(fd, &event_xmask);
    EVENT_MASK_CLR(fd, &event_rmask);
    EVENT_MASK_CLR(fd, &event_wmask);
    fdp = event_fdtable + fd;
    fdp->callback = 0;
    fdp->context = 0;
}
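 /*
  * Editor's illustration (not part of the original module): the header
  * notes that "it is up to the application to assemble partial reads or
  * writes". A sketch of a write-side callback that keeps its own offset and
  * unregisters itself when the buffer has been fully written. All names,
  * including the guard macro EVENTS_EXAMPLE_WRITE, are hypothetical.
  */
#ifdef EVENTS_EXAMPLE_WRITE
#include <stdio.h>

struct send_state {
    int     fd;                         /* output descriptor */
    const char *buf;                    /* data to send */
    size_t  len;                        /* total length */
    size_t  sent;                       /* bytes written so far */
};

static void send_more(int event, void *context)
{
    struct send_state *sp = (struct send_state *) context;
    ssize_t n;

    if (event != EVENT_WRITE
        || (n = write(sp->fd, sp->buf + sp->sent, sp->len - sp->sent)) < 0) {
        event_disable_readwrite(sp->fd);        /* exception or write error */
        return;
    }
    sp->sent += n;
    if (sp->sent == sp->len)                    /* done: stop write events */
        event_disable_readwrite(sp->fd);
}

int     main(void)
{
    static struct send_state state = {1, "hello, world\n", 13, 0};

    event_enable_write(state.fd, send_more, (void *) &state);
    event_drain(5);
    return (0);
}
#endif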
/* event_request_timer - (re)set timer */

time_t  event_request_timer(EVENT_NOTIFY_TIME_FN callback, void *context,
                                    int delay)
{
    const char *myname = "event_request_timer";
    RING   *ring;
    EVENT_TIMER *timer;

    if (EVENT_INIT_NEEDED())
        event_init();

    /*
     * Sanity checks.
     */
    if (delay < 0)
        msg_panic("%s: invalid delay: %d", myname, delay);

    /*
     * Make sure we schedule this event at the right time.
     */
    time(&event_present);

    /*
     * See if they are resetting an existing timer request. If so, take the
     * request away from the timer queue so that it can be inserted at the
     * right place.
     */
    FOREACH_QUEUE_ENTRY(ring, &event_timer_head) {
        timer = RING_TO_TIMER(ring);
        if (timer->callback == callback && timer->context == context) {
            timer->when = event_present + delay;
            timer->loop_instance = event_loop_instance;
            ring_detach(ring);
            if (msg_verbose > 2)
                msg_info("%s: reset 0x%lx 0x%lx %d", myname,
                         (long) callback, (long) context, delay);
            break;
        }
    }

    /*
     * If not found, schedule a new timer request.
     */
    if (ring == &event_timer_head) {
        timer = (EVENT_TIMER *) mymalloc(sizeof(EVENT_TIMER));
        timer->when = event_present + delay;
        timer->callback = callback;
        timer->context = context;
        timer->loop_instance = event_loop_instance;
        if (msg_verbose > 2)
            msg_info("%s: set 0x%lx 0x%lx %d", myname,
                     (long) callback, (long) context, delay);
    }

    /*
     * Timer requests are kept sorted to reduce lookup overhead in the event
     * loop.
     *
     * XXX Append the new request after existing requests for the same time
     * slot. The event_loop() routine depends on this to avoid starving I/O
     * events when a call-back function schedules a zero-delay timer request.
     */
    FOREACH_QUEUE_ENTRY(ring, &event_timer_head) {
        if (timer->when < RING_TO_TIMER(ring)->when)
            break;
    }
    ring_prepend(ring, &timer->ring);

    return (timer->when);
}

/* event_cancel_timer - cancel timer */

int     event_cancel_timer(EVENT_NOTIFY_TIME_FN callback, void *context)
{
    const char *myname = "event_cancel_timer";
    RING   *ring;
    EVENT_TIMER *timer;
    int     time_left = -1;

    if (EVENT_INIT_NEEDED())
        event_init();

    /*
     * See if they are canceling an existing timer request. Do not complain
     * when the request is not found. It might have been canceled from some
     * other thread.
     */
    FOREACH_QUEUE_ENTRY(ring, &event_timer_head) {
        timer = RING_TO_TIMER(ring);
        if (timer->callback == callback && timer->context == context) {
            if ((time_left = timer->when - event_present) < 0)
                time_left = 0;
            ring_detach(ring);
            myfree((void *) timer);
            break;
        }
    }
    if (msg_verbose > 2)
        msg_info("%s: 0x%lx 0x%lx %d", myname,
                 (long) callback, (long) context, time_left);
    return (time_left);
}
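 /*
  * Editor's illustration (not part of the original module): the return
  * value of event_cancel_timer() is the time remaining, which lets a caller
  * suspend a deadline and later resume it with the unused portion. A hedged
  * sketch; the names and the EVENTS_EXAMPLE_SUSPEND guard macro are
  * hypothetical.
  */
#ifdef EVENTS_EXAMPLE_SUSPEND
static void deadline(int unused_event, void *unused_context)
{
    msg_fatal("deadline expired");
}

static void suspend_resume_demo(void)
{
    int     time_left;

    event_request_timer(deadline, (void *) 0, 30);      /* 30-second budget */

    /* ... do work that should not count against the budget ... */

    if ((time_left = event_cancel_timer(deadline, (void *) 0)) >= 0)
        event_request_timer(deadline, (void *) 0, time_left);  /* resume */
}

int     main(void)
{
    suspend_resume_demo();
    return (0);
}
#endif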
/* event_loop - wait for the next event */

void    event_loop(int delay)
{
    const char *myname = "event_loop";
    static int nested;

#if (EVENTS_STYLE == EVENTS_STYLE_SELECT)
    fd_set  rmask;
    fd_set  wmask;
    fd_set  xmask;
    struct timeval tv;
    struct timeval *tvp;
    int     new_max_fd;

#else
    EVENT_BUFFER event_buf[100];
    EVENT_BUFFER *bp;

#endif
    int     event_count;
    EVENT_TIMER *timer;
    int     fd;
    EVENT_FDTABLE *fdp;
    int     select_delay;

    if (EVENT_INIT_NEEDED())
        event_init();

    /*
     * XXX Also print the select() masks?
     */
    if (msg_verbose > 2) {
        RING   *ring;

        FOREACH_QUEUE_ENTRY(ring, &event_timer_head) {
            timer = RING_TO_TIMER(ring);
            msg_info("%s: time left %3d for 0x%lx 0x%lx", myname,
                     (int) (timer->when - event_present),
                     (long) timer->callback, (long) timer->context);
        }
    }

    /*
     * Find out when the next timer would go off. Timer requests are sorted.
     * If any timer is scheduled, adjust the delay appropriately.
     */
    if ((timer = FIRST_TIMER(&event_timer_head)) != 0) {
        event_present = time((time_t *) 0);
        if ((select_delay = timer->when - event_present) < 0) {
            select_delay = 0;
        } else if (delay >= 0 && select_delay > delay) {
            select_delay = delay;
        }
    } else {
        select_delay = delay;
    }
    if (msg_verbose > 2)
        msg_info("event_loop: select_delay %d", select_delay);

    /*
     * Negative delay means: wait until something happens. Zero delay means:
     * poll. Positive delay means: wait at most this long.
     */
#if (EVENTS_STYLE == EVENTS_STYLE_SELECT)
    if (select_delay < 0) {
        tvp = 0;
    } else {
        tvp = &tv;
        tv.tv_usec = 0;
        tv.tv_sec = select_delay;
    }

    /*
     * Pause until the next event happens. When select() has a problem, don't
     * go into a tight loop. Allow select() to be interrupted due to the
     * arrival of a signal.
     */
    rmask = event_rmask;
    wmask = event_wmask;
    xmask = event_xmask;
    event_count = select(event_max_fd + 1, &rmask, &wmask, &xmask, tvp);
    if (event_count < 0) {
        if (errno != EINTR)
            msg_fatal("event_loop: select: %m");
        return;
    }
#else
    EVENT_BUFFER_READ(event_count, event_buf,
                      sizeof(event_buf) / sizeof(event_buf[0]),
                      select_delay);
    if (event_count < 0) {
        if (errno != EINTR)
            msg_fatal("event_loop: " EVENT_BUFFER_READ_TEXT ": %m");
        return;
    }
#endif

    /*
     * Before entering the application call-back routines, make sure we
     * aren't being called from a call-back routine. Doing so would make us
     * vulnerable to all kinds of race conditions.
     */
    if (nested++ > 0)
        msg_panic("event_loop: recursive call");

    /*
     * Deliver timer events. Allow the application to add/delete timer queue
     * requests while it is being called back. Requests are sorted: we keep
     * running over the timer request queue from the start, and stop when we
     * reach the future or the list end. We also stop when we reach a timer
     * request that was added by a call-back that was invoked from this
     * event_loop() call instance, for reasons that are explained below.
     *
     * To avoid dangling pointer problems 1) we must remove a request from the
     * timer queue before delivering its event to the application and 2) we
     * must look up the next timer request *after* calling the application.
     * The latter complicates the handling of zero-delay timer requests that
     * are added by event_loop() call-back functions.
     *
     * XXX When a timer event call-back function adds a new timer request,
     * event_request_timer() labels the request with the event_loop() call
     * instance that invoked the timer event call-back. We use this instance
     * label here to prevent zero-delay timer requests from running in a
     * tight loop and starving I/O events. To make this solution work,
     * event_request_timer() appends a new request after existing requests
     * for the same time slot.
     */
    event_present = time((time_t *) 0);
    event_loop_instance += 1;
    while ((timer = FIRST_TIMER(&event_timer_head)) != 0) {
        if (timer->when > event_present)
            break;
        if (timer->loop_instance == event_loop_instance)
            break;
        ring_detach(&timer->ring);              /* first this */
        if (msg_verbose > 2)
            msg_info("%s: timer 0x%lx 0x%lx", myname,
                     (long) timer->callback, (long) timer->context);
        timer->callback(EVENT_TIME, timer->context);    /* then this */
        myfree((void *) timer);
    }

    /*
     * Deliver I/O events. Allow the application to cancel event requests
     * while it is being called back. To this end, we keep an eye on the
     * contents of event_xmask, so that we deliver only events that are still
     * wanted. We do not change the event request masks. It is up to the
     * application to determine when a read or write is complete.
     */
#if (EVENTS_STYLE == EVENTS_STYLE_SELECT)
    if (event_count > 0) {
        for (new_max_fd = 0, fd = 0; fd <= event_max_fd; fd++) {
            if (FD_ISSET(fd, &event_xmask)) {
                new_max_fd = fd;
                /* In case event_fdtable is updated. */
                fdp = event_fdtable + fd;
                if (FD_ISSET(fd, &xmask)) {
                    if (msg_verbose > 2)
                        msg_info("%s: exception fd=%d act=0x%lx 0x%lx", myname,
                             fd, (long) fdp->callback, (long) fdp->context);
                    fdp->callback(EVENT_XCPT, fdp->context);
                } else if (FD_ISSET(fd, &wmask)) {
                    if (msg_verbose > 2)
                        msg_info("%s: write fd=%d act=0x%lx 0x%lx", myname,
                             fd, (long) fdp->callback, (long) fdp->context);
                    fdp->callback(EVENT_WRITE, fdp->context);
                } else if (FD_ISSET(fd, &rmask)) {
                    if (msg_verbose > 2)
                        msg_info("%s: read fd=%d act=0x%lx 0x%lx", myname,
                             fd, (long) fdp->callback, (long) fdp->context);
                    fdp->callback(EVENT_READ, fdp->context);
                }
            }
        }
        event_max_fd = new_max_fd;
    }
#else
    for (bp = event_buf; bp < event_buf + event_count; bp++) {
        fd = EVENT_GET_FD(bp);
        if (fd < 0 || fd > event_max_fd)
            msg_panic("%s: bad file descriptor: %d", myname, fd);
        if (EVENT_MASK_ISSET(fd, &event_xmask)) {
            fdp = event_fdtable + fd;
            if (EVENT_TEST_READ(bp)) {
                if (msg_verbose > 2)
                    msg_info("%s: read fd=%d act=0x%lx 0x%lx", myname,
                             fd, (long) fdp->callback, (long) fdp->context);
                fdp->callback(EVENT_READ, fdp->context);
            } else if (EVENT_TEST_WRITE(bp)) {
                if (msg_verbose > 2)
                    msg_info("%s: write fd=%d act=0x%lx 0x%lx", myname,
                             fd, (long) fdp->callback, (long) fdp->context);
                fdp->callback(EVENT_WRITE, fdp->context);
            } else {
                if (msg_verbose > 2)
                    msg_info("%s: other fd=%d act=0x%lx 0x%lx", myname,
                             fd, (long) fdp->callback, (long) fdp->context);
                fdp->callback(EVENT_XCPT, fdp->context);
            }
        }
    }
#endif
    nested--;
}

#ifdef TEST

 /*
  * Proof-of-concept test program for the event manager. Schedule a series of
  * events at one-second intervals and let them happen, while echoing any
  * lines read from stdin.
  */
#include <stdio.h>
#include <stdlib.h>

/* timer_event - display event */

static void timer_event(int unused_event, void *context)
{
    printf("%ld: %s\n", (long) event_present, (char *) context);
    fflush(stdout);
}

/* echo - echo text received on stdin */

static void echo(int unused_event, void *unused_context)
{
    char    buf[BUFSIZ];

    if (fgets(buf, sizeof(buf), stdin) == 0)
        exit(0);
    printf("Result: %s", buf);
}

/* request - request a bunch of timer events */

static void request(int unused_event, void *unused_context)
{
    event_request_timer(timer_event, "3 first", 3);
    event_request_timer(timer_event, "3 second", 3);
    event_request_timer(timer_event, "4 first", 4);
    event_request_timer(timer_event, "4 second", 4);
    event_request_timer(timer_event, "2 first", 2);
    event_request_timer(timer_event, "2 second", 2);
    event_request_timer(timer_event, "1 first", 1);
    event_request_timer(timer_event, "1 second", 1);
    event_request_timer(timer_event, "0 first", 0);
    event_request_timer(timer_event, "0 second", 0);
}

int     main(int argc, char **argv)
{
    if (argv[1])
        msg_verbose = atoi(argv[1]);
    event_request_timer(request, (void *) 0, 0);
    event_enable_read(fileno(stdin), echo, (void *) 0);
    event_drain(10);
    exit(0);
}
#endif
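 /*
  * Editor's illustration (not part of the original module): why event_loop()
  * tags timer requests with a call instance. A callback that re-requests
  * itself with zero delay is delivered at most once per event_loop() call,
  * so I/O events still get a chance to run. The guard macro
  * EVENTS_EXAMPLE_STARVE is hypothetical; build with it instead of TEST.
  */
#ifdef EVENTS_EXAMPLE_STARVE
#include <stdio.h>

static int rearm_count;

static void rearm(int unused_event, void *unused_context)
{
    rearm_count += 1;
    /* Without the loop-instance check this would monopolize event_loop(). */
    event_request_timer(rearm, (void *) 0, 0);
}

int     main(void)
{
    event_request_timer(rearm, (void *) 0, 0);
    event_drain(2);                             /* runs for about 2 seconds */
    printf("re-armed %d times in 2 seconds\n", rearm_count);
    return (0);
}
#endif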