diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 12:06:34 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 12:06:34 +0000 |
commit | 5e61585d76ae77fd5e9e96ebabb57afa4d74880d (patch) | |
tree | 2b467823aaeebc7ef8bc9e3cabe8074eaef1666d /src/util/events.c | |
parent | Initial commit. (diff) | |
download | postfix-5e61585d76ae77fd5e9e96ebabb57afa4d74880d.tar.xz postfix-5e61585d76ae77fd5e9e96ebabb57afa4d74880d.zip |
Adding upstream version 3.5.24.upstream/3.5.24upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | src/util/events.c | 1261 |
1 files changed, 1261 insertions, 0 deletions
diff --git a/src/util/events.c b/src/util/events.c new file mode 100644 index 0000000..c2157bb --- /dev/null +++ b/src/util/events.c @@ -0,0 +1,1261 @@ +/*++ +/* NAME +/* events 3 +/* SUMMARY +/* event manager +/* SYNOPSIS +/* #include <events.h> +/* +/* time_t event_time() +/* +/* void event_loop(delay) +/* int delay; +/* +/* time_t event_request_timer(callback, context, delay) +/* void (*callback)(int event, void *context); +/* void *context; +/* int delay; +/* +/* int event_cancel_timer(callback, context) +/* void (*callback)(int event, void *context); +/* void *context; +/* +/* void event_enable_read(fd, callback, context) +/* int fd; +/* void (*callback)(int event, void *context); +/* void *context; +/* +/* void event_enable_write(fd, callback, context) +/* int fd; +/* void (*callback)(int event, void *context); +/* void *context; +/* +/* void event_disable_readwrite(fd) +/* int fd; +/* +/* void event_drain(time_limit) +/* int time_limit; +/* +/* void event_fork(void) +/* DESCRIPTION +/* This module delivers I/O and timer events. +/* Multiple I/O streams and timers can be monitored simultaneously. +/* Events are delivered via callback routines provided by the +/* application. When requesting an event, the application can provide +/* private context that is passed back when the callback routine is +/* executed. +/* +/* event_time() returns a cached value of the current time. +/* +/* event_loop() monitors all I/O channels for which the application has +/* expressed interest, and monitors the timer request queue. +/* It notifies the application whenever events of interest happen. +/* A negative delay value causes the function to pause until something +/* happens; a positive delay value causes event_loop() to return when +/* the next event happens or when the delay time in seconds is over, +/* whatever happens first. A zero delay effectuates a poll. +/* +/* Note: in order to avoid race conditions, event_loop() cannot +/* not be called recursively. +/* +/* event_request_timer() causes the specified callback function to +/* be called with the specified context argument after \fIdelay\fR +/* seconds, or as soon as possible thereafter. The delay should +/* not be negative (the manifest EVENT_NULL_DELAY provides for +/* convenient zero-delay notification). +/* The event argument is equal to EVENT_TIME. +/* Only one timer request can be active per (callback, context) pair. +/* Calling event_request_timer() with an existing (callback, context) +/* pair does not schedule a new event, but updates the time of event +/* delivery. The result is the absolute time at which the timer is +/* scheduled to go off. +/* +/* event_cancel_timer() cancels the specified (callback, context) request. +/* The application is allowed to cancel non-existing requests. The result +/* value is the amount of time left before the timer would have gone off, +/* or -1 in case of no pending timer. +/* +/* event_enable_read() (event_enable_write()) enables read (write) events +/* on the named I/O channel. It is up to the application to assemble +/* partial reads or writes. +/* An I/O channel cannot handle more than one request at the +/* same time. The application is allowed to enable an event that +/* is already enabled (same channel, same read or write operation, +/* but perhaps a different callback or context). On systems with +/* kernel-based event filters this is preferred usage, because +/* each disable and enable request would cost a system call. +/* +/* The manifest constants EVENT_NULL_CONTEXT and EVENT_NULL_TYPE +/* provide convenient null values. +/* +/* The callback routine has the following arguments: +/* .IP fd +/* The stream on which the event happened. +/* .IP event +/* An indication of the event type: +/* .RS +/* .IP EVENT_READ +/* read event, +/* .IP EVENT_WRITE +/* write event, +/* .IP EVENT_XCPT +/* exception (actually, any event other than read or write). +/* .RE +/* .IP context +/* Application context given to event_enable_read() (event_enable_write()). +/* .PP +/* event_disable_readwrite() disables further I/O events on the specified +/* I/O channel. The application is allowed to cancel non-existing +/* I/O event requests. +/* +/* event_drain() repeatedly calls event_loop() until no more timer +/* events or I/O events are pending or until the time limit is reached. +/* This routine must not be called from an event_whatever() callback +/* routine. Note: this function assumes that no new I/O events +/* will be registered. +/* +/* event_fork() must be called by a child process after it is +/* created with fork(), to re-initialize event processing. +/* DIAGNOSTICS +/* Panics: interface violations. Fatal errors: out of memory, +/* system call failure. Warnings: the number of available +/* file descriptors is much less than FD_SETSIZE. +/* BUGS +/* This module is based on event selection. It assumes that the +/* event_loop() routine is called frequently. This approach is +/* not suitable for applications with compute-bound loops that +/* take a significant amount of time. +/* LICENSE +/* .ad +/* .fi +/* The Secure Mailer license must be distributed with this software. +/* AUTHOR(S) +/* Wietse Venema +/* IBM T.J. Watson Research +/* P.O. Box 704 +/* Yorktown Heights, NY 10598, USA +/*--*/ + +/* System libraries. */ + +#include "sys_defs.h" +#include <sys/time.h> /* XXX: 44BSD uses bzero() */ +#include <time.h> +#include <errno.h> +#include <unistd.h> +#include <stddef.h> /* offsetof() */ +#include <string.h> /* bzero() prototype for 44BSD */ +#include <limits.h> /* INT_MAX */ + +#ifdef USE_SYS_SELECT_H +#include <sys/select.h> +#endif + +/* Application-specific. */ + +#include "mymalloc.h" +#include "msg.h" +#include "iostuff.h" +#include "ring.h" +#include "events.h" + +#if !defined(EVENTS_STYLE) +#error "must define EVENTS_STYLE" +#endif + + /* + * Traditional BSD-style select(2). Works everywhere, but has a built-in + * upper bound on the number of file descriptors, and that limit is hard to + * change on Linux. Is sometimes emulated with SYSV-style poll(2) which + * doesn't have the file descriptor limit, but unfortunately does not help + * to improve the performance of servers with lots of connections. + */ +#define EVENT_ALLOC_INCR 10 + +#if (EVENTS_STYLE == EVENTS_STYLE_SELECT) +typedef fd_set EVENT_MASK; + +#define EVENT_MASK_BYTE_COUNT(mask) sizeof(*(mask)) +#define EVENT_MASK_ZERO(mask) FD_ZERO(mask) +#define EVENT_MASK_SET(fd, mask) FD_SET((fd), (mask)) +#define EVENT_MASK_ISSET(fd, mask) FD_ISSET((fd), (mask)) +#define EVENT_MASK_CLR(fd, mask) FD_CLR((fd), (mask)) +#define EVENT_MASK_CMP(m1, m2) memcmp((m1), (m2), EVENT_MASK_BYTE_COUNT(m1)) +#else + + /* + * Kernel-based event filters (kqueue, /dev/poll, epoll). We use the + * following file descriptor mask structure which is expanded on the fly. + */ +typedef struct { + char *data; /* bit mask */ + size_t data_len; /* data byte count */ +} EVENT_MASK; + + /* Bits per byte, byte in vector, bit offset in byte, bytes per set. */ +#define EVENT_MASK_NBBY (8) +#define EVENT_MASK_FD_BYTE(fd, mask) \ + (((unsigned char *) (mask)->data)[(fd) / EVENT_MASK_NBBY]) +#define EVENT_MASK_FD_BIT(fd) (1 << ((fd) % EVENT_MASK_NBBY)) +#define EVENT_MASK_BYTES_NEEDED(len) \ + (((len) + (EVENT_MASK_NBBY -1)) / EVENT_MASK_NBBY) +#define EVENT_MASK_BYTE_COUNT(mask) ((mask)->data_len) + + /* Memory management. */ +#define EVENT_MASK_ALLOC(mask, bit_len) do { \ + size_t _byte_len = EVENT_MASK_BYTES_NEEDED(bit_len); \ + (mask)->data = mymalloc(_byte_len); \ + memset((mask)->data, 0, _byte_len); \ + (mask)->data_len = _byte_len; \ + } while (0) +#define EVENT_MASK_REALLOC(mask, bit_len) do { \ + size_t _byte_len = EVENT_MASK_BYTES_NEEDED(bit_len); \ + size_t _old_len = (mask)->data_len; \ + (mask)->data = myrealloc((mask)->data, _byte_len); \ + if (_byte_len > _old_len) \ + memset((mask)->data + _old_len, 0, _byte_len - _old_len); \ + (mask)->data_len = _byte_len; \ + } while (0) +#define EVENT_MASK_FREE(mask) myfree((mask)->data) + + /* Set operations, modeled after FD_ZERO/SET/ISSET/CLR. */ +#define EVENT_MASK_ZERO(mask) \ + memset((mask)->data, 0, (mask)->data_len) +#define EVENT_MASK_SET(fd, mask) \ + (EVENT_MASK_FD_BYTE((fd), (mask)) |= EVENT_MASK_FD_BIT(fd)) +#define EVENT_MASK_ISSET(fd, mask) \ + (EVENT_MASK_FD_BYTE((fd), (mask)) & EVENT_MASK_FD_BIT(fd)) +#define EVENT_MASK_CLR(fd, mask) \ + (EVENT_MASK_FD_BYTE((fd), (mask)) &= ~EVENT_MASK_FD_BIT(fd)) +#define EVENT_MASK_CMP(m1, m2) \ + memcmp((m1)->data, (m2)->data, EVENT_MASK_BYTE_COUNT(m1)) +#endif + + /* + * I/O events. + */ +typedef struct EVENT_FDTABLE EVENT_FDTABLE; + +struct EVENT_FDTABLE { + EVENT_NOTIFY_RDWR_FN callback; + char *context; +}; +static EVENT_MASK event_rmask; /* enabled read events */ +static EVENT_MASK event_wmask; /* enabled write events */ +static EVENT_MASK event_xmask; /* for bad news mostly */ +static int event_fdlimit; /* per-process open file limit */ +static EVENT_FDTABLE *event_fdtable; /* one slot per file descriptor */ +static int event_fdslots; /* number of file descriptor slots */ +static int event_max_fd = -1; /* highest fd number seen */ + + /* + * FreeBSD kqueue supports no system call to find out what descriptors are + * registered in the kernel-based filter. To implement our own sanity checks + * we maintain our own descriptor bitmask. + * + * FreeBSD kqueue does support application context pointers. Unfortunately, + * changing that information would cost a system call, and some of the + * competitors don't support application context. To keep the implementation + * simple we maintain our own table with call-back information. + * + * FreeBSD kqueue silently unregisters a descriptor from its filter when the + * descriptor is closed, so our information could get out of sync with the + * kernel. But that will never happen, because we have to meticulously + * unregister a file descriptor before it is closed, to avoid errors on + * systems that are built with EVENTS_STYLE == EVENTS_STYLE_SELECT. + */ +#if (EVENTS_STYLE == EVENTS_STYLE_KQUEUE) +#include <sys/event.h> + + /* + * Some early FreeBSD implementations don't have the EV_SET macro. + */ +#ifndef EV_SET +#define EV_SET(kp, id, fi, fl, ffl, da, ud) do { \ + (kp)->ident = (id); \ + (kp)->filter = (fi); \ + (kp)->flags = (fl); \ + (kp)->fflags = (ffl); \ + (kp)->data = (da); \ + (kp)->udata = (ud); \ + } while(0) +#endif + + /* + * Macros to initialize the kernel-based filter; see event_init(). + */ +static int event_kq; /* handle to event filter */ + +#define EVENT_REG_INIT_HANDLE(er, n) do { \ + er = event_kq = kqueue(); \ + } while (0) +#define EVENT_REG_INIT_TEXT "kqueue" + +#define EVENT_REG_FORK_HANDLE(er, n) do { \ + (void) close(event_kq); \ + EVENT_REG_INIT_HANDLE(er, (n)); \ + } while (0) + + /* + * Macros to update the kernel-based filter; see event_enable_read(), + * event_enable_write() and event_disable_readwrite(). + */ +#define EVENT_REG_FD_OP(er, fh, ev, op) do { \ + struct kevent dummy; \ + EV_SET(&dummy, (fh), (ev), (op), 0, 0, 0); \ + (er) = kevent(event_kq, &dummy, 1, 0, 0, 0); \ + } while (0) + +#define EVENT_REG_ADD_OP(e, f, ev) EVENT_REG_FD_OP((e), (f), (ev), EV_ADD) +#define EVENT_REG_ADD_READ(e, f) EVENT_REG_ADD_OP((e), (f), EVFILT_READ) +#define EVENT_REG_ADD_WRITE(e, f) EVENT_REG_ADD_OP((e), (f), EVFILT_WRITE) +#define EVENT_REG_ADD_TEXT "kevent EV_ADD" + +#define EVENT_REG_DEL_OP(e, f, ev) EVENT_REG_FD_OP((e), (f), (ev), EV_DELETE) +#define EVENT_REG_DEL_READ(e, f) EVENT_REG_DEL_OP((e), (f), EVFILT_READ) +#define EVENT_REG_DEL_WRITE(e, f) EVENT_REG_DEL_OP((e), (f), EVFILT_WRITE) +#define EVENT_REG_DEL_TEXT "kevent EV_DELETE" + + /* + * Macros to retrieve event buffers from the kernel; see event_loop(). + */ +typedef struct kevent EVENT_BUFFER; + +#define EVENT_BUFFER_READ(event_count, event_buf, buflen, delay) do { \ + struct timespec ts; \ + struct timespec *tsp; \ + if ((delay) < 0) { \ + tsp = 0; \ + } else { \ + tsp = &ts; \ + ts.tv_nsec = 0; \ + ts.tv_sec = (delay); \ + } \ + (event_count) = kevent(event_kq, (struct kevent *) 0, 0, (event_buf), \ + (buflen), (tsp)); \ + } while (0) +#define EVENT_BUFFER_READ_TEXT "kevent" + + /* + * Macros to process event buffers from the kernel; see event_loop(). + */ +#define EVENT_GET_FD(bp) ((bp)->ident) +#define EVENT_GET_TYPE(bp) ((bp)->filter) +#define EVENT_TEST_READ(bp) (EVENT_GET_TYPE(bp) == EVFILT_READ) +#define EVENT_TEST_WRITE(bp) (EVENT_GET_TYPE(bp) == EVFILT_WRITE) + +#endif + + /* + * Solaris /dev/poll does not support application context, so we have to + * maintain our own. This has the benefit of avoiding an expensive system + * call just to change a call-back function or argument. + * + * Solaris /dev/poll does have a way to query if a specific descriptor is + * registered. However, we maintain a descriptor mask anyway because a) it + * avoids having to make an expensive system call to find out if something + * is registered, b) some EVENTS_STYLE_MUMBLE implementations need a + * descriptor bitmask anyway and c) we use the bitmask already to implement + * sanity checks. + */ +#if (EVENTS_STYLE == EVENTS_STYLE_DEVPOLL) +#include <sys/devpoll.h> +#include <fcntl.h> + + /* + * Macros to initialize the kernel-based filter; see event_init(). + */ +static int event_pollfd; /* handle to file descriptor set */ + +#define EVENT_REG_INIT_HANDLE(er, n) do { \ + er = event_pollfd = open("/dev/poll", O_RDWR); \ + if (event_pollfd >= 0) close_on_exec(event_pollfd, CLOSE_ON_EXEC); \ + } while (0) +#define EVENT_REG_INIT_TEXT "open /dev/poll" + +#define EVENT_REG_FORK_HANDLE(er, n) do { \ + (void) close(event_pollfd); \ + EVENT_REG_INIT_HANDLE(er, (n)); \ + } while (0) + + /* + * Macros to update the kernel-based filter; see event_enable_read(), + * event_enable_write() and event_disable_readwrite(). + */ +#define EVENT_REG_FD_OP(er, fh, ev) do { \ + struct pollfd dummy; \ + dummy.fd = (fh); \ + dummy.events = (ev); \ + (er) = write(event_pollfd, (void *) &dummy, \ + sizeof(dummy)) != sizeof(dummy) ? -1 : 0; \ + } while (0) + +#define EVENT_REG_ADD_READ(e, f) EVENT_REG_FD_OP((e), (f), POLLIN) +#define EVENT_REG_ADD_WRITE(e, f) EVENT_REG_FD_OP((e), (f), POLLOUT) +#define EVENT_REG_ADD_TEXT "write /dev/poll" + +#define EVENT_REG_DEL_BOTH(e, f) EVENT_REG_FD_OP((e), (f), POLLREMOVE) +#define EVENT_REG_DEL_TEXT "write /dev/poll" + + /* + * Macros to retrieve event buffers from the kernel; see event_loop(). + */ +typedef struct pollfd EVENT_BUFFER; + +#define EVENT_BUFFER_READ(event_count, event_buf, buflen, delay) do { \ + struct dvpoll dvpoll; \ + dvpoll.dp_fds = (event_buf); \ + dvpoll.dp_nfds = (buflen); \ + dvpoll.dp_timeout = (delay) < 0 ? -1 : (delay) * 1000; \ + (event_count) = ioctl(event_pollfd, DP_POLL, &dvpoll); \ + } while (0) +#define EVENT_BUFFER_READ_TEXT "ioctl DP_POLL" + + /* + * Macros to process event buffers from the kernel; see event_loop(). + */ +#define EVENT_GET_FD(bp) ((bp)->fd) +#define EVENT_GET_TYPE(bp) ((bp)->revents) +#define EVENT_TEST_READ(bp) (EVENT_GET_TYPE(bp) & POLLIN) +#define EVENT_TEST_WRITE(bp) (EVENT_GET_TYPE(bp) & POLLOUT) + +#endif + + /* + * Linux epoll supports no system call to find out what descriptors are + * registered in the kernel-based filter. To implement our own sanity checks + * we maintain our own descriptor bitmask. + * + * Linux epoll does support application context pointers. Unfortunately, + * changing that information would cost a system call, and some of the + * competitors don't support application context. To keep the implementation + * simple we maintain our own table with call-back information. + * + * Linux epoll silently unregisters a descriptor from its filter when the + * descriptor is closed, so our information could get out of sync with the + * kernel. But that will never happen, because we have to meticulously + * unregister a file descriptor before it is closed, to avoid errors on + * systems that are built with EVENTS_STYLE == EVENTS_STYLE_SELECT. + */ +#if (EVENTS_STYLE == EVENTS_STYLE_EPOLL) +#include <sys/epoll.h> + + /* + * Macros to initialize the kernel-based filter; see event_init(). + */ +static int event_epollfd; /* epoll handle */ + +#define EVENT_REG_INIT_HANDLE(er, n) do { \ + er = event_epollfd = epoll_create(n); \ + if (event_epollfd >= 0) close_on_exec(event_epollfd, CLOSE_ON_EXEC); \ + } while (0) +#define EVENT_REG_INIT_TEXT "epoll_create" + +#define EVENT_REG_FORK_HANDLE(er, n) do { \ + (void) close(event_epollfd); \ + EVENT_REG_INIT_HANDLE(er, (n)); \ + } while (0) + + /* + * Macros to update the kernel-based filter; see event_enable_read(), + * event_enable_write() and event_disable_readwrite(). + */ +#define EVENT_REG_FD_OP(er, fh, ev, op) do { \ + struct epoll_event dummy; \ + dummy.events = (ev); \ + dummy.data.fd = (fh); \ + (er) = epoll_ctl(event_epollfd, (op), (fh), &dummy); \ + } while (0) + +#define EVENT_REG_ADD_OP(e, f, ev) EVENT_REG_FD_OP((e), (f), (ev), EPOLL_CTL_ADD) +#define EVENT_REG_ADD_READ(e, f) EVENT_REG_ADD_OP((e), (f), EPOLLIN) +#define EVENT_REG_ADD_WRITE(e, f) EVENT_REG_ADD_OP((e), (f), EPOLLOUT) +#define EVENT_REG_ADD_TEXT "epoll_ctl EPOLL_CTL_ADD" + +#define EVENT_REG_DEL_OP(e, f, ev) EVENT_REG_FD_OP((e), (f), (ev), EPOLL_CTL_DEL) +#define EVENT_REG_DEL_READ(e, f) EVENT_REG_DEL_OP((e), (f), EPOLLIN) +#define EVENT_REG_DEL_WRITE(e, f) EVENT_REG_DEL_OP((e), (f), EPOLLOUT) +#define EVENT_REG_DEL_TEXT "epoll_ctl EPOLL_CTL_DEL" + + /* + * Macros to retrieve event buffers from the kernel; see event_loop(). + */ +typedef struct epoll_event EVENT_BUFFER; + +#define EVENT_BUFFER_READ(event_count, event_buf, buflen, delay) do { \ + (event_count) = epoll_wait(event_epollfd, (event_buf), (buflen), \ + (delay) < 0 ? -1 : (delay) * 1000); \ + } while (0) +#define EVENT_BUFFER_READ_TEXT "epoll_wait" + + /* + * Macros to process event buffers from the kernel; see event_loop(). + */ +#define EVENT_GET_FD(bp) ((bp)->data.fd) +#define EVENT_GET_TYPE(bp) ((bp)->events) +#define EVENT_TEST_READ(bp) (EVENT_GET_TYPE(bp) & EPOLLIN) +#define EVENT_TEST_WRITE(bp) (EVENT_GET_TYPE(bp) & EPOLLOUT) + +#endif + + /* + * Timer events. Timer requests are kept sorted, in a circular list. We use + * the RING abstraction, so we get to use a couple ugly macros. + * + * When a call-back function adds a timer request, we label the request with + * the event_loop() call instance that invoked the call-back. We use this to + * prevent zero-delay timer requests from running in a tight loop and + * starving I/O events. + */ +typedef struct EVENT_TIMER EVENT_TIMER; + +struct EVENT_TIMER { + time_t when; /* when event is wanted */ + EVENT_NOTIFY_TIME_FN callback; /* callback function */ + char *context; /* callback context */ + long loop_instance; /* event_loop() call instance */ + RING ring; /* linkage */ +}; + +static RING event_timer_head; /* timer queue head */ +static long event_loop_instance; /* event_loop() call instance */ + +#define RING_TO_TIMER(r) \ + ((EVENT_TIMER *) ((void *) (r) - offsetof(EVENT_TIMER, ring))) + +#define FOREACH_QUEUE_ENTRY(entry, head) \ + for (entry = ring_succ(head); entry != (head); entry = ring_succ(entry)) + +#define FIRST_TIMER(head) \ + (ring_succ(head) != (head) ? RING_TO_TIMER(ring_succ(head)) : 0) + + /* + * Other private data structures. + */ +static time_t event_present; /* cached time of day */ + +#define EVENT_INIT_NEEDED() (event_present == 0) + +/* event_init - set up tables and such */ + +static void event_init(void) +{ + EVENT_FDTABLE *fdp; + int err; + + if (!EVENT_INIT_NEEDED()) + msg_panic("event_init: repeated call"); + + /* + * Initialize the file descriptor masks and the call-back table. Where + * possible we extend these data structures on the fly. With select(2) + * based implementations we can only handle FD_SETSIZE open files. + */ +#if (EVENTS_STYLE == EVENTS_STYLE_SELECT) + if ((event_fdlimit = open_limit(FD_SETSIZE)) < 0) + msg_fatal("unable to determine open file limit"); +#else + if ((event_fdlimit = open_limit(INT_MAX)) < 0) + msg_fatal("unable to determine open file limit"); +#endif + if (event_fdlimit < FD_SETSIZE / 2 && event_fdlimit < 256) + msg_warn("could allocate space for only %d open files", event_fdlimit); + event_fdslots = EVENT_ALLOC_INCR; + event_fdtable = (EVENT_FDTABLE *) + mymalloc(sizeof(EVENT_FDTABLE) * event_fdslots); + for (fdp = event_fdtable; fdp < event_fdtable + event_fdslots; fdp++) { + fdp->callback = 0; + fdp->context = 0; + } + + /* + * Initialize the I/O event request masks. + */ +#if (EVENTS_STYLE == EVENTS_STYLE_SELECT) + EVENT_MASK_ZERO(&event_rmask); + EVENT_MASK_ZERO(&event_wmask); + EVENT_MASK_ZERO(&event_xmask); +#else + EVENT_MASK_ALLOC(&event_rmask, event_fdslots); + EVENT_MASK_ALLOC(&event_wmask, event_fdslots); + EVENT_MASK_ALLOC(&event_xmask, event_fdslots); + + /* + * Initialize the kernel-based filter. + */ + EVENT_REG_INIT_HANDLE(err, event_fdslots); + if (err < 0) + msg_fatal("%s: %m", EVENT_REG_INIT_TEXT); +#endif + + /* + * Initialize timer stuff. + */ + ring_init(&event_timer_head); + (void) time(&event_present); + + /* + * Avoid an infinite initialization loop. + */ + if (EVENT_INIT_NEEDED()) + msg_panic("event_init: unable to initialize"); +} + +/* event_extend - make room for more descriptor slots */ + +static void event_extend(int fd) +{ + const char *myname = "event_extend"; + int old_slots = event_fdslots; + int new_slots = (event_fdslots > fd / 2 ? + 2 * old_slots : fd + EVENT_ALLOC_INCR); + EVENT_FDTABLE *fdp; + +#ifdef EVENT_REG_UPD_HANDLE + int err; + +#endif + + if (msg_verbose > 2) + msg_info("%s: fd %d", myname, fd); + event_fdtable = (EVENT_FDTABLE *) + myrealloc((void *) event_fdtable, sizeof(EVENT_FDTABLE) * new_slots); + event_fdslots = new_slots; + for (fdp = event_fdtable + old_slots; + fdp < event_fdtable + new_slots; fdp++) { + fdp->callback = 0; + fdp->context = 0; + } + + /* + * Initialize the I/O event request masks. + */ +#if (EVENTS_STYLE != EVENTS_STYLE_SELECT) + EVENT_MASK_REALLOC(&event_rmask, new_slots); + EVENT_MASK_REALLOC(&event_wmask, new_slots); + EVENT_MASK_REALLOC(&event_xmask, new_slots); +#endif +#ifdef EVENT_REG_UPD_HANDLE + EVENT_REG_UPD_HANDLE(err, new_slots); + if (err < 0) + msg_fatal("%s: %s: %m", myname, EVENT_REG_UPD_TEXT); +#endif +} + +/* event_time - look up cached time of day */ + +time_t event_time(void) +{ + if (EVENT_INIT_NEEDED()) + event_init(); + + return (event_present); +} + +/* event_drain - loop until all pending events are done */ + +void event_drain(int time_limit) +{ + EVENT_MASK zero_mask; + time_t max_time; + + if (EVENT_INIT_NEEDED()) + return; + +#if (EVENTS_STYLE == EVENTS_STYLE_SELECT) + EVENT_MASK_ZERO(&zero_mask); +#else + EVENT_MASK_ALLOC(&zero_mask, event_fdslots); +#endif + (void) time(&event_present); + max_time = event_present + time_limit; + while (event_present < max_time + && (event_timer_head.pred != &event_timer_head + || EVENT_MASK_CMP(&zero_mask, &event_xmask) != 0)) { + event_loop(1); +#if (EVENTS_STYLE != EVENTS_STYLE_SELECT) + if (EVENT_MASK_BYTE_COUNT(&zero_mask) + != EVENT_MASK_BYTES_NEEDED(event_fdslots)) + EVENT_MASK_REALLOC(&zero_mask, event_fdslots); +#endif + } +#if (EVENTS_STYLE != EVENTS_STYLE_SELECT) + EVENT_MASK_FREE(&zero_mask); +#endif +} + +/* event_fork - resume event processing after fork() */ + +void event_fork(void) +{ +#if (EVENTS_STYLE != EVENTS_STYLE_SELECT) + EVENT_FDTABLE *fdp; + int err; + int fd; + + /* + * No event was ever registered, so there's nothing to be done. + */ + if (EVENT_INIT_NEEDED()) + return; + + /* + * Close the existing filter handle and open a new kernel-based filter. + */ + EVENT_REG_FORK_HANDLE(err, event_fdslots); + if (err < 0) + msg_fatal("%s: %m", EVENT_REG_INIT_TEXT); + + /* + * Populate the new kernel-based filter with events that were registered + * in the parent process. + */ + for (fd = 0; fd <= event_max_fd; fd++) { + if (EVENT_MASK_ISSET(fd, &event_wmask)) { + EVENT_MASK_CLR(fd, &event_wmask); + fdp = event_fdtable + fd; + event_enable_write(fd, fdp->callback, fdp->context); + } else if (EVENT_MASK_ISSET(fd, &event_rmask)) { + EVENT_MASK_CLR(fd, &event_rmask); + fdp = event_fdtable + fd; + event_enable_read(fd, fdp->callback, fdp->context); + } + } +#endif +} + +/* event_enable_read - enable read events */ + +void event_enable_read(int fd, EVENT_NOTIFY_RDWR_FN callback, void *context) +{ + const char *myname = "event_enable_read"; + EVENT_FDTABLE *fdp; + int err; + + if (EVENT_INIT_NEEDED()) + event_init(); + + /* + * Sanity checks. + */ + if (fd < 0 || fd >= event_fdlimit) + msg_panic("%s: bad file descriptor: %d", myname, fd); + + if (msg_verbose > 2) + msg_info("%s: fd %d", myname, fd); + + if (fd >= event_fdslots) + event_extend(fd); + + /* + * Disallow mixed (i.e. read and write) requests on the same descriptor. + */ + if (EVENT_MASK_ISSET(fd, &event_wmask)) + msg_panic("%s: fd %d: read/write I/O request", myname, fd); + + /* + * Postfix 2.4 allows multiple event_enable_read() calls on the same + * descriptor without requiring event_disable_readwrite() calls between + * them. With kernel-based filters (kqueue, /dev/poll, epoll) it's + * wasteful to make system calls when we change only application + * call-back information. It has a noticeable effect on smtp-source + * performance. + */ + if (EVENT_MASK_ISSET(fd, &event_rmask) == 0) { + EVENT_MASK_SET(fd, &event_xmask); + EVENT_MASK_SET(fd, &event_rmask); + if (event_max_fd < fd) + event_max_fd = fd; +#if (EVENTS_STYLE != EVENTS_STYLE_SELECT) + EVENT_REG_ADD_READ(err, fd); + if (err < 0) + msg_fatal("%s: %s: %m", myname, EVENT_REG_ADD_TEXT); +#endif + } + fdp = event_fdtable + fd; + if (fdp->callback != callback || fdp->context != context) { + fdp->callback = callback; + fdp->context = context; + } +} + +/* event_enable_write - enable write events */ + +void event_enable_write(int fd, EVENT_NOTIFY_RDWR_FN callback, void *context) +{ + const char *myname = "event_enable_write"; + EVENT_FDTABLE *fdp; + int err; + + if (EVENT_INIT_NEEDED()) + event_init(); + + /* + * Sanity checks. + */ + if (fd < 0 || fd >= event_fdlimit) + msg_panic("%s: bad file descriptor: %d", myname, fd); + + if (msg_verbose > 2) + msg_info("%s: fd %d", myname, fd); + + if (fd >= event_fdslots) + event_extend(fd); + + /* + * Disallow mixed (i.e. read and write) requests on the same descriptor. + */ + if (EVENT_MASK_ISSET(fd, &event_rmask)) + msg_panic("%s: fd %d: read/write I/O request", myname, fd); + + /* + * Postfix 2.4 allows multiple event_enable_write() calls on the same + * descriptor without requiring event_disable_readwrite() calls between + * them. With kernel-based filters (kqueue, /dev/poll, epoll) it's + * incredibly wasteful to make unregister and register system calls when + * we change only application call-back information. It has a noticeable + * effect on smtp-source performance. + */ + if (EVENT_MASK_ISSET(fd, &event_wmask) == 0) { + EVENT_MASK_SET(fd, &event_xmask); + EVENT_MASK_SET(fd, &event_wmask); + if (event_max_fd < fd) + event_max_fd = fd; +#if (EVENTS_STYLE != EVENTS_STYLE_SELECT) + EVENT_REG_ADD_WRITE(err, fd); + if (err < 0) + msg_fatal("%s: %s: %m", myname, EVENT_REG_ADD_TEXT); +#endif + } + fdp = event_fdtable + fd; + if (fdp->callback != callback || fdp->context != context) { + fdp->callback = callback; + fdp->context = context; + } +} + +/* event_disable_readwrite - disable request for read or write events */ + +void event_disable_readwrite(int fd) +{ + const char *myname = "event_disable_readwrite"; + EVENT_FDTABLE *fdp; + int err; + + if (EVENT_INIT_NEEDED()) + event_init(); + + /* + * Sanity checks. + */ + if (fd < 0 || fd >= event_fdlimit) + msg_panic("%s: bad file descriptor: %d", myname, fd); + + if (msg_verbose > 2) + msg_info("%s: fd %d", myname, fd); + + /* + * Don't complain when there is nothing to cancel. The request may have + * been canceled from another thread. + */ + if (fd >= event_fdslots) + return; +#if (EVENTS_STYLE != EVENTS_STYLE_SELECT) +#ifdef EVENT_REG_DEL_BOTH + /* XXX Can't seem to disable READ and WRITE events selectively. */ + if (EVENT_MASK_ISSET(fd, &event_rmask) + || EVENT_MASK_ISSET(fd, &event_wmask)) { + EVENT_REG_DEL_BOTH(err, fd); + if (err < 0) + msg_fatal("%s: %s: %m", myname, EVENT_REG_DEL_TEXT); + } +#else + if (EVENT_MASK_ISSET(fd, &event_rmask)) { + EVENT_REG_DEL_READ(err, fd); + if (err < 0) + msg_fatal("%s: %s: %m", myname, EVENT_REG_DEL_TEXT); + } else if (EVENT_MASK_ISSET(fd, &event_wmask)) { + EVENT_REG_DEL_WRITE(err, fd); + if (err < 0) + msg_fatal("%s: %s: %m", myname, EVENT_REG_DEL_TEXT); + } +#endif /* EVENT_REG_DEL_BOTH */ +#endif /* != EVENTS_STYLE_SELECT */ + EVENT_MASK_CLR(fd, &event_xmask); + EVENT_MASK_CLR(fd, &event_rmask); + EVENT_MASK_CLR(fd, &event_wmask); + fdp = event_fdtable + fd; + fdp->callback = 0; + fdp->context = 0; +} + +/* event_request_timer - (re)set timer */ + +time_t event_request_timer(EVENT_NOTIFY_TIME_FN callback, void *context, int delay) +{ + const char *myname = "event_request_timer"; + RING *ring; + EVENT_TIMER *timer; + + if (EVENT_INIT_NEEDED()) + event_init(); + + /* + * Sanity checks. + */ + if (delay < 0) + msg_panic("%s: invalid delay: %d", myname, delay); + + /* + * Make sure we schedule this event at the right time. + */ + time(&event_present); + + /* + * See if they are resetting an existing timer request. If so, take the + * request away from the timer queue so that it can be inserted at the + * right place. + */ + FOREACH_QUEUE_ENTRY(ring, &event_timer_head) { + timer = RING_TO_TIMER(ring); + if (timer->callback == callback && timer->context == context) { + timer->when = event_present + delay; + timer->loop_instance = event_loop_instance; + ring_detach(ring); + if (msg_verbose > 2) + msg_info("%s: reset 0x%lx 0x%lx %d", myname, + (long) callback, (long) context, delay); + break; + } + } + + /* + * If not found, schedule a new timer request. + */ + if (ring == &event_timer_head) { + timer = (EVENT_TIMER *) mymalloc(sizeof(EVENT_TIMER)); + timer->when = event_present + delay; + timer->callback = callback; + timer->context = context; + timer->loop_instance = event_loop_instance; + if (msg_verbose > 2) + msg_info("%s: set 0x%lx 0x%lx %d", myname, + (long) callback, (long) context, delay); + } + + /* + * Timer requests are kept sorted to reduce lookup overhead in the event + * loop. + * + * XXX Append the new request after existing requests for the same time + * slot. The event_loop() routine depends on this to avoid starving I/O + * events when a call-back function schedules a zero-delay timer request. + */ + FOREACH_QUEUE_ENTRY(ring, &event_timer_head) { + if (timer->when < RING_TO_TIMER(ring)->when) + break; + } + ring_prepend(ring, &timer->ring); + + return (timer->when); +} + +/* event_cancel_timer - cancel timer */ + +int event_cancel_timer(EVENT_NOTIFY_TIME_FN callback, void *context) +{ + const char *myname = "event_cancel_timer"; + RING *ring; + EVENT_TIMER *timer; + int time_left = -1; + + if (EVENT_INIT_NEEDED()) + event_init(); + + /* + * See if they are canceling an existing timer request. Do not complain + * when the request is not found. It might have been canceled from some + * other thread. + */ + FOREACH_QUEUE_ENTRY(ring, &event_timer_head) { + timer = RING_TO_TIMER(ring); + if (timer->callback == callback && timer->context == context) { + if ((time_left = timer->when - event_present) < 0) + time_left = 0; + ring_detach(ring); + myfree((void *) timer); + break; + } + } + if (msg_verbose > 2) + msg_info("%s: 0x%lx 0x%lx %d", myname, + (long) callback, (long) context, time_left); + return (time_left); +} + +/* event_loop - wait for the next event */ + +void event_loop(int delay) +{ + const char *myname = "event_loop"; + static int nested; + +#if (EVENTS_STYLE == EVENTS_STYLE_SELECT) + fd_set rmask; + fd_set wmask; + fd_set xmask; + struct timeval tv; + struct timeval *tvp; + int new_max_fd; + +#else + EVENT_BUFFER event_buf[100]; + EVENT_BUFFER *bp; + +#endif + int event_count; + EVENT_TIMER *timer; + int fd; + EVENT_FDTABLE *fdp; + int select_delay; + + if (EVENT_INIT_NEEDED()) + event_init(); + + /* + * XXX Also print the select() masks? + */ + if (msg_verbose > 2) { + RING *ring; + + FOREACH_QUEUE_ENTRY(ring, &event_timer_head) { + timer = RING_TO_TIMER(ring); + msg_info("%s: time left %3d for 0x%lx 0x%lx", myname, + (int) (timer->when - event_present), + (long) timer->callback, (long) timer->context); + } + } + + /* + * Find out when the next timer would go off. Timer requests are sorted. + * If any timer is scheduled, adjust the delay appropriately. + */ + if ((timer = FIRST_TIMER(&event_timer_head)) != 0) { + event_present = time((time_t *) 0); + if ((select_delay = timer->when - event_present) < 0) { + select_delay = 0; + } else if (delay >= 0 && select_delay > delay) { + select_delay = delay; + } + } else { + select_delay = delay; + } + if (msg_verbose > 2) + msg_info("event_loop: select_delay %d", select_delay); + + /* + * Negative delay means: wait until something happens. Zero delay means: + * poll. Positive delay means: wait at most this long. + */ +#if (EVENTS_STYLE == EVENTS_STYLE_SELECT) + if (select_delay < 0) { + tvp = 0; + } else { + tvp = &tv; + tv.tv_usec = 0; + tv.tv_sec = select_delay; + } + + /* + * Pause until the next event happens. When select() has a problem, don't + * go into a tight loop. Allow select() to be interrupted due to the + * arrival of a signal. + */ + rmask = event_rmask; + wmask = event_wmask; + xmask = event_xmask; + + event_count = select(event_max_fd + 1, &rmask, &wmask, &xmask, tvp); + if (event_count < 0) { + if (errno != EINTR) + msg_fatal("event_loop: select: %m"); + return; + } +#else + EVENT_BUFFER_READ(event_count, event_buf, + sizeof(event_buf) / sizeof(event_buf[0]), + select_delay); + if (event_count < 0) { + if (errno != EINTR) + msg_fatal("event_loop: " EVENT_BUFFER_READ_TEXT ": %m"); + return; + } +#endif + + /* + * Before entering the application call-back routines, make sure we + * aren't being called from a call-back routine. Doing so would make us + * vulnerable to all kinds of race conditions. + */ + if (nested++ > 0) + msg_panic("event_loop: recursive call"); + + /* + * Deliver timer events. Allow the application to add/delete timer queue + * requests while it is being called back. Requests are sorted: we keep + * running over the timer request queue from the start, and stop when we + * reach the future or the list end. We also stop when we reach a timer + * request that was added by a call-back that was invoked from this + * event_loop() call instance, for reasons that are explained below. + * + * To avoid dangling pointer problems 1) we must remove a request from the + * timer queue before delivering its event to the application and 2) we + * must look up the next timer request *after* calling the application. + * The latter complicates the handling of zero-delay timer requests that + * are added by event_loop() call-back functions. + * + * XXX When a timer event call-back function adds a new timer request, + * event_request_timer() labels the request with the event_loop() call + * instance that invoked the timer event call-back. We use this instance + * label here to prevent zero-delay timer requests from running in a + * tight loop and starving I/O events. To make this solution work, + * event_request_timer() appends a new request after existing requests + * for the same time slot. + */ + event_present = time((time_t *) 0); + event_loop_instance += 1; + + while ((timer = FIRST_TIMER(&event_timer_head)) != 0) { + if (timer->when > event_present) + break; + if (timer->loop_instance == event_loop_instance) + break; + ring_detach(&timer->ring); /* first this */ + if (msg_verbose > 2) + msg_info("%s: timer 0x%lx 0x%lx", myname, + (long) timer->callback, (long) timer->context); + timer->callback(EVENT_TIME, timer->context); /* then this */ + myfree((void *) timer); + } + + /* + * Deliver I/O events. Allow the application to cancel event requests + * while it is being called back. To this end, we keep an eye on the + * contents of event_xmask, so that we deliver only events that are still + * wanted. We do not change the event request masks. It is up to the + * application to determine when a read or write is complete. + */ +#if (EVENTS_STYLE == EVENTS_STYLE_SELECT) + if (event_count > 0) { + for (new_max_fd = 0, fd = 0; fd <= event_max_fd; fd++) { + if (FD_ISSET(fd, &event_xmask)) { + new_max_fd = fd; + /* In case event_fdtable is updated. */ + fdp = event_fdtable + fd; + if (FD_ISSET(fd, &xmask)) { + if (msg_verbose > 2) + msg_info("%s: exception fd=%d act=0x%lx 0x%lx", myname, + fd, (long) fdp->callback, (long) fdp->context); + fdp->callback(EVENT_XCPT, fdp->context); + } else if (FD_ISSET(fd, &wmask)) { + if (msg_verbose > 2) + msg_info("%s: write fd=%d act=0x%lx 0x%lx", myname, + fd, (long) fdp->callback, (long) fdp->context); + fdp->callback(EVENT_WRITE, fdp->context); + } else if (FD_ISSET(fd, &rmask)) { + if (msg_verbose > 2) + msg_info("%s: read fd=%d act=0x%lx 0x%lx", myname, + fd, (long) fdp->callback, (long) fdp->context); + fdp->callback(EVENT_READ, fdp->context); + } + } + } + event_max_fd = new_max_fd; + } +#else + for (bp = event_buf; bp < event_buf + event_count; bp++) { + fd = EVENT_GET_FD(bp); + if (fd < 0 || fd > event_max_fd) + msg_panic("%s: bad file descriptor: %d", myname, fd); + if (EVENT_MASK_ISSET(fd, &event_xmask)) { + fdp = event_fdtable + fd; + if (EVENT_TEST_READ(bp)) { + if (msg_verbose > 2) + msg_info("%s: read fd=%d act=0x%lx 0x%lx", myname, + fd, (long) fdp->callback, (long) fdp->context); + fdp->callback(EVENT_READ, fdp->context); + } else if (EVENT_TEST_WRITE(bp)) { + if (msg_verbose > 2) + msg_info("%s: write fd=%d act=0x%lx 0x%lx", myname, + fd, (long) fdp->callback, + (long) fdp->context); + fdp->callback(EVENT_WRITE, fdp->context); + } else { + if (msg_verbose > 2) + msg_info("%s: other fd=%d act=0x%lx 0x%lx", myname, + fd, (long) fdp->callback, (long) fdp->context); + fdp->callback(EVENT_XCPT, fdp->context); + } + } + } +#endif + nested--; +} + +#ifdef TEST + + /* + * Proof-of-concept test program for the event manager. Schedule a series of + * events at one-second intervals and let them happen, while echoing any + * lines read from stdin. + */ +#include <stdio.h> +#include <ctype.h> +#include <stdlib.h> + +/* timer_event - display event */ + +static void timer_event(int unused_event, void *context) +{ + printf("%ld: %s\n", (long) event_present, context); + fflush(stdout); +} + +/* echo - echo text received on stdin */ + +static void echo(int unused_event, void *unused_context) +{ + char buf[BUFSIZ]; + + if (fgets(buf, sizeof(buf), stdin) == 0) + exit(0); + printf("Result: %s", buf); +} + +/* request - request a bunch of timer events */ + +static void request(int unused_event, void *unused_context) +{ + event_request_timer(timer_event, "3 first", 3); + event_request_timer(timer_event, "3 second", 3); + event_request_timer(timer_event, "4 first", 4); + event_request_timer(timer_event, "4 second", 4); + event_request_timer(timer_event, "2 first", 2); + event_request_timer(timer_event, "2 second", 2); + event_request_timer(timer_event, "1 first", 1); + event_request_timer(timer_event, "1 second", 1); + event_request_timer(timer_event, "0 first", 0); + event_request_timer(timer_event, "0 second", 0); +} + +int main(int argc, void **argv) +{ + if (argv[1]) + msg_verbose = atoi(argv[1]); + event_request_timer(request, (void *) 0, 0); + event_enable_read(fileno(stdin), echo, (void *) 0); + event_drain(10); + exit(0); +} + +#endif |