#include #include #include /* * Substitute poll(2) function using POSIX real time signals. * * The poll(2) system call often has significant latencies and realtime * impacts (probably because of its variable length argument list). * * These functions let us use real time signals and sigtimedwait(2) instead * of poll - for those files which work with real time signals. * In the 2.4 series of Linux kernels, this does *not* include FIFOs. * * NOTE: We (have to) grab the SIGPOLL signal for our own purposes. * Hope that's OK with you... * * Special caution: We can only incompletely simulate the difference between * the level-triggered interface of poll(2) and the edge-triggered behavior * of I/O signals. As a result you *must* read all previously-indicated * incoming data before calling cl_poll() again. Otherwise you may miss * some incoming data (and possibly hang). * * * Copyright (C) 2003 IBM Corporation * * Author: * * This software licensed under the GNU LGPL. * * * This library is free software; you can redistribute it and/or * modify it under the terms of version 2.1 of the GNU Lesser General Public * License as published by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * **************************************************************************/ #define __USE_GNU 1 # include #undef __USE_GNU #include #include #include #include #include #include /* Turn on to log odd realtime behavior */ #define TIME_CALLS 1 #ifdef TIME_CALLS # include # include #endif static int debug = 0; int /* Slightly sleazy... */ cl_glibpoll(GPollFD* ufds, guint nfsd, gint timeout) { (void)debug; return cl_poll((struct pollfd*)ufds, nfsd, timeout); } #if defined (F_SETSIG) && defined(F_SETOWN) && defined (O_ASYNC) # define HAVE_FCNTL_F_SETSIG #endif #ifndef HAVE_FCNTL_F_SETSIG /* * Dummy cl_poll() and cl_poll_ignore() functions for systems where * we don't have all the support we need. */ int cl_poll(struct pollfd *fds, unsigned int nfds, int timeout) { return poll(fds, (nfds_t)nfds, timeout); } int cl_poll_ignore(int fd) { return 0; } #else /* HAVE_FCNTL_F_SETSIG */ static void dump_fd_info(struct pollfd *fds, unsigned int nfds, int timeoutms); static void check_fd_info(struct pollfd *fds, unsigned int nfds); static void cl_real_poll_fd(int fd); static void cl_poll_sigpoll_overflow_sigaction(int nsig, siginfo_t* , void*); static void cl_poll_sigpoll_overflow(void); static int cl_poll_get_sigqlimit(void); typedef unsigned char poll_bool; /* * Here's our strategy: * We have a set of signals which we use for these file descriptors, * and we use sigtimedwait(2) to wait on information from these various * signals. * * If we are ever asked to wait for a particular signal, then we will * enable signals for that file descriptor, and post the events in * our own cache. The next time you include that signal in a call * to cl_poll(), you will get the information delivered * to you in your cl_poll() call. * * If you want to stop monitoring a particular file descriptor, use * cl_poll_ignore() for that purpose. Doing this is a good idea, but * not fatal if omitted... */ /* Information about a file descriptor we're monitoring */ typedef struct poll_fd_info_s { short nsig; /* Which signal goes with it? */ short pendevents; /* Pending events */ }poll_info_t; static int max_allocated = 0; static poll_bool* is_monitored = NULL; /* Sized by max_allocated */ static poll_info_t* monitorinfo = NULL; /* Sized by max_allocated */ static int cl_nsig = 0; static gboolean SigQOverflow = FALSE; static int cl_init_poll_sig(struct pollfd *fds, unsigned int nfds); static short cl_poll_assignsig(int fd); static void cl_poll_sigaction(int nsig, siginfo_t* info, void* v); static int cl_poll_prepsig(int nsig); /* * SignalSet is the set of all file descriptors we're monitoring. * * We monitor a file descriptor forever, unless you tell us not to * by calling cl_poll_ignore(), or you (mistakenly) give it to * us to look at in another poll call after you've closed it. */ static sigset_t SignalSet; /* Select the signal you want us to use (must be a RT signal) */ int cl_poll_setsig(int nsig) { if (nsig < SIGRTMIN || nsig >= SIGRTMAX) { errno = EINVAL; return -1; } if (cl_poll_prepsig(nsig) < 0) { return -1; } cl_nsig = nsig; return 0; } /* * It's harmless to call us multiple times on the same signal. */ static int cl_poll_prepsig(int nsig) { static gboolean setinityet=FALSE; if (!setinityet) { CL_SIGEMPTYSET(&SignalSet); cl_signal_set_simple_action(SIGPOLL , cl_poll_sigpoll_overflow_sigaction , NULL); setinityet = TRUE; } if (CL_SIGINTERRUPT(nsig, FALSE) < 0) { cl_perror("sig_interrupt(%d, FALSE)", nsig); return -1; } if (CL_SIGADDSET(&SignalSet, nsig) < 0) { cl_perror("sig_addset(&SignalSet, %d)", nsig); return -1; } if (CL_SIGPROCMASK(SIG_BLOCK, &SignalSet, NULL) < 0) { cl_perror("sig_sigprocmask(SIG_BLOCK, sig %d)", nsig); return -1; } if (debug) { cl_log(LOG_DEBUG , "Signal %d belongs to us...", nsig); cl_log(LOG_DEBUG, "cl_poll_prepsig(%d) succeeded.", nsig); } return 0; } #define FD_CHUNKSIZE 64 /* Set of events everyone must monitor whether they want to or not ;-) */ #define CONSTEVENTS (POLLHUP|POLLERR|POLLNVAL) #define RECORDFDEVENT(fd, flags) (monitorinfo[fd].pendevents |= (flags)) /* * Initialized our poll-simulation data structures. * This means (among other things) registering any monitored * file descriptors. */ static int cl_init_poll_sig(struct pollfd *fds, unsigned int nfds) { unsigned j; int maxmonfd = -1; int nmatch = 0; if (cl_nsig == 0) { cl_nsig = ((SIGRTMIN+SIGRTMAX)/2); if (cl_poll_setsig(cl_nsig) < 0) { return -1; } } for (j=0; j < nfds; ++j) { const int fd = fds[j].fd; if (fd > maxmonfd) { maxmonfd = fd; } } /* See if we need to malloc/realloc our data structures */ if (maxmonfd >= max_allocated) { int newsize; int growthamount; newsize = ((maxmonfd + FD_CHUNKSIZE)/FD_CHUNKSIZE) * FD_CHUNKSIZE; growthamount = newsize - max_allocated; /* This can't happen ;-) */ if (growthamount <= 0 || newsize <= maxmonfd) { errno = EINVAL; return -1; } /* Allocate (more) memory! */ if ((is_monitored = (poll_bool*)realloc(is_monitored , newsize * sizeof(poll_bool))) == NULL || (monitorinfo = (poll_info_t*) realloc(monitorinfo , newsize * sizeof(poll_info_t))) == NULL) { if (is_monitored) { free(is_monitored); is_monitored = NULL; } if (monitorinfo) { free(monitorinfo); monitorinfo = NULL; } max_allocated = 0; errno = ENOMEM; return -1; } memset(monitorinfo+max_allocated, 0 , growthamount * sizeof(monitorinfo[0])); memset(is_monitored+max_allocated, FALSE , growthamount*sizeof(is_monitored[0])); max_allocated = newsize; } if (fds->events != 0 && debug) { cl_log(LOG_DEBUG , "Current event mask for fd [0] {%d} 0x%x" , fds->fd, fds->events); } /* * Examine each fd for the following things: * Is it already monitored? * if not, set it up for monitoring. * Do we have events for it? * if so, post events... */ for (j=0; j < nfds; ++j) { const int fd = fds[j].fd; poll_info_t* moni = monitorinfo+fd; short nsig; int badfd = FALSE; is_monitored[fd] = TRUE; if (moni->nsig <= 0) { nsig = cl_poll_assignsig(fd); if (nsig < 0) { RECORDFDEVENT(fd, POLLERR); badfd = TRUE; }else{ /* Use real poll(2) to get initial * event status */ moni->nsig = nsig; cl_real_poll_fd(fd); } }else if (fcntl(fd, F_GETFD) < 0) { cl_log(LOG_ERR, "bad fd(%d)", fd); RECORDFDEVENT(fd, POLLNVAL); badfd = TRUE; } /* Look for pending events... */ fds[j].revents = (moni->pendevents & (fds[j].events|CONSTEVENTS)); if (fds[j].revents) { ++nmatch; moni->pendevents &= ~(fds[j].revents); if (debug) { cl_log(LOG_DEBUG , "revents for fd %d: 0x%x" , fds[j].fd, fds[j].revents); cl_log(LOG_DEBUG , "events for fd %d: 0x%x" , fds[j].fd, fds[j].events); } }else if (fds[j].events && debug) { cl_log(LOG_DEBUG , "pendevents for fd %d: 0x%x" , fds[j].fd, moni->pendevents); } if (badfd) { cl_poll_ignore(fd); } } if (nmatch != 0 && debug) { cl_log(LOG_DEBUG, "Returning %d events from cl_init_poll_sig()" , nmatch); } return nmatch; } /* * Initialize our current state of the world with info from the * real poll(2) call. * * We call this when we first see a particular fd, and after a signal * queue overflow. */ static void cl_real_poll_fd(int fd) { struct pollfd pfd[1]; if (fd >= max_allocated || !is_monitored[fd]) { return; } if (debug) { cl_log(LOG_DEBUG , "Calling poll(2) on fd %d", fd); } /* Get the current state of affaris from poll(2) */ pfd[0].fd = fd; pfd[0].revents = 0; pfd[0].events = ~0; if (poll(pfd, 1, 0) >= 0) { RECORDFDEVENT(fd, pfd[0].revents); if (pfd[0].revents & (POLLNVAL|POLLERR)) { cl_log(LOG_INFO, "cl_poll_real_fd(%d): error in revents [%d]" , fd, pfd[0].revents); } if (debug) { cl_log(LOG_DEBUG , "Old news from poll(2) for fd %d: 0x%x" , fd, pfd[0].revents); } }else{ if (fcntl(fd, F_GETFL) < 0) { cl_perror("cl_poll_real_fd(%d): F_GETFL failure" , fd); RECORDFDEVENT(fd, POLLNVAL); }else{ RECORDFDEVENT(fd, POLLERR); } } } /* * Assign a signal for monitoring the given file descriptor */ static short cl_poll_assignsig(int fd) { int flags; if (debug) { cl_log(LOG_DEBUG , "Signal %d monitors fd %d...", cl_nsig, fd); } /* Test to see if the file descriptor is good */ if ((flags = fcntl(fd, F_GETFL)) < 0) { cl_perror("cl_poll_assignsig(%d) F_GETFL failure" , fd); return -1; } /* Associate the right signal with the fd */ if (fcntl(fd, F_SETSIG, cl_nsig) < 0) { cl_perror("cl_poll_assignsig(%d) F_SETSIG failure" , fd); return -1; } /* Direct the signals to us */ if (fcntl(fd, F_SETOWN, getpid()) < 0) { cl_perror("cl_poll_assignsig(%d) F_SETOWN failure", fd); return -1; } /* OK... Go ahead and send us signals! */ if (fcntl(fd, F_SETFL, flags|O_ASYNC) < 0) { cl_perror("cl_poll_assignsig(%d) F_SETFL(O_ASYNC) failure" , fd); return -1; } return cl_nsig; } /* * This is a function we call as a (fake) signal handler. * * It records events to our "monitorinfo" structure. * * Except for the cl_log() call, it could be called in a signal * context. */ static void cl_poll_sigaction(int nsig, siginfo_t* info, void* v) { int fd; /* What do you suppose all the various si_code values mean? */ fd = info->si_fd; if (debug) { cl_log(LOG_DEBUG , "cl_poll_sigaction(nsig=%d fd=%d" ", si_code=%d si_band=0x%lx)" , nsig, fd, info->si_code , (unsigned long)info->si_band); } if (fd <= 0) { return; } if (fd >= max_allocated || !is_monitored[fd]) { return; } /* We should not call logging functions in (real) signal handlers */ if (nsig != monitorinfo[fd].nsig) { cl_log(LOG_ERR, "cl_poll_sigaction called with signal %d/%d" , nsig, monitorinfo[fd].nsig); } /* Record everything as a pending event. */ RECORDFDEVENT(fd, info->si_band); } /* * This is called whenever a file descriptor shouldn't be * monitored any more. */ int cl_poll_ignore(int fd) { int flags; if (debug) { cl_log(LOG_DEBUG , "cl_poll_ignore(%d)", fd); } if (fd < 0 || fd >= max_allocated) { errno = EINVAL; return -1; } if (!is_monitored[fd]) { return 0; } is_monitored[fd] = FALSE; memset(monitorinfo+fd, 0, sizeof(monitorinfo[0])); if ((flags = fcntl(fd, F_GETFL)) >= 0) { flags &= ~O_ASYNC; if (fcntl(fd, F_SETFL, flags) < 0) { return -1; } }else{ return flags; } return 0; } /* * cl_poll: fake poll routine based on POSIX realtime signals. * * We want to emulate poll as exactly as possible, but poll has a couple * of problems: scaleability, and it tends to sleep in the kernel * because the first argument is an argument of arbitrary size, and * generally requires allocating memory. * * The challenge is that poll is level-triggered, but the POSIX * signals (and sigtimedwait(2)) are edge triggered. This is * one of the reasons why we have the cl_real_poll_fd() function * - to get the current "level" before we start. * Once we have this level we can compute something like the current * level */ int cl_poll(struct pollfd *fds, unsigned int nfds, int timeoutms) { int nready; struct timespec ts; static const struct timespec zerotime = {0L, 0L}; const struct timespec* itertime = &ts; siginfo_t info; int eventcount = 0; unsigned int j; int savederrno = errno; int stw_errno; int rc; longclock_t starttime; longclock_t endtime; const int msfudge = 2* 1000/hz_longclock(); int mselapsed = 0; /* Do we have any old news to report? */ if ((nready=cl_init_poll_sig(fds, nfds)) != 0) { /* Return error or old news to report */ if (debug) { cl_log(LOG_DEBUG, "cl_poll: early return(%d)", nready); } return nready; } /* Nothing to report yet... */ /* So, we'll do a sigtimedwait(2) to wait for signals * and see if we can find something to report... * * cl_init_poll() prepared a set of file signals to watch... */ recalcandwaitagain: if (timeoutms >= 0) { ts.tv_sec = timeoutms / 1000; ts.tv_nsec = (((unsigned long)timeoutms) % 1000UL)*1000000UL; }else{ ts.tv_sec = G_MAXLONG; ts.tv_nsec = 99999999UL; } /* * Perform a timed wait for any of our signals... * * We shouldn't sleep for any call but (possibly) the first one. * Subsequent calls should just pick up other events without * sleeping. */ starttime = time_longclock(); /* * Wait up to the prescribed time for a signal. * If we get a signal, then loop grabbing all other * pending signals. Note that subsequent iterations will * use &zerotime to get the minimum wait time. */ if (debug) { check_fd_info(fds, nfds); dump_fd_info(fds, nfds, timeoutms); } waitagain: while (sigtimedwait(&SignalSet, &info, itertime) >= 0) { int nsig = info.si_signo; /* Call signal handler to simulate signal reception */ cl_poll_sigaction(nsig, &info, NULL); itertime = &zerotime; } stw_errno=errno; /* Save errno for later use */ endtime = time_longclock(); mselapsed = longclockto_ms(sub_longclock(endtime, starttime)); #ifdef TIME_CALLS if (timeoutms >= 0 && mselapsed > timeoutms + msfudge) { /* We slept too long... */ cl_log(LOG_WARNING , "sigtimedwait() sequence for %d ms took %d ms" , timeoutms, mselapsed); } #endif if (SigQOverflow) { /* OOPS! Better recover from this! */ /* This will use poll(2) to correct our current status */ cl_poll_sigpoll_overflow(); } /* Post observed events and count them... */ for (j=0; j < nfds; ++j) { int fd = fds[j].fd; poll_info_t* moni = monitorinfo+fd; fds[j].revents = (moni->pendevents & (fds[j].events|CONSTEVENTS)); if (fds[j].revents) { ++eventcount; moni->pendevents &= ~(fds[j].revents); /* Make POLLHUP persistent */ if (fds[j].revents & POLLHUP) { moni->pendevents |= POLLHUP; /* Don't lose input events at EOF */ if (fds[j].events & POLLIN) { cl_real_poll_fd(fds[j].fd); } } } } if (eventcount == 0 && stw_errno == EAGAIN && timeoutms != 0) { /* We probably saw an event the user didn't ask to see. */ /* Consquently, we may have more waiting to do */ if (timeoutms < 0) { /* Restore our infinite wait time */ itertime = &ts; goto waitagain; }else if (timeoutms > 0) { if (mselapsed < timeoutms) { timeoutms -= mselapsed; goto recalcandwaitagain; } } } rc = (eventcount > 0 ? eventcount : (stw_errno == EAGAIN ? 0 : -1)); if (rc >= 0) { errno = savederrno; } return rc; } /* * Debugging routine for printing current poll arguments, etc. */ static void dump_fd_info(struct pollfd *fds, unsigned int nfds, int timeoutms) { unsigned j; cl_log(LOG_DEBUG, "timeout: %d milliseconds", timeoutms); for (j=0; j < nfds; ++j) { int fd = fds[j].fd; poll_info_t* moni = monitorinfo+fd; cl_log(LOG_DEBUG, "fd %d flags: 0%o, signal: %d, events: 0x%x" ", revents: 0x%x, pendevents: 0x%x" , fd, fcntl(fd, F_GETFL), moni->nsig , fds[j].events, fds[j].revents, moni->pendevents); } for (j=SIGRTMIN; j < (unsigned)SIGRTMAX; ++j) { if (!sigismember(&SignalSet, j)) { continue; } cl_log(LOG_DEBUG, "Currently monitoring RT signal %d", j); } } /* * Debugging routine for auditing our file descriptors, etc. */ static void check_fd_info(struct pollfd *fds, unsigned int nfds) { unsigned j; for (j=0; j < nfds; ++j) { int fd = fds[j].fd; poll_info_t* moni = monitorinfo+fd; if (!sigismember(&SignalSet, moni->nsig)) { cl_log(LOG_ERR, "SIGNAL %d not in monitored SignalSet" , moni->nsig); } } for (j=0; j < 10; ++j) { int sig; int flags; int pid; if ((flags = fcntl(j, F_GETFL)) < 0 || (flags & O_ASYNC) ==0){ continue; } sig = fcntl(j, F_GETSIG); if (sig == 0) { cl_log(LOG_ERR, "FD %d will get SIGIO", j); } if (!sigismember(&SignalSet, sig)) { cl_log(LOG_ERR, "FD %d (signal %d) is not in SignalSet" , j, sig); } if (sig < SIGRTMIN || sig >= SIGRTMAX) { cl_log(LOG_ERR, "FD %d (signal %d) is not RealTime" , j, sig); } pid = fcntl(j, F_GETOWN); if (pid != getpid()) { cl_log(LOG_ERR, "FD %d (signal %d) owner is pid %d" , j, sig, pid); } } } /* Note that the kernel signalled an event queue overflow */ static void cl_poll_sigpoll_overflow_sigaction(int nsig, siginfo_t* info, void* v) { SigQOverflow = TRUE; } #define MAXQNAME "rtsig-max" /* * Called when signal queue overflow is suspected. * We then use poll(2) to get the current data. It's slow, but it * should work quite nicely. */ static void cl_poll_sigpoll_overflow(void) { int fd; int limit; if (!SigQOverflow) { return; } cl_log(LOG_WARNING, "System signal queue overflow."); limit = cl_poll_get_sigqlimit(); if (limit > 0) { cl_log(LOG_WARNING, "Increase '%s'. Current limit is %d" " (see sysctl(8)).", MAXQNAME, limit); } SigQOverflow = FALSE; for (fd = 0; fd < max_allocated; ++fd) { if (is_monitored[fd]) { cl_real_poll_fd(fd); } } } #define PSK "/proc/sys/kernel/" /* Get current kernel signal queue limit */ /* This only works on Linux - but that's not a big problem... */ static int cl_poll_get_sigqlimit(void) { int limit = -1; int pfd; char result[32]; pfd = open(PSK MAXQNAME, O_RDONLY); if (pfd >= 0 && read(pfd, result, sizeof(result)) > 1) { limit = atoi(result); if (limit < 1) { limit = -1; } } if (pfd >= 0) { close(pfd); } return limit; } #endif /* HAVE_FCNTL_F_SETSIG */