Diffstat (limited to 'src/libsystemd/sd-event')
-rw-r--r--   src/libsystemd/sd-event/event-source.h    227
-rw-r--r--   src/libsystemd/sd-event/event-util.c       148
-rw-r--r--   src/libsystemd/sd-event/event-util.h        34
-rw-r--r--   src/libsystemd/sd-event/sd-event.c        4676
-rw-r--r--   src/libsystemd/sd-event/test-event.c       812
5 files changed, 5897 insertions, 0 deletions
diff --git a/src/libsystemd/sd-event/event-source.h b/src/libsystemd/sd-event/event-source.h
new file mode 100644
index 0000000..6092652
--- /dev/null
+++ b/src/libsystemd/sd-event/event-source.h
@@ -0,0 +1,227 @@
+#pragma once
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/epoll.h>
+#include <sys/timerfd.h>
+#include <sys/wait.h>
+
+#include "sd-event.h"
+
+#include "hashmap.h"
+#include "inotify-util.h"
+#include "list.h"
+#include "prioq.h"
+#include "ratelimit.h"
+
+typedef enum EventSourceType {
+ SOURCE_IO,
+ SOURCE_TIME_REALTIME,
+ SOURCE_TIME_BOOTTIME,
+ SOURCE_TIME_MONOTONIC,
+ SOURCE_TIME_REALTIME_ALARM,
+ SOURCE_TIME_BOOTTIME_ALARM,
+ SOURCE_SIGNAL,
+ SOURCE_CHILD,
+ SOURCE_DEFER,
+ SOURCE_POST,
+ SOURCE_EXIT,
+ SOURCE_WATCHDOG,
+ SOURCE_INOTIFY,
+ _SOURCE_EVENT_SOURCE_TYPE_MAX,
+ _SOURCE_EVENT_SOURCE_TYPE_INVALID = -EINVAL,
+} EventSourceType;
+
+/* All objects we use in epoll events start with this value, so that
+ * we know how to dispatch it */
+typedef enum WakeupType {
+ WAKEUP_NONE,
+ WAKEUP_EVENT_SOURCE, /* either I/O or pidfd wakeup */
+ WAKEUP_CLOCK_DATA,
+ WAKEUP_SIGNAL_DATA,
+ WAKEUP_INOTIFY_DATA,
+ _WAKEUP_TYPE_MAX,
+ _WAKEUP_TYPE_INVALID = -EINVAL,
+} WakeupType;
+
+struct inode_data;
+
+struct sd_event_source {
+ WakeupType wakeup;
+
+ unsigned n_ref;
+
+ sd_event *event;
+ void *userdata;
+ sd_event_handler_t prepare;
+
+ char *description;
+
+ EventSourceType type;
+ signed int enabled:3;
+ bool pending:1;
+ bool dispatching:1;
+ bool floating:1;
+ bool exit_on_failure:1;
+ bool ratelimited:1;
+
+ int64_t priority;
+ unsigned pending_index;
+ unsigned prepare_index;
+ uint64_t pending_iteration;
+ uint64_t prepare_iteration;
+
+ sd_event_destroy_t destroy_callback;
+ sd_event_handler_t ratelimit_expire_callback;
+
+ LIST_FIELDS(sd_event_source, sources);
+
+ RateLimit rate_limit;
+
+ /* These are primarily fields relevant for time event sources, but since any event source can
+ * effectively become one when rate-limited, this is part of the common fields. */
+ unsigned earliest_index;
+ unsigned latest_index;
+
+ union {
+ struct {
+ sd_event_io_handler_t callback;
+ int fd;
+ uint32_t events;
+ uint32_t revents;
+ bool registered:1;
+ bool owned:1;
+ } io;
+ struct {
+ sd_event_time_handler_t callback;
+ usec_t next, accuracy;
+ } time;
+ struct {
+ sd_event_signal_handler_t callback;
+ struct signalfd_siginfo siginfo;
+ int sig;
+ bool unblock;
+ } signal;
+ struct {
+ sd_event_child_handler_t callback;
+ siginfo_t siginfo;
+ pid_t pid;
+ int options;
+ int pidfd;
+ bool registered:1; /* whether the pidfd is registered in the epoll */
+ bool pidfd_owned:1; /* close pidfd when event source is freed */
+ bool process_owned:1; /* kill+reap process when event source is freed */
+ bool exited:1; /* true if process exited (i.e. if there's value in SIGKILLing it if we want to get rid of it) */
+ bool waited:1; /* true if process was waited for (i.e. if there's value in waitid(P_PID)'ing it if we want to get rid of it) */
+ } child;
+ struct {
+ sd_event_handler_t callback;
+ } defer;
+ struct {
+ sd_event_handler_t callback;
+ } post;
+ struct {
+ sd_event_handler_t callback;
+ unsigned prioq_index;
+ } exit;
+ struct {
+ sd_event_inotify_handler_t callback;
+ uint32_t mask;
+ struct inode_data *inode_data;
+ LIST_FIELDS(sd_event_source, by_inode_data);
+ } inotify;
+ };
+};
+
+struct clock_data {
+ WakeupType wakeup;
+ int fd;
+
+ /* For all clocks we maintain two priority queues each, one
+ * ordered for the earliest times the events may be
+ * dispatched, and one ordered by the latest times they must
+ * have been dispatched. The range between the top entries in
+ * the two prioqs is the time window we can freely schedule
+ * wakeups in */
+
+ Prioq *earliest;
+ Prioq *latest;
+ usec_t next;
+
+ bool needs_rearm:1;
+};
+
+struct signal_data {
+ WakeupType wakeup;
+
+ /* For each priority we maintain one signal fd, so that we
+ * only have to dequeue a single event per priority at a
+ * time. */
+
+ int fd;
+ int64_t priority;
+ sigset_t sigset;
+ sd_event_source *current;
+};
+
+/* A structure listing all event sources currently watching a specific inode */
+struct inode_data {
+ /* The identifier for the inode, the combination of the .st_dev + .st_ino fields of the file */
+ ino_t ino;
+ dev_t dev;
+
+ /* An fd of the inode to watch. The fd is kept open until the next iteration of the loop, so that the
+ * priority can still be rearranged until then: changing the priority means adding a watch descriptor to the
+ * inotify fd of the new priority, and that is only possible with a handle to the original inode. We keep a
+ * list of all inode_data objects with an open fd in the to_close list (see below) of the sd-event object,
+ * so that it is efficient to close everything before entering the next event loop iteration. */
+ int fd;
+
+ /* The inotify "watch descriptor" */
+ int wd;
+
+ /* The combination of the mask of all inotify watches on this inode we manage. This is also the mask that has
+ * most recently been set on the watch descriptor. */
+ uint32_t combined_mask;
+
+ /* All event sources subscribed to this inode */
+ LIST_HEAD(sd_event_source, event_sources);
+
+ /* The inotify object we watch this inode with */
+ struct inotify_data *inotify_data;
+
+ /* A linked list of all inode data objects with fds to close (see above) */
+ LIST_FIELDS(struct inode_data, to_close);
+};
+
+/* A structure encapsulating an inotify fd */
+struct inotify_data {
+ WakeupType wakeup;
+
+ /* For each priority we maintain one inotify fd, so that we only have to dequeue a single event per priority at
+ * a time */
+
+ int fd;
+ int64_t priority;
+
+ Hashmap *inodes; /* The inode_data structures keyed by dev+ino */
+ Hashmap *wd; /* The inode_data structures keyed by the watch descriptor for each */
+
+ /* The buffer we read inotify events into */
+ union inotify_event_buffer buffer;
+ size_t buffer_filled; /* fill level of the buffer */
+
+ /* How many event sources are currently marked pending for this inotify. We won't read new events off the
+ * inotify fd as long as there are still pending events on the inotify (because we have no strategy of queuing
+ * the events locally if they can't be coalesced). */
+ unsigned n_pending;
+
+ /* If this counter is non-zero, don't GC the inotify data object even if not used to watch any inode
+ * anymore. This is useful to pin the object for a bit longer, after the last event source needing it
+ * is gone. */
+ unsigned n_busy;
+
+ /* A linked list of all inotify objects with data already read, that still need processing. We keep this list
+ * to make it efficient to figure out what inotify objects to process data on next. */
+ LIST_FIELDS(struct inotify_data, buffered);
+};
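
Every structure that may end up in epoll_event.data.ptr above (sd_event_source, clock_data, signal_data, inotify_data) deliberately starts with a WakeupType field, so the event loop can classify a pointer returned by epoll_wait() before casting it to the concrete type. The following is a minimal sketch of that dispatch pattern, not code from this commit; it only logs instead of doing real processing:

        static void dispatch_epoll_event(const struct epoll_event *ev) {
                WakeupType *t = ev->data.ptr;   /* first field of every registered object */

                switch (*t) {
                case WAKEUP_EVENT_SOURCE: {
                        sd_event_source *s = ev->data.ptr;   /* I/O or pidfd-based child source */
                        log_debug("I/O or pidfd wakeup for source %s", strna(s->description));
                        break;
                }
                case WAKEUP_CLOCK_DATA:
                        log_debug("timerfd wakeup");          /* struct clock_data */
                        break;
                case WAKEUP_SIGNAL_DATA:
                        log_debug("signalfd wakeup");         /* struct signal_data */
                        break;
                case WAKEUP_INOTIFY_DATA:
                        log_debug("inotify wakeup");          /* struct inotify_data */
                        break;
                default:
                        assert_not_reached();
                }
        }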
diff --git a/src/libsystemd/sd-event/event-util.c b/src/libsystemd/sd-event/event-util.c
new file mode 100644
index 0000000..a36eba9
--- /dev/null
+++ b/src/libsystemd/sd-event/event-util.c
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "event-source.h"
+#include "event-util.h"
+#include "fd-util.h"
+#include "log.h"
+#include "string-util.h"
+
+int event_reset_time(
+ sd_event *e,
+ sd_event_source **s,
+ clockid_t clock,
+ uint64_t usec,
+ uint64_t accuracy,
+ sd_event_time_handler_t callback,
+ void *userdata,
+ int64_t priority,
+ const char *description,
+ bool force_reset) {
+
+ bool created = false;
+ int enabled, r;
+ clockid_t c;
+
+ assert(e);
+ assert(s);
+
+ if (*s) {
+ if (!force_reset) {
+ r = sd_event_source_get_enabled(*s, &enabled);
+ if (r < 0)
+ return log_debug_errno(r, "sd-event: Failed to query whether event source \"%s\" is enabled or not: %m",
+ strna((*s)->description ?: description));
+
+ if (enabled != SD_EVENT_OFF)
+ return 0;
+ }
+
+ r = sd_event_source_get_time_clock(*s, &c);
+ if (r < 0)
+ return log_debug_errno(r, "sd-event: Failed to get clock id of event source \"%s\": %m", strna((*s)->description ?: description));
+
+ if (c != clock)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "sd-event: Current clock id %i of event source \"%s\" is different from specified one %i.",
+ (int)c,
+ strna((*s)->description ? : description),
+ (int)clock);
+
+ r = sd_event_source_set_time(*s, usec);
+ if (r < 0)
+ return log_debug_errno(r, "sd-event: Failed to set time for event source \"%s\": %m", strna((*s)->description ?: description));
+
+ r = sd_event_source_set_time_accuracy(*s, accuracy);
+ if (r < 0)
+ return log_debug_errno(r, "sd-event: Failed to set accuracy for event source \"%s\": %m", strna((*s)->description ?: description));
+
+ /* callback function is not updated, as we do not have sd_event_source_set_time_callback(). */
+
+ (void) sd_event_source_set_userdata(*s, userdata);
+
+ r = sd_event_source_set_enabled(*s, SD_EVENT_ONESHOT);
+ if (r < 0)
+ return log_debug_errno(r, "sd-event: Failed to enable event source \"%s\": %m", strna((*s)->description ?: description));
+ } else {
+ r = sd_event_add_time(e, s, clock, usec, accuracy, callback, userdata);
+ if (r < 0)
+ return log_debug_errno(r, "sd-event: Failed to create timer event \"%s\": %m", strna(description));
+
+ created = true;
+ }
+
+ r = sd_event_source_set_priority(*s, priority);
+ if (r < 0)
+ return log_debug_errno(r, "sd-event: Failed to set priority for event source \"%s\": %m", strna((*s)->description ?: description));
+
+ if (description) {
+ r = sd_event_source_set_description(*s, description);
+ if (r < 0)
+ return log_debug_errno(r, "sd-event: Failed to set description for event source \"%s\": %m", description);
+ }
+
+ return created;
+}
+
+int event_reset_time_relative(
+ sd_event *e,
+ sd_event_source **s,
+ clockid_t clock,
+ uint64_t usec,
+ uint64_t accuracy,
+ sd_event_time_handler_t callback,
+ void *userdata,
+ int64_t priority,
+ const char *description,
+ bool force_reset) {
+
+ usec_t usec_now;
+ int r;
+
+ assert(e);
+
+ r = sd_event_now(e, clock, &usec_now);
+ if (r < 0)
+ return log_debug_errno(r, "sd-event: Failed to get the current time: %m");
+
+ return event_reset_time(e, s, clock, usec_add(usec_now, usec), accuracy, callback, userdata, priority, description, force_reset);
+}
+
+int event_add_time_change(sd_event *e, sd_event_source **ret, sd_event_io_handler_t callback, void *userdata) {
+ _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+ _cleanup_close_ int fd = -1;
+ int r;
+
+ assert(e);
+
+ /* Allocates an IO event source that gets woken up whenever the clock changes. Needs to be recreated on each event */
+
+ fd = time_change_fd();
+ if (fd < 0)
+ return fd;
+
+ r = sd_event_add_io(e, &s, fd, EPOLLIN, callback, userdata);
+ if (r < 0)
+ return r;
+
+ r = sd_event_source_set_io_fd_own(s, true);
+ if (r < 0)
+ return r;
+
+ TAKE_FD(fd);
+
+ r = sd_event_source_set_description(s, "time-change");
+ if (r < 0)
+ return r;
+
+ if (ret)
+ *ret = TAKE_PTR(s);
+ else {
+ r = sd_event_source_set_floating(s, true);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
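
A hedged usage sketch of event_add_time_change() above (the handler and wrapper names are assumptions, not part of this file): passing ret == NULL makes the source floating, so it lives as long as the event loop. As the comment in the function notes, the underlying fd fires only once, so a real caller would set the watch up again afterwards; that step is only hinted at here.

        static int on_clock_change(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
                log_debug("System clock jumped, wall-clock timers may need re-evaluation.");
                /* ... re-arm CLOCK_REALTIME timers here, then set up the watch again ... */
                return 0;
        }

        static int watch_clock_changes(sd_event *event) {
                int r;

                r = event_add_time_change(event, NULL, on_clock_change, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to watch for clock changes: %m");

                return 0;
        }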
diff --git a/src/libsystemd/sd-event/event-util.h b/src/libsystemd/sd-event/event-util.h
new file mode 100644
index 0000000..c185584
--- /dev/null
+++ b/src/libsystemd/sd-event/event-util.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-event.h"
+
+int event_reset_time(
+ sd_event *e,
+ sd_event_source **s,
+ clockid_t clock,
+ uint64_t usec,
+ uint64_t accuracy,
+ sd_event_time_handler_t callback,
+ void *userdata,
+ int64_t priority,
+ const char *description,
+ bool force_reset);
+int event_reset_time_relative(
+ sd_event *e,
+ sd_event_source **s,
+ clockid_t clock,
+ uint64_t usec,
+ uint64_t accuracy,
+ sd_event_time_handler_t callback,
+ void *userdata,
+ int64_t priority,
+ const char *description,
+ bool force_reset);
+static inline int event_source_disable(sd_event_source *s) {
+ return sd_event_source_set_enabled(s, SD_EVENT_OFF);
+}
+
+int event_add_time_change(sd_event *e, sd_event_source **ret, sd_event_io_handler_t callback, void *userdata);
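
A hedged usage sketch of the helpers declared here (handler and wrapper names are illustrative): event_reset_time_relative() lazily creates a one-shot timer on the first call and merely re-arms it on later calls, returning a positive value when the source was newly created and 0 when an existing one was reused.

        static int on_retry_timer(sd_event_source *s, uint64_t usec, void *userdata) {
                log_debug("Retry timer elapsed.");
                return 0;
        }

        static int arm_retry_timer(sd_event *event, sd_event_source **timer_source) {
                return event_reset_time_relative(event, timer_source,
                                                 CLOCK_MONOTONIC,
                                                 30 * USEC_PER_SEC,   /* fire ~30s from now */
                                                 USEC_PER_SEC,        /* 1s accuracy is plenty */
                                                 on_retry_timer, NULL,
                                                 SD_EVENT_PRIORITY_NORMAL,
                                                 "retry-timer",
                                                 /* force_reset= */ true);
        }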
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
new file mode 100644
index 0000000..aee2ed5
--- /dev/null
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -0,0 +1,4676 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/epoll.h>
+#include <sys/timerfd.h>
+#include <sys/wait.h>
+
+#include "sd-daemon.h"
+#include "sd-event.h"
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "env-util.h"
+#include "event-source.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "hashmap.h"
+#include "list.h"
+#include "macro.h"
+#include "memory-util.h"
+#include "missing_syscall.h"
+#include "missing_threads.h"
+#include "prioq.h"
+#include "process-util.h"
+#include "set.h"
+#include "signal-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strxcpyx.h"
+#include "time-util.h"
+
+#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
+
+static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
+ /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
+ return s &&
+ s->type == SOURCE_CHILD &&
+ s->child.pidfd >= 0 &&
+ s->child.options == WEXITED;
+}
+
+static bool event_source_is_online(sd_event_source *s) {
+ assert(s);
+ return s->enabled != SD_EVENT_OFF && !s->ratelimited;
+}
+
+static bool event_source_is_offline(sd_event_source *s) {
+ assert(s);
+ return s->enabled == SD_EVENT_OFF || s->ratelimited;
+}
+
+static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
+ [SOURCE_IO] = "io",
+ [SOURCE_TIME_REALTIME] = "realtime",
+ [SOURCE_TIME_BOOTTIME] = "boottime",
+ [SOURCE_TIME_MONOTONIC] = "monotonic",
+ [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
+ [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
+ [SOURCE_SIGNAL] = "signal",
+ [SOURCE_CHILD] = "child",
+ [SOURCE_DEFER] = "defer",
+ [SOURCE_POST] = "post",
+ [SOURCE_EXIT] = "exit",
+ [SOURCE_WATCHDOG] = "watchdog",
+ [SOURCE_INOTIFY] = "inotify",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
+
+#define EVENT_SOURCE_IS_TIME(t) \
+ IN_SET((t), \
+ SOURCE_TIME_REALTIME, \
+ SOURCE_TIME_BOOTTIME, \
+ SOURCE_TIME_MONOTONIC, \
+ SOURCE_TIME_REALTIME_ALARM, \
+ SOURCE_TIME_BOOTTIME_ALARM)
+
+#define EVENT_SOURCE_CAN_RATE_LIMIT(t) \
+ IN_SET((t), \
+ SOURCE_IO, \
+ SOURCE_TIME_REALTIME, \
+ SOURCE_TIME_BOOTTIME, \
+ SOURCE_TIME_MONOTONIC, \
+ SOURCE_TIME_REALTIME_ALARM, \
+ SOURCE_TIME_BOOTTIME_ALARM, \
+ SOURCE_SIGNAL, \
+ SOURCE_DEFER, \
+ SOURCE_INOTIFY)
+
+/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
+ * Time sources and ratelimited sources can be passed, so effectively this is the same as the
+ * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
+#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
+
+struct sd_event {
+ unsigned n_ref;
+
+ int epoll_fd;
+ int watchdog_fd;
+
+ Prioq *pending;
+ Prioq *prepare;
+
+ /* timerfd_create() only supports these five clocks so far. We
+ * can add support for more clocks when the kernel learns to
+ * deal with them, too. */
+ struct clock_data realtime;
+ struct clock_data boottime;
+ struct clock_data monotonic;
+ struct clock_data realtime_alarm;
+ struct clock_data boottime_alarm;
+
+ usec_t perturb;
+
+ sd_event_source **signal_sources; /* indexed by signal number */
+ Hashmap *signal_data; /* indexed by priority */
+
+ Hashmap *child_sources;
+ unsigned n_online_child_sources;
+
+ Set *post_sources;
+
+ Prioq *exit;
+
+ Hashmap *inotify_data; /* indexed by priority */
+
+ /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
+ LIST_HEAD(struct inode_data, inode_data_to_close_list);
+
+ /* A list of inotify objects that already have events buffered which aren't processed yet */
+ LIST_HEAD(struct inotify_data, buffered_inotify_data_list);
+
+ pid_t original_pid;
+
+ uint64_t iteration;
+ triple_timestamp timestamp;
+ int state;
+
+ bool exit_requested:1;
+ bool need_process_child:1;
+ bool watchdog:1;
+ bool profile_delays:1;
+
+ int exit_code;
+
+ pid_t tid;
+ sd_event **default_event_ptr;
+
+ usec_t watchdog_last, watchdog_period;
+
+ unsigned n_sources;
+
+ struct epoll_event *event_queue;
+
+ LIST_HEAD(sd_event_source, sources);
+
+ sd_event_source *sigint_event_source, *sigterm_event_source;
+
+ usec_t last_run_usec, last_log_usec;
+ unsigned delays[sizeof(usec_t) * 8];
+};
+
+static thread_local sd_event *default_event = NULL;
+
+static void source_disconnect(sd_event_source *s);
+static void event_gc_inode_data(sd_event *e, struct inode_data *d);
+
+static sd_event *event_resolve(sd_event *e) {
+ return e == SD_EVENT_DEFAULT ? default_event : e;
+}
+
+static int pending_prioq_compare(const void *a, const void *b) {
+ const sd_event_source *x = a, *y = b;
+ int r;
+
+ assert(x->pending);
+ assert(y->pending);
+
+ /* Enabled ones first */
+ r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
+ if (r != 0)
+ return r;
+
+ /* Non rate-limited ones first. */
+ r = CMP(!!x->ratelimited, !!y->ratelimited);
+ if (r != 0)
+ return r;
+
+ /* Lower priority values first */
+ r = CMP(x->priority, y->priority);
+ if (r != 0)
+ return r;
+
+ /* Older entries first */
+ return CMP(x->pending_iteration, y->pending_iteration);
+}
+
+static int prepare_prioq_compare(const void *a, const void *b) {
+ const sd_event_source *x = a, *y = b;
+ int r;
+
+ assert(x->prepare);
+ assert(y->prepare);
+
+ /* Enabled ones first */
+ r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
+ if (r != 0)
+ return r;
+
+ /* Non rate-limited ones first. */
+ r = CMP(!!x->ratelimited, !!y->ratelimited);
+ if (r != 0)
+ return r;
+
+ /* Move most recently prepared ones last, so that we can stop
+ * preparing as soon as we hit one that has already been
+ * prepared in the current iteration */
+ r = CMP(x->prepare_iteration, y->prepare_iteration);
+ if (r != 0)
+ return r;
+
+ /* Lower priority values first */
+ return CMP(x->priority, y->priority);
+}
+
+static usec_t time_event_source_next(const sd_event_source *s) {
+ assert(s);
+
+ /* We have two kinds of event sources that have elapsation times associated with them: the actual
+ * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
+ * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
+ * looking at here. */
+
+ if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
+ assert(s->rate_limit.begin != 0);
+ assert(s->rate_limit.interval != 0);
+ return usec_add(s->rate_limit.begin, s->rate_limit.interval);
+ }
+
+ /* Otherwise this must be a time event source, if not ratelimited */
+ if (EVENT_SOURCE_IS_TIME(s->type))
+ return s->time.next;
+
+ return USEC_INFINITY;
+}
+
+static usec_t time_event_source_latest(const sd_event_source *s) {
+ assert(s);
+
+ if (s->ratelimited) { /* For ratelimited sources the earliest and the latest time shall actually be the
+ * same, as we should avoid adding additional inaccuracy on top of an already
+ * inaccurate time window */
+ assert(s->rate_limit.begin != 0);
+ assert(s->rate_limit.interval != 0);
+ return usec_add(s->rate_limit.begin, s->rate_limit.interval);
+ }
+
+ /* Must be a time event source, if not ratelimited */
+ if (EVENT_SOURCE_IS_TIME(s->type))
+ return usec_add(s->time.next, s->time.accuracy);
+
+ return USEC_INFINITY;
+}
+
+static bool event_source_timer_candidate(const sd_event_source *s) {
+ assert(s);
+
+ /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
+ * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
+ return !s->pending || s->ratelimited;
+}
+
+static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
+ const sd_event_source *x = a, *y = b;
+ int r;
+
+ /* Enabled ones first */
+ r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
+ if (r != 0)
+ return r;
+
+ /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
+ r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
+ if (r != 0)
+ return r;
+
+ /* Order by time */
+ return CMP(time_func(x), time_func(y));
+}
+
+static int earliest_time_prioq_compare(const void *a, const void *b) {
+ return time_prioq_compare(a, b, time_event_source_next);
+}
+
+static int latest_time_prioq_compare(const void *a, const void *b) {
+ return time_prioq_compare(a, b, time_event_source_latest);
+}
+
+static int exit_prioq_compare(const void *a, const void *b) {
+ const sd_event_source *x = a, *y = b;
+ int r;
+
+ assert(x->type == SOURCE_EXIT);
+ assert(y->type == SOURCE_EXIT);
+
+ /* Enabled ones first */
+ r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
+ if (r != 0)
+ return r;
+
+ /* Lower priority values first */
+ return CMP(x->priority, y->priority);
+}
+
+static void free_clock_data(struct clock_data *d) {
+ assert(d);
+ assert(d->wakeup == WAKEUP_CLOCK_DATA);
+
+ safe_close(d->fd);
+ prioq_free(d->earliest);
+ prioq_free(d->latest);
+}
+
+static sd_event *event_free(sd_event *e) {
+ sd_event_source *s;
+
+ assert(e);
+
+ e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
+ e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
+
+ while ((s = e->sources)) {
+ assert(s->floating);
+ source_disconnect(s);
+ sd_event_source_unref(s);
+ }
+
+ assert(e->n_sources == 0);
+
+ if (e->default_event_ptr)
+ *(e->default_event_ptr) = NULL;
+
+ safe_close(e->epoll_fd);
+ safe_close(e->watchdog_fd);
+
+ free_clock_data(&e->realtime);
+ free_clock_data(&e->boottime);
+ free_clock_data(&e->monotonic);
+ free_clock_data(&e->realtime_alarm);
+ free_clock_data(&e->boottime_alarm);
+
+ prioq_free(e->pending);
+ prioq_free(e->prepare);
+ prioq_free(e->exit);
+
+ free(e->signal_sources);
+ hashmap_free(e->signal_data);
+
+ hashmap_free(e->inotify_data);
+
+ hashmap_free(e->child_sources);
+ set_free(e->post_sources);
+
+ free(e->event_queue);
+
+ return mfree(e);
+}
+
+_public_ int sd_event_new(sd_event** ret) {
+ sd_event *e;
+ int r;
+
+ assert_return(ret, -EINVAL);
+
+ e = new(sd_event, 1);
+ if (!e)
+ return -ENOMEM;
+
+ *e = (sd_event) {
+ .n_ref = 1,
+ .epoll_fd = -1,
+ .watchdog_fd = -1,
+ .realtime.wakeup = WAKEUP_CLOCK_DATA,
+ .realtime.fd = -1,
+ .realtime.next = USEC_INFINITY,
+ .boottime.wakeup = WAKEUP_CLOCK_DATA,
+ .boottime.fd = -1,
+ .boottime.next = USEC_INFINITY,
+ .monotonic.wakeup = WAKEUP_CLOCK_DATA,
+ .monotonic.fd = -1,
+ .monotonic.next = USEC_INFINITY,
+ .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
+ .realtime_alarm.fd = -1,
+ .realtime_alarm.next = USEC_INFINITY,
+ .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
+ .boottime_alarm.fd = -1,
+ .boottime_alarm.next = USEC_INFINITY,
+ .perturb = USEC_INFINITY,
+ .original_pid = getpid_cached(),
+ };
+
+ r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
+ if (r < 0)
+ goto fail;
+
+ e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+ if (e->epoll_fd < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
+
+ if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
+ log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 %s 2^63 us will be logged every 5s.",
+ special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+ e->profile_delays = true;
+ }
+
+ *ret = e;
+ return 0;
+
+fail:
+ event_free(e);
+ return r;
+}
+
+DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event, sd_event, event_free);
+#define PROTECT_EVENT(e) \
+ _unused_ _cleanup_(sd_event_unrefp) sd_event *_ref = sd_event_ref(e);
+
+_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
+ if (s)
+ (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
+ return sd_event_source_unref(s);
+}
+
+static bool event_pid_changed(sd_event *e) {
+ assert(e);
+
+ /* We don't support people creating an event loop and keeping
+ * it around over a fork(). Let's complain. */
+
+ return e->original_pid != getpid_cached();
+}
+
+static void source_io_unregister(sd_event_source *s) {
+ assert(s);
+ assert(s->type == SOURCE_IO);
+
+ if (event_pid_changed(s->event))
+ return;
+
+ if (!s->io.registered)
+ return;
+
+ if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
+ log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
+ strna(s->description), event_source_type_to_string(s->type));
+
+ s->io.registered = false;
+}
+
+static int source_io_register(
+ sd_event_source *s,
+ int enabled,
+ uint32_t events) {
+
+ assert(s);
+ assert(s->type == SOURCE_IO);
+ assert(enabled != SD_EVENT_OFF);
+
+ struct epoll_event ev = {
+ .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
+ .data.ptr = s,
+ };
+
+ if (epoll_ctl(s->event->epoll_fd,
+ s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
+ s->io.fd, &ev) < 0)
+ return -errno;
+
+ s->io.registered = true;
+
+ return 0;
+}
+
+static void source_child_pidfd_unregister(sd_event_source *s) {
+ assert(s);
+ assert(s->type == SOURCE_CHILD);
+
+ if (event_pid_changed(s->event))
+ return;
+
+ if (!s->child.registered)
+ return;
+
+ if (EVENT_SOURCE_WATCH_PIDFD(s))
+ if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
+ log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
+ strna(s->description), event_source_type_to_string(s->type));
+
+ s->child.registered = false;
+}
+
+static int source_child_pidfd_register(sd_event_source *s, int enabled) {
+ assert(s);
+ assert(s->type == SOURCE_CHILD);
+ assert(enabled != SD_EVENT_OFF);
+
+ if (EVENT_SOURCE_WATCH_PIDFD(s)) {
+ struct epoll_event ev = {
+ .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
+ .data.ptr = s,
+ };
+
+ if (epoll_ctl(s->event->epoll_fd,
+ s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
+ s->child.pidfd, &ev) < 0)
+ return -errno;
+ }
+
+ s->child.registered = true;
+ return 0;
+}
+
+static clockid_t event_source_type_to_clock(EventSourceType t) {
+
+ switch (t) {
+
+ case SOURCE_TIME_REALTIME:
+ return CLOCK_REALTIME;
+
+ case SOURCE_TIME_BOOTTIME:
+ return CLOCK_BOOTTIME;
+
+ case SOURCE_TIME_MONOTONIC:
+ return CLOCK_MONOTONIC;
+
+ case SOURCE_TIME_REALTIME_ALARM:
+ return CLOCK_REALTIME_ALARM;
+
+ case SOURCE_TIME_BOOTTIME_ALARM:
+ return CLOCK_BOOTTIME_ALARM;
+
+ default:
+ return (clockid_t) -1;
+ }
+}
+
+static EventSourceType clock_to_event_source_type(clockid_t clock) {
+
+ switch (clock) {
+
+ case CLOCK_REALTIME:
+ return SOURCE_TIME_REALTIME;
+
+ case CLOCK_BOOTTIME:
+ return SOURCE_TIME_BOOTTIME;
+
+ case CLOCK_MONOTONIC:
+ return SOURCE_TIME_MONOTONIC;
+
+ case CLOCK_REALTIME_ALARM:
+ return SOURCE_TIME_REALTIME_ALARM;
+
+ case CLOCK_BOOTTIME_ALARM:
+ return SOURCE_TIME_BOOTTIME_ALARM;
+
+ default:
+ return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
+ }
+}
+
+static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
+ assert(e);
+
+ switch (t) {
+
+ case SOURCE_TIME_REALTIME:
+ return &e->realtime;
+
+ case SOURCE_TIME_BOOTTIME:
+ return &e->boottime;
+
+ case SOURCE_TIME_MONOTONIC:
+ return &e->monotonic;
+
+ case SOURCE_TIME_REALTIME_ALARM:
+ return &e->realtime_alarm;
+
+ case SOURCE_TIME_BOOTTIME_ALARM:
+ return &e->boottime_alarm;
+
+ default:
+ return NULL;
+ }
+}
+
+static void event_free_signal_data(sd_event *e, struct signal_data *d) {
+ assert(e);
+
+ if (!d)
+ return;
+
+ hashmap_remove(e->signal_data, &d->priority);
+ safe_close(d->fd);
+ free(d);
+}
+
+static int event_make_signal_data(
+ sd_event *e,
+ int sig,
+ struct signal_data **ret) {
+
+ struct signal_data *d;
+ bool added = false;
+ sigset_t ss_copy;
+ int64_t priority;
+ int r;
+
+ assert(e);
+
+ if (event_pid_changed(e))
+ return -ECHILD;
+
+ if (e->signal_sources && e->signal_sources[sig])
+ priority = e->signal_sources[sig]->priority;
+ else
+ priority = SD_EVENT_PRIORITY_NORMAL;
+
+ d = hashmap_get(e->signal_data, &priority);
+ if (d) {
+ if (sigismember(&d->sigset, sig) > 0) {
+ if (ret)
+ *ret = d;
+ return 0;
+ }
+ } else {
+ d = new(struct signal_data, 1);
+ if (!d)
+ return -ENOMEM;
+
+ *d = (struct signal_data) {
+ .wakeup = WAKEUP_SIGNAL_DATA,
+ .fd = -1,
+ .priority = priority,
+ };
+
+ r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
+ if (r < 0) {
+ free(d);
+ return r;
+ }
+
+ added = true;
+ }
+
+ ss_copy = d->sigset;
+ assert_se(sigaddset(&ss_copy, sig) >= 0);
+
+ r = signalfd(d->fd >= 0 ? d->fd : -1, /* the first arg must be -1 or a valid signalfd */
+ &ss_copy,
+ SFD_NONBLOCK|SFD_CLOEXEC);
+ if (r < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ d->sigset = ss_copy;
+
+ if (d->fd >= 0) {
+ if (ret)
+ *ret = d;
+ return 0;
+ }
+
+ d->fd = fd_move_above_stdio(r);
+
+ struct epoll_event ev = {
+ .events = EPOLLIN,
+ .data.ptr = d,
+ };
+
+ if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ if (ret)
+ *ret = d;
+
+ return 0;
+
+fail:
+ if (added)
+ event_free_signal_data(e, d);
+
+ return r;
+}
+
+static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
+ assert(e);
+ assert(d);
+
+ /* Turns off the specified signal in the signal data
+ * object. If the signal mask of the object becomes empty
+ * that way, the object is removed. */
+
+ if (sigismember(&d->sigset, sig) == 0)
+ return;
+
+ assert_se(sigdelset(&d->sigset, sig) >= 0);
+
+ if (sigisemptyset(&d->sigset)) {
+ /* If the mask is now empty we can get rid of the structure */
+ event_free_signal_data(e, d);
+ return;
+ }
+
+ if (event_pid_changed(e))
+ return;
+
+ assert(d->fd >= 0);
+
+ if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
+ log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
+}
+
+static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
+ struct signal_data *d;
+ static const int64_t zero_priority = 0;
+
+ assert(e);
+
+ /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
+ * and possibly drop the signalfd for it. */
+
+ if (sig == SIGCHLD &&
+ e->n_online_child_sources > 0)
+ return;
+
+ if (e->signal_sources &&
+ e->signal_sources[sig] &&
+ event_source_is_online(e->signal_sources[sig]))
+ return;
+
+ /*
+ * The specified signal might be enabled in three different queues:
+ *
+ * 1) the one that belongs to the priority passed (if it is non-NULL)
+ * 2) the one that belongs to the priority of the event source of the signal (if there is one)
+ * 3) the 0 priority (to cover the SIGCHLD case)
+ *
+ * Hence, let's remove it from all three here.
+ */
+
+ if (priority) {
+ d = hashmap_get(e->signal_data, priority);
+ if (d)
+ event_unmask_signal_data(e, d, sig);
+ }
+
+ if (e->signal_sources && e->signal_sources[sig]) {
+ d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
+ if (d)
+ event_unmask_signal_data(e, d, sig);
+ }
+
+ d = hashmap_get(e->signal_data, &zero_priority);
+ if (d)
+ event_unmask_signal_data(e, d, sig);
+}
+
+static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
+ assert(s);
+
+ /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
+ * they are enabled/disabled or marked pending and such. */
+
+ if (s->pending)
+ prioq_reshuffle(s->event->pending, s, &s->pending_index);
+
+ if (s->prepare)
+ prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
+}
+
+static void event_source_time_prioq_reshuffle(sd_event_source *s) {
+ struct clock_data *d;
+
+ assert(s);
+
+ /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
+ * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
+ * properly again. */
+
+ if (s->ratelimited)
+ d = &s->event->monotonic;
+ else if (EVENT_SOURCE_IS_TIME(s->type))
+ assert_se(d = event_get_clock_data(s->event, s->type));
+ else
+ return; /* no-op for an event source which is neither a timer nor ratelimited. */
+
+ prioq_reshuffle(d->earliest, s, &s->earliest_index);
+ prioq_reshuffle(d->latest, s, &s->latest_index);
+ d->needs_rearm = true;
+}
+
+static void event_source_time_prioq_remove(
+ sd_event_source *s,
+ struct clock_data *d) {
+
+ assert(s);
+ assert(d);
+
+ prioq_remove(d->earliest, s, &s->earliest_index);
+ prioq_remove(d->latest, s, &s->latest_index);
+ s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
+ d->needs_rearm = true;
+}
+
+static void source_disconnect(sd_event_source *s) {
+ sd_event *event;
+ int r;
+
+ assert(s);
+
+ if (!s->event)
+ return;
+
+ assert(s->event->n_sources > 0);
+
+ switch (s->type) {
+
+ case SOURCE_IO:
+ if (s->io.fd >= 0)
+ source_io_unregister(s);
+
+ break;
+
+ case SOURCE_TIME_REALTIME:
+ case SOURCE_TIME_BOOTTIME:
+ case SOURCE_TIME_MONOTONIC:
+ case SOURCE_TIME_REALTIME_ALARM:
+ case SOURCE_TIME_BOOTTIME_ALARM:
+ /* Only remove this event source from the time event source here if it is not ratelimited. If
+ * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
+ * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
+
+ if (!s->ratelimited) {
+ struct clock_data *d;
+ assert_se(d = event_get_clock_data(s->event, s->type));
+ event_source_time_prioq_remove(s, d);
+ }
+
+ break;
+
+ case SOURCE_SIGNAL:
+ if (s->signal.sig > 0) {
+
+ if (s->event->signal_sources)
+ s->event->signal_sources[s->signal.sig] = NULL;
+
+ event_gc_signal_data(s->event, &s->priority, s->signal.sig);
+
+ if (s->signal.unblock) {
+ sigset_t new_ss;
+
+ if (sigemptyset(&new_ss) < 0)
+ log_debug_errno(errno, "Failed to reset signal set, ignoring: %m");
+ else if (sigaddset(&new_ss, s->signal.sig) < 0)
+ log_debug_errno(errno, "Failed to add signal %i to signal mask, ignoring: %m", s->signal.sig);
+ else {
+ r = pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
+ if (r != 0)
+ log_debug_errno(r, "Failed to unblock signal %i, ignoring: %m", s->signal.sig);
+ }
+ }
+ }
+
+ break;
+
+ case SOURCE_CHILD:
+ if (event_pid_changed(s->event))
+ s->child.process_owned = false;
+
+ if (s->child.pid > 0) {
+ if (event_source_is_online(s)) {
+ assert(s->event->n_online_child_sources > 0);
+ s->event->n_online_child_sources--;
+ }
+
+ (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
+ }
+
+ if (EVENT_SOURCE_WATCH_PIDFD(s))
+ source_child_pidfd_unregister(s);
+ else
+ event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+
+ break;
+
+ case SOURCE_DEFER:
+ /* nothing */
+ break;
+
+ case SOURCE_POST:
+ set_remove(s->event->post_sources, s);
+ break;
+
+ case SOURCE_EXIT:
+ prioq_remove(s->event->exit, s, &s->exit.prioq_index);
+ break;
+
+ case SOURCE_INOTIFY: {
+ struct inode_data *inode_data;
+
+ inode_data = s->inotify.inode_data;
+ if (inode_data) {
+ struct inotify_data *inotify_data;
+ assert_se(inotify_data = inode_data->inotify_data);
+
+ /* Detach this event source from the inode object */
+ LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
+ s->inotify.inode_data = NULL;
+
+ if (s->pending) {
+ assert(inotify_data->n_pending > 0);
+ inotify_data->n_pending--;
+ }
+
+ /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
+ * continues to be watched. That's because inotify doesn't really have an API for that: we
+ * can only change watch masks with access to the original inode either by fd or by path. But
+ * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
+ * continuously and keeping the mount busy which we can't really do. We could reconstruct the
+ * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
+ * there), but given the need for open_by_handle_at() which is privileged and not universally
+ * available this would be quite an incomplete solution. Hence we go the other way, leave the
+ * mask set, even if it is not minimized now, and ignore all events we aren't interested in
+ * anymore after reception. Yes, this sucks, but … Linux … */
+
+ /* Maybe release the inode data (and its inotify) */
+ event_gc_inode_data(s->event, inode_data);
+ }
+
+ break;
+ }
+
+ default:
+ assert_not_reached();
+ }
+
+ if (s->pending)
+ prioq_remove(s->event->pending, s, &s->pending_index);
+
+ if (s->prepare)
+ prioq_remove(s->event->prepare, s, &s->prepare_index);
+
+ if (s->ratelimited)
+ event_source_time_prioq_remove(s, &s->event->monotonic);
+
+ event = TAKE_PTR(s->event);
+ LIST_REMOVE(sources, event->sources, s);
+ event->n_sources--;
+
+ /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
+ * pidfd associated with this event source, which we'll do only on source_free(). */
+
+ if (!s->floating)
+ sd_event_unref(event);
+}
+
+static sd_event_source* source_free(sd_event_source *s) {
+ assert(s);
+
+ source_disconnect(s);
+
+ if (s->type == SOURCE_IO && s->io.owned)
+ s->io.fd = safe_close(s->io.fd);
+
+ if (s->type == SOURCE_CHILD) {
+ /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
+
+ if (s->child.process_owned) {
+
+ if (!s->child.exited) {
+ bool sent = false;
+
+ if (s->child.pidfd >= 0) {
+ if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
+ if (errno == ESRCH) /* Already dead */
+ sent = true;
+ else if (!ERRNO_IS_NOT_SUPPORTED(errno))
+ log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
+ s->child.pid);
+ } else
+ sent = true;
+ }
+
+ if (!sent)
+ if (kill(s->child.pid, SIGKILL) < 0)
+ if (errno != ESRCH) /* Already dead */
+ log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
+ s->child.pid);
+ }
+
+ if (!s->child.waited) {
+ siginfo_t si = {};
+
+ /* Reap the child if we can */
+ (void) waitid(P_PID, s->child.pid, &si, WEXITED);
+ }
+ }
+
+ if (s->child.pidfd_owned)
+ s->child.pidfd = safe_close(s->child.pidfd);
+ }
+
+ if (s->destroy_callback)
+ s->destroy_callback(s->userdata);
+
+ free(s->description);
+ return mfree(s);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
+
+static int source_set_pending(sd_event_source *s, bool b) {
+ int r;
+
+ assert(s);
+ assert(s->type != SOURCE_EXIT);
+
+ if (s->pending == b)
+ return 0;
+
+ s->pending = b;
+
+ if (b) {
+ s->pending_iteration = s->event->iteration;
+
+ r = prioq_put(s->event->pending, s, &s->pending_index);
+ if (r < 0) {
+ s->pending = false;
+ return r;
+ }
+ } else
+ assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
+
+ if (EVENT_SOURCE_IS_TIME(s->type))
+ event_source_time_prioq_reshuffle(s);
+
+ if (s->type == SOURCE_SIGNAL && !b) {
+ struct signal_data *d;
+
+ d = hashmap_get(s->event->signal_data, &s->priority);
+ if (d && d->current == s)
+ d->current = NULL;
+ }
+
+ if (s->type == SOURCE_INOTIFY) {
+
+ assert(s->inotify.inode_data);
+ assert(s->inotify.inode_data->inotify_data);
+
+ if (b)
+ s->inotify.inode_data->inotify_data->n_pending ++;
+ else {
+ assert(s->inotify.inode_data->inotify_data->n_pending > 0);
+ s->inotify.inode_data->inotify_data->n_pending --;
+ }
+ }
+
+ return 1;
+}
+
+static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
+ sd_event_source *s;
+
+ assert(e);
+
+ s = new(sd_event_source, 1);
+ if (!s)
+ return NULL;
+
+ *s = (struct sd_event_source) {
+ .n_ref = 1,
+ .event = e,
+ .floating = floating,
+ .type = type,
+ .pending_index = PRIOQ_IDX_NULL,
+ .prepare_index = PRIOQ_IDX_NULL,
+ };
+
+ if (!floating)
+ sd_event_ref(e);
+
+ LIST_PREPEND(sources, e->sources, s);
+ e->n_sources++;
+
+ return s;
+}
+
+static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+ assert(s);
+
+ return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
+}
+
+_public_ int sd_event_add_io(
+ sd_event *e,
+ sd_event_source **ret,
+ int fd,
+ uint32_t events,
+ sd_event_io_handler_t callback,
+ void *userdata) {
+
+ _cleanup_(source_freep) sd_event_source *s = NULL;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(fd >= 0, -EBADF);
+ assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ if (!callback)
+ callback = io_exit_callback;
+
+ s = source_new(e, !ret, SOURCE_IO);
+ if (!s)
+ return -ENOMEM;
+
+ s->wakeup = WAKEUP_EVENT_SOURCE;
+ s->io.fd = fd;
+ s->io.events = events;
+ s->io.callback = callback;
+ s->userdata = userdata;
+ s->enabled = SD_EVENT_ON;
+
+ r = source_io_register(s, s->enabled, events);
+ if (r < 0)
+ return r;
+
+ if (ret)
+ *ret = s;
+ TAKE_PTR(s);
+
+ return 0;
+}
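
A hedged usage sketch of sd_event_add_io() (the fd, handler and wrapper names are placeholders, not from this file): the loop dispatches EPOLLIN readiness on the fd to the callback; passing ret == NULL would instead create a floating source owned by the loop.

        static int on_socket_ready(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
                if (revents & (EPOLLHUP|EPOLLERR))
                        return sd_event_exit(sd_event_source_get_event(s), -EIO);

                /* ... read from fd here ... */
                return 0;
        }

        static int watch_socket(sd_event *event, int socket_fd, sd_event_source **ret) {
                return sd_event_add_io(event, ret, socket_fd, EPOLLIN, on_socket_ready, NULL);
        }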
+
+static void initialize_perturb(sd_event *e) {
+ sd_id128_t id = {};
+
+ /* When we sleep for longer, we try to realign the wakeup to the same time within each
+ * minute/second/250ms, so that events all across the system can be coalesced into a single CPU
+ * wakeup. However, let's take some system-specific randomness for this value, so that in a network
+ * of systems with synced clocks timer events are distributed a bit. Here, we calculate a
+ * perturbation usec offset from the boot ID (or machine ID if failed, e.g. /proc is not mounted). */
+
+ if (_likely_(e->perturb != USEC_INFINITY))
+ return;
+
+ if (sd_id128_get_boot(&id) >= 0 || sd_id128_get_machine(&id) >= 0)
+ e->perturb = (id.qwords[0] ^ id.qwords[1]) % USEC_PER_MINUTE;
+ else
+ e->perturb = 0; /* This is a super early process without /proc and /etc ?? */
+}
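
To illustrate how such a perturbation offset can be used (a sketch only, assuming initialize_perturb() has already run; the real arming logic in sd-event is more involved and not reproduced here): instead of waking exactly at the deadline, the wakeup is snapped to a host-specific point on a coarse grid, as long as that still falls within the accuracy window.

        static usec_t coalesce_wakeup(sd_event *e, usec_t trigger, usec_t latest) {
                /* Snap the wakeup onto a per-minute grid, shifted by the host-specific offset. */
                usec_t grid = USEC_PER_MINUTE;
                usec_t c = (trigger / grid) * grid + (e->perturb % grid);

                if (c < trigger)
                        c += grid;       /* grid point already passed, take the next one */
                if (c > latest)
                        return trigger;  /* coalescing not possible within the accuracy window */
                return c;
        }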
+
+static int event_setup_timer_fd(
+ sd_event *e,
+ struct clock_data *d,
+ clockid_t clock) {
+
+ assert(e);
+ assert(d);
+
+ if (_likely_(d->fd >= 0))
+ return 0;
+
+ _cleanup_close_ int fd = -1;
+
+ fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
+ if (fd < 0)
+ return -errno;
+
+ fd = fd_move_above_stdio(fd);
+
+ struct epoll_event ev = {
+ .events = EPOLLIN,
+ .data.ptr = d,
+ };
+
+ if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
+ return -errno;
+
+ d->fd = TAKE_FD(fd);
+ return 0;
+}
+
+static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
+ assert(s);
+
+ return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
+}
+
+static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
+ int r;
+
+ assert(d);
+
+ if (d->fd < 0) {
+ r = event_setup_timer_fd(e, d, clock);
+ if (r < 0)
+ return r;
+ }
+
+ r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
+ if (r < 0)
+ return r;
+
+ r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int event_source_time_prioq_put(
+ sd_event_source *s,
+ struct clock_data *d) {
+
+ int r;
+
+ assert(s);
+ assert(d);
+ assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
+
+ r = prioq_put(d->earliest, s, &s->earliest_index);
+ if (r < 0)
+ return r;
+
+ r = prioq_put(d->latest, s, &s->latest_index);
+ if (r < 0) {
+ assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
+ s->earliest_index = PRIOQ_IDX_NULL;
+ return r;
+ }
+
+ d->needs_rearm = true;
+ return 0;
+}
+
+_public_ int sd_event_add_time(
+ sd_event *e,
+ sd_event_source **ret,
+ clockid_t clock,
+ uint64_t usec,
+ uint64_t accuracy,
+ sd_event_time_handler_t callback,
+ void *userdata) {
+
+ EventSourceType type;
+ _cleanup_(source_freep) sd_event_source *s = NULL;
+ struct clock_data *d;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(accuracy != UINT64_MAX, -EINVAL);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
+ return -EOPNOTSUPP;
+
+ type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
+ if (type < 0)
+ return -EOPNOTSUPP;
+
+ if (!callback)
+ callback = time_exit_callback;
+
+ assert_se(d = event_get_clock_data(e, type));
+
+ r = setup_clock_data(e, d, clock);
+ if (r < 0)
+ return r;
+
+ s = source_new(e, !ret, type);
+ if (!s)
+ return -ENOMEM;
+
+ s->time.next = usec;
+ s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
+ s->time.callback = callback;
+ s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
+ s->userdata = userdata;
+ s->enabled = SD_EVENT_ONESHOT;
+
+ r = event_source_time_prioq_put(s, d);
+ if (r < 0)
+ return r;
+
+ if (ret)
+ *ret = s;
+ TAKE_PTR(s);
+
+ return 0;
+}
+
+_public_ int sd_event_add_time_relative(
+ sd_event *e,
+ sd_event_source **ret,
+ clockid_t clock,
+ uint64_t usec,
+ uint64_t accuracy,
+ sd_event_time_handler_t callback,
+ void *userdata) {
+
+ usec_t t;
+ int r;
+
+ /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
+ * checks for overflow. */
+
+ r = sd_event_now(e, clock, &t);
+ if (r < 0)
+ return r;
+
+ if (usec >= USEC_INFINITY - t)
+ return -EOVERFLOW;
+
+ return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
+}
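
A hedged usage sketch of the public call above (handler and wrapper names are illustrative, not from this file):

        static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
                log_info("Timer elapsed.");
                return 0;
        }

        static int arm_example_timer(sd_event *event, sd_event_source **ret) {
                /* One-shot timer roughly 5 seconds from now, with 100 ms accuracy. */
                return sd_event_add_time_relative(event, ret, CLOCK_MONOTONIC,
                                                  5 * USEC_PER_SEC, 100 * USEC_PER_MSEC,
                                                  on_timer, NULL);
        }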
+
+static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
+ assert(s);
+
+ return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
+}
+
+_public_ int sd_event_add_signal(
+ sd_event *e,
+ sd_event_source **ret,
+ int sig,
+ sd_event_signal_handler_t callback,
+ void *userdata) {
+
+ _cleanup_(source_freep) sd_event_source *s = NULL;
+ struct signal_data *d;
+ sigset_t new_ss;
+ bool block_it;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ /* Let's make sure our special flag stays outside of the valid signal range */
+ assert_cc(_NSIG < SD_EVENT_SIGNAL_PROCMASK);
+
+ if (sig & SD_EVENT_SIGNAL_PROCMASK) {
+ sig &= ~SD_EVENT_SIGNAL_PROCMASK;
+ assert_return(SIGNAL_VALID(sig), -EINVAL);
+
+ block_it = true;
+ } else {
+ assert_return(SIGNAL_VALID(sig), -EINVAL);
+
+ r = signal_is_blocked(sig);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EBUSY;
+
+ block_it = false;
+ }
+
+ if (!callback)
+ callback = signal_exit_callback;
+
+ if (!e->signal_sources) {
+ e->signal_sources = new0(sd_event_source*, _NSIG);
+ if (!e->signal_sources)
+ return -ENOMEM;
+ } else if (e->signal_sources[sig])
+ return -EBUSY;
+
+ s = source_new(e, !ret, SOURCE_SIGNAL);
+ if (!s)
+ return -ENOMEM;
+
+ s->signal.sig = sig;
+ s->signal.callback = callback;
+ s->userdata = userdata;
+ s->enabled = SD_EVENT_ON;
+
+ e->signal_sources[sig] = s;
+
+ if (block_it) {
+ sigset_t old_ss;
+
+ if (sigemptyset(&new_ss) < 0)
+ return -errno;
+
+ if (sigaddset(&new_ss, sig) < 0)
+ return -errno;
+
+ r = pthread_sigmask(SIG_BLOCK, &new_ss, &old_ss);
+ if (r != 0)
+ return -r;
+
+ r = sigismember(&old_ss, sig);
+ if (r < 0)
+ return -errno;
+
+ s->signal.unblock = !r;
+ } else
+ s->signal.unblock = false;
+
+ r = event_make_signal_data(e, sig, &d);
+ if (r < 0) {
+ if (s->signal.unblock)
+ (void) pthread_sigmask(SIG_UNBLOCK, &new_ss, NULL);
+
+ return r;
+ }
+
+ /* Use the signal name as description for the event source by default */
+ (void) sd_event_source_set_description(s, signal_to_string(sig));
+
+ if (ret)
+ *ret = s;
+ TAKE_PTR(s);
+
+ return 0;
+}
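
A hedged usage sketch of sd_event_add_signal(): with SD_EVENT_SIGNAL_PROCMASK the call blocks the signal in the calling thread itself, whereas without the flag the signal must already be blocked or -EBUSY is returned. A NULL callback falls back to signal_exit_callback() above, so the loop exits when the signal arrives. The wrapper name is an assumption for illustration.

        static int setup_exit_signals(sd_event *event) {
                int r;

                r = sd_event_add_signal(event, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
                if (r < 0)
                        return r;

                return sd_event_add_signal(event, NULL, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
        }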
+
+static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
+ assert(s);
+
+ return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
+}
+
+static bool shall_use_pidfd(void) {
+ /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
+ return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
+}
+
+_public_ int sd_event_add_child(
+ sd_event *e,
+ sd_event_source **ret,
+ pid_t pid,
+ int options,
+ sd_event_child_handler_t callback,
+ void *userdata) {
+
+ _cleanup_(source_freep) sd_event_source *s = NULL;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(pid > 1, -EINVAL);
+ assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
+ assert_return(options != 0, -EINVAL);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ if (!callback)
+ callback = child_exit_callback;
+
+ if (e->n_online_child_sources == 0) {
+ /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
+ * for compatibility with pre-pidfd and because we don't want to reap the child processes
+ * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
+ * take effect.
+ *
+ * (As an optimization we only do this check on the first child event source created.) */
+ r = signal_is_blocked(SIGCHLD);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EBUSY;
+ }
+
+ r = hashmap_ensure_allocated(&e->child_sources, NULL);
+ if (r < 0)
+ return r;
+
+ if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
+ return -EBUSY;
+
+ s = source_new(e, !ret, SOURCE_CHILD);
+ if (!s)
+ return -ENOMEM;
+
+ s->wakeup = WAKEUP_EVENT_SOURCE;
+ s->child.options = options;
+ s->child.callback = callback;
+ s->userdata = userdata;
+ s->enabled = SD_EVENT_ONESHOT;
+
+ /* We always take a pidfd here if we can, even if we wait for anything other than WEXITED, so that we
+ * pin the PID, and make regular waitid() handling race-free. */
+
+ if (shall_use_pidfd()) {
+ s->child.pidfd = pidfd_open(pid, 0);
+ if (s->child.pidfd < 0) {
+ /* Propagate errors unless the syscall is not supported or blocked */
+ if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+ return -errno;
+ } else
+ s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
+ } else
+ s->child.pidfd = -1;
+
+ if (EVENT_SOURCE_WATCH_PIDFD(s)) {
+ /* We have a pidfd and we only want to watch for exit */
+ r = source_child_pidfd_register(s, s->enabled);
+ if (r < 0)
+ return r;
+
+ } else {
+ /* We have no pidfd or we shall wait for some other event than WEXITED */
+ r = event_make_signal_data(e, SIGCHLD, NULL);
+ if (r < 0)
+ return r;
+
+ e->need_process_child = true;
+ }
+
+ r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
+ if (r < 0)
+ return r;
+
+ /* These must be done after everything succeeds. */
+ s->child.pid = pid;
+ e->n_online_child_sources++;
+
+ if (ret)
+ *ret = s;
+ TAKE_PTR(s);
+ return 0;
+}
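
A hedged usage sketch of sd_event_add_child() (handler, wrapper and child_pid are placeholders): as the comment above explains, SIGCHLD must be blocked by the caller before the first child source is added, otherwise -EBUSY is returned.

        static int on_child_exit(sd_event_source *s, const siginfo_t *si, void *userdata) {
                log_info("Child " PID_FMT " exited with status %i.", si->si_pid, si->si_status);
                return 0;
        }

        static int watch_child(sd_event *event, pid_t child_pid, sd_event_source **ret) {
                sigset_t ss;

                /* Block SIGCHLD first, otherwise sd_event_add_child() refuses with -EBUSY. */
                assert_se(sigemptyset(&ss) >= 0);
                assert_se(sigaddset(&ss, SIGCHLD) >= 0);
                assert_se(pthread_sigmask(SIG_BLOCK, &ss, NULL) == 0);

                return sd_event_add_child(event, ret, child_pid, WEXITED, on_child_exit, NULL);
        }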
+
+_public_ int sd_event_add_child_pidfd(
+ sd_event *e,
+ sd_event_source **ret,
+ int pidfd,
+ int options,
+ sd_event_child_handler_t callback,
+ void *userdata) {
+
+ _cleanup_(source_freep) sd_event_source *s = NULL;
+ pid_t pid;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(pidfd >= 0, -EBADF);
+ assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
+ assert_return(options != 0, -EINVAL);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ if (!callback)
+ callback = child_exit_callback;
+
+ if (e->n_online_child_sources == 0) {
+ r = signal_is_blocked(SIGCHLD);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EBUSY;
+ }
+
+ r = hashmap_ensure_allocated(&e->child_sources, NULL);
+ if (r < 0)
+ return r;
+
+ r = pidfd_get_pid(pidfd, &pid);
+ if (r < 0)
+ return r;
+
+ if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
+ return -EBUSY;
+
+ s = source_new(e, !ret, SOURCE_CHILD);
+ if (!s)
+ return -ENOMEM;
+
+ s->wakeup = WAKEUP_EVENT_SOURCE;
+ s->child.pidfd = pidfd;
+ s->child.pid = pid;
+ s->child.options = options;
+ s->child.callback = callback;
+ s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
+ s->userdata = userdata;
+ s->enabled = SD_EVENT_ONESHOT;
+
+ r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
+ if (r < 0)
+ return r;
+
+ if (EVENT_SOURCE_WATCH_PIDFD(s)) {
+ /* We only want to watch for WEXITED */
+ r = source_child_pidfd_register(s, s->enabled);
+ if (r < 0)
+ return r;
+ } else {
+ /* We shall wait for some other event than WEXITED */
+ r = event_make_signal_data(e, SIGCHLD, NULL);
+ if (r < 0)
+ return r;
+
+ e->need_process_child = true;
+ }
+
+ e->n_online_child_sources++;
+
+ if (ret)
+ *ret = s;
+ TAKE_PTR(s);
+ return 0;
+}
+
+static int generic_exit_callback(sd_event_source *s, void *userdata) {
+ assert(s);
+
+ return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
+}
+
+_public_ int sd_event_add_defer(
+ sd_event *e,
+ sd_event_source **ret,
+ sd_event_handler_t callback,
+ void *userdata) {
+
+ _cleanup_(source_freep) sd_event_source *s = NULL;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ if (!callback)
+ callback = generic_exit_callback;
+
+ s = source_new(e, !ret, SOURCE_DEFER);
+ if (!s)
+ return -ENOMEM;
+
+ s->defer.callback = callback;
+ s->userdata = userdata;
+ s->enabled = SD_EVENT_ONESHOT;
+
+ r = source_set_pending(s, true);
+ if (r < 0)
+ return r;
+
+ if (ret)
+ *ret = s;
+ TAKE_PTR(s);
+
+ return 0;
+}
+
+_public_ int sd_event_add_post(
+ sd_event *e,
+ sd_event_source **ret,
+ sd_event_handler_t callback,
+ void *userdata) {
+
+ _cleanup_(source_freep) sd_event_source *s = NULL;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ if (!callback)
+ callback = generic_exit_callback;
+
+ s = source_new(e, !ret, SOURCE_POST);
+ if (!s)
+ return -ENOMEM;
+
+ s->post.callback = callback;
+ s->userdata = userdata;
+ s->enabled = SD_EVENT_ON;
+
+ r = set_ensure_put(&e->post_sources, NULL, s);
+ if (r < 0)
+ return r;
+ assert(r > 0);
+
+ if (ret)
+ *ret = s;
+ TAKE_PTR(s);
+
+ return 0;
+}
+
+_public_ int sd_event_add_exit(
+ sd_event *e,
+ sd_event_source **ret,
+ sd_event_handler_t callback,
+ void *userdata) {
+
+ _cleanup_(source_freep) sd_event_source *s = NULL;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(callback, -EINVAL);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
+ if (r < 0)
+ return r;
+
+ s = source_new(e, !ret, SOURCE_EXIT);
+ if (!s)
+ return -ENOMEM;
+
+ s->exit.callback = callback;
+ s->userdata = userdata;
+ s->exit.prioq_index = PRIOQ_IDX_NULL;
+ s->enabled = SD_EVENT_ONESHOT;
+
+ r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
+ if (r < 0)
+ return r;
+
+ if (ret)
+ *ret = s;
+ TAKE_PTR(s);
+
+ return 0;
+}
+
+static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
+ assert(e);
+
+ if (!d)
+ return;
+
+ assert(hashmap_isempty(d->inodes));
+ assert(hashmap_isempty(d->wd));
+
+ if (d->buffer_filled > 0)
+ LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
+
+ hashmap_free(d->inodes);
+ hashmap_free(d->wd);
+
+ assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
+
+ if (d->fd >= 0) {
+ if (!event_pid_changed(e) &&
+ epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
+ log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
+
+ safe_close(d->fd);
+ }
+ free(d);
+}
+
+static int event_make_inotify_data(
+ sd_event *e,
+ int64_t priority,
+ struct inotify_data **ret) {
+
+ _cleanup_close_ int fd = -1;
+ struct inotify_data *d;
+ int r;
+
+ assert(e);
+
+ d = hashmap_get(e->inotify_data, &priority);
+ if (d) {
+ if (ret)
+ *ret = d;
+ return 0;
+ }
+
+ fd = inotify_init1(IN_NONBLOCK|O_CLOEXEC);
+ if (fd < 0)
+ return -errno;
+
+ fd = fd_move_above_stdio(fd);
+
+ d = new(struct inotify_data, 1);
+ if (!d)
+ return -ENOMEM;
+
+ *d = (struct inotify_data) {
+ .wakeup = WAKEUP_INOTIFY_DATA,
+ .fd = TAKE_FD(fd),
+ .priority = priority,
+ };
+
+ r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
+ if (r < 0) {
+ d->fd = safe_close(d->fd);
+ free(d);
+ return r;
+ }
+
+ struct epoll_event ev = {
+ .events = EPOLLIN,
+ .data.ptr = d,
+ };
+
+ if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
+ r = -errno;
+ d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
+ * remove the fd from the epoll first, which we don't want as we couldn't
+ * add it in the first place. */
+ event_free_inotify_data(e, d);
+ return r;
+ }
+
+ if (ret)
+ *ret = d;
+
+ return 1;
+}
+
+static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
+ int r;
+
+ assert(x);
+ assert(y);
+
+ r = CMP(x->dev, y->dev);
+ if (r != 0)
+ return r;
+
+ return CMP(x->ino, y->ino);
+}
+
+static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
+ assert(d);
+
+ siphash24_compress(&d->dev, sizeof(d->dev), state);
+ siphash24_compress(&d->ino, sizeof(d->ino), state);
+}
+
+DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
+
+static void event_free_inode_data(
+ sd_event *e,
+ struct inode_data *d) {
+
+ assert(e);
+
+ if (!d)
+ return;
+
+ assert(!d->event_sources);
+
+ if (d->fd >= 0) {
+ LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
+ safe_close(d->fd);
+ }
+
+ if (d->inotify_data) {
+
+ if (d->wd >= 0) {
+ if (d->inotify_data->fd >= 0 && !event_pid_changed(e)) {
+ /* So here's a problem. At the time this runs the watch descriptor might already be
+ * invalidated, because an IN_IGNORED event might be queued right at the moment we enter
+ * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since this is quite
+ * likely to happen. */
+
+ if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
+ log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
+ }
+
+ assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
+ }
+
+ assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
+ }
+
+ free(d);
+}
+
+static void event_gc_inotify_data(
+ sd_event *e,
+ struct inotify_data *d) {
+
+ assert(e);
+
+ /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
+ * any inode with it anymore, which in turn happens if no event source of this priority is interested
+ * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
+ * (under the expectation that the GC is called again once the counter is decremented). */
+
+ if (!d)
+ return;
+
+ if (!hashmap_isempty(d->inodes))
+ return;
+
+ if (d->n_busy > 0)
+ return;
+
+ event_free_inotify_data(e, d);
+}
+
+static void event_gc_inode_data(
+ sd_event *e,
+ struct inode_data *d) {
+
+ struct inotify_data *inotify_data;
+
+ assert(e);
+
+ if (!d)
+ return;
+
+ if (d->event_sources)
+ return;
+
+ inotify_data = d->inotify_data;
+ event_free_inode_data(e, d);
+
+ event_gc_inotify_data(e, inotify_data);
+}
+
+static int event_make_inode_data(
+ sd_event *e,
+ struct inotify_data *inotify_data,
+ dev_t dev,
+ ino_t ino,
+ struct inode_data **ret) {
+
+ struct inode_data *d, key;
+ int r;
+
+ assert(e);
+ assert(inotify_data);
+
+ key = (struct inode_data) {
+ .ino = ino,
+ .dev = dev,
+ };
+
+ d = hashmap_get(inotify_data->inodes, &key);
+ if (d) {
+ if (ret)
+ *ret = d;
+
+ return 0;
+ }
+
+ r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
+ if (r < 0)
+ return r;
+
+ d = new(struct inode_data, 1);
+ if (!d)
+ return -ENOMEM;
+
+ *d = (struct inode_data) {
+ .dev = dev,
+ .ino = ino,
+ .wd = -1,
+ .fd = -1,
+ .inotify_data = inotify_data,
+ };
+
+ r = hashmap_put(inotify_data->inodes, d, d);
+ if (r < 0) {
+ free(d);
+ return r;
+ }
+
+ if (ret)
+ *ret = d;
+
+ return 1;
+}
+
+static uint32_t inode_data_determine_mask(struct inode_data *d) {
+ bool excl_unlink = true;
+ uint32_t combined = 0;
+
+ assert(d);
+
+ /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
+ * the IN_EXCL_UNLINK flag is ANDed instead.
+ *
+ * Note that we add all sources to the mask here, regardless of whether they are enabled, disabled or oneshot. That's
+ * because we cannot change the mask anymore after the event source has been created, since the kernel has no
+ * API for that. Hence we need to subscribe to the maximum mask we might ever be interested in, and suppress
+ * events we don't care about client-side. */
+
+ LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
+
+ if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
+ excl_unlink = false;
+
+ combined |= s->inotify.mask;
+ }
+
+ return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
+}
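+
+ /* Illustrative example (assumed values, not part of the library): if one event source on this inode
+ * requests IN_CREATE|IN_EXCL_UNLINK and another requests IN_DELETE without IN_EXCL_UNLINK, the
+ * combined kernel-side mask becomes IN_CREATE|IN_DELETE -- IN_EXCL_UNLINK is dropped because it is
+ * ANDed and not all sources asked for it. Events that do not match a particular source's own mask
+ * are then filtered client-side at dispatch time. */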
+
+static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
+ uint32_t combined_mask;
+ int wd, r;
+
+ assert(d);
+ assert(d->fd >= 0);
+
+ combined_mask = inode_data_determine_mask(d);
+
+ if (d->wd >= 0 && combined_mask == d->combined_mask)
+ return 0;
+
+ r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
+ if (r < 0)
+ return r;
+
+ wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
+ if (wd < 0)
+ return -errno;
+
+ if (d->wd < 0) {
+ r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
+ if (r < 0) {
+ (void) inotify_rm_watch(d->inotify_data->fd, wd);
+ return r;
+ }
+
+ d->wd = wd;
+
+ } else if (d->wd != wd) {
+
+ log_debug("Weird, the watch descriptor we already knew for this inode changed?");
+ (void) inotify_rm_watch(d->fd, wd);
+ return -EINVAL;
+ }
+
+ d->combined_mask = combined_mask;
+ return 1;
+}
+
+static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
+ assert(s);
+
+ return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
+}
+
+static int event_add_inotify_fd_internal(
+ sd_event *e,
+ sd_event_source **ret,
+ int fd,
+ bool donate,
+ uint32_t mask,
+ sd_event_inotify_handler_t callback,
+ void *userdata) {
+
+ _cleanup_close_ int donated_fd = donate ? fd : -1;
+ _cleanup_(source_freep) sd_event_source *s = NULL;
+ struct inotify_data *inotify_data = NULL;
+ struct inode_data *inode_data = NULL;
+ struct stat st;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(fd >= 0, -EBADF);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ if (!callback)
+ callback = inotify_exit_callback;
+
+ /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
+ * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD type operations for you, hence
+ * callers cannot use it themselves. */
+ if (mask & IN_MASK_ADD)
+ return -EINVAL;
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ s = source_new(e, !ret, SOURCE_INOTIFY);
+ if (!s)
+ return -ENOMEM;
+
+ s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
+ s->inotify.mask = mask;
+ s->inotify.callback = callback;
+ s->userdata = userdata;
+
+ /* Allocate an inotify object for this priority, and an inode object within it */
+ r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
+ if (r < 0)
+ return r;
+
+ r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
+ if (r < 0) {
+ event_gc_inotify_data(e, inotify_data);
+ return r;
+ }
+
+ /* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority
+ * of the event source until then, which requires access to the original inode. */
+ if (inode_data->fd < 0) {
+ if (donated_fd >= 0)
+ inode_data->fd = TAKE_FD(donated_fd);
+ else {
+ inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+ if (inode_data->fd < 0) {
+ r = -errno;
+ event_gc_inode_data(e, inode_data);
+ return r;
+ }
+ }
+
+ LIST_PREPEND(to_close, e->inode_data_to_close_list, inode_data);
+ }
+
+ /* Link our event source to the inode data object */
+ LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
+ s->inotify.inode_data = inode_data;
+
+ /* Actually realize the watch now */
+ r = inode_data_realize_watch(e, inode_data);
+ if (r < 0)
+ return r;
+
+ if (ret)
+ *ret = s;
+ TAKE_PTR(s);
+
+ return 0;
+}
+
+_public_ int sd_event_add_inotify_fd(
+ sd_event *e,
+ sd_event_source **ret,
+ int fd,
+ uint32_t mask,
+ sd_event_inotify_handler_t callback,
+ void *userdata) {
+
+ return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
+}
+
+_public_ int sd_event_add_inotify(
+ sd_event *e,
+ sd_event_source **ret,
+ const char *path,
+ uint32_t mask,
+ sd_event_inotify_handler_t callback,
+ void *userdata) {
+
+ sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
+ int fd, r;
+
+ assert_return(path, -EINVAL);
+
+ fd = open(path, O_PATH|O_CLOEXEC|
+ (mask & IN_ONLYDIR ? O_DIRECTORY : 0)|
+ (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
+ if (fd < 0)
+ return -errno;
+
+ r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(s, path);
+
+ if (ret)
+ *ret = s;
+
+ return r;
+}
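+
+ /* Illustrative sketch (not part of the library): watching a directory with the call above. The
+ * handler name on_dir_event() and the watched path are hypothetical.
+ *
+ *     #include <sys/inotify.h>
+ *     #include <systemd/sd-event.h>
+ *
+ *     static int on_dir_event(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
+ *             // ev->mask describes what happened; if ev->len > 0, ev->name names the child entry
+ *             return 0;
+ *     }
+ *
+ *     int watch_dir(sd_event *e) {
+ *             sd_event_source *src = NULL;
+ *             return sd_event_add_inotify(e, &src, "/tmp/watched",
+ *                                         IN_CREATE|IN_MOVED_TO|IN_ONLYDIR,
+ *                                         on_dir_event, NULL);
+ *     }
+ */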
+
+static sd_event_source* event_source_free(sd_event_source *s) {
+ if (!s)
+ return NULL;
+
+ /* Here's a special hack: when we are called from a
+ * dispatch handler we won't free the event source
+ * immediately, but we will detach the fd from the
+ * epoll. This way it is safe for the caller to unref
+ * the event source and immediately close the fd, but
+ * we still retain a valid event source object after
+ * the callback. */
+
+ if (s->dispatching)
+ source_disconnect(s);
+ else
+ source_free(s);
+
+ return NULL;
+}
+
+DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
+
+_public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
+ assert_return(s, -EINVAL);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ return free_and_strdup(&s->description, description);
+}
+
+_public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
+ assert_return(s, -EINVAL);
+ assert_return(description, -EINVAL);
+
+ if (!s->description)
+ return -ENXIO;
+
+ *description = s->description;
+ return 0;
+}
+
+_public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
+ assert_return(s, NULL);
+
+ return s->event;
+}
+
+_public_ int sd_event_source_get_pending(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type != SOURCE_EXIT, -EDOM);
+ assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ return s->pending;
+}
+
+_public_ int sd_event_source_get_io_fd(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_IO, -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ return s->io.fd;
+}
+
+_public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
+ int r;
+
+ assert_return(s, -EINVAL);
+ assert_return(fd >= 0, -EBADF);
+ assert_return(s->type == SOURCE_IO, -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ if (s->io.fd == fd)
+ return 0;
+
+ if (event_source_is_offline(s)) {
+ s->io.fd = fd;
+ s->io.registered = false;
+ } else {
+ int saved_fd;
+
+ saved_fd = s->io.fd;
+ assert(s->io.registered);
+
+ s->io.fd = fd;
+ s->io.registered = false;
+
+ r = source_io_register(s, s->enabled, s->io.events);
+ if (r < 0) {
+ s->io.fd = saved_fd;
+ s->io.registered = true;
+ return r;
+ }
+
+ (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
+ }
+
+ return 0;
+}
+
+_public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_IO, -EDOM);
+
+ return s->io.owned;
+}
+
+_public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_IO, -EDOM);
+
+ s->io.owned = own;
+ return 0;
+}
+
+_public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
+ assert_return(s, -EINVAL);
+ assert_return(events, -EINVAL);
+ assert_return(s->type == SOURCE_IO, -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ *events = s->io.events;
+ return 0;
+}
+
+_public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
+ int r;
+
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_IO, -EDOM);
+ assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
+ assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ /* edge-triggered updates are never skipped, so we can reset edges */
+ if (s->io.events == events && !(events & EPOLLET))
+ return 0;
+
+ r = source_set_pending(s, false);
+ if (r < 0)
+ return r;
+
+ if (event_source_is_online(s)) {
+ r = source_io_register(s, s->enabled, events);
+ if (r < 0)
+ return r;
+ }
+
+ s->io.events = events;
+
+ return 0;
+}
+
+_public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
+ assert_return(s, -EINVAL);
+ assert_return(revents, -EINVAL);
+ assert_return(s->type == SOURCE_IO, -EDOM);
+ assert_return(s->pending, -ENODATA);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ *revents = s->io.revents;
+ return 0;
+}
+
+_public_ int sd_event_source_get_signal(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_SIGNAL, -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ return s->signal.sig;
+}
+
+_public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
+ assert_return(s, -EINVAL);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ *priority = s->priority;
+ return 0;
+}
+
+_public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
+ bool rm_inotify = false, rm_inode = false;
+ struct inotify_data *new_inotify_data = NULL;
+ struct inode_data *new_inode_data = NULL;
+ int r;
+
+ assert_return(s, -EINVAL);
+ assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ if (s->priority == priority)
+ return 0;
+
+ if (s->type == SOURCE_INOTIFY) {
+ struct inode_data *old_inode_data;
+
+ assert(s->inotify.inode_data);
+ old_inode_data = s->inotify.inode_data;
+
+ /* We need the original fd to change the priority. If we don't have it we can't change the priority
+ * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
+ * events we allow priority changes only until the first following iteration. */
+ if (old_inode_data->fd < 0)
+ return -EOPNOTSUPP;
+
+ r = event_make_inotify_data(s->event, priority, &new_inotify_data);
+ if (r < 0)
+ return r;
+ rm_inotify = r > 0;
+
+ r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
+ if (r < 0)
+ goto fail;
+ rm_inode = r > 0;
+
+ if (new_inode_data->fd < 0) {
+ /* Duplicate the fd for the new inode object if we don't have any yet */
+ new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
+ if (new_inode_data->fd < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ LIST_PREPEND(to_close, s->event->inode_data_to_close_list, new_inode_data);
+ }
+
+ /* Move the event source to the new inode data structure */
+ LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
+ LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
+ s->inotify.inode_data = new_inode_data;
+
+ /* Now create the new watch */
+ r = inode_data_realize_watch(s->event, new_inode_data);
+ if (r < 0) {
+ /* Move it back */
+ LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
+ LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
+ s->inotify.inode_data = old_inode_data;
+ goto fail;
+ }
+
+ s->priority = priority;
+
+ event_gc_inode_data(s->event, old_inode_data);
+
+ } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
+ struct signal_data *old, *d;
+
+ /* Move us from the signalfd belonging to the old
+ * priority to the signalfd of the new priority */
+
+ assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
+
+ s->priority = priority;
+
+ r = event_make_signal_data(s->event, s->signal.sig, &d);
+ if (r < 0) {
+ s->priority = old->priority;
+ return r;
+ }
+
+ event_unmask_signal_data(s->event, old, s->signal.sig);
+ } else
+ s->priority = priority;
+
+ event_source_pp_prioq_reshuffle(s);
+
+ if (s->type == SOURCE_EXIT)
+ prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
+
+ return 0;
+
+fail:
+ if (rm_inode)
+ event_free_inode_data(s->event, new_inode_data);
+
+ if (rm_inotify)
+ event_free_inotify_data(s->event, new_inotify_data);
+
+ return r;
+}
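+
+ /* Illustrative sketch (not part of the library): adjusting the priority of a source. Note that, as
+ * implemented above, the priority of an inotify source can only be changed until the first event loop
+ * iteration after it was created (afterwards -EOPNOTSUPP is returned), since the O_PATH fd needed to
+ * re-realize the watch is closed at that point. The source pointer 'src' is hypothetical.
+ *
+ *     int r = sd_event_source_set_priority(src, SD_EVENT_PRIORITY_IMPORTANT);
+ *     if (r < 0)
+ *             return r;
+ */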
+
+_public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
+ /* Quick mode: the event source doesn't exist and we only want to query boolean enablement state. */
+ if (!s && !ret)
+ return false;
+
+ assert_return(s, -EINVAL);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ if (ret)
+ *ret = s->enabled;
+
+ return s->enabled != SD_EVENT_OFF;
+}
+
+static int event_source_offline(
+ sd_event_source *s,
+ int enabled,
+ bool ratelimited) {
+
+ bool was_offline;
+ int r;
+
+ assert(s);
+ assert(enabled == SD_EVENT_OFF || ratelimited);
+
+ /* Unset the pending flag when this event source is disabled */
+ if (s->enabled != SD_EVENT_OFF &&
+ enabled == SD_EVENT_OFF &&
+ !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
+ r = source_set_pending(s, false);
+ if (r < 0)
+ return r;
+ }
+
+ was_offline = event_source_is_offline(s);
+ s->enabled = enabled;
+ s->ratelimited = ratelimited;
+
+ switch (s->type) {
+
+ case SOURCE_IO:
+ source_io_unregister(s);
+ break;
+
+ case SOURCE_SIGNAL:
+ event_gc_signal_data(s->event, &s->priority, s->signal.sig);
+ break;
+
+ case SOURCE_CHILD:
+ if (!was_offline) {
+ assert(s->event->n_online_child_sources > 0);
+ s->event->n_online_child_sources--;
+ }
+
+ if (EVENT_SOURCE_WATCH_PIDFD(s))
+ source_child_pidfd_unregister(s);
+ else
+ event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+ break;
+
+ case SOURCE_EXIT:
+ prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
+ break;
+
+ case SOURCE_TIME_REALTIME:
+ case SOURCE_TIME_BOOTTIME:
+ case SOURCE_TIME_MONOTONIC:
+ case SOURCE_TIME_REALTIME_ALARM:
+ case SOURCE_TIME_BOOTTIME_ALARM:
+ case SOURCE_DEFER:
+ case SOURCE_POST:
+ case SOURCE_INOTIFY:
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
+ event_source_time_prioq_reshuffle(s);
+
+ return 1;
+}
+
+static int event_source_online(
+ sd_event_source *s,
+ int enabled,
+ bool ratelimited) {
+
+ bool was_online;
+ int r;
+
+ assert(s);
+ assert(enabled != SD_EVENT_OFF || !ratelimited);
+
+ /* Unset the pending flag when this event source is enabled */
+ if (s->enabled == SD_EVENT_OFF &&
+ enabled != SD_EVENT_OFF &&
+ !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
+ r = source_set_pending(s, false);
+ if (r < 0)
+ return r;
+ }
+
+ /* Are we really ready for onlining? */
+ if (enabled == SD_EVENT_OFF || ratelimited) {
+ /* Nope, we are not ready for onlining, so just update the precise state and exit */
+ s->enabled = enabled;
+ s->ratelimited = ratelimited;
+ return 0;
+ }
+
+ was_online = event_source_is_online(s);
+
+ switch (s->type) {
+ case SOURCE_IO:
+ r = source_io_register(s, enabled, s->io.events);
+ if (r < 0)
+ return r;
+ break;
+
+ case SOURCE_SIGNAL:
+ r = event_make_signal_data(s->event, s->signal.sig, NULL);
+ if (r < 0) {
+ event_gc_signal_data(s->event, &s->priority, s->signal.sig);
+ return r;
+ }
+
+ break;
+
+ case SOURCE_CHILD:
+ if (EVENT_SOURCE_WATCH_PIDFD(s)) {
+ /* yes, we have pidfd */
+
+ r = source_child_pidfd_register(s, enabled);
+ if (r < 0)
+ return r;
+ } else {
+ /* no pidfd, or we are watching for something other than WEXITED */
+
+ r = event_make_signal_data(s->event, SIGCHLD, NULL);
+ if (r < 0) {
+ event_gc_signal_data(s->event, &s->priority, SIGCHLD);
+ return r;
+ }
+ }
+
+ if (!was_online)
+ s->event->n_online_child_sources++;
+ break;
+
+ case SOURCE_TIME_REALTIME:
+ case SOURCE_TIME_BOOTTIME:
+ case SOURCE_TIME_MONOTONIC:
+ case SOURCE_TIME_REALTIME_ALARM:
+ case SOURCE_TIME_BOOTTIME_ALARM:
+ case SOURCE_EXIT:
+ case SOURCE_DEFER:
+ case SOURCE_POST:
+ case SOURCE_INOTIFY:
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ s->enabled = enabled;
+ s->ratelimited = ratelimited;
+
+ /* Non-failing operations below */
+ if (s->type == SOURCE_EXIT)
+ prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
+
+ /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
+ event_source_time_prioq_reshuffle(s);
+
+ return 1;
+}
+
+_public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
+ int r;
+
+ assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
+
+ /* Quick mode: if the source doesn't exist, SD_EVENT_OFF is a noop. */
+ if (m == SD_EVENT_OFF && !s)
+ return 0;
+
+ assert_return(s, -EINVAL);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
+ if (s->event->state == SD_EVENT_FINISHED)
+ return m == SD_EVENT_OFF ? 0 : -ESTALE;
+
+ if (s->enabled == m) /* No change? */
+ return 0;
+
+ if (m == SD_EVENT_OFF)
+ r = event_source_offline(s, m, s->ratelimited);
+ else {
+ if (s->enabled != SD_EVENT_OFF) {
+ /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
+ * event source is already enabled after all. */
+ s->enabled = m;
+ return 0;
+ }
+
+ r = event_source_online(s, m, s->ratelimited);
+ }
+ if (r < 0)
+ return r;
+
+ event_source_pp_prioq_reshuffle(s);
+ return 0;
+}
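+
+ /* Illustrative sketch (not part of the library): the three enablement states handled above. 'src' is
+ * a hypothetical, previously created event source.
+ *
+ *     (void) sd_event_source_set_enabled(src, SD_EVENT_OFF);     // fully disabled, never dispatched
+ *     (void) sd_event_source_set_enabled(src, SD_EVENT_ONESHOT); // dispatched once, then set back to SD_EVENT_OFF
+ *     (void) sd_event_source_set_enabled(src, SD_EVENT_ON);      // dispatched whenever it triggers
+ */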
+
+_public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
+ assert_return(s, -EINVAL);
+ assert_return(usec, -EINVAL);
+ assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ *usec = s->time.next;
+ return 0;
+}
+
+_public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
+ int r;
+
+ assert_return(s, -EINVAL);
+ assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
+ assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ r = source_set_pending(s, false);
+ if (r < 0)
+ return r;
+
+ s->time.next = usec;
+
+ event_source_time_prioq_reshuffle(s);
+ return 0;
+}
+
+_public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
+ usec_t t;
+ int r;
+
+ assert_return(s, -EINVAL);
+ assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
+
+ if (usec == USEC_INFINITY)
+ return sd_event_source_set_time(s, USEC_INFINITY);
+
+ r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
+ if (r < 0)
+ return r;
+
+ usec = usec_add(t, usec);
+ if (usec == USEC_INFINITY)
+ return -EOVERFLOW;
+
+ return sd_event_source_set_time(s, usec);
+}
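+
+ /* Illustrative sketch (not part of the library): re-arming an existing timer source relative to the
+ * current time of its clock, which is what the helper above does internally via sd_event_now().
+ * 'timer_src' is a hypothetical SOURCE_TIME_* event source.
+ *
+ *     // fire again in roughly 5 seconds (subject to the source's accuracy)
+ *     int r = sd_event_source_set_time_relative(timer_src, UINT64_C(5000000));
+ *     if (r < 0)
+ *             return r;
+ *     r = sd_event_source_set_enabled(timer_src, SD_EVENT_ONESHOT);
+ */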
+
+_public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
+ assert_return(s, -EINVAL);
+ assert_return(usec, -EINVAL);
+ assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ *usec = s->time.accuracy;
+ return 0;
+}
+
+_public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
+ int r;
+
+ assert_return(s, -EINVAL);
+ assert_return(usec != UINT64_MAX, -EINVAL);
+ assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
+ assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ r = source_set_pending(s, false);
+ if (r < 0)
+ return r;
+
+ if (usec == 0)
+ usec = DEFAULT_ACCURACY_USEC;
+
+ s->time.accuracy = usec;
+
+ event_source_time_prioq_reshuffle(s);
+ return 0;
+}
+
+_public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
+ assert_return(s, -EINVAL);
+ assert_return(clock, -EINVAL);
+ assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ *clock = event_source_type_to_clock(s->type);
+ return 0;
+}
+
+_public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
+ assert_return(s, -EINVAL);
+ assert_return(pid, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ *pid = s->child.pid;
+ return 0;
+}
+
+_public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ if (s->child.pidfd < 0)
+ return -EOPNOTSUPP;
+
+ return s->child.pidfd;
+}
+
+_public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+ assert_return(SIGNAL_VALID(sig), -EINVAL);
+
+ /* If we have already seen an indication that the process exited, refuse sending a signal early. This
+ * way we can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
+ * available. */
+ if (s->child.exited)
+ return -ESRCH;
+
+ if (s->child.pidfd >= 0) {
+ siginfo_t copy;
+
+ /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
+ * structure here */
+ if (si)
+ copy = *si;
+
+ if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
+ /* Let's propagate the error only if the system call is not implemented or prohibited */
+ if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+ return -errno;
+ } else
+ return 0;
+ }
+
+ /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
+ * this here. */
+ if (flags != 0)
+ return -EOPNOTSUPP;
+
+ if (si) {
+ /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
+ siginfo_t copy = *si;
+
+ if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
+ return -errno;
+ } else if (kill(s->child.pid, sig) < 0)
+ return -errno;
+
+ return 0;
+}
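+
+ /* Illustrative sketch (not part of the library): asking a watched child to terminate via the helper
+ * above, which prefers pidfd_send_signal() and falls back to kill()/rt_sigqueueinfo(). 'child_src' is
+ * a hypothetical SOURCE_CHILD event source.
+ *
+ *     int r = sd_event_source_send_child_signal(child_src, SIGTERM, NULL, 0);
+ *     if (r == -ESRCH)
+ *             ; // the child has already exited, nothing to signal
+ *     else if (r < 0)
+ *             return r;
+ */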
+
+_public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+
+ if (s->child.pidfd < 0)
+ return -EOPNOTSUPP;
+
+ return s->child.pidfd_owned;
+}
+
+_public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+
+ if (s->child.pidfd < 0)
+ return -EOPNOTSUPP;
+
+ s->child.pidfd_owned = own;
+ return 0;
+}
+
+_public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+
+ return s->child.process_owned;
+}
+
+_public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type == SOURCE_CHILD, -EDOM);
+
+ s->child.process_owned = own;
+ return 0;
+}
+
+_public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
+ assert_return(s, -EINVAL);
+ assert_return(mask, -EINVAL);
+ assert_return(s->type == SOURCE_INOTIFY, -EDOM);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ *mask = s->inotify.mask;
+ return 0;
+}
+
+_public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
+ int r;
+
+ assert_return(s, -EINVAL);
+ assert_return(s->type != SOURCE_EXIT, -EDOM);
+ assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(s->event), -ECHILD);
+
+ if (s->prepare == callback)
+ return 0;
+
+ if (callback && s->prepare) {
+ s->prepare = callback;
+ return 0;
+ }
+
+ r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
+ if (r < 0)
+ return r;
+
+ s->prepare = callback;
+
+ if (callback) {
+ r = prioq_put(s->event->prepare, s, &s->prepare_index);
+ if (r < 0)
+ return r;
+ } else
+ prioq_remove(s->event->prepare, s, &s->prepare_index);
+
+ return 0;
+}
+
+_public_ void* sd_event_source_get_userdata(sd_event_source *s) {
+ assert_return(s, NULL);
+
+ return s->userdata;
+}
+
+_public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
+ void *ret;
+
+ assert_return(s, NULL);
+
+ ret = s->userdata;
+ s->userdata = userdata;
+
+ return ret;
+}
+
+static int event_source_enter_ratelimited(sd_event_source *s) {
+ int r;
+
+ assert(s);
+
+ /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, keyed
+ * to the end of the rate limit time window, much as if it were a timer event source. */
+
+ if (s->ratelimited)
+ return 0; /* Already ratelimited, this is a NOP hence */
+
+ /* Make sure we can install a CLOCK_MONOTONIC event further down. */
+ r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
+ if (r < 0)
+ return r;
+
+ /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
+ * first remove them from the prioq appropriate for their own clock, so that we can then reuse the
+ * event source's prioq fields for adding it to the CLOCK_MONOTONIC prioq instead. */
+ if (EVENT_SOURCE_IS_TIME(s->type))
+ event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
+
+ /* Now, let's add the event source to the monotonic clock instead */
+ r = event_source_time_prioq_put(s, &s->event->monotonic);
+ if (r < 0)
+ goto fail;
+
+ /* And let's take the event source officially offline */
+ r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
+ if (r < 0) {
+ event_source_time_prioq_remove(s, &s->event->monotonic);
+ goto fail;
+ }
+
+ event_source_pp_prioq_reshuffle(s);
+
+ log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
+ return 0;
+
+fail:
+ /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
+ * space for it should already be allocated. */
+ if (EVENT_SOURCE_IS_TIME(s->type))
+ assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
+
+ return r;
+}
+
+static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
+ int r;
+
+ assert(s);
+
+ if (!s->ratelimited)
+ return 0;
+
+ /* Let's take the event source out of the monotonic prioq first. */
+ event_source_time_prioq_remove(s, &s->event->monotonic);
+
+ /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
+ if (EVENT_SOURCE_IS_TIME(s->type)) {
+ r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
+ if (r < 0)
+ goto fail;
+ }
+
+ /* Let's try to take it online again. */
+ r = event_source_online(s, s->enabled, /* ratelimited= */ false);
+ if (r < 0) {
+ /* Do something roughly sensible when this failed: undo the two prioq ops above */
+ if (EVENT_SOURCE_IS_TIME(s->type))
+ event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
+
+ goto fail;
+ }
+
+ event_source_pp_prioq_reshuffle(s);
+ ratelimit_reset(&s->rate_limit);
+
+ log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
+
+ if (run_callback && s->ratelimit_expire_callback) {
+ s->dispatching = true;
+ r = s->ratelimit_expire_callback(s, s->userdata);
+ s->dispatching = false;
+
+ if (r < 0) {
+ log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
+ strna(s->description),
+ event_source_type_to_string(s->type),
+ s->exit_on_failure ? "exiting" : "disabling");
+
+ if (s->exit_on_failure)
+ (void) sd_event_exit(s->event, r);
+ }
+
+ if (s->n_ref == 0)
+ source_free(s);
+ else if (r < 0)
+ assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
+
+ return 1;
+ }
+
+ return 0;
+
+fail:
+ /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
+ * simply put it back into ratelimited mode; maybe we can then process it more successfully next iteration. */
+ assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
+
+ return r;
+}
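+
+ /* Illustrative sketch (not part of the library): the enter/leave helpers above back the public rate
+ * limiting API. A client might cap a busy source to 10 dispatches per second like this; 'io_src' and
+ * on_ratelimit_expired() are hypothetical, and sd_event_source_set_ratelimit_expire_callback() is
+ * assumed to be available in this libsystemd version.
+ *
+ *     int r = sd_event_source_set_ratelimit(io_src, UINT64_C(1000000), 10); // interval 1s, burst 10
+ *     if (r < 0)
+ *             return r;
+ *     // optionally get notified when the rate limit window ends and the source comes back online:
+ *     r = sd_event_source_set_ratelimit_expire_callback(io_src, on_ratelimit_expired);
+ */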
+
+static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
+ usec_t c;
+ assert(e);
+ assert(a <= b);
+
+ if (a <= 0)
+ return 0;
+ if (a >= USEC_INFINITY)
+ return USEC_INFINITY;
+
+ if (b <= a + 1)
+ return a;
+
+ initialize_perturb(e);
+
+ /*
+ Find a good time to wake up again between times a and b. We
+ have two goals here:
+
+ a) We want to wake up as seldom as possible, hence prefer
+ later times over earlier times.
+
+ b) But if we have to wake up, then let's make sure to
+ dispatch as much as possible on the entire system.
+
+ We implement this by waking up everywhere at the same time
+ within any given minute if we can, synchronised via the
+ perturbation value determined from the boot ID. If we can't,
+ then we try to find the same spot within every 10s, then every 1s
+ and finally every 250ms step. Otherwise, we pick the last possible
+ time to wake up.
+ */
+
+ c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
+ if (c >= b) {
+ if (_unlikely_(c < USEC_PER_MINUTE))
+ return b;
+
+ c -= USEC_PER_MINUTE;
+ }
+
+ if (c >= a)
+ return c;
+
+ c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
+ if (c >= b) {
+ if (_unlikely_(c < USEC_PER_SEC*10))
+ return b;
+
+ c -= USEC_PER_SEC*10;
+ }
+
+ if (c >= a)
+ return c;
+
+ c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
+ if (c >= b) {
+ if (_unlikely_(c < USEC_PER_SEC))
+ return b;
+
+ c -= USEC_PER_SEC;
+ }
+
+ if (c >= a)
+ return c;
+
+ c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
+ if (c >= b) {
+ if (_unlikely_(c < USEC_PER_MSEC*250))
+ return b;
+
+ c -= USEC_PER_MSEC*250;
+ }
+
+ if (c >= a)
+ return c;
+
+ return b;
+}
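+
+ /* Worked example for the logic above (values made up, shown in seconds instead of microseconds for
+ * brevity): with a = 112, b = 140 and perturb = 47, the per-minute candidate is (140/60)*60 + 47 = 167,
+ * which is >= b, so 60 is subtracted, giving 107 < a, i.e. no fit. The 10s-granularity candidate is
+ * (140/10)*10 + (47 % 10) = 147, again >= b, so 10 is subtracted, giving 137, which lies within [a, b]
+ * and is returned. */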
+
+static int event_arm_timer(
+ sd_event *e,
+ struct clock_data *d) {
+
+ struct itimerspec its = {};
+ sd_event_source *a, *b;
+ usec_t t;
+
+ assert(e);
+ assert(d);
+
+ if (!d->needs_rearm)
+ return 0;
+
+ d->needs_rearm = false;
+
+ a = prioq_peek(d->earliest);
+ assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
+ if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
+
+ if (d->fd < 0)
+ return 0;
+
+ if (d->next == USEC_INFINITY)
+ return 0;
+
+ /* disarm */
+ if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
+ return -errno;
+
+ d->next = USEC_INFINITY;
+ return 0;
+ }
+
+ b = prioq_peek(d->latest);
+ assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
+ assert(b && b->enabled != SD_EVENT_OFF);
+
+ t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
+ if (d->next == t)
+ return 0;
+
+ assert_se(d->fd >= 0);
+
+ if (t == 0) {
+ /* We don't want to disarm here, so just pick some time looooong ago. */
+ its.it_value.tv_sec = 0;
+ its.it_value.tv_nsec = 1;
+ } else
+ timespec_store(&its.it_value, t);
+
+ if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
+ return -errno;
+
+ d->next = t;
+ return 0;
+}
+
+static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
+ assert(e);
+ assert(s);
+ assert(s->type == SOURCE_IO);
+
+ /* If the event source was already pending, we just OR in the
+ * new revents, otherwise we reset the value. The ORing is
+ * necessary to handle EPOLLONESHOT events properly where
+ * readability might happen independently of writability, and
+ * we need to keep track of both */
+
+ if (s->pending)
+ s->io.revents |= revents;
+ else
+ s->io.revents = revents;
+
+ return source_set_pending(s, true);
+}
+
+static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
+ uint64_t x;
+ ssize_t ss;
+
+ assert(e);
+ assert(fd >= 0);
+
+ assert_return(events == EPOLLIN, -EIO);
+
+ ss = read(fd, &x, sizeof(x));
+ if (ss < 0) {
+ if (ERRNO_IS_TRANSIENT(errno))
+ return 0;
+
+ return -errno;
+ }
+
+ if (_unlikely_(ss != sizeof(x)))
+ return -EIO;
+
+ if (next)
+ *next = USEC_INFINITY;
+
+ return 0;
+}
+
+static int process_timer(
+ sd_event *e,
+ usec_t n,
+ struct clock_data *d) {
+
+ sd_event_source *s;
+ bool callback_invoked = false;
+ int r;
+
+ assert(e);
+ assert(d);
+
+ for (;;) {
+ s = prioq_peek(d->earliest);
+ assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
+
+ if (!s || time_event_source_next(s) > n)
+ break;
+
+ if (s->ratelimited) {
+ /* This is an event source whose ratelimit window has ended. Let's turn it on
+ * again. */
+ assert(s->ratelimited);
+
+ r = event_source_leave_ratelimit(s, /* run_callback */ true);
+ if (r < 0)
+ return r;
+ else if (r == 1)
+ callback_invoked = true;
+
+ continue;
+ }
+
+ if (s->enabled == SD_EVENT_OFF || s->pending)
+ break;
+
+ r = source_set_pending(s, true);
+ if (r < 0)
+ return r;
+
+ event_source_time_prioq_reshuffle(s);
+ }
+
+ return callback_invoked;
+}
+
+static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
+ int64_t min_priority = threshold;
+ bool something_new = false;
+ sd_event_source *s;
+ int r;
+
+ assert(e);
+ assert(ret_min_priority);
+
+ if (!e->need_process_child) {
+ *ret_min_priority = min_priority;
+ return 0;
+ }
+
+ e->need_process_child = false;
+
+ /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
+ * for, instead of using P_ALL. This is because we only want to get child information of very
+ * specific child processes, and not all of them. We might not have processed the SIGCHLD event
+ * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
+ * hence we really don't want anything flushed out of the kernel's queue that we don't care
+ * about. Since this is O(n) this means that if you have a lot of processes you probably want
+ * to handle SIGCHLD yourself.
+ *
+ * We do not reap the children here (by using WNOWAIT), this is only done after the event
+ * source is dispatched so that the callback still sees the process as a zombie. */
+
+ HASHMAP_FOREACH(s, e->child_sources) {
+ assert(s->type == SOURCE_CHILD);
+
+ if (s->priority > threshold)
+ continue;
+
+ if (s->pending)
+ continue;
+
+ if (event_source_is_offline(s))
+ continue;
+
+ if (s->child.exited)
+ continue;
+
+ if (EVENT_SOURCE_WATCH_PIDFD(s))
+ /* There's a usable pidfd known for this event source? Then don't waitid() for
+ * it here */
+ continue;
+
+ zero(s->child.siginfo);
+ if (waitid(P_PID, s->child.pid, &s->child.siginfo,
+ WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
+ return negative_errno();
+
+ if (s->child.siginfo.si_pid != 0) {
+ bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
+
+ if (zombie)
+ s->child.exited = true;
+
+ if (!zombie && (s->child.options & WEXITED)) {
+ /* If the child isn't dead then let's immediately remove the state
+ * change from the queue, since there's no benefit in leaving it
+ * queued. */
+
+ assert(s->child.options & (WSTOPPED|WCONTINUED));
+ (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
+ }
+
+ r = source_set_pending(s, true);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ something_new = true;
+ min_priority = MIN(min_priority, s->priority);
+ }
+ }
+ }
+
+ *ret_min_priority = min_priority;
+ return something_new;
+}
+
+static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
+ assert(e);
+ assert(s);
+ assert(s->type == SOURCE_CHILD);
+
+ if (s->pending)
+ return 0;
+
+ if (event_source_is_offline(s))
+ return 0;
+
+ if (!EVENT_SOURCE_WATCH_PIDFD(s))
+ return 0;
+
+ zero(s->child.siginfo);
+ if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
+ return -errno;
+
+ if (s->child.siginfo.si_pid == 0)
+ return 0;
+
+ if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
+ s->child.exited = true;
+
+ return source_set_pending(s, true);
+}
+
+static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
+ int r;
+
+ assert(e);
+ assert(d);
+ assert_return(events == EPOLLIN, -EIO);
+ assert(min_priority);
+
+ /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
+ * sure to recheck the children we watch. This is because we only ever dequeue the first signal
+ * per priority, and if we dequeue one, SIGCHLD might be enqueued after it and we wouldn't know,
+ * but we might have higher priority children we care about, hence we need to check for them
+ * explicitly. */
+
+ if (sigismember(&d->sigset, SIGCHLD))
+ e->need_process_child = true;
+
+ /* If there's already an event source pending for this priority we don't read another */
+ if (d->current)
+ return 0;
+
+ for (;;) {
+ struct signalfd_siginfo si;
+ ssize_t n;
+ sd_event_source *s = NULL;
+
+ n = read(d->fd, &si, sizeof(si));
+ if (n < 0) {
+ if (ERRNO_IS_TRANSIENT(errno))
+ return 0;
+
+ return -errno;
+ }
+
+ if (_unlikely_(n != sizeof(si)))
+ return -EIO;
+
+ assert(SIGNAL_VALID(si.ssi_signo));
+
+ if (e->signal_sources)
+ s = e->signal_sources[si.ssi_signo];
+ if (!s)
+ continue;
+ if (s->pending)
+ continue;
+
+ s->signal.siginfo = si;
+ d->current = s;
+
+ r = source_set_pending(s, true);
+ if (r < 0)
+ return r;
+ if (r > 0 && *min_priority >= s->priority) {
+ *min_priority = s->priority;
+ return 1; /* an event source with smaller priority is queued. */
+ }
+
+ return 0;
+ }
+}
+
+static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
+ ssize_t n;
+
+ assert(e);
+ assert(d);
+
+ assert_return(revents == EPOLLIN, -EIO);
+
+ /* If there's already an event source pending for this priority, don't read another */
+ if (d->n_pending > 0)
+ return 0;
+
+ /* Is the read buffer non-empty? If so, let's not read more */
+ if (d->buffer_filled > 0)
+ return 0;
+
+ if (d->priority > threshold)
+ return 0;
+
+ n = read(d->fd, &d->buffer, sizeof(d->buffer));
+ if (n < 0) {
+ if (ERRNO_IS_TRANSIENT(errno))
+ return 0;
+
+ return -errno;
+ }
+
+ assert(n > 0);
+ d->buffer_filled = (size_t) n;
+ LIST_PREPEND(buffered, e->buffered_inotify_data_list, d);
+
+ return 1;
+}
+
+static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
+ assert(e);
+ assert(d);
+ assert(sz <= d->buffer_filled);
+
+ if (sz == 0)
+ return;
+
+ /* Move the rest of the buffer to the front, in order to get things properly aligned again */
+ memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
+ d->buffer_filled -= sz;
+
+ if (d->buffer_filled == 0)
+ LIST_REMOVE(buffered, e->buffered_inotify_data_list, d);
+}
+
+static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
+ int r;
+
+ assert(e);
+ assert(d);
+
+ /* If there's already an event source pending for this priority, don't read another */
+ if (d->n_pending > 0)
+ return 0;
+
+ while (d->buffer_filled > 0) {
+ size_t sz;
+
+ /* Let's validate that the event structures are complete */
+ if (d->buffer_filled < offsetof(struct inotify_event, name))
+ return -EIO;
+
+ sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
+ if (d->buffer_filled < sz)
+ return -EIO;
+
+ if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
+ struct inode_data *inode_data;
+
+ /* The queue overran, let's pass this event to all event sources connected to this inotify
+ * object */
+
+ HASHMAP_FOREACH(inode_data, d->inodes)
+ LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
+
+ if (event_source_is_offline(s))
+ continue;
+
+ r = source_set_pending(s, true);
+ if (r < 0)
+ return r;
+ }
+ } else {
+ struct inode_data *inode_data;
+
+ /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
+ * our watch descriptor table. */
+ if (d->buffer.ev.mask & IN_IGNORED) {
+
+ inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
+ if (!inode_data) {
+ event_inotify_data_drop(e, d, sz);
+ continue;
+ }
+
+ /* The watch descriptor was removed by the kernel, let's drop it here too */
+ inode_data->wd = -1;
+ } else {
+ inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
+ if (!inode_data) {
+ event_inotify_data_drop(e, d, sz);
+ continue;
+ }
+ }
+
+ /* Trigger all event sources that are interested in these events. Also trigger all event
+ * sources if IN_IGNORED or IN_UNMOUNT is set. */
+ LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
+
+ if (event_source_is_offline(s))
+ continue;
+
+ if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
+ (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
+ continue;
+
+ r = source_set_pending(s, true);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ /* Something pending now? If so, let's finish, otherwise let's read more. */
+ if (d->n_pending > 0)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int process_inotify(sd_event *e) {
+ int r, done = 0;
+
+ assert(e);
+
+ LIST_FOREACH(buffered, d, e->buffered_inotify_data_list) {
+ r = event_inotify_data_process(e, d);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ done++;
+ }
+
+ return done;
+}
+
+static int source_dispatch(sd_event_source *s) {
+ EventSourceType saved_type;
+ sd_event *saved_event;
+ int r = 0;
+
+ assert(s);
+ assert(s->pending || s->type == SOURCE_EXIT);
+
+ /* Save the event source type here, so that we still know it after the event callback, which might
+ * invalidate the event. */
+ saved_type = s->type;
+
+ /* Similarly, store a reference to the event loop object, so that we can still access it after the
+ * callback might have invalidated/disconnected the event source. */
+ saved_event = s->event;
+ PROTECT_EVENT(saved_event);
+
+ /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
+ assert(!s->ratelimited);
+ if (!ratelimit_below(&s->rate_limit)) {
+ r = event_source_enter_ratelimited(s);
+ if (r < 0)
+ return r;
+
+ return 1;
+ }
+
+ if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
+ r = source_set_pending(s, false);
+ if (r < 0)
+ return r;
+ }
+
+ if (s->type != SOURCE_POST) {
+ sd_event_source *z;
+
+ /* If we execute a non-post source, let's mark all post sources as pending. */
+
+ SET_FOREACH(z, s->event->post_sources) {
+ if (event_source_is_offline(z))
+ continue;
+
+ r = source_set_pending(z, true);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ if (s->enabled == SD_EVENT_ONESHOT) {
+ r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
+ if (r < 0)
+ return r;
+ }
+
+ s->dispatching = true;
+
+ switch (s->type) {
+
+ case SOURCE_IO:
+ r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
+ break;
+
+ case SOURCE_TIME_REALTIME:
+ case SOURCE_TIME_BOOTTIME:
+ case SOURCE_TIME_MONOTONIC:
+ case SOURCE_TIME_REALTIME_ALARM:
+ case SOURCE_TIME_BOOTTIME_ALARM:
+ r = s->time.callback(s, s->time.next, s->userdata);
+ break;
+
+ case SOURCE_SIGNAL:
+ r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
+ break;
+
+ case SOURCE_CHILD: {
+ bool zombie;
+
+ zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
+
+ r = s->child.callback(s, &s->child.siginfo, s->userdata);
+
+ /* Now, reap the PID for good. */
+ if (zombie) {
+ (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
+ s->child.waited = true;
+ }
+
+ break;
+ }
+
+ case SOURCE_DEFER:
+ r = s->defer.callback(s, s->userdata);
+ break;
+
+ case SOURCE_POST:
+ r = s->post.callback(s, s->userdata);
+ break;
+
+ case SOURCE_EXIT:
+ r = s->exit.callback(s, s->userdata);
+ break;
+
+ case SOURCE_INOTIFY: {
+ struct sd_event *e = s->event;
+ struct inotify_data *d;
+ size_t sz;
+
+ assert(s->inotify.inode_data);
+ assert_se(d = s->inotify.inode_data->inotify_data);
+
+ assert(d->buffer_filled >= offsetof(struct inotify_event, name));
+ sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
+ assert(d->buffer_filled >= sz);
+
+ /* If the inotify callback destroys the event source then this likely means we don't need to
+ * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
+ * free it immediately, then we couldn't drop the event from the inotify event queue without
+ * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
+ * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
+ * explicitly GC it after we are done dropping the inotify event from the buffer. */
+ d->n_busy++;
+ r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
+ d->n_busy--;
+
+ /* When no event is pending anymore on this inotify object, then let's drop the event from
+ * the inotify event queue buffer. */
+ if (d->n_pending == 0)
+ event_inotify_data_drop(e, d, sz);
+
+ /* Now we don't want to access 'd' anymore, it's OK to GC now. */
+ event_gc_inotify_data(e, d);
+ break;
+ }
+
+ case SOURCE_WATCHDOG:
+ case _SOURCE_EVENT_SOURCE_TYPE_MAX:
+ case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
+ assert_not_reached();
+ }
+
+ s->dispatching = false;
+
+ if (r < 0) {
+ log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
+ strna(s->description),
+ event_source_type_to_string(saved_type),
+ s->exit_on_failure ? "exiting" : "disabling");
+
+ if (s->exit_on_failure)
+ (void) sd_event_exit(saved_event, r);
+ }
+
+ if (s->n_ref == 0)
+ source_free(s);
+ else if (r < 0)
+ assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
+
+ return 1;
+}
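+
+ /* Illustrative note (not part of the library): as dispatched above, a handler that returns a negative
+ * value gets its source disabled (and, if sd_event_source_set_exit_on_failure() was enabled for it,
+ * additionally requests loop exit via sd_event_exit()). A defensive handler can therefore handle errors
+ * itself and return 0 to keep the loop running; on_io() below is hypothetical:
+ *
+ *     static int on_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+ *             if (revents & (EPOLLHUP|EPOLLERR))
+ *                     return sd_event_source_set_enabled(s, SD_EVENT_OFF); // stop watching, keep the loop alive
+ *             return 0;
+ *     }
+ */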
+
+static int event_prepare(sd_event *e) {
+ int r;
+
+ assert(e);
+
+ for (;;) {
+ sd_event_source *s;
+
+ s = prioq_peek(e->prepare);
+ if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
+ break;
+
+ s->prepare_iteration = e->iteration;
+ r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
+ if (r < 0)
+ return r;
+
+ assert(s->prepare);
+
+ s->dispatching = true;
+ r = s->prepare(s, s->userdata);
+ s->dispatching = false;
+
+ if (r < 0) {
+ log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
+ strna(s->description),
+ event_source_type_to_string(s->type),
+ s->exit_on_failure ? "exiting" : "disabling");
+
+ if (s->exit_on_failure)
+ (void) sd_event_exit(e, r);
+ }
+
+ if (s->n_ref == 0)
+ source_free(s);
+ else if (r < 0)
+ assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
+ }
+
+ return 0;
+}
+
+static int dispatch_exit(sd_event *e) {
+ sd_event_source *p;
+ int r;
+
+ assert(e);
+
+ p = prioq_peek(e->exit);
+ assert(!p || p->type == SOURCE_EXIT);
+
+ if (!p || event_source_is_offline(p)) {
+ e->state = SD_EVENT_FINISHED;
+ return 0;
+ }
+
+ PROTECT_EVENT(e);
+ e->iteration++;
+ e->state = SD_EVENT_EXITING;
+ r = source_dispatch(p);
+ e->state = SD_EVENT_INITIAL;
+ return r;
+}
+
+static sd_event_source* event_next_pending(sd_event *e) {
+ sd_event_source *p;
+
+ assert(e);
+
+ p = prioq_peek(e->pending);
+ if (!p)
+ return NULL;
+
+ if (event_source_is_offline(p))
+ return NULL;
+
+ return p;
+}
+
+static int arm_watchdog(sd_event *e) {
+ struct itimerspec its = {};
+ usec_t t;
+
+ assert(e);
+ assert(e->watchdog_fd >= 0);
+
+ t = sleep_between(e,
+ usec_add(e->watchdog_last, (e->watchdog_period / 2)),
+ usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
+
+ timespec_store(&its.it_value, t);
+
+ /* Make sure we never set the watchdog to 0, which tells the
+ * kernel to disable it. */
+ if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
+ its.it_value.tv_nsec = 1;
+
+ return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
+}
+
+static int process_watchdog(sd_event *e) {
+ assert(e);
+
+ if (!e->watchdog)
+ return 0;
+
+ /* Don't notify watchdog too often */
+ if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
+ return 0;
+
+ sd_notify(false, "WATCHDOG=1");
+ e->watchdog_last = e->timestamp.monotonic;
+
+ return arm_watchdog(e);
+}
+
+static void event_close_inode_data_fds(sd_event *e) {
+ struct inode_data *d;
+
+ assert(e);
+
+ /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
+ * filesystems. But we can't close them right away as we need them as long as the user still wants to make
+ * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
+ * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
+ * compromise. */
+
+ while ((d = e->inode_data_to_close_list)) {
+ assert(d->fd >= 0);
+ d->fd = safe_close(d->fd);
+
+ LIST_REMOVE(to_close, e->inode_data_to_close_list, d);
+ }
+}
+
+_public_ int sd_event_prepare(sd_event *e) {
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(!event_pid_changed(e), -ECHILD);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
+
+ /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
+ * this check here once, since gettid() is typically not cached, and we thus want to minimize
+ * syscalls. */
+ assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
+
+ /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
+ PROTECT_EVENT(e);
+
+ if (e->exit_requested)
+ goto pending;
+
+ e->iteration++;
+
+ e->state = SD_EVENT_PREPARING;
+ r = event_prepare(e);
+ e->state = SD_EVENT_INITIAL;
+ if (r < 0)
+ return r;
+
+ r = event_arm_timer(e, &e->realtime);
+ if (r < 0)
+ return r;
+
+ r = event_arm_timer(e, &e->boottime);
+ if (r < 0)
+ return r;
+
+ r = event_arm_timer(e, &e->monotonic);
+ if (r < 0)
+ return r;
+
+ r = event_arm_timer(e, &e->realtime_alarm);
+ if (r < 0)
+ return r;
+
+ r = event_arm_timer(e, &e->boottime_alarm);
+ if (r < 0)
+ return r;
+
+ event_close_inode_data_fds(e);
+
+ if (event_next_pending(e) || e->need_process_child || e->buffered_inotify_data_list)
+ goto pending;
+
+ e->state = SD_EVENT_ARMED;
+
+ return 0;
+
+pending:
+ e->state = SD_EVENT_ARMED;
+ r = sd_event_wait(e, 0);
+ if (r == 0)
+ e->state = SD_EVENT_ARMED;
+
+ return r;
+}
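+
+ /* Illustrative sketch (not part of the library): sd_event_prepare() is one step of the low-level
+ * loop; the sequence a caller (or sd_event_run()) follows is roughly:
+ *
+ *     int run_one_iteration(sd_event *e, uint64_t timeout_usec) {
+ *             int r = sd_event_prepare(e);                 // r > 0: something is already pending
+ *             if (r == 0)
+ *                     r = sd_event_wait(e, timeout_usec);  // r > 0: events became pending
+ *             if (r > 0)
+ *                     r = sd_event_dispatch(e);            // dispatch one pending event source
+ *             return r;
+ *     }
+ */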
+
+static int epoll_wait_usec(
+ int fd,
+ struct epoll_event *events,
+ int maxevents,
+ usec_t timeout) {
+
+ int msec;
+#if 0
+ static bool epoll_pwait2_absent = false;
+ int r;
+
+ /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not.
+ *
+ * FIXME: this is temporarily disabled until epoll_pwait2() becomes more widely available.
+ * See https://github.com/systemd/systemd/pull/18973 and
+ * https://github.com/systemd/systemd/issues/19052. */
+
+ if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
+ r = epoll_pwait2(fd,
+ events,
+ maxevents,
+ TIMESPEC_STORE(timeout),
+ NULL);
+ if (r >= 0)
+ return r;
+ if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+ return -errno; /* Only fallback to old epoll_wait() if the syscall is masked or not
+ * supported. */
+
+ epoll_pwait2_absent = true;
+ }
+#endif
+
+ if (timeout == USEC_INFINITY)
+ msec = -1;
+ else {
+ usec_t k;
+
+ k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
+ if (k >= INT_MAX)
+ msec = INT_MAX; /* Saturate */
+ else
+ msec = (int) k;
+ }
+
+ return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
+}
+
+static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
+ size_t n_event_queue, m, n_event_max;
+ int64_t min_priority = threshold;
+ bool something_new = false;
+ int r;
+
+ assert(e);
+ assert(ret_min_priority);
+
+ n_event_queue = MAX(e->n_sources, 1u);
+ if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
+ return -ENOMEM;
+
+ n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
+
+ /* If we still have inotify data buffered, then query the other fds, but don't wait for them */
+ if (e->buffered_inotify_data_list)
+ timeout = 0;
+
+ for (;;) {
+ r = epoll_wait_usec(
+ e->epoll_fd,
+ e->event_queue,
+ n_event_max,
+ timeout);
+ if (r < 0)
+ return r;
+
+ m = (size_t) r;
+
+ if (m < n_event_max)
+ break;
+
+ if (n_event_max >= n_event_queue * 10)
+ break;
+
+ if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
+ return -ENOMEM;
+
+ n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
+ timeout = 0;
+ }
+
+ /* Set the timestamp only when this is called for the first time. */
+ if (threshold == INT64_MAX)
+ triple_timestamp_get(&e->timestamp);
+
+ for (size_t i = 0; i < m; i++) {
+
+ if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
+ r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
+ else {
+ WakeupType *t = e->event_queue[i].data.ptr;
+
+ switch (*t) {
+
+ case WAKEUP_EVENT_SOURCE: {
+ sd_event_source *s = e->event_queue[i].data.ptr;
+
+ assert(s);
+
+ if (s->priority > threshold)
+ continue;
+
+ min_priority = MIN(min_priority, s->priority);
+
+ switch (s->type) {
+
+ case SOURCE_IO:
+ r = process_io(e, s, e->event_queue[i].events);
+ break;
+
+ case SOURCE_CHILD:
+ r = process_pidfd(e, s, e->event_queue[i].events);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ break;
+ }
+
+ case WAKEUP_CLOCK_DATA: {
+ struct clock_data *d = e->event_queue[i].data.ptr;
+
+ assert(d);
+
+ r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
+ break;
+ }
+
+ case WAKEUP_SIGNAL_DATA:
+ r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
+ break;
+
+ case WAKEUP_INOTIFY_DATA:
+ r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ }
+ if (r < 0)
+ return r;
+ if (r > 0)
+ something_new = true;
+ }
+
+ *ret_min_priority = min_priority;
+ return something_new;
+}
+
+_public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(!event_pid_changed(e), -ECHILD);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
+
+ if (e->exit_requested) {
+ e->state = SD_EVENT_PENDING;
+ return 1;
+ }
+
+ for (int64_t threshold = INT64_MAX; ; threshold--) {
+ int64_t epoll_min_priority, child_min_priority;
+
+                /* New epoll (especially IO) and child events may be triggered just after the
+                 * process_epoll() call but before process_child(), and the new IO events may have a
+                 * higher priority than the child events. To salvage these events, let's call
+                 * epoll_wait() again, but accept only events with a higher priority than the
+                 * previous ones. See issue https://github.com/systemd/systemd/issues/18190 and comments
+                 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
+                 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
+
+ r = process_epoll(e, timeout, threshold, &epoll_min_priority);
+ if (r == -EINTR) {
+ e->state = SD_EVENT_PENDING;
+ return 1;
+ }
+ if (r < 0)
+ goto finish;
+ if (r == 0 && threshold < INT64_MAX)
+ /* No new epoll event. */
+ break;
+
+ r = process_child(e, threshold, &child_min_priority);
+ if (r < 0)
+ goto finish;
+ if (r == 0)
+ /* No new child event. */
+ break;
+
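+                /* Restrict the next iteration to sources strictly more urgent (i.e. with a numerically
+                 * smaller priority) than anything we just collected; once we are at INT64_MIN nothing can
+                 * be more urgent, hence stop. */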
+ threshold = MIN(epoll_min_priority, child_min_priority);
+ if (threshold == INT64_MIN)
+ break;
+
+ timeout = 0;
+ }
+
+ r = process_watchdog(e);
+ if (r < 0)
+ goto finish;
+
+ r = process_inotify(e);
+ if (r < 0)
+ goto finish;
+
+ r = process_timer(e, e->timestamp.realtime, &e->realtime);
+ if (r < 0)
+ goto finish;
+
+ r = process_timer(e, e->timestamp.boottime, &e->boottime);
+ if (r < 0)
+ goto finish;
+
+ r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
+ if (r < 0)
+ goto finish;
+
+ r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
+ if (r < 0)
+ goto finish;
+
+ r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
+ if (r < 0)
+ goto finish;
+ else if (r == 1) {
+                /* A ratelimit expiry callback was called. Let's postpone processing pending sources and
+                 * put the loop back into the initial state, so that the next iteration also evaluates
+                 * sources that were potentially re-enabled by the callback.
+                 *
+                 * Wondering why we treat only this invocation of process_timer() differently? Once an event
+                 * source is ratelimited we essentially turn it into a CLOCK_MONOTONIC timer, hence the
+                 * ratelimit expiry callback is never called for any other timer type. */
+ r = 0;
+ goto finish;
+ }
+
+ if (event_next_pending(e)) {
+ e->state = SD_EVENT_PENDING;
+ return 1;
+ }
+
+ r = 0;
+
+finish:
+ e->state = SD_EVENT_INITIAL;
+
+ return r;
+}
+
+_public_ int sd_event_dispatch(sd_event *e) {
+ sd_event_source *p;
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(!event_pid_changed(e), -ECHILD);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
+
+ if (e->exit_requested)
+ return dispatch_exit(e);
+
+ p = event_next_pending(e);
+ if (p) {
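+                /* Keep a reference to the event loop while dispatching, so that the callback cannot drop
+                 * the last reference under our feet. */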
+ PROTECT_EVENT(e);
+
+ e->state = SD_EVENT_RUNNING;
+ r = source_dispatch(p);
+ e->state = SD_EVENT_INITIAL;
+ return r;
+ }
+
+ e->state = SD_EVENT_INITIAL;
+
+ return 1;
+}
+
+static void event_log_delays(sd_event *e) {
+ char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
+ size_t l, i;
+
+ p = b;
+ l = sizeof(b);
+ for (i = 0; i < ELEMENTSOF(e->delays); i++) {
+ l = strpcpyf(&p, l, "%u ", e->delays[i]);
+ e->delays[i] = 0;
+ }
+ log_debug("Event loop iterations: %s", b);
+}
+
+_public_ int sd_event_run(sd_event *e, uint64_t timeout) {
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(!event_pid_changed(e), -ECHILD);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
+
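+        /* If delay profiling is enabled, bucket the time elapsed since the previous run by log2 and dump
+         * the histogram every five seconds. */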
+ if (e->profile_delays && e->last_run_usec != 0) {
+ usec_t this_run;
+ unsigned l;
+
+ this_run = now(CLOCK_MONOTONIC);
+
+ l = log2u64(this_run - e->last_run_usec);
+ assert(l < ELEMENTSOF(e->delays));
+ e->delays[l]++;
+
+ if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
+ event_log_delays(e);
+ e->last_log_usec = this_run;
+ }
+ }
+
+ /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
+ PROTECT_EVENT(e);
+
+ r = sd_event_prepare(e);
+ if (r == 0)
+ /* There was nothing? Then wait... */
+ r = sd_event_wait(e, timeout);
+
+ if (e->profile_delays)
+ e->last_run_usec = now(CLOCK_MONOTONIC);
+
+ if (r > 0) {
+ /* There's something now, then let's dispatch it */
+ r = sd_event_dispatch(e);
+ if (r < 0)
+ return r;
+
+ return 1;
+ }
+
+ return r;
+}
+
+_public_ int sd_event_loop(sd_event *e) {
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(!event_pid_changed(e), -ECHILD);
+ assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
+
+ PROTECT_EVENT(e);
+
+ while (e->state != SD_EVENT_FINISHED) {
+ r = sd_event_run(e, UINT64_MAX);
+ if (r < 0)
+ return r;
+ }
+
+ return e->exit_code;
+}
+
+_public_ int sd_event_get_fd(sd_event *e) {
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ return e->epoll_fd;
+}
+
+_public_ int sd_event_get_state(sd_event *e) {
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ return e->state;
+}
+
+_public_ int sd_event_get_exit_code(sd_event *e, int *code) {
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(code, -EINVAL);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ if (!e->exit_requested)
+ return -ENODATA;
+
+ *code = e->exit_code;
+ return 0;
+}
+
+_public_ int sd_event_exit(sd_event *e, int code) {
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ e->exit_requested = true;
+ e->exit_code = code;
+
+ return 0;
+}
+
+_public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(usec, -EINVAL);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
+ return -EOPNOTSUPP;
+
+ if (!triple_timestamp_is_set(&e->timestamp)) {
+ /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
+ *usec = now(clock);
+ return 1;
+ }
+
+ *usec = triple_timestamp_by_clock(&e->timestamp, clock);
+ return 0;
+}
+
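+/* Returns the calling thread's default event loop, allocating it on first use and binding it to the
+ * thread; passing NULL merely queries whether a default event loop exists already. */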
+_public_ int sd_event_default(sd_event **ret) {
+ sd_event *e = NULL;
+ int r;
+
+ if (!ret)
+ return !!default_event;
+
+ if (default_event) {
+ *ret = sd_event_ref(default_event);
+ return 0;
+ }
+
+ r = sd_event_new(&e);
+ if (r < 0)
+ return r;
+
+ e->default_event_ptr = &default_event;
+ e->tid = gettid();
+ default_event = e;
+
+ *ret = e;
+ return 1;
+}
+
+_public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(tid, -EINVAL);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ if (e->tid != 0) {
+ *tid = e->tid;
+ return 0;
+ }
+
+ return -ENXIO;
+}
+
+_public_ int sd_event_set_watchdog(sd_event *e, int b) {
+ int r;
+
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ if (e->watchdog == !!b)
+ return e->watchdog;
+
+ if (b) {
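+                /* Enable: query the configured watchdog interval via sd_watchdog_enabled() (nothing to do
+                 * if none is set), then create a CLOCK_MONOTONIC timerfd, arm it and hook it into the
+                 * epoll using the SOURCE_WATCHDOG sentinel as wakeup pointer. */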
+ r = sd_watchdog_enabled(false, &e->watchdog_period);
+ if (r <= 0)
+ return r;
+
+ /* Issue first ping immediately */
+ sd_notify(false, "WATCHDOG=1");
+ e->watchdog_last = now(CLOCK_MONOTONIC);
+
+ e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
+ if (e->watchdog_fd < 0)
+ return -errno;
+
+ r = arm_watchdog(e);
+ if (r < 0)
+ goto fail;
+
+ struct epoll_event ev = {
+ .events = EPOLLIN,
+ .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
+ };
+
+ if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ } else {
+ if (e->watchdog_fd >= 0) {
+ (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
+ e->watchdog_fd = safe_close(e->watchdog_fd);
+ }
+ }
+
+ e->watchdog = !!b;
+ return e->watchdog;
+
+fail:
+ e->watchdog_fd = safe_close(e->watchdog_fd);
+ return r;
+}
+
+_public_ int sd_event_get_watchdog(sd_event *e) {
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ return e->watchdog;
+}
+
+_public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
+ assert_return(e, -EINVAL);
+ assert_return(e = event_resolve(e), -ENOPKG);
+ assert_return(!event_pid_changed(e), -ECHILD);
+
+ *ret = e->iteration;
+ return 0;
+}
+
+_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
+ assert_return(s, -EINVAL);
+
+ s->destroy_callback = callback;
+ return 0;
+}
+
+_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
+ assert_return(s, -EINVAL);
+
+ if (ret)
+ *ret = s->destroy_callback;
+
+ return !!s->destroy_callback;
+}
+
+_public_ int sd_event_source_get_floating(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+
+ return s->floating;
+}
+
+_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
+ assert_return(s, -EINVAL);
+
+ if (s->floating == !!b)
+ return 0;
+
+ if (!s->event) /* Already disconnected */
+ return -ESTALE;
+
+ s->floating = b;
+
+ if (b) {
+ sd_event_source_ref(s);
+ sd_event_unref(s->event);
+ } else {
+ sd_event_ref(s->event);
+ sd_event_source_unref(s);
+ }
+
+ return 1;
+}
+
+_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type != SOURCE_EXIT, -EDOM);
+
+ return s->exit_on_failure;
+}
+
+_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
+ assert_return(s, -EINVAL);
+ assert_return(s->type != SOURCE_EXIT, -EDOM);
+
+ if (s->exit_on_failure == !!b)
+ return 0;
+
+ s->exit_on_failure = b;
+ return 1;
+}
+
+_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
+ int r;
+
+ assert_return(s, -EINVAL);
+
+        /* Turning on ratelimiting for event source types that don't support it is a programming error,
+         * and hence a loggable offense. */
+ assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
+
+ /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
+ * non-ratelimited. */
+ r = event_source_leave_ratelimit(s, /* run_callback */ false);
+ if (r < 0)
+ return r;
+
+ s->rate_limit = (RateLimit) { interval, burst };
+ return 0;
+}
+
+_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
+ assert_return(s, -EINVAL);
+
+ s->ratelimit_expire_callback = callback;
+ return 0;
+}
+
+_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
+ assert_return(s, -EINVAL);
+
+        /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
+         * don't use assert_return(). Unlike turning on ratelimiting, it's not really a programming error. */
+ if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
+ return -EDOM;
+
+ if (!ratelimit_configured(&s->rate_limit))
+ return -ENOEXEC;
+
+ if (ret_interval)
+ *ret_interval = s->rate_limit.interval;
+ if (ret_burst)
+ *ret_burst = s->rate_limit.burst;
+
+ return 0;
+}
+
+_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
+ assert_return(s, -EINVAL);
+
+ if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
+ return false;
+
+ if (!ratelimit_configured(&s->rate_limit))
+ return false;
+
+ return s->ratelimited;
+}
+
+_public_ int sd_event_set_signal_exit(sd_event *e, int b) {
+ bool change = false;
+ int r;
+
+ assert_return(e, -EINVAL);
+
+ if (b) {
+                /* We want to maintain pointers to these event sources, so that we can destroy them when told
+                 * to. But we also don't want them to pin the event loop itself. Hence we mark them as
+                 * floating after creation (and undo this before deleting them again). */
+
+ if (!e->sigint_event_source) {
+ r = sd_event_add_signal(e, &e->sigint_event_source, SIGINT | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
+ if (r < 0)
+ return r;
+
+ assert(sd_event_source_set_floating(e->sigint_event_source, true) >= 0);
+ change = true;
+ }
+
+ if (!e->sigterm_event_source) {
+ r = sd_event_add_signal(e, &e->sigterm_event_source, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, NULL, NULL);
+ if (r < 0) {
+ if (change) {
+ assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
+ e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
+ }
+
+ return r;
+ }
+
+ assert(sd_event_source_set_floating(e->sigterm_event_source, true) >= 0);
+ change = true;
+ }
+
+ } else {
+ if (e->sigint_event_source) {
+ assert(sd_event_source_set_floating(e->sigint_event_source, false) >= 0);
+ e->sigint_event_source = sd_event_source_unref(e->sigint_event_source);
+ change = true;
+ }
+
+ if (e->sigterm_event_source) {
+ assert(sd_event_source_set_floating(e->sigterm_event_source, false) >= 0);
+ e->sigterm_event_source = sd_event_source_unref(e->sigterm_event_source);
+ change = true;
+ }
+ }
+
+ return change;
+}
diff --git a/src/libsystemd/sd-event/test-event.c b/src/libsystemd/sd-event/test-event.c
new file mode 100644
index 0000000..33cd1c0
--- /dev/null
+++ b/src/libsystemd/sd-event/test-event.c
@@ -0,0 +1,812 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/wait.h>
+
+#include "sd-event.h"
+
+#include "alloc-util.h"
+#include "exec-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "log.h"
+#include "macro.h"
+#include "missing_syscall.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "rm-rf.h"
+#include "signal-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "tests.h"
+#include "tmpfile-util.h"
+#include "util.h"
+
+static int prepare_handler(sd_event_source *s, void *userdata) {
+ log_info("preparing %c", PTR_TO_INT(userdata));
+ return 1;
+}
+
+static bool got_a, got_b, got_c, got_unref;
+static unsigned got_d;
+
+static int unref_handler(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+ sd_event_source_unref(s);
+ got_unref = true;
+ return 0;
+}
+
+static int io_handler(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+
+ log_info("got IO on %c", PTR_TO_INT(userdata));
+
+ if (userdata == INT_TO_PTR('a')) {
+ assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
+ assert_se(!got_a);
+ got_a = true;
+ } else if (userdata == INT_TO_PTR('b')) {
+ assert_se(!got_b);
+ got_b = true;
+ } else if (userdata == INT_TO_PTR('d')) {
+ got_d++;
+ if (got_d < 2)
+ assert_se(sd_event_source_set_enabled(s, SD_EVENT_ONESHOT) >= 0);
+ else
+ assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
+ } else
+ assert_not_reached();
+
+ return 1;
+}
+
+static int child_handler(sd_event_source *s, const siginfo_t *si, void *userdata) {
+
+ assert_se(s);
+ assert_se(si);
+
+ assert_se(si->si_uid == getuid());
+ assert_se(si->si_signo == SIGCHLD);
+ assert_se(si->si_code == CLD_EXITED);
+ assert_se(si->si_status == 78);
+
+ log_info("got child on %c", PTR_TO_INT(userdata));
+
+ assert_se(userdata == INT_TO_PTR('f'));
+
+ assert_se(sd_event_exit(sd_event_source_get_event(s), 0) >= 0);
+ sd_event_source_unref(s);
+
+ return 1;
+}
+
+static int signal_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
+ sd_event_source *p = NULL;
+ pid_t pid;
+ siginfo_t plain_si;
+
+ assert_se(s);
+ assert_se(si);
+
+ log_info("got signal on %c", PTR_TO_INT(userdata));
+
+ assert_se(userdata == INT_TO_PTR('e'));
+
+ assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGUSR2, -1) >= 0);
+
+ pid = fork();
+ assert_se(pid >= 0);
+
+ if (pid == 0) {
+ sigset_t ss;
+
+ assert_se(sigemptyset(&ss) >= 0);
+ assert_se(sigaddset(&ss, SIGUSR2) >= 0);
+
+ zero(plain_si);
+ assert_se(sigwaitinfo(&ss, &plain_si) >= 0);
+
+ assert_se(plain_si.si_signo == SIGUSR2);
+ assert_se(plain_si.si_value.sival_int == 4711);
+
+ _exit(78);
+ }
+
+ assert_se(sd_event_add_child(sd_event_source_get_event(s), &p, pid, WEXITED, child_handler, INT_TO_PTR('f')) >= 0);
+ assert_se(sd_event_source_set_enabled(p, SD_EVENT_ONESHOT) >= 0);
+ assert_se(sd_event_source_set_child_process_own(p, true) >= 0);
+
+        /* We can't use structured initialization here, since the structure contains various unions, and
+         * these fields lie in overlapping (carefully aligned) union members that LLVM does not allow to be
+         * assigned via an initializer. */
+ zero(plain_si);
+ plain_si.si_signo = SIGUSR2;
+ plain_si.si_code = SI_QUEUE;
+ plain_si.si_pid = getpid();
+ plain_si.si_uid = getuid();
+ plain_si.si_value.sival_int = 4711;
+
+ assert_se(sd_event_source_send_child_signal(p, SIGUSR2, &plain_si, 0) >= 0);
+
+ sd_event_source_unref(s);
+
+ return 1;
+}
+
+static int defer_handler(sd_event_source *s, void *userdata) {
+ sd_event_source *p = NULL;
+
+ assert_se(s);
+
+ log_info("got defer on %c", PTR_TO_INT(userdata));
+
+ assert_se(userdata == INT_TO_PTR('d'));
+
+ assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGUSR1, -1) >= 0);
+
+ assert_se(sd_event_add_signal(sd_event_source_get_event(s), &p, SIGUSR1, signal_handler, INT_TO_PTR('e')) >= 0);
+ assert_se(sd_event_source_set_enabled(p, SD_EVENT_ONESHOT) >= 0);
+ raise(SIGUSR1);
+
+ sd_event_source_unref(s);
+
+ return 1;
+}
+
+static bool do_quit;
+
+static int time_handler(sd_event_source *s, uint64_t usec, void *userdata) {
+ log_info("got timer on %c", PTR_TO_INT(userdata));
+
+ if (userdata == INT_TO_PTR('c')) {
+
+ if (do_quit) {
+ sd_event_source *p;
+
+ assert_se(sd_event_add_defer(sd_event_source_get_event(s), &p, defer_handler, INT_TO_PTR('d')) >= 0);
+ assert_se(sd_event_source_set_enabled(p, SD_EVENT_ONESHOT) >= 0);
+ } else {
+ assert_se(!got_c);
+ got_c = true;
+ }
+ } else
+ assert_not_reached();
+
+ return 2;
+}
+
+static bool got_exit = false;
+
+static int exit_handler(sd_event_source *s, void *userdata) {
+ log_info("got quit handler on %c", PTR_TO_INT(userdata));
+
+ got_exit = true;
+
+ return 3;
+}
+
+static bool got_post = false;
+
+static int post_handler(sd_event_source *s, void *userdata) {
+ log_info("got post handler");
+
+ got_post = true;
+
+ return 2;
+}
+
+static void test_basic_one(bool with_pidfd) {
+ sd_event *e = NULL;
+ sd_event_source *w = NULL, *x = NULL, *y = NULL, *z = NULL, *q = NULL, *t = NULL;
+ static const char ch = 'x';
+        int a[2] = { -1, -1 }, b[2] = { -1, -1 }, d[2] = { -1, -1 }, k[2] = { -1, -1 };
+ uint64_t event_now;
+ int64_t priority;
+
+ log_info("/* %s(pidfd=%s) */", __func__, yes_no(with_pidfd));
+
+ assert_se(setenv("SYSTEMD_PIDFD", yes_no(with_pidfd), 1) >= 0);
+
+ assert_se(pipe(a) >= 0);
+ assert_se(pipe(b) >= 0);
+ assert_se(pipe(d) >= 0);
+ assert_se(pipe(k) >= 0);
+
+ assert_se(sd_event_default(&e) >= 0);
+ assert_se(sd_event_now(e, CLOCK_MONOTONIC, &event_now) > 0);
+
+ assert_se(sd_event_set_watchdog(e, true) >= 0);
+
+        /* Test whether we can cleanly destroy an io event source from its own handler */
+ got_unref = false;
+ assert_se(sd_event_add_io(e, &t, k[0], EPOLLIN, unref_handler, NULL) >= 0);
+ assert_se(write(k[1], &ch, 1) == 1);
+ assert_se(sd_event_run(e, UINT64_MAX) >= 1);
+ assert_se(got_unref);
+
+ got_a = false, got_b = false, got_c = false, got_d = 0;
+
+        /* Add a oneshot handler, trigger it, re-enable it, and trigger it again. */
+ assert_se(sd_event_add_io(e, &w, d[0], EPOLLIN, io_handler, INT_TO_PTR('d')) >= 0);
+ assert_se(sd_event_source_set_enabled(w, SD_EVENT_ONESHOT) >= 0);
+ assert_se(write(d[1], &ch, 1) >= 0);
+ assert_se(sd_event_run(e, UINT64_MAX) >= 1);
+ assert_se(got_d == 1);
+ assert_se(write(d[1], &ch, 1) >= 0);
+ assert_se(sd_event_run(e, UINT64_MAX) >= 1);
+ assert_se(got_d == 2);
+
+ assert_se(sd_event_add_io(e, &x, a[0], EPOLLIN, io_handler, INT_TO_PTR('a')) >= 0);
+ assert_se(sd_event_add_io(e, &y, b[0], EPOLLIN, io_handler, INT_TO_PTR('b')) >= 0);
+
+ do_quit = false;
+ assert_se(sd_event_add_time(e, &z, CLOCK_MONOTONIC, 0, 0, time_handler, INT_TO_PTR('c')) >= 0);
+ assert_se(sd_event_add_exit(e, &q, exit_handler, INT_TO_PTR('g')) >= 0);
+
+ assert_se(sd_event_source_set_priority(x, 99) >= 0);
+ assert_se(sd_event_source_get_priority(x, &priority) >= 0);
+ assert_se(priority == 99);
+ assert_se(sd_event_source_set_enabled(y, SD_EVENT_ONESHOT) >= 0);
+ assert_se(sd_event_source_set_prepare(x, prepare_handler) >= 0);
+ assert_se(sd_event_source_set_priority(z, 50) >= 0);
+ assert_se(sd_event_source_set_enabled(z, SD_EVENT_ONESHOT) >= 0);
+ assert_se(sd_event_source_set_prepare(z, prepare_handler) >= 0);
+
+ /* Test for floating event sources */
+ assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGRTMIN+1, -1) >= 0);
+ assert_se(sd_event_add_signal(e, NULL, SIGRTMIN+1, NULL, NULL) >= 0);
+
+ assert_se(write(a[1], &ch, 1) >= 0);
+ assert_se(write(b[1], &ch, 1) >= 0);
+
+ assert_se(!got_a && !got_b && !got_c);
+
+ assert_se(sd_event_run(e, UINT64_MAX) >= 1);
+
+ assert_se(!got_a && got_b && !got_c);
+
+ assert_se(sd_event_run(e, UINT64_MAX) >= 1);
+
+ assert_se(!got_a && got_b && got_c);
+
+ assert_se(sd_event_run(e, UINT64_MAX) >= 1);
+
+ assert_se(got_a && got_b && got_c);
+
+ sd_event_source_unref(x);
+ sd_event_source_unref(y);
+
+ do_quit = true;
+ assert_se(sd_event_add_post(e, NULL, post_handler, NULL) >= 0);
+ assert_se(sd_event_now(e, CLOCK_MONOTONIC, &event_now) == 0);
+ assert_se(sd_event_source_set_time(z, event_now + 200 * USEC_PER_MSEC) >= 0);
+ assert_se(sd_event_source_set_enabled(z, SD_EVENT_ONESHOT) >= 0);
+
+ assert_se(sd_event_loop(e) >= 0);
+ assert_se(got_post);
+ assert_se(got_exit);
+
+ sd_event_source_unref(z);
+ sd_event_source_unref(q);
+
+ sd_event_source_unref(w);
+
+ sd_event_unref(e);
+
+ safe_close_pair(a);
+ safe_close_pair(b);
+ safe_close_pair(d);
+ safe_close_pair(k);
+
+ assert_se(unsetenv("SYSTEMD_PIDFD") >= 0);
+}
+
+TEST(basic) {
+ test_basic_one(true); /* test with pidfd */
+ test_basic_one(false); /* test without pidfd */
+}
+
+TEST(sd_event_now) {
+ _cleanup_(sd_event_unrefp) sd_event *e = NULL;
+ uint64_t event_now;
+
+ assert_se(sd_event_new(&e) >= 0);
+ assert_se(sd_event_now(e, CLOCK_MONOTONIC, &event_now) > 0);
+ assert_se(sd_event_now(e, CLOCK_REALTIME, &event_now) > 0);
+ assert_se(sd_event_now(e, CLOCK_REALTIME_ALARM, &event_now) > 0);
+ assert_se(sd_event_now(e, CLOCK_BOOTTIME, &event_now) > 0);
+ assert_se(sd_event_now(e, CLOCK_BOOTTIME_ALARM, &event_now) > 0);
+ assert_se(sd_event_now(e, -1, &event_now) == -EOPNOTSUPP);
+ assert_se(sd_event_now(e, 900 /* arbitrary big number */, &event_now) == -EOPNOTSUPP);
+
+ assert_se(sd_event_run(e, 0) == 0);
+
+ assert_se(sd_event_now(e, CLOCK_MONOTONIC, &event_now) == 0);
+ assert_se(sd_event_now(e, CLOCK_REALTIME, &event_now) == 0);
+ assert_se(sd_event_now(e, CLOCK_REALTIME_ALARM, &event_now) == 0);
+ assert_se(sd_event_now(e, CLOCK_BOOTTIME, &event_now) == 0);
+ assert_se(sd_event_now(e, CLOCK_BOOTTIME_ALARM, &event_now) == 0);
+ assert_se(sd_event_now(e, -1, &event_now) == -EOPNOTSUPP);
+ assert_se(sd_event_now(e, 900 /* arbitrary big number */, &event_now) == -EOPNOTSUPP);
+}
+
+static int last_rtqueue_sigval = 0;
+static int n_rtqueue = 0;
+
+static int rtqueue_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
+ last_rtqueue_sigval = si->ssi_int;
+ n_rtqueue++;
+ return 0;
+}
+
+TEST(rtqueue) {
+ sd_event_source *u = NULL, *v = NULL, *s = NULL;
+ sd_event *e = NULL;
+
+ assert_se(sd_event_default(&e) >= 0);
+
+ assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGRTMIN+2, SIGRTMIN+3, SIGUSR2, -1) >= 0);
+ assert_se(sd_event_add_signal(e, &u, SIGRTMIN+2, rtqueue_handler, NULL) >= 0);
+ assert_se(sd_event_add_signal(e, &v, SIGRTMIN+3, rtqueue_handler, NULL) >= 0);
+ assert_se(sd_event_add_signal(e, &s, SIGUSR2, rtqueue_handler, NULL) >= 0);
+
+ assert_se(sd_event_source_set_priority(v, -10) >= 0);
+
+ assert_se(sigqueue(getpid_cached(), SIGRTMIN+2, (union sigval) { .sival_int = 1 }) >= 0);
+ assert_se(sigqueue(getpid_cached(), SIGRTMIN+3, (union sigval) { .sival_int = 2 }) >= 0);
+ assert_se(sigqueue(getpid_cached(), SIGUSR2, (union sigval) { .sival_int = 3 }) >= 0);
+ assert_se(sigqueue(getpid_cached(), SIGRTMIN+3, (union sigval) { .sival_int = 4 }) >= 0);
+ assert_se(sigqueue(getpid_cached(), SIGUSR2, (union sigval) { .sival_int = 5 }) >= 0);
+
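+        /* The SIGRTMIN+3 source has the highest priority, hence both of its queued instances are
+         * dispatched before the default-priority sources. */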
+ assert_se(n_rtqueue == 0);
+ assert_se(last_rtqueue_sigval == 0);
+
+ assert_se(sd_event_run(e, UINT64_MAX) >= 1);
+ assert_se(n_rtqueue == 1);
+ assert_se(last_rtqueue_sigval == 2); /* first SIGRTMIN+3 */
+
+ assert_se(sd_event_run(e, UINT64_MAX) >= 1);
+ assert_se(n_rtqueue == 2);
+ assert_se(last_rtqueue_sigval == 4); /* second SIGRTMIN+3 */
+
+ assert_se(sd_event_run(e, UINT64_MAX) >= 1);
+ assert_se(n_rtqueue == 3);
+ assert_se(last_rtqueue_sigval == 3); /* first SIGUSR2 */
+
+ assert_se(sd_event_run(e, UINT64_MAX) >= 1);
+ assert_se(n_rtqueue == 4);
+ assert_se(last_rtqueue_sigval == 1); /* SIGRTMIN+2 */
+
+ assert_se(sd_event_run(e, 0) == 0); /* the other SIGUSR2 is dropped, because the first one was still queued */
+ assert_se(n_rtqueue == 4);
+ assert_se(last_rtqueue_sigval == 1);
+
+ sd_event_source_unref(u);
+ sd_event_source_unref(v);
+ sd_event_source_unref(s);
+
+ sd_event_unref(e);
+}
+
+#define CREATE_EVENTS_MAX (70000U)
+
+struct inotify_context {
+ bool delete_self_handler_called;
+ unsigned create_called[CREATE_EVENTS_MAX];
+ unsigned create_overflow;
+ unsigned n_create_events;
+};
+
+static void maybe_exit(sd_event_source *s, struct inotify_context *c) {
+ unsigned n;
+
+ assert_se(s);
+ assert_se(c);
+
+ if (!c->delete_self_handler_called)
+ return;
+
+ for (n = 0; n < 3; n++) {
+ unsigned i;
+
+ if (c->create_overflow & (1U << n))
+ continue;
+
+ for (i = 0; i < c->n_create_events; i++)
+ if (!(c->create_called[i] & (1U << n)))
+ return;
+ }
+
+ sd_event_exit(sd_event_source_get_event(s), 0);
+}
+
+static int inotify_handler(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
+ struct inotify_context *c = userdata;
+ const char *description;
+ unsigned bit, n;
+
+ assert_se(sd_event_source_get_description(s, &description) >= 0);
+ assert_se(safe_atou(description, &n) >= 0);
+
+ assert_se(n <= 3);
+ bit = 1U << n;
+
+ if (ev->mask & IN_Q_OVERFLOW) {
+ log_info("inotify-handler <%s>: overflow", description);
+ c->create_overflow |= bit;
+ } else if (ev->mask & IN_CREATE) {
+ if (streq(ev->name, "sub"))
+ log_debug("inotify-handler <%s>: create on %s", description, ev->name);
+ else {
+ unsigned i;
+
+ assert_se(safe_atou(ev->name, &i) >= 0);
+ assert_se(i < c->n_create_events);
+ c->create_called[i] |= bit;
+ }
+ } else if (ev->mask & IN_DELETE) {
+ log_info("inotify-handler <%s>: delete of %s", description, ev->name);
+ assert_se(streq(ev->name, "sub"));
+ } else
+ assert_not_reached();
+
+ maybe_exit(s, c);
+ return 1;
+}
+
+static int delete_self_handler(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
+ struct inotify_context *c = userdata;
+
+ if (ev->mask & IN_Q_OVERFLOW) {
+ log_info("delete-self-handler: overflow");
+ c->delete_self_handler_called = true;
+ } else if (ev->mask & IN_DELETE_SELF) {
+ log_info("delete-self-handler: delete-self");
+ c->delete_self_handler_called = true;
+ } else if (ev->mask & IN_IGNORED) {
+ log_info("delete-self-handler: ignore");
+ } else
+ assert_not_reached();
+
+ maybe_exit(s, c);
+ return 1;
+}
+
+static void test_inotify_one(unsigned n_create_events) {
+ _cleanup_(rm_rf_physical_and_freep) char *p = NULL;
+ sd_event_source *a = NULL, *b = NULL, *c = NULL, *d = NULL;
+ struct inotify_context context = {
+ .n_create_events = n_create_events,
+ };
+ sd_event *e = NULL;
+ const char *q;
+ unsigned i;
+
+ log_info("/* %s(%u) */", __func__, n_create_events);
+
+ assert_se(sd_event_default(&e) >= 0);
+
+ assert_se(mkdtemp_malloc("/tmp/test-inotify-XXXXXX", &p) >= 0);
+
+ assert_se(sd_event_add_inotify(e, &a, p, IN_CREATE|IN_ONLYDIR, inotify_handler, &context) >= 0);
+ assert_se(sd_event_add_inotify(e, &b, p, IN_CREATE|IN_DELETE|IN_DONT_FOLLOW, inotify_handler, &context) >= 0);
+ assert_se(sd_event_source_set_priority(b, SD_EVENT_PRIORITY_IDLE) >= 0);
+ assert_se(sd_event_source_set_priority(b, SD_EVENT_PRIORITY_NORMAL) >= 0);
+ assert_se(sd_event_add_inotify(e, &c, p, IN_CREATE|IN_DELETE|IN_EXCL_UNLINK, inotify_handler, &context) >= 0);
+ assert_se(sd_event_source_set_priority(c, SD_EVENT_PRIORITY_IDLE) >= 0);
+
+ assert_se(sd_event_source_set_description(a, "0") >= 0);
+ assert_se(sd_event_source_set_description(b, "1") >= 0);
+ assert_se(sd_event_source_set_description(c, "2") >= 0);
+
+ q = strjoina(p, "/sub");
+ assert_se(touch(q) >= 0);
+ assert_se(sd_event_add_inotify(e, &d, q, IN_DELETE_SELF, delete_self_handler, &context) >= 0);
+
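+        /* Create the requested number of files; with a large count the inotify queue is expected to
+         * overflow, which the handlers account for via IN_Q_OVERFLOW. */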
+ for (i = 0; i < n_create_events; i++) {
+ char buf[DECIMAL_STR_MAX(unsigned)+1];
+ _cleanup_free_ char *z = NULL;
+
+ xsprintf(buf, "%u", i);
+ assert_se(z = path_join(p, buf));
+
+ assert_se(touch(z) >= 0);
+ }
+
+ assert_se(unlink(q) >= 0);
+
+ assert_se(sd_event_loop(e) >= 0);
+
+ sd_event_source_unref(a);
+ sd_event_source_unref(b);
+ sd_event_source_unref(c);
+ sd_event_source_unref(d);
+
+ sd_event_unref(e);
+}
+
+TEST(inotify) {
+ test_inotify_one(100); /* should work without overflow */
+ test_inotify_one(33000); /* should trigger a q overflow */
+}
+
+static int pidfd_handler(sd_event_source *s, const siginfo_t *si, void *userdata) {
+ assert_se(s);
+ assert_se(si);
+
+ assert_se(si->si_uid == getuid());
+ assert_se(si->si_signo == SIGCHLD);
+ assert_se(si->si_code == CLD_EXITED);
+ assert_se(si->si_status == 66);
+
+ log_info("got pidfd on %c", PTR_TO_INT(userdata));
+
+ assert_se(userdata == INT_TO_PTR('p'));
+
+ assert_se(sd_event_exit(sd_event_source_get_event(s), 0) >= 0);
+ sd_event_source_unref(s);
+
+ return 0;
+}
+
+TEST(pidfd) {
+ sd_event_source *s = NULL, *t = NULL;
+ sd_event *e = NULL;
+ int pidfd;
+ pid_t pid, pid2;
+
+ assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0);
+
+ pid = fork();
+ if (pid == 0)
+ /* child */
+ _exit(66);
+
+ assert_se(pid > 1);
+
+ pidfd = pidfd_open(pid, 0);
+ if (pidfd < 0) {
+                /* pidfd_open() not supported, or blocked? */
+ assert_se(ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno));
+ (void) wait_for_terminate(pid, NULL);
+ return;
+ }
+
+ pid2 = fork();
+ if (pid2 == 0)
+ freeze();
+
+        assert_se(pid2 > 1);
+
+ assert_se(sd_event_default(&e) >= 0);
+ assert_se(sd_event_add_child_pidfd(e, &s, pidfd, WEXITED, pidfd_handler, INT_TO_PTR('p')) >= 0);
+ assert_se(sd_event_source_set_child_pidfd_own(s, true) >= 0);
+
+ /* This one should never trigger, since our second child lives forever */
+ assert_se(sd_event_add_child(e, &t, pid2, WEXITED, pidfd_handler, INT_TO_PTR('q')) >= 0);
+ assert_se(sd_event_source_set_child_process_own(t, true) >= 0);
+
+ assert_se(sd_event_loop(e) >= 0);
+
+ /* Child should still be alive */
+ assert_se(kill(pid2, 0) >= 0);
+
+ t = sd_event_source_unref(t);
+
+ /* Child should now be dead, since we dropped the ref */
+ assert_se(kill(pid2, 0) < 0 && errno == ESRCH);
+
+ sd_event_unref(e);
+}
+
+static int ratelimit_io_handler(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+ unsigned *c = (unsigned*) userdata;
+ *c += 1;
+ return 0;
+}
+
+static int ratelimit_time_handler(sd_event_source *s, uint64_t usec, void *userdata) {
+ int r;
+
+ r = sd_event_source_set_enabled(s, SD_EVENT_ON);
+ if (r < 0)
+ log_warning_errno(r, "Failed to turn on notify event source: %m");
+
+ r = sd_event_source_set_time(s, usec + 1000);
+ if (r < 0)
+ log_error_errno(r, "Failed to restart watchdog event source: %m");
+
+ unsigned *c = (unsigned*) userdata;
+ *c += 1;
+
+ return 0;
+}
+
+static int expired = -1;
+static int ratelimit_expired(sd_event_source *s, void *userdata) {
+ return ++expired;
+}
+
+TEST(ratelimit) {
+ _cleanup_close_pair_ int p[2] = {-1, -1};
+ _cleanup_(sd_event_unrefp) sd_event *e = NULL;
+ _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+ uint64_t interval;
+ unsigned count, burst;
+
+ assert_se(sd_event_default(&e) >= 0);
+ assert_se(pipe2(p, O_CLOEXEC|O_NONBLOCK) >= 0);
+
+ assert_se(sd_event_add_io(e, &s, p[0], EPOLLIN, ratelimit_io_handler, &count) >= 0);
+ assert_se(sd_event_source_set_description(s, "test-ratelimit-io") >= 0);
+ assert_se(sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 5) >= 0);
+ assert_se(sd_event_source_get_ratelimit(s, &interval, &burst) >= 0);
+ assert_se(interval == 1 * USEC_PER_SEC && burst == 5);
+
+ assert_se(write(p[1], "1", 1) == 1);
+
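+        /* One wakeup per 250ms stays below the burst of 5 per second, so the source must never get
+         * ratelimited in this slow loop. */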
+ count = 0;
+ for (unsigned i = 0; i < 10; i++) {
+ log_debug("slow loop iteration %u", i);
+ assert_se(sd_event_run(e, UINT64_MAX) >= 0);
+ assert_se(usleep(250 * USEC_PER_MSEC) >= 0);
+ }
+
+ assert_se(sd_event_source_is_ratelimited(s) == 0);
+ assert_se(count == 10);
+ log_info("ratelimit_io_handler: called %u times, event source not ratelimited", count);
+
+ assert_se(sd_event_source_set_ratelimit(s, 0, 0) >= 0);
+ assert_se(sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 5) >= 0);
+
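+        /* Now iterate with (almost) no delay: after five dispatches within the interval the source gets
+         * ratelimited and the IO handler stops being called. */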
+ count = 0;
+ for (unsigned i = 0; i < 10; i++) {
+ log_debug("fast event loop iteration %u", i);
+ assert_se(sd_event_run(e, UINT64_MAX) >= 0);
+ assert_se(usleep(10) >= 0);
+ }
+ log_info("ratelimit_io_handler: called %u times, event source got ratelimited", count);
+ assert_se(count < 10);
+
+ s = sd_event_source_unref(s);
+ safe_close_pair(p);
+
+ count = 0;
+ assert_se(sd_event_add_time_relative(e, &s, CLOCK_MONOTONIC, 1000, 1, ratelimit_time_handler, &count) >= 0);
+ assert_se(sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10) == 0);
+
+ do {
+ assert_se(sd_event_run(e, UINT64_MAX) >= 0);
+ } while (!sd_event_source_is_ratelimited(s));
+
+ log_info("ratelimit_time_handler: called %u times, event source got ratelimited", count);
+ assert_se(count == 10);
+
+        /* In order to get rid of an active rate limit, the client needs to disable it explicitly */
+ assert_se(sd_event_source_set_ratelimit(s, 0, 0) >= 0);
+ assert_se(!sd_event_source_is_ratelimited(s));
+
+ assert_se(sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10) >= 0);
+
+        /* Set a callback that will be invoked when we leave the rate-limited state. */
+ assert_se(sd_event_source_set_ratelimit_expire_callback(s, ratelimit_expired) >= 0);
+
+ do {
+ assert_se(sd_event_run(e, UINT64_MAX) >= 0);
+ } while (!sd_event_source_is_ratelimited(s));
+
+ log_info("ratelimit_time_handler: called 10 more times, event source got ratelimited");
+ assert_se(count == 20);
+
+        /* Dispatch the event loop once more and check that the ratelimit expiration callback got called */
+ assert_se(sd_event_run(e, UINT64_MAX) >= 0);
+ assert_se(expired == 0);
+}
+
+TEST(simple_timeout) {
+ _cleanup_(sd_event_unrefp) sd_event *e = NULL;
+ usec_t f, t, some_time;
+
+ some_time = random_u64_range(2 * USEC_PER_SEC);
+
+ assert_se(sd_event_default(&e) >= 0);
+
+ assert_se(sd_event_prepare(e) == 0);
+
+ f = now(CLOCK_MONOTONIC);
+ assert_se(sd_event_wait(e, some_time) >= 0);
+ t = now(CLOCK_MONOTONIC);
+
+ /* The event loop may sleep longer than the specified time (timer accuracy, scheduling latencies, …),
+ * but never shorter. Let's check that. */
+ assert_se(t >= usec_add(f, some_time));
+}
+
+static int inotify_self_destroy_handler(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
+ sd_event_source **p = userdata;
+
+ assert_se(ev);
+ assert_se(p);
+ assert_se(*p == s);
+
+ assert_se(FLAGS_SET(ev->mask, IN_ATTRIB));
+
+ assert_se(sd_event_exit(sd_event_source_get_event(s), 0) >= 0);
+
+ *p = sd_event_source_unref(*p); /* here's what we actually intend to test: we destroy the event
+ * source from inside the event source handler */
+ return 1;
+}
+
+TEST(inotify_self_destroy) {
+ _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+ _cleanup_(sd_event_unrefp) sd_event *e = NULL;
+ char path[] = "/tmp/inotifyXXXXXX";
+ _cleanup_close_ int fd = -1;
+
+ /* Tests that destroying an inotify event source from its own handler is safe */
+
+ assert_se(sd_event_default(&e) >= 0);
+
+ fd = mkostemp_safe(path);
+ assert_se(fd >= 0);
+ assert_se(sd_event_add_inotify_fd(e, &s, fd, IN_ATTRIB, inotify_self_destroy_handler, &s) >= 0);
+ fd = safe_close(fd);
+        assert_se(unlink(path) >= 0); /* This will trigger IN_ATTRIB because the link count goes to zero */
+ assert_se(sd_event_loop(e) >= 0);
+}
+
+struct inotify_process_buffered_data_context {
+ const char *path[2];
+ unsigned i;
+};
+
+static int inotify_process_buffered_data_handler(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
+ struct inotify_process_buffered_data_context *c = ASSERT_PTR(userdata);
+ const char *description;
+
+ assert_se(sd_event_source_get_description(s, &description) >= 0);
+
+ assert_se(c->i < 2);
+ assert_se(streq(c->path[c->i], description));
+ c->i++;
+
+ return 1;
+}
+
+TEST(inotify_process_buffered_data) {
+ _cleanup_(rm_rf_physical_and_freep) char *p = NULL, *q = NULL;
+ _cleanup_(sd_event_source_unrefp) sd_event_source *a = NULL, *b = NULL;
+ _cleanup_(sd_event_unrefp) sd_event *e = NULL;
+ _cleanup_free_ char *z = NULL;
+
+ /* For issue #23826 */
+
+ assert_se(sd_event_default(&e) >= 0);
+
+ assert_se(mkdtemp_malloc("/tmp/test-inotify-XXXXXX", &p) >= 0);
+ assert_se(mkdtemp_malloc("/tmp/test-inotify-XXXXXX", &q) >= 0);
+
+ struct inotify_process_buffered_data_context context = {
+ .path = { p, q },
+ };
+
+ assert_se(sd_event_add_inotify(e, &a, p, IN_CREATE, inotify_process_buffered_data_handler, &context) >= 0);
+ assert_se(sd_event_add_inotify(e, &b, q, IN_CREATE, inotify_process_buffered_data_handler, &context) >= 0);
+
+ assert_se(z = path_join(p, "aaa"));
+ assert_se(touch(z) >= 0);
+ z = mfree(z);
+ assert_se(z = path_join(q, "bbb"));
+ assert_se(touch(z) >= 0);
+ z = mfree(z);
+
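+        /* Both inotify events arrive in a single epoll wakeup, but only one source is dispatched per
+         * iteration; the remaining buffered data must make sd_event_prepare() report pending work. */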
+ assert_se(sd_event_run(e, 10 * USEC_PER_SEC) > 0);
+ assert_se(sd_event_prepare(e) > 0); /* issue #23826: this was 0. */
+ assert_se(sd_event_dispatch(e) > 0);
+ assert_se(sd_event_prepare(e) == 0);
+ assert_se(sd_event_wait(e, 0) == 0);
+}
+
+DEFINE_TEST_MAIN(LOG_DEBUG);