summaryrefslogtreecommitdiffstats
path: root/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp')
-rw-r--r--src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp940
1 files changed, 940 insertions, 0 deletions
diff --git a/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp b/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp
new file mode 100644
index 00000000..f6719664
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp
@@ -0,0 +1,940 @@
+/* $Id: ioqueue-iouringfile-provider.cpp $ */
+/** @file
+ * IPRT - I/O queue, Linux io_uring interface I/O file provider.
+ */
+
+/*
+ * Copyright (C) 2019-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+/** @page pg_rtioqueue_linux RTIoQueue - Linux io_uring implementation notes
+ * @internal
+ *
+ * The io_uring interface is the most recent interface added to the Linux kernel
+ * to deliver fast and efficient I/O. It was first added with kernel version 5.1 and is
+ * thus not available on most systems as of writing this backend (July 2019).
+ * It supersedes the old async I/O interface and cleans up with some restrictions like
+ * having to disable caching for the file.
+ * The interface is centered around a submission and completion queue to queue multiple new
+ * requests for the kernel to process and get notified about completions to reduce the amount
+ * of context switches to an absolute minimum. It also offers advanced features like
+ * registering a fixed set of memory buffers for I/O upfront to reduce the processing overhead
+ * even more.
+ *
+ * The first implementation will only make use of the basic features and more advanced features
+ * will be added later.
+ * The adept developer probably noticed that the public IPRT I/O queue API resembles the io_uring
+ * interface in many aspects. This is not by accident but to reduce our own overhead as much as possible
+ * while still keeping a consistent platform independent API which allows efficient implementations on
+ * other hosts when they come up.
+ *
+ * The public kernel io_uring interface is completely defined in this file to avoid dragging in additional
+ * dependencies and to avoid compile problems on older hosts missing the interface just like it is done
+ * for the Linux RTFileAio* API The necessary interface definitions and descriptions where retrieved from:
+ * * http://kernel.dk/io_uring.pdf
+ * * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/io_uring.h
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_IOQUEUE
+#include <iprt/ioqueue.h>
+
+#include <iprt/assertcompile.h>
+#include <iprt/asm.h>
+#include <iprt/errcore.h>
+#include <iprt/file.h>
+#include <iprt/log.h>
+#include <iprt/mem.h>
+#include <iprt/string.h>
+
+#include <errno.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/uio.h>
+
+#include "internal/ioqueue.h"
+
+
+/*********************************************************************************************************************************
+* Defined Constants And Macros *
+*********************************************************************************************************************************/
+
+/** The syscall number of io_uring_setup(). */
+#define LNX_IOURING_SYSCALL_SETUP 425
+/** The syscall number of io_uring_enter(). */
+#define LNX_IOURING_SYSCALL_ENTER 426
+/** The syscall number of io_uring_register(). */
+#define LNX_IOURING_SYSCALL_REGISTER 427
+/** eventfd2() syscall not associated with io_uring but used for kicking waiters. */
+#define LNX_SYSCALL_EVENTFD2 290
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+
+/**
+ * Linux io_uring completion event.
+ */
+typedef struct LNXIOURINGCQE
+{
+ /** Opaque user data associated with the completed request. */
+ uint64_t u64User;
+ /** The status code of the request. */
+ int32_t rcLnx;
+ /** Some flags which are not used as of now. */
+ uint32_t fFlags;
+} LNXIOURINGCQE;
+AssertCompileSize(LNXIOURINGCQE, 16);
+/** Pointer to a Linux io_uring completion event. */
+typedef LNXIOURINGCQE *PLNXIOURINGCQE;
+/** Pointer to a constant linux io_uring completion event. */
+typedef const LNXIOURINGCQE *PCLNXIOURINGCQE;
+
+
+/**
+ * Linux io_uring submission queue entry.
+ */
+typedef struct LNXIOURINGSQE
+{
+ /** The opcode for the request. */
+ uint8_t u8Opc;
+ /** Common flags for the request. */
+ uint8_t u8Flags;
+ /** Assigned I/O priority. */
+ uint16_t u16IoPrio;
+ /** The file descriptor the request is for. */
+ int32_t i32Fd;
+ /** The start offset into the file for the request. */
+ uint64_t u64OffStart;
+ /** Buffer pointer or Pointer to io vector array depending on opcode. */
+ uint64_t u64AddrBufIoVec;
+ /** Size of the buffer in bytes or number of io vectors. */
+ uint32_t u32BufIoVecSz;
+ /** Opcode dependent data. */
+ union
+ {
+ /** Flags for read/write requests. */
+ uint32_t u32KrnlRwFlags;
+ /** Flags for fsync() like requests. */
+ uint32_t u32FsyncFlags;
+ /** Flags for poll() like requests. */
+ uint16_t u16PollFlags;
+ /** Flags for sync_file_range() like requests. */
+ uint32_t u32SyncFileRangeFlags;
+ /** Flags for requests requiring a msg structure. */
+ uint32_t u32MsgFlags;
+ } uOpc;
+ /** Opaque user data associated with the request and returned durign completion. */
+ uint64_t u64User;
+ /** Request type dependent data. */
+ union
+ {
+ /** Fixed buffer index if indicated by the request flags. */
+ uint16_t u16FixedBufIdx;
+ /** Padding to align the structure to 64 bytes. */
+ uint64_t au64Padding[3];
+ } uReq;
+} LNXIOURINGSQE;
+AssertCompileSize(LNXIOURINGSQE, 64);
+/** Pointer to a Linux io_uring submission queue entry. */
+typedef LNXIOURINGSQE *PLNXIOURINGSQE;
+/** Pointer to a constant Linux io_uring submission queue entry. */
+typedef const LNXIOURINGSQE *PCLNXIOURINGSQE;
+
+
+/**
+ * Linux u_ioring SQ ring header structure to maintain the queue.
+ */
+typedef struct LNXIOURINGSQ
+{
+ /** The current head position to fill in new requests. */
+ uint32_t u32OffHead;
+ /** The current tail position the kernel starts processing from. */
+ uint32_t u32OffTail;
+ /** The mask for the head and tail counters to apply to retrieve the index. */
+ uint32_t u32OffRingMask;
+ /** Number of entries in the SQ ring. */
+ uint32_t u32OffRingEntries;
+ /** Flags set asychronously by the kernel. */
+ uint32_t u32OffFlags;
+ /** Counter of dropped requests. */
+ uint32_t u32OffDroppedReqs;
+ /** Offset where to find the array of SQ entries. */
+ uint32_t u32OffArray;
+ /** Reserved. */
+ uint32_t u32Rsvd0;
+ /** Reserved. */
+ uint64_t u64Rsvd1;
+} LNXIOURINGSQ;
+AssertCompileSize(LNXIOURINGSQ, 40);
+/** Pointer to a Linux u_ioring SQ ring header. */
+typedef LNXIOURINGSQ *PLNXIOURINGSQ;
+/** Pointer to a constant Linux u_ioring SQ ring header. */
+typedef const LNXIOURINGSQ *PCLNXIOURINGSQ;
+
+
+/**
+ * Linux io_uring CQ ring header structure to maintain the queue.
+ */
+typedef struct LNXIOURINGCQ
+{
+ /** The current head position the kernel modifies when completion events happen. */
+ uint32_t u32OffHead;
+ /** The current tail position to read completion events from. */
+ uint32_t u32OffTail;
+ /** The mask for the head and tail counters to apply to retrieve the index. */
+ uint32_t u32OffRingMask;
+ /** Number of entries in the CQ ring. */
+ uint32_t u32OffRingEntries;
+ /** Number of CQ overflows happened. */
+ uint32_t u32OffOverflowCnt;
+ /** */
+ uint32_t u32OffCqes;
+ /** Reserved. */
+ uint64_t au64Rsvd0[2];
+} LNXIOURINGCQ;
+AssertCompileSize(LNXIOURINGCQ, 40);
+/** Pointer to a Linux u_ioring CQ ring header. */
+typedef LNXIOURINGCQ *PLNXIOURINGCQ;
+/** Pointer to a constant Linux u_ioring CQ ring header. */
+typedef const LNXIOURINGCQ *PCLNXIOURINGCQ;
+
+
+/**
+ * Linux io_uring parameters passed to io_uring_setup().
+ */
+typedef struct LNXIOURINGPARAMS
+{
+ /** Number of SQ entries requested, must be power of 2. */
+ uint32_t u32SqEntriesCnt;
+ /** Number of CQ entries requested, must be power of 2. */
+ uint32_t u32CqEntriesCnt;
+ /** Flags for the ring, , see LNX_IOURING_SETUP_F_*. */
+ uint32_t u32Flags;
+ /** Affinity of the kernel side SQ polling thread if enabled. */
+ uint32_t u32SqPollCpu;
+ /** Milliseconds after the kernel side SQ polling thread goes to sleep
+ * if there is are no requests to process. */
+ uint32_t u32SqPollIdleMs;
+ /** Reserved. */
+ uint32_t au32Rsvd0[5];
+ /** Offsets returned for the submission queue. */
+ LNXIOURINGSQ SqOffsets;
+ /** Offsets returned for the completion queue. */
+ LNXIOURINGCQ CqOffsets;
+} LNXIOURINGPARAMS;
+/** Pointer to Linux io_uring parameters. */
+typedef LNXIOURINGPARAMS *PLNXIOURINGPARAMS;
+/** Pointer to constant Linux io_uring parameters. */
+typedef const LNXIOURINGPARAMS *PCLNXIOURINGPARAMS;
+
+
+/** @name LNXIOURINGSQE::u8Opc defined opcodes.
+ * @{ */
+/** Opcode to profile the interface, does nothing. */
+#define LNX_IOURING_OPC_NOP 0
+/** preadv() like request. */
+#define LNX_IOURING_OPC_READV 1
+/** pwritev() like request. */
+#define LNX_IOURING_OPC_WRITEV 2
+/** fsync() like request. */
+#define LNX_IOURING_OPC_FSYNC 3
+/** Read request using a fixed preset buffer. */
+#define LNX_IOURING_OPC_READ_FIXED 4
+/** Write request using a fixed preset buffer. */
+#define LNX_IOURING_OPC_WRITE_FIXED 5
+/** Add file descriptor to pollset. */
+#define LNX_IOURING_OPC_POLL_ADD 6
+/** Remove file descriptor from pollset. */
+#define LNX_IOURING_OPC_POLL_REMOVE 7
+/** sync_file_range() like request. */
+#define LNX_IOURING_OPC_SYNC_FILE_RANGE 8
+/** sendmsg() like request. */
+#define LNX_IOURING_OPC_SENDMSG 9
+/** recvmsg() like request. */
+#define LNX_IOURING_OPC_RECVMSG 10
+/** @} */
+
+
+/** @name Additional flags for LNX_IOURING_OPC_FSYNC requests.
+ * @{ */
+/** Sync userdata as well instead of metadata only. */
+#define LNX_IOURING_OPC_FSYNC_DATASYNC RT_BIT_32(0)
+/** @} */
+
+
+/** @name Flags for the LNX_IOURING_SYSCALL_SETUP syscall.
+ * @{ */
+/** The I/O context is polled. */
+#define LNX_IOURING_SETUP_F_IOPOLL RT_BIT_32(0)
+/** The kernel should poll the submission queue. */
+#define LNX_IOURING_SETUP_F_SQPOLL RT_BIT_32(1)
+/** Sets the CPU affinity of the kernel thread polling the submission queue. */
+#define LNX_IOURING_SETUP_F_SQAFF RT_BIT_32(2)
+/** @} */
+
+
+/** @name Flags for LNXIOURINGSQE::u8Flags.
+ * @{ */
+/** The file descriptor was registered before use. */
+#define LNX_IOURING_SQE_F_FIXED_FILE RT_BIT(0)
+/** Complete all active requests before issuing the request with the flag set. */
+#define LNX_IOURING_SQE_F_IO_DRAIN RT_BIT(1)
+/** Links the request with the flag set to the next one. */
+#define LNX_IOURING_SQE_F_IO_LINK RT_BIT(2)
+/** @} */
+
+
+/** @name Magic mmap offsets to map submission and completion queues.
+ * @{ */
+/** Used to map the submission queue. */
+#define LNX_IOURING_MMAP_OFF_SQ UINT64_C(0)
+/** Used to map the completion queue. */
+#define LNX_IOURING_MMAP_OFF_CQ UINT64_C(0x8000000)
+/** Used to map the submission queue entries array. */
+#define LNX_IOURING_MMAP_OFF_SQES UINT64_C(0x10000000)
+/** @} */
+
+
+/** @name Flags used for the SQ ring structure.
+ * @{ */
+/** The kernel thread needs a io_uring_enter() wakeup to continue processing requests. */
+#define LNX_IOURING_SQ_RING_F_NEED_WAKEUP RT_BIT_32(0)
+/** @} */
+
+
+/** @name Flags for the LNX_IOURING_SYSCALL_ENTER syscall.
+ * @{ */
+/** Retrieve completion events for the completion queue. */
+#define LNX_IOURING_ENTER_F_GETEVENTS RT_BIT_32(0)
+/** Wakes the suspended kernel thread processing the requests. */
+#define LNX_IOURING_ENTER_F_SQ_WAKEUP RT_BIT_32(1)
+/** @} */
+
+
+/** @name Opcodes for the LNX_IOURING_SYSCALL_REGISTER syscall.
+ * @{ */
+/** Register a fixed set of buffers. */
+#define LNX_IOURING_REGISTER_OPC_BUFFERS_REGISTER 0
+/** Unregisters a fixed set of buffers registered previously. */
+#define LNX_IOURING_REGISTER_OPC_BUFFERS_UNREGISTER 1
+/** Register a fixed set of files. */
+#define LNX_IOURING_REGISTER_OPC_FILES_REGISTER 2
+/** Unregisters a fixed set of files registered previously. */
+#define LNX_IOURING_REGISTER_OPC_FILES_UNREGISTER 3
+/** Register an eventfd associated with the I/O ring. */
+#define LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER 4
+/** Unregisters an eventfd registered previously. */
+#define LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER 5
+/** @} */
+
+
+/**
+ * SQ ring structure.
+ *
+ * @note Some members of this structure point to memory shared with the kernel,
+ * hence the volatile keyword.
+ */
+typedef struct RTIOQUEUESQ
+{
+ /** Pointer to the head counter. */
+ volatile uint32_t *pidxHead;
+ /** Pointer to the tail counter. */
+ volatile uint32_t *pidxTail;
+ /** Mask to apply for the counters to get to the index. */
+ uint32_t fRingMask;
+ /** Number of entries in the ring. */
+ uint32_t cEntries;
+ /** Pointer to the global flags. */
+ volatile uint32_t *pfFlags;
+ /** Pointer to the indirection array used for indexing the real SQ entries. */
+ volatile uint32_t *paidxSqes;
+} RTIOQUEUESQ;
+
+
+/**
+ * CQ ring structure.
+ *
+ * @note Some members of this structure point to memory shared with the kernel,
+ * hence the volatile keyword.
+ */
+typedef struct RTIOQUEUECQ
+{
+ /** Pointer to the head counter. */
+ volatile uint32_t *pidxHead;
+ /** Pointer to the tail counter. */
+ volatile uint32_t *pidxTail;
+ /** Mask to apply for the counters to get to the index. */
+ uint32_t fRingMask;
+ /** Number of entries in the ring. */
+ uint32_t cEntries;
+ /** Pointer to the completion entry ring. */
+ volatile LNXIOURINGCQE *paCqes;
+} RTIOQUEUECQ;
+
+
+/**
+ * Internal I/O queue provider instance data.
+ */
+typedef struct RTIOQUEUEPROVINT
+{
+ /** The io_uring file descriptor. */
+ int iFdIoCtx;
+ /** The eventfd file descriptor registered with the ring. */
+ int iFdEvt;
+ /** The submission queue. */
+ RTIOQUEUESQ Sq;
+ /** The currently uncommitted tail for the SQ. */
+ uint32_t idxSqTail;
+ /** Numbere of uncommitted SQEs. */
+ uint32_t cSqesToCommit;
+ /** The completion queue. */
+ RTIOQUEUECQ Cq;
+ /** Pointer to the mapped SQES entries. */
+ PLNXIOURINGSQE paSqes;
+ /** Pointer to the iovec structure used for non S/G requests. */
+ struct iovec *paIoVecs;
+ /** Pointer returned by mmap() for the SQ ring, used for unmapping. */
+ void *pvMMapSqRing;
+ /** Pointer returned by mmap() for the CQ ring, used for unmapping. */
+ void *pvMMapCqRing;
+ /** Pointer returned by mmap() for the SQ entries array, used for unmapping. */
+ void *pvMMapSqes;
+ /** Size of the mapped SQ ring, used for unmapping. */
+ size_t cbMMapSqRing;
+ /** Size of the mapped CQ ring, used for unmapping. */
+ size_t cbMMapCqRing;
+ /** Size of the mapped SQ entries array, used for unmapping. */
+ size_t cbMMapSqes;
+ /** Flag whether the waiter was woken up externally. */
+ volatile bool fExtIntr;
+} RTIOQUEUEPROVINT;
+/** Pointer to the internal I/O queue provider instance data. */
+typedef RTIOQUEUEPROVINT *PRTIOQUEUEPROVINT;
+
+
+/*********************************************************************************************************************************
+* Internal Functions *
+*********************************************************************************************************************************/
+
+/**
+ * Syscall wrapper for io_uring_setup().
+ *
+ * @returns IPRT status code.
+ * @param cEntries Number of entries for submission and completion queues.
+ * @param pParams Additional parameters for the I/O ring and updated return values
+ * on success.
+ * @param piFdIoCtx Where to store the file descriptor of the I/O ring on success.
+ */
+DECLINLINE(int) rtIoQueueLnxIoURingSetup(uint32_t cEntries, PLNXIOURINGPARAMS pParams, int32_t *piFdIoCtx)
+{
+ int rcLnx = syscall(LNX_IOURING_SYSCALL_SETUP, cEntries, pParams);
+ if (RT_UNLIKELY(rcLnx == -1))
+ return RTErrConvertFromErrno(errno);
+
+ *piFdIoCtx = rcLnx;
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Syscall wrapper for io_uring_enter().
+ *
+ * @returns IPRT status code.
+ * @param iFdIoCtx The I/O ring file descriptor.
+ * @param cToSubmit Maximum number of requests waiting for processing.
+ * @param cMinComplete Minimum number of completion events to accumulate before returning.
+ * @param fFlags Flags for io_uring_enter(), see LNX_IOURING_ENTER_F_*.
+ */
+DECLINLINE(int) rtIoQueueLnxIoURingEnter(int32_t iFdIoCtx, uint32_t cToSubmit, uint32_t cMinComplete,
+ uint32_t fFlags)
+{
+ int rcLnx = syscall(LNX_IOURING_SYSCALL_ENTER, iFdIoCtx, cToSubmit, cMinComplete, fFlags,
+ NULL, 0);
+ if (RT_UNLIKELY(rcLnx == -1))
+ return RTErrConvertFromErrno(errno);
+
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Syscall wrapper for io_uring_register().
+ *
+ * @returns IPRT status code.
+ * @param iFdIoCtx The I/O ring file descriptor.
+ * @param uOpc Operation to perform, see LNX_IOURING_REGISTER_OPC_*.
+ * @param pvArg Opaque arguments.
+ * @param cArgs Number of arguments.
+ */
+DECLINLINE(int) rtIoQueueLnxIoURingRegister(int32_t iFdIoCtx, uint32_t uOpc, void *pvArg,
+ uint32_t cArgs)
+{
+ int rcLnx = syscall(LNX_IOURING_SYSCALL_REGISTER, iFdIoCtx, uOpc, pvArg, cArgs);
+ if (RT_UNLIKELY(rcLnx == -1))
+ return RTErrConvertFromErrno(errno);
+
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * mmap() wrapper for the common bits and returning an IPRT status code.
+ *
+ * @returns IPRT status code.
+ * @param iFdIoCtx The I/O ring file descriptor.
+ * @param offMmap The mmap() offset.
+ * @param cbMmap How much to map.
+ * @param ppv Where to store the pointer to the mapping on success.
+ */
+DECLINLINE(int) rtIoQueueLnxIoURingMmap(int iFdIoCtx, off_t offMmap, size_t cbMmap, void **ppv)
+{
+ void *pv = mmap(0, cbMmap, PROT_READ | PROT_WRITE , MAP_SHARED | MAP_POPULATE, iFdIoCtx, offMmap);
+ if (pv != MAP_FAILED)
+ {
+ *ppv = pv;
+ return VINF_SUCCESS;
+ }
+
+ return RTErrConvertFromErrno(errno);
+}
+
+
+/**
+ * eventfd2() syscall wrapper.
+ *
+ * @returns IPRT status code.
+ * @param uValInit The initial value of the maintained counter.
+ * @param fFlags Flags controlling the eventfd behavior.
+ * @param piFdEvt Where to store the file descriptor of the eventfd object on success.
+ */
+DECLINLINE(int) rtIoQueueLnxEventfd2(uint32_t uValInit, uint32_t fFlags, int *piFdEvt)
+{
+ int rcLnx = syscall(LNX_SYSCALL_EVENTFD2, uValInit, fFlags);
+ if (RT_UNLIKELY(rcLnx == -1))
+ return RTErrConvertFromErrno(errno);
+
+ *piFdEvt = rcLnx;
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Checks the completion event queue for pending events.
+ *
+ * @param pThis The provider instance.
+ * @param paCEvt Pointer to the array of completion events.
+ * @param cCEvt Maximum number of completion events the array can hold.
+ * @param pcCEvtSeen Where to store the number of completion events processed.
+ */
+static void rtIoQueueLnxIoURingFileProvCqCheck(PRTIOQUEUEPROVINT pThis, PRTIOQUEUECEVT paCEvt,
+ uint32_t cCEvt, uint32_t *pcCEvtSeen)
+{
+ /* The fencing and atomic accesses are kind of overkill and probably not required (dev paranoia). */
+ ASMReadFence();
+ uint32_t idxCqHead = ASMAtomicReadU32(pThis->Cq.pidxHead);
+ uint32_t idxCqTail = ASMAtomicReadU32(pThis->Cq.pidxTail);
+ ASMReadFence();
+
+ uint32_t cCEvtSeen = 0;
+
+ while ( idxCqTail != idxCqHead
+ && cCEvtSeen < cCEvt)
+ {
+ /* Get the index. */
+ uint32_t idxCqe = idxCqHead & pThis->Cq.fRingMask;
+ volatile LNXIOURINGCQE *pCqe = &pThis->Cq.paCqes[idxCqe];
+
+ paCEvt->pvUser = (void *)(uintptr_t)pCqe->u64User;
+ if (pCqe->rcLnx >= 0)
+ {
+ paCEvt->rcReq = VINF_SUCCESS;
+ paCEvt->cbXfered = (size_t)pCqe->rcLnx;
+ }
+ else
+ paCEvt->rcReq = RTErrConvertFromErrno(-pCqe->rcLnx);
+
+#ifdef RT_STRICT /* poison */
+ memset((void *)pCqe, 0xff, sizeof(*pCqe));
+#endif
+
+ paCEvt++;
+ cCEvtSeen++;
+ idxCqHead++;
+ }
+
+ *pcCEvtSeen = cCEvtSeen;
+
+ /* Paranoia strikes again. */
+ ASMWriteFence();
+ ASMAtomicWriteU32(pThis->Cq.pidxHead, idxCqHead);
+ ASMWriteFence();
+}
+
+
+/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnIsSupported} */
+static DECLCALLBACK(bool) rtIoQueueLnxIoURingFileProv_IsSupported(void)
+{
+ /*
+ * Try to create a simple I/O ring and close it again.
+ * The common code/public API already checked for the proper handle type.
+ */
+ int iFdIoCtx = 0;
+ bool fSupp = false;
+ LNXIOURINGPARAMS Params;
+ RT_ZERO(Params);
+
+ int rc = rtIoQueueLnxIoURingSetup(16, &Params, &iFdIoCtx);
+ if (RT_SUCCESS(rc))
+ {
+ /*
+ * Check that we can register an eventfd descriptor to get notified about
+ * completion events while being able to kick the waiter externally out of the wait.
+ */
+ int iFdEvt = 0;
+ rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &iFdEvt);
+ if (RT_SUCCESS(rc))
+ {
+ rc = rtIoQueueLnxIoURingRegister(iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER,
+ &iFdEvt, 1 /*cArgs*/);
+ if (RT_SUCCESS(rc))
+ fSupp = true;
+
+ int rcLnx = close(iFdEvt); Assert(!rcLnx); RT_NOREF(rcLnx);
+ }
+ int rcLnx = close(iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
+ }
+
+ return fSupp;
+}
+
+
+/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueInit} */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_QueueInit(RTIOQUEUEPROV hIoQueueProv, uint32_t fFlags,
+ uint32_t cSqEntries, uint32_t cCqEntries)
+{
+ RT_NOREF(fFlags, cCqEntries);
+
+ PRTIOQUEUEPROVINT pThis = hIoQueueProv;
+ LNXIOURINGPARAMS Params;
+ RT_ZERO(Params);
+
+ pThis->cSqesToCommit = 0;
+ pThis->fExtIntr = false;
+
+ int rc = rtIoQueueLnxIoURingSetup(cSqEntries, &Params, &pThis->iFdIoCtx);
+ if (RT_SUCCESS(rc))
+ {
+ /* Map the rings into userspace. */
+ pThis->cbMMapSqRing = Params.SqOffsets.u32OffArray + Params.u32SqEntriesCnt * sizeof(uint32_t);
+ pThis->cbMMapCqRing = Params.CqOffsets.u32OffCqes + Params.u32CqEntriesCnt * sizeof(LNXIOURINGCQE);
+ pThis->cbMMapSqes = Params.u32SqEntriesCnt * sizeof(LNXIOURINGSQE);
+
+ pThis->paIoVecs = (struct iovec *)RTMemAllocZ(Params.u32SqEntriesCnt * sizeof(struct iovec));
+ if (RT_LIKELY(pThis->paIoVecs))
+ {
+ rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &pThis->iFdEvt);
+ if (RT_SUCCESS(rc))
+ {
+ rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER, &pThis->iFdEvt, 1 /*cArgs*/);
+ if (RT_SUCCESS(rc))
+ {
+ rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQ, pThis->cbMMapSqRing, &pThis->pvMMapSqRing);
+ if (RT_SUCCESS(rc))
+ {
+ rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_CQ, pThis->cbMMapCqRing, &pThis->pvMMapCqRing);
+ if (RT_SUCCESS(rc))
+ {
+ rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQES, pThis->cbMMapSqes, &pThis->pvMMapSqes);
+ if (RT_SUCCESS(rc))
+ {
+ uint8_t *pbTmp = (uint8_t *)pThis->pvMMapSqRing;
+
+ pThis->Sq.pidxHead = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffHead);
+ pThis->Sq.pidxTail = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffTail);
+ pThis->Sq.fRingMask = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingMask);
+ pThis->Sq.cEntries = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingEntries);
+ pThis->Sq.pfFlags = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffFlags);
+ pThis->Sq.paidxSqes = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffArray);
+ pThis->idxSqTail = *pThis->Sq.pidxTail;
+
+ pThis->paSqes = (PLNXIOURINGSQE)pThis->pvMMapSqes;
+
+ pbTmp = (uint8_t *)pThis->pvMMapCqRing;
+
+ pThis->Cq.pidxHead = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffHead);
+ pThis->Cq.pidxTail = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffTail);
+ pThis->Cq.fRingMask = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingMask);
+ pThis->Cq.cEntries = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingEntries);
+ pThis->Cq.paCqes = (PLNXIOURINGCQE)(pbTmp + Params.CqOffsets.u32OffCqes);
+ return VINF_SUCCESS;
+ }
+
+ munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing);
+ }
+
+ munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing);
+ }
+
+ rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
+ AssertRC(rc);
+ }
+
+ close(pThis->iFdEvt);
+ }
+
+ RTMemFree(pThis->paIoVecs);
+ }
+
+ int rcLnx = close(pThis->iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
+ }
+
+ return rc;
+}
+
+
+/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueDestroy} */
+static DECLCALLBACK(void) rtIoQueueLnxIoURingFileProv_QueueDestroy(RTIOQUEUEPROV hIoQueueProv)
+{
+ PRTIOQUEUEPROVINT pThis = hIoQueueProv;
+
+ int rcLnx = munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
+ rcLnx = munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
+ rcLnx = munmap(pThis->pvMMapSqes, pThis->cbMMapSqes); Assert(!rcLnx); RT_NOREF(rcLnx);
+
+ int rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
+ AssertRC(rc);
+
+ close(pThis->iFdEvt);
+ close(pThis->iFdIoCtx);
+ RTMemFree(pThis->paIoVecs);
+
+ RT_ZERO(pThis);
+}
+
+
+/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleRegister} */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleRegister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
+{
+ RT_NOREF(hIoQueueProv, pHandle);
+ /** @todo Add support for fixed file sets later. */
+ return VINF_SUCCESS;
+}
+
+
+/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleDeregister} */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleDeregister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
+{
+ RT_NOREF(hIoQueueProv, pHandle);
+ /** @todo Add support for fixed file sets later. */
+ return VINF_SUCCESS;
+}
+
+
+/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnReqPrepare} */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_ReqPrepare(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle, RTIOQUEUEOP enmOp,
+ uint64_t off, void *pvBuf, size_t cbBuf, uint32_t fReqFlags,
+ void *pvUser)
+{
+ PRTIOQUEUEPROVINT pThis = hIoQueueProv;
+ RT_NOREF(fReqFlags);
+
+ uint32_t idx = pThis->idxSqTail & pThis->Sq.fRingMask;
+ PLNXIOURINGSQE pSqe = &pThis->paSqes[idx];
+ struct iovec *pIoVec = &pThis->paIoVecs[idx];
+
+ pIoVec->iov_base = pvBuf;
+ pIoVec->iov_len = cbBuf;
+
+ pSqe->u8Flags = 0;
+ pSqe->u16IoPrio = 0;
+ pSqe->i32Fd = (int32_t)RTFileToNative(pHandle->u.hFile);
+ pSqe->u64OffStart = off;
+ pSqe->u64AddrBufIoVec = (uint64_t)(uintptr_t)pIoVec;
+ pSqe->u32BufIoVecSz = 1;
+ pSqe->u64User = (uint64_t)(uintptr_t)pvUser;
+
+ switch (enmOp)
+ {
+ case RTIOQUEUEOP_READ:
+ pSqe->u8Opc = LNX_IOURING_OPC_READV;
+ pSqe->uOpc.u32KrnlRwFlags = 0;
+ break;
+ case RTIOQUEUEOP_WRITE:
+ pSqe->u8Opc = LNX_IOURING_OPC_WRITEV;
+ pSqe->uOpc.u32KrnlRwFlags = 0;
+ break;
+ case RTIOQUEUEOP_SYNC:
+ pSqe->u8Opc = LNX_IOURING_OPC_FSYNC;
+ pSqe->uOpc.u32FsyncFlags = 0;
+ break;
+ default:
+ AssertMsgFailedReturn(("Invalid I/O queue operation: %d\n", enmOp),
+ VERR_INVALID_PARAMETER);
+ }
+
+ pThis->Sq.paidxSqes[idx] = idx;
+ pThis->idxSqTail++;
+ pThis->cSqesToCommit++;
+ return VINF_SUCCESS;
+}
+
+
+/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnCommit} */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_Commit(RTIOQUEUEPROV hIoQueueProv, uint32_t *pcReqsCommitted)
+{
+ PRTIOQUEUEPROVINT pThis = hIoQueueProv;
+
+ ASMWriteFence();
+ ASMAtomicWriteU32(pThis->Sq.pidxTail, pThis->idxSqTail);
+ ASMWriteFence();
+
+ int rc = rtIoQueueLnxIoURingEnter(pThis->iFdIoCtx, pThis->cSqesToCommit, 0, 0 /*fFlags*/);
+ if (RT_SUCCESS(rc))
+ {
+ *pcReqsCommitted = pThis->cSqesToCommit;
+ pThis->cSqesToCommit = 0;
+ }
+
+ return rc;
+}
+
+
+/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWait} */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWait(RTIOQUEUEPROV hIoQueueProv, PRTIOQUEUECEVT paCEvt, uint32_t cCEvt,
+ uint32_t cMinWait, uint32_t *pcCEvt, uint32_t fFlags)
+{
+ PRTIOQUEUEPROVINT pThis = hIoQueueProv;
+ int rc = VINF_SUCCESS;
+ uint32_t cCEvtSeen = 0;
+
+ RT_NOREF(fFlags);
+
+ /*
+ * Check the completion queue first for any completed events which might save us a
+ * context switch later on.
+ */
+ rtIoQueueLnxIoURingFileProvCqCheck(pThis, paCEvt, cCEvt, &cCEvtSeen);
+
+ while ( cCEvtSeen < cMinWait
+ && RT_SUCCESS(rc))
+ {
+ /*
+ * We can employ a blocking read on the event file descriptor, it will return
+ * either when woken up externally or when there are completion events pending.
+ */
+ uint64_t uCnt = 0; /**< The counter value returned upon a successful read(). */
+ ssize_t rcLnx = read(pThis->iFdEvt, &uCnt, sizeof(uCnt));
+ if (rcLnx == sizeof(uCnt))
+ {
+ uint32_t cCEvtThisSeen = 0;
+ rtIoQueueLnxIoURingFileProvCqCheck(pThis, &paCEvt[cCEvtSeen], cCEvt - cCEvtSeen, &cCEvtThisSeen);
+ cCEvtSeen += cCEvtThisSeen;
+
+ /* Whether we got woken up externally. */
+ if (ASMAtomicXchgBool(&pThis->fExtIntr, false))
+ rc = VERR_INTERRUPTED;
+ }
+ else if (rcLnx == -1)
+ rc = RTErrConvertFromErrno(errno);
+ else
+ AssertMsgFailed(("Unexpected read() -> 0\n"));
+ }
+
+ *pcCEvt = cCEvtSeen;
+ return rc;
+}
+
+
+/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWaitWakeup} */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWaitWakeup(RTIOQUEUEPROV hIoQueueProv)
+{
+ PRTIOQUEUEPROVINT pThis = hIoQueueProv;
+ int rc = VINF_SUCCESS;
+
+ if (!ASMAtomicXchgBool(&pThis->fExtIntr, true))
+ {
+ const uint64_t uValAdd = 1;
+ ssize_t rcLnx = write(pThis->iFdEvt, &uValAdd, sizeof(uValAdd));
+
+ Assert(rcLnx == -1 || rcLnx == sizeof(uValAdd));
+ if (rcLnx == -1)
+ rc = RTErrConvertFromErrno(errno);
+ }
+
+ return rc;
+}
+
+
+/**
+ * Async file I/O queue provider virtual method table.
+ */
+RT_DECL_DATA_CONST(RTIOQUEUEPROVVTABLE const) g_RTIoQueueLnxIoURingProv =
+{
+ /** uVersion */
+ RTIOQUEUEPROVVTABLE_VERSION,
+ /** pszId */
+ "LnxIoURingFile",
+ /** cbIoQueueProv */
+ sizeof(RTIOQUEUEPROVINT),
+ /** enmHnd */
+ RTHANDLETYPE_FILE,
+ /** fFlags */
+ 0,
+ /** pfnIsSupported */
+ rtIoQueueLnxIoURingFileProv_IsSupported,
+ /** pfnQueueInit */
+ rtIoQueueLnxIoURingFileProv_QueueInit,
+ /** pfnQueueDestroy */
+ rtIoQueueLnxIoURingFileProv_QueueDestroy,
+ /** pfnHandleRegister */
+ rtIoQueueLnxIoURingFileProv_HandleRegister,
+ /** pfnHandleDeregister */
+ rtIoQueueLnxIoURingFileProv_HandleDeregister,
+ /** pfnReqPrepare */
+ rtIoQueueLnxIoURingFileProv_ReqPrepare,
+ /** pfnReqPrepareSg */
+ NULL,
+ /** pfnCommit */
+ rtIoQueueLnxIoURingFileProv_Commit,
+ /** pfnEvtWait */
+ rtIoQueueLnxIoURingFileProv_EvtWait,
+ /** pfnEvtWaitWakeup */
+ rtIoQueueLnxIoURingFileProv_EvtWaitWakeup,
+ /** uEndMarker */
+ RTIOQUEUEPROVVTABLE_VERSION
+};
+