summaryrefslogtreecommitdiffstats
path: root/src/VBox/Runtime/r3/linux
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:17:27 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:17:27 +0000
commitf215e02bf85f68d3a6106c2a1f4f7f063f819064 (patch)
tree6bb5b92c046312c4e95ac2620b10ddf482d3fa8b /src/VBox/Runtime/r3/linux
parentInitial commit. (diff)
downloadvirtualbox-f215e02bf85f68d3a6106c2a1f4f7f063f819064.tar.xz
virtualbox-f215e02bf85f68d3a6106c2a1f4f7f063f819064.zip
Adding upstream version 7.0.14-dfsg.upstream/7.0.14-dfsg
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/VBox/Runtime/r3/linux')
-rw-r--r--src/VBox/Runtime/r3/linux/Makefile.kup0
-rw-r--r--src/VBox/Runtime/r3/linux/RTFileCopyPartEx-linux.cpp196
-rw-r--r--src/VBox/Runtime/r3/linux/RTFileQuerySectorSize-linux.cpp88
-rw-r--r--src/VBox/Runtime/r3/linux/RTFileSetAllocationSize-linux.cpp86
-rw-r--r--src/VBox/Runtime/r3/linux/RTProcIsRunningByName-linux.cpp128
-rw-r--r--src/VBox/Runtime/r3/linux/RTSystemFirmware-linux.cpp115
-rw-r--r--src/VBox/Runtime/r3/linux/RTSystemQueryDmiString-linux.cpp96
-rw-r--r--src/VBox/Runtime/r3/linux/RTSystemShutdown-linux.cpp111
-rw-r--r--src/VBox/Runtime/r3/linux/RTThreadGetNativeState-linux.cpp121
-rw-r--r--src/VBox/Runtime/r3/linux/fileaio-linux.cpp847
-rw-r--r--src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp940
-rw-r--r--src/VBox/Runtime/r3/linux/krnlmod-linux.cpp358
-rw-r--r--src/VBox/Runtime/r3/linux/mp-linux.cpp328
-rw-r--r--src/VBox/Runtime/r3/linux/rtProcInitExePath-linux.cpp79
-rw-r--r--src/VBox/Runtime/r3/linux/sched-linux.cpp707
-rw-r--r--src/VBox/Runtime/r3/linux/semevent-linux.cpp607
-rw-r--r--src/VBox/Runtime/r3/linux/semeventmulti-linux.cpp600
-rw-r--r--src/VBox/Runtime/r3/linux/semmutex-linux.cpp475
-rw-r--r--src/VBox/Runtime/r3/linux/semwait-linux.h233
-rw-r--r--src/VBox/Runtime/r3/linux/sysfs.cpp736
-rw-r--r--src/VBox/Runtime/r3/linux/systemmem-linux.cpp119
-rw-r--r--src/VBox/Runtime/r3/linux/thread-affinity-linux.cpp105
-rw-r--r--src/VBox/Runtime/r3/linux/time-linux.cpp169
-rw-r--r--src/VBox/Runtime/r3/linux/tpm-linux.cpp229
24 files changed, 7473 insertions, 0 deletions
diff --git a/src/VBox/Runtime/r3/linux/Makefile.kup b/src/VBox/Runtime/r3/linux/Makefile.kup
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/Makefile.kup
diff --git a/src/VBox/Runtime/r3/linux/RTFileCopyPartEx-linux.cpp b/src/VBox/Runtime/r3/linux/RTFileCopyPartEx-linux.cpp
new file mode 100644
index 00000000..0b1d93d7
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/RTFileCopyPartEx-linux.cpp
@@ -0,0 +1,196 @@
+/* $Id: RTFileCopyPartEx-linux.cpp $ */
+/** @file
+ * IPRT - RTFileCopyPartEx, linux specific implementation.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include <iprt/file.h>
+#include "internal/iprt.h"
+
+#include <iprt/asm.h>
+#include <iprt/assert.h>
+#include <iprt/err.h>
+
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+/*
+ * Supply the copy_file_range system call number for x86 and AMD64 when the
+ * system headers did not define it.  On any other architecture the macro stays
+ * undefined, which selects the generic-only implementation further down.
+ */
+#ifndef __NR_copy_file_range
+# if defined(RT_ARCH_X86)
+# define __NR_copy_file_range 377
+# elif defined(RT_ARCH_AMD64)
+# define __NR_copy_file_range 326
+# endif
+#endif
+
+
+#ifndef __NR_copy_file_range
+# include "../../generic/RTFileCopyPartEx-generic.cpp"
+#else /* __NR_copy_file_range - whole file */
+/* Include the generic code as a fallback since copy_file_range is rather new. */
+# define IPRT_FALLBACK_VERSION
+# include "../../generic/RTFileCopyPartEx-generic.cpp"
+# undef IPRT_FALLBACK_VERSION
+
+
+/*********************************************************************************************************************************
+* Global Variables *
+*********************************************************************************************************************************/
+/** Cached result of probing for the copy_file_range system call:
+ * -1 means not probed yet, 0 means the probe got ENOSYS, 1 means the call is
+ * considered available. */
+static int32_t volatile g_fCopyFileRangeSupported = -1;
+
+
+/**
+ * Issues the raw copy_file_range system call through syscall().
+ *
+ * All arguments are handed to the kernel unmodified and in the order given.
+ *
+ * @returns Whatever syscall() returned, converted to loff_t.  errno is left
+ *          as set by syscall().
+ * @param   fdIn        Passed as the input file descriptor.
+ * @param   poffIn      Passed as the pointer to the input offset.
+ * @param   fdOut       Passed as the output file descriptor.
+ * @param   poffOut     Passed as the pointer to the output offset.
+ * @param   cbChunk     Passed as the byte count.
+ * @param   fFlags      Passed as the flags argument.
+ */
+DECLINLINE(loff_t)
+MyCopyFileRangeSysCall(int fdIn, loff_t *poffIn, int fdOut, loff_t *poffOut, size_t cbChunk, unsigned int fFlags)
+{
+    loff_t const cbRet = syscall(__NR_copy_file_range, fdIn, poffIn, fdOut, poffOut, cbChunk, fFlags);
+    return cbRet;
+}
+
+
+/**
+ * Probes the kernel for copy_file_range and caches the verdict in
+ * g_fCopyFileRangeSupported.
+ *
+ * The probe is made with invalid file descriptors; only the resulting errno
+ * is inspected.  Anything other than ENOSYS counts as "supported".
+ *
+ * @returns true if the system call is considered available, false if not.
+ */
+DECL_NO_INLINE(static, bool) HasCopyFileRangeSyscallSlow(void)
+{
+    errno = 0;
+    MyCopyFileRangeSysCall(-1, NULL, -1, NULL, 4096, 0);
+    bool const fSupported = errno != ENOSYS;
+
+    ASMAtomicWriteS32(&g_fCopyFileRangeSupported, fSupported ? 1 : 0);
+    return fSupported;
+}
+
+/**
+ * Checks whether copy_file_range can be used, consulting the cached probe
+ * result first and running the slow probe only while the cache still holds
+ * the initial -1.
+ *
+ * @returns true if the system call is considered available, false if not.
+ */
+DECLINLINE(bool) HasCopyFileRangeSyscall(void)
+{
+    int32_t const iCached = ASMAtomicUoReadS32(&g_fCopyFileRangeSupported);
+    return iCached == -1 ? HasCopyFileRangeSyscallSlow() : iCached == 1;
+}
+
+
+
+/**
+ * Prepares the buffer state for RTFileCopyPartEx.
+ *
+ * When copy_file_range is available no buffer is set up; the state is tagged
+ * with the allocation type -42, which RTFileCopyPartEx checks for.  Otherwise
+ * the generic fallback does the preparation.
+ *
+ * @returns VINF_SUCCESS on the syscall path, otherwise the fallback's status.
+ * @param   pBufState   The buffer state to initialize.
+ * @param   cbToCopy    Forwarded to the fallback; unused on the syscall path.
+ */
+RTDECL(int) RTFileCopyPartPrep(PRTFILECOPYPARTBUFSTATE pBufState, uint64_t cbToCopy)
+{
+    if (!HasCopyFileRangeSyscall())
+        return rtFileCopyPartPrepFallback(pBufState, cbToCopy);
+
+    /* Syscall path: no buffer, just the marker and the magic. */
+    pBufState->iAllocType = -42;
+    pBufState->pbBuf      = NULL;
+    pBufState->cbBuf      = 0;
+    pBufState->uMagic     = RTFILECOPYPARTBUFSTATE_MAGIC;
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Cleans up a buffer state set up by RTFileCopyPartPrep.
+ *
+ * Always delegates to the generic fallback cleanup.
+ *
+ * @param   pBufState   The buffer state to clean up.
+ */
+RTDECL(void) RTFileCopyPartCleanup(PRTFILECOPYPARTBUFSTATE pBufState)
+{
+    rtFileCopyPartCleanupFallback(pBufState);
+}
+
+
+/**
+ * Copies a part of one file to another, using copy_file_range when the buffer
+ * state was prepared for it and the generic fallback otherwise.
+ *
+ * @returns IPRT status code.
+ * @retval  VERR_INVALID_FLAGS if the buffer state magic is wrong or @a fFlags
+ *          is non-zero (syscall path).
+ * @retval  VERR_NEGATIVE_SEEK if @a offSrc or @a offDst is negative (syscall
+ *          path).
+ * @retval  VERR_EOF if the system call reported zero bytes before everything
+ *          was copied and @a pcbCopied is NULL.
+ *
+ * @param   hFileSrc    The source file.
+ * @param   offSrc      Offset in the source file to start reading at.
+ * @param   hFileDst    The destination file.
+ * @param   offDst      Offset in the destination file to start writing at.
+ * @param   cbToCopy    Number of bytes to copy.  Zero returns immediately.
+ * @param   fFlags      Must be zero on the syscall path.
+ * @param   pBufState   Buffer state set up by RTFileCopyPartPrep.
+ * @param   pcbCopied   Where to return the number of bytes copied.  Optional;
+ *                      set to zero on entry when given.
+ */
+RTDECL(int) RTFileCopyPartEx(RTFILE hFileSrc, RTFOFF offSrc, RTFILE hFileDst, RTFOFF offDst, uint64_t cbToCopy,
+                             uint32_t fFlags, PRTFILECOPYPARTBUFSTATE pBufState, uint64_t *pcbCopied)
+{
+    /*
+     * Validate input.
+     */
+    if (pcbCopied)
+        *pcbCopied = 0;
+    AssertReturn(pBufState->uMagic == RTFILECOPYPARTBUFSTATE_MAGIC, VERR_INVALID_FLAGS);
+    if (pBufState->iAllocType == -42)
+    { /* more and more likely as time goes */ }
+    else
+        return rtFileCopyPartExFallback(hFileSrc, offSrc, hFileDst, offDst, cbToCopy, fFlags, pBufState, pcbCopied);
+    AssertReturn(offSrc >= 0, VERR_NEGATIVE_SEEK);
+    AssertReturn(offDst >= 0, VERR_NEGATIVE_SEEK);
+    AssertReturn(!fFlags, VERR_INVALID_FLAGS);
+
+    /*
+     * If nothing to copy, return right away.
+     */
+    if (!cbToCopy)
+        return VINF_SUCCESS;
+
+    /*
+     * Do the copying, at most 1 GiB per system call.
+     */
+    uint64_t cbCopied = 0;
+    int      rc       = VINF_SUCCESS;
+    do
+    {
+        size_t  cbThisCopy = (size_t)RT_MIN(cbToCopy - cbCopied, _1G);
+        /* The source offset must derive from offSrc and the destination offset
+           from offDst; the assertions after the call depend on this pairing. */
+        loff_t  offThisSrc = offSrc + cbCopied;
+        loff_t  offThisDst = offDst + cbCopied;
+        ssize_t cbActual   = MyCopyFileRangeSysCall((int)RTFileToNative(hFileSrc), &offThisSrc,
+                                                    (int)RTFileToNative(hFileDst), &offThisDst,
+                                                    cbThisCopy, 0);
+        if (cbActual < 0)
+        {
+            rc = errno;
+            Assert(rc != 0);
+            rc = rc != 0 ? RTErrConvertFromErrno(rc) : VERR_READ_ERROR;
+            /* Only the very first chunk may be redirected to the fallback. */
+            if (rc != VERR_NOT_SAME_DEVICE || cbCopied != 0)
+                break;
+
+            /* Fall back to generic implementation if the syscall refuses to handle the case. */
+            rc = rtFileCopyPartPrepFallback(pBufState, cbToCopy);
+            if (RT_SUCCESS(rc))
+                return rtFileCopyPartExFallback(hFileSrc, offSrc, hFileDst, offDst, cbToCopy, fFlags, pBufState, pcbCopied);
+            return rc;
+        }
+        Assert(offThisSrc == offSrc + (int64_t)cbCopied + cbActual);
+        Assert(offThisDst == offDst + (int64_t)cbCopied + cbActual);
+
+        /* Zero bytes: stop; this is only an error if the caller cannot be told
+           how much was copied. */
+        if (cbActual == 0)
+        {
+            if (!pcbCopied)
+                rc = VERR_EOF;
+            break;
+        }
+
+        cbCopied += cbActual;
+    } while (cbCopied < cbToCopy);
+
+    if (pcbCopied)
+        *pcbCopied = cbCopied;
+
+    return rc;
+}
+
+#endif /* __NR_copy_file_range */
+
diff --git a/src/VBox/Runtime/r3/linux/RTFileQuerySectorSize-linux.cpp b/src/VBox/Runtime/r3/linux/RTFileQuerySectorSize-linux.cpp
new file mode 100644
index 00000000..ffd615fb
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/RTFileQuerySectorSize-linux.cpp
@@ -0,0 +1,88 @@
+/* $Id: RTFileQuerySectorSize-linux.cpp $ */
+/** @file
+ * IPRT - RTFileQuerySectorSize, Linux implementation.
+ */
+
+/*
+ * Copyright (C) 2017-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "internal/iprt.h"
+#include <iprt/file.h>
+
+#include <iprt/assert.h>
+#include <iprt/errcore.h>
+
+#include <errno.h>
+#include <linux/fs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+
+/**
+ * Queries the logical sector size of the block device behind a file handle.
+ *
+ * @returns IPRT status code.
+ * @retval  VERR_INVALID_PARAMETER if @a pcbSector is not a valid pointer.
+ * @retval  VERR_INVALID_FUNCTION if the handle is not a block device or the
+ *          reported size is not positive.
+ *
+ * @param   hFile       The file handle; must refer to a block device.
+ * @param   pcbSector   Where to return the size reported by BLKSSZGET.
+ */
+RTDECL(int) RTFileQuerySectorSize(RTFILE hFile, uint32_t *pcbSector)
+{
+    AssertPtrReturn(pcbSector, VERR_INVALID_PARAMETER);
+
+    int const   fdNative = (int)RTFileToNative(hFile);
+    struct stat StatBuf  = { 0 };
+    if (fstat(fdNative, &StatBuf) != 0)
+    {
+        int const rcStat = RTErrConvertFromErrno(errno);
+        AssertMsgFailed(("fstat failed: errno=%d / %Rrc\n", errno, rcStat));
+        return rcStat;
+    }
+
+    if (!S_ISBLK(StatBuf.st_mode))
+    {
+        AssertMsgFailed(("not a block device.\n"));
+        return VERR_INVALID_FUNCTION;
+    }
+
+    int cbLogical = 0;
+    if (ioctl(fdNative, BLKSSZGET, &cbLogical) != 0)
+    {
+        int const rcIoCtl = RTErrConvertFromErrno(errno);
+        AssertMsgFailed(("ioctl failed: errno=%d / %Rrc\n", errno, rcIoCtl));
+        return rcIoCtl;
+    }
+
+    AssertReturn(cbLogical > 0, VERR_INVALID_FUNCTION);
+    *pcbSector = cbLogical;
+    return VINF_SUCCESS;
+}
+
diff --git a/src/VBox/Runtime/r3/linux/RTFileSetAllocationSize-linux.cpp b/src/VBox/Runtime/r3/linux/RTFileSetAllocationSize-linux.cpp
new file mode 100644
index 00000000..f3acd7fe
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/RTFileSetAllocationSize-linux.cpp
@@ -0,0 +1,86 @@
+/* $Id: RTFileSetAllocationSize-linux.cpp $ */
+/** @file
+ * IPRT - RTFileSetAllocationSize, linux implementation.
+ */
+
+/*
+ * Copyright (C) 2016-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_FILE
+#include <iprt/file.h>
+#include "internal/iprt.h"
+
+#include <iprt/assert.h>
+#include <iprt/errcore.h>
+
+#include <dlfcn.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/fcntl.h>
+
+/**
+ * The Linux specific fallocate() method.
+ *
+ * Used as the type of the function pointer that RTFileSetAllocationSize
+ * resolves at runtime via dlsym() under the name "fallocate64".
+ *
+ * NOTE(review): the offset and length parameters are declared as off_t here;
+ * this presumably relies on off_t being 64-bit in this build - confirm.
+ */
+typedef int (*PFNLNXFALLOCATE) (int iFd, int fMode, off_t offStart, off_t cb);
+/** Flag to specify that the file size should not be extended.
+ * Passed as the mode argument when RTFILE_ALLOC_SIZE_F_KEEP_SIZE is set. */
+#define LNX_FALLOC_FL_KEEP_SIZE 1
+
+/**
+ * Sets the allocation size of a file by calling fallocate64, which is looked
+ * up dynamically so that its absence can be reported instead of failing to
+ * link.
+ *
+ * @returns IPRT status code.
+ * @retval  VERR_INVALID_PARAMETER for a NIL handle or unknown flags.
+ * @retval  VERR_NOT_SUPPORTED if fallocate64 cannot be resolved, if it fails
+ *          with EOPNOTSUPP, or if @a cbSize does not fit into off_t.
+ *
+ * @param   hFile       The file to allocate space for.
+ * @param   cbSize      The size to allocate, counted from offset zero.
+ * @param   fFlags      RTFILE_ALLOC_SIZE_F_XXX.
+ */
+RTDECL(int) RTFileSetAllocationSize(RTFILE hFile, uint64_t cbSize, uint32_t fFlags)
+{
+    AssertReturn(hFile != NIL_RTFILE, VERR_INVALID_PARAMETER);
+    AssertReturn(!(fFlags & ~RTFILE_ALLOC_SIZE_F_VALID), VERR_INVALID_PARAMETER);
+    AssertMsgReturn(sizeof(off_t) >= sizeof(cbSize) || RT_HIDWORD(cbSize) == 0,
+                    ("64-bit filesize not supported! cbSize=%lld\n", cbSize),
+                    VERR_NOT_SUPPORTED);
+
+    PFNLNXFALLOCATE const pfnFAllocate = (PFNLNXFALLOCATE)(uintptr_t)dlsym(RTLD_DEFAULT, "fallocate64");
+    if (!RT_VALID_PTR(pfnFAllocate))
+        return VERR_NOT_SUPPORTED;
+
+    int const fMode = (fFlags & RTFILE_ALLOC_SIZE_F_KEEP_SIZE) ? LNX_FALLOC_FL_KEEP_SIZE : 0;
+    if (pfnFAllocate(RTFileToNative(hFile), fMode, 0, cbSize) == 0)
+        return VINF_SUCCESS;
+
+    /* Failure: errno tells why. */
+    if (errno == EOPNOTSUPP)
+        return VERR_NOT_SUPPORTED;
+    return RTErrConvertFromErrno(errno);
+}
+RT_EXPORT_SYMBOL(RTFileSetAllocationSize);
diff --git a/src/VBox/Runtime/r3/linux/RTProcIsRunningByName-linux.cpp b/src/VBox/Runtime/r3/linux/RTProcIsRunningByName-linux.cpp
new file mode 100644
index 00000000..6049fa1a
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/RTProcIsRunningByName-linux.cpp
@@ -0,0 +1,128 @@
+/* $Id: RTProcIsRunningByName-linux.cpp $ */
+/** @file
+ * IPRT - RTProcIsRunningByName, Linux implementation.
+ */
+
+/*
+ * Copyright (C) 2009-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_PROCESS
+#include <iprt/process.h>
+#include <iprt/string.h>
+#include <iprt/dir.h>
+#include <iprt/path.h>
+#include <iprt/stream.h>
+#include <iprt/param.h>
+#include <iprt/assert.h>
+
+#include <unistd.h>
+
+
+/**
+ * Checks whether a process with the given name is running by walking /proc.
+ *
+ * If @a pszName contains a path, the whole executable path is compared;
+ * otherwise only the filename part is.
+ *
+ * @returns true if a matching process was found, false if not, if @a pszName
+ *          is NULL, or if /proc could not be opened.
+ * @param   pszName     Process name to look for, with or without a path.
+ */
+RTR3DECL(bool) RTProcIsRunningByName(const char *pszName)
+{
+    /*
+     * Quick validation.
+     */
+    if (!pszName)
+        return false;
+
+    bool const fWithPath = RTPathHavePath(pszName);
+
+    /*
+     * Enumerate /proc.
+     */
+    RTDIR hProcDir;
+    int rc = RTDirOpen(&hProcDir, "/proc");
+    AssertMsgRCReturn(rc, ("RTDirOpen on /proc failed: rc=%Rrc\n", rc), false);
+
+    bool       fFound = false;
+    RTDIRENTRY Entry;
+    while (!fFound && RT_SUCCESS(RTDirRead(hProcDir, &Entry, NULL)))
+    {
+        /*
+         * Only numeric directory entries are of interest.
+         */
+        if (   Entry.enmType != RTDIRENTRYTYPE_DIRECTORY
+            && Entry.enmType != RTDIRENTRYTYPE_UNKNOWN)
+            continue;
+        if (RTStrToUInt32(Entry.szName) == 0)
+            continue;
+
+        /*
+         * Try readlink on exe first since it is faster and more reliable.
+         * Fall back on reading the first line of cmdline if that fails
+         * (access errors typically).  cmdline is unreliable as it might
+         * contain whatever the execv caller passed as argv[0].
+         */
+        char szPath[RTPATH_MAX];
+        RTStrPrintf(szPath, sizeof(szPath), "/proc/%s/exe", &Entry.szName[0]);
+        char szImage[RTPATH_MAX];
+        int const cchLink = readlink(szPath, szImage, sizeof(szImage) - 1);
+        if (   cchLink > 0
+            && (size_t)cchLink < sizeof(szImage))
+        {
+            szImage[cchLink] = '\0';
+            rc = VINF_SUCCESS;
+        }
+        else
+        {
+            RTStrPrintf(szPath, sizeof(szPath), "/proc/%s/cmdline", &Entry.szName[0]);
+            PRTSTREAM pStrm;
+            rc = RTStrmOpen(szPath, "r", &pStrm);
+            if (RT_SUCCESS(rc))
+            {
+                rc = RTStrmGetLine(pStrm, szImage, sizeof(szImage));
+                RTStrmClose(pStrm);
+            }
+        }
+        if (RT_FAILURE(rc))
+            continue;
+
+        /*
+         * Compare either the full path or just the filename part.
+         */
+        char const *pszCandidate = fWithPath ? szImage : RTPathFilename(szImage);
+        fFound = RTStrCmp(pszCandidate, pszName) == 0;
+    }
+
+    RTDirClose(hProcDir);
+    return fFound;
+}
+
diff --git a/src/VBox/Runtime/r3/linux/RTSystemFirmware-linux.cpp b/src/VBox/Runtime/r3/linux/RTSystemFirmware-linux.cpp
new file mode 100644
index 00000000..2d7b8986
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/RTSystemFirmware-linux.cpp
@@ -0,0 +1,115 @@
+/* $Id: RTSystemFirmware-linux.cpp $ */
+/** @file
+ * IPRT - System firmware information, linux.
+ */
+
+/*
+ * Copyright (C) 2019-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "internal/iprt.h"
+#include <iprt/system.h>
+
+#include <iprt/err.h>
+#include <iprt/file.h>
+#include <iprt/string.h>
+#include <iprt/linux/sysfs.h>
+
+
+/*********************************************************************************************************************************
+* Defined Constants And Macros *
+*********************************************************************************************************************************/
+/** Defines the UEFI Globals UUID that is used here as filename suffix (case sensitive).
+ * RTSystemQueryFirmwareBoolean appends it, after a dash, to the variable name
+ * when opening the file. */
+#define VBOX_UEFI_UUID_GLOBALS "8be4df61-93ca-11d2-aa0d-00e098032b8c"
+
+
+/**
+ * Determines the firmware type from what sysfs exposes.
+ *
+ * UEFI is reported when "firmware/efi/" exists in sysfs, BIOS when sysfs
+ * itself exists but that directory does not.
+ *
+ * @returns VINF_SUCCESS, or VERR_NOT_SUPPORTED when sysfs is not there.
+ * @param   penmFirmwareType    Where to return the firmware type; set to
+ *                              RTSYSFWTYPE_INVALID on failure.
+ */
+RTDECL(int) RTSystemQueryFirmwareType(PRTSYSFWTYPE penmFirmwareType)
+{
+    if (RTLinuxSysFsExists("firmware/efi/"))
+    {
+        *penmFirmwareType = RTSYSFWTYPE_UEFI;
+        return VINF_SUCCESS;
+    }
+
+    if (RTLinuxSysFsExists(""))
+    {
+        *penmFirmwareType = RTSYSFWTYPE_BIOS;
+        return VINF_SUCCESS;
+    }
+
+    *penmFirmwareType = RTSYSFWTYPE_INVALID;
+    return VERR_NOT_SUPPORTED;
+}
+RT_EXPORT_SYMBOL(RTSystemQueryFirmwareType);
+
+
+/**
+ * Queries a boolean firmware property by reading the corresponding file
+ * below "firmware/efi/efivars/" in sysfs.
+ *
+ * @returns IPRT status code.
+ * @retval  VINF_SUCCESS with @a *pfValue false if the file does not exist.
+ * @retval  VERR_NOT_SUPPORTED if opening the file is denied.
+ * @retval  VERR_SYS_UNSUPPORTED_FIRMWARE_PROPERTY for valid properties other
+ *          than RTSYSFWBOOL_SECURE_BOOT.
+ * @retval  VERR_INVALID_PARAMETER if @a enmBoolean is out of range.
+ *
+ * @param   enmBoolean  The property to query.
+ * @param   pfValue     Where to return the value; always initialized to false.
+ */
+RTDECL(int) RTSystemQueryFirmwareBoolean(RTSYSFWBOOL enmBoolean, bool *pfValue)
+{
+    *pfValue = false;
+
+    /*
+     * Translate the property to variable base filename.
+     */
+    const char *pszVarBase;
+    if (enmBoolean == RTSYSFWBOOL_SECURE_BOOT)
+        pszVarBase = "firmware/efi/efivars/SecureBoot";
+    else
+    {
+        AssertReturn(enmBoolean > RTSYSFWBOOL_INVALID && enmBoolean < RTSYSFWBOOL_END, VERR_INVALID_PARAMETER);
+        return VERR_SYS_UNSUPPORTED_FIRMWARE_PROPERTY;
+    }
+
+    /*
+     * Try open and read the variable value.
+     */
+    RTFILE hVarFile;
+    int rc = RTLinuxSysFsOpen(&hVarFile, "%s-" VBOX_UEFI_UUID_GLOBALS, pszVarBase);
+    /** @todo try other suffixes if file-not-found. */
+    if (RT_SUCCESS(rc))
+    {
+        uint8_t abData[16];
+        size_t  cbData = 0;
+        rc = RTLinuxSysFsReadFile(hVarFile, abData, sizeof(abData), &cbData);
+        /* The last byte read is taken as the value. */
+        *pfValue = cbData > 1 && abData[cbData - 1] != 0;
+        RTFileClose(hVarFile);
+        return rc;
+    }
+
+    /* A missing variable means "false", not an error. */
+    if (rc == VERR_FILE_NOT_FOUND || rc == VERR_PATH_NOT_FOUND)
+        return VINF_SUCCESS;
+    if (rc == VERR_PERMISSION_DENIED)
+        return VERR_NOT_SUPPORTED;
+    return rc;
+}
+RT_EXPORT_SYMBOL(RTSystemQueryFirmwareBoolean);
+
diff --git a/src/VBox/Runtime/r3/linux/RTSystemQueryDmiString-linux.cpp b/src/VBox/Runtime/r3/linux/RTSystemQueryDmiString-linux.cpp
new file mode 100644
index 00000000..91cf6eb2
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/RTSystemQueryDmiString-linux.cpp
@@ -0,0 +1,96 @@
+/* $Id: RTSystemQueryDmiString-linux.cpp $ */
+/** @file
+ * IPRT - RTSystemQueryDmiString, linux ring-3.
+ */
+
+/*
+ * Copyright (C) 2010-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include <iprt/system.h>
+#include "internal/iprt.h"
+
+#include <iprt/err.h>
+#include <iprt/assert.h>
+#include <iprt/linux/sysfs.h>
+
+#include <errno.h>
+
+
+/**
+ * Queries a DMI string by reading the matching sysfs file, trying
+ * "devices/virtual/dmi/" first and "class/dmi/" second.
+ *
+ * @returns IPRT status code.
+ * @retval  VERR_BUFFER_OVERFLOW is passed on as returned by the sysfs reader.
+ * @retval  VERR_NOT_SUPPORTED if the string is not handled here or the sysfs
+ *          file is missing or is a directory.
+ * @retval  VERR_ACCESS_DENIED if reading the file is not permitted.
+ *
+ * @param   enmString   The string to query.
+ * @param   pszBuf      Output buffer; set to an empty string up front.
+ * @param   cbBuf       Size of the output buffer, must be non-zero.
+ */
+RTDECL(int) RTSystemQueryDmiString(RTSYSDMISTR enmString, char *pszBuf, size_t cbBuf)
+{
+    AssertPtrReturn(pszBuf, VERR_INVALID_POINTER);
+    AssertReturn(cbBuf > 0, VERR_INVALID_PARAMETER);
+    *pszBuf = '\0';
+    AssertReturn(enmString > RTSYSDMISTR_INVALID && enmString < RTSYSDMISTR_END, VERR_INVALID_PARAMETER);
+
+    /*
+     * Map the string identifier to the sysfs leaf name.
+     */
+    const char *pszLeaf;
+    switch (enmString)
+    {
+        case RTSYSDMISTR_PRODUCT_NAME:
+            pszLeaf = "id/product_name";
+            break;
+        case RTSYSDMISTR_PRODUCT_VERSION:
+            pszLeaf = "id/product_version";
+            break;
+        case RTSYSDMISTR_PRODUCT_UUID:
+            pszLeaf = "id/product_uuid";
+            break;
+        case RTSYSDMISTR_PRODUCT_SERIAL:
+            pszLeaf = "id/product_serial";
+            break;
+        case RTSYSDMISTR_MANUFACTURER:
+            pszLeaf = "id/sys_vendor";
+            break;
+        default:
+            return VERR_NOT_SUPPORTED;
+    }
+
+    /*
+     * Try both locations; success and buffer overflow are final.
+     */
+    size_t cchRead = 0;
+    int rc = RTLinuxSysFsReadStrFile(pszBuf, cbBuf, &cchRead, "devices/virtual/dmi/%s", pszLeaf);
+    if (RT_SUCCESS(rc) || rc == VERR_BUFFER_OVERFLOW)
+        return rc;
+
+    rc = RTLinuxSysFsReadStrFile(pszBuf, cbBuf, &cchRead, "class/dmi/%s", pszLeaf);
+    if (RT_SUCCESS(rc) || rc == VERR_BUFFER_OVERFLOW)
+        return rc;
+
+    /*
+     * Translate the failure into what callers expect.
+     */
+    if (rc == VINF_SUCCESS)
+        AssertFailed();
+    else if (   rc == VERR_FILE_NOT_FOUND
+             || rc == VERR_PATH_NOT_FOUND
+             || rc == VERR_IS_A_DIRECTORY)
+        rc = VERR_NOT_SUPPORTED;
+    else if (   rc == VERR_PERMISSION_DENIED
+             || rc == VERR_ACCESS_DENIED)
+        rc = VERR_ACCESS_DENIED;
+
+    return rc;
+}
+RT_EXPORT_SYMBOL(RTSystemQueryDmiString);
+
diff --git a/src/VBox/Runtime/r3/linux/RTSystemShutdown-linux.cpp b/src/VBox/Runtime/r3/linux/RTSystemShutdown-linux.cpp
new file mode 100644
index 00000000..fd198e30
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/RTSystemShutdown-linux.cpp
@@ -0,0 +1,111 @@
+/* $Id: RTSystemShutdown-linux.cpp $ */
+/** @file
+ * IPRT - RTSystemShutdown, linux implementation.
+ */
+
+/*
+ * Copyright (C) 2012-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include <iprt/system.h>
+#include "internal/iprt.h"
+
+#include <iprt/assert.h>
+#include <iprt/env.h>
+#include <iprt/err.h>
+#include <iprt/process.h>
+#include <iprt/string.h>
+
+
+/**
+ * Shuts down, reboots or powers off the system by running /sbin/shutdown and
+ * waiting for it to exit.
+ *
+ * @returns IPRT status code.
+ * @retval  VERR_SYS_SHUTDOWN_FAILED if the shutdown process did not exit
+ *          normally with status zero.
+ *
+ * @param   cMsDelay    Delay in milliseconds.  Below 500 ms "now" is passed to
+ *                      the command, otherwise the value is converted to whole
+ *                      seconds and passed as a plain number.
+ * @param   fFlags      RTSYSTEM_SHUTDOWN_XXX; the action bits select the
+ *                      command options.
+ * @param   pszLogMsg   Message passed as the last command argument.
+ */
+RTDECL(int) RTSystemShutdown(RTMSINTERVAL cMsDelay, uint32_t fFlags, const char *pszLogMsg)
+{
+    AssertPtrReturn(pszLogMsg, VERR_INVALID_POINTER);
+    AssertReturn(!(fFlags & ~RTSYSTEM_SHUTDOWN_VALID_MASK), VERR_INVALID_PARAMETER);
+
+    /*
+     * Pick the options for the requested action.
+     */
+    const char *pszOpt1 = NULL;
+    const char *pszOpt2 = NULL;
+    switch (fFlags & RTSYSTEM_SHUTDOWN_ACTION_MASK)
+    {
+        case RTSYSTEM_SHUTDOWN_HALT:
+            pszOpt1 = "-h";
+            pszOpt2 = "-H";
+            break;
+        case RTSYSTEM_SHUTDOWN_REBOOT:
+            pszOpt1 = "-r";
+            break;
+        case RTSYSTEM_SHUTDOWN_POWER_OFF:
+        case RTSYSTEM_SHUTDOWN_POWER_OFF_HALT:
+            pszOpt1 = "-h";
+            pszOpt2 = "-P";
+            break;
+    }
+
+    /*
+     * Format the time argument.
+     */
+    char szWhen[80];
+    if (cMsDelay >= 500)
+        RTStrPrintf(szWhen, sizeof(szWhen), "%u", (unsigned)((cMsDelay + 499) / 1000));
+    else
+        strcpy(szWhen, "now");
+
+    /*
+     * Assemble the argument vector; unused trailing slots stay NULL and
+     * terminate the list.
+     */
+    const char *apszArgs[6] = { NULL, NULL, NULL, NULL, NULL, NULL };
+    unsigned    cArgs = 0;
+    apszArgs[cArgs++] = "/sbin/shutdown";
+    if (pszOpt1)
+        apszArgs[cArgs++] = pszOpt1;
+    if (pszOpt2)
+        apszArgs[cArgs++] = pszOpt2;
+    apszArgs[cArgs++] = szWhen;
+    apszArgs[cArgs++] = pszLogMsg;
+
+    /*
+     * Start the shutdown process and wait for it to complete.
+     */
+    RTPROCESS hProc;
+    int rc = RTProcCreate(apszArgs[0], apszArgs, RTENV_DEFAULT, 0 /*fFlags*/, &hProc);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    RTPROCSTATUS ProcStatus;
+    rc = RTProcWait(hProc, RTPROCWAIT_FLAGS_BLOCK, &ProcStatus);
+    if (   RT_SUCCESS(rc)
+        && (   ProcStatus.enmReason != RTPROCEXITREASON_NORMAL
+            || ProcStatus.iStatus   != 0))
+        rc = VERR_SYS_SHUTDOWN_FAILED;
+
+    return rc;
+}
+RT_EXPORT_SYMBOL(RTSystemShutdown);
+
diff --git a/src/VBox/Runtime/r3/linux/RTThreadGetNativeState-linux.cpp b/src/VBox/Runtime/r3/linux/RTThreadGetNativeState-linux.cpp
new file mode 100644
index 00000000..26c0afdb
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/RTThreadGetNativeState-linux.cpp
@@ -0,0 +1,121 @@
+/* $Id: RTThreadGetNativeState-linux.cpp $ */
+/** @file
+ * IPRT - RTThreadGetNativeState, linux implementation.
+ */
+
+/*
+ * Copyright (C) 2010-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_PROCESS
+#include <iprt/thread.h>
+#include "internal/iprt.h"
+
+#include <iprt/assert.h>
+#include <iprt/ctype.h>
+#include <iprt/errcore.h>
+#include <iprt/string.h>
+
+#include "internal/thread.h"
+
+#include <unistd.h>
+#include <sys/fcntl.h>
+
+
+/**
+ * Queries the native state of a thread by parsing the state character in
+ * /proc/self/task/<tid>/stat.
+ *
+ * @returns The native thread state.
+ * @retval  RTTHREADNATIVESTATE_INVALID if rtThreadGet() cannot resolve the
+ *          handle.
+ * @retval  RTTHREADNATIVESTATE_UNKNOWN if the stat file cannot be opened,
+ *          read or parsed, or if it reports an unrecognized state character.
+ *
+ * @param   hThread     The thread handle.
+ */
+RTDECL(RTTHREADNATIVESTATE) RTThreadGetNativeState(RTTHREAD hThread)
+{
+    RTTHREADNATIVESTATE enmRet  = RTTHREADNATIVESTATE_INVALID;
+    PRTTHREADINT        pThread = rtThreadGet(hThread);
+    if (pThread)
+    {
+        enmRet = RTTHREADNATIVESTATE_UNKNOWN;
+
+        /* The buffer first holds the path and is then reused for the file content. */
+        char szName[512];
+        RTStrPrintf(szName, sizeof(szName), "/proc/self/task/%u/stat", pThread->tid);
+        int fd = open(szName, O_RDONLY, 0);
+        if (fd >= 0)
+        {
+            ssize_t cch = read(fd, szName, sizeof(szName) - 1);
+            close(fd);
+            if (cch > 0)
+            {
+                szName[cch] = '\0';
+
+                /*
+                 * Skip the pid and the (comm name) and stop at the ')' that is
+                 * followed by " <letter> ", i.e. right before the state char.
+                 * The short-circuit evaluation keeps the look-ahead from
+                 * reading past the terminator.
+                 */
+                const char *psz = szName;
+                while (   *psz
+                       && (   *psz != ')'
+                           || !RT_C_IS_SPACE(psz[1])
+                           || !RT_C_IS_ALPHA(psz[2])
+                           || !RT_C_IS_SPACE(psz[3])
+                          )
+                      )
+                    psz++;
+                if (*psz == ')')
+                {
+                    switch (psz[2])
+                    {
+                        case 'R':   /* running */
+                            enmRet = RTTHREADNATIVESTATE_RUNNING;
+                            break;
+
+                        case 'S':   /* sleeping */
+                        case 'D':   /* disk sleeping */
+                            enmRet = RTTHREADNATIVESTATE_BLOCKED;
+                            break;
+
+                        case 'T':   /* stopped (or tracing stop on older kernels) */
+                        case 't':   /* tracing stop (reported separately by newer kernels) */
+                            enmRet = RTTHREADNATIVESTATE_SUSPENDED;
+                            break;
+
+                        case 'Z':   /* zombie */
+                        case 'X':   /* dead */
+                            enmRet = RTTHREADNATIVESTATE_TERMINATED;
+                            break;
+
+                        default:
+                            AssertMsgFailed(("state=%c\n", psz[2]));
+                            enmRet = RTTHREADNATIVESTATE_UNKNOWN;
+                            break;
+                    }
+                }
+                else
+                    AssertMsgFailed(("stat='%s'\n", szName));
+            }
+        }
+        rtThreadRelease(pThread);
+    }
+    return enmRet;
+}
+
diff --git a/src/VBox/Runtime/r3/linux/fileaio-linux.cpp b/src/VBox/Runtime/r3/linux/fileaio-linux.cpp
new file mode 100644
index 00000000..2f365a45
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/fileaio-linux.cpp
@@ -0,0 +1,847 @@
+/* $Id: fileaio-linux.cpp $ */
+/** @file
+ * IPRT - File async I/O, native implementation for the Linux host platform.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+/** @page pg_rtfileaio_linux RTFile Async I/O - Linux Implementation Notes
+ * @internal
+ *
+ * Linux implements the kernel async I/O API through the io_* syscalls. They are
+ * not exposed in the glibc (the aio_* API uses userspace threads and blocking
+ * I/O operations to simulate async behavior). There is an external library
+ * called libaio which implements these syscalls but because we don't want to
+ * have another dependency and this library is not installed by default and the
+ * interface is really simple we use the kernel interface directly using wrapper
+ * functions.
+ *
+ * The interface has some limitations. The first one is that the file must be
+ * opened with O_DIRECT. This disables caching done by the kernel which can be
+ * compensated if the user of this API implements caching itself. The next
+ * limitation is that data buffers must be aligned at a 512 byte boundary or the
+ * request will fail.
+ */
+/** @todo r=bird: What's this about "must be opened with O_DIRECT"? An
+ * explanation would be nice, esp. seeing what Linus is quoted saying
+ * about it in the open man page... */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_FILE
+#include <iprt/asm.h>
+#include <iprt/mem.h>
+#include <iprt/assert.h>
+#include <iprt/string.h>
+#include <iprt/err.h>
+#include <iprt/log.h>
+#include <iprt/thread.h>
+#include "internal/fileaio.h"
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <errno.h>
+
+#include <iprt/file.h>
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+/** The async I/O context handle.
+ * Filled in by the io_setup syscall and passed back to io_destroy, io_submit,
+ * io_cancel and io_getevents by the wrappers in this file. */
+typedef unsigned long LNXKAIOCONTEXT;
+
+/**
+ * Supported commands for the iocbs.
+ *
+ * Stored in LNXKAIOIOCB::u16IoOpCode.  The visible code only issues READ,
+ * WRITE and FSYNC; the numeric values presumably mirror the kernel's
+ * IOCB_CMD_* constants (verify against linux/aio_abi.h).
+ */
+enum
+{
+ LNXKAIO_IOCB_CMD_READ = 0,
+ LNXKAIO_IOCB_CMD_WRITE = 1,
+ LNXKAIO_IOCB_CMD_FSYNC = 2,
+ LNXKAIO_IOCB_CMD_FDSYNC = 3
+};
+
+/**
+ * The iocb structure of a request which is passed to the kernel.
+ *
+ * We redefined this here because the version in the header lacks padding
+ * for 32bit.
+ *
+ * Every pointer and size member occupies 64 bits.  On 32-bit builds an
+ * explicit 32-bit padding member follows each of them; the padding is keyed
+ * on ARCH_BITS (like LNXKAIOIOEVENT) so that all 32-bit targets get the same
+ * layout, not just x86.
+ */
+typedef struct LNXKAIOIOCB
+{
+    /** Opaque pointer to data which is returned on an I/O event. */
+    void       *pvUser;
+#if ARCH_BITS == 32
+    uint32_t    u32Padding0;
+#endif
+    /** Contains the request number and is set by the kernel. */
+    uint32_t    u32Key;
+    /** Reserved. */
+    uint32_t    u32Reserved0;
+    /** The I/O opcode (LNXKAIO_IOCB_CMD_XXX). */
+    uint16_t    u16IoOpCode;
+    /** Request priority. */
+    int16_t     i16Priority;
+    /** The file descriptor. */
+    uint32_t    uFileDesc;
+    /** The userspace pointer to the buffer containing/receiving the data. */
+    void       *pvBuf;
+#if ARCH_BITS == 32
+    uint32_t    u32Padding1;
+#endif
+    /** How many bytes to transfer. */
+#if ARCH_BITS == 32
+    uint32_t    cbTransfer;
+    uint32_t    u32Padding2;
+#elif ARCH_BITS == 64
+    uint64_t    cbTransfer;
+#else
+# error "Unknown architecture"
+#endif
+    /** At which offset to start the transfer. */
+    int64_t     off;
+    /** Reserved. */
+    uint64_t    u64Reserved1;
+    /** Flags */
+    uint32_t    fFlags;
+    /** Readiness signal file descriptor. */
+    uint32_t    u32ResFd;
+} LNXKAIOIOCB, *PLNXKAIOIOCB;
+
+/**
+ * I/O event structure to notify about completed requests.
+ * Redefined here too because of the padding.
+ *
+ * All four members occupy 64 bits each; on 32-bit builds every member is
+ * followed by an explicit 32-bit padding member.
+ */
+typedef struct LNXKAIOIOEVENT
+{
+    /** The pvUser field from the iocb. */
+    void           *pvUser;
+#if ARCH_BITS == 32
+    uint32_t        u32Padding0;
+#endif
+    /** The LNXKAIOIOCB object this event is for.  Since the iocb is the first
+     * member of RTFILEAIOREQINTERNAL, RTFileAioCtxWait casts this straight
+     * back to the request. */
+    PLNXKAIOIOCB    pIoCB;
+#if ARCH_BITS == 32
+    uint32_t        u32Padding1;
+#endif
+#if ARCH_BITS == 32
+    /** The result code of the operation. */
+    int32_t         rc;
+    uint32_t        u32Padding2;
+    /** Secondary result code. */
+    int32_t         rc2;
+    uint32_t        u32Padding3;
+#elif ARCH_BITS == 64
+    /** The result code of the operation. */
+    int64_t         rc;
+    /** Secondary result code. */
+    int64_t         rc2;
+#else
+# error "Unknown architecture"
+#endif
+} LNXKAIOIOEVENT, *PLNXKAIOIOEVENT;
+
+
+/**
+ * Async I/O completion context state.
+ *
+ * Allocated by RTFileAioCtxCreate and freed by RTFileAioCtxDestroy.
+ */
+typedef struct RTFILEAIOCTXINTERNAL
+{
+ /** Handle to the async I/O context (returned by io_setup). */
+ LNXKAIOCONTEXT AioContext;
+ /** Maximum number of requests this context can handle (the value given
+ * to RTFileAioCtxCreate). */
+ int cRequestsMax;
+ /** Current number of requests active on this context. Incremented on
+ * submit, decremented when requests are reaped or cancelled. */
+ volatile int32_t cRequests;
+ /** The ID of the thread which is currently waiting for requests,
+ * NIL_RTTHREAD if none. */
+ volatile RTTHREAD hThreadWait;
+ /** Flag whether the thread was woken up (set by RTFileAioCtxWakeup,
+ * cleared by RTFileAioCtxWait). */
+ volatile bool fWokenUp;
+ /** Flag whether the thread is currently waiting in the syscall. */
+ volatile bool fWaiting;
+ /** Flags given during creation. */
+ uint32_t fFlags;
+ /** Magic value (RTFILEAIOCTX_MAGIC). */
+ uint32_t u32Magic;
+} RTFILEAIOCTXINTERNAL;
+/** Pointer to an internal context structure. */
+typedef RTFILEAIOCTXINTERNAL *PRTFILEAIOCTXINTERNAL;
+
+/**
+ * Async I/O request state.
+ *
+ * Allocated by RTFileAioReqCreate and freed by RTFileAioReqDestroy.
+ */
+typedef struct RTFILEAIOREQINTERNAL
+{
+ /** The aio control block. This must be the FIRST element in
+ * the structure! RTFileAioCtxSubmit passes the array of request handles
+ * to the kernel as an array of iocb pointers, and RTFileAioCtxWait casts
+ * the iocb pointer of an event back to the request. */
+ LNXKAIOIOCB AioCB;
+ /** Current state the request is in. */
+ RTFILEAIOREQSTATE enmState;
+ /** The I/O context this request is associated with. */
+ LNXKAIOCONTEXT AioContext;
+ /** Return code the request completed with. */
+ int Rc;
+ /** Number of bytes actually transferred. */
+ size_t cbTransfered;
+ /** Completion context we are assigned to (NULL while not submitted). */
+ PRTFILEAIOCTXINTERNAL pCtxInt;
+ /** Magic value (RTFILEAIOREQ_MAGIC). */
+ uint32_t u32Magic;
+} RTFILEAIOREQINTERNAL;
+/** Pointer to an internal request structure. */
+typedef RTFILEAIOREQINTERNAL *PRTFILEAIOREQINTERNAL;
+
+
+/*********************************************************************************************************************************
+* Defined Constants And Macros *
+*********************************************************************************************************************************/
+/** The max number of events to get in one call.
+ * Sizes the on-stack event array in RTFileAioCtxWait and caps the number of
+ * events requested per io_getevents call. */
+#define AIO_MAXIMUM_REQUESTS_PER_CONTEXT 64
+
+
+/**
+ * Sets up a new kernel async I/O context (io_setup).
+ *
+ * @returns VINF_SUCCESS on success, VERR_FILE_AIO_INSUFFICIENT_EVENTS if the
+ *          syscall fails with EAGAIN, otherwise the IPRT translation of errno.
+ * @param   cEvents         Number of events the context must be able to hold.
+ * @param   pAioContext     Where the kernel stores the context handle.
+ */
+DECLINLINE(int) rtFileAsyncIoLinuxCreate(unsigned cEvents, LNXKAIOCONTEXT *pAioContext)
+{
+    int const rcSys = syscall(__NR_io_setup, cEvents, pAioContext);
+    if (RT_UNLIKELY(rcSys == -1))
+        return errno == EAGAIN
+             ? VERR_FILE_AIO_INSUFFICIENT_EVENTS
+             : RTErrConvertFromErrno(errno);
+    return VINF_SUCCESS;
+}
+
+/**
+ * Tears down a kernel async I/O context (io_destroy).
+ *
+ * @returns VINF_SUCCESS, or the IPRT translation of errno on failure.
+ * @param   AioContext      The context handle to destroy.
+ */
+DECLINLINE(int) rtFileAsyncIoLinuxDestroy(LNXKAIOCONTEXT AioContext)
+{
+    int const rcSys = syscall(__NR_io_destroy, AioContext);
+    if (RT_UNLIKELY(rcSys == -1))
+        return RTErrConvertFromErrno(errno);
+    return VINF_SUCCESS;
+}
+
+/**
+ * Hands an array of I/O control blocks to the kernel (io_submit).
+ *
+ * @returns VINF_SUCCESS, or the IPRT translation of errno on failure (in which
+ *          case *pcSubmitted is left untouched).
+ * @param   AioContext      The context to submit to.
+ * @param   cReqs           Number of entries in ppIoCB.
+ * @param   ppIoCB          Array of pointers to the control blocks.
+ * @param   pcSubmitted     Where to return the value reported by the syscall,
+ *                          i.e. how many requests were accepted.
+ */
+DECLINLINE(int) rtFileAsyncIoLinuxSubmit(LNXKAIOCONTEXT AioContext, long cReqs, LNXKAIOIOCB **ppIoCB, int *pcSubmitted)
+{
+    int const cAccepted = syscall(__NR_io_submit, AioContext, cReqs, ppIoCB);
+    if (RT_UNLIKELY(cAccepted == -1))
+        return RTErrConvertFromErrno(errno);
+
+    *pcSubmitted = cAccepted;
+    return VINF_SUCCESS;
+}
+
+/**
+ * Attempts to cancel a submitted I/O request (io_cancel).
+ *
+ * @returns VINF_SUCCESS, or the IPRT translation of errno on failure.
+ * @param   AioContext      The context the request was submitted to.
+ * @param   pIoCB           The control block of the request to cancel.
+ * @param   pIoResult       Event buffer handed to the syscall.
+ */
+DECLINLINE(int) rtFileAsyncIoLinuxCancel(LNXKAIOCONTEXT AioContext, PLNXKAIOIOCB pIoCB, PLNXKAIOIOEVENT pIoResult)
+{
+    int const rcSys = syscall(__NR_io_cancel, AioContext, pIoCB, pIoResult);
+    if (RT_UNLIKELY(rcSys == -1))
+        return RTErrConvertFromErrno(errno);
+    return VINF_SUCCESS;
+}
+
+/**
+ * Waits for I/O completion events (io_getevents).
+ *
+ * @returns Number of events stored in paIoResults (zero or more) on success,
+ *          negative IPRT status code (translated errno) on failure.
+ * @param   AioContext      The context to reap events from.
+ * @param   cReqsMin        Minimum number of events to wait for.
+ * @param   cReqs           Maximum number of events to return (capacity of
+ *                          paIoResults).
+ * @param   paIoResults     Where to store the events.
+ * @param   pTimeout        Timeout, NULL passes no timeout to the syscall.
+ */
+DECLINLINE(int) rtFileAsyncIoLinuxGetEvents(LNXKAIOCONTEXT AioContext, long cReqsMin, long cReqs,
+                                            PLNXKAIOIOEVENT paIoResults, struct timespec *pTimeout)
+{
+    int const cEvents = syscall(__NR_io_getevents, AioContext, cReqsMin, cReqs, paIoResults, pTimeout);
+    if (RT_UNLIKELY(cEvents == -1))
+        return RTErrConvertFromErrno(errno);
+    return cEvents;
+}
+
+/**
+ * Reports the limits of the async I/O API on this host.
+ *
+ * Availability of the kernel interface is probed by creating and destroying
+ * a one-event context.
+ *
+ * @returns VINF_SUCCESS if supported, otherwise the status of the failed
+ *          probe step; VERR_INVALID_POINTER if pAioLimits is invalid.
+ * @param   pAioLimits      Where to store the limits.  Only written on
+ *                          success.
+ */
+RTR3DECL(int) RTFileAioGetLimits(PRTFILEAIOLIMITS pAioLimits)
+{
+    AssertPtrReturn(pAioLimits, VERR_INVALID_POINTER);
+
+    /* Probe: set up a minimal context and tear it down again. */
+    LNXKAIOCONTEXT hProbeCtx = 0;
+    int rc = rtFileAsyncIoLinuxCreate(1, &hProbeCtx);
+    if (RT_SUCCESS(rc))
+        rc = rtFileAsyncIoLinuxDestroy(hProbeCtx);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    /* Supported - the buffer alignment is the only restriction reported. */
+    pAioLimits->cbBufferAlignment   = 512;
+    pAioLimits->cReqsOutstandingMax = RTFILEAIO_UNLIMITED_REQS;
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Allocates a new, zero-initialized request handle in the COMPLETED state.
+ *
+ * @returns VINF_SUCCESS, VERR_INVALID_POINTER or VERR_NO_MEMORY.
+ * @param   phReq       Where to return the new request handle.
+ */
+RTR3DECL(int) RTFileAioReqCreate(PRTFILEAIOREQ phReq)
+{
+    AssertPtrReturn(phReq, VERR_INVALID_POINTER);
+
+    PRTFILEAIOREQINTERNAL pNewReq = (PRTFILEAIOREQINTERNAL)RTMemAllocZ(sizeof(*pNewReq));
+    if (RT_UNLIKELY(!pNewReq))
+        return VERR_NO_MEMORY;
+
+    /* Everything else stays zero courtesy of RTMemAllocZ. */
+    RTFILEAIOREQ_SET_STATE(pNewReq, COMPLETED);
+    pNewReq->u32Magic = RTFILEAIOREQ_MAGIC;
+    pNewReq->pCtxInt  = NULL;
+
+    *phReq = (RTFILEAIOREQ)pNewReq;
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Frees a request handle.
+ *
+ * @returns VINF_SUCCESS (also for NIL_RTFILEAIOREQ), or
+ *          VERR_FILE_AIO_IN_PROGRESS if the request is currently submitted;
+ *          an invalid handle is rejected by RTFILEAIOREQ_VALID_RETURN.
+ * @param   hReq        The request handle, NIL is quietly ignored.
+ */
+RTDECL(int) RTFileAioReqDestroy(RTFILEAIOREQ hReq)
+{
+    /* NIL is fine, anything else must be a valid and idle request. */
+    if (hReq == NIL_RTFILEAIOREQ)
+        return VINF_SUCCESS;
+
+    PRTFILEAIOREQINTERNAL pDoomed = hReq;
+    RTFILEAIOREQ_VALID_RETURN(pDoomed);
+    RTFILEAIOREQ_NOT_STATE_RETURN_RC(pDoomed, SUBMITTED, VERR_FILE_AIO_IN_PROGRESS);
+
+    /* Invalidate the magic so stale handles are caught, then release the memory. */
+    ASMAtomicUoWriteU32(&pDoomed->u32Magic, ~RTFILEAIOREQ_MAGIC);
+    RTMemFree(pDoomed);
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Common worker filling in the control block of a request and moving the
+ * request to the PREPARED state.
+ *
+ * @returns VINF_SUCCESS, or VERR_FILE_AIO_IN_PROGRESS if the request is
+ *          currently submitted; an invalid handle is rejected by
+ *          RTFILEAIOREQ_VALID_RETURN.
+ * @param   hReq                The request handle.
+ * @param   hFile               The file to operate on.
+ * @param   uTransferDirection  The opcode (LNXKAIO_IOCB_CMD_XXX).
+ * @param   off                 File offset of the transfer.
+ * @param   pvBuf               The data buffer.
+ * @param   cbTransfer          Number of bytes to transfer.
+ * @param   pvUser              Opaque user data stored with the request.
+ */
+DECLINLINE(int) rtFileAioReqPrepareTransfer(RTFILEAIOREQ hReq, RTFILE hFile,
+                                            uint16_t uTransferDirection,
+                                            RTFOFF off, void *pvBuf, size_t cbTransfer,
+                                            void *pvUser)
+{
+    /* Input validation; buffer, offset and size only matter for real transfers. */
+    PRTFILEAIOREQINTERNAL pReq = hReq;
+    RTFILEAIOREQ_VALID_RETURN(pReq);
+    RTFILEAIOREQ_NOT_STATE_RETURN_RC(pReq, SUBMITTED, VERR_FILE_AIO_IN_PROGRESS);
+    Assert(hFile != NIL_RTFILE);
+    if (uTransferDirection != LNXKAIO_IOCB_CMD_FSYNC)
+    {
+        AssertPtr(pvBuf);
+        Assert(off >= 0);
+        Assert(cbTransfer > 0);
+    }
+
+    /* Fill in the control block. */
+    pReq->AioCB.pvUser      = pvUser;
+    pReq->AioCB.u16IoOpCode = uTransferDirection;
+    pReq->AioCB.uFileDesc   = RTFileToNative(hFile);
+    pReq->AioCB.pvBuf       = pvBuf;
+    pReq->AioCB.cbTransfer  = cbTransfer;
+    pReq->AioCB.off         = off;
+
+    /* Not associated with any context until it is submitted. */
+    pReq->pCtxInt = NULL;
+    RTFILEAIOREQ_SET_STATE(pReq, PREPARED);
+
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Prepares a request for reading cbRead bytes at offset off into pvBuf.
+ *
+ * @returns Status of rtFileAioReqPrepareTransfer.
+ */
+RTDECL(int) RTFileAioReqPrepareRead(RTFILEAIOREQ hReq, RTFILE hFile, RTFOFF off,
+                                    void *pvBuf, size_t cbRead, void *pvUser)
+{
+    uint16_t const uOpCode = LNXKAIO_IOCB_CMD_READ;
+    return rtFileAioReqPrepareTransfer(hReq, hFile, uOpCode, off, pvBuf, cbRead, pvUser);
+}
+
+
+/**
+ * Prepares a request for writing cbWrite bytes from pvBuf at offset off.
+ *
+ * @returns Status of rtFileAioReqPrepareTransfer.
+ */
+RTDECL(int) RTFileAioReqPrepareWrite(RTFILEAIOREQ hReq, RTFILE hFile, RTFOFF off,
+                                     void const *pvBuf, size_t cbWrite, void *pvUser)
+{
+    /* The shared worker takes a non-const buffer pointer, hence the cast. */
+    uint16_t const uOpCode = LNXKAIO_IOCB_CMD_WRITE;
+    return rtFileAioReqPrepareTransfer(hReq, hFile, uOpCode, off, (void *)pvBuf, cbWrite, pvUser);
+}
+
+
+/**
+ * Prepares a flush (fsync) request for the given file.
+ *
+ * @returns Status of rtFileAioReqPrepareTransfer, VERR_INVALID_HANDLE if hFile
+ *          is NIL, or VERR_FILE_AIO_IN_PROGRESS if the request is submitted.
+ */
+RTDECL(int) RTFileAioReqPrepareFlush(RTFILEAIOREQ hReq, RTFILE hFile, void *pvUser)
+{
+    PRTFILEAIOREQINTERNAL pReq = hReq;
+    RTFILEAIOREQ_VALID_RETURN(pReq);
+    AssertReturn(hFile != NIL_RTFILE, VERR_INVALID_HANDLE);
+    RTFILEAIOREQ_NOT_STATE_RETURN_RC(pReq, SUBMITTED, VERR_FILE_AIO_IN_PROGRESS);
+
+    /* A flush carries neither offset, buffer nor size. */
+    return rtFileAioReqPrepareTransfer(pReq, hFile, LNXKAIO_IOCB_CMD_FSYNC,
+                                       0 /*off*/, NULL /*pvBuf*/, 0 /*cbTransfer*/, pvUser);
+}
+
+
+/**
+ * Returns the user data pointer stored when the request was prepared.
+ *
+ * @returns The pvUser value, NULL if the handle is invalid.
+ * @param   hReq        The request handle.
+ */
+RTDECL(void *) RTFileAioReqGetUser(RTFILEAIOREQ hReq)
+{
+    PRTFILEAIOREQINTERNAL pReq = hReq;
+    RTFILEAIOREQ_VALID_RETURN_RC(pReq, NULL);
+
+    void *pvUser = pReq->AioCB.pvUser;
+    return pvUser;
+}
+
+
+/**
+ * Cancels a submitted request.
+ *
+ * @returns VINF_SUCCESS if cancelled (the request completes with
+ *          VERR_FILE_AIO_CANCELED), VERR_FILE_AIO_NOT_SUBMITTED if it is not
+ *          in the SUBMITTED state, VERR_FILE_AIO_IN_PROGRESS if the kernel
+ *          reported VERR_TRY_AGAIN, otherwise the status from the syscall.
+ * @param   hReq        The request handle.
+ */
+RTDECL(int) RTFileAioReqCancel(RTFILEAIOREQ hReq)
+{
+    PRTFILEAIOREQINTERNAL pReq = hReq;
+    RTFILEAIOREQ_VALID_RETURN(pReq);
+    RTFILEAIOREQ_STATE_RETURN_RC(pReq, SUBMITTED, VERR_FILE_AIO_NOT_SUBMITTED);
+
+    LNXKAIOIOEVENT CancelEvent;
+    int rc = rtFileAsyncIoLinuxCancel(pReq->AioContext, &pReq->AioCB, &CancelEvent);
+    if (RT_FAILURE(rc))
+        return rc == VERR_TRY_AGAIN ? VERR_FILE_AIO_IN_PROGRESS : rc;
+
+    /*
+     * The request will never show up at the completion port now, so take it
+     * off the books of the context ourselves.
+     */
+    AssertMsg(RT_VALID_PTR(pReq->pCtxInt), ("Invalid state. Request was canceled but wasn't submitted\n"));
+
+    ASMAtomicDecS32(&pReq->pCtxInt->cRequests);
+    pReq->Rc = VERR_FILE_AIO_CANCELED;
+    RTFILEAIOREQ_SET_STATE(pReq, COMPLETED);
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Returns the completion status of a request.
+ *
+ * @returns The status the request completed with,
+ *          VERR_FILE_AIO_IN_PROGRESS if it is still submitted, or
+ *          VERR_FILE_AIO_NOT_SUBMITTED if it is only prepared.
+ * @param   hReq            The request handle.
+ * @param   pcbTransfered   Where to return the number of bytes transferred.
+ *                          Optional; only written if the request succeeded.
+ */
+RTDECL(int) RTFileAioReqGetRC(RTFILEAIOREQ hReq, size_t *pcbTransfered)
+{
+    PRTFILEAIOREQINTERNAL pReq = hReq;
+    RTFILEAIOREQ_VALID_RETURN(pReq);
+    AssertPtrNull(pcbTransfered);
+    RTFILEAIOREQ_NOT_STATE_RETURN_RC(pReq, SUBMITTED, VERR_FILE_AIO_IN_PROGRESS);
+    RTFILEAIOREQ_NOT_STATE_RETURN_RC(pReq, PREPARED, VERR_FILE_AIO_NOT_SUBMITTED);
+
+    if (pcbTransfered)
+    {
+        if (RT_SUCCESS(pReq->Rc))
+            *pcbTransfered = pReq->cbTransfered;
+    }
+
+    return pReq->Rc;
+}
+
+
+/**
+ * Creates an async I/O completion context.
+ *
+ * @returns VINF_SUCCESS, VERR_INVALID_POINTER, VERR_INVALID_PARAMETER (bad
+ *          flags), VERR_OUT_OF_RANGE (unlimited request count requested),
+ *          VERR_NO_MEMORY, or the status of the kernel context creation.
+ * @param   phAioCtx        Where to return the context handle.  Only written
+ *                          on success.
+ * @param   cAioReqsMax     Maximum number of outstanding requests; must not
+ *                          be RTFILEAIO_UNLIMITED_REQS.
+ * @param   fFlags          RTFILEAIOCTX_FLAGS_XXX.
+ */
+RTDECL(int) RTFileAioCtxCreate(PRTFILEAIOCTX phAioCtx, uint32_t cAioReqsMax,
+                               uint32_t fFlags)
+{
+    AssertPtrReturn(phAioCtx, VERR_INVALID_POINTER);
+    AssertReturn(!(fFlags & ~RTFILEAIOCTX_FLAGS_VALID_MASK), VERR_INVALID_PARAMETER);
+
+    /* The kernel interface needs a maximum. */
+    if (cAioReqsMax == RTFILEAIO_UNLIMITED_REQS)
+        return VERR_OUT_OF_RANGE;
+
+    PRTFILEAIOCTXINTERNAL pNewCtx = (PRTFILEAIOCTXINTERNAL)RTMemAllocZ(sizeof(RTFILEAIOCTXINTERNAL));
+    if (RT_UNLIKELY(!pNewCtx))
+        return VERR_NO_MEMORY;
+
+    /* Create the kernel side; bail out and free our state if that fails. */
+    int rc = rtFileAsyncIoLinuxCreate(cAioReqsMax, &pNewCtx->AioContext);
+    if (RT_FAILURE(rc))
+    {
+        RTMemFree(pNewCtx);
+        return rc;
+    }
+
+    pNewCtx->fWokenUp     = false;
+    pNewCtx->fWaiting     = false;
+    pNewCtx->hThreadWait  = NIL_RTTHREAD;
+    pNewCtx->cRequestsMax = cAioReqsMax;
+    pNewCtx->fFlags       = fFlags;
+    pNewCtx->u32Magic     = RTFILEAIOCTX_MAGIC;
+    *phAioCtx = (RTFILEAIOCTX)pNewCtx;
+
+    return rc;
+}
+
+
+/**
+ * Destroys an async I/O completion context.
+ *
+ * @returns VINF_SUCCESS (also for NIL_RTFILEAIOCTX), VERR_FILE_AIO_BUSY if
+ *          requests are still outstanding, or the status of the kernel
+ *          context destruction (the handle stays valid in that case).
+ * @param   hAioCtx     The context handle, NIL is quietly ignored.
+ */
+RTDECL(int) RTFileAioCtxDestroy(RTFILEAIOCTX hAioCtx)
+{
+    if (hAioCtx == NIL_RTFILEAIOCTX)
+        return VINF_SUCCESS;
+
+    PRTFILEAIOCTXINTERNAL pDoomed = hAioCtx;
+    RTFILEAIOCTX_VALID_RETURN(pDoomed);
+
+    /* A context with outstanding requests cannot be destroyed. */
+    if (RT_UNLIKELY(pDoomed->cRequests))
+        return VERR_FILE_AIO_BUSY;
+
+    /* Kernel side first; only once that is gone do we kill our own state. */
+    int rc = rtFileAsyncIoLinuxDestroy(pDoomed->AioContext);
+    if (RT_SUCCESS(rc))
+    {
+        ASMAtomicUoWriteU32(&pDoomed->u32Magic, RTFILEAIOCTX_MAGIC_DEAD);
+        RTMemFree(pDoomed);
+        rc = VINF_SUCCESS;
+    }
+    return rc;
+}
+
+
+/**
+ * Returns the maximum number of requests a context can handle.
+ *
+ * @returns The limit given at creation, RTFILEAIO_UNLIMITED_REQS for
+ *          NIL_RTFILEAIOCTX (meaning "global"), 0 for an invalid handle.
+ * @param   hAioCtx     The context handle or NIL_RTFILEAIOCTX.
+ */
+RTDECL(uint32_t) RTFileAioCtxGetMaxReqCount(RTFILEAIOCTX hAioCtx)
+{
+    if (hAioCtx != NIL_RTFILEAIOCTX)
+    {
+        /* Return 0 if the handle is invalid, it's better than garbage I think... */
+        PRTFILEAIOCTXINTERNAL pCtx = hAioCtx;
+        RTFILEAIOCTX_VALID_RETURN_RC(pCtx, 0);
+
+        return pCtx->cRequestsMax;
+    }
+
+    /* Nil means global here. */
+    /** @todo r=bird: I'm a bit puzzled by this return value since it
+     *        is completely useless in RTFileAioCtxCreate. */
+    return RTFILEAIO_UNLIMITED_REQS;
+}
+
+/**
+ * Associates a file with a completion context - a no-op in this
+ * implementation.
+ *
+ * @returns VINF_SUCCESS, always.
+ * @param   hAioCtx     Unused.
+ * @param   hFile       Unused.
+ */
+RTDECL(int) RTFileAioCtxAssociateWithFile(RTFILEAIOCTX hAioCtx, RTFILE hFile)
+{
+    /* Both parameters are ignored; there is nothing to set up here. */
+    NOREF(hFile);
+    NOREF(hAioCtx);
+    return VINF_SUCCESS;
+}
+
+/**
+ * Submits an array of prepared requests to a completion context.
+ *
+ * All handles are validated and marked SUBMITTED first (walking the array
+ * from the end), then handed to the kernel in as many io_submit calls as it
+ * takes.
+ *
+ * @returns VINF_SUCCESS if everything was submitted.
+ * @retval  VERR_INVALID_HANDLE if one of the request handles is invalid; the
+ *          requests already marked are reverted to PREPARED and nothing is
+ *          submitted.
+ * @retval  VERR_FILE_AIO_INSUFFICIENT_RESSOURCES if the kernel reported
+ *          VERR_TRY_AGAIN.
+ * @retval  other  Status of the failed submit call; the first not yet
+ *          submitted request is completed with that status, the remaining
+ *          ones are reverted to PREPARED.
+ *
+ * @param   hAioCtx     The completion context handle.
+ * @param   pahReqs     Array of request handles.
+ * @param   cReqs       Number of entries in pahReqs, must not be zero.
+ */
+RTDECL(int) RTFileAioCtxSubmit(RTFILEAIOCTX hAioCtx, PRTFILEAIOREQ pahReqs, size_t cReqs)
+{
+    int rc = VINF_SUCCESS;
+
+    /*
+     * Parameter validation.
+     */
+    PRTFILEAIOCTXINTERNAL pCtxInt = hAioCtx;
+    RTFILEAIOCTX_VALID_RETURN(pCtxInt);
+    AssertReturn(cReqs > 0,  VERR_INVALID_PARAMETER);
+    AssertPtrReturn(pahReqs, VERR_INVALID_POINTER);
+    size_t i = cReqs;
+    PRTFILEAIOREQINTERNAL pReqInt = NULL;
+
+    /*
+     * Validate requests and associate with the context.
+     */
+    while (i-- > 0)
+    {
+        pReqInt = pahReqs[i];
+        if (RTFILEAIOREQ_IS_NOT_VALID(pReqInt))
+        {
+            /*
+             * Undo everything and stop submitting.  Only the entries above
+             * index i have been touched so far; entry i itself is the invalid
+             * handle and must not be dereferenced.
+             */
+            for (size_t iUndo = i + 1; iUndo < cReqs; iUndo++)
+            {
+                pReqInt = pahReqs[iUndo];
+                RTFILEAIOREQ_SET_STATE(pReqInt, PREPARED);
+                pReqInt->pCtxInt = NULL;
+            }
+            return VERR_INVALID_HANDLE;
+        }
+
+        pReqInt->AioContext = pCtxInt->AioContext;
+        pReqInt->pCtxInt    = pCtxInt;
+        RTFILEAIOREQ_SET_STATE(pReqInt, SUBMITTED);
+    }
+
+    do
+    {
+        /*
+         * We cast pahReqs to the Linux iocb structure to avoid copying the requests
+         * into a temporary array. This is possible because the iocb structure is
+         * the first element in the request structure (see RTFILEAIOREQINTERNAL).
+         */
+        int cReqsSubmitted = 0;
+        rc = rtFileAsyncIoLinuxSubmit(pCtxInt->AioContext, cReqs,
+                                      (PLNXKAIOIOCB *)pahReqs,
+                                      &cReqsSubmitted);
+        if (RT_FAILURE(rc))
+        {
+            /*
+             * We encountered an error.
+             * This means that the first IoCB
+             * is not correctly initialized
+             * (invalid buffer alignment or bad file descriptor).
+             * Revert every request into the prepared state except
+             * the first one which will switch to completed.
+             * Another reason could be insufficient resources.
+             *
+             * Note that pahReqs/cReqs only cover the requests which have not
+             * been accepted by an earlier iteration.
+             */
+            i = cReqs;
+            while (i-- > 0)
+            {
+                /* Already validated. */
+                pReqInt = pahReqs[i];
+                pReqInt->pCtxInt    = NULL;
+                pReqInt->AioContext = 0;
+                RTFILEAIOREQ_SET_STATE(pReqInt, PREPARED);
+            }
+
+            if (rc == VERR_TRY_AGAIN)
+                return VERR_FILE_AIO_INSUFFICIENT_RESSOURCES;
+            else
+            {
+                /* The first request failed. */
+                pReqInt = pahReqs[0];
+                RTFILEAIOREQ_SET_STATE(pReqInt, COMPLETED);
+                pReqInt->Rc           = rc;
+                pReqInt->cbTransfered = 0;
+                return rc;
+            }
+        }
+
+        /* Advance. */
+        cReqs   -= cReqsSubmitted;
+        pahReqs += cReqsSubmitted;
+        ASMAtomicAddS32(&pCtxInt->cRequests, cReqsSubmitted);
+
+    } while (cReqs);
+
+    return rc;
+}
+
+
+/**
+ * Waits for submitted requests to complete and returns their handles.
+ *
+ * The kernel is asked for at most AIO_MAXIMUM_REQUESTS_PER_CONTEXT events per
+ * call, so the function loops until cMinReqs requests have been collected,
+ * the timeout expires, an error occurs or the waiter is woken up via
+ * RTFileAioCtxWakeup.
+ *
+ * @returns VINF_SUCCESS when at least cMinReqs requests completed.
+ * @retval  VERR_FILE_AIO_NO_REQUEST if nothing is outstanding and the context
+ *          was not created with RTFILEAIOCTX_FLAGS_WAIT_WITHOUT_PENDING_REQUESTS.
+ * @retval  VERR_TIMEOUT if cMillies elapsed before enough requests completed.
+ * @retval  VERR_INTERRUPTED if woken up by RTFileAioCtxWakeup.
+ * @retval  VERR_OUT_OF_RANGE if cMinReqs exceeds cReqs.
+ *
+ * @param   hAioCtx     The completion context handle.
+ * @param   cMinReqs    Minimum number of requests to wait for; 0 is treated
+ *                      as 1.
+ * @param   cMillies    Timeout in milliseconds or RT_INDEFINITE_WAIT.
+ * @param   pahReqs     Where to store the handles of completed requests.
+ * @param   cReqs       Capacity of pahReqs, must not be zero.
+ * @param   pcReqs      Where to return the number of handles stored.  Always
+ *                      set, also on failure.
+ */
+RTDECL(int) RTFileAioCtxWait(RTFILEAIOCTX hAioCtx, size_t cMinReqs, RTMSINTERVAL cMillies,
+                             PRTFILEAIOREQ pahReqs, size_t cReqs, uint32_t *pcReqs)
+{
+    /*
+     * Validate the parameters, making sure to always set pcReqs.
+     */
+    AssertPtrReturn(pcReqs, VERR_INVALID_POINTER);
+    *pcReqs = 0; /* always set */
+    PRTFILEAIOCTXINTERNAL pCtxInt = hAioCtx;
+    RTFILEAIOCTX_VALID_RETURN(pCtxInt);
+    AssertPtrReturn(pahReqs, VERR_INVALID_POINTER);
+    AssertReturn(cReqs != 0, VERR_INVALID_PARAMETER);
+    AssertReturn(cReqs >= cMinReqs, VERR_OUT_OF_RANGE);
+
+    /*
+     * Can't wait if there are not requests around.
+     */
+    if (    RT_UNLIKELY(ASMAtomicUoReadS32(&pCtxInt->cRequests) == 0)
+        &&  !(pCtxInt->fFlags & RTFILEAIOCTX_FLAGS_WAIT_WITHOUT_PENDING_REQUESTS))
+        return VERR_FILE_AIO_NO_REQUEST;
+
+    /*
+     * Convert the timeout if specified.
+     */
+    struct timespec    *pTimeout = NULL;
+    struct timespec     Timeout = {0,0};
+    uint64_t            StartNanoTS = 0;
+    if (cMillies != RT_INDEFINITE_WAIT)
+    {
+        Timeout.tv_sec  = cMillies / 1000;
+        Timeout.tv_nsec = cMillies % 1000 * 1000000;
+        pTimeout = &Timeout;
+        StartNanoTS = RTTimeNanoTS();
+    }
+
+    /* Wait for at least one. */
+    if (!cMinReqs)
+        cMinReqs = 1;
+
+    /* For the wakeup call. */
+    Assert(pCtxInt->hThreadWait == NIL_RTTHREAD);
+    ASMAtomicWriteHandle(&pCtxInt->hThreadWait, RTThreadSelf());
+
+    /*
+     * Loop until we're woken up, hit an error (incl timeout), or
+     * have collected the desired number of requests.
+     */
+    int rc = VINF_SUCCESS;
+    int cRequestsCompleted = 0;
+    while (!pCtxInt->fWokenUp)
+    {
+        LNXKAIOIOEVENT aPortEvents[AIO_MAXIMUM_REQUESTS_PER_CONTEXT];
+        int cRequestsToWait = RT_MIN(cReqs, AIO_MAXIMUM_REQUESTS_PER_CONTEXT);
+
+        /*
+         * The minimum passed to the kernel must not exceed the maximum of
+         * this call or io_getevents rejects the request.  Larger minimums
+         * are satisfied over several iterations of this loop.
+         */
+        long cMinThisCall = (long)RT_MIN(cMinReqs, (size_t)cRequestsToWait);
+
+        ASMAtomicXchgBool(&pCtxInt->fWaiting, true);
+        rc = rtFileAsyncIoLinuxGetEvents(pCtxInt->AioContext, cMinThisCall, cRequestsToWait, &aPortEvents[0], pTimeout);
+        ASMAtomicXchgBool(&pCtxInt->fWaiting, false);
+        if (RT_FAILURE(rc))
+            break;
+        uint32_t const cDone = rc;
+        rc = VINF_SUCCESS;
+
+        /*
+         * Process received events / requests.
+         */
+        for (uint32_t i = 0; i < cDone; i++)
+        {
+            /*
+             * The iocb is the first element in our request structure.
+             * So we can safely cast it directly to the handle (see above)
+             */
+            PRTFILEAIOREQINTERNAL pReqInt = (PRTFILEAIOREQINTERNAL)aPortEvents[i].pIoCB;
+            AssertPtr(pReqInt);
+            Assert(pReqInt->u32Magic == RTFILEAIOREQ_MAGIC);
+
+            /** @todo aeichner: The rc field contains the result code
+             *  like you can find in errno for the normal read/write ops.
+             *  But there is a second field called rc2. I don't know the
+             *  purpose for it yet.
+             */
+            if (RT_UNLIKELY(aPortEvents[i].rc < 0))
+                pReqInt->Rc = RTErrConvertFromErrno(-aPortEvents[i].rc); /* Convert to positive value. */
+            else
+            {
+                pReqInt->Rc = VINF_SUCCESS;
+                pReqInt->cbTransfered = aPortEvents[i].rc;
+            }
+
+            /* Mark the request as finished. */
+            RTFILEAIOREQ_SET_STATE(pReqInt, COMPLETED);
+
+            pahReqs[cRequestsCompleted++] = (RTFILEAIOREQ)pReqInt;
+        }
+
+        /*
+         * Done Yet? If not advance and try again.
+         */
+        if (cDone >= cMinReqs)
+            break;
+        cMinReqs -= cDone;
+        cReqs    -= cDone;
+
+        if (cMillies != RT_INDEFINITE_WAIT)
+        {
+            /* The API doesn't return ETIMEDOUT, so we have to fix that ourselves. */
+            uint64_t NanoTS = RTTimeNanoTS();
+            uint64_t cMilliesElapsed = (NanoTS - StartNanoTS) / 1000000;
+            if (cMilliesElapsed >= cMillies)
+            {
+                rc = VERR_TIMEOUT;
+                break;
+            }
+
+            /* The syscall supposedly updates it, but we're paranoid. :-) */
+            Timeout.tv_sec  = (cMillies - (RTMSINTERVAL)cMilliesElapsed) / 1000;
+            Timeout.tv_nsec = (cMillies - (RTMSINTERVAL)cMilliesElapsed) % 1000 * 1000000;
+        }
+    }
+
+    /*
+     * Update the context state and set the return value.
+     */
+    *pcReqs = cRequestsCompleted;
+    ASMAtomicSubS32(&pCtxInt->cRequests, cRequestsCompleted);
+    Assert(pCtxInt->hThreadWait == RTThreadSelf());
+    ASMAtomicWriteHandle(&pCtxInt->hThreadWait, NIL_RTTHREAD);
+
+    /*
+     * Clear the wakeup flag and set rc.
+     *
+     * NOTE(review): when the loop was left with a failure status the flag
+     * stays set, so the next call skips the loop and returns
+     * VERR_INTERRUPTED - confirm that this is intended.
+     */
+    if (    pCtxInt->fWokenUp
+        &&  RT_SUCCESS(rc))
+    {
+        ASMAtomicXchgBool(&pCtxInt->fWokenUp, false);
+        rc = VERR_INTERRUPTED;
+    }
+
+    return rc;
+}
+
+
+/**
+ * Wakes up the thread waiting in RTFileAioCtxWait() on the given context.
+ *
+ * Sets the fWokenUp flag and, if this call is the one that set it and a
+ * thread is currently inside the wait syscall, pokes that thread.
+ *
+ * @returns VINF_SUCCESS; an invalid handle is rejected by
+ *          RTFILEAIOCTX_VALID_RETURN.
+ * @param   hAioCtx     The completion context handle.
+ */
+RTDECL(int) RTFileAioCtxWakeup(RTFILEAIOCTX hAioCtx)
+{
+ PRTFILEAIOCTXINTERNAL pCtxInt = hAioCtx;
+ RTFILEAIOCTX_VALID_RETURN(pCtxInt);
+
+ /** @todo r=bird: Define the protocol for how to resume work after calling
+ * this function. */
+
+ /* Set the flag; the previous value tells us whether a wakeup is already pending. */
+ bool fWokenUp = ASMAtomicXchgBool(&pCtxInt->fWokenUp, true);
+
+ /*
+ * Read the thread handle before the status flag.
+ * If we read the handle after the flag we might
+ * end up with an invalid handle because the thread
+ * waiting in RTFileAioCtxWait() might get scheduled
+ * before we read the flag and returns.
+ * We can ensure that the handle is valid if fWaiting is true
+ * when reading the handle before the status flag.
+ */
+ RTTHREAD hThread;
+ ASMAtomicReadHandle(&pCtxInt->hThreadWait, &hThread);
+ bool fWaiting = ASMAtomicReadBool(&pCtxInt->fWaiting);
+ if ( !fWokenUp
+ && fWaiting)
+ {
+ /*
+ * If a thread waits the handle must be valid.
+ * It is possible that the thread returns from
+ * rtFileAsyncIoLinuxGetEvents() before the signal
+ * is sent.
+ * This is no problem because we already set fWokenUp
+ * to true which will let the thread return VERR_INTERRUPTED
+ * and the next call to RTFileAioCtxWait() will not
+ * return VERR_INTERRUPTED because signals are not saved
+ * and will simply vanish if the destination thread can't
+ * receive it.
+ */
+ Assert(hThread != NIL_RTTHREAD);
+ RTThreadPoke(hThread);
+ }
+
+ return VINF_SUCCESS;
+}
+
diff --git a/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp b/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp
new file mode 100644
index 00000000..f6719664
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp
@@ -0,0 +1,940 @@
+/* $Id: ioqueue-iouringfile-provider.cpp $ */
+/** @file
+ * IPRT - I/O queue, Linux io_uring interface I/O file provider.
+ */
+
+/*
+ * Copyright (C) 2019-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+/** @page pg_rtioqueue_linux RTIoQueue - Linux io_uring implementation notes
+ * @internal
+ *
+ * The io_uring interface is the most recent interface added to the Linux kernel
+ * to deliver fast and efficient I/O. It was first added with kernel version 5.1 and is
+ * thus not available on most systems as of writing this backend (July 2019).
+ * It supersedes the old async I/O interface and does away with some of its restrictions, like
+ * having to disable caching for the file.
+ * The interface is centered around a submission and completion queue to queue multiple new
+ * requests for the kernel to process and get notified about completions to reduce the amount
+ * of context switches to an absolute minimum. It also offers advanced features like
+ * registering a fixed set of memory buffers for I/O upfront to reduce the processing overhead
+ * even more.
+ *
+ * The first implementation will only make use of the basic features and more advanced features
+ * will be added later.
+ * The adept developer probably noticed that the public IPRT I/O queue API resembles the io_uring
+ * interface in many aspects. This is not by accident but to reduce our own overhead as much as possible
+ * while still keeping a consistent platform independent API which allows efficient implementations on
+ * other hosts when they come up.
+ *
+ * The public kernel io_uring interface is completely defined in this file to avoid dragging in additional
+ * dependencies and to avoid compile problems on older hosts missing the interface, just like it is done
+ * for the Linux RTFileAio* API. The necessary interface definitions and descriptions were retrieved from:
+ * * http://kernel.dk/io_uring.pdf
+ * * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/io_uring.h
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_IOQUEUE
+#include <iprt/ioqueue.h>
+
+#include <iprt/assertcompile.h>
+#include <iprt/asm.h>
+#include <iprt/errcore.h>
+#include <iprt/file.h>
+#include <iprt/log.h>
+#include <iprt/mem.h>
+#include <iprt/string.h>
+
+#include <errno.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/uio.h>
+
+#include "internal/ioqueue.h"
+
+
+/*********************************************************************************************************************************
+* Defined Constants And Macros *
+*********************************************************************************************************************************/
+
+/** The syscall number of io_uring_setup(). */
+#define LNX_IOURING_SYSCALL_SETUP 425
+/** The syscall number of io_uring_enter(). */
+#define LNX_IOURING_SYSCALL_ENTER 426
+/** The syscall number of io_uring_register(). */
+#define LNX_IOURING_SYSCALL_REGISTER 427
+/** eventfd2() syscall not associated with io_uring but used for kicking waiters.
+ * The number differs between architectures (290 is the x86-64 value), so the
+ * definition from <sys/syscall.h> is preferred whenever it is available. */
+#ifdef __NR_eventfd2
+# define LNX_SYSCALL_EVENTFD2 __NR_eventfd2
+#else
+# define LNX_SYSCALL_EVENTFD2 290
+#endif
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+
+/**
+ * Linux io_uring completion event.
+ *
+ * Entries of this type are read from the completion ring shared with the kernel.
+ */
+typedef struct LNXIOURINGCQE
+{
+ /** Opaque user data associated with the completed request. */
+ uint64_t u64User;
+ /** The status code of the request: the completion queue check in this file
+ * treats values >= 0 as the number of bytes transferred and negative values
+ * as a negated errno. */
+ int32_t rcLnx;
+ /** Some flags which are not used as of now. */
+ uint32_t fFlags;
+} LNXIOURINGCQE;
+AssertCompileSize(LNXIOURINGCQE, 16);
+/** Pointer to a Linux io_uring completion event. */
+typedef LNXIOURINGCQE *PLNXIOURINGCQE;
+/** Pointer to a constant Linux io_uring completion event. */
+typedef const LNXIOURINGCQE *PCLNXIOURINGCQE;
+
+
+/**
+ * Linux io_uring submission queue entry.
+ *
+ * The layout must be exactly 64 bytes (enforced by the compile time assertion below).
+ */
+typedef struct LNXIOURINGSQE
+{
+ /** The opcode for the request, see LNX_IOURING_OPC_*. */
+ uint8_t u8Opc;
+ /** Common flags for the request, see LNX_IOURING_SQE_F_*. */
+ uint8_t u8Flags;
+ /** Assigned I/O priority. */
+ uint16_t u16IoPrio;
+ /** The file descriptor the request is for. */
+ int32_t i32Fd;
+ /** The start offset into the file for the request. */
+ uint64_t u64OffStart;
+ /** Buffer pointer or pointer to the I/O vector array, depending on the opcode. */
+ uint64_t u64AddrBufIoVec;
+ /** Size of the buffer in bytes or number of I/O vectors. */
+ uint32_t u32BufIoVecSz;
+ /** Opcode dependent data. */
+ union
+ {
+ /** Flags for read/write requests. */
+ uint32_t u32KrnlRwFlags;
+ /** Flags for fsync() like requests. */
+ uint32_t u32FsyncFlags;
+ /** Flags for poll() like requests. */
+ uint16_t u16PollFlags;
+ /** Flags for sync_file_range() like requests. */
+ uint32_t u32SyncFileRangeFlags;
+ /** Flags for requests requiring a msg structure. */
+ uint32_t u32MsgFlags;
+ } uOpc;
+ /** Opaque user data associated with the request and returned during completion. */
+ uint64_t u64User;
+ /** Request type dependent data. */
+ union
+ {
+ /** Fixed buffer index if indicated by the request flags. */
+ uint16_t u16FixedBufIdx;
+ /** Padding to align the structure to 64 bytes. */
+ uint64_t au64Padding[3];
+ } uReq;
+} LNXIOURINGSQE;
+AssertCompileSize(LNXIOURINGSQE, 64);
+/** Pointer to a Linux io_uring submission queue entry. */
+typedef LNXIOURINGSQE *PLNXIOURINGSQE;
+/** Pointer to a constant Linux io_uring submission queue entry. */
+typedef const LNXIOURINGSQE *PCLNXIOURINGSQE;
+
+
+/**
+ * Linux io_uring SQ ring offsets, filled in by io_uring_setup().
+ *
+ * @note Every member is a byte offset into the mmap()ed SQ ring region where the
+ *       respective value lives, not the value itself (see the queue init
+ *       callback which adds them to the mapping base).
+ */
+typedef struct LNXIOURINGSQ
+{
+ /** Offset of the head counter, which the kernel advances as it consumes requests. */
+ uint32_t u32OffHead;
+ /** Offset of the tail counter, which the submitter advances to publish new requests. */
+ uint32_t u32OffTail;
+ /** Offset of the mask to apply to the head and tail counters to retrieve the index. */
+ uint32_t u32OffRingMask;
+ /** Offset of the number of entries in the SQ ring. */
+ uint32_t u32OffRingEntries;
+ /** Offset of the flags set asynchronously by the kernel. */
+ uint32_t u32OffFlags;
+ /** Offset of the counter of dropped requests. */
+ uint32_t u32OffDroppedReqs;
+ /** Offset where to find the array of SQ entry indices. */
+ uint32_t u32OffArray;
+ /** Reserved. */
+ uint32_t u32Rsvd0;
+ /** Reserved. */
+ uint64_t u64Rsvd1;
+} LNXIOURINGSQ;
+AssertCompileSize(LNXIOURINGSQ, 40);
+/** Pointer to a Linux io_uring SQ ring header. */
+typedef LNXIOURINGSQ *PLNXIOURINGSQ;
+/** Pointer to a constant Linux io_uring SQ ring header. */
+typedef const LNXIOURINGSQ *PCLNXIOURINGSQ;
+
+
+/**
+ * Linux io_uring CQ ring offsets, filled in by io_uring_setup().
+ *
+ * @note Every member is a byte offset into the mmap()ed CQ ring region where the
+ *       respective value lives, not the value itself (see the queue init
+ *       callback which adds them to the mapping base).
+ */
+typedef struct LNXIOURINGCQ
+{
+ /** Offset of the head counter, which the consumer advances after reading completion events. */
+ uint32_t u32OffHead;
+ /** Offset of the tail counter, which the kernel advances when completion events are posted. */
+ uint32_t u32OffTail;
+ /** Offset of the mask to apply to the head and tail counters to retrieve the index. */
+ uint32_t u32OffRingMask;
+ /** Offset of the number of entries in the CQ ring. */
+ uint32_t u32OffRingEntries;
+ /** Offset of the counter of CQ overflows that happened. */
+ uint32_t u32OffOverflowCnt;
+ /** Offset of the array of completion queue entries (LNXIOURINGCQE). */
+ uint32_t u32OffCqes;
+ /** Reserved. */
+ uint64_t au64Rsvd0[2];
+} LNXIOURINGCQ;
+AssertCompileSize(LNXIOURINGCQ, 40);
+/** Pointer to a Linux io_uring CQ ring header. */
+typedef LNXIOURINGCQ *PLNXIOURINGCQ;
+/** Pointer to a constant Linux io_uring CQ ring header. */
+typedef const LNXIOURINGCQ *PCLNXIOURINGCQ;
+
+
+/**
+ * Linux io_uring parameters passed to io_uring_setup().
+ *
+ * The callers in this file zero the structure before the call and read the
+ * entry counts and ring offsets back afterwards.
+ */
+typedef struct LNXIOURINGPARAMS
+{
+ /** Number of SQ entries, read back after io_uring_setup() to size the mappings. */
+ uint32_t u32SqEntriesCnt;
+ /** Number of CQ entries, read back after io_uring_setup() to size the mappings. */
+ uint32_t u32CqEntriesCnt;
+ /** Flags for the ring, see LNX_IOURING_SETUP_F_*. */
+ uint32_t u32Flags;
+ /** Affinity of the kernel side SQ polling thread if enabled. */
+ uint32_t u32SqPollCpu;
+ /** Milliseconds after which the kernel side SQ polling thread goes to sleep
+ * if there are no requests to process. */
+ uint32_t u32SqPollIdleMs;
+ /** Reserved. */
+ uint32_t au32Rsvd0[5];
+ /** Offsets returned for the submission queue. */
+ LNXIOURINGSQ SqOffsets;
+ /** Offsets returned for the completion queue. */
+ LNXIOURINGCQ CqOffsets;
+} LNXIOURINGPARAMS;
+/** Pointer to Linux io_uring parameters. */
+typedef LNXIOURINGPARAMS *PLNXIOURINGPARAMS;
+/** Pointer to constant Linux io_uring parameters. */
+typedef const LNXIOURINGPARAMS *PCLNXIOURINGPARAMS;
+
+
+/** @name LNXIOURINGSQE::u8Opc defined opcodes.
+ * @{ */
+/** Opcode to profile the interface, does nothing. */
+#define LNX_IOURING_OPC_NOP 0
+/** preadv() like request. */
+#define LNX_IOURING_OPC_READV 1
+/** pwritev() like request. */
+#define LNX_IOURING_OPC_WRITEV 2
+/** fsync() like request. */
+#define LNX_IOURING_OPC_FSYNC 3
+/** Read request using a fixed preset buffer. */
+#define LNX_IOURING_OPC_READ_FIXED 4
+/** Write request using a fixed preset buffer. */
+#define LNX_IOURING_OPC_WRITE_FIXED 5
+/** Add file descriptor to pollset. */
+#define LNX_IOURING_OPC_POLL_ADD 6
+/** Remove file descriptor from pollset. */
+#define LNX_IOURING_OPC_POLL_REMOVE 7
+/** sync_file_range() like request. */
+#define LNX_IOURING_OPC_SYNC_FILE_RANGE 8
+/** sendmsg() like request. */
+#define LNX_IOURING_OPC_SENDMSG 9
+/** recvmsg() like request. */
+#define LNX_IOURING_OPC_RECVMSG 10
+/** @} */
+
+
+/** @name Additional flags for LNX_IOURING_OPC_FSYNC requests.
+ * @{ */
+/** Use fdatasync() semantics: flush the file data but skip metadata which is not required to read it back. */
+#define LNX_IOURING_OPC_FSYNC_DATASYNC RT_BIT_32(0)
+/** @} */
+
+
+/** @name Flags for the LNX_IOURING_SYSCALL_SETUP syscall.
+ * @{ */
+/** The I/O context is polled. */
+#define LNX_IOURING_SETUP_F_IOPOLL RT_BIT_32(0)
+/** The kernel should poll the submission queue. */
+#define LNX_IOURING_SETUP_F_SQPOLL RT_BIT_32(1)
+/** Sets the CPU affinity of the kernel thread polling the submission queue. */
+#define LNX_IOURING_SETUP_F_SQAFF RT_BIT_32(2)
+/** @} */
+
+
+/** @name Flags for LNXIOURINGSQE::u8Flags.
+ * @{ */
+/** The file descriptor was registered before use. */
+#define LNX_IOURING_SQE_F_FIXED_FILE RT_BIT(0)
+/** Complete all active requests before issuing the request with the flag set. */
+#define LNX_IOURING_SQE_F_IO_DRAIN RT_BIT(1)
+/** Links the request with the flag set to the next one. */
+#define LNX_IOURING_SQE_F_IO_LINK RT_BIT(2)
+/** @} */
+
+
+/** @name Magic mmap offsets to map submission and completion queues.
+ * @{ */
+/** Used to map the submission queue. */
+#define LNX_IOURING_MMAP_OFF_SQ UINT64_C(0)
+/** Used to map the completion queue. */
+#define LNX_IOURING_MMAP_OFF_CQ UINT64_C(0x8000000)
+/** Used to map the submission queue entries array. */
+#define LNX_IOURING_MMAP_OFF_SQES UINT64_C(0x10000000)
+/** @} */
+
+
+/** @name Flags used for the SQ ring structure.
+ * @{ */
+/** The kernel thread needs a io_uring_enter() wakeup to continue processing requests. */
+#define LNX_IOURING_SQ_RING_F_NEED_WAKEUP RT_BIT_32(0)
+/** @} */
+
+
+/** @name Flags for the LNX_IOURING_SYSCALL_ENTER syscall.
+ * @{ */
+/** Retrieve completion events for the completion queue. */
+#define LNX_IOURING_ENTER_F_GETEVENTS RT_BIT_32(0)
+/** Wakes the suspended kernel thread processing the requests. */
+#define LNX_IOURING_ENTER_F_SQ_WAKEUP RT_BIT_32(1)
+/** @} */
+
+
+/** @name Opcodes for the LNX_IOURING_SYSCALL_REGISTER syscall.
+ * @{ */
+/** Register a fixed set of buffers. */
+#define LNX_IOURING_REGISTER_OPC_BUFFERS_REGISTER 0
+/** Unregisters a fixed set of buffers registered previously. */
+#define LNX_IOURING_REGISTER_OPC_BUFFERS_UNREGISTER 1
+/** Register a fixed set of files. */
+#define LNX_IOURING_REGISTER_OPC_FILES_REGISTER 2
+/** Unregisters a fixed set of files registered previously. */
+#define LNX_IOURING_REGISTER_OPC_FILES_UNREGISTER 3
+/** Register an eventfd associated with the I/O ring. */
+#define LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER 4
+/** Unregisters an eventfd registered previously. */
+#define LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER 5
+/** @} */
+
+
+/**
+ * SQ ring structure.
+ *
+ * Set up by the queue init callback from the mapped SQ ring and the offsets
+ * returned by io_uring_setup().
+ *
+ * @note Some members of this structure point to memory shared with the kernel,
+ * hence the volatile keyword.
+ */
+typedef struct RTIOQUEUESQ
+{
+ /** Pointer to the head counter. */
+ volatile uint32_t *pidxHead;
+ /** Pointer to the tail counter (written by the commit callback to publish requests). */
+ volatile uint32_t *pidxTail;
+ /** Mask to apply for the counters to get to the index (value copied at init time). */
+ uint32_t fRingMask;
+ /** Number of entries in the ring (value copied at init time). */
+ uint32_t cEntries;
+ /** Pointer to the global flags. */
+ volatile uint32_t *pfFlags;
+ /** Pointer to the indirection array used for indexing the real SQ entries. */
+ volatile uint32_t *paidxSqes;
+} RTIOQUEUESQ;
+
+
+/**
+ * CQ ring structure.
+ *
+ * Set up by the queue init callback from the mapped CQ ring and the offsets
+ * returned by io_uring_setup().
+ *
+ * @note Some members of this structure point to memory shared with the kernel,
+ * hence the volatile keyword.
+ */
+typedef struct RTIOQUEUECQ
+{
+ /** Pointer to the head counter (written back after completion events were consumed). */
+ volatile uint32_t *pidxHead;
+ /** Pointer to the tail counter (only read by this provider). */
+ volatile uint32_t *pidxTail;
+ /** Mask to apply for the counters to get to the index (value copied at init time). */
+ uint32_t fRingMask;
+ /** Number of entries in the ring (value copied at init time). */
+ uint32_t cEntries;
+ /** Pointer to the completion entry ring. */
+ volatile LNXIOURINGCQE *paCqes;
+} RTIOQUEUECQ;
+
+
+/**
+ * Internal I/O queue provider instance data.
+ */
+typedef struct RTIOQUEUEPROVINT
+{
+ /** The io_uring file descriptor. */
+ int iFdIoCtx;
+ /** The eventfd file descriptor registered with the ring. */
+ int iFdEvt;
+ /** The submission queue. */
+ RTIOQUEUESQ Sq;
+ /** The currently uncommitted tail for the SQ. */
+ uint32_t idxSqTail;
+ /** Number of uncommitted SQEs. */
+ uint32_t cSqesToCommit;
+ /** The completion queue. */
+ RTIOQUEUECQ Cq;
+ /** Pointer to the mapped SQES entries. */
+ PLNXIOURINGSQE paSqes;
+ /** Pointer to the iovec array used for non S/G requests, one entry per SQ
+ * entry and indexed the same way as paSqes. */
+ struct iovec *paIoVecs;
+ /** Pointer returned by mmap() for the SQ ring, used for unmapping. */
+ void *pvMMapSqRing;
+ /** Pointer returned by mmap() for the CQ ring, used for unmapping. */
+ void *pvMMapCqRing;
+ /** Pointer returned by mmap() for the SQ entries array, used for unmapping. */
+ void *pvMMapSqes;
+ /** Size of the mapped SQ ring, used for unmapping. */
+ size_t cbMMapSqRing;
+ /** Size of the mapped CQ ring, used for unmapping. */
+ size_t cbMMapCqRing;
+ /** Size of the mapped SQ entries array, used for unmapping. */
+ size_t cbMMapSqes;
+ /** Flag whether the waiter was woken up externally. */
+ volatile bool fExtIntr;
+} RTIOQUEUEPROVINT;
+/** Pointer to the internal I/O queue provider instance data. */
+typedef RTIOQUEUEPROVINT *PRTIOQUEUEPROVINT;
+
+
+/*********************************************************************************************************************************
+* Internal Functions *
+*********************************************************************************************************************************/
+
+/**
+ * Thin wrapper around the io_uring_setup() syscall.
+ *
+ * @returns IPRT status code, the converted errno on failure.
+ * @param cEntries Number of entries for submission and completion queues.
+ * @param pParams Parameters for the I/O ring, updated with the values
+ *                returned by the kernel on success.
+ * @param piFdIoCtx Where to store the file descriptor of the I/O ring on success.
+ */
+DECLINLINE(int) rtIoQueueLnxIoURingSetup(uint32_t cEntries, PLNXIOURINGPARAMS pParams, int32_t *piFdIoCtx)
+{
+    int iFdRing = syscall(LNX_IOURING_SYSCALL_SETUP, cEntries, pParams);
+    if (RT_LIKELY(iFdRing != -1))
+    {
+        *piFdIoCtx = iFdRing;
+        return VINF_SUCCESS;
+    }
+
+    return RTErrConvertFromErrno(errno);
+}
+
+
+/**
+ * Thin wrapper around the io_uring_enter() syscall.
+ *
+ * The value returned by the syscall on success is not passed on to the caller.
+ *
+ * @returns IPRT status code, the converted errno on failure.
+ * @param iFdIoCtx The I/O ring file descriptor.
+ * @param cToSubmit Maximum number of requests waiting for processing.
+ * @param cMinComplete Minimum number of completion events to accumulate before returning.
+ * @param fFlags Flags for io_uring_enter(), see LNX_IOURING_ENTER_F_*.
+ */
+DECLINLINE(int) rtIoQueueLnxIoURingEnter(int32_t iFdIoCtx, uint32_t cToSubmit, uint32_t cMinComplete,
+                                         uint32_t fFlags)
+{
+    int rcLnx = syscall(LNX_IOURING_SYSCALL_ENTER, iFdIoCtx, cToSubmit, cMinComplete, fFlags,
+                        NULL, 0);
+    return RT_UNLIKELY(rcLnx == -1)
+         ? RTErrConvertFromErrno(errno)
+         : VINF_SUCCESS;
+}
+
+
+/**
+ * Thin wrapper around the io_uring_register() syscall.
+ *
+ * @returns IPRT status code, the converted errno on failure.
+ * @param iFdIoCtx The I/O ring file descriptor.
+ * @param uOpc Operation to perform, see LNX_IOURING_REGISTER_OPC_*.
+ * @param pvArg Opaque arguments.
+ * @param cArgs Number of arguments.
+ */
+DECLINLINE(int) rtIoQueueLnxIoURingRegister(int32_t iFdIoCtx, uint32_t uOpc, void *pvArg,
+                                            uint32_t cArgs)
+{
+    int rcLnx = syscall(LNX_IOURING_SYSCALL_REGISTER, iFdIoCtx, uOpc, pvArg, cArgs);
+    return RT_UNLIKELY(rcLnx == -1)
+         ? RTErrConvertFromErrno(errno)
+         : VINF_SUCCESS;
+}
+
+
+/**
+ * Maps a region of the I/O ring into the process and converts failures to an
+ * IPRT status code.
+ *
+ * The mapping is always created readable and writable, shared and pre-populated.
+ *
+ * @returns IPRT status code, the converted errno on failure.
+ * @param iFdIoCtx The I/O ring file descriptor.
+ * @param offMmap The mmap() offset.
+ * @param cbMmap How much to map.
+ * @param ppv Where to store the pointer to the mapping on success.
+ */
+DECLINLINE(int) rtIoQueueLnxIoURingMmap(int iFdIoCtx, off_t offMmap, size_t cbMmap, void **ppv)
+{
+    void *pvMapping = mmap(0, cbMmap, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, iFdIoCtx, offMmap);
+    if (pvMapping == MAP_FAILED)
+        return RTErrConvertFromErrno(errno);
+
+    *ppv = pvMapping;
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Thin wrapper around the eventfd2() syscall.
+ *
+ * @returns IPRT status code, the converted errno on failure.
+ * @param uValInit The initial value of the maintained counter.
+ * @param fFlags Flags controlling the eventfd behavior.
+ * @param piFdEvt Where to store the file descriptor of the eventfd object on success.
+ */
+DECLINLINE(int) rtIoQueueLnxEventfd2(uint32_t uValInit, uint32_t fFlags, int *piFdEvt)
+{
+    int iFdNew = syscall(LNX_SYSCALL_EVENTFD2, uValInit, fFlags);
+    if (RT_LIKELY(iFdNew != -1))
+    {
+        *piFdEvt = iFdNew;
+        return VINF_SUCCESS;
+    }
+
+    return RTErrConvertFromErrno(errno);
+}
+
+
+/**
+ * Checks the completion event queue for pending events.
+ *
+ * Copies up to @a cCEvt pending completion entries into @a paCEvt and then
+ * publishes the new head counter so the consumed ring slots can be reused.
+ *
+ * @param pThis The provider instance.
+ * @param paCEvt Pointer to the array of completion events.
+ * @param cCEvt Maximum number of completion events the array can hold.
+ * @param pcCEvtSeen Where to store the number of completion events processed.
+ */
+static void rtIoQueueLnxIoURingFileProvCqCheck(PRTIOQUEUEPROVINT pThis, PRTIOQUEUECEVT paCEvt,
+                                               uint32_t cCEvt, uint32_t *pcCEvtSeen)
+{
+    /* The fencing and atomic accesses are kind of overkill and probably not required (dev paranoia). */
+    ASMReadFence();
+    uint32_t idxCqHead = ASMAtomicReadU32(pThis->Cq.pidxHead);
+    uint32_t idxCqTail = ASMAtomicReadU32(pThis->Cq.pidxTail);
+    ASMReadFence();
+
+    uint32_t cCEvtSeen = 0;
+
+    while (   idxCqTail != idxCqHead
+           && cCEvtSeen < cCEvt)
+    {
+        /* Get the index. */
+        uint32_t idxCqe = idxCqHead & pThis->Cq.fRingMask;
+        volatile LNXIOURINGCQE *pCqe = &pThis->Cq.paCqes[idxCqe];
+
+        /* Read the status exactly once, the entry lives in memory shared with the kernel. */
+        int32_t const rcLnx = pCqe->rcLnx;
+
+        paCEvt->pvUser = (void *)(uintptr_t)pCqe->u64User;
+        if (rcLnx >= 0)
+        {
+            paCEvt->rcReq    = VINF_SUCCESS;
+            paCEvt->cbXfered = (size_t)rcLnx;
+        }
+        else
+        {
+            paCEvt->rcReq    = RTErrConvertFromErrno(-rcLnx);
+            paCEvt->cbXfered = 0; /* Don't hand stale data to the caller for failed requests. */
+        }
+
+#ifdef RT_STRICT /* poison */
+        memset((void *)pCqe, 0xff, sizeof(*pCqe));
+#endif
+
+        paCEvt++;
+        cCEvtSeen++;
+        idxCqHead++;
+    }
+
+    *pcCEvtSeen = cCEvtSeen;
+
+    /* Paranoia strikes again. */
+    ASMWriteFence();
+    ASMAtomicWriteU32(pThis->Cq.pidxHead, idxCqHead);
+    ASMWriteFence();
+}
+
+
+/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnIsSupported} */
+static DECLCALLBACK(bool) rtIoQueueLnxIoURingFileProv_IsSupported(void)
+{
+    /*
+     * Probe by creating a small I/O ring and tearing it down again.
+     * The common code/public API already checked for the proper handle type.
+     */
+    LNXIOURINGPARAMS Params;
+    RT_ZERO(Params);
+
+    int iFdIoCtx = 0;
+    if (RT_FAILURE(rtIoQueueLnxIoURingSetup(16, &Params, &iFdIoCtx)))
+        return false;
+
+    /*
+     * The provider additionally needs to register an eventfd descriptor with the
+     * ring, both for completion notifications and for kicking the waiter externally.
+     */
+    bool fSupported = false;
+    int  iFdEvt     = 0;
+    if (RT_SUCCESS(rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &iFdEvt)))
+    {
+        fSupported = RT_SUCCESS(rtIoQueueLnxIoURingRegister(iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER,
+                                                            &iFdEvt, 1 /*cArgs*/));
+
+        int rcLnx = close(iFdEvt); Assert(!rcLnx); RT_NOREF(rcLnx);
+    }
+
+    int rcLnx = close(iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
+    return fSupported;
+}
+
+
+/**
+ * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueInit}
+ *
+ * Creates the I/O ring, allocates one iovec per SQ entry, registers an eventfd
+ * with the ring and maps the SQ ring, CQ ring and SQ entries array.  On failure
+ * everything acquired so far is released again and the status code of the step
+ * that failed is returned.
+ *
+ * @note fFlags and cCqEntries are currently ignored.
+ */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_QueueInit(RTIOQUEUEPROV hIoQueueProv, uint32_t fFlags,
+                                                               uint32_t cSqEntries, uint32_t cCqEntries)
+{
+    RT_NOREF(fFlags, cCqEntries);
+
+    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
+    LNXIOURINGPARAMS Params;
+    RT_ZERO(Params);
+
+    pThis->cSqesToCommit = 0;
+    pThis->fExtIntr      = false;
+
+    int rc = rtIoQueueLnxIoURingSetup(cSqEntries, &Params, &pThis->iFdIoCtx);
+    if (RT_SUCCESS(rc))
+    {
+        /* Map the rings into userspace. */
+        pThis->cbMMapSqRing = Params.SqOffsets.u32OffArray + Params.u32SqEntriesCnt * sizeof(uint32_t);
+        pThis->cbMMapCqRing = Params.CqOffsets.u32OffCqes + Params.u32CqEntriesCnt * sizeof(LNXIOURINGCQE);
+        pThis->cbMMapSqes   = Params.u32SqEntriesCnt * sizeof(LNXIOURINGSQE);
+
+        pThis->paIoVecs = (struct iovec *)RTMemAllocZ(Params.u32SqEntriesCnt * sizeof(struct iovec));
+        if (RT_LIKELY(pThis->paIoVecs))
+        {
+            rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &pThis->iFdEvt);
+            if (RT_SUCCESS(rc))
+            {
+                rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER, &pThis->iFdEvt, 1 /*cArgs*/);
+                if (RT_SUCCESS(rc))
+                {
+                    rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQ, pThis->cbMMapSqRing, &pThis->pvMMapSqRing);
+                    if (RT_SUCCESS(rc))
+                    {
+                        rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_CQ, pThis->cbMMapCqRing, &pThis->pvMMapCqRing);
+                        if (RT_SUCCESS(rc))
+                        {
+                            rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQES, pThis->cbMMapSqes, &pThis->pvMMapSqes);
+                            if (RT_SUCCESS(rc))
+                            {
+                                /* Resolve the offsets returned by the kernel into pointers/values. */
+                                uint8_t *pbTmp = (uint8_t *)pThis->pvMMapSqRing;
+
+                                pThis->Sq.pidxHead  = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffHead);
+                                pThis->Sq.pidxTail  = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffTail);
+                                pThis->Sq.fRingMask = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingMask);
+                                pThis->Sq.cEntries  = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingEntries);
+                                pThis->Sq.pfFlags   = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffFlags);
+                                pThis->Sq.paidxSqes = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffArray);
+                                pThis->idxSqTail    = *pThis->Sq.pidxTail;
+
+                                pThis->paSqes       = (PLNXIOURINGSQE)pThis->pvMMapSqes;
+
+                                pbTmp = (uint8_t *)pThis->pvMMapCqRing;
+
+                                pThis->Cq.pidxHead  = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffHead);
+                                pThis->Cq.pidxTail  = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffTail);
+                                pThis->Cq.fRingMask = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingMask);
+                                pThis->Cq.cEntries  = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingEntries);
+                                pThis->Cq.paCqes    = (PLNXIOURINGCQE)(pbTmp + Params.CqOffsets.u32OffCqes);
+                                return VINF_SUCCESS;
+                            }
+
+                            munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing);
+                        }
+
+                        munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing);
+                    }
+
+                    /* Use a separate status variable, rc must keep the reason for the failure. */
+                    int rc2 = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
+                    AssertRC(rc2); RT_NOREF(rc2);
+                }
+
+                close(pThis->iFdEvt);
+            }
+
+            RTMemFree(pThis->paIoVecs);
+        }
+        else
+            rc = VERR_NO_MEMORY; /* rc still holds the success status of the ring setup here. */
+
+        int rcLnx = close(pThis->iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx);
+    }
+
+    return rc;
+}
+
+
+/**
+ * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueDestroy}
+ *
+ * Unmaps the rings, unregisters and closes the eventfd, closes the I/O ring and
+ * frees the iovec array before wiping the instance data.
+ */
+static DECLCALLBACK(void) rtIoQueueLnxIoURingFileProv_QueueDestroy(RTIOQUEUEPROV hIoQueueProv)
+{
+    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
+
+    int rcLnx = munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
+    rcLnx = munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing); Assert(!rcLnx); RT_NOREF(rcLnx);
+    rcLnx = munmap(pThis->pvMMapSqes, pThis->cbMMapSqes); Assert(!rcLnx); RT_NOREF(rcLnx);
+
+    int rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0);
+    AssertRC(rc);
+
+    close(pThis->iFdEvt);
+    close(pThis->iFdIoCtx);
+    RTMemFree(pThis->paIoVecs);
+
+    /* Wipe the instance data itself (not the local pointer) so stale descriptors
+       and mappings cannot be reused by accident. */
+    RT_ZERO(*pThis);
+}
+
+
+/**
+ * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleRegister}
+ *
+ * Nothing to do for now, always succeeds.
+ */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleRegister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
+{
+    /** @todo Add support for fixed file sets later. */
+    RT_NOREF(hIoQueueProv);
+    RT_NOREF(pHandle);
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleDeregister}
+ *
+ * Nothing to do for now, always succeeds.
+ */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleDeregister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle)
+{
+    /** @todo Add support for fixed file sets later. */
+    RT_NOREF(hIoQueueProv);
+    RT_NOREF(pHandle);
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnReqPrepare}
+ *
+ * Fills in the SQ entry at the current uncommitted tail position together with
+ * the iovec slot belonging to it.  The request only becomes visible to the
+ * kernel once the commit callback publishes the new tail.
+ *
+ * @returns VINF_SUCCESS, or VERR_INVALID_PARAMETER for an unknown operation (the
+ *          uncommitted tail is not advanced in that case).
+ */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_ReqPrepare(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle, RTIOQUEUEOP enmOp,
+                                                                uint64_t off, void *pvBuf, size_t cbBuf, uint32_t fReqFlags,
+                                                                void *pvUser)
+{
+    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
+    RT_NOREF(fReqFlags);
+
+    uint32_t idx = pThis->idxSqTail & pThis->Sq.fRingMask;
+    PLNXIOURINGSQE pSqe = &pThis->paSqes[idx];
+    struct iovec *pIoVec = &pThis->paIoVecs[idx];
+
+    pIoVec->iov_base = pvBuf;
+    pIoVec->iov_len  = cbBuf;
+
+    pSqe->u8Flags         = 0;
+    pSqe->u16IoPrio       = 0;
+    pSqe->i32Fd           = (int32_t)RTFileToNative(pHandle->u.hFile);
+    pSqe->u64OffStart     = off;
+    pSqe->u64AddrBufIoVec = (uint64_t)(uintptr_t)pIoVec;
+    pSqe->u32BufIoVecSz   = 1;
+    pSqe->u64User         = (uint64_t)(uintptr_t)pvUser;
+
+    switch (enmOp)
+    {
+        case RTIOQUEUEOP_READ:
+            pSqe->u8Opc = LNX_IOURING_OPC_READV;
+            pSqe->uOpc.u32KrnlRwFlags = 0;
+            break;
+        case RTIOQUEUEOP_WRITE:
+            pSqe->u8Opc = LNX_IOURING_OPC_WRITEV;
+            pSqe->uOpc.u32KrnlRwFlags = 0;
+            break;
+        case RTIOQUEUEOP_SYNC:
+            pSqe->u8Opc = LNX_IOURING_OPC_FSYNC;
+            pSqe->uOpc.u32FsyncFlags = 0;
+            /*
+             * An fsync request carries no buffer: the kernel rejects FSYNC entries
+             * with a non-zero address and interprets offset/length as the range to
+             * sync.  Clear all three to request a flush of the whole file.
+             */
+            pSqe->u64OffStart     = 0;
+            pSqe->u64AddrBufIoVec = 0;
+            pSqe->u32BufIoVecSz   = 0;
+            break;
+        default:
+            AssertMsgFailedReturn(("Invalid I/O queue operation: %d\n", enmOp),
+                                  VERR_INVALID_PARAMETER);
+    }
+
+    pThis->Sq.paidxSqes[idx] = idx;
+    pThis->idxSqTail++;
+    pThis->cSqesToCommit++;
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnCommit}
+ *
+ * Publishes the uncommitted SQ tail and enters the kernel to submit the
+ * prepared requests.
+ */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_Commit(RTIOQUEUEPROV hIoQueueProv, uint32_t *pcReqsCommitted)
+{
+    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
+
+    /* Make the new tail visible to the kernel before entering it. */
+    ASMWriteFence();
+    ASMAtomicWriteU32(pThis->Sq.pidxTail, pThis->idxSqTail);
+    ASMWriteFence();
+
+    int rc = rtIoQueueLnxIoURingEnter(pThis->iFdIoCtx, pThis->cSqesToCommit, 0, 0 /*fFlags*/);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    /* Everything prepared so far is reported as committed. */
+    *pcReqsCommitted     = pThis->cSqesToCommit;
+    pThis->cSqesToCommit = 0;
+    return rc;
+}
+
+
+/**
+ * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWait}
+ *
+ * Collects completion events until at least @a cMinWait of them were seen, an
+ * error occurred or the waiter got interrupted externally.
+ */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWait(RTIOQUEUEPROV hIoQueueProv, PRTIOQUEUECEVT paCEvt, uint32_t cCEvt,
+                                                             uint32_t cMinWait, uint32_t *pcCEvt, uint32_t fFlags)
+{
+    RT_NOREF(fFlags);
+    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
+
+    /*
+     * Harvest whatever is already sitting in the completion queue, this might
+     * save us a context switch later on.
+     */
+    uint32_t cHarvested = 0;
+    rtIoQueueLnxIoURingFileProvCqCheck(pThis, paCEvt, cCEvt, &cHarvested);
+
+    int rc = VINF_SUCCESS;
+    while (   RT_SUCCESS(rc)
+           && cHarvested < cMinWait)
+    {
+        /*
+         * Block in read() on the event file descriptor, it returns either when
+         * woken up externally or when there are completion events pending.
+         */
+        uint64_t uCnt = 0; /**< The counter value returned upon a successful read(). */
+        ssize_t cbRead = read(pThis->iFdEvt, &uCnt, sizeof(uCnt));
+        if (cbRead == -1)
+            rc = RTErrConvertFromErrno(errno);
+        else if (cbRead == sizeof(uCnt))
+        {
+            uint32_t cBatch = 0;
+            rtIoQueueLnxIoURingFileProvCqCheck(pThis, &paCEvt[cHarvested], cCEvt - cHarvested, &cBatch);
+            cHarvested += cBatch;
+
+            /* Consume a pending external wakeup request, if any. */
+            if (ASMAtomicXchgBool(&pThis->fExtIntr, false))
+                rc = VERR_INTERRUPTED;
+        }
+        else
+            AssertMsgFailed(("Unexpected read() -> 0\n"));
+    }
+
+    *pcCEvt = cHarvested;
+    return rc;
+}
+
+
+/**
+ * @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWaitWakeup}
+ *
+ * Marks the waiter as externally interrupted and kicks it through the eventfd,
+ * unless a wakeup request is already pending.
+ */
+static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWaitWakeup(RTIOQUEUEPROV hIoQueueProv)
+{
+    PRTIOQUEUEPROVINT pThis = hIoQueueProv;
+
+    /* A wakeup is pending already, no need to signal the eventfd a second time. */
+    if (ASMAtomicXchgBool(&pThis->fExtIntr, true))
+        return VINF_SUCCESS;
+
+    const uint64_t uValAdd = 1;
+    ssize_t cbWritten = write(pThis->iFdEvt, &uValAdd, sizeof(uValAdd));
+
+    Assert(cbWritten == -1 || cbWritten == sizeof(uValAdd));
+    if (cbWritten == -1)
+        return RTErrConvertFromErrno(errno);
+
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Async file I/O queue provider virtual method table for the Linux io_uring
+ * backend.  The entries are positional; each one is labelled with the name of
+ * the table member it initializes.
+ */
+RT_DECL_DATA_CONST(RTIOQUEUEPROVVTABLE const) g_RTIoQueueLnxIoURingProv =
+{
+ /** uVersion */
+ RTIOQUEUEPROVVTABLE_VERSION,
+ /** pszId */
+ "LnxIoURingFile",
+ /** cbIoQueueProv */
+ sizeof(RTIOQUEUEPROVINT),
+ /** enmHnd */
+ RTHANDLETYPE_FILE,
+ /** fFlags */
+ 0,
+ /** pfnIsSupported */
+ rtIoQueueLnxIoURingFileProv_IsSupported,
+ /** pfnQueueInit */
+ rtIoQueueLnxIoURingFileProv_QueueInit,
+ /** pfnQueueDestroy */
+ rtIoQueueLnxIoURingFileProv_QueueDestroy,
+ /** pfnHandleRegister */
+ rtIoQueueLnxIoURingFileProv_HandleRegister,
+ /** pfnHandleDeregister */
+ rtIoQueueLnxIoURingFileProv_HandleDeregister,
+ /** pfnReqPrepare */
+ rtIoQueueLnxIoURingFileProv_ReqPrepare,
+ /** pfnReqPrepareSg - no callback is provided by this provider. */
+ NULL,
+ /** pfnCommit */
+ rtIoQueueLnxIoURingFileProv_Commit,
+ /** pfnEvtWait */
+ rtIoQueueLnxIoURingFileProv_EvtWait,
+ /** pfnEvtWaitWakeup */
+ rtIoQueueLnxIoURingFileProv_EvtWaitWakeup,
+ /** uEndMarker */
+ RTIOQUEUEPROVVTABLE_VERSION
+};
+
diff --git a/src/VBox/Runtime/r3/linux/krnlmod-linux.cpp b/src/VBox/Runtime/r3/linux/krnlmod-linux.cpp
new file mode 100644
index 00000000..6d81d530
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/krnlmod-linux.cpp
@@ -0,0 +1,358 @@
+/* $Id: krnlmod-linux.cpp $ */
+/** @file
+ * IPRT - Kernel module, Linux.
+ */
+
+/*
+ * Copyright (C) 2017-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_SYSTEM
+#include <iprt/krnlmod.h>
+#include <iprt/linux/sysfs.h>
+#include <iprt/asm.h>
+#include <iprt/assert.h>
+#include <iprt/dir.h>
+#include <iprt/err.h>
+#include <iprt/mem.h>
+#include <iprt/string.h>
+#include <iprt/types.h>
+
+
+/**
+ * Internal kernel information record state.
+ *
+ * The record is allocated in one piece with the module name appended (see
+ * achName), so its real size depends on the name length.
+ */
+typedef struct RTKRNLMODINFOINT
+{
+ /** Reference counter of this record (not of the kernel module). */
+ volatile uint32_t cRefs;
+ /** Reference count for the kernel module (read from the "refcnt" attribute). */
+ uint32_t cRefKrnlMod;
+ /** Load address of the kernel module (read from "sections/.text"). */
+ RTR0UINTPTR uLoadAddr;
+ /** Size of the kernel module (read from the "coresize" attribute). */
+ size_t cbKrnlMod;
+ /** Size of the name in characters including the zero terminator. */
+ size_t cchName;
+ /** Module name - variable in size, zero terminated. */
+ char achName[1];
+} RTKRNLMODINFOINT;
+/** Pointer to the internal kernel module information record. */
+typedef RTKRNLMODINFOINT *PRTKRNLMODINFOINT;
+/** Pointer to a const internal kernel module information record. */
+typedef const RTKRNLMODINFOINT *PCRTKRNLMODINFOINT;
+
+
+
+/**
+ * Destroy the given kernel module information record.
+ *
+ * The module name lives inside the same allocation as the record, so a
+ * single RTMemFree() releases everything.
+ *
+ * @param pThis The record to destroy.
+ */
+static void rtKrnlModInfoDestroy(PRTKRNLMODINFOINT pThis)
+{
+ RTMemFree(pThis);
+}
+
+
+/**
+ * Reads an integer attribute below "module/<name>/" in sysfs, falling back to
+ * a default value when the attribute file does not exist.
+ *
+ * @returns IPRT status code; VINF_SUCCESS when the default was substituted.
+ * @param   uBase   The number base passed on to RTLinuxSysFsReadIntFile().
+ * @param   pi64    Where to store the value read or the default.
+ * @param   i64Def  The default used when the file is missing.
+ * @param   pszName The kernel module name.
+ * @param   pszPath The attribute path relative to the module directory.
+ */
+static int rtKrnlModLinuxReadIntFileDef(unsigned uBase, int64_t *pi64, int64_t i64Def,
+                                        const char *pszName, const char *pszPath)
+{
+    int const rcRead = RTLinuxSysFsReadIntFile(uBase, pi64, "module/%s/%s", pszName, pszPath);
+    if (rcRead != VERR_FILE_NOT_FOUND)
+        return rcRead;
+
+    /* A missing attribute is not an error, hand back the default instead. */
+    *pi64 = i64Def;
+    return VINF_SUCCESS;
+}
+
+/**
+ * Allocates and fills a kernel module information record for a module.
+ *
+ * The "refcnt", "coresize" and "sections/.text" attributes are read with 0 as
+ * default for missing files.  NOTE(review): only the status of the last read
+ * ("sections/.text") decides whether creation succeeds; failures of the first
+ * two reads just leave the respective field at zero.
+ *
+ * @returns IPRT status code.
+ * @param   pszName         The kernel module name.
+ * @param   phKrnlModInfo   Where to store the handle to the kernel module information record
+ *                          on success.
+ */
+static int rtKrnlModLinuxInfoCreate(const char *pszName, PRTKRNLMODINFO phKrnlModInfo)
+{
+    size_t const      cbName = strlen(pszName) + 1; /* includes the terminator */
+    PRTKRNLMODINFOINT pThis  = (PRTKRNLMODINFOINT)RTMemAllocZ(RT_UOFFSETOF_DYN(RTKRNLMODINFOINT, achName[cbName]));
+    if (RT_UNLIKELY(!pThis))
+        return VERR_NO_MEMORY;
+
+    memcpy(&pThis->achName[0], pszName, cbName);
+    pThis->cchName = cbName;
+    pThis->cRefs   = 1;
+
+    int64_t i64Value = 0;
+    if (RT_SUCCESS(rtKrnlModLinuxReadIntFileDef(10, &i64Value, 0, pszName, "refcnt")))
+        pThis->cRefKrnlMod = (uint32_t)i64Value;
+
+    if (RT_SUCCESS(rtKrnlModLinuxReadIntFileDef(10, &i64Value, 0, pszName, "coresize")))
+        pThis->cbKrnlMod = i64Value;
+
+    int const rc = rtKrnlModLinuxReadIntFileDef(16, &i64Value, 0, pszName, "sections/.text");
+    if (RT_FAILURE(rc))
+    {
+        RTMemFree(pThis);
+        return rc;
+    }
+
+    pThis->uLoadAddr = i64Value;
+    *phKrnlModInfo   = pThis;
+    return rc;
+}
+
+
+/**
+ * Queries whether a kernel module with the given name has a sysfs entry
+ * below "module/".
+ *
+ * Uses RTLinuxSysFsExistsEx() because the status-code comparison below needs
+ * an IPRT status; RTLinuxSysFsExists() is used as a plain boolean elsewhere
+ * in IPRT and would make the checks come out inverted.
+ *
+ * @returns IPRT status code.
+ * @param   pszName     The kernel module name.
+ * @param   pfLoaded    Where to store the result; only written when
+ *                      VINF_SUCCESS is returned.
+ */
+RTDECL(int) RTKrnlModQueryLoaded(const char *pszName, bool *pfLoaded)
+{
+    AssertPtrReturn(pszName, VERR_INVALID_POINTER);
+    AssertPtrReturn(pfLoaded, VERR_INVALID_POINTER);
+
+    int rc = RTLinuxSysFsExistsEx("module/%s", pszName);
+    if (rc == VINF_SUCCESS)
+        *pfLoaded = true;
+    else if (rc == VERR_FILE_NOT_FOUND)
+    {
+        /* Not being loaded is a valid answer, not a failure. */
+        *pfLoaded = false;
+        rc = VINF_SUCCESS;
+    }
+
+    return rc;
+}
+
+
+/**
+ * Creates an information record for a loaded kernel module.
+ *
+ * Uses RTLinuxSysFsExistsEx() so that the result really is an IPRT status
+ * code; RTLinuxSysFsExists() is used as a boolean elsewhere in IPRT and
+ * cannot be compared against VINF_SUCCESS / VERR_FILE_NOT_FOUND.
+ *
+ * @returns IPRT status code, VERR_NOT_FOUND if there is no sysfs entry for
+ *          the module.
+ * @param   pszName         The kernel module name.
+ * @param   phKrnlModInfo   Where to store the record handle on success.
+ */
+RTDECL(int) RTKrnlModLoadedQueryInfo(const char *pszName, PRTKRNLMODINFO phKrnlModInfo)
+{
+    AssertPtrReturn(pszName, VERR_INVALID_POINTER);
+    AssertPtrReturn(phKrnlModInfo, VERR_INVALID_POINTER);
+
+    int rc = RTLinuxSysFsExistsEx("module/%s", pszName);
+    if (rc == VINF_SUCCESS)
+        rc = rtKrnlModLinuxInfoCreate(pszName, phKrnlModInfo);
+    else if (rc == VERR_FILE_NOT_FOUND)
+        rc = VERR_NOT_FOUND;
+
+    return rc;
+}
+
+
+/**
+ * Counts the entries in /sys/module, skipping the "." and ".." links.
+ *
+ * @returns Number of entries found; 0 if the directory cannot be opened.
+ *          Enumeration stops silently at the first read failure.
+ */
+RTDECL(uint32_t) RTKrnlModLoadedGetCount(void)
+{
+    RTDIR hDir = NULL;
+    if (RT_FAILURE(RTDirOpen(&hDir, "/sys/module")))
+        return 0;
+
+    uint32_t   cModules = 0;
+    RTDIRENTRY Entry;
+    while (RT_SUCCESS(RTDirRead(hDir, &Entry, NULL)))
+        if (!RTDirEntryIsStdDotLink(&Entry))
+            cModules++;
+
+    RTDirClose(hDir);
+    return cModules;
+}
+
+
+/**
+ * Creates information records for all entries found in /sys/module.
+ *
+ * @returns IPRT status code.
+ * @retval  VERR_BUFFER_OVERFLOW if the handle array is too small.
+ * @param   pahKrnlModInfo  Array receiving the record handles; may be NULL
+ *                          only if cEntriesMax is 0.
+ * @param   cEntriesMax     Number of entries the array can hold.
+ * @param   pcEntries       Where to store the entry count, optional (NULL is
+ *                          accepted).  On success this is the number of
+ *                          handles stored, on VERR_BUFFER_OVERFLOW the number
+ *                          of entries required (a lower bound if the module
+ *                          list grew while enumerating).
+ *
+ * On failure all handles created so far are released again.
+ */
+RTDECL(int) RTKrnlModLoadedQueryInfoAll(PRTKRNLMODINFO pahKrnlModInfo, uint32_t cEntriesMax,
+                                        uint32_t *pcEntries)
+{
+    if (cEntriesMax > 0)
+        AssertPtrReturn(pahKrnlModInfo, VERR_INVALID_POINTER);
+
+    uint32_t cKmodsLoaded = RTKrnlModLoadedGetCount();
+    if (cEntriesMax < cKmodsLoaded)
+    {
+        if (pcEntries) /* check the pointer, not the value it points to */
+            *pcEntries = cKmodsLoaded;
+        return VERR_BUFFER_OVERFLOW;
+    }
+
+    RTDIR hDir = NULL;
+    int rc = RTDirOpen(&hDir, "/sys/module");
+    if (RT_SUCCESS(rc))
+    {
+        uint32_t   idxKrnlModInfo = 0;
+        RTDIRENTRY DirEnt;
+
+        rc = RTDirRead(hDir, &DirEnt, NULL);
+        while (RT_SUCCESS(rc))
+        {
+            if (!RTDirEntryIsStdDotLink(&DirEnt))
+            {
+                /* The directory may have grown since it was counted above;
+                   never write beyond the caller's array. */
+                if (idxKrnlModInfo >= cEntriesMax)
+                {
+                    cKmodsLoaded = cEntriesMax + 1;
+                    rc = VERR_BUFFER_OVERFLOW;
+                    break;
+                }
+
+                rc = rtKrnlModLinuxInfoCreate(DirEnt.szName, &pahKrnlModInfo[idxKrnlModInfo]);
+                if (RT_SUCCESS(rc))
+                    idxKrnlModInfo++;
+            }
+
+            if (RT_SUCCESS(rc))
+                rc = RTDirRead(hDir, &DirEnt, NULL);
+        }
+
+        if (rc == VERR_NO_MORE_FILES)
+        {
+            rc = VINF_SUCCESS;
+            cKmodsLoaded = idxKrnlModInfo; /* report what was actually stored */
+        }
+        else if (RT_FAILURE(rc))
+        {
+            /* Rollback */
+            while (idxKrnlModInfo-- > 0)
+                RTKrnlModInfoRelease(pahKrnlModInfo[idxKrnlModInfo]);
+        }
+
+        if (pcEntries)
+            *pcEntries = cKmodsLoaded;
+
+        RTDirClose(hDir);
+    }
+
+    return rc;
+}
+
+
+/**
+ * Adds a reference to a kernel module information record.
+ *
+ * @returns The new reference count, UINT32_MAX if the handle is invalid.
+ * @param   hKrnlModInfo    The record handle.
+ */
+RTDECL(uint32_t) RTKrnlModInfoRetain(RTKRNLMODINFO hKrnlModInfo)
+{
+    PRTKRNLMODINFOINT pModInfo = hKrnlModInfo;
+    AssertPtrReturn(pModInfo, UINT32_MAX);
+
+    uint32_t const cNewRefs = ASMAtomicIncU32(&pModInfo->cRefs);
+    AssertMsg(cNewRefs > 1 && cNewRefs < _1M, ("%#x %p\n", cNewRefs, pModInfo));
+    return cNewRefs;
+}
+
+
+/**
+ * Drops a reference to a kernel module information record, destroying the
+ * record when the count reaches zero.
+ *
+ * @returns The new reference count, 0 for a NULL handle and UINT32_MAX if
+ *          the handle is otherwise invalid.
+ * @param   hKrnlModInfo    The record handle, NULL is quietly ignored.
+ */
+RTDECL(uint32_t) RTKrnlModInfoRelease(RTKRNLMODINFO hKrnlModInfo)
+{
+    PRTKRNLMODINFOINT pModInfo = hKrnlModInfo;
+    if (pModInfo == NULL)
+        return 0;
+    AssertPtrReturn(pModInfo, UINT32_MAX);
+
+    uint32_t const cRemaining = ASMAtomicDecU32(&pModInfo->cRefs);
+    AssertMsg(cRemaining < _1M, ("%#x %p\n", cRemaining, pModInfo));
+    if (!cRemaining)
+        rtKrnlModInfoDestroy(pModInfo);
+    return cRemaining;
+}
+
+
+/**
+ * Returns the kernel module reference count stored in the record.
+ *
+ * @returns The stored count, 0 if the handle is invalid.
+ * @param   hKrnlModInfo    The record handle.
+ */
+RTDECL(uint32_t) RTKrnlModInfoGetRefCnt(RTKRNLMODINFO hKrnlModInfo)
+{
+    PCRTKRNLMODINFOINT pModInfo = hKrnlModInfo;
+    AssertPtrReturn(pModInfo, 0);
+    return pModInfo->cRefKrnlMod;
+}
+
+
+/**
+ * Returns the module name stored in the record.
+ *
+ * @returns Pointer to the zero terminated name inside the record, NULL if the
+ *          handle is invalid.
+ * @param   hKrnlModInfo    The record handle.
+ */
+RTDECL(const char *) RTKrnlModInfoGetName(RTKRNLMODINFO hKrnlModInfo)
+{
+    PCRTKRNLMODINFOINT pModInfo = hKrnlModInfo;
+    AssertPtrReturn(pModInfo, NULL);
+    return &pModInfo->achName[0];
+}
+
+
+/**
+ * Returns the file path of the module - this implementation does not record
+ * one and always returns NULL.
+ *
+ * @returns NULL.
+ * @param   hKrnlModInfo    The record handle (only validated).
+ */
+RTDECL(const char *) RTKrnlModInfoGetFilePath(RTKRNLMODINFO hKrnlModInfo)
+{
+    PCRTKRNLMODINFOINT pModInfo = hKrnlModInfo;
+    AssertPtrReturn(pModInfo, NULL);
+    return NULL;
+}
+
+
+/**
+ * Returns the module size stored in the record.
+ *
+ * @returns The stored size, 0 if the handle is invalid.
+ * @param   hKrnlModInfo    The record handle.
+ */
+RTDECL(size_t) RTKrnlModInfoGetSize(RTKRNLMODINFO hKrnlModInfo)
+{
+    PCRTKRNLMODINFOINT pModInfo = hKrnlModInfo;
+    AssertPtrReturn(pModInfo, 0);
+    return pModInfo->cbKrnlMod;
+}
+
+
+/**
+ * Returns the module load address stored in the record.
+ *
+ * @returns The stored address, 0 if the handle is invalid.
+ * @param   hKrnlModInfo    The record handle.
+ */
+RTDECL(RTR0UINTPTR) RTKrnlModInfoGetLoadAddr(RTKRNLMODINFO hKrnlModInfo)
+{
+    PCRTKRNLMODINFOINT pModInfo = hKrnlModInfo;
+    AssertPtrReturn(pModInfo, 0);
+    return pModInfo->uLoadAddr;
+}
+
+
+/**
+ * Queries a module referencing the given one - not implemented here.
+ *
+ * @returns VERR_NOT_IMPLEMENTED, all parameters are ignored.
+ */
+RTDECL(int) RTKrnlModInfoQueryRefModInfo(RTKRNLMODINFO hKrnlModInfo, uint32_t idx,
+                                         PRTKRNLMODINFO phKrnlModInfoRef)
+{
+    RT_NOREF(hKrnlModInfo, idx, phKrnlModInfoRef);
+    return VERR_NOT_IMPLEMENTED;
+}
+
+
+RTDECL(int) RTKrnlModLoadByName(const char *pszName)
+{
+ AssertPtrReturn(pszName, VERR_INVALID_PARAMETER); /* Only the pointer is validated. */
+
+ return VERR_NOT_SUPPORTED; /* Loading by name is not implemented here. */
+}
+
+
+RTDECL(int) RTKrnlModLoadByPath(const char *pszPath)
+{
+ AssertPtrReturn(pszPath, VERR_INVALID_PARAMETER); /* Only the pointer is validated. */
+
+ return VERR_NOT_SUPPORTED; /* Loading by path is not implemented here. */
+}
+
+
+RTDECL(int) RTKrnlModUnloadByName(const char *pszName)
+{
+ AssertPtrReturn(pszName, VERR_INVALID_PARAMETER); /* Only the pointer is validated. */
+
+ return VERR_NOT_SUPPORTED; /* Unloading is not implemented here. */
+}
diff --git a/src/VBox/Runtime/r3/linux/mp-linux.cpp b/src/VBox/Runtime/r3/linux/mp-linux.cpp
new file mode 100644
index 00000000..935dfd22
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/mp-linux.cpp
@@ -0,0 +1,328 @@
+/* $Id: mp-linux.cpp $ */
+/** @file
+ * IPRT - Multiprocessor, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_SYSTEM
+#include <stdio.h>
+#include <errno.h>
+
+#include <iprt/mp.h>
+#include "internal/iprt.h"
+
+#include <iprt/alloca.h>
+#include <iprt/cpuset.h>
+#include <iprt/assert.h>
+#include <iprt/string.h>
+#include <iprt/linux/sysfs.h>
+
+
+/**
+ * Internal worker that determines the max possible CPU count.
+ *
+ * Probes "devices/system/cpu/cpuN" in sysfs for every N below
+ * RTCPUSET_MAX_CPUS and remembers the highest existing N + 1 in a static
+ * variable, so the probing is only repeated while the cached value is zero.
+ *
+ * @returns Max cpus, at least 1.
+ */
+static RTCPUID rtMpLinuxMaxCpus(void)
+{
+#if 0 /* this doesn't do the right thing :-/ */
+    int cMax = sysconf(_SC_NPROCESSORS_CONF);
+    Assert(cMax >= 1);
+    return cMax;
+#else
+    static uint32_t s_cMax = 0;
+    if (s_cMax)
+        return s_cMax;
+
+    int cCpus = 1;
+    for (unsigned idx = 0; idx < RTCPUSET_MAX_CPUS; idx++)
+        if (RTLinuxSysFsExists("devices/system/cpu/cpu%d", idx))
+            cCpus = idx + 1;
+    ASMAtomicUoWriteU32((uint32_t volatile *)&s_cMax, cCpus);
+    return cCpus;
+#endif
+}
+
+/**
+ * Internal worker that picks the processor speed in MHz from /proc/cpuinfo.
+ *
+ * Scans the file line by line: a "processor" line selects the CPU the
+ * following lines belong to, and the first "cpu MHz" line seen while the
+ * wanted CPU is selected provides the result.  The value is parsed as an
+ * integer (presumably dropping any fractional part - confirm against
+ * RTStrToInt64Ex).
+ *
+ * @returns CPU frequency, 0 if the file cannot be opened or nothing matched.
+ * @param   idCpu   The CPU to look up.
+ */
+static uint32_t rtMpLinuxGetFrequency(RTCPUID idCpu)
+{
+    FILE *pFile = fopen("/proc/cpuinfo", "r");
+    if (!pFile)
+        return 0;
+
+    char     szLine[256];
+    RTCPUID  idCpuCur = NIL_RTCPUID;
+    uint32_t uMHz     = 0;
+    while (fgets(szLine, sizeof(szLine), pFile))
+    {
+        char *pszColon;
+        if (   strncmp(szLine, RT_STR_TUPLE("processor")) == 0
+            && (szLine[10] == ' ' || szLine[10] == '\t' || szLine[10] == ':')
+            && (pszColon = strchr(szLine, ':')) != NULL)
+        {
+            /* Skip the colon and the blank following it. */
+            int64_t iCpu;
+            if (RT_SUCCESS(RTStrToInt64Ex(pszColon + 2, NULL, 0, &iCpu)))
+                idCpuCur = iCpu;
+        }
+        else if (   idCpu == idCpuCur
+                 && strncmp(szLine, RT_STR_TUPLE("cpu MHz")) == 0
+                 && (szLine[10] == ' ' || szLine[10] == '\t' || szLine[10] == ':')
+                 && (pszColon = strchr(szLine, ':')) != NULL)
+        {
+            int64_t iMHz;
+            char   *pszNext;
+            if (RT_SUCCESS(RTStrToInt64Ex(pszColon + 2, &pszNext, 0, &iMHz)))
+            {
+                uMHz = iMHz;
+                break;
+            }
+        }
+    }
+    fclose(pFile);
+    return uMHz;
+}
+
+
+/** @todo RTmpCpuId(). */
+
+/**
+ * Converts a CPU ID to a CPU set index; the two are identical here.
+ *
+ * @returns The index, -1 if the ID is not below the max CPU count.
+ * @param   idCpu   The CPU ID.
+ */
+RTDECL(int) RTMpCpuIdToSetIndex(RTCPUID idCpu)
+{
+    if (idCpu < rtMpLinuxMaxCpus())
+        return (int)idCpu;
+    return -1;
+}
+
+
+/**
+ * Converts a CPU set index to a CPU ID; the two are identical here.
+ *
+ * @returns The CPU ID, NIL_RTCPUID if the index is negative or not below the
+ *          max CPU count.
+ * @param   iCpu    The CPU set index.
+ */
+RTDECL(RTCPUID) RTMpCpuIdFromSetIndex(int iCpu)
+{
+    /* The unsigned cast turns negative indexes into large out-of-range values. */
+    if ((unsigned)iCpu < rtMpLinuxMaxCpus())
+        return iCpu;
+    return NIL_RTCPUID;
+}
+
+
+/**
+ * Returns the highest possible CPU ID, i.e. the max CPU count minus one.
+ */
+RTDECL(RTCPUID) RTMpGetMaxCpuId(void)
+{
+    RTCPUID const cCpusMax = rtMpLinuxMaxCpus();
+    return cCpusMax - 1;
+}
+
+
+/**
+ * Checks whether a CPU is online according to its sysfs "online" attribute.
+ *
+ * If the attribute cannot be read but the CPU directory exists, the CPU is
+ * assumed to be online.
+ *
+ * @returns true if online, false if offline or not present.
+ * @param   idCpu   The CPU ID.
+ */
+RTDECL(bool) RTMpIsCpuOnline(RTCPUID idCpu)
+{
+    /** @todo check if there is a simpler interface than this... */
+    int64_t i = 0;
+    int rc = RTLinuxSysFsReadIntFile(0, &i, "devices/system/cpu/cpu%d/online", (int)idCpu);
+    if (   RT_FAILURE(rc)
+        && RTLinuxSysFsExists("devices/system/cpu/cpu%d", (int)idCpu))
+    {
+        /** @todo Assert(!RTLinuxSysFsExists("devices/system/cpu/cpu%d/online",
+         *                                   (int)idCpu));
+         * Unfortunately, the online file wasn't always world readable (centos
+         * 2.6.18-164). */
+        i = 1;
+        rc = VINF_SUCCESS;
+    }
+
+    /* i is 64-bit, so it must not be formatted with plain %d. */
+    AssertMsg(i == 0 || i == -1 || i == 1, ("i=%RI64\n", i));
+    return RT_SUCCESS(rc) && i != 0;
+}
+
+
+/**
+ * Checks whether a CPU is possible, i.e. has a "cpuN" directory in sysfs.
+ *
+ * @returns true if the directory exists, false otherwise.
+ * @param   idCpu   The CPU ID.
+ */
+RTDECL(bool) RTMpIsCpuPossible(RTCPUID idCpu)
+{
+    /** @todo check this up with hotplugging! */
+    bool const fPresent = RTLinuxSysFsExists("devices/system/cpu/cpu%d", (int)idCpu);
+    return fPresent;
+}
+
+
+/**
+ * Fills a CPU set with all possible CPUs.
+ *
+ * @returns pSet.
+ * @param   pSet    The set to fill; emptied first.
+ */
+RTDECL(PRTCPUSET) RTMpGetSet(PRTCPUSET pSet)
+{
+    RTCpuSetEmpty(pSet);
+
+    RTCPUID const cCpusMax = rtMpLinuxMaxCpus();
+    RTCPUID       idCpu    = 0;
+    while (idCpu < cCpusMax)
+    {
+        if (RTMpIsCpuPossible(idCpu))
+            RTCpuSetAdd(pSet, idCpu);
+        idCpu++;
+    }
+    return pSet;
+}
+
+
+/**
+ * Returns the number of possible CPUs, i.e. the size of the RTMpGetSet() set.
+ */
+RTDECL(RTCPUID) RTMpGetCount(void)
+{
+    RTCPUSET PossibleSet;
+    return RTCpuSetCount(RTMpGetSet(&PossibleSet));
+}
+
+
+/**
+ * Counts the distinct (core_id, physical_package_id) pairs of all possible
+ * CPUs, as reported by the sysfs topology attributes.
+ *
+ * CPUs whose topology attributes cannot be read are ignored.
+ *
+ * @returns Number of distinct cores found.
+ */
+RTDECL(RTCPUID) RTMpGetCoreCount(void)
+{
+    RTCPUID   cMax      = rtMpLinuxMaxCpus();
+    uint32_t *paidCores = (uint32_t *)alloca(sizeof(paidCores[0]) * (cMax + 1));
+    uint32_t *paidPckgs = (uint32_t *)alloca(sizeof(paidPckgs[0]) * (cMax + 1));
+    uint32_t  cCores    = 0;
+    for (RTCPUID idCpu = 0; idCpu < cMax; idCpu++)
+    {
+        if (!RTMpIsCpuPossible(idCpu))
+            continue;
+
+        int64_t idCore = 0;
+        int64_t idPckg = 0;
+        int rc = RTLinuxSysFsReadIntFile(0, &idCore, "devices/system/cpu/cpu%d/topology/core_id", (int)idCpu);
+        if (RT_SUCCESS(rc))
+            rc = RTLinuxSysFsReadIntFile(0, &idPckg, "devices/system/cpu/cpu%d/topology/physical_package_id", (int)idCpu);
+        if (RT_FAILURE(rc))
+            continue;
+
+        /* Have we seen this core/package combination already? */
+        uint32_t const uCore = (uint32_t)idCore;
+        uint32_t const uPckg = (uint32_t)idPckg;
+        bool           fSeen = false;
+        for (uint32_t i = 0; i < cCores && !fSeen; i++)
+            fSeen = paidCores[i] == uCore && paidPckgs[i] == uPckg;
+        if (!fSeen)
+        {
+            paidCores[cCores] = uCore;
+            paidPckgs[cCores] = uPckg;
+            cCores++;
+        }
+    }
+    Assert(cCores > 0);
+    return cCores;
+}
+
+
+/**
+ * Fills a CPU set with all CPUs that are currently online.
+ *
+ * @returns pSet.
+ * @param   pSet    The set to fill; emptied first.
+ */
+RTDECL(PRTCPUSET) RTMpGetOnlineSet(PRTCPUSET pSet)
+{
+    RTCpuSetEmpty(pSet);
+
+    RTCPUID const cCpusMax = rtMpLinuxMaxCpus();
+    RTCPUID       idCpu    = 0;
+    while (idCpu < cCpusMax)
+    {
+        if (RTMpIsCpuOnline(idCpu))
+            RTCpuSetAdd(pSet, idCpu);
+        idCpu++;
+    }
+    return pSet;
+}
+
+
+/**
+ * Returns the number of online CPUs, i.e. the size of the RTMpGetOnlineSet()
+ * set.
+ */
+RTDECL(RTCPUID) RTMpGetOnlineCount(void)
+{
+    RTCPUSET OnlineSet;
+    return RTCpuSetCount(RTMpGetOnlineSet(&OnlineSet));
+}
+
+
+/**
+ * Counts the distinct (core_id, physical_package_id) pairs of all online
+ * CPUs, as reported by the sysfs topology attributes.
+ *
+ * The IDs are narrowed to 32 bits before comparing, exactly like
+ * RTMpGetCoreCount() does.  Comparing the stored 32-bit values against the
+ * raw 64-bit ones never matches for a negative ID (e.g. -1), which would
+ * count every such CPU as a core of its own.
+ *
+ * CPUs whose topology attributes cannot be read are ignored.
+ *
+ * @returns Number of distinct online cores found.
+ */
+RTDECL(RTCPUID) RTMpGetOnlineCoreCount(void)
+{
+    RTCPUID   cMax      = rtMpLinuxMaxCpus();
+    uint32_t *paidCores = (uint32_t *)alloca(sizeof(paidCores[0]) * (cMax + 1));
+    uint32_t *paidPckgs = (uint32_t *)alloca(sizeof(paidPckgs[0]) * (cMax + 1));
+    uint32_t  cCores    = 0;
+    for (RTCPUID idCpu = 0; idCpu < cMax; idCpu++)
+    {
+        if (RTMpIsCpuOnline(idCpu))
+        {
+            int64_t idCore = 0;
+            int64_t idPckg = 0;
+
+            int rc = RTLinuxSysFsReadIntFile(0, &idCore, "devices/system/cpu/cpu%d/topology/core_id", (int)idCpu);
+            if (RT_SUCCESS(rc))
+                rc = RTLinuxSysFsReadIntFile(0, &idPckg, "devices/system/cpu/cpu%d/topology/physical_package_id", (int)idCpu);
+
+            if (RT_SUCCESS(rc))
+            {
+                uint32_t i;
+
+                for (i = 0; i < cCores; i++)
+                    if (   paidCores[i] == (uint32_t)idCore
+                        && paidPckgs[i] == (uint32_t)idPckg)
+                        break;
+                if (i >= cCores)
+                {
+                    paidCores[cCores] = (uint32_t)idCore;
+                    paidPckgs[cCores] = (uint32_t)idPckg;
+                    cCores++;
+                }
+            }
+        }
+    }
+    Assert(cCores > 0);
+    return cCores;
+}
+
+
+
+/**
+ * Returns the current frequency of a CPU in MHz.
+ *
+ * @returns Frequency in MHz (kHz value rounded up), 0 if unknown.
+ * @param   idCpu   The CPU ID.
+ */
+RTDECL(uint32_t) RTMpGetCurFrequency(RTCPUID idCpu)
+{
+    int64_t kHz = 0;
+    if (RT_FAILURE(RTLinuxSysFsReadIntFile(0, &kHz, "devices/system/cpu/cpu%d/cpufreq/cpuinfo_cur_freq", (int)idCpu)))
+    {
+        /*
+         * The file may be just unreadable - in that case use plan B, i.e.
+         * /proc/cpuinfo to get the data we want. The assumption is that if
+         * cpuinfo_cur_freq doesn't exist then the speed won't change, and
+         * thus cur == max.  If it does exist then cpuinfo contains the
+         * current frequency.
+         */
+        kHz = rtMpLinuxGetFrequency(idCpu) * 1000;
+    }
+    return (kHz + 999) / 1000;
+}
+
+
+/**
+ * Returns the maximum frequency of a CPU in MHz.
+ *
+ * @returns Frequency in MHz (kHz value rounded up), 0 if unknown.
+ * @param   idCpu   The CPU ID.
+ */
+RTDECL(uint32_t) RTMpGetMaxFrequency(RTCPUID idCpu)
+{
+    int64_t kHz = 0;
+    int rc = RTLinuxSysFsReadIntFile(0, &kHz, "devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", (int)idCpu);
+    if (RT_FAILURE(rc))
+    {
+        /*
+         * Only fall back on /proc/cpuinfo if the file isn't there.  If it
+         * exists but cannot be read, /proc/cpuinfo would provide current
+         * frequency information, which is wrong.
+         */
+        if (RTLinuxSysFsExists("devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", (int)idCpu))
+            kHz = 0;
+        else
+            kHz = rtMpLinuxGetFrequency(idCpu) * 1000;
+    }
+    return (kHz + 999) / 1000;
+}
diff --git a/src/VBox/Runtime/r3/linux/rtProcInitExePath-linux.cpp b/src/VBox/Runtime/r3/linux/rtProcInitExePath-linux.cpp
new file mode 100644
index 00000000..bd3edc12
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/rtProcInitExePath-linux.cpp
@@ -0,0 +1,79 @@
+/* $Id: rtProcInitExePath-linux.cpp $ */
+/** @file
+ * IPRT - rtProcInitExePath, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_PROCESS
+#include <unistd.h>
+#include <errno.h>
+
+#include <iprt/string.h>
+#include <iprt/assert.h>
+#include <iprt/errcore.h>
+#include <iprt/path.h>
+#include "internal/process.h"
+#include "internal/path.h"
+
+
+/**
+ * Determines the path of the running executable from the /proc/self/exe link
+ * and converts it from the native encoding.
+ *
+ * @returns IPRT status code.
+ * @param   pszPath     Buffer receiving the zero terminated path.
+ * @param   cchPath     Size of the buffer; one byte is reserved for the
+ *                      terminator.
+ */
+DECLHIDDEN(int) rtProcInitExePath(char *pszPath, size_t cchPath)
+{
+    /*
+     * Read the /proc/self/exe link; anything but a positive length fitting
+     * the buffer is treated as a failure reported via errno.
+     */
+    int cchLink = readlink("/proc/self/exe", pszPath, cchPath - 1);
+    if (cchLink <= 0 || (size_t)cchLink > cchPath - 1)
+    {
+        int err = errno;
+        int rc = RTErrConvertFromErrno(err);
+        AssertMsgFailed(("rc=%Rrc err=%d cchLink=%d\n", rc, err, cchLink));
+        return rc;
+    }
+    pszPath[cchLink] = '\0'; /* readlink does not terminate the string */
+
+    /*
+     * Convert from native, copying the result back only if the conversion
+     * produced a new string.
+     */
+    char const *pszConverted;
+    int rc = rtPathFromNative(&pszConverted, pszPath, NULL);
+    AssertMsgRCReturn(rc, ("rc=%Rrc pszLink=\"%s\"\nhex: %.*Rhxs\n", rc, pszPath, cchLink, pszPath), rc);
+    if (pszConverted != pszPath)
+    {
+        rc = RTStrCopy(pszPath, cchPath, pszConverted);
+        rtPathFreeIprt(pszConverted, pszPath);
+    }
+    return rc;
+}
+
diff --git a/src/VBox/Runtime/r3/linux/sched-linux.cpp b/src/VBox/Runtime/r3/linux/sched-linux.cpp
new file mode 100644
index 00000000..0b7370e0
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/sched-linux.cpp
@@ -0,0 +1,707 @@
+/* $Id: sched-linux.cpp $ */
+/** @file
+ * IPRT - Scheduling, POSIX.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+/*
+ * !WARNING!
+ *
+ * When talking about lowering and raising priority, we do *NOT* refer to
+ * the common direction priority values take on unix systems (lower means
+ * higher). So, when we raise the priority of a linux thread the nice
+ * value will decrease, and when we lower the priority the nice value
+ * will increase. Confusing, right?
+ *
+ * !WARNING!
+ */
+
+
+
+/** @def THREAD_LOGGING
+ * Be very careful with enabling this, it may cause deadlocks when combined
+ * with the 'thread' logging prefix.
+ */
+#ifdef DOXYGEN_RUNNING
+# define THREAD_LOGGING
+#endif
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_THREAD
+#include <errno.h>
+#include <pthread.h>
+#include <limits.h>
+#include <sched.h>
+#include <unistd.h>
+#include <sys/resource.h>
+
+#include <iprt/thread.h>
+#include <iprt/process.h>
+#include <iprt/semaphore.h>
+#include <iprt/string.h>
+#include <iprt/assert.h>
+#include <iprt/log.h>
+#include <iprt/errcore.h>
+#include "internal/sched.h"
+#include "internal/thread.h"
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+
+/** Scheduler attributes of one thread type; used as element type of the
+ * per-thread-type arrays below, which have RTTHREADTYPE_END entries each.
+ * @internal */
+typedef struct PROCPRIORITYTYPE
+{
+ /** For sanity include the array index. */
+ RTTHREADTYPE enmType;
+ /** The thread priority or nice delta - depends on which priority type. */
+ int iPriority;
+} PROCPRIORITYTYPE;
+
+
+/**
+ * Configuration of one priority.
+ * @internal
+ */
+typedef struct
+{
+ /** The priority. */
+ RTPROCPRIORITY enmPriority;
+ /** The name of this priority. */
+ const char *pszName;
+ /** The process nice value. */
+ int iNice;
+ /** The delta applied to the iPriority value (presumably PROCPRIORITYTYPE::iPriority). */
+ int iDelta;
+ /** Array of scheduler attributes, one entry for each of the thread types. */
+ const PROCPRIORITYTYPE *paTypes;
+} PROCPRIORITY;
+
+
+/**
+ * Saved priority settings, filled in by rtSchedNativeSave() and applied
+ * again by rtSchedNativeRestore().
+ * @internal
+ */
+typedef struct
+{
+ /** Process priority as returned by getpriority(PRIO_PROCESS, 0). */
+ int iPriority;
+ /** Process level scheduling parameters (sched_getparam). */
+ struct sched_param SchedParam;
+ /** Process level scheduling policy (sched_getscheduler). */
+ int iPolicy;
+ /** pthread level scheduling parameters of the calling thread. */
+ struct sched_param PthreadSchedParam;
+ /** pthread level scheduling policy of the calling thread. */
+ int iPthreadPolicy;
+} SAVEDPRIORITY, *PSAVEDPRIORITY;
+
+
+/**
+ * Pair of priorities to be checked by a separate thread.
+ * @internal
+ */
+typedef struct
+{
+ /** The current thread priority to assume first. */
+ int iCurrent;
+ /** The thread priority to try to set afterwards. */
+ int iNew;
+} VALIDATORPRIORITYPAIR, *PVALIDATORPRIORITYPAIR;
+
+
+/*********************************************************************************************************************************
+* Global Variables *
+*********************************************************************************************************************************/
+/**
+ * Deltas for a process in which we are not restricted
+ * to only be lowering the priority.
+ *
+ * One entry per thread type.  Following the warning at the top of the file,
+ * negative deltas raise the priority (lower nice value) and positive deltas
+ * lower it.
+ */
+static const PROCPRIORITYTYPE g_aTypesLinuxFree[RTTHREADTYPE_END] =
+{
+ { RTTHREADTYPE_INVALID, -999999999 },
+ { RTTHREADTYPE_INFREQUENT_POLLER, +3 },
+ { RTTHREADTYPE_MAIN_HEAVY_WORKER, +2 },
+ { RTTHREADTYPE_EMULATION, +1 },
+ { RTTHREADTYPE_DEFAULT, 0 },
+ { RTTHREADTYPE_GUI, 0 },
+ { RTTHREADTYPE_MAIN_WORKER, 0 },
+ { RTTHREADTYPE_VRDP_IO, -1 },
+ { RTTHREADTYPE_DEBUGGER, -1 },
+ { RTTHREADTYPE_MSG_PUMP, -2 },
+ { RTTHREADTYPE_IO, -3 },
+ { RTTHREADTYPE_TIMER, -4 }
+};
+
+/**
+ * Deltas for a process in which we are restricted and can only lower the priority.
+ *
+ * Same as g_aTypesLinuxFree, except that all negative deltas are replaced by 0.
+ */
+static const PROCPRIORITYTYPE g_aTypesLinuxRestricted[RTTHREADTYPE_END] =
+{
+ { RTTHREADTYPE_INVALID, -999999999 },
+ { RTTHREADTYPE_INFREQUENT_POLLER, +3 },
+ { RTTHREADTYPE_MAIN_HEAVY_WORKER, +2 },
+ { RTTHREADTYPE_EMULATION, +1 },
+ { RTTHREADTYPE_DEFAULT, 0 },
+ { RTTHREADTYPE_GUI, 0 },
+ { RTTHREADTYPE_MAIN_WORKER, 0 },
+ { RTTHREADTYPE_VRDP_IO, 0 },
+ { RTTHREADTYPE_DEBUGGER, 0 },
+ { RTTHREADTYPE_MSG_PUMP, 0 },
+ { RTTHREADTYPE_IO, 0 },
+ { RTTHREADTYPE_TIMER, 0 }
+};
+
+/**
+ * All threads have the same priority, i.e. every valid thread type has a
+ * delta of 0.
+ *
+ * This is typically chosen when we find that we can't raise the priority
+ * to the process default of a thread created by a low priority thread.
+ */
+static const PROCPRIORITYTYPE g_aTypesLinuxFlat[RTTHREADTYPE_END] =
+{
+ { RTTHREADTYPE_INVALID, -999999999 },
+ { RTTHREADTYPE_INFREQUENT_POLLER, 0 },
+ { RTTHREADTYPE_MAIN_HEAVY_WORKER, 0 },
+ { RTTHREADTYPE_EMULATION, 0 },
+ { RTTHREADTYPE_DEFAULT, 0 },
+ { RTTHREADTYPE_GUI, 0 },
+ { RTTHREADTYPE_MAIN_WORKER, 0 },
+ { RTTHREADTYPE_VRDP_IO, 0 },
+ { RTTHREADTYPE_DEBUGGER, 0 },
+ { RTTHREADTYPE_MSG_PUMP, 0 },
+ { RTTHREADTYPE_IO, 0 },
+ { RTTHREADTYPE_TIMER, 0 }
+};
+
+/**
+ * Process and thread level priority, full access at thread level.
+ *
+ * Each row gives the priority class, its name, the process nice value, the
+ * delta and the per-thread-type table to use.  Several alternative rows
+ * exist per class; how a row gets selected is not visible in this part of
+ * the file - presumably the first workable one wins, TODO confirm.
+ */
+static const PROCPRIORITY g_aUnixConfigs[] =
+{
+ { RTPROCPRIORITY_FLAT, "Flat", 0, 0, g_aTypesLinuxFlat },
+ { RTPROCPRIORITY_LOW, "Low", 9, 9, g_aTypesLinuxFree },
+ { RTPROCPRIORITY_LOW, "Low", 9, 9, g_aTypesLinuxFlat },
+ { RTPROCPRIORITY_LOW, "Low", 15, 15, g_aTypesLinuxFree },
+ { RTPROCPRIORITY_LOW, "Low", 15, 15, g_aTypesLinuxFlat },
+ { RTPROCPRIORITY_LOW, "Low", 17, 17, g_aTypesLinuxFree },
+ { RTPROCPRIORITY_LOW, "Low", 17, 17, g_aTypesLinuxFlat },
+ { RTPROCPRIORITY_LOW, "Low", 19, 19, g_aTypesLinuxFlat },
+ { RTPROCPRIORITY_LOW, "Low", 9, 9, g_aTypesLinuxRestricted },
+ { RTPROCPRIORITY_LOW, "Low", 15, 15, g_aTypesLinuxRestricted },
+ { RTPROCPRIORITY_LOW, "Low", 17, 17, g_aTypesLinuxRestricted },
+ { RTPROCPRIORITY_NORMAL, "Normal", 0, 0, g_aTypesLinuxFree },
+ { RTPROCPRIORITY_NORMAL, "Normal", 0, 0, g_aTypesLinuxRestricted },
+ { RTPROCPRIORITY_NORMAL, "Normal", 0, 0, g_aTypesLinuxFlat },
+ { RTPROCPRIORITY_HIGH, "High", -9, -9, g_aTypesLinuxFree },
+ { RTPROCPRIORITY_HIGH, "High", -7, -7, g_aTypesLinuxFree },
+ { RTPROCPRIORITY_HIGH, "High", -5, -5, g_aTypesLinuxFree },
+ { RTPROCPRIORITY_HIGH, "High", -3, -3, g_aTypesLinuxFree },
+ { RTPROCPRIORITY_HIGH, "High", -1, -1, g_aTypesLinuxFree },
+ { RTPROCPRIORITY_HIGH, "High", -9, -9, g_aTypesLinuxRestricted },
+ { RTPROCPRIORITY_HIGH, "High", -7, -7, g_aTypesLinuxRestricted },
+ { RTPROCPRIORITY_HIGH, "High", -5, -5, g_aTypesLinuxRestricted },
+ { RTPROCPRIORITY_HIGH, "High", -3, -3, g_aTypesLinuxRestricted },
+ { RTPROCPRIORITY_HIGH, "High", -1, -1, g_aTypesLinuxRestricted },
+ { RTPROCPRIORITY_HIGH, "High", -9, -9, g_aTypesLinuxFlat },
+ { RTPROCPRIORITY_HIGH, "High", -7, -7, g_aTypesLinuxFlat },
+ { RTPROCPRIORITY_HIGH, "High", -5, -5, g_aTypesLinuxFlat },
+ { RTPROCPRIORITY_HIGH, "High", -3, -3, g_aTypesLinuxFlat },
+ { RTPROCPRIORITY_HIGH, "High", -1, -1, g_aTypesLinuxFlat }
+};
+
+/**
+ * The dynamic default priority configuration.
+ *
+ * This will be recalculated at runtime depending on what the
+ * system allows us to do and what the current priority is.
+ */
+static PROCPRIORITY g_aDefaultPriority =
+{
+ RTPROCPRIORITY_LOW, "Default", 0, 0, g_aTypesLinuxRestricted
+};
+
+/** Pointer to the current priority configuration (initially the default one). */
+static const PROCPRIORITY *g_pProcessPriority = &g_aDefaultPriority;
+
+/** Set if we can raise the priority of a thread beyond the default.
+ *
+ * It might mean we have the CAP_SYS_NICE capability or that the
+ * process's RLIMIT_NICE is higher than the priority of the thread
+ * calculating the defaults.
+ */
+static bool g_fCanRaisePriority = false;
+
+/** Set if we can restore the priority after having temporarily lowered or raised it. */
+static bool g_fCanRestorePriority = false;
+
+/** Set if we can NOT raise the priority to the process default in a thread
+ * created by a thread running below the process default.
+ */
+static bool g_fScrewedUpMaxPriorityLimitInheritance = true;
+
+/** The highest priority we can set. */
+static int g_iMaxPriority = 0;
+
+/** The lowest priority we can set. */
+static int g_iMinPriority = 19;
+
+/** Set when we've successfully determined the capabilities of the process and kernel. */
+static bool g_fInitialized = false;
+
+
+
+/*********************************************************************************************************************************
+* Internal Functions *
+*********************************************************************************************************************************/
+
+
+/**
+ * Saves all the scheduling attributes we can think of.
+ *
+ * The record is first filled with 0xff bytes and then receives the results
+ * of getpriority(), sched_getparam(), sched_getscheduler() and
+ * pthread_getschedparam() for the caller.  Failures are only asserted.
+ *
+ * @param pSave Where to store the attributes.
+ */
+static void rtSchedNativeSave(PSAVEDPRIORITY pSave)
+{
+ memset(pSave, 0xff, sizeof(*pSave));
+
+ errno = 0; /* getpriority() may legitimately return -1, so errno is the failure indicator. */
+ pSave->iPriority = getpriority(PRIO_PROCESS, 0 /* current process */);
+ Assert(errno == 0);
+
+ errno = 0;
+ sched_getparam(0 /* current process */, &pSave->SchedParam);
+ Assert(errno == 0);
+
+ errno = 0;
+ pSave->iPolicy = sched_getscheduler(0 /* current process */);
+ Assert(errno == 0);
+
+ int rc = pthread_getschedparam(pthread_self(), &pSave->iPthreadPolicy, &pSave->PthreadSchedParam);
+ Assert(rc == 0); NOREF(rc); /* pthread calls return the error number directly. */
+}
+
+
+/**
+ * Restores scheduling attributes.
+ * Most of this won't work right, but anyway...
+ *
+ * All return values are ignored, i.e. this is a best-effort operation.
+ *
+ * @param pSave The attributes to restore, as captured by rtSchedNativeSave().
+ */
+static void rtSchedNativeRestore(PSAVEDPRIORITY pSave)
+{
+ setpriority(PRIO_PROCESS, 0, pSave->iPriority);
+ sched_setscheduler(0, pSave->iPolicy, &pSave->SchedParam);
+ sched_setparam(0, &pSave->SchedParam);
+ pthread_setschedparam(pthread_self(), pSave->iPthreadPolicy, &pSave->PthreadSchedParam);
+}
+
+
+/**
+ * Creates the worker thread using default pthread attributes.
+ *
+ * Runs on the priority proxy thread when that was requested and is running,
+ * otherwise rtSchedRunThread() invokes it directly.
+ *
+ * @returns VINF_SUCCESS, or the IPRT status converted from the
+ *          pthread_create() error number.
+ * @param   pThread     Where to return the handle of the new thread.
+ * @param   pfnThread   The thread function.
+ * @param   pvArg       The argument handed to @a pfnThread.
+ */
+static DECLCALLBACK(int) rtSchedRunThreadCallback(pthread_t *pThread, void *(*pfnThread)(void *pvArg), void *pvArg)
+{
+    int const iErr = pthread_create(pThread, NULL, pfnThread, pvArg);
+    return iErr == 0 ? VINF_SUCCESS : RTErrConvertFromErrno(iErr);
+}
+
+
+/**
+ * Starts a worker thread and waits for it to complete.
+ *
+ * We cannot use RTThreadCreate since we're already owner of the RW lock.
+ *
+ * @returns The creation failure status, the converted pthread_join() error,
+ *          or otherwise the value returned by the thread function cast to int.
+ * @param   pfnThread           The thread function.
+ * @param   pvArg               The argument handed to @a pfnThread.
+ * @param   fUsePriorityProxy   Whether to create the thread via the priority
+ *                              proxy thread (only when that support is
+ *                              compiled in and the proxy can be started).
+ */
+static int rtSchedRunThread(void *(*pfnThread)(void *pvArg), void *pvArg, bool fUsePriorityProxy)
+{
+    /*
+     * Spawn the worker, via the proxy if requested and available.
+     */
+    pthread_t hWorker;
+    int       rcCreate;
+#ifndef RTTHREAD_POSIX_WITH_CREATE_PRIORITY_PROXY
+    RT_NOREF(fUsePriorityProxy);
+#else
+    if (   fUsePriorityProxy
+        && rtThreadPosixPriorityProxyStart())
+        rcCreate = rtThreadPosixPriorityProxyCall(NULL, (PFNRT)rtSchedRunThreadCallback, 3, &hWorker, pfnThread, pvArg);
+    else
+#endif
+        rcCreate = rtSchedRunThreadCallback(&hWorker, pfnThread, pvArg);
+    if (!RT_SUCCESS(rcCreate))
+        return rcCreate;
+
+    /*
+     * Join it, retrying if interrupted, and hand back its result.
+     */
+    void *pvRet = (void *)-1;
+    int   iErr;
+    do
+        iErr = pthread_join(hWorker, &pvRet);
+    while (iErr == EINTR);
+    if (iErr != 0)
+        return RTErrConvertFromErrno(iErr);
+    return (int)(uintptr_t)pvRet;
+}
+
+
+/**
+ * Logs the probed priority capabilities and the currently selected process
+ * priority configuration, including the priority of each thread type.
+ *
+ * Does nothing unless THREAD_LOGGING is defined.
+ */
+static void rtSchedDumpPriority(void)
+{
+#ifdef THREAD_LOGGING
+ Log(("Priority: g_fCanRaisePriority=%RTbool g_fCanRestorePriority=%RTbool g_fScrewedUpMaxPriorityLimitInheritance=%RTbool\n",
+ g_fCanRaisePriority, g_fCanRestorePriority, g_fScrewedUpMaxPriorityLimitInheritance));
+ Log(("Priority: g_iMaxPriority=%d g_iMinPriority=%d\n", g_iMaxPriority, g_iMinPriority));
+ Log(("Priority: enmPriority=%d \"%s\" iNice=%d iDelta=%d\n",
+ g_pProcessPriority->enmPriority,
+ g_pProcessPriority->pszName,
+ g_pProcessPriority->iNice,
+ g_pProcessPriority->iDelta));
+ Log(("Priority: %2d INFREQUENT_POLLER = %d\n", RTTHREADTYPE_INFREQUENT_POLLER, g_pProcessPriority->paTypes[RTTHREADTYPE_INFREQUENT_POLLER].iPriority));
+ Log(("Priority: %2d MAIN_HEAVY_WORKER = %d\n", RTTHREADTYPE_MAIN_HEAVY_WORKER, g_pProcessPriority->paTypes[RTTHREADTYPE_MAIN_HEAVY_WORKER].iPriority));
+ Log(("Priority: %2d EMULATION = %d\n", RTTHREADTYPE_EMULATION , g_pProcessPriority->paTypes[RTTHREADTYPE_EMULATION ].iPriority));
+ Log(("Priority: %2d DEFAULT = %d\n", RTTHREADTYPE_DEFAULT , g_pProcessPriority->paTypes[RTTHREADTYPE_DEFAULT ].iPriority));
+ Log(("Priority: %2d GUI = %d\n", RTTHREADTYPE_GUI , g_pProcessPriority->paTypes[RTTHREADTYPE_GUI ].iPriority));
+ Log(("Priority: %2d MAIN_WORKER = %d\n", RTTHREADTYPE_MAIN_WORKER , g_pProcessPriority->paTypes[RTTHREADTYPE_MAIN_WORKER ].iPriority));
+ Log(("Priority: %2d VRDP_IO = %d\n", RTTHREADTYPE_VRDP_IO , g_pProcessPriority->paTypes[RTTHREADTYPE_VRDP_IO ].iPriority));
+ Log(("Priority: %2d DEBUGGER = %d\n", RTTHREADTYPE_DEBUGGER , g_pProcessPriority->paTypes[RTTHREADTYPE_DEBUGGER ].iPriority));
+ Log(("Priority: %2d MSG_PUMP = %d\n", RTTHREADTYPE_MSG_PUMP , g_pProcessPriority->paTypes[RTTHREADTYPE_MSG_PUMP ].iPriority));
+ Log(("Priority: %2d IO = %d\n", RTTHREADTYPE_IO , g_pProcessPriority->paTypes[RTTHREADTYPE_IO ].iPriority));
+ Log(("Priority: %2d TIMER = %d\n", RTTHREADTYPE_TIMER , g_pProcessPriority->paTypes[RTTHREADTYPE_TIMER ].iPriority));
+#endif
+}
+
+
+/**
+ * Sub-prober: checks whether a thread created by a low priority parent is
+ * able to raise its own priority again.
+ *
+ * @returns zero on success, non-zero on failure.
+ * @param   pvUser  The priority of the parent before it was lowered (cast to int).
+ */
+static void *rtSchedNativeSubProberThread(void *pvUser)
+{
+    int const iParentPriority = (int)(intptr_t)pvUser;
+    int const iInherited      = getpriority(PRIO_PROCESS, 0);
+    Assert(iInherited == g_iMinPriority);
+
+    /* First nudge the value by one, then go for the parent's original value. */
+    if (   setpriority(PRIO_PROCESS, 0, iInherited + 1) != 0
+        || setpriority(PRIO_PROCESS, 0, iParentPriority) != 0)
+        return (void *)-1;
+    return (void *)0;
+}
+
+
+/**
+ * The prober thread.
+ * We don't want to mess with the priority of the calling thread.
+ *
+ * Fills in g_iMaxPriority, g_iMinPriority, g_fCanRaisePriority,
+ * g_fCanRestorePriority and g_fScrewedUpMaxPriorityLimitInheritance.
+ *
+ * @returns VINF_SUCCESS cast to a pointer, always.
+ * @param pvUser Ignored.
+ *
+ * @remark This is pretty presumptive stuff, but if it works on Linux and
+ * FreeBSD it does what I want.
+ */
+static void *rtSchedNativeProberThread(void *pvUser)
+{
+ NOREF(pvUser);
+ SAVEDPRIORITY SavedPriority;
+ rtSchedNativeSave(&SavedPriority);
+
+ /*
+ * Check if we can get higher priority (typically only root can do this).
+ * (Won't work right if we're already at the top of the range to start
+ * with, but what the heck.)
+ *
+ * The loops below assume that nice values run from -20 (highest priority)
+ * to 19 (lowest priority). Should probably find the right define for this.
+ */
+ int iStart = getpriority(PRIO_PROCESS, 0);
+ int i = iStart;
+ /* Step the nice value down one at a time until setpriority() refuses. */
+ while (i-- > -20)
+ if (setpriority(PRIO_PROCESS, 0, i))
+ break;
+ g_iMaxPriority = getpriority(PRIO_PROCESS, 0);
+ g_fCanRaisePriority = g_iMaxPriority < iStart;
+ g_fCanRestorePriority = setpriority(PRIO_PROCESS, 0, iStart) == 0;
+
+ /*
+ * Check if we temporarily lower the thread priority.
+ * Again, we assume we're not at the extreme end of the priority scale.
+ */
+ iStart = getpriority(PRIO_PROCESS, 0);
+ i = iStart;
+ /* Step the nice value up one at a time until setpriority() refuses. */
+ while (i++ < 19)
+ if (setpriority(PRIO_PROCESS, 0, i))
+ break;
+ g_iMinPriority = getpriority(PRIO_PROCESS, 0);
+ /* Restoring only counts if the call succeeds and the value really is back. */
+ if ( setpriority(PRIO_PROCESS, 0, iStart)
+ || getpriority(PRIO_PROCESS, 0) != iStart)
+ g_fCanRestorePriority = false;
+ /* No usable range at all: neither raising nor restoring makes sense. */
+ if (g_iMinPriority == g_iMaxPriority)
+ g_fCanRestorePriority = g_fCanRaisePriority = false;
+
+ /*
+ * Check what happens to child threads when the parent lowers the
+ * priority when it's being created.
+ */
+ iStart = getpriority(PRIO_PROCESS, 0);
+ g_fScrewedUpMaxPriorityLimitInheritance = true;
+ if ( g_fCanRestorePriority
+ && !setpriority(PRIO_PROCESS, 0, g_iMinPriority)
+ && iStart != g_iMinPriority)
+ {
+ /* The sub-prober returns 0 if it could raise itself back to iStart. */
+ if (rtSchedRunThread(rtSchedNativeSubProberThread, (void *)(intptr_t)iStart, false /*fUsePriorityProxy*/) == 0)
+ g_fScrewedUpMaxPriorityLimitInheritance = false;
+ }
+
+ /* done */
+ rtSchedNativeRestore(&SavedPriority);
+ return (void *)VINF_SUCCESS;
+}
+
+
+/**
+ * Calculate the scheduling properties for all the threads in the default
+ * process priority, assuming the current thread has the type enmType.
+ *
+ * On the first call this runs the prober thread to find out what the process
+ * is allowed to do; the result is cached via g_fInitialized.
+ *
+ * @returns iprt status code.
+ * @param enmType The thread type to be assumed for the current thread.
+ */
+DECLHIDDEN(int) rtSchedNativeCalcDefaultPriority(RTTHREADTYPE enmType)
+{
+ Assert(enmType > RTTHREADTYPE_INVALID && enmType < RTTHREADTYPE_END);
+
+ /*
+ * First figure out what we're allowed to do in this process.
+ */
+ if (!g_fInitialized)
+ {
+ int iPriority = getpriority(PRIO_PROCESS, 0);
+#ifdef RLIMIT_RTPRIO
+ /** @todo */
+#endif
+ int rc = rtSchedRunThread(rtSchedNativeProberThread, NULL, false /*fUsePriorityProxy*/);
+ if (RT_FAILURE(rc))
+ return rc;
+ /* The prober runs on its own thread, so our priority must be unchanged. */
+ Assert(getpriority(PRIO_PROCESS, 0) == iPriority); NOREF(iPriority);
+ g_fInitialized = true;
+ }
+
+ /*
+ * Select the right priority type table and update the default
+ * process priority structure.
+ */
+ if (g_fCanRaisePriority && g_fCanRestorePriority && !g_fScrewedUpMaxPriorityLimitInheritance)
+ g_aDefaultPriority.paTypes = &g_aTypesLinuxFree[0];
+ else if (!g_fCanRaisePriority && g_fCanRestorePriority && !g_fScrewedUpMaxPriorityLimitInheritance)
+ g_aDefaultPriority.paTypes = &g_aTypesLinuxRestricted[0];
+ else
+ g_aDefaultPriority.paTypes = &g_aTypesLinuxFlat[0];
+ Assert(enmType == g_aDefaultPriority.paTypes[enmType].enmType);
+
+ /* Offset the table so that the entry for enmType maps onto the current nice value. */
+ int iPriority = getpriority(PRIO_PROCESS, 0 /* current process */);
+ g_aDefaultPriority.iNice = iPriority - g_aDefaultPriority.paTypes[enmType].iPriority;
+ g_aDefaultPriority.iDelta = g_aDefaultPriority.iNice;
+
+ rtSchedDumpPriority();
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * The process priority validator thread.
+ * (Runs on its own thread so the caller's priority is left alone.)
+ *
+ * @returns IPRT status code cast to a pointer.
+ * @param   pvUser  Pointer to the VALIDATORPRIORITYPAIR to try out.
+ */
+static void *rtSchedNativeValidatorThread(void *pvUser)
+{
+    PVALIDATORPRIORITYPAIR pPair = (PVALIDATORPRIORITYPAIR)pvUser;
+    SAVEDPRIORITY          Saved;
+    rtSchedNativeSave(&Saved);
+
+    /*
+     * Step 1: Assume the current priority of the thread type, unless the
+     *         caller flagged (INT_MAX) that there are no such threads.
+     */
+    int rcRet = VINF_SUCCESS;
+    if (   pPair->iCurrent != INT_MAX
+        && setpriority(PRIO_PROCESS, 0, pPair->iCurrent) != 0)
+        rcRet = RTErrConvertFromErrno(errno);
+
+    /*
+     * Step 2: Try switching to the new priority.
+     */
+    if (RT_SUCCESS(rcRet))
+    {
+        if (setpriority(PRIO_PROCESS, 0, pPair->iNew) != 0)
+            rcRet = RTErrConvertFromErrno(errno);
+    }
+
+    /* Put things back the way they were. */
+    rtSchedNativeRestore(&Saved);
+    return (void *)(intptr_t)rcRet;
+}
+
+
+/**
+ * Validates the ability to apply suggested priority scheme.
+ *
+ * Every thread type is tried out on a temporary validator thread, walking
+ * from the last type down to the first.
+ *
+ * @returns iprt status code.
+ * @param   pCfg                The priority scheme to validate.
+ * @param   fHavePriorityProxy  Set if we've got a priority proxy thread,
+ *                              otherwise clear.
+ */
+static int rtSchedNativeCheckThreadTypes(const PROCPRIORITY *pCfg, bool fHavePriorityProxy)
+{
+    for (int iType = RTTHREADTYPE_END - 1; iType > RTTHREADTYPE_INVALID; iType--)
+    {
+        /* INT_MAX as current priority tells the validator there are no threads of this type. */
+        VALIDATORPRIORITYPAIR Pair;
+        Pair.iCurrent = g_acRTThreadTypeStats[iType] == 0
+                      ? INT_MAX
+                      : g_pProcessPriority->paTypes[iType].iPriority + g_pProcessPriority->iDelta;
+        Pair.iNew     = pCfg->paTypes[iType].iPriority + pCfg->iDelta;
+
+#ifdef RT_STRICT
+        int const iPriority = getpriority(PRIO_PROCESS, 0);
+#endif
+        int const rcCheck = rtSchedRunThread(rtSchedNativeValidatorThread, &Pair, fHavePriorityProxy /*fUsePriorityProxy*/);
+        Assert(getpriority(PRIO_PROCESS, 0) == iPriority);
+
+        if (RT_FAILURE(rcCheck))
+            return rcCheck;
+    }
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Validates and selects the process priority configuration.
+ *
+ * On success g_pProcessPriority points to the chosen configuration.
+ *
+ * @returns iprt status code. VERR_NOT_FOUND if no entry in g_aUnixConfigs
+ * matches @a enmPriority, otherwise the validation status.
+ * @param enmPriority The process priority to switch to.
+ */
+DECLHIDDEN(int) rtProcNativeSetPriority(RTPROCPRIORITY enmPriority)
+{
+ Assert(enmPriority > RTPROCPRIORITY_INVALID && enmPriority < RTPROCPRIORITY_LAST);
+
+#ifdef RTTHREAD_POSIX_WITH_CREATE_PRIORITY_PROXY
+ /*
+ * Make sure the proxy creation thread is started so we don't 'lose' our
+ * initial priority if it's lowered.
+ */
+ bool const fHavePriorityProxy = rtThreadPosixPriorityProxyStart();
+#else
+ bool const fHavePriorityProxy = false;
+#endif
+
+ int rc;
+ if (enmPriority == RTPROCPRIORITY_DEFAULT)
+ {
+ /*
+ * If we've lowered priority since the process started, it may be impossible
+ * to raise it again for existing thread (new threads will work fine).
+ */
+ rc = rtSchedNativeCheckThreadTypes(&g_aDefaultPriority, fHavePriorityProxy);
+ if (RT_SUCCESS(rc))
+ g_pProcessPriority = &g_aDefaultPriority;
+ }
+ else
+ {
+ /*
+ * Find a configuration which matches and can be applied.
+ */
+ rc = VERR_NOT_FOUND;
+ for (unsigned i = 0; i < RT_ELEMENTS(g_aUnixConfigs); i++)
+ if (g_aUnixConfigs[i].enmPriority == enmPriority)
+ {
+ int rc2 = rtSchedNativeCheckThreadTypes(&g_aUnixConfigs[i], fHavePriorityProxy);
+ if (RT_SUCCESS(rc2))
+ {
+ g_pProcessPriority = &g_aUnixConfigs[i];
+ rc = VINF_SUCCESS;
+ break;
+ }
+ /* Remember the failure, but only replace "not found" or "access denied". */
+ if (rc == VERR_NOT_FOUND || rc == VERR_ACCESS_DENIED)
+ rc = rc2;
+ }
+ }
+
+#ifdef THREAD_LOGGING
+ LogFlow(("rtProcNativeSetPriority: returns %Rrc enmPriority=%d\n", rc, enmPriority));
+ rtSchedDumpPriority();
+#endif
+ return rc;
+}
+
+
+/**
+ * Applies a nice value to a thread.
+ *
+ * Called on the priority proxy thread if it's running, otherwise
+ * rtThreadNativeSetPriority calls it directly.
+ *
+ * @returns VINF_SUCCESS, always. A setpriority() failure only triggers an
+ *          assertion and is otherwise treated as non-fatal.
+ * @param   pThread     The thread whose tid is passed to setpriority().
+ * @param   iPriority   The nice value to apply.
+ */
+static DECLCALLBACK(int) rtThreadLinuxSetPriorityCallback(PRTTHREADINT pThread, int iPriority)
+{
+    if (!setpriority(PRIO_PROCESS, pThread->tid, iPriority))
+    {
+        AssertMsg(iPriority == getpriority(PRIO_PROCESS, pThread->tid),
+                  ("iPriority=%d getpriority()=%d\n", iPriority, getpriority(PRIO_PROCESS, pThread->tid)));
+#ifdef THREAD_LOGGING
+        /* The thread type is not a parameter of this callback, so it cannot be
+           logged here (referencing it broke THREAD_LOGGING builds). */
+        Log(("rtThreadNativeSetPriority: Thread=%p iPriority=%d pid=%d tid=%d\n",
+             pThread->Core.Key, iPriority, getpid(), pThread->tid));
+#endif
+        return VINF_SUCCESS;
+    }
+    AssertMsgFailed(("setpriority(,, %d) -> errno=%d rc=%Rrc\n", iPriority, errno, RTErrConvertFromErrno(errno)));
+    return VINF_SUCCESS; //non-fatal for now.
+}
+
+
+/**
+ * Applies the priority of the given thread type, as defined by the current
+ * process priority configuration, to a thread.
+ *
+ * @returns VINF_SUCCESS for threads with a zero tid, otherwise the status of
+ *          the proxy call or of rtThreadLinuxSetPriorityCallback().
+ * @param   pThread     The thread.
+ * @param   enmType     The thread type to apply.
+ */
+DECLHIDDEN(int) rtThreadNativeSetPriority(PRTTHREADINT pThread, RTTHREADTYPE enmType)
+{
+    /* sanity */
+    Assert(enmType > RTTHREADTYPE_INVALID && enmType < RTTHREADTYPE_END);
+    Assert(enmType == g_pProcessPriority->paTypes[enmType].enmType);
+
+    /*
+     * Alien threads have a zero thread ID; passing that on would risk
+     * changing our own priority, so leave them alone.
+     */
+    if (pThread->tid == 0)
+        return VINF_SUCCESS;
+
+    /*
+     * Work out the nice value and apply it, preferably via the priority proxy thread.
+     */
+    int const iNice = g_pProcessPriority->paTypes[enmType].iPriority + g_pProcessPriority->iDelta;
+#ifdef RTTHREAD_POSIX_WITH_CREATE_PRIORITY_PROXY
+    if (rtThreadPosixPriorityProxyStart())
+        return rtThreadPosixPriorityProxyCall(pThread, (PFNRT)rtThreadLinuxSetPriorityCallback, 2, pThread, iNice);
+#endif
+    return rtThreadLinuxSetPriorityCallback(pThread, iNice);
+}
+
diff --git a/src/VBox/Runtime/r3/linux/semevent-linux.cpp b/src/VBox/Runtime/r3/linux/semevent-linux.cpp
new file mode 100644
index 00000000..c3a973c1
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/semevent-linux.cpp
@@ -0,0 +1,607 @@
+/* $Id: semevent-linux.cpp $ */
+/** @file
+ * IPRT - Event Semaphore, Linux (2.6.0 and later).
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+#include <features.h>
+#if __GLIBC_PREREQ(2,6) && !defined(IPRT_WITH_FUTEX_BASED_SEMS)
+
+/*
+ * glibc 2.6 fixed a serious bug in the mutex implementation. We wrote this
+ * linux specific event semaphores code in order to work around the bug. We
+ * will fall back on the pthread-based implementation if glibc is known to
+ * contain the bug fix.
+ *
+ * The external reference to epoll_pwait is a hack which prevents that we link
+ * against glibc < 2.6.
+ */
+# include "../posix/semevent-posix.cpp"
+__asm__ (".global epoll_pwait");
+
+#else /* glibc < 2.6 || IPRT_WITH_FUTEX_BASED_SEMS */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include <iprt/semaphore.h>
+#include "internal/iprt.h"
+
+#include <iprt/asm.h>
+#include <iprt/assert.h>
+#include <iprt/err.h>
+#include <iprt/lockvalidator.h>
+#include <iprt/mem.h>
+#include <iprt/time.h>
+#include "internal/magics.h"
+#include "internal/mem.h"
+#include "internal/strict.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+
+#include "semwait-linux.h"
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+/**
+ * Linux (single wakeup) event semaphore.
+ */
+struct RTSEMEVENTINTERNAL
+{
+ /** Magic value (RTSEMEVENT_MAGIC while the semaphore is valid). */
+ intptr_t volatile iMagic;
+ /** The futex state variable.
+ * 0 means not signalled.
+ * 1 means signalled. */
+ uint32_t volatile fSignalled;
+ /** The number of waiting threads */
+ int32_t volatile cWaiters;
+#ifdef RTSEMEVENT_STRICT
+ /** Signallers. */
+ RTLOCKVALRECSHRD Signallers;
+ /** Indicates that lock validation should be performed. */
+ bool volatile fEverHadSignallers;
+#endif
+ /** The creation flags. */
+ uint32_t fFlags;
+};
+
+
+/*********************************************************************************************************************************
+* Global Variables *
+*********************************************************************************************************************************/
+/** Whether we can use FUTEX_WAIT_BITSET.
+ * Starts out as -1 and is updated by rtSemLinuxCheckForFutexWaitBitSet()
+ * (presumably -1 means "not probed yet" - confirm in semwait-linux.h). */
+static int volatile g_fCanUseWaitBitSet = -1;
+
+
+
+
+/**
+ * Creates an event semaphore with no flags, no lock validator class and an
+ * anonymous name.
+ *
+ * @returns Whatever RTSemEventCreateEx() returns.
+ * @param   phEventSem  Where to return the semaphore handle.
+ */
+RTDECL(int) RTSemEventCreate(PRTSEMEVENT phEventSem)
+{
+    uint32_t const fNoFlags = 0;
+    return RTSemEventCreateEx(phEventSem, fNoFlags, NIL_RTLOCKVALCLASS, NULL);
+}
+
+
+/**
+ * Creates an event semaphore.
+ *
+ * @returns VINF_SUCCESS, VERR_NO_MEMORY if the allocation fails, or
+ * VERR_INVALID_PARAMETER if unknown flags are given.
+ * @param phEventSem Where to return the semaphore handle.
+ * @param fFlags RTSEMEVENT_FLAGS_NO_LOCK_VAL and/or
+ * RTSEMEVENT_FLAGS_BOOTSTRAP_HACK. The latter selects
+ * rtMemBaseAlloc() instead of RTMemAlloc().
+ * @param hClass The lock validator class (strict builds only).
+ * @param pszNameFmt Name format string for the lock validator, NULL for
+ * an automatically numbered name (strict builds only).
+ * @param ... Format arguments.
+ */
+RTDECL(int) RTSemEventCreateEx(PRTSEMEVENT phEventSem, uint32_t fFlags, RTLOCKVALCLASS hClass, const char *pszNameFmt, ...)
+{
+ AssertReturn(!(fFlags & ~(RTSEMEVENT_FLAGS_NO_LOCK_VAL | RTSEMEVENT_FLAGS_BOOTSTRAP_HACK)), VERR_INVALID_PARAMETER);
+ Assert(!(fFlags & RTSEMEVENT_FLAGS_BOOTSTRAP_HACK) || (fFlags & RTSEMEVENT_FLAGS_NO_LOCK_VAL));
+
+ /*
+ * Make sure we know whether FUTEX_WAIT_BITSET works.
+ */
+ rtSemLinuxCheckForFutexWaitBitSet(&g_fCanUseWaitBitSet);
+#if defined(DEBUG_bird) && !defined(IN_GUEST)
+ Assert(g_fCanUseWaitBitSet == true);
+#endif
+
+ /*
+ * Allocate semaphore handle.
+ */
+ struct RTSEMEVENTINTERNAL *pThis;
+ if (!(fFlags & RTSEMEVENT_FLAGS_BOOTSTRAP_HACK))
+ pThis = (struct RTSEMEVENTINTERNAL *)RTMemAlloc(sizeof(struct RTSEMEVENTINTERNAL));
+ else
+ pThis = (struct RTSEMEVENTINTERNAL *)rtMemBaseAlloc(sizeof(struct RTSEMEVENTINTERNAL));
+ if (pThis)
+ {
+ /* Start out unsignalled with no waiters; fFlags is kept so destroy can pick the matching free routine. */
+ pThis->iMagic = RTSEMEVENT_MAGIC;
+ pThis->cWaiters = 0;
+ pThis->fSignalled = 0;
+ pThis->fFlags = fFlags;
+#ifdef RTSEMEVENT_STRICT
+ if (!pszNameFmt)
+ {
+ /* Anonymous semaphores get a running number as name. */
+ static uint32_t volatile s_iSemEventAnon = 0;
+ RTLockValidatorRecSharedInit(&pThis->Signallers, hClass, RTLOCKVAL_SUB_CLASS_ANY, pThis,
+ true /*fSignaller*/, !(fFlags & RTSEMEVENT_FLAGS_NO_LOCK_VAL),
+ "RTSemEvent-%u", ASMAtomicIncU32(&s_iSemEventAnon) - 1);
+ }
+ else
+ {
+ va_list va;
+ va_start(va, pszNameFmt);
+ RTLockValidatorRecSharedInitV(&pThis->Signallers, hClass, RTLOCKVAL_SUB_CLASS_ANY, pThis,
+ true /*fSignaller*/, !(fFlags & RTSEMEVENT_FLAGS_NO_LOCK_VAL),
+ pszNameFmt, va);
+ va_end(va);
+ }
+ pThis->fEverHadSignallers = false;
+#else
+ RT_NOREF(hClass, pszNameFmt);
+#endif
+
+ *phEventSem = pThis;
+ return VINF_SUCCESS;
+ }
+ return VERR_NO_MEMORY;
+}
+
+
+/**
+ * Destroys an event semaphore, waking up any waiters first.
+ *
+ * @returns VINF_SUCCESS (also for NIL_RTSEMEVENT) or VERR_INVALID_HANDLE.
+ * @param hEventSem The semaphore to destroy. NIL is quietly ignored.
+ */
+RTDECL(int) RTSemEventDestroy(RTSEMEVENT hEventSem)
+{
+ /*
+ * Validate input.
+ */
+ struct RTSEMEVENTINTERNAL *pThis = hEventSem;
+ if (pThis == NIL_RTSEMEVENT)
+ return VINF_SUCCESS;
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertReturn(pThis->iMagic == RTSEMEVENT_MAGIC, VERR_INVALID_HANDLE);
+
+ /*
+ * Invalidate the semaphore and wake up anyone waiting on it.
+ */
+ ASMAtomicXchgSize(&pThis->iMagic, RTSEMEVENT_MAGIC | UINT32_C(0x80000000));
+ if (ASMAtomicXchgS32(&pThis->cWaiters, INT32_MIN / 2) > 0)
+ {
+ /* There were waiters: wake them all, then sleep 1 ms before the memory is freed below. */
+ sys_futex(&pThis->fSignalled, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
+ usleep(1000);
+ }
+
+ /*
+ * Free the semaphore memory and be gone.
+ */
+#ifdef RTSEMEVENT_STRICT
+ RTLockValidatorRecSharedDelete(&pThis->Signallers);
+#endif
+ /* Use the free routine matching the allocator chosen at creation time. */
+ if (!(pThis->fFlags & RTSEMEVENT_FLAGS_BOOTSTRAP_HACK))
+ RTMemFree(pThis);
+ else
+ rtMemBaseFree(pThis);
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Signals the event semaphore, waking up at most one waiter.
+ *
+ * @returns VINF_SUCCESS, VERR_INVALID_HANDLE, VERR_SEM_DESTROYED or
+ * VERR_INVALID_PARAMETER; in strict builds also whatever the lock
+ * validator signaller check fails with.
+ * @param hEventSem The semaphore to signal.
+ */
+RTDECL(int) RTSemEventSignal(RTSEMEVENT hEventSem)
+{
+ /*
+ * Validate input.
+ */
+ struct RTSEMEVENTINTERNAL *pThis = hEventSem;
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertReturn(pThis->iMagic == RTSEMEVENT_MAGIC, VERR_INVALID_HANDLE);
+
+#ifdef RTSEMEVENT_STRICT
+ if (pThis->fEverHadSignallers)
+ {
+ int rc9 = RTLockValidatorRecSharedCheckSignaller(&pThis->Signallers, NIL_RTTHREAD);
+ if (RT_FAILURE(rc9))
+ return rc9;
+ }
+#endif
+
+ /* Set the signalled state first; the futex wake is skipped when nobody is waiting. */
+ ASMAtomicWriteU32(&pThis->fSignalled, 1);
+ if (ASMAtomicReadS32(&pThis->cWaiters) < 1)
+ return VINF_SUCCESS;
+
+ /* somebody is waiting, try wake up one of them. */
+ long cWoken = sys_futex(&pThis->fSignalled, FUTEX_WAKE, 1, NULL, NULL, 0);
+ if (RT_LIKELY(cWoken >= 0))
+ return VINF_SUCCESS;
+
+ /* The wake failed; tell a destroyed semaphore apart from other errors. */
+ if (RT_UNLIKELY(pThis->iMagic != RTSEMEVENT_MAGIC))
+ return VERR_SEM_DESTROYED;
+
+ return VERR_INVALID_PARAMETER;
+}
+
+
+/**
+ * Performs an indefinite wait on the event.
+ *
+ * @returns VINF_SUCCESS when the event was consumed, VERR_SEM_DESTROYED if
+ *          the semaphore was destroyed while waiting, VERR_INTERRUPTED on
+ *          EINTR when RTSEMWAIT_FLAGS_NORESUME is set, VERR_TIMEOUT if the
+ *          futex call reports a timeout, or the IPRT status converted from
+ *          any other futex error. Strict builds may also return a lock
+ *          validator failure.
+ * @param   pThis       The semaphore.
+ * @param   fFlags      RTSEMWAIT_FLAGS_XXX; only NORESUME is examined here.
+ * @param   pSrcPos     The source position of the caller (strict builds).
+ */
+static int rtSemEventLinuxWaitIndefinite(struct RTSEMEVENTINTERNAL *pThis, uint32_t fFlags, PCRTLOCKVALSRCPOS pSrcPos)
+{
+    RT_NOREF_PV(pSrcPos);
+
+    /*
+     * Quickly check whether it's signaled and there are no other waiters.
+     */
+    int32_t cWaiters = ASMAtomicIncS32(&pThis->cWaiters);
+    if (   cWaiters == 1
+        && ASMAtomicCmpXchgU32(&pThis->fSignalled, 0, 1))
+    {
+        ASMAtomicDecS32(&pThis->cWaiters);
+        return VINF_SUCCESS;
+    }
+
+    /*
+     * The wait loop.
+     */
+#ifdef RTSEMEVENT_STRICT
+    RTTHREAD hThreadSelf = !(pThis->fFlags & RTSEMEVENT_FLAGS_BOOTSTRAP_HACK)
+                         ? RTThreadSelfAutoAdopt()
+                         : RTThreadSelf();
+#else
+    RTTHREAD hThreadSelf = RTThreadSelf();
+#endif
+    int rc = VINF_SUCCESS;
+    for (;;)
+    {
+#ifdef RTSEMEVENT_STRICT
+        if (pThis->fEverHadSignallers)
+        {
+            rc = RTLockValidatorRecSharedCheckBlocking(&pThis->Signallers, hThreadSelf, pSrcPos, false,
+                                                       RT_INDEFINITE_WAIT, RTTHREADSTATE_EVENT, true);
+            if (RT_FAILURE(rc))
+                break;
+        }
+#endif
+        RTThreadBlocking(hThreadSelf, RTTHREADSTATE_EVENT, true);
+        long lrc = sys_futex(&pThis->fSignalled, FUTEX_WAIT, 0, NULL /*pTimeout*/, NULL, 0);
+        RTThreadUnblocked(hThreadSelf, RTTHREADSTATE_EVENT);
+        if (RT_UNLIKELY(pThis->iMagic != RTSEMEVENT_MAGIC))
+        {
+            rc = VERR_SEM_DESTROYED;
+            break;
+        }
+
+        if (RT_LIKELY(lrc == 0 || lrc == -EWOULDBLOCK))
+        {
+            /* successful wakeup or fSignalled > 0 in the meantime */
+            if (ASMAtomicCmpXchgU32(&pThis->fSignalled, 0, 1))
+                break;
+            /* somebody else grabbed the signal first, go back to waiting. */
+        }
+        else if (lrc == -ETIMEDOUT)
+        {
+            rc = VERR_TIMEOUT;
+            break;
+        }
+        else if (lrc == -EINTR)
+        {
+            if (fFlags & RTSEMWAIT_FLAGS_NORESUME)
+            {
+                rc = VERR_INTERRUPTED;
+                break;
+            }
+        }
+        else
+        {
+            /* this shouldn't happen! */
+            AssertMsgFailed(("rc=%ld errno=%d\n", lrc, errno));
+            /* lrc holds a negated errno value (see the comparisons above), so
+               flip the sign before converting it. */
+            rc = RTErrConvertFromErrno((int)-lrc);
+            break;
+        }
+    }
+
+    ASMAtomicDecS32(&pThis->cWaiters);
+    return rc;
+}
+
+
+/**
+ * Handle polling (timeout already expired at the time of the call).
+ *
+ * @returns VINF_SUCCESS, VERR_TIMEOUT, VERR_SEM_DESTROYED.
+ * @param   pThis   The semaphore.
+ */
+static int rtSemEventLinuxWaitPoll(struct RTSEMEVENTINTERNAL *pThis)
+{
+    /*
+     * What we do here isn't quite fair to anyone else waiting on it, however
+     * it might not be as bad as all that for callers making repeated poll calls
+     * because they cannot block, as that would be a virtual wait but without the
+     * chance of a permanent queue position.  So, I hope we can live with this.
+     */
+    bool const fConsumed = ASMAtomicCmpXchgU32(&pThis->fSignalled, 0, 1);
+    return fConsumed ? VINF_SUCCESS : VERR_TIMEOUT;
+}
+
+
+/**
+ * Performs a timed wait on the event.
+ *
+ * Hands over to rtSemEventLinuxWaitPoll() when the calculated timeout is zero
+ * and to rtSemEventLinuxWaitIndefinite() when it is UINT64_MAX.
+ *
+ * @returns VINF_SUCCESS when the event was consumed, VERR_TIMEOUT,
+ *          VERR_SEM_DESTROYED, VERR_INTERRUPTED on EINTR when
+ *          RTSEMWAIT_FLAGS_NORESUME is set, or the IPRT status converted from
+ *          any other futex error. Strict builds may also return a lock
+ *          validator failure.
+ * @param   pThis       The semaphore.
+ * @param   fFlags      RTSEMWAIT_FLAGS_XXX.
+ * @param   uTimeout    The timeout, interpreted according to @a fFlags.
+ * @param   pSrcPos     The source position of the caller (strict builds).
+ */
+static int rtSemEventLinuxWaitTimed(struct RTSEMEVENTINTERNAL *pThis, uint32_t fFlags,
+                                    uint64_t uTimeout, PCRTLOCKVALSRCPOS pSrcPos)
+{
+    RT_NOREF_PV(pSrcPos);
+
+    /*
+     * Convert the timeout value.
+     */
+    struct timespec TsTimeout;
+    int             iWaitOp;
+    uint32_t        uWaitVal3;
+    uint64_t        nsAbsTimeout = uTimeout; /* (older gcc maybe used uninitialized) */
+    uTimeout = rtSemLinuxCalcDeadline(fFlags, uTimeout, g_fCanUseWaitBitSet, &TsTimeout, &iWaitOp, &uWaitVal3, &nsAbsTimeout);
+    if (uTimeout == 0)
+        return rtSemEventLinuxWaitPoll(pThis);
+    if (uTimeout == UINT64_MAX)
+        return rtSemEventLinuxWaitIndefinite(pThis, fFlags, pSrcPos);
+
+    /*
+     * Quickly check whether it's signaled and there are no other waiters.
+     */
+    int32_t cWaiters = ASMAtomicIncS32(&pThis->cWaiters);
+    if (   cWaiters == 1
+        && ASMAtomicCmpXchgU32(&pThis->fSignalled, 0, 1))
+    {
+        ASMAtomicDecS32(&pThis->cWaiters);
+        return VINF_SUCCESS;
+    }
+
+    /*
+     * The wait loop.
+     */
+#ifdef RTSEMEVENT_STRICT
+    RTTHREAD hThreadSelf = !(pThis->fFlags & RTSEMEVENT_FLAGS_BOOTSTRAP_HACK)
+                         ? RTThreadSelfAutoAdopt()
+                         : RTThreadSelf();
+#else
+    RTTHREAD hThreadSelf = RTThreadSelf();
+#endif
+    int rc = VINF_SUCCESS;
+    for (;;)
+    {
+#ifdef RTSEMEVENT_STRICT
+        if (pThis->fEverHadSignallers)
+        {
+            rc = RTLockValidatorRecSharedCheckBlocking(&pThis->Signallers, hThreadSelf, pSrcPos, false,
+                                                       iWaitOp == FUTEX_WAIT ? uTimeout / RT_NS_1MS : RT_MS_1HOUR /*whatever*/,
+                                                       RTTHREADSTATE_EVENT, true);
+            if (RT_FAILURE(rc))
+                break;
+        }
+#endif
+        RTThreadBlocking(hThreadSelf, RTTHREADSTATE_EVENT, true);
+        long lrc = sys_futex(&pThis->fSignalled, iWaitOp, 0, &TsTimeout, NULL, uWaitVal3);
+        RTThreadUnblocked(hThreadSelf, RTTHREADSTATE_EVENT);
+        if (RT_UNLIKELY(pThis->iMagic != RTSEMEVENT_MAGIC))
+        {
+            rc = VERR_SEM_DESTROYED;
+            break;
+        }
+
+        if (RT_LIKELY(lrc == 0 || lrc == -EWOULDBLOCK))
+        {
+            /* successful wakeup or fSignalled > 0 in the meantime */
+            if (ASMAtomicCmpXchgU32(&pThis->fSignalled, 0, 1))
+                break;
+            /* somebody else grabbed the signal first, go back to waiting. */
+        }
+        else if (lrc == -ETIMEDOUT)
+        {
+#ifdef RT_STRICT
+            uint64_t const uNow = RTTimeNanoTS();
+            AssertMsg(uNow >= nsAbsTimeout || nsAbsTimeout - uNow < RT_NS_1MS,
+                      ("%#RX64 - %#RX64 => %#RX64 (%RI64)\n", nsAbsTimeout, uNow, nsAbsTimeout - uNow, nsAbsTimeout - uNow));
+#endif
+            rc = VERR_TIMEOUT;
+            break;
+        }
+        else if (lrc == -EINTR)
+        {
+            if (fFlags & RTSEMWAIT_FLAGS_NORESUME)
+            {
+                rc = VERR_INTERRUPTED;
+                break;
+            }
+        }
+        else
+        {
+            /* this shouldn't happen! */
+            AssertMsgFailed(("rc=%ld errno=%d\n", lrc, errno));
+            /* lrc holds a negated errno value (see the comparisons above), so
+               flip the sign before converting it. */
+            rc = RTErrConvertFromErrno((int)-lrc);
+            break;
+        }
+
+        /* adjust the relative timeout */
+        if (iWaitOp == FUTEX_WAIT)
+        {
+            int64_t i64Diff = nsAbsTimeout - RTTimeSystemNanoTS();
+            if (i64Diff < 1000)
+            {
+                /* less than a microsecond left, call it a timeout. */
+                rc = VERR_TIMEOUT;
+                break;
+            }
+            TsTimeout.tv_sec  = (uint64_t)i64Diff / RT_NS_1SEC;
+            TsTimeout.tv_nsec = (uint64_t)i64Diff % RT_NS_1SEC;
+        }
+    }
+
+    ASMAtomicDecS32(&pThis->cWaiters);
+    return rc;
+}
+
+
+/**
+ * Internal wait worker function.
+ *
+ * Validates the handle and the flags, then dispatches to the indefinite or
+ * the timed wait worker.
+ *
+ * @returns VERR_INVALID_HANDLE, VERR_INVALID_PARAMETER, or the status of the
+ *          selected wait worker.
+ * @param   hEventSem   The semaphore to wait on.
+ * @param   fFlags      RTSEMWAIT_FLAGS_XXX.
+ * @param   uTimeout    The timeout; not used for indefinite waits.
+ * @param   pSrcPos     The source position of the caller, can be NULL.
+ */
+DECLINLINE(int) rtSemEventLinuxWait(RTSEMEVENT hEventSem, uint32_t fFlags, uint64_t uTimeout, PCRTLOCKVALSRCPOS pSrcPos)
+{
+    /*
+     * Check the handle and the flags.
+     */
+    struct RTSEMEVENTINTERNAL *pThis = hEventSem;
+    AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+    AssertReturn(pThis->iMagic == RTSEMEVENT_MAGIC, VERR_INVALID_HANDLE);
+    AssertReturn(RTSEMWAIT_FLAGS_ARE_VALID(fFlags), VERR_INVALID_PARAMETER);
+#ifdef RT_STRICT
+    uint32_t const fSignalled = pThis->fSignalled;
+    Assert(fSignalled == false || fSignalled == true);
+#endif
+
+    /*
+     * Dispatch on the kind of wait.
+     */
+    if (!(fFlags & RTSEMWAIT_FLAGS_INDEFINITE))
+        return rtSemEventLinuxWaitTimed(pThis, fFlags, uTimeout, pSrcPos);
+    return rtSemEventLinuxWaitIndefinite(pThis, fFlags, pSrcPos);
+}
+
+
+/**
+ * Waits on the event semaphore, resuming the wait if interrupted.
+ *
+ * @returns Status of rtSemEventLinuxWait(); never VERR_INTERRUPTED since
+ * RTSEMWAIT_FLAGS_RESUME is always passed.
+ * @param hEventSem The semaphore to wait on.
+ * @param cMillies Timeout in milliseconds, or RT_INDEFINITE_WAIT.
+ */
+RTDECL(int) RTSemEventWait(RTSEMEVENT hEventSem, RTMSINTERVAL cMillies)
+{
+ int rc;
+#ifndef RTSEMEVENT_STRICT
+ if (cMillies == RT_INDEFINITE_WAIT)
+ rc = rtSemEventLinuxWait(hEventSem, RTSEMWAIT_FLAGS_RESUME | RTSEMWAIT_FLAGS_INDEFINITE, 0, NULL);
+ else
+ rc = rtSemEventLinuxWait(hEventSem, RTSEMWAIT_FLAGS_RESUME | RTSEMWAIT_FLAGS_RELATIVE | RTSEMWAIT_FLAGS_MILLISECS,
+ cMillies, NULL);
+#else
+ /* Strict builds pass the caller's source position on to the lock validator. */
+ RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API();
+ if (cMillies == RT_INDEFINITE_WAIT)
+ rc = rtSemEventLinuxWait(hEventSem, RTSEMWAIT_FLAGS_RESUME | RTSEMWAIT_FLAGS_INDEFINITE, 0, &SrcPos);
+ else
+ rc = rtSemEventLinuxWait(hEventSem, RTSEMWAIT_FLAGS_RESUME | RTSEMWAIT_FLAGS_RELATIVE | RTSEMWAIT_FLAGS_MILLISECS,
+ cMillies, &SrcPos);
+#endif
+ Assert(rc != VERR_INTERRUPTED);
+ return rc;
+}
+
+
+/**
+ * Waits on the event semaphore without resuming the wait if interrupted.
+ *
+ * @returns Status of rtSemEventLinuxWait(). VERR_INTERRUPTED is a legitimate
+ *          result here since RTSEMWAIT_FLAGS_NORESUME is always passed.
+ * @param   hEventSem   The semaphore to wait on.
+ * @param   cMillies    Timeout in milliseconds, or RT_INDEFINITE_WAIT.
+ */
+RTDECL(int) RTSemEventWaitNoResume(RTSEMEVENT hEventSem, RTMSINTERVAL cMillies)
+{
+    int rc;
+#ifndef RTSEMEVENT_STRICT
+    if (cMillies == RT_INDEFINITE_WAIT)
+        rc = rtSemEventLinuxWait(hEventSem, RTSEMWAIT_FLAGS_NORESUME | RTSEMWAIT_FLAGS_INDEFINITE, 0, NULL);
+    else
+        rc = rtSemEventLinuxWait(hEventSem, RTSEMWAIT_FLAGS_NORESUME | RTSEMWAIT_FLAGS_RELATIVE | RTSEMWAIT_FLAGS_MILLISECS,
+                                 cMillies, NULL);
+#else
+    /* Strict builds pass the caller's source position on to the lock validator. */
+    RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API();
+    if (cMillies == RT_INDEFINITE_WAIT)
+        rc = rtSemEventLinuxWait(hEventSem, RTSEMWAIT_FLAGS_NORESUME | RTSEMWAIT_FLAGS_INDEFINITE, 0, &SrcPos);
+    else
+        rc = rtSemEventLinuxWait(hEventSem, RTSEMWAIT_FLAGS_NORESUME | RTSEMWAIT_FLAGS_RELATIVE | RTSEMWAIT_FLAGS_MILLISECS,
+                                 cMillies, &SrcPos);
+#endif
+    /* No assertion on VERR_INTERRUPTED: with NORESUME the wait workers return
+       it on purpose when the futex call is interrupted. */
+    return rc;
+}
+
+
+/**
+ * Waits on the event semaphore with caller supplied wait flags.
+ *
+ * @returns Status of rtSemEventLinuxWait().
+ * @param hEventSem The semaphore to wait on.
+ * @param fFlags RTSEMWAIT_FLAGS_XXX.
+ * @param uTimeout The timeout, interpreted according to @a fFlags.
+ */
+RTDECL(int) RTSemEventWaitEx(RTSEMEVENT hEventSem, uint32_t fFlags, uint64_t uTimeout)
+{
+#ifndef RTSEMEVENT_STRICT
+ return rtSemEventLinuxWait(hEventSem, fFlags, uTimeout, NULL);
+#else
+ RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API();
+ return rtSemEventLinuxWait(hEventSem, fFlags, uTimeout, &SrcPos);
+#endif
+}
+
+
+/**
+ * Debug variant of RTSemEventWaitEx() taking the caller's source position.
+ *
+ * @returns Status of rtSemEventLinuxWait().
+ * @param hEventSem The semaphore to wait on.
+ * @param fFlags RTSEMWAIT_FLAGS_XXX.
+ * @param uTimeout The timeout, interpreted according to @a fFlags.
+ * @param uId Caller supplied ID; not referenced directly here
+ * (presumably picked up by RTLOCKVALSRCPOS_INIT_DEBUG_API
+ * together with the source position - confirm).
+ */
+RTDECL(int) RTSemEventWaitExDebug(RTSEMEVENT hEventSem, uint32_t fFlags, uint64_t uTimeout,
+ RTHCUINTPTR uId, RT_SRC_POS_DECL)
+{
+ RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_DEBUG_API();
+ return rtSemEventLinuxWait(hEventSem, fFlags, uTimeout, &SrcPos);
+}
+
+
+/**
+ * Gets the timeout resolution of the event semaphore waits.
+ *
+ * @returns The resolution in nanoseconds, currently always 1.
+ */
+RTDECL(uint32_t) RTSemEventGetResolution(void)
+{
+    /** @todo we have 1ns parameter resolution, but need to verify that this is what
+     *        the kernel actually will use when setting the timer.  Most likely
+     *        it's rounded a little, but hopefully not to a multiple of HZ. */
+    uint32_t const cNsResolution = 1;
+    return cNsResolution;
+}
+
+
+/**
+ * Resets the lock validator signaller record to a single thread.
+ *
+ * Also marks the semaphore as having had signallers, which enables the lock
+ * validator checks in the signal and wait paths. Does nothing unless
+ * RTSEMEVENT_STRICT is defined.
+ *
+ * @param hEventSem The event semaphore.
+ * @param hThread The thread to record as signaller.
+ */
+RTDECL(void) RTSemEventSetSignaller(RTSEMEVENT hEventSem, RTTHREAD hThread)
+{
+#ifdef RTSEMEVENT_STRICT
+ struct RTSEMEVENTINTERNAL *pThis = hEventSem;
+ AssertPtrReturnVoid(pThis);
+ AssertReturnVoid(pThis->iMagic == RTSEMEVENT_MAGIC);
+
+ ASMAtomicWriteBool(&pThis->fEverHadSignallers, true);
+ RTLockValidatorRecSharedResetOwner(&pThis->Signallers, hThread, NULL);
+#else
+ RT_NOREF(hEventSem, hThread);
+#endif
+}
+
+
+/**
+ * Adds a thread to the lock validator signaller record.
+ *
+ * Also marks the semaphore as having had signallers, which enables the lock
+ * validator checks in the signal and wait paths. Does nothing unless
+ * RTSEMEVENT_STRICT is defined.
+ *
+ * @param hEventSem The event semaphore.
+ * @param hThread The thread to add as signaller.
+ */
+RTDECL(void) RTSemEventAddSignaller(RTSEMEVENT hEventSem, RTTHREAD hThread)
+{
+#ifdef RTSEMEVENT_STRICT
+ struct RTSEMEVENTINTERNAL *pThis = hEventSem;
+ AssertPtrReturnVoid(pThis);
+ AssertReturnVoid(pThis->iMagic == RTSEMEVENT_MAGIC);
+
+ ASMAtomicWriteBool(&pThis->fEverHadSignallers, true);
+ RTLockValidatorRecSharedAddOwner(&pThis->Signallers, hThread, NULL);
+#else
+ RT_NOREF(hEventSem, hThread);
+#endif
+}
+
+
+/**
+ * Removes a thread from the lock validator signaller record.
+ *
+ * Unlike the set/add variants this does not touch fEverHadSignallers.
+ * Does nothing unless RTSEMEVENT_STRICT is defined.
+ *
+ * @param hEventSem The event semaphore.
+ * @param hThread The thread to remove as signaller.
+ */
+RTDECL(void) RTSemEventRemoveSignaller(RTSEMEVENT hEventSem, RTTHREAD hThread)
+{
+#ifdef RTSEMEVENT_STRICT
+ struct RTSEMEVENTINTERNAL *pThis = hEventSem;
+ AssertPtrReturnVoid(pThis);
+ AssertReturnVoid(pThis->iMagic == RTSEMEVENT_MAGIC);
+
+ RTLockValidatorRecSharedRemoveOwner(&pThis->Signallers, hThread);
+#else
+ RT_NOREF(hEventSem, hThread);
+#endif
+}
+
+#endif /* glibc < 2.6 || IPRT_WITH_FUTEX_BASED_SEMS */
+
diff --git a/src/VBox/Runtime/r3/linux/semeventmulti-linux.cpp b/src/VBox/Runtime/r3/linux/semeventmulti-linux.cpp
new file mode 100644
index 00000000..87554838
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/semeventmulti-linux.cpp
@@ -0,0 +1,600 @@
+/* $Id: semeventmulti-linux.cpp $ */
+/** @file
+ * IPRT - Multiple Release Event Semaphore, Linux (2.6.x+).
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+#include <features.h>
+#if __GLIBC_PREREQ(2,6) && !defined(IPRT_WITH_FUTEX_BASED_SEMS)
+
+/*
+ * glibc 2.6 fixed a serious bug in the mutex implementation. We wrote this
+ * linux specific event semaphores code in order to work around the bug. As it
+ * turns out, this code seems to have an unresolved issue (@bugref{2599}), so we'll
+ * fall back on the pthread based implementation if glibc is known to contain
+ * the bug fix.
+ *
+ * The external reference to epoll_pwait is a hack which prevents that we link
+ * against glibc < 2.6.
+ */
+#include "../posix/semeventmulti-posix.cpp"
+__asm__ (".global epoll_pwait");
+
+#else /* glibc < 2.6 */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include <iprt/semaphore.h>
+#include "internal/iprt.h"
+
+#include <iprt/assert.h>
+#include <iprt/asm.h>
+#include <iprt/err.h>
+#include <iprt/lockvalidator.h>
+#include <iprt/mem.h>
+#include <iprt/time.h>
+#include "internal/magics.h"
+#include "internal/strict.h"
+
+
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+
+#include "semwait-linux.h"
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+/**
+ * Linux multiple wakeup event semaphore.
+ *
+ * The whole semaphore is a futex word (uState) plus a magic for handle
+ * validation; the remaining members only exist in strict builds.
+ */
+struct RTSEMEVENTMULTIINTERNAL
+{
+ /** Magic value.  Set to RTSEMEVENTMULTI_MAGIC on creation and changed by
+ * RTSemEventMultiDestroy before the memory is freed, so that woken
+ * waiters can detect the destruction. */
+ uint32_t volatile u32Magic;
+ /** The futex state variable, see RTSEMEVENTMULTI_LNX_XXX.  This is the
+ * address handed to sys_futex for both waiting and waking. */
+ uint32_t volatile uState;
+#ifdef RT_STRICT
+ /** Increased on every signalling call.  Only used by assertions in the
+ * wait code to check that a successful futex wait saw a signal. */
+ uint32_t volatile uSignalSerialNo;
+#endif
+#ifdef RTSEMEVENTMULTI_STRICT
+ /** Signallers. */
+ RTLOCKVALRECSHRD Signallers;
+ /** Indicates that lock validation should be performed.  Set by
+ * RTSemEventMultiSetSignaller and RTSemEventMultiAddSignaller. */
+ bool volatile fEverHadSignallers;
+#endif
+};
+
+
+/*********************************************************************************************************************************
+* Defined Constants And Macros *
+*********************************************************************************************************************************/
+/** @name RTSEMEVENTMULTI_LNX_XXX - state
+ * Values stored in RTSEMEVENTMULTIINTERNAL::uState (the futex word).
+ * @{ */
+#define RTSEMEVENTMULTI_LNX_NOT_SIGNALED UINT32_C(0x00000000) /**< Not signalled and no waiter has flagged itself. */
+#define RTSEMEVENTMULTI_LNX_NOT_SIGNALED_WAITERS UINT32_C(0x00000001) /**< Not signalled; a waiter flagged itself before sleeping (may since have left). */
+#define RTSEMEVENTMULTI_LNX_SIGNALED UINT32_C(0x00000003) /**< Signalled; waits return immediately. */
+/** @} */
+
+/** Asserts that @a a_uState is one of the three RTSEMEVENTMULTI_LNX_XXX
+ *  values.  The state is a 32-bit integer, so it is formatted as a hex
+ *  number (formatting it as a string would dereference it as a pointer). */
+#define ASSERT_VALID_STATE(a_uState) \
+    AssertMsg(   (a_uState) == RTSEMEVENTMULTI_LNX_NOT_SIGNALED \
+              || (a_uState) == RTSEMEVENTMULTI_LNX_NOT_SIGNALED_WAITERS \
+              || (a_uState) == RTSEMEVENTMULTI_LNX_SIGNALED, \
+              (#a_uState "=%#x\n", a_uState))
+
+
+/*********************************************************************************************************************************
+* Global Variables *
+*********************************************************************************************************************************/
+/** Whether we can use FUTEX_WAIT_BITSET.
+ * Starts out as -1 (presumably "not probed yet" - TODO confirm against
+ * rtSemLinuxCheckForFutexWaitBitSet, which RTSemEventMultiCreateEx calls
+ * with a pointer to this variable before any semaphore is handed out). */
+static int volatile g_fCanUseWaitBitSet = -1;
+
+
+/**
+ * Creates a multiple release event semaphore with default settings.
+ *
+ * Shorthand for RTSemEventMultiCreateEx() with no flags, no lock validator
+ * class and no name.
+ *
+ * @returns Status code of RTSemEventMultiCreateEx().
+ * @param   phEventMultiSem     Where to store the new handle.
+ */
+RTDECL(int) RTSemEventMultiCreate(PRTSEMEVENTMULTI phEventMultiSem)
+{
+    int const rcCreate = RTSemEventMultiCreateEx(phEventMultiSem, 0 /*fFlags*/, NIL_RTLOCKVALCLASS, NULL);
+    return rcCreate;
+}
+
+
+/**
+ * Creates a multiple release event semaphore.
+ *
+ * @returns VINF_SUCCESS, VERR_INVALID_PARAMETER (unknown bits in @a fFlags)
+ *          or VERR_NO_MEMORY.
+ * @param   phEventMultiSem     Where to store the new handle.
+ * @param   fFlags              0 or RTSEMEVENTMULTI_FLAGS_NO_LOCK_VAL.
+ * @param   hClass              Lock validator class; only used when
+ *                              RTSEMEVENTMULTI_STRICT is defined.
+ * @param   pszNameFmt          Name format string for the lock validator
+ *                              record, NULL for a generated
+ *                              "RTSemEventMulti-N" name.  Only used when
+ *                              RTSEMEVENTMULTI_STRICT is defined.
+ * @param   ...                 Arguments for @a pszNameFmt.
+ */
+RTDECL(int) RTSemEventMultiCreateEx(PRTSEMEVENTMULTI phEventMultiSem, uint32_t fFlags, RTLOCKVALCLASS hClass,
+                                    const char *pszNameFmt, ...)
+{
+    AssertReturn(!(fFlags & ~RTSEMEVENTMULTI_FLAGS_NO_LOCK_VAL), VERR_INVALID_PARAMETER);
+
+    /* The timed wait code needs to know whether FUTEX_WAIT_BITSET works. */
+    rtSemLinuxCheckForFutexWaitBitSet(&g_fCanUseWaitBitSet);
+#if defined(DEBUG_bird) && !defined(IN_GUEST)
+    Assert(g_fCanUseWaitBitSet == true);
+#endif
+
+    /* Allocate the handle structure, bailing out early on failure. */
+    struct RTSEMEVENTMULTIINTERNAL *pThis = (struct RTSEMEVENTMULTIINTERNAL *)RTMemAlloc(sizeof(*pThis));
+    if (!pThis)
+        return VERR_NO_MEMORY;
+
+    /* Initial state: valid and not signalled. */
+    pThis->u32Magic = RTSEMEVENTMULTI_MAGIC;
+    pThis->uState   = RTSEMEVENTMULTI_LNX_NOT_SIGNALED;
+#ifdef RT_STRICT
+    pThis->uSignalSerialNo = 0;
+#endif
+
+#ifndef RTSEMEVENTMULTI_STRICT
+    RT_NOREF(hClass, pszNameFmt);
+#else
+    /* Set up the signaller record, either with the caller's name or an
+       anonymous numbered one. */
+    if (pszNameFmt)
+    {
+        va_list va;
+        va_start(va, pszNameFmt);
+        RTLockValidatorRecSharedInitV(&pThis->Signallers, hClass, RTLOCKVAL_SUB_CLASS_ANY, pThis,
+                                      true /*fSignaller*/, !(fFlags & RTSEMEVENTMULTI_FLAGS_NO_LOCK_VAL),
+                                      pszNameFmt, va);
+        va_end(va);
+    }
+    else
+    {
+        static uint32_t volatile s_iSemEventMultiAnon = 0;
+        RTLockValidatorRecSharedInit(&pThis->Signallers, hClass, RTLOCKVAL_SUB_CLASS_ANY, pThis,
+                                     true /*fSignaller*/, !(fFlags & RTSEMEVENTMULTI_FLAGS_NO_LOCK_VAL),
+                                     "RTSemEventMulti-%u", ASMAtomicIncU32(&s_iSemEventMultiAnon) - 1);
+    }
+    pThis->fEverHadSignallers = false;
+#endif
+
+    *phEventMultiSem = pThis;
+    return VINF_SUCCESS;
+}
+
+
+RTDECL(int) RTSemEventMultiDestroy(RTSEMEVENTMULTI hEventMultiSem)
+{
+ /*
+ * Destroys a multiple release event semaphore.
+ *
+ * Validate input.  A NIL handle is quietly accepted (VINF_SUCCESS); a
+ * bad pointer or wrong magic yields VERR_INVALID_HANDLE.
+ */
+ struct RTSEMEVENTMULTIINTERNAL *pThis = hEventMultiSem;
+ if (pThis == NIL_RTSEMEVENTMULTI)
+ return VINF_SUCCESS;
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, VERR_INVALID_HANDLE);
+
+ /*
+ * Invalidate the semaphore and wake up anyone waiting on it.
+ *
+ * The magic is changed first so that woken waiters, which re-check the
+ * magic after the futex call, return VERR_SEM_DESTROYED.  The wake-up
+ * is only needed when a waiter had flagged itself in the state word.
+ */
+ ASMAtomicWriteU32(&pThis->u32Magic, RTSEMEVENTMULTI_MAGIC + 1);
+ if (ASMAtomicXchgU32(&pThis->uState, RTSEMEVENTMULTI_LNX_SIGNALED) == RTSEMEVENTMULTI_LNX_NOT_SIGNALED_WAITERS)
+ {
+ sys_futex(&pThis->uState, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
+ usleep(1000); /* NOTE(review): presumably gives the woken threads time to read the dead magic before the memory goes away - a grace period, not a guarantee. */
+ }
+
+ /*
+ * Free the semaphore memory and be gone.
+ */
+#ifdef RTSEMEVENTMULTI_STRICT
+ RTLockValidatorRecSharedDelete(&pThis->Signallers);
+#endif
+ RTMemFree(pThis);
+ return VINF_SUCCESS;
+}
+
+
+RTDECL(int) RTSemEventMultiSignal(RTSEMEVENTMULTI hEventMultiSem)
+{
+ /*
+ * Signals the semaphore: the state word becomes SIGNALED and, if a
+ * waiter had flagged itself, every futex waiter is woken.
+ *
+ * Validate input (VERR_INVALID_HANDLE on bad pointer or magic).
+ */
+ struct RTSEMEVENTMULTIINTERNAL *pThis = hEventMultiSem;
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, VERR_INVALID_HANDLE);
+
+#ifdef RTSEMEVENTMULTI_STRICT
+ if (pThis->fEverHadSignallers) /* only validate once signallers have been registered */
+ {
+ int rc9 = RTLockValidatorRecSharedCheckSignaller(&pThis->Signallers, NIL_RTTHREAD);
+ if (RT_FAILURE(rc9))
+ return rc9;
+ }
+#endif
+
+ /*
+ * Signal it.
+ *
+ * The exchange hands back the previous state, so FUTEX_WAKE is only
+ * issued when that was NOT_SIGNALED_WAITERS.
+ */
+#ifdef RT_STRICT
+ ASMAtomicIncU32(&pThis->uSignalSerialNo);
+#endif
+ uint32_t uOld = ASMAtomicXchgU32(&pThis->uState, RTSEMEVENTMULTI_LNX_SIGNALED);
+ if (uOld == RTSEMEVENTMULTI_LNX_NOT_SIGNALED_WAITERS)
+ {
+ /* wake up sleeping threads. */
+ long cWoken = sys_futex(&pThis->uState, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
+ AssertMsg(cWoken >= 0, ("%ld\n", cWoken)); NOREF(cWoken);
+ }
+ ASSERT_VALID_STATE(uOld);
+ return VINF_SUCCESS;
+}
+
+
+RTDECL(int) RTSemEventMultiReset(RTSEMEVENTMULTI hEventMultiSem)
+{
+ /*
+ * Puts a signalled semaphore back into the not-signalled state.
+ *
+ * Validate input (VERR_INVALID_HANDLE on bad pointer or magic).
+ */
+ struct RTSEMEVENTMULTIINTERNAL *pThis = hEventMultiSem;
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, VERR_INVALID_HANDLE);
+#ifdef RT_STRICT
+ uint32_t const uState = pThis->uState;
+ ASSERT_VALID_STATE(uState);
+#endif
+
+ /*
+ * Reset it.
+ *
+ * Compare-and-exchange: only SIGNALED is replaced by NOT_SIGNALED.  A
+ * semaphore that is already not signalled (with or without the waiter
+ * flag) is left untouched, and VINF_SUCCESS is returned either way.
+ */
+ ASMAtomicCmpXchgU32(&pThis->uState, RTSEMEVENTMULTI_LNX_NOT_SIGNALED, RTSEMEVENTMULTI_LNX_SIGNALED);
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Performs an indefinite wait on the event.
+ *
+ * @returns VINF_SUCCESS when the semaphore is (or becomes) signalled,
+ *          VERR_SEM_DESTROYED if the magic is no longer valid after the
+ *          futex call returns, VERR_INTERRUPTED on -EINTR when
+ *          RTSEMWAIT_FLAGS_NORESUME is set, a lock validator failure
+ *          (RTSEMEVENTMULTI_STRICT only), or the IPRT translation of an
+ *          unexpected futex errno.
+ * @param   pThis       The semaphore.
+ * @param   fFlags      RTSEMWAIT_FLAGS_XXX; only NORESUME is looked at here.
+ * @param   pSrcPos     Source position for the lock validator.  Only used
+ *                      when RTSEMEVENTMULTI_STRICT is defined.
+ */
+static int rtSemEventMultiLinuxWaitIndefinite(struct RTSEMEVENTMULTIINTERNAL *pThis, uint32_t fFlags, PCRTLOCKVALSRCPOS pSrcPos)
+{
+    RT_NOREF(pSrcPos);
+
+    /*
+     * Quickly check whether it's signaled.
+     */
+    uint32_t uState = ASMAtomicUoReadU32(&pThis->uState);
+    if (uState == RTSEMEVENTMULTI_LNX_SIGNALED)
+        return VINF_SUCCESS;
+    ASSERT_VALID_STATE(uState);
+
+    /*
+     * The wait loop.
+     */
+#ifdef RTSEMEVENTMULTI_STRICT
+    RTTHREAD hThreadSelf = RTThreadSelfAutoAdopt();
+#else
+    RTTHREAD hThreadSelf = RTThreadSelf();
+#endif
+    for (;;)
+    {
+        /*
+         * Start waiting. We only account for there being or having been
+         * threads waiting on the semaphore to keep things simple.
+         */
+        uState = ASMAtomicUoReadU32(&pThis->uState);
+        if (   uState == RTSEMEVENTMULTI_LNX_NOT_SIGNALED_WAITERS
+            || (   uState == RTSEMEVENTMULTI_LNX_NOT_SIGNALED
+                && ASMAtomicCmpXchgU32(&pThis->uState, RTSEMEVENTMULTI_LNX_NOT_SIGNALED_WAITERS,
+                                       RTSEMEVENTMULTI_LNX_NOT_SIGNALED)))
+        {
+#ifdef RTSEMEVENTMULTI_STRICT
+            if (pThis->fEverHadSignallers)
+            {
+                int rc9 = RTLockValidatorRecSharedCheckBlocking(&pThis->Signallers, hThreadSelf, pSrcPos, false,
+                                                                RT_INDEFINITE_WAIT, RTTHREADSTATE_EVENT_MULTI, true);
+                if (RT_FAILURE(rc9))
+                    return rc9;
+            }
+#endif
+#ifdef RT_STRICT
+            uint32_t const uPrevSignalSerialNo = ASMAtomicReadU32(&pThis->uSignalSerialNo);
+#endif
+            /* Sleep for as long as the futex word is NOT_SIGNALED_WAITERS (1). */
+            RTThreadBlocking(hThreadSelf, RTTHREADSTATE_EVENT_MULTI, true);
+            long rc = sys_futex(&pThis->uState, FUTEX_WAIT, 1, NULL /*pTimeout*/, NULL, 0);
+            RTThreadUnblocked(hThreadSelf, RTTHREADSTATE_EVENT_MULTI);
+
+            /* Check that the structure is still alive before continuing. */
+            if (RT_LIKELY(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC))
+            { /*likely*/ }
+            else
+                return VERR_SEM_DESTROYED;
+
+            /*
+             * Return if success.
+             */
+            if (rc == 0)
+            {
+                Assert(uPrevSignalSerialNo != ASMAtomicReadU32(&pThis->uSignalSerialNo));
+                return VINF_SUCCESS;
+            }
+
+            /*
+             * Act on the wakeup code.
+             */
+            if (rc == -EWOULDBLOCK)
+                /* retry, the value changed. */;
+            else if (rc == -EINTR)
+            {
+                if (fFlags & RTSEMWAIT_FLAGS_NORESUME)
+                    return VERR_INTERRUPTED;
+            }
+            else
+            {
+                /* this shouldn't happen! */
+                AssertMsgFailed(("rc=%ld errno=%d\n", rc, errno));
+                /* rc holds the negated errno; the converter wants the
+                   positive native value. */
+                return RTErrConvertFromErrno((int)-rc);
+            }
+        }
+        else if (uState == RTSEMEVENTMULTI_LNX_SIGNALED)
+            return VINF_SUCCESS;
+        else
+            ASSERT_VALID_STATE(uState);
+    }
+}
+
+
+/**
+ * Handle polling (timeout already expired at the time of the call).
+ *
+ * Takes a single snapshot of the state word and never blocks.
+ *
+ * @returns VINF_SUCCESS if signalled, otherwise VERR_TIMEOUT.
+ * @param   pThis       The semaphore.
+ */
+static int rtSemEventMultiLinuxWaitPoll(struct RTSEMEVENTMULTIINTERNAL *pThis)
+{
+    uint32_t const uSnapshot = ASMAtomicUoReadU32(&pThis->uState);
+    return uSnapshot == RTSEMEVENTMULTI_LNX_SIGNALED ? VINF_SUCCESS : VERR_TIMEOUT;
+}
+
+
+/**
+ * Performs a timed wait on the event.
+ *
+ * The timeout is first run through rtSemLinuxCalcDeadline(); a result of 0
+ * is handled as a poll and UINT64_MAX as an indefinite wait.
+ *
+ * @returns VINF_SUCCESS when the semaphore is (or becomes) signalled,
+ *          VERR_TIMEOUT when the deadline passes, VERR_SEM_DESTROYED if the
+ *          magic is no longer valid after the futex call returns,
+ *          VERR_INTERRUPTED on -EINTR when RTSEMWAIT_FLAGS_NORESUME is set,
+ *          a lock validator failure (RTSEMEVENTMULTI_STRICT only), or the
+ *          IPRT translation of an unexpected futex errno.
+ * @param   pThis       The semaphore.
+ * @param   fFlags      RTSEMWAIT_FLAGS_XXX.
+ * @param   uTimeout    The timeout, interpreted according to @a fFlags.
+ * @param   pSrcPos     Source position for the lock validator.  Only used
+ *                      when RTSEMEVENTMULTI_STRICT is defined.
+ */
+static int rtSemEventMultiLinuxWaitTimed(struct RTSEMEVENTMULTIINTERNAL *pThis, uint32_t fFlags, uint64_t uTimeout,
+                                         PCRTLOCKVALSRCPOS pSrcPos)
+{
+    RT_NOREF(pSrcPos);
+
+    /*
+     * Quickly check whether it's signaled.
+     */
+    uint32_t uState = ASMAtomicUoReadU32(&pThis->uState);
+    if (uState == RTSEMEVENTMULTI_LNX_SIGNALED)
+        return VINF_SUCCESS;
+    ASSERT_VALID_STATE(uState);
+
+    /*
+     * Convert the timeout value.
+     */
+    struct timespec TsTimeout;
+    int             iWaitOp;
+    uint32_t        uWaitVal3;
+    uint64_t        nsAbsTimeout = uTimeout; /* (older gcc maybe used uninitialized) */
+    uTimeout = rtSemLinuxCalcDeadline(fFlags, uTimeout, g_fCanUseWaitBitSet, &TsTimeout, &iWaitOp, &uWaitVal3, &nsAbsTimeout);
+    if (uTimeout == 0)
+        return rtSemEventMultiLinuxWaitPoll(pThis);
+    if (uTimeout == UINT64_MAX)
+        return rtSemEventMultiLinuxWaitIndefinite(pThis, fFlags, pSrcPos);
+
+    /*
+     * The wait loop.
+     */
+#ifdef RTSEMEVENTMULTI_STRICT
+    RTTHREAD hThreadSelf = RTThreadSelfAutoAdopt();
+#else
+    RTTHREAD hThreadSelf = RTThreadSelf();
+#endif
+    for (;;)
+    {
+        /*
+         * Start waiting. We only account for there being or having been
+         * threads waiting on the semaphore to keep things simple.
+         */
+        uState = ASMAtomicUoReadU32(&pThis->uState);
+        if (   uState == RTSEMEVENTMULTI_LNX_NOT_SIGNALED_WAITERS
+            || (   uState == RTSEMEVENTMULTI_LNX_NOT_SIGNALED
+                && ASMAtomicCmpXchgU32(&pThis->uState, RTSEMEVENTMULTI_LNX_NOT_SIGNALED_WAITERS,
+                                       RTSEMEVENTMULTI_LNX_NOT_SIGNALED)))
+        {
+#ifdef RTSEMEVENTMULTI_STRICT
+            if (pThis->fEverHadSignallers)
+            {
+                int rc9 = RTLockValidatorRecSharedCheckBlocking(&pThis->Signallers, hThreadSelf, pSrcPos, false,
+                                                                uTimeout / UINT32_C(1000000), RTTHREADSTATE_EVENT_MULTI, true);
+                if (RT_FAILURE(rc9))
+                    return rc9;
+            }
+#endif
+#ifdef RT_STRICT
+            uint32_t const uPrevSignalSerialNo = ASMAtomicReadU32(&pThis->uSignalSerialNo);
+#endif
+            /* Sleep for as long as the futex word is NOT_SIGNALED_WAITERS (1). */
+            RTThreadBlocking(hThreadSelf, RTTHREADSTATE_EVENT_MULTI, true);
+            long rc = sys_futex(&pThis->uState, iWaitOp, 1, &TsTimeout, NULL, uWaitVal3);
+            RTThreadUnblocked(hThreadSelf, RTTHREADSTATE_EVENT_MULTI);
+
+            /* Check that the structure is still alive before continuing. */
+            if (RT_LIKELY(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC))
+            { /*likely*/ }
+            else
+                return VERR_SEM_DESTROYED;
+
+            /*
+             * Return if success.
+             */
+            if (rc == 0)
+            {
+                Assert(uPrevSignalSerialNo != ASMAtomicReadU32(&pThis->uSignalSerialNo));
+                return VINF_SUCCESS;
+            }
+
+            /*
+             * Act on the wakeup code.
+             */
+            if (rc == -ETIMEDOUT)
+            {
+                /** @todo something is broken here. shows up every now and again in the ata
+                 *        code. Should try to run the timeout against RTTimeMilliTS to
+                 *        check that it's doing the right thing... */
+#ifdef RT_STRICT
+                uint64_t const uNow = RTTimeNanoTS();
+                AssertMsg(uNow >= nsAbsTimeout || nsAbsTimeout - uNow < RT_NS_1MS,
+                          ("%#RX64 - %#RX64 => %#RX64 (%RI64)\n", nsAbsTimeout, uNow, nsAbsTimeout - uNow, nsAbsTimeout - uNow));
+#endif
+                return VERR_TIMEOUT;
+            }
+            if (rc == -EWOULDBLOCK)
+            {
+                /* retry, the value changed. */;
+            }
+            else if (rc == -EINTR)
+            {
+                if (fFlags & RTSEMWAIT_FLAGS_NORESUME)
+                    return VERR_INTERRUPTED;
+            }
+            else
+            {
+                /* this shouldn't happen! */
+                AssertMsgFailed(("rc=%ld errno=%d\n", rc, errno));
+                /* rc holds the negated errno; the converter wants the
+                   positive native value. */
+                return RTErrConvertFromErrno((int)-rc);
+            }
+        }
+        else if (uState == RTSEMEVENTMULTI_LNX_SIGNALED)
+            return VINF_SUCCESS;
+        else
+            ASSERT_VALID_STATE(uState);
+
+        /* adjust the relative timeout if relative */
+        if (iWaitOp == FUTEX_WAIT)
+        {
+            int64_t i64Diff = nsAbsTimeout - RTTimeSystemNanoTS();
+            if (i64Diff < 1000)
+                return VERR_TIMEOUT;
+            TsTimeout.tv_sec  = (uint64_t)i64Diff / RT_NS_1SEC;
+            TsTimeout.tv_nsec = (uint64_t)i64Diff % RT_NS_1SEC;
+        }
+    }
+}
+
+/**
+ * Internal wait worker function.
+ *
+ * Validates the handle and the flags, then dispatches to the indefinite or
+ * the timed wait implementation depending on RTSEMWAIT_FLAGS_INDEFINITE.
+ *
+ * @returns VERR_INVALID_HANDLE or VERR_INVALID_PARAMETER on bad input,
+ *          otherwise the status of the selected wait routine.
+ */
+DECLINLINE(int) rtSemEventLnxMultiWait(RTSEMEVENTMULTI hEventSem, uint32_t fFlags, uint64_t uTimeout, PCRTLOCKVALSRCPOS pSrcPos)
+{
+    /* Input validation. */
+    struct RTSEMEVENTMULTIINTERNAL *pThis = hEventSem;
+    AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+    AssertReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, VERR_INVALID_HANDLE);
+    AssertReturn(RTSEMWAIT_FLAGS_ARE_VALID(fFlags), VERR_INVALID_PARAMETER);
+
+    /* Dispatch on the kind of wait requested. */
+    if (!(fFlags & RTSEMWAIT_FLAGS_INDEFINITE))
+        return rtSemEventMultiLinuxWaitTimed(pThis, fFlags, uTimeout, pSrcPos);
+    return rtSemEventMultiLinuxWaitIndefinite(pThis, fFlags, pSrcPos);
+}
+
+
+#undef RTSemEventMultiWaitEx
+/**
+ * Extended wait on a multiple release event semaphore.
+ *
+ * Supplies a lock validator source position only when
+ * RTSEMEVENTMULTI_STRICT is defined - the same symbol that guards the use
+ * of the source position in the wait routines of this file.
+ *
+ * @returns Status code of rtSemEventLnxMultiWait().
+ * @param   hEventMultiSem  The semaphore to wait on.
+ * @param   fFlags          RTSEMWAIT_FLAGS_XXX.
+ * @param   uTimeout        The timeout, interpreted according to @a fFlags.
+ */
+RTDECL(int) RTSemEventMultiWaitEx(RTSEMEVENTMULTI hEventMultiSem, uint32_t fFlags, uint64_t uTimeout)
+{
+#ifndef RTSEMEVENTMULTI_STRICT
+    return rtSemEventLnxMultiWait(hEventMultiSem, fFlags, uTimeout, NULL);
+#else
+    RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API();
+    return rtSemEventLnxMultiWait(hEventMultiSem, fFlags, uTimeout, &SrcPos);
+#endif
+}
+
+
+/**
+ * Debug flavour of RTSemEventMultiWaitEx.
+ *
+ * Builds a lock validator source position record with
+ * RTLOCKVALSRCPOS_INIT_DEBUG_API() (which presumably picks up @a uId and the
+ * RT_SRC_POS_DECL arguments - they are not referenced anywhere else here) and
+ * forwards everything to rtSemEventLnxMultiWait().
+ *
+ * @returns Status code of rtSemEventLnxMultiWait().
+ */
+RTDECL(int) RTSemEventMultiWaitExDebug(RTSEMEVENTMULTI hEventMultiSem, uint32_t fFlags, uint64_t uTimeout,
+                                       RTHCUINTPTR uId, RT_SRC_POS_DECL)
+{
+    RTLOCKVALSRCPOS DbgSrcPos = RTLOCKVALSRCPOS_INIT_DEBUG_API();
+    return rtSemEventLnxMultiWait(hEventMultiSem, fFlags, uTimeout, &DbgSrcPos);
+}
+
+
+/**
+ * Lock validator hook: resets the signaller record of the semaphore to
+ * @a hThread via RTLockValidatorRecSharedResetOwner().
+ *
+ * Does nothing (arguments ignored) unless RTSEMEVENTMULTI_STRICT is defined.
+ */
+RTDECL(void) RTSemEventMultiSetSignaller(RTSEMEVENTMULTI hEventMultiSem, RTTHREAD hThread)
+{
+#ifndef RTSEMEVENTMULTI_STRICT
+    RT_NOREF(hEventMultiSem, hThread);
+#else
+    /* Silently bail out (after asserting) on bad handles. */
+    struct RTSEMEVENTMULTIINTERNAL *pThis = hEventMultiSem;
+    AssertPtrReturnVoid(pThis);
+    AssertReturnVoid(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC);
+
+    /* From now on signalling and blocking get validated. */
+    ASMAtomicWriteBool(&pThis->fEverHadSignallers, true);
+    RTLockValidatorRecSharedResetOwner(&pThis->Signallers, hThread, NULL);
+#endif
+}
+
+
+/**
+ * Lock validator hook: registers @a hThread with the signaller record of the
+ * semaphore via RTLockValidatorRecSharedAddOwner().
+ *
+ * Does nothing (arguments ignored) unless RTSEMEVENTMULTI_STRICT is defined.
+ */
+RTDECL(void) RTSemEventMultiAddSignaller(RTSEMEVENTMULTI hEventMultiSem, RTTHREAD hThread)
+{
+#ifndef RTSEMEVENTMULTI_STRICT
+    RT_NOREF(hEventMultiSem, hThread);
+#else
+    /* Silently bail out (after asserting) on bad handles. */
+    struct RTSEMEVENTMULTIINTERNAL *pThis = hEventMultiSem;
+    AssertPtrReturnVoid(pThis);
+    AssertReturnVoid(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC);
+
+    /* From now on signalling and blocking get validated. */
+    ASMAtomicWriteBool(&pThis->fEverHadSignallers, true);
+    RTLockValidatorRecSharedAddOwner(&pThis->Signallers, hThread, NULL);
+#endif
+}
+
+
+/**
+ * Lock validator hook: removes @a hThread from the signaller record of the
+ * semaphore via RTLockValidatorRecSharedRemoveOwner().
+ *
+ * Does nothing (arguments ignored) unless RTSEMEVENTMULTI_STRICT is defined.
+ */
+RTDECL(void) RTSemEventMultiRemoveSignaller(RTSEMEVENTMULTI hEventMultiSem, RTTHREAD hThread)
+{
+#ifndef RTSEMEVENTMULTI_STRICT
+    RT_NOREF(hEventMultiSem, hThread);
+#else
+    /* Silently bail out (after asserting) on bad handles. */
+    struct RTSEMEVENTMULTIINTERNAL *pThis = hEventMultiSem;
+    AssertPtrReturnVoid(pThis);
+    AssertReturnVoid(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC);
+
+    RTLockValidatorRecSharedRemoveOwner(&pThis->Signallers, hThread);
+#endif
+}
+
+#endif /* glibc < 2.6 || IPRT_WITH_FUTEX_BASED_SEMS */
+
diff --git a/src/VBox/Runtime/r3/linux/semmutex-linux.cpp b/src/VBox/Runtime/r3/linux/semmutex-linux.cpp
new file mode 100644
index 00000000..09cd866f
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/semmutex-linux.cpp
@@ -0,0 +1,475 @@
+/* $Id: semmutex-linux.cpp $ */
+/** @file
+ * IPRT - Mutex Semaphore, Linux (2.6.x+).
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include <iprt/semaphore.h>
+#include "internal/iprt.h"
+
+#include <iprt/alloc.h>
+#include <iprt/asm.h>
+#include <iprt/assert.h>
+#include <iprt/err.h>
+#include <iprt/lockvalidator.h>
+#include <iprt/thread.h>
+#include <iprt/time.h>
+#include "internal/magics.h"
+#include "internal/strict.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#if 0 /* With 2.6.17 futex.h has become C++ unfriendly. */
+# include <linux/futex.h>
+#else
+# define FUTEX_WAIT 0
+# define FUTEX_WAKE 1
+#endif
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+/**
+ * Linux internal representation of a Mutex semaphore.
+ *
+ * A recursive mutex built on a single futex word (iState) with owner and
+ * nesting bookkeeping on the side.
+ */
+struct RTSEMMUTEXINTERNAL
+{
+ /** The futex state variable.
+ * 0 means unlocked.
+ * 1 means locked, no waiters.
+ * 2 means locked, one or more waiters (possibly none any more; every
+ * contended acquisition leaves the value at 2).
+ */
+ int32_t volatile iState;
+ /** Nesting count.  0 while unowned, 1 after the first acquisition and
+ * incremented for each recursive request by the owner. */
+ uint32_t volatile cNestings;
+ /** The owner of the mutex.  (pthread_t)~0 while nobody owns it. */
+ pthread_t volatile Owner;
+ /** Magic value (RTSEMMUTEX_MAGIC).  Replaced by RTSEMMUTEX_MAGIC_DEAD in
+ * RTSemMutexDestroy. */
+ uint32_t volatile u32Magic;
+#ifdef RTSEMMUTEX_STRICT
+ /** Lock validator record associated with this mutex. */
+ RTLOCKVALRECEXCL ValidatorRec;
+#endif
+};
+
+
+
+/**
+ * Wrapper for the futex syscall.
+ *
+ * @returns The non-negative syscall result on success, otherwise the negated
+ *          errno value (e.g. -ETIMEDOUT).
+ */
+static long sys_futex(int32_t volatile *uaddr, int op, int val, struct timespec *utime, int32_t *uaddr2, int val3)
+{
+    errno = 0;
+    long const rc = syscall(__NR_futex, uaddr, op, val, utime, uaddr2, val3);
+    if (rc >= 0)
+        return rc;
+
+    /* Failure: syscall() reports -1 and leaves the reason in errno. */
+    Assert(rc == -1);
+    return -errno;
+}
+
+
+#undef RTSemMutexCreate
+/**
+ * Creates a mutex semaphore with default settings.
+ *
+ * Shorthand for RTSemMutexCreateEx() with no flags, no lock validator class,
+ * RTLOCKVAL_SUB_CLASS_NONE and no name.
+ *
+ * @returns Status code of RTSemMutexCreateEx().
+ * @param   phMutexSem      Where to store the new handle.
+ */
+RTDECL(int) RTSemMutexCreate(PRTSEMMUTEX phMutexSem)
+{
+    int const rcCreate = RTSemMutexCreateEx(phMutexSem, 0 /*fFlags*/, NIL_RTLOCKVALCLASS, RTLOCKVAL_SUB_CLASS_NONE, NULL);
+    return rcCreate;
+}
+
+
+/**
+ * Creates a mutex semaphore.
+ *
+ * @returns VINF_SUCCESS, VERR_INVALID_PARAMETER (unknown bits in @a fFlags)
+ *          or VERR_NO_MEMORY.
+ * @param   phMutexSem      Where to store the new handle.
+ * @param   fFlags          0 or RTSEMMUTEX_FLAGS_NO_LOCK_VAL.
+ * @param   hClass          Lock validator class; only used when
+ *                          RTSEMMUTEX_STRICT is defined.
+ * @param   uSubClass       Lock validator sub-class; only used when
+ *                          RTSEMMUTEX_STRICT is defined.
+ * @param   pszNameFmt      Name format string for the lock validator record,
+ *                          NULL for a generated "RTSemMutex-N" name.  Only
+ *                          used when RTSEMMUTEX_STRICT is defined.
+ * @param   ...             Arguments for @a pszNameFmt.
+ */
+RTDECL(int) RTSemMutexCreateEx(PRTSEMMUTEX phMutexSem, uint32_t fFlags,
+                               RTLOCKVALCLASS hClass, uint32_t uSubClass, const char *pszNameFmt, ...)
+{
+    AssertReturn(!(fFlags & ~RTSEMMUTEX_FLAGS_NO_LOCK_VAL), VERR_INVALID_PARAMETER);
+
+    /* Allocate the handle structure, bailing out early on failure. */
+    struct RTSEMMUTEXINTERNAL *pThis = (struct RTSEMMUTEXINTERNAL *)RTMemAlloc(sizeof(*pThis));
+    if (!pThis)
+        return VERR_NO_MEMORY;
+
+    /* Initial state: valid, unlocked, unowned. */
+    pThis->u32Magic  = RTSEMMUTEX_MAGIC;
+    pThis->iState    = 0;
+    pThis->Owner     = (pthread_t)~0;
+    pThis->cNestings = 0;
+
+#ifndef RTSEMMUTEX_STRICT
+    RT_NOREF(hClass, uSubClass, pszNameFmt);
+#else
+    /* Set up the validator record, either with the caller's name or an
+       anonymous numbered one. */
+    if (pszNameFmt)
+    {
+        va_list va;
+        va_start(va, pszNameFmt);
+        RTLockValidatorRecExclInitV(&pThis->ValidatorRec, hClass, uSubClass, pThis,
+                                    !(fFlags & RTSEMMUTEX_FLAGS_NO_LOCK_VAL), pszNameFmt, va);
+        va_end(va);
+    }
+    else
+    {
+        static uint32_t volatile s_iMutexAnon = 0;
+        RTLockValidatorRecExclInit(&pThis->ValidatorRec, hClass, uSubClass, pThis,
+                                   !(fFlags & RTSEMMUTEX_FLAGS_NO_LOCK_VAL),
+                                   "RTSemMutex-%u", ASMAtomicIncU32(&s_iMutexAnon) - 1);
+    }
+#endif
+
+    *phMutexSem = pThis;
+    return VINF_SUCCESS;
+}
+
+
+RTDECL(int) RTSemMutexDestroy(RTSEMMUTEX hMutexSem)
+{
+ /*
+ * Destroys a mutex semaphore.
+ *
+ * Validate input.  A NIL handle is quietly accepted (VINF_SUCCESS); a
+ * bad pointer or wrong magic yields VERR_INVALID_HANDLE.
+ */
+ if (hMutexSem == NIL_RTSEMMUTEX)
+ return VINF_SUCCESS;
+ struct RTSEMMUTEXINTERNAL *pThis = hMutexSem;
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertMsgReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC,
+ ("hMutexSem=%p u32Magic=%#x\n", pThis, pThis->u32Magic),
+ VERR_INVALID_HANDLE);
+
+ /*
+ * Invalidate the semaphore and wake up anyone waiting on it.
+ *
+ * The magic is changed first so that woken waiters, which re-check the
+ * magic after the futex call, return VERR_SEM_DESTROYED.  The wake-up
+ * is only issued when the mutex was locked (previous state > 0).
+ */
+ ASMAtomicWriteU32(&pThis->u32Magic, RTSEMMUTEX_MAGIC_DEAD);
+ if (ASMAtomicXchgS32(&pThis->iState, 0) > 0)
+ {
+ sys_futex(&pThis->iState, FUTEX_WAKE, INT_MAX, NULL, NULL, 0);
+ usleep(1000); /* NOTE(review): presumably gives the woken threads time to read the dead magic before the memory goes away - a grace period, not a guarantee. */
+ }
+ pThis->Owner = (pthread_t)~0;
+ pThis->cNestings = 0;
+#ifdef RTSEMMUTEX_STRICT
+ RTLockValidatorRecExclDelete(&pThis->ValidatorRec);
+#endif
+
+ /*
+ * Free the semaphore memory and be gone.
+ */
+ RTMemFree(pThis);
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Changes the lock validator sub-class of the mutex.
+ *
+ * @returns What RTLockValidatorRecExclSetSubClass() returns when
+ *          RTSEMMUTEX_STRICT is defined and the handle is valid, otherwise
+ *          RTLOCKVAL_SUB_CLASS_INVALID.
+ * @param   hMutexSem       The mutex handle.
+ * @param   uSubClass       The new sub-class.
+ */
+RTDECL(uint32_t) RTSemMutexSetSubClass(RTSEMMUTEX hMutexSem, uint32_t uSubClass)
+{
+#ifndef RTSEMMUTEX_STRICT
+    RT_NOREF(hMutexSem, uSubClass);
+    return RTLOCKVAL_SUB_CLASS_INVALID;
+#else
+    /* Validate the handle before touching the validator record. */
+    RTSEMMUTEXINTERNAL *pThis = hMutexSem;
+    AssertPtrReturn(pThis, RTLOCKVAL_SUB_CLASS_INVALID);
+    AssertReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, RTLOCKVAL_SUB_CLASS_INVALID);
+
+    return RTLockValidatorRecExclSetSubClass(&pThis->ValidatorRec, uSubClass);
+#endif
+}
+
+
+/**
+ * Common worker for the RTSemMutexRequest family.
+ *
+ * Handles recursion by the current owner, then tries the uncontended
+ * 0 -> 1 transition of the futex word and falls back on a futex wait loop
+ * that marks the word as contended (2).
+ *
+ * @returns VINF_SUCCESS when the mutex was acquired (or re-entered),
+ *          VERR_TIMEOUT when @a cMillies elapsed first, VERR_INTERRUPTED on
+ *          -EINTR when @a fAutoResume is false, VERR_SEM_DESTROYED if the
+ *          magic is no longer valid after the futex call returns,
+ *          VERR_INVALID_HANDLE on a bad handle, a lock validator failure
+ *          (RTSEMMUTEX_STRICT only), or the IPRT translation of an
+ *          unexpected futex errno.
+ * @param   hMutexSem       The mutex handle.
+ * @param   cMillies        Timeout in milliseconds, RT_INDEFINITE_WAIT for
+ *                          no timeout.
+ * @param   fAutoResume     Whether to continue waiting after -EINTR.
+ * @param   pSrcPos         Source position for the lock validator.  Only
+ *                          used when RTSEMMUTEX_STRICT is defined.
+ */
+DECL_FORCE_INLINE(int) rtSemMutexRequest(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies, bool fAutoResume, PCRTLOCKVALSRCPOS pSrcPos)
+{
+    RT_NOREF(pSrcPos);
+
+    /*
+     * Validate input.
+     */
+    struct RTSEMMUTEXINTERNAL *pThis = hMutexSem;
+    AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+    AssertReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, VERR_INVALID_HANDLE);
+
+    /*
+     * Check if nested request.
+     */
+    pthread_t Self = pthread_self();
+    if (   pThis->Owner == Self
+        && pThis->cNestings > 0)
+    {
+#ifdef RTSEMMUTEX_STRICT
+        int rc9 = RTLockValidatorRecExclRecursion(&pThis->ValidatorRec, pSrcPos);
+        if (RT_FAILURE(rc9))
+            return rc9;
+#endif
+        ASMAtomicIncU32(&pThis->cNestings);
+        return VINF_SUCCESS;
+    }
+
+#ifdef RTSEMMUTEX_STRICT
+    RTTHREAD hThreadSelf = RTThreadSelfAutoAdopt();
+    if (cMillies)
+    {
+        int rc9 = RTLockValidatorRecExclCheckOrder(&pThis->ValidatorRec, hThreadSelf, pSrcPos, cMillies);
+        if (RT_FAILURE(rc9))
+            return rc9;
+    }
+#else
+    RTTHREAD hThreadSelf = RTThreadSelf();
+#endif
+
+    /*
+     * Convert timeout value.
+     */
+    struct timespec ts;
+    struct timespec *pTimeout = NULL;
+    uint64_t u64End = 0; /* shut up gcc */
+    if (cMillies != RT_INDEFINITE_WAIT)
+    {
+        ts.tv_sec  = cMillies / 1000;
+        ts.tv_nsec = (cMillies % 1000) * UINT32_C(1000000);
+        u64End     = RTTimeSystemNanoTS() + cMillies * UINT64_C(1000000);
+        pTimeout   = &ts;
+    }
+
+    /*
+     * Lock the mutex.
+     * Optimize for the uncontended case (makes 1-2 ns difference).
+     */
+    if (RT_UNLIKELY(!ASMAtomicCmpXchgS32(&pThis->iState, 1, 0)))
+    {
+        for (;;)
+        {
+            int32_t iOld = ASMAtomicXchgS32(&pThis->iState, 2);
+
+            /*
+             * Was the lock released in the meantime? This is unlikely (but possible)
+             */
+            if (RT_UNLIKELY(iOld == 0))
+                break;
+
+            /*
+             * Go to sleep.
+             *
+             * NOTE(review): the blocking bookkeeping below is skipped both
+             * for zero timeouts and for indefinite waits (pTimeout is NULL
+             * then), while RTThreadUnblocked is always called.  Left as is;
+             * confirm whether indefinite waits were meant to be included.
+             */
+            if (pTimeout && ( pTimeout->tv_sec || pTimeout->tv_nsec ))
+            {
+#ifdef RTSEMMUTEX_STRICT
+                int rc9 = RTLockValidatorRecExclCheckBlocking(&pThis->ValidatorRec, hThreadSelf, pSrcPos, true,
+                                                              cMillies, RTTHREADSTATE_MUTEX, true);
+                if (RT_FAILURE(rc9))
+                    return rc9;
+#else
+                RTThreadBlocking(hThreadSelf, RTTHREADSTATE_MUTEX, true);
+#endif
+            }
+
+            long rc = sys_futex(&pThis->iState, FUTEX_WAIT, 2, pTimeout, NULL, 0);
+
+            RTThreadUnblocked(hThreadSelf, RTTHREADSTATE_MUTEX);
+            if (RT_UNLIKELY(pThis->u32Magic != RTSEMMUTEX_MAGIC))
+                return VERR_SEM_DESTROYED;
+
+            /*
+             * Act on the wakeup code.
+             */
+            if (rc == -ETIMEDOUT)
+            {
+                Assert(pTimeout);
+                return VERR_TIMEOUT;
+            }
+            if (rc == 0)
+                /* we'll leave the loop now unless another thread is faster */;
+            else if (rc == -EWOULDBLOCK)
+                /* retry with new value. */;
+            else if (rc == -EINTR)
+            {
+                if (!fAutoResume)
+                    return VERR_INTERRUPTED;
+            }
+            else
+            {
+                /* this shouldn't happen! */
+                AssertMsgFailed(("rc=%ld errno=%d\n", rc, errno));
+                /* rc holds the negated errno; the converter wants the
+                   positive native value. */
+                return RTErrConvertFromErrno((int)-rc);
+            }
+
+            /* adjust the relative timeout */
+            if (pTimeout)
+            {
+                int64_t i64Diff = u64End - RTTimeSystemNanoTS();
+                if (i64Diff < 1000)
+                {
+                    /*
+                     * Out of time.  We must not simply leave the loop here,
+                     * as the code after it assumes the lock is held.  Make
+                     * one last grab (we may just have consumed a wake-up
+                     * meant for the next owner) and otherwise report the
+                     * timeout.
+                     */
+                    if (ASMAtomicXchgS32(&pThis->iState, 2) == 0)
+                        break;
+                    return VERR_TIMEOUT;
+                }
+                ts.tv_sec  = (uint64_t)i64Diff / UINT32_C(1000000000);
+                ts.tv_nsec = (uint64_t)i64Diff % UINT32_C(1000000000);
+            }
+        }
+
+        /*
+         * When leaving this loop, iState is set to 2. This means that we gained the
+         * lock and there are _possibly_ some waiters. We don't know exactly as another
+         * thread might entered this loop at nearly the same time. Therefore we will
+         * call futex_wakeup once too often (if _no_ other thread entered this loop).
+         * The key problem is the simple futex_wait test for x != y (iState != 2) in
+         * our case).
+         */
+    }
+
+    /*
+     * Set the owner and nesting.
+     */
+    pThis->Owner = Self;
+    ASMAtomicWriteU32(&pThis->cNestings, 1);
+#ifdef RTSEMMUTEX_STRICT
+    RTLockValidatorRecExclSetOwner(&pThis->ValidatorRec, hThreadSelf, pSrcPos, true);
+#endif
+    return VINF_SUCCESS;
+}
+
+
+#undef RTSemMutexRequest
+/**
+ * Requests ownership of the mutex, resuming the wait after interruptions.
+ *
+ * @returns Status code of rtSemMutexRequest() (called with fAutoResume set).
+ * @param   hMutexSem       The mutex handle.
+ * @param   cMillies        Timeout in milliseconds or RT_INDEFINITE_WAIT.
+ */
+RTDECL(int) RTSemMutexRequest(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies)
+{
+#ifdef RTSEMMUTEX_STRICT
+    RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API();
+    int rc = rtSemMutexRequest(hMutexSem, cMillies, true, &SrcPos);
+#else
+    int rc = rtSemMutexRequest(hMutexSem, cMillies, true, NULL);
+#endif
+    /* Auto-resume was requested, so interruptions must not surface. */
+    Assert(rc != VERR_INTERRUPTED);
+    return rc;
+}
+
+
+/**
+ * Debug flavour of RTSemMutexRequest.
+ *
+ * The source position record comes from RTLOCKVALSRCPOS_INIT_DEBUG_API()
+ * (which presumably picks up @a uId and the RT_SRC_POS_DECL arguments - they
+ * are not referenced anywhere else here).
+ *
+ * @returns Status code of rtSemMutexRequest() (called with fAutoResume set).
+ */
+RTDECL(int) RTSemMutexRequestDebug(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies, RTHCUINTPTR uId, RT_SRC_POS_DECL)
+{
+    RTLOCKVALSRCPOS DbgSrcPos = RTLOCKVALSRCPOS_INIT_DEBUG_API();
+    int rc = rtSemMutexRequest(hMutexSem, cMillies, true, &DbgSrcPos);
+    /* Auto-resume was requested, so interruptions must not surface. */
+    Assert(rc != VERR_INTERRUPTED);
+    return rc;
+}
+
+
+#undef RTSemMutexRequestNoResume
+/**
+ * Requests ownership of the mutex without resuming interrupted waits.
+ *
+ * @returns Status code of rtSemMutexRequest() (called with fAutoResume
+ *          clear, so VERR_INTERRUPTED is possible).
+ * @param   hMutexSem       The mutex handle.
+ * @param   cMillies        Timeout in milliseconds or RT_INDEFINITE_WAIT.
+ */
+RTDECL(int) RTSemMutexRequestNoResume(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies)
+{
+#ifdef RTSEMMUTEX_STRICT
+    RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API();
+    return rtSemMutexRequest(hMutexSem, cMillies, false, &SrcPos);
+#else
+    return rtSemMutexRequest(hMutexSem, cMillies, false, NULL);
+#endif
+}
+
+
+/**
+ * Debug flavour of RTSemMutexRequestNoResume.
+ *
+ * The source position record comes from RTLOCKVALSRCPOS_INIT_DEBUG_API()
+ * (which presumably picks up @a uId and the RT_SRC_POS_DECL arguments - they
+ * are not referenced anywhere else here).
+ *
+ * @returns Status code of rtSemMutexRequest() (called with fAutoResume
+ *          clear, so VERR_INTERRUPTED is possible).
+ */
+RTDECL(int) RTSemMutexRequestNoResumeDebug(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies, RTHCUINTPTR uId, RT_SRC_POS_DECL)
+{
+    RTLOCKVALSRCPOS DbgSrcPos = RTLOCKVALSRCPOS_INIT_DEBUG_API();
+    return rtSemMutexRequest(hMutexSem, cMillies, false, &DbgSrcPos);
+}
+
+
+RTDECL(int) RTSemMutexRelease(RTSEMMUTEX hMutexSem)
+{
+ /*
+ * Releases one level of ownership of the mutex; the mutex itself is
+ * only unlocked when the last nesting is popped.
+ *
+ * Validate input (VERR_INVALID_HANDLE on bad pointer or magic).
+ */
+ struct RTSEMMUTEXINTERNAL *pThis = hMutexSem;
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, VERR_INVALID_HANDLE);
+
+#ifdef RTSEMMUTEX_STRICT
+ int rc9 = RTLockValidatorRecExclReleaseOwner(&pThis->ValidatorRec, pThis->cNestings == 1);
+ if (RT_FAILURE(rc9))
+ return rc9;
+#endif
+
+ /*
+ * Check if nested.
+ *
+ * This is really the ownership check: the caller must be the recorded
+ * owner and hold at least one nesting, else VERR_NOT_OWNER.
+ */
+ pthread_t Self = pthread_self();
+ if (RT_UNLIKELY( pThis->Owner != Self
+ || pThis->cNestings == 0))
+ {
+ AssertMsgFailed(("Not owner of mutex %p!! Self=%08x Owner=%08x cNestings=%d\n",
+ pThis, Self, pThis->Owner, pThis->cNestings));
+ return VERR_NOT_OWNER;
+ }
+
+ /*
+ * If nested we'll just pop a nesting.
+ */
+ if (pThis->cNestings > 1)
+ {
+ ASMAtomicDecU32(&pThis->cNestings);
+ return VINF_SUCCESS;
+ }
+
+ /*
+ * Clear the state. (cNestings == 1)
+ *
+ * Owner and nesting are reset before the futex word is released, i.e.
+ * while this thread still holds the lock.
+ */
+ pThis->Owner = (pthread_t)~0;
+ ASMAtomicWriteU32(&pThis->cNestings, 0);
+
+ /*
+ * Release the mutex.
+ *
+ * Decrementing 1 (locked, no waiters) gives 0 and we are done.  Any
+ * other result means the word was 2 (contended): force it to 0 and
+ * wake a single waiter.
+ */
+ int32_t iNew = ASMAtomicDecS32(&pThis->iState);
+ if (RT_UNLIKELY(iNew != 0))
+ {
+ /* somebody is waiting, try wake up one of them. */
+ ASMAtomicXchgS32(&pThis->iState, 0);
+ (void)sys_futex(&pThis->iState, FUTEX_WAKE, 1, NULL, NULL, 0);
+ }
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Checks whether the mutex currently has an owner (any thread, not
+ * necessarily the caller).
+ *
+ * @returns true if an owner is recorded, false if not or if the handle is
+ *          invalid.
+ * @param   hMutexSem       The mutex handle.
+ */
+RTDECL(bool) RTSemMutexIsOwned(RTSEMMUTEX hMutexSem)
+{
+    /* Validate the handle. */
+    RTSEMMUTEXINTERNAL *pThis = hMutexSem;
+    AssertPtrReturn(pThis, false);
+    AssertReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, false);
+
+    /* (pthread_t)~0 is the "no owner" marker used throughout this file. */
+    bool const fHasOwner = pThis->Owner != (pthread_t)~0;
+    return fHasOwner;
+}
+
diff --git a/src/VBox/Runtime/r3/linux/semwait-linux.h b/src/VBox/Runtime/r3/linux/semwait-linux.h
new file mode 100644
index 00000000..0f533845
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/semwait-linux.h
@@ -0,0 +1,233 @@
+/* $Id: semwait-linux.h $ */
+/** @file
+ * IPRT - Common semaphore wait code, Linux.
+ */
+
+/*
+ * Copyright (C) 2021-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+#ifndef IPRT_INCLUDED_SRC_r3_linux_semwait_linux_h
+#define IPRT_INCLUDED_SRC_r3_linux_semwait_linux_h
+#ifndef RT_WITHOUT_PRAGMA_ONCE
+# pragma once
+#endif
+
+
+/* With 2.6.17 futex.h has become C++ unfriendly, so define the bits we need. */
+#define FUTEX_WAIT 0
+#define FUTEX_WAKE 1
+#define FUTEX_WAIT_BITSET 9 /**< @since 2.6.25 - uses absolute timeout. */
+
+
+/**
+ * Thin wrapper around the raw futex system call.
+ *
+ * @returns The syscall result when it is non-negative, otherwise the negated
+ *          errno value.
+ * @param   uaddr   The futex word.
+ * @param   op      The futex operation (FUTEX_XXX).
+ * @param   val     Operation specific value.
+ * @param   utime   Operation specific timeout, optional.
+ * @param   uaddr2  Operation specific second address, optional.
+ * @param   val3    Operation specific third value.
+ */
+DECLINLINE(long) sys_futex(uint32_t volatile *uaddr, int op, int val, struct timespec *utime, int32_t *uaddr2, int val3)
+{
+    errno = 0;
+    long const lResult = syscall(__NR_futex, uaddr, op, val, utime, uaddr2, val3);
+    if (lResult >= 0)
+        return lResult;
+
+    /* Failure: translate the errno style error into a negative return value. */
+    Assert(lResult == -1);
+    return -errno;
+}
+
+
+/**
+ * Probes whether the kernel supports FUTEX_WAIT_BITSET and stores the result.
+ *
+ * The probe waits on a local variable holding UINT32_MAX while passing a
+ * different expected value, so the call returns immediately: -EAGAIN is taken
+ * to mean the operation is available, anything else (-ENOSYS is the only other
+ * status expected) that it is not.
+ *
+ * @param   pfCanUseWaitBitSet  Where to store the result (1 / 0).
+ */
+DECL_NO_INLINE(static, void) rtSemLinuxCheckForFutexWaitBitSetSlow(int volatile *pfCanUseWaitBitSet)
+{
+    uint32_t uTestVar = UINT32_MAX;
+    long rc = sys_futex(&uTestVar, FUTEX_WAIT_BITSET, UINT32_C(0xf0f0f0f0), NULL, NULL, UINT32_MAX);
+    *pfCanUseWaitBitSet = rc == -EAGAIN;
+    /* rc is a long, so it must be formatted with %ld rather than %d. */
+    AssertMsg(rc == -ENOSYS || rc == -EAGAIN, ("%ld\n", rc));
+}
+
+
+/**
+ * Makes sure the FUTEX_WAIT_BITSET availability indicator has been initialized,
+ * running the slow probe only while it still holds -1.
+ *
+ * @param   pfCanUseWaitBitSet  The indicator; -1 means not yet probed.
+ */
+DECLINLINE(void) rtSemLinuxCheckForFutexWaitBitSet(int volatile *pfCanUseWaitBitSet)
+{
+    /* The common case is that the probe has already been done. */
+    if (*pfCanUseWaitBitSet == -1)
+        rtSemLinuxCheckForFutexWaitBitSetSlow(pfCanUseWaitBitSet);
+}
+
+
+/**
+ * Converts an extended wait timeout specification to a timespec and the
+ * corresponding futex operation, as well as an approximate relative nanosecond
+ * interval.
+ *
+ * @note    This does not check for RTSEMWAIT_FLAGS_INDEFINITE, caller should've
+ *          done that already.
+ * @note    When 0 or UINT64_MAX is returned, the output parameters have not
+ *          necessarily been written.
+ *
+ * @returns The relative wait in nanoseconds.  0 for a poll call, UINT64_MAX for
+ *          an effectively indefinite wait.  When FUTEX_WAIT_BITSET is selected
+ *          an arbitrary non-zero value is returned instead of the real
+ *          interval.
+ * @param   fFlags              RTSEMWAIT_FLAGS_XXX.
+ * @param   uTimeout            The timeout.
+ * @param   fCanUseWaitBitSet   Whether we can use FUTEX_WAIT_BITSET or not.
+ * @param   pDeadline           Where to return the deadline.
+ * @param   piWaitOp            Where to return the FUTEX wait operation number.
+ * @param   puWaitVal3          Where to return the FUTEX wait value 3.
+ * @param   pnsAbsTimeout       Where to return the absolute timeout in case of
+ *                              a resuming relative call (i.e. FUTEX_WAIT).
+ */
+DECL_FORCE_INLINE(uint64_t)
+rtSemLinuxCalcDeadline(uint32_t fFlags, uint64_t uTimeout, int fCanUseWaitBitSet,
+                       struct timespec *pDeadline, int *piWaitOp, uint32_t *puWaitVal3, uint64_t *pnsAbsTimeout)
+{
+    Assert(!(fFlags & RTSEMWAIT_FLAGS_INDEFINITE));
+
+    if (fFlags & RTSEMWAIT_FLAGS_RELATIVE)
+    {
+        Assert(!(fFlags & RTSEMWAIT_FLAGS_ABSOLUTE));
+
+        /*
+         * Polling call?
+         */
+        if (uTimeout == 0)
+            return 0;
+
+        /*
+         * We use FUTEX_WAIT here as it takes a relative timespec.
+         *
+         * Note! For non-resuming waits, we can skip calculating the absolute
+         *       time ASSUMING it is only needed for timeout adjustments
+         *       after an -EINTR return.
+         */
+        if (fFlags & RTSEMWAIT_FLAGS_MILLISECS)
+        {
+            if (   sizeof(pDeadline->tv_sec) >= sizeof(uint64_t)
+                || uTimeout < (uint64_t)UINT32_MAX * RT_MS_1SEC)
+            {
+                pDeadline->tv_sec  = uTimeout / RT_MS_1SEC;
+                /* Convert the sub-second milliseconds to nanoseconds (multiply, not bitwise AND). */
+                pDeadline->tv_nsec = (uTimeout % RT_MS_1SEC) * RT_NS_1MS;
+                uTimeout *= RT_NS_1MS;
+            }
+            else
+                return UINT64_MAX;
+        }
+        else
+        {
+            Assert(fFlags & RTSEMWAIT_FLAGS_NANOSECS);
+            if (   sizeof(pDeadline->tv_sec) >= sizeof(uint64_t)
+                || uTimeout < (uint64_t)UINT32_MAX * RT_NS_1SEC)
+            {
+                pDeadline->tv_sec  = uTimeout / RT_NS_1SEC;
+                pDeadline->tv_nsec = uTimeout % RT_NS_1SEC;
+            }
+            else
+                return UINT64_MAX;
+        }
+
+#ifdef RT_STRICT
+        if (!(fFlags & RTSEMWAIT_FLAGS_RESUME))
+            *pnsAbsTimeout = uTimeout;
+        else
+#endif
+            *pnsAbsTimeout = RTTimeNanoTS() + uTimeout; /* Note! only relevant for relative waits (FUTEX_WAIT). */
+    }
+    else
+    {
+        /* Absolute deadline: */
+        Assert(fFlags & RTSEMWAIT_FLAGS_ABSOLUTE);
+        if (fCanUseWaitBitSet == true)
+        {
+            /*
+             * Use FUTEX_WAIT_BITSET as it takes an absolute deadline.
+             */
+            if (fFlags & RTSEMWAIT_FLAGS_MILLISECS)
+            {
+                if (   sizeof(pDeadline->tv_sec) >= sizeof(uint64_t)
+                    || uTimeout < (uint64_t)UINT32_MAX * RT_MS_1SEC)
+                {
+                    pDeadline->tv_sec  = uTimeout / RT_MS_1SEC;
+                    /* Convert the sub-second milliseconds to nanoseconds (multiply, not bitwise AND). */
+                    pDeadline->tv_nsec = (uTimeout % RT_MS_1SEC) * RT_NS_1MS;
+                }
+                else
+                    return UINT64_MAX;
+            }
+            else
+            {
+                Assert(fFlags & RTSEMWAIT_FLAGS_NANOSECS);
+                if (   sizeof(pDeadline->tv_sec) >= sizeof(uint64_t)
+                    || uTimeout < (uint64_t)UINT32_MAX * RT_NS_1SEC)
+                {
+                    pDeadline->tv_sec  = uTimeout / RT_NS_1SEC;
+                    pDeadline->tv_nsec = uTimeout % RT_NS_1SEC;
+                }
+                else
+                    return UINT64_MAX;
+            }
+            /* NOTE(review): for RTSEMWAIT_FLAGS_MILLISECS this stores the value in
+               milliseconds, unconverted - confirm no caller relies on it here. */
+            *pnsAbsTimeout = uTimeout;
+            *piWaitOp      = FUTEX_WAIT_BITSET;
+            *puWaitVal3    = UINT32_MAX;
+            return RT_MS_1SEC; /* Whatever non-zero; Whole point is not calling RTTimeNanoTS() in this path. */
+        }
+
+        /*
+         * FUTEX_WAIT_BITSET is not available, so use FUTEX_WAIT with a
+         * relative timeout.
+         */
+        if (fFlags & RTSEMWAIT_FLAGS_MILLISECS)
+        {
+            if (uTimeout < UINT64_MAX / RT_NS_1MS)
+                uTimeout *= RT_NS_1MS;
+            else
+                return UINT64_MAX;
+        }
+
+        uint64_t const u64Now = RTTimeNanoTS();
+        if (u64Now < uTimeout)
+        {
+            *pnsAbsTimeout = uTimeout;
+            uTimeout -= u64Now;
+        }
+        else
+            return 0; /* The deadline has already passed. */
+
+        if (   sizeof(pDeadline->tv_sec) >= sizeof(uint64_t)
+            || uTimeout < (uint64_t)UINT32_MAX * RT_NS_1SEC)
+        {
+            pDeadline->tv_sec  = uTimeout / RT_NS_1SEC;
+            pDeadline->tv_nsec = uTimeout % RT_NS_1SEC;
+        }
+        else
+            return UINT64_MAX;
+    }
+
+    *piWaitOp   = FUTEX_WAIT;
+    *puWaitVal3 = 0;
+    return uTimeout;
+}
+
+#endif /* !IPRT_INCLUDED_SRC_r3_linux_semwait_linux_h */
+
diff --git a/src/VBox/Runtime/r3/linux/sysfs.cpp b/src/VBox/Runtime/r3/linux/sysfs.cpp
new file mode 100644
index 00000000..6324fe00
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/sysfs.cpp
@@ -0,0 +1,736 @@
+/* $Id: sysfs.cpp $ */
+/** @file
+ * IPRT - Linux sysfs access.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_SYSTEM
+#include <iprt/assert.h>
+#include <iprt/dir.h>
+#include <iprt/err.h>
+#include <iprt/file.h>
+#include <iprt/fs.h>
+#include <iprt/param.h>
+#include <iprt/path.h>
+#include <iprt/string.h>
+#include <iprt/symlink.h>
+
+#include <iprt/linux/sysfs.h>
+
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/fcntl.h>
+#include <sys/sysmacros.h>
+#include <errno.h>
+
+
+
+/**
+ * Constructs the path of a sysfs file from the format parameters passed,
+ * prepending a prefix if the path is relative.
+ *
+ * @returns IPRT status code.
+ * @retval  VERR_INVALID_PARAMETER if the prefix is empty or does not end with
+ *          a slash, or if the buffer is too small for the prefix.
+ * @retval  VERR_BUFFER_OVERFLOW if the formatted path (with prefix) does not
+ *          fit into the buffer.
+ * @param   pszBuf     Where to write the path.  Must be larger than the
+ *                     prefix plus one character.
+ * @param   cchBuf     The size of the buffer pointed to by @a pszBuf.
+ * @param   pszPrefix  The prefix to prepend if the path is relative.  Must be
+ *                     non-empty and end in '/'.
+ * @param   pszFormat  The name format, either absolute or relative to the
+ *                     prefix specified by @a pszPrefix.
+ * @param   va         The format args.
+ */
+static int rtLinuxConstructPathV(char *pszBuf, size_t cchBuf,
+                                 const char *pszPrefix,
+                                 const char *pszFormat, va_list va)
+{
+    size_t const cchPrefix = strlen(pszPrefix);
+    /* An empty prefix would make the [cchPrefix - 1] access below read out of bounds. */
+    AssertReturn(cchPrefix > 0, VERR_INVALID_PARAMETER);
+    AssertReturn(pszPrefix[cchPrefix - 1] == '/', VERR_INVALID_PARAMETER);
+    AssertReturn(cchBuf > cchPrefix + 1, VERR_INVALID_PARAMETER);
+
+    /* Format the name first; a negative return means it did not fit. */
+    ssize_t cch = RTStrPrintf2V(pszBuf, cchBuf, pszFormat, va);
+    AssertReturn(cch >= 0, VERR_BUFFER_OVERFLOW);
+
+    /* Relative result: shift it (including the terminator) up and insert the prefix in front. */
+    if (*pszBuf != '/')
+    {
+        AssertReturn(cchBuf >= (size_t)cch + cchPrefix + 1, VERR_BUFFER_OVERFLOW);
+        memmove(pszBuf + cchPrefix, pszBuf, (size_t)cch + 1);
+        memcpy(pszBuf, pszPrefix, cchPrefix);
+    }
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Variadic front end for rtLinuxConstructPathV: builds a path from a format
+ * string, prepending @a pszPrefix when the result is relative.
+ *
+ * @returns IPRT status code from rtLinuxConstructPathV.
+ * @param   pszBuf     Where to write the path.
+ * @param   cchBuf     The size of the buffer pointed to by @a pszBuf.
+ * @param   pszPrefix  The prefix to prepend if the path is relative.  Must end
+ *                     in '/'.
+ * @param   pszFormat  The name format, either absolute or relative to
+ *                     @a pszPrefix.
+ * @param   ...        The format args.
+ */
+DECLINLINE(int) rtLinuxConstructPath(char *pszBuf, size_t cchBuf,
+                                     const char *pszPrefix,
+                                     const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = rtLinuxConstructPathV(pszBuf, cchBuf, pszPrefix, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Builds a sysfs path from a format string, using "/sys/" as the prefix for
+ * relative results.
+ *
+ * @returns IPRT status code from rtLinuxConstructPathV.
+ * @param   pszBuf     Where to write the path.
+ * @param   cchBuf     The size of the buffer pointed to by @a pszBuf.
+ * @param   pszFormat  The name format, either absolute or relative to "/sys/".
+ * @param   va         The format args.
+ */
+DECLINLINE(int) rtLinuxSysFsConstructPath(char *pszBuf, size_t cchBuf, const char *pszFormat, va_list va)
+{
+    static const char s_szSysFsPrefix[] = "/sys/";
+    return rtLinuxConstructPathV(pszBuf, cchBuf, s_szSysFsPrefix, pszFormat, va);
+}
+
+
+/**
+ * Public va_list entry point for constructing a sysfs path; relative results
+ * get "/sys/" prepended by rtLinuxSysFsConstructPath.
+ */
+RTDECL(int) RTLinuxConstructPathV(char *pszPath, size_t cbPath, const char *pszFormat, va_list va)
+{
+    int const rcRet = rtLinuxSysFsConstructPath(pszPath, cbPath, pszFormat, va);
+    return rcRet;
+}
+
+
+/**
+ * Variadic entry point for constructing a sysfs path; see
+ * rtLinuxSysFsConstructPath for the prefix handling.
+ */
+RTDECL(int) RTLinuxConstructPath(char *pszPath, size_t cbPath, const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = rtLinuxSysFsConstructPath(pszPath, cbPath, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Checks whether a sysfs object exists by constructing its path and calling
+ * stat() on it.  The caller's errno value is preserved.
+ *
+ * @returns IPRT status code; a stat() failure is converted from errno.
+ * @param   pszFormat  The name format, either absolute or relative to "/sys/".
+ * @param   va         The format args.
+ */
+RTDECL(int) RTLinuxSysFsExistsExV(const char *pszFormat, va_list va)
+{
+    int const iErrnoOnEntry = errno;
+
+    char szPath[RTPATH_MAX];
+    int  rc = rtLinuxSysFsConstructPath(szPath, sizeof(szPath), pszFormat, va);
+    if (RT_SUCCESS(rc))
+    {
+        struct stat StatBuf;
+        if (stat(szPath, &StatBuf) != 0)
+            rc = RTErrConvertFromErrno(errno);
+    }
+
+    /* Restore errno so the probe leaves no trace for the caller. */
+    errno = iErrnoOnEntry;
+    return rc;
+}
+
+
+/**
+ * Boolean flavour of RTLinuxSysFsExistsExV.
+ *
+ * @returns true if RTLinuxSysFsExistsExV succeeds, false otherwise.
+ */
+RTDECL(bool) RTLinuxSysFsExistsV(const char *pszFormat, va_list va)
+{
+    int const rc = RTLinuxSysFsExistsExV(pszFormat, va);
+    return RT_SUCCESS(rc);
+}
+
+
+/**
+ * Variadic wrapper around RTLinuxSysFsExistsExV.
+ */
+RTDECL(int) RTLinuxSysFsExistsEx(const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = RTLinuxSysFsExistsExV(pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Variadic wrapper around RTLinuxSysFsExistsV.
+ */
+RTDECL(bool) RTLinuxSysFsExists(const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    bool const fExists = RTLinuxSysFsExistsV(pszFormat, vaArgs);
+    va_end(vaArgs);
+    return fExists;
+}
+
+
+/**
+ * Opens a sysfs file for reading (RTFILE_O_OPEN | RTFILE_O_READ |
+ * RTFILE_O_DENY_NONE).
+ *
+ * @returns IPRT status code from path construction or RTFileOpen.
+ * @param   phFile     Where to return the file handle.
+ * @param   pszFormat  The name format, either absolute or relative to "/sys/".
+ * @param   va         The format args.
+ */
+RTDECL(int) RTLinuxSysFsOpenV(PRTFILE phFile, const char *pszFormat, va_list va)
+{
+    char szPath[RTPATH_MAX];
+    int  rc = rtLinuxSysFsConstructPath(szPath, sizeof(szPath), pszFormat, va);
+    if (RT_FAILURE(rc))
+        return rc;
+    return RTFileOpen(phFile, szPath, RTFILE_O_OPEN | RTFILE_O_READ | RTFILE_O_DENY_NONE);
+}
+
+
+/**
+ * Opens a sysfs file with caller supplied open flags.
+ *
+ * @returns IPRT status code from path construction or RTFileOpen.
+ * @param   phFile     Where to return the file handle.
+ * @param   fOpen      The open flags handed to RTFileOpen unchanged.
+ * @param   pszFormat  The name format, either absolute or relative to "/sys/".
+ * @param   va         The format args.
+ */
+RTDECL(int) RTLinuxSysFsOpenExV(PRTFILE phFile, uint64_t fOpen, const char *pszFormat, va_list va)
+{
+    char szPath[RTPATH_MAX];
+    int  rc = rtLinuxSysFsConstructPath(szPath, sizeof(szPath), pszFormat, va);
+    if (RT_FAILURE(rc))
+        return rc;
+    return RTFileOpen(phFile, szPath, fOpen);
+}
+
+
+/**
+ * Variadic wrapper around RTLinuxSysFsOpenV.
+ */
+RTDECL(int) RTLinuxSysFsOpen(PRTFILE phFile, const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = RTLinuxSysFsOpenV(phFile, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Variadic wrapper around RTLinuxSysFsOpenExV.
+ */
+RTDECL(int) RTLinuxSysFsOpenEx(PRTFILE phFile, uint64_t fOpen, const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = RTLinuxSysFsOpenExV(phFile, fOpen, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Reads a zero terminated string from an open sysfs file.
+ *
+ * A read that fills the whole buffer is treated as truncation: the file
+ * position is moved back by one byte, the last byte is replaced by the
+ * terminator and VERR_BUFFER_OVERFLOW is returned.
+ *
+ * @returns IPRT status code.
+ * @param   hFile     The file to read from.
+ * @param   pszBuf    Where to store the string.
+ * @param   cchBuf    Size of the buffer.
+ * @param   pcchRead  Where to return the string length (excluding the
+ *                    terminator), optional.
+ */
+RTDECL(int) RTLinuxSysFsReadStr(RTFILE hFile, char *pszBuf, size_t cchBuf, size_t *pcchRead)
+{
+    Assert(cchBuf > 1); /* not mandatory */
+
+    size_t cchRead;
+    int    rc = RTFileRead(hFile, pszBuf, cchBuf, &cchRead);
+    if (RT_FAILURE(rc))
+    {
+        /* Read error: hand back an empty string. */
+        if (cchBuf > 0)
+            *pszBuf = '\0';
+        cchRead = 0;
+    }
+    else if (cchRead < cchBuf)
+    {
+        /* Short read; ASSUME we have reached the end of the file. */
+        pszBuf[cchRead] = '\0';
+    }
+    else if (!cchBuf)
+        rc = VERR_BUFFER_OVERFLOW;
+    else
+    {
+        /* The buffer is full: give back the byte we are about to overwrite. */
+        rc = RTFileSeek(hFile, -1, RTFILE_SEEK_CURRENT, NULL);
+        if (RT_SUCCESS(rc))
+            rc = VERR_BUFFER_OVERFLOW;
+        cchRead = cchBuf - 1;
+        pszBuf[cchRead] = '\0';
+    }
+
+    if (pcchRead)
+        *pcchRead = cchRead;
+    return rc;
+}
+
+
+/**
+ * Writes a string to an open sysfs file.
+ *
+ * @returns IPRT status code from RTFileWrite.
+ * @param   hFile        The file to write to.
+ * @param   pszBuf       The string to write.
+ * @param   cchBuf       Number of bytes to write; 0 means the whole string
+ *                       including its terminator.
+ * @param   pcchWritten  Passed on to RTFileWrite.
+ */
+RTDECL(int) RTLinuxSysFsWriteStr(RTFILE hFile, const char *pszBuf, size_t cchBuf, size_t *pcchWritten)
+{
+    size_t const cbToWrite = cchBuf ? cchBuf : strlen(pszBuf) + 1 /* include the terminator */;
+    return RTFileWrite(hFile, pszBuf, cbToWrite, pcchWritten);
+}
+
+
+/**
+ * Reads raw bytes from an open sysfs file and reports VERR_BUFFER_OVERFLOW if
+ * the buffer was filled and more data follows.
+ *
+ * When the buffer is filled completely, one extra byte is read to probe for
+ * end-of-file, after which the file position is restored.
+ *
+ * @returns IPRT status code.
+ * @param   hFile    The file to read from.
+ * @param   pvBuf    Where to store the data.
+ * @param   cbBuf    Size of the buffer.
+ * @param   pcbRead  Where to return the number of bytes read, optional.
+ */
+RTDECL(int) RTLinuxSysFsReadFile(RTFILE hFile, void *pvBuf, size_t cbBuf, size_t *pcbRead)
+{
+    size_t cbRead = 0;
+    int    rc = RTFileRead(hFile, pvBuf, cbBuf, &cbRead);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    if (pcbRead)
+        *pcbRead = cbRead;
+    if (cbRead < cbBuf)
+        return VINF_SUCCESS;
+
+    /* Buffer completely filled: remember the position and probe for EOF. */
+    uint64_t offSaved = 0;
+    rc = RTFileSeek(hFile, 0, RTFILE_SEEK_CURRENT, &offSaved);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    uint8_t bProbe;
+    int const rcProbe = RTFileRead(hFile, &bProbe, 1, NULL);
+    if (RT_SUCCESS(rcProbe))
+    {
+        /* There is more data; undo the probe read before reporting the overflow. */
+        int const rcSeek = RTFileSeek(hFile, offSaved, RTFILE_SEEK_BEGIN, NULL);
+        return RT_FAILURE(rcSeek) ? rcSeek : VERR_BUFFER_OVERFLOW;
+    }
+    if (rcProbe != VERR_EOF)
+        return rcProbe;
+    return rc;
+}
+
+
+/**
+ * Writes raw bytes to an open sysfs file; a plain pass-through to RTFileWrite.
+ */
+RTDECL(int) RTLinuxSysFsWriteFile(RTFILE hFile, void *pvBuf, size_t cbBuf, size_t *pcbWritten)
+{
+    int const rcRet = RTFileWrite(hFile, pvBuf, cbBuf, pcbWritten);
+    return rcRet;
+}
+
+
+/**
+ * Reads a sysfs file and converts its content to a signed 64-bit integer.
+ *
+ * @returns IPRT status code; VERR_INVALID_PARAMETER if the file is empty,
+ *          otherwise whatever opening, reading or RTStrToInt64Ex returns.
+ * @param   uBase      The number base handed to RTStrToInt64Ex.
+ * @param   pi64       Where to return the value (only written on success).
+ * @param   pszFormat  The name format, either absolute or relative to "/sys/".
+ * @param   va         The format args.
+ */
+RTDECL(int) RTLinuxSysFsReadIntFileV(unsigned uBase, int64_t *pi64, const char *pszFormat, va_list va)
+{
+    AssertPtrReturn(pi64, VERR_INVALID_POINTER);
+
+    RTFILE hFile;
+    int    rc = RTLinuxSysFsOpenV(&hFile, pszFormat, va);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    char   szValue[128];
+    size_t cchValue;
+    rc = RTLinuxSysFsReadStr(hFile, szValue, sizeof(szValue), &cchValue);
+    if (RT_SUCCESS(rc))
+    {
+        if (cchValue == 0)
+            rc = VERR_INVALID_PARAMETER;
+        else
+        {
+            int64_t i64Parsed = -1;
+            rc = RTStrToInt64Ex(szValue, NULL, uBase, &i64Parsed);
+            if (RT_SUCCESS(rc))
+                *pi64 = i64Parsed;
+        }
+    }
+
+    RTFileClose(hFile);
+    return rc;
+}
+
+
+/**
+ * Variadic wrapper around RTLinuxSysFsReadIntFileV.
+ */
+RTDECL(int) RTLinuxSysFsReadIntFile(unsigned uBase, int64_t *pi64, const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = RTLinuxSysFsReadIntFileV(uBase, pi64, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Writes an 8-bit unsigned value to a sysfs file by widening it and deferring
+ * to RTLinuxSysFsWriteU64FileV.
+ */
+RTDECL(int) RTLinuxSysFsWriteU8FileV(unsigned uBase, uint8_t u8, const char *pszFormat, va_list va)
+{
+    uint64_t const uValue = u8;
+    return RTLinuxSysFsWriteU64FileV(uBase, uValue, pszFormat, va);
+}
+
+
+/**
+ * Variadic flavour of RTLinuxSysFsWriteU8FileV; forwards straight to
+ * RTLinuxSysFsWriteU64FileV.
+ */
+RTDECL(int) RTLinuxSysFsWriteU8File(unsigned uBase, uint8_t u8, const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = RTLinuxSysFsWriteU64FileV(uBase, u8, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Writes a 16-bit unsigned value to a sysfs file by widening it and deferring
+ * to RTLinuxSysFsWriteU64FileV.
+ */
+RTDECL(int) RTLinuxSysFsWriteU16FileV(unsigned uBase, uint16_t u16, const char *pszFormat, va_list va)
+{
+    uint64_t const uValue = u16;
+    return RTLinuxSysFsWriteU64FileV(uBase, uValue, pszFormat, va);
+}
+
+
+/**
+ * Variadic flavour of RTLinuxSysFsWriteU16FileV; forwards straight to
+ * RTLinuxSysFsWriteU64FileV.
+ */
+RTDECL(int) RTLinuxSysFsWriteU16File(unsigned uBase, uint16_t u16, const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = RTLinuxSysFsWriteU64FileV(uBase, u16, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Writes a 32-bit unsigned value to a sysfs file by widening it and deferring
+ * to RTLinuxSysFsWriteU64FileV.
+ */
+RTDECL(int) RTLinuxSysFsWriteU32FileV(unsigned uBase, uint32_t u32, const char *pszFormat, va_list va)
+{
+    uint64_t const uValue = u32;
+    return RTLinuxSysFsWriteU64FileV(uBase, uValue, pszFormat, va);
+}
+
+
+/**
+ * Variadic flavour of RTLinuxSysFsWriteU32FileV; forwards straight to
+ * RTLinuxSysFsWriteU64FileV.
+ */
+RTDECL(int) RTLinuxSysFsWriteU32File(unsigned uBase, uint32_t u32, const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = RTLinuxSysFsWriteU64FileV(uBase, u32, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Formats a 64-bit unsigned value in the given base and writes it to a sysfs
+ * file.
+ *
+ * @returns IPRT status code.  VERR_INVALID_PARAMETER for an unsupported base
+ *          or an empty formatting result, VERR_BUFFER_OVERFLOW if the write
+ *          was short.
+ * @param   uBase      The base: 8, 10 or 16.
+ * @param   u64        The value to write.
+ * @param   pszFormat  The name format, either absolute or relative to "/sys/".
+ * @param   va         The format args.
+ */
+RTDECL(int) RTLinuxSysFsWriteU64FileV(unsigned uBase, uint64_t u64, const char *pszFormat, va_list va)
+{
+    /* Pick the number format for the requested base. */
+    const char *pszNumFmt;
+    if (uBase == 8)
+        pszNumFmt = "%#llo";
+    else if (uBase == 10)
+        pszNumFmt = "%llu";
+    else if (uBase == 16)
+        pszNumFmt = "%#llx";
+    else
+        return VERR_INVALID_PARAMETER;
+
+    RTFILE hFile;
+    int    rc = RTLinuxSysFsOpenExV(&hFile, RTFILE_O_OPEN | RTFILE_O_WRITE | RTFILE_O_DENY_NONE, pszFormat, va);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    char         szValue[128];
+    size_t const cchValue = RTStrPrintf(szValue, sizeof(szValue), pszNumFmt, u64);
+    if (cchValue == 0)
+        rc = VERR_INVALID_PARAMETER;
+    else
+    {
+        size_t cbWritten = 0;
+        rc = RTLinuxSysFsWriteStr(hFile, &szValue[0], cchValue, &cbWritten);
+        if (RT_SUCCESS(rc) && cbWritten != cchValue)
+            rc = VERR_BUFFER_OVERFLOW;
+    }
+
+    RTFileClose(hFile);
+    return rc;
+}
+
+
+/**
+ * Variadic wrapper around RTLinuxSysFsWriteU64FileV.
+ *
+ * NOTE(review): the @a u64 parameter is declared as uint32_t here, so values
+ * above UINT32_MAX cannot be passed through this entry point.  Widening it
+ * presumably requires changing the public prototype as well - confirm against
+ * the declaration in iprt/linux/sysfs.h.
+ */
+RTDECL(int) RTLinuxSysFsWriteU64File(unsigned uBase, uint32_t u64, const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = RTLinuxSysFsWriteU64FileV(uBase, u64, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Reads a "major:minor" device number pair from a sysfs file.
+ *
+ * @returns IPRT status code; VERR_INVALID_PARAMETER if the file is empty or
+ *          its content is not of the expected form.
+ * @param   pDevNum    Where to return the device number (only written when
+ *                     both numbers were parsed).
+ * @param   pszFormat  The name format, either absolute or relative to "/sys/".
+ * @param   va         The format args.
+ */
+RTDECL(int) RTLinuxSysFsReadDevNumFileV(dev_t *pDevNum, const char *pszFormat, va_list va)
+{
+    AssertPtrReturn(pDevNum, VERR_INVALID_POINTER);
+
+    RTFILE hFile;
+    int    rc = RTLinuxSysFsOpenV(&hFile, pszFormat, va);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    size_t cchValue = 0;
+    char   szValue[128];
+    rc = RTLinuxSysFsReadStr(hFile, szValue, sizeof(szValue), &cchValue);
+    if (RT_SUCCESS(rc))
+    {
+        if (cchValue == 0)
+            rc = VERR_INVALID_PARAMETER;
+        else
+        {
+            uint32_t uMajor    = 0;
+            uint32_t uMinor    = 0;
+            char    *pszAfter  = NULL;
+
+            /* The major number must be followed by a colon. */
+            rc = RTStrToUInt32Ex(szValue, &pszAfter, 10, &uMajor);
+            if (rc == VWRN_TRAILING_CHARS && *pszAfter == ':')
+            {
+                /* The minor number may be followed by anything (typically a newline). */
+                rc = RTStrToUInt32Ex(pszAfter + 1, NULL, 10, &uMinor);
+                if (   rc == VINF_SUCCESS
+                    || rc == VWRN_TRAILING_CHARS
+                    || rc == VWRN_TRAILING_SPACES)
+                    *pDevNum = makedev(uMajor, uMinor);
+                else
+                    rc = VERR_INVALID_PARAMETER;
+            }
+            else
+                rc = VERR_INVALID_PARAMETER;
+        }
+    }
+
+    RTFileClose(hFile);
+    return rc;
+}
+
+
+/**
+ * Variadic wrapper around RTLinuxSysFsReadDevNumFileV.
+ */
+RTDECL(int) RTLinuxSysFsReadDevNumFile(dev_t *pDevNum, const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = RTLinuxSysFsReadDevNumFileV(pDevNum, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Opens a sysfs file and reads its first line into a buffer as a zero
+ * terminated string.
+ *
+ * The string is cut at the first newline found in the data read.  If there is
+ * no newline and the data fills the whole buffer, the string is truncated and
+ * VERR_BUFFER_OVERFLOW is returned.
+ *
+ * @returns IPRT status code.
+ * @param   pszBuf     Where to store the string.
+ * @param   cchBuf     Size of the buffer.
+ * @param   pcchRead   Where to return the string length, optional.
+ * @param   pszFormat  The name format, either absolute or relative to "/sys/".
+ * @param   va         The format args.
+ */
+RTDECL(int) RTLinuxSysFsReadStrFileV(char *pszBuf, size_t cchBuf, size_t *pcchRead, const char *pszFormat, va_list va)
+{
+    AssertPtrReturn(pszBuf, VERR_INVALID_POINTER);
+
+    RTFILE hFile;
+    int    rc = RTLinuxSysFsOpenV(&hFile, pszFormat, va);
+    if (RT_FAILURE(rc))
+    {
+        /* Could not open the file: return an empty string. */
+        if (cchBuf)
+            *pszBuf = '\0';
+        if (pcchRead)
+            *pcchRead = 0;
+        return rc;
+    }
+
+    /*
+     * Note! RTLinuxSysFsReadStr is not used here because it treats newline
+     *       characters differently (it does not cut the string at them).
+     */
+    size_t cchRead;
+    rc = RTFileRead(hFile, pszBuf, cchBuf, &cchRead);
+    if (RT_FAILURE(rc))
+        cchRead = 0;
+    else
+    {
+        char *pchEol = (char *)memchr(pszBuf, '\n', cchRead);
+        if (pchEol)
+        {
+            /* Terminate at the end of the first line. */
+            *pchEol = '\0';
+            cchRead = pchEol - pszBuf;
+        }
+        else if (cchRead < cchBuf)
+            pszBuf[cchRead] = '\0';
+        else
+        {
+            /* No room left for the terminator: truncate. */
+            if (cchBuf)
+            {
+                cchRead = cchBuf - 1;
+                pszBuf[cchRead] = '\0';
+            }
+            else
+                cchRead = 0;
+            rc = VERR_BUFFER_OVERFLOW;
+        }
+    }
+
+    RTFileClose(hFile);
+
+    if (pcchRead)
+        *pcchRead = cchRead;
+    return rc;
+}
+
+
+/**
+ * Variadic wrapper around RTLinuxSysFsReadStrFileV.
+ */
+RTDECL(int) RTLinuxSysFsReadStrFile(char *pszBuf, size_t cchBuf, size_t *pcchRead, const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = RTLinuxSysFsReadStrFileV(pszBuf, cchBuf, pcchRead, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Opens a sysfs file for writing and writes a string to it using
+ * RTLinuxSysFsWriteStr.
+ *
+ * @returns IPRT status code from opening or writing.
+ * @param   pszBuf       The string to write.
+ * @param   cchBuf       Number of bytes to write, see RTLinuxSysFsWriteStr.
+ * @param   pcchWritten  Passed on to RTLinuxSysFsWriteStr.
+ * @param   pszFormat    The name format, either absolute or relative to
+ *                       "/sys/".
+ * @param   va           The format args.
+ */
+RTDECL(int) RTLinuxSysFsWriteStrFileV(const char *pszBuf, size_t cchBuf, size_t *pcchWritten, const char *pszFormat, va_list va)
+{
+    AssertPtrReturn(pszBuf, VERR_INVALID_POINTER);
+
+    RTFILE hFile;
+    int    rc = RTLinuxSysFsOpenExV(&hFile, RTFILE_O_OPEN | RTFILE_O_WRITE | RTFILE_O_DENY_NONE, pszFormat, va);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    rc = RTLinuxSysFsWriteStr(hFile, pszBuf, cchBuf, pcchWritten);
+    RTFileClose(hFile);
+    return rc;
+}
+
+
+/**
+ * Variadic wrapper around RTLinuxSysFsWriteStrFileV.
+ */
+RTDECL(int) RTLinuxSysFsWriteStrFile(const char *pszBuf, size_t cchBuf, size_t *pcchWritten, const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = RTLinuxSysFsWriteStrFileV(pszBuf, cchBuf, pcchWritten, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+/**
+ * Reads a sysfs symbolic link and returns the last path component of its
+ * target.
+ *
+ * @returns IPRT status code; VERR_BUFFER_OVERFLOW if the name does not fit.
+ * @param   pszBuf     Where to store the name.
+ * @param   cchBuf     Size of the buffer, at least 2.
+ * @param   pchBuf     Where to return the length of the name, optional.  Set
+ *                     whenever the link could be read, also on overflow.
+ * @param   pszFormat  The name format, either absolute or relative to "/sys/".
+ * @param   va         The format args.
+ */
+RTDECL(int) RTLinuxSysFsGetLinkDestV(char *pszBuf, size_t cchBuf, size_t *pchBuf, const char *pszFormat, va_list va)
+{
+    AssertReturn(cchBuf >= 2, VERR_INVALID_PARAMETER);
+
+    /* Construct the path of the link. */
+    char szLinkPath[RTPATH_MAX];
+    int  rc = rtLinuxSysFsConstructPath(szLinkPath, sizeof(szLinkPath), pszFormat, va);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    /* Read the link target. */
+    char szTarget[RTPATH_MAX];
+    rc = RTSymlinkRead(szLinkPath, szTarget, sizeof(szTarget), 0);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    /* Copy out the file name component, or an empty string if there is none. */
+    size_t      cchName = 0;
+    const char *pszName = RTPathFilename(szTarget);
+    if (!pszName)
+        *pszBuf = '\0';
+    else
+    {
+        cchName = strlen(pszName);
+        if (cchName < cchBuf)
+            memcpy(pszBuf, pszName, cchName + 1);
+        else
+            rc = VERR_BUFFER_OVERFLOW;
+    }
+
+    if (pchBuf)
+        *pchBuf = cchName;
+    return rc;
+}
+
+
+/**
+ * Variadic wrapper around RTLinuxSysFsGetLinkDestV.
+ */
+RTDECL(int) RTLinuxSysFsGetLinkDest(char *pszBuf, size_t cchBuf, size_t *pchBuf, const char *pszFormat, ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszFormat);
+    int const rcRet = RTLinuxSysFsGetLinkDestV(pszBuf, cchBuf, pchBuf, pszFormat, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
+
+/**
+ * Checks that a device node path (relative paths are taken to be below
+ * "/dev/") refers to a node with the given device number and type, and
+ * returns the full path on a match.
+ *
+ * @returns IPRT status code.  VERR_FILE_NOT_FOUND if the path does not exist
+ *          or does not match, VERR_BUFFER_OVERFLOW if the path does not fit
+ *          into the buffer.
+ * @param   DevNum      The expected device number.
+ * @param   fMode       RTFS_TYPE_DEV_CHAR or RTFS_TYPE_DEV_BLOCK.
+ * @param   pszBuf      Where to return the path.
+ * @param   cchBuf      Size of the buffer, at least 2.
+ * @param   pszPattern  The path format, absolute or relative to "/dev/".
+ * @param   va          The format args.
+ */
+RTDECL(int) RTLinuxCheckDevicePathV(dev_t DevNum, RTFMODE fMode, char *pszBuf,
+                                    size_t cchBuf, const char *pszPattern,
+                                    va_list va)
+{
+    AssertReturn(cchBuf >= 2, VERR_INVALID_PARAMETER);
+    AssertReturn(   fMode == RTFS_TYPE_DEV_CHAR
+                 || fMode == RTFS_TYPE_DEV_BLOCK,
+                 VERR_INVALID_PARAMETER);
+    AssertPtrReturn(pszPattern, VERR_INVALID_PARAMETER);
+
+    /* Construct the device node path. */
+    char szDevPath[RTPATH_MAX];
+    int  rc = rtLinuxConstructPathV(szDevPath, sizeof(szDevPath), "/dev/",
+                                    pszPattern, va);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    /* Query the node and compare device number and type. */
+    RTFSOBJINFO ObjInfo;
+    rc = RTPathQueryInfo(szDevPath, &ObjInfo, RTFSOBJATTRADD_UNIX);
+    if (rc == VERR_PATH_NOT_FOUND)
+        rc = VERR_FILE_NOT_FOUND;
+    else if (RT_SUCCESS(rc))
+    {
+        bool const fSameDevice = ObjInfo.Attr.u.Unix.Device == DevNum;
+        bool const fSameType   = (ObjInfo.Attr.fMode & RTFS_TYPE_MASK) == fMode;
+        if (!fSameDevice || !fSameType)
+            rc = VERR_FILE_NOT_FOUND;
+    }
+    if (RT_FAILURE(rc))
+        return rc;
+
+    /* It matches, return the path. */
+    size_t const cchDevPath = strlen(szDevPath);
+    if (cchDevPath < cchBuf)
+        memcpy(pszBuf, szDevPath, cchDevPath + 1);
+    else
+        rc = VERR_BUFFER_OVERFLOW;
+    return rc;
+}
+
+
+/**
+ * Variadic wrapper around RTLinuxCheckDevicePathV.
+ */
+RTDECL(int) RTLinuxCheckDevicePath(dev_t DevNum, RTFMODE fMode, char *pszBuf,
+                                   size_t cchBuf, const char *pszPattern,
+                                   ...)
+{
+    va_list vaArgs;
+    va_start(vaArgs, pszPattern);
+    int const rcRet = RTLinuxCheckDevicePathV(DevNum, fMode, pszBuf, cchBuf,
+                                              pszPattern, vaArgs);
+    va_end(vaArgs);
+    return rcRet;
+}
+
diff --git a/src/VBox/Runtime/r3/linux/systemmem-linux.cpp b/src/VBox/Runtime/r3/linux/systemmem-linux.cpp
new file mode 100644
index 00000000..11764ab7
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/systemmem-linux.cpp
@@ -0,0 +1,119 @@
+/* $Id: systemmem-linux.cpp $ */
+/** @file
+ * IPRT - RTSystemQueryTotalRam, Linux ring-3.
+ */
+
+/*
+ * Copyright (C) 2012-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include <iprt/system.h>
+#include "internal/iprt.h"
+
+#include <iprt/errcore.h>
+#include <iprt/assert.h>
+#include <iprt/string.h>
+
+#include <stdio.h>
+#include <errno.h>
+
+/* Satisfy compiler warning */
+#define __EXPORTED_HEADERS__
+#include <sys/sysinfo.h>
+#undef __EXPORTED_HEADERS__
+
+
+/**
+ * Queries the total amount of RAM via sysinfo().
+ *
+ * @returns VINF_SUCCESS, or the sysinfo() errno converted to an IPRT status.
+ * @param   pcb  Where to return the size in bytes (totalram * mem_unit).
+ */
+RTDECL(int) RTSystemQueryTotalRam(uint64_t *pcb)
+{
+    AssertPtrReturn(pcb, VERR_INVALID_POINTER);
+
+    struct sysinfo SysInfo;
+    if (sysinfo(&SysInfo) != 0)
+        return RTErrConvertFromErrno(errno);
+
+    /* totalram is given in units of mem_unit bytes. */
+    *pcb = (uint64_t)SysInfo.totalram * SysInfo.mem_unit;
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Queries the amount of available RAM.
+ *
+ * The primary source is /proc/meminfo, where the result is the sum of the
+ * MemFree, Buffers and Cached fields (the values are multiplied by _1K).  If
+ * that file cannot be opened or parsed, sysinfo() is used as a fallback.
+ *
+ * @returns VINF_SUCCESS, or the sysinfo() errno converted to an IPRT status
+ *          if the fallback fails too.
+ * @param   pcb  Where to return the size in bytes.
+ */
+RTDECL(int) RTSystemQueryAvailableRam(uint64_t *pcb)
+{
+    AssertPtrReturn(pcb, VERR_INVALID_POINTER);
+
+    FILE *pFile = fopen("/proc/meminfo", "r");
+    if (pFile)
+    {
+        int      rc        = VERR_NOT_FOUND;
+        uint64_t cbTotal   = 0;
+        uint64_t cbFree    = 0;
+        uint64_t cbBuffers = 0;
+        uint64_t cbCached  = 0;
+        char     sz[256];
+        while (fgets(sz, sizeof(sz), pFile))
+        {
+            /* Figure out whether this is one of the fields we want; the value
+               starts right after the colon (leading blanks are stripped below). */
+            const char *pszValue;
+            uint64_t   *pcbDst;
+            if (!strncmp(sz, RT_STR_TUPLE("MemTotal:")))
+            {
+                pszValue = &sz[sizeof("MemTotal:") - 1];
+                pcbDst   = &cbTotal;
+            }
+            else if (!strncmp(sz, RT_STR_TUPLE("MemFree:")))
+            {
+                pszValue = &sz[sizeof("MemFree:") - 1];
+                pcbDst   = &cbFree;
+            }
+            else if (!strncmp(sz, RT_STR_TUPLE("Buffers:")))
+            {
+                pszValue = &sz[sizeof("Buffers:") - 1];
+                pcbDst   = &cbBuffers;
+            }
+            else if (!strncmp(sz, RT_STR_TUPLE("Cached:")))
+            {
+                pszValue = &sz[sizeof("Cached:") - 1];
+                pcbDst   = &cbCached;
+            }
+            else
+                continue; /* Not interesting; must not end the scan, whatever rc currently is. */
+
+            /* Only a genuine conversion failure aborts the scan. */
+            rc = RTStrToUInt64Ex(RTStrStripL(pszValue), NULL, 0, pcbDst);
+            if (RT_FAILURE(rc))
+                break;
+        }
+        fclose(pFile);
+        if (RT_SUCCESS(rc))
+        {
+            *pcb = (cbFree + cbBuffers + cbCached) * _1K;
+            return VINF_SUCCESS;
+        }
+    }
+    /*
+     * Fallback (e.g. /proc not mapped) to sysinfo.  Less accurate because there
+     * is no information about the cached memory.  'Cached:' from above is only
+     * accessible through proc :-(
+     */
+    struct sysinfo info;
+    int rc = sysinfo(&info);
+    if (rc == 0)
+    {
+        *pcb = ((uint64_t)info.freeram + info.bufferram) * info.mem_unit;
+        return VINF_SUCCESS;
+    }
+    return RTErrConvertFromErrno(errno);
+}
+
diff --git a/src/VBox/Runtime/r3/linux/thread-affinity-linux.cpp b/src/VBox/Runtime/r3/linux/thread-affinity-linux.cpp
new file mode 100644
index 00000000..2726e716
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/thread-affinity-linux.cpp
@@ -0,0 +1,105 @@
+/* $Id: thread-affinity-linux.cpp $ */
+/** @file
+ * IPRT - Thread Affinity, Linux ring-3 implementation.
+ */
+
+/*
+ * Copyright (C) 2011-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE
+#endif
+#include <features.h>
+#if __GLIBC_PREREQ(2,4)
+
+#include <sched.h>
+#include <unistd.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include <iprt/thread.h>
+#include "internal/iprt.h"
+
+#include <iprt/assert.h>
+#include <iprt/cpuset.h>
+#include <iprt/err.h>
+#include <iprt/mp.h>
+
+
+
+/**
+ * Sets the CPU affinity of the calling thread.
+ *
+ * @returns VINF_SUCCESS, VERR_CPU_NOT_FOUND if pthread_setaffinity_np reports
+ *          ENOENT, otherwise the converted error number.
+ * @param   pCpuSet  The set of CPUs the thread may run on.  NULL selects all
+ *                   CPUs representable in a cpu_set_t.
+ */
+RTR3DECL(int) RTThreadSetAffinity(PCRTCPUSET pCpuSet)
+{
+    /* convert */
+    cpu_set_t LnxCpuSet;
+    CPU_ZERO(&LnxCpuSet);
+    if (!pCpuSet)
+        for (unsigned iCpu = 0; iCpu < CPU_SETSIZE; iCpu++)
+            CPU_SET(iCpu, &LnxCpuSet);
+    else
+        for (unsigned iCpu = 0; iCpu < RT_MIN(CPU_SETSIZE, RTCPUSET_MAX_CPUS); iCpu++)
+            if (RTCpuSetIsMemberByIndex(pCpuSet, iCpu))
+                CPU_SET(iCpu, &LnxCpuSet);
+
+    /* Note! pthread_setaffinity_np returns the error number directly and does
+             not set errno, so the return value is what must be inspected. */
+    int rc = pthread_setaffinity_np(pthread_self(), sizeof(LnxCpuSet), &LnxCpuSet);
+    if (!rc)
+        return VINF_SUCCESS;
+    if (rc == ENOENT)
+        return VERR_CPU_NOT_FOUND;
+    return RTErrConvertFromErrno(rc);
+}
+
+
+/**
+ * Gets the CPU affinity of the calling thread.
+ *
+ * @returns VINF_SUCCESS, or the converted error number returned by
+ *          pthread_getaffinity_np.
+ * @param   pCpuSet  Where to return the CPU set.  Only CPUs with an index
+ *                   below both CPU_SETSIZE and RTCPUSET_MAX_CPUS are
+ *                   reported.
+ */
+RTR3DECL(int) RTThreadGetAffinity(PRTCPUSET pCpuSet)
+{
+    /* Note! pthread_getaffinity_np returns the error number directly and does
+             not set errno, so the return value is what must be converted. */
+    cpu_set_t LnxCpuSet;
+    int rc = pthread_getaffinity_np(pthread_self(), sizeof(LnxCpuSet), &LnxCpuSet);
+    if (rc != 0)
+        return RTErrConvertFromErrno(rc);
+
+    /* convert */
+    RTCpuSetEmpty(pCpuSet);
+    for (unsigned iCpu = 0; iCpu < RT_MIN(CPU_SETSIZE, RTCPUSET_MAX_CPUS); iCpu++)
+        if (CPU_ISSET(iCpu, &LnxCpuSet))
+            RTCpuSetAddByIndex(pCpuSet, iCpu);
+
+    return VINF_SUCCESS;
+}
+
+#else
+# include "../../generic/RTThreadGetAffinity-stub-generic.cpp"
+# include "../../generic/RTThreadSetAffinity-stub-generic.cpp"
+#endif
+
diff --git a/src/VBox/Runtime/r3/linux/time-linux.cpp b/src/VBox/Runtime/r3/linux/time-linux.cpp
new file mode 100644
index 00000000..6ceac2de
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/time-linux.cpp
@@ -0,0 +1,169 @@
+/* $Id: time-linux.cpp $ */
+/** @file
+ * IPRT - Time, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_TIME
+#define RTTIME_INCL_TIMEVAL
+#include <sys/time.h>
+#include <time.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#ifndef __NR_clock_gettime
+# define __NR_timer_create 259
+# define __NR_clock_gettime (__NR_timer_create+6)
+#endif
+
+#include <iprt/time.h>
+#include "internal/time.h"
+
+
+/**
+ * Issues the clock_gettime system call directly through syscall().
+ *
+ * @returns The (non-negative) syscall result on success, -1 on failure.
+ * @param   id      The clock to query.
+ * @param   ts      Where to store the current time of that clock.
+ */
+DECLINLINE(int) sys_clock_gettime(clockid_t id, struct timespec *ts)
+{
+    int const iRet = syscall(__NR_clock_gettime, id, ts);
+    return iRet < 0 ? -1 : iRet;
+}
+
+
+/**
+ * Reads the monotonic clock, probing on first use which source works.
+ *
+ * The outcome of the probe is cached in a function-local static:
+ *      - -1: not probed yet,
+ *      -  0: the libc clock_gettime() works,
+ *      -  1: the raw clock_gettime syscall works,
+ *      - -2: nothing works (every later call fails right away).
+ *
+ * @returns 0 on success, non-zero on failure.
+ * @param   ts      Where to store the monotonic time.
+ */
+DECLINLINE(int) mono_clock(struct timespec *ts)
+{
+    static int iWorking = -1;
+
+    /* Take one snapshot of the cached state and dispatch on it. */
+    int const iMethod = iWorking;
+
+#ifdef CLOCK_MONOTONIC
+    /* Standard clock_gettime(). */
+    if (iMethod == 0)
+        return clock_gettime(CLOCK_MONOTONIC, ts);
+
+    /* Syscall clock_gettime(). */
+    if (iMethod == 1)
+        return sys_clock_gettime(CLOCK_MONOTONIC, ts);
+#endif /* CLOCK_MONOTONIC */
+
+    /* Anything but the "not probed yet" state means we gave up earlier. */
+    if (iMethod != -1)
+        return -1;
+
+#ifdef CLOCK_MONOTONIC
+    /* First call: try the real-time API, then the raw syscall. */
+    if (clock_gettime(CLOCK_MONOTONIC, ts) == 0)
+    {
+        iWorking = 0;
+        return 0;
+    }
+
+    if (sys_clock_gettime(CLOCK_MONOTONIC, ts) == 0)
+    {
+        iWorking = 1;
+        return 0;
+    }
+#endif /* CLOCK_MONOTONIC */
+
+    /* Give up and remember that for subsequent calls. */
+    iWorking = -2;
+    return -1;
+}
+
+
+/**
+ * Produces a nanosecond timestamp from the system clocks.
+ *
+ * The monotonic clock (mono_clock) is preferred.  The first time it fails a
+ * function-local static flag is cleared and this and all later calls use
+ * gettimeofday() instead.
+ *
+ * @returns Nanosecond timestamp.
+ */
+DECLINLINE(uint64_t) rtTimeGetSystemNanoTS(void)
+{
+    /* Monotonic clock, as long as it has not failed on us. */
+    static bool fMonoClock = true;
+    if (fMonoClock)
+    {
+        struct timespec ts;
+        if (mono_clock(&ts) == 0)
+        {
+            uint64_t const cNsWholeSecs = (uint64_t)ts.tv_sec * RT_NS_1SEC_64;
+            return cNsWholeSecs + ts.tv_nsec;
+        }
+        fMonoClock = false;
+    }
+
+    /* Fallback: wall clock with microsecond resolution. */
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    uint64_t const cNsWholeSecs = (uint64_t)tv.tv_sec * RT_NS_1SEC_64;
+    uint64_t const cNsFraction  = (uint64_t)(tv.tv_usec * RT_NS_1US);
+    return cNsWholeSecs + cNsFraction;
+}
+
+
+/**
+ * Returns the current system timestamp in nanoseconds.
+ *
+ * Unlike RTTimeNanoTS, the value is taken straight from the system APIs
+ * without any resolution or performance optimizations.
+ *
+ * @returns nanosecond timestamp.
+ */
+RTDECL(uint64_t) RTTimeSystemNanoTS(void)
+{
+    uint64_t const u64NanoTS = rtTimeGetSystemNanoTS();
+    return u64NanoTS;
+}
+
+
+/**
+ * Returns the current system timestamp in milliseconds.
+ *
+ * Like RTTimeSystemNanoTS, the value comes straight from the system APIs
+ * without any resolution or performance optimizations; it is the system
+ * nanosecond timestamp divided by RT_NS_1MS.
+ *
+ * @returns millisecond timestamp.
+ */
+RTDECL(uint64_t) RTTimeSystemMilliTS(void)
+{
+    uint64_t const u64NanoTS = rtTimeGetSystemNanoTS();
+    return u64NanoTS / RT_NS_1MS;
+}
+
diff --git a/src/VBox/Runtime/r3/linux/tpm-linux.cpp b/src/VBox/Runtime/r3/linux/tpm-linux.cpp
new file mode 100644
index 00000000..4851eabc
--- /dev/null
+++ b/src/VBox/Runtime/r3/linux/tpm-linux.cpp
@@ -0,0 +1,229 @@
+/* $Id: tpm-linux.cpp $ */
+/** @file
+ * IPRT - Trusted Platform Module (TPM) access, Linux variant.
+ */
+
+/*
+ * Copyright (C) 2021-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_DEFAULT
+#include <iprt/tpm.h>
+
+#include <iprt/assertcompile.h>
+#include <iprt/asm.h>
+#include <iprt/err.h>
+#include <iprt/file.h>
+#include <iprt/log.h>
+#include <iprt/mem.h>
+#include <iprt/string.h>
+#include <iprt/linux/sysfs.h>
+
+
+/*********************************************************************************************************************************
+* Defined Constants And Macros *
+*********************************************************************************************************************************/
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+
+/**
+ * Internal TPM instance data.
+ *
+ * Allocated and filled in by RTTpmOpen, released by RTTpmClose.
+ */
+typedef struct RTTPMINT
+{
+ /** Handle to the /dev/tpmX device, used for writing requests and reading
+  * responses. */
+ RTFILE hTpmDev;
+ /** Handle to the sysfs cancel interface.  Stays NIL_RTFILE when no cancel
+  * file was found while opening, in which case RTTpmReqCancel reports
+  * VERR_NOT_SUPPORTED. */
+ RTFILE hTpmCancel;
+ /** The deduced TPM version; RTTPMVERSION_UNKNOWN if it could not be read
+  * from sysfs. */
+ RTTPMVERSION enmTpmVers;
+ /** Flag whether a request is currently being executed.  Set by RTTpmReqExec
+  * and checked by RTTpmReqCancel, both using atomic accessors. */
+ volatile bool fReqExec;
+} RTTPMINT;
+/** Pointer to the internal TPM instance data. */
+typedef RTTPMINT *PRTTPMINT;
+
+
+/*********************************************************************************************************************************
+* Internal Functions *
+*********************************************************************************************************************************/
+
+/**
+ * Opens a TPM device and creates the instance data for it.
+ *
+ * The /dev/tpmX device must be openable.  The sysfs cancel file and the TPM
+ * version are optional: a missing cancel file leaves the cancel handle at
+ * NIL_RTFILE and an unreadable version leaves it at RTTPMVERSION_UNKNOWN.
+ *
+ * @returns IPRT status code.
+ * @retval  VERR_NO_MEMORY if the instance data could not be allocated.
+ * @param   phTpm       Where to return the TPM handle on success.
+ * @param   idTpm       The TPM to open, RTTPM_ID_DEFAULT selects TPM 0.
+ */
+RTDECL(int) RTTpmOpen(PRTTPM phTpm, uint32_t idTpm)
+{
+    AssertPtrReturn(phTpm, VERR_INVALID_POINTER);
+    if (idTpm == RTTPM_ID_DEFAULT)
+        idTpm = 0;
+
+    PRTTPMINT pThis = (PRTTPMINT)RTMemAllocZ(sizeof(*pThis));
+    if (!pThis)
+        return VERR_NO_MEMORY;
+
+    pThis->hTpmDev    = NIL_RTFILE;
+    pThis->hTpmCancel = NIL_RTFILE;
+    pThis->enmTpmVers = RTTPMVERSION_UNKNOWN;
+    pThis->fReqExec   = false;
+
+    /* The character device is mandatory. */
+    int rc = RTFileOpenF(&pThis->hTpmDev, RTFILE_O_OPEN | RTFILE_O_READWRITE | RTFILE_O_DENY_NONE,
+                         "/dev/tpm%u", idTpm);
+    if (!RT_SUCCESS(rc))
+    {
+        RTMemFree(pThis);
+        return rc;
+    }
+
+    /* Open the sysfs path to cancel a request, either /sys/class/tpm/tpmX/device/cancel or /sys/class/misc/tpmX/device/cancel. */
+    rc = RTFileOpenF(&pThis->hTpmCancel, RTFILE_O_OPEN | RTFILE_O_WRITE | RTFILE_O_DENY_NONE,
+                     "/sys/class/tpm/tpm%u/device/cancel", idTpm);
+    if (rc == VERR_FILE_NOT_FOUND)
+        rc = RTFileOpenF(&pThis->hTpmCancel, RTFILE_O_OPEN | RTFILE_O_WRITE | RTFILE_O_DENY_NONE,
+                         "/sys/class/misc/tpm%u/device/cancel", idTpm);
+
+    /* A missing cancel file is tolerated, any other failure is fatal. */
+    if (   !RT_SUCCESS(rc)
+        && rc != VERR_FILE_NOT_FOUND)
+    {
+        RTFileClose(pThis->hTpmDev);
+        pThis->hTpmDev = NIL_RTFILE;
+        RTMemFree(pThis);
+        return rc;
+    }
+
+    /* Try to figure out the TPM version; failing to do so is not an error. */
+    int64_t iVersion = 0;
+    rc = RTLinuxSysFsReadIntFile(10 /*uBase*/, &iVersion, "/sys/class/tpm/tpm%u/tpm_version_major", idTpm);
+    if (rc == VERR_FILE_NOT_FOUND)
+        rc = RTLinuxSysFsReadIntFile(10 /*uBase*/, &iVersion, "/sys/class/misc/tpm%u/tpm_version_major", idTpm);
+    if (RT_SUCCESS(rc))
+    {
+        switch (iVersion)
+        {
+            case 1:
+                pThis->enmTpmVers = RTTPMVERSION_1_2;
+                break;
+            case 2:
+                pThis->enmTpmVers = RTTPMVERSION_2_0;
+                break;
+            default:
+                break;
+        }
+    }
+
+    *phTpm = pThis;
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Closes a TPM handle and frees its instance data.
+ *
+ * @returns VINF_SUCCESS, or VERR_INVALID_HANDLE if the handle is not a valid
+ *          pointer.
+ * @param   hTpm        The TPM handle to close.
+ */
+RTDECL(int) RTTpmClose(RTTPM hTpm)
+{
+    PRTTPMINT pThis = hTpm;
+    AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+
+    /* The device handle is always open for a valid instance. */
+    RTFileClose(pThis->hTpmDev);
+    pThis->hTpmDev = NIL_RTFILE;
+
+    /* The cancel handle is optional. */
+    if (pThis->hTpmCancel != NIL_RTFILE)
+    {
+        RTFileClose(pThis->hTpmCancel);
+        pThis->hTpmCancel = NIL_RTFILE;
+    }
+
+    RTMemFree(pThis);
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Returns the TPM version deduced when the handle was opened.
+ *
+ * @returns The stored TPM version, RTTPMVERSION_INVALID if the handle is not
+ *          a valid pointer.
+ * @param   hTpm        The TPM handle.
+ */
+RTDECL(RTTPMVERSION) RTTpmGetVersion(RTTPM hTpm)
+{
+    PRTTPMINT pThis = hTpm;
+    AssertPtrReturn(pThis, RTTPMVERSION_INVALID);
+
+    RTTPMVERSION const enmVersion = pThis->enmTpmVers;
+    return enmVersion;
+}
+
+
+/**
+ * Returns the highest TPM locality usable through the given handle.
+ *
+ * @returns Always 0.
+ * @param   hTpm        The TPM handle (unused).
+ */
+RTDECL(uint32_t) RTTpmGetLocalityMax(RTTPM hTpm)
+{
+    RT_NOREF(hTpm);
+
+    /* On Linux only TPM locality 0 is supported. */
+    uint32_t const uLocalityMax = 0;
+    return uLocalityMax;
+}
+
+
+/**
+ * Cancels the request currently being executed, if there is one.
+ *
+ * @returns IPRT status code.
+ * @retval  VINF_SUCCESS if no request is being executed (nothing is written).
+ * @retval  VERR_NOT_SUPPORTED if no cancel file was opened for this TPM.
+ * @retval  VERR_INVALID_HANDLE if the handle is not a valid pointer.
+ * @param   hTpm        The TPM handle.
+ */
+RTDECL(int) RTTpmReqCancel(RTTPM hTpm)
+{
+    PRTTPMINT pThis = hTpm;
+    AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+
+    if (pThis->hTpmCancel == NIL_RTFILE)
+        return VERR_NOT_SUPPORTED;
+
+    /* Nothing in flight, nothing to cancel. */
+    if (!ASMAtomicReadBool(&pThis->fReqExec))
+        return VINF_SUCCESS;
+
+    /* A single byte written to the cancel file requests the cancellation. */
+    uint8_t bCancel = '-';
+    return RTFileWrite(pThis->hTpmCancel, &bCancel, sizeof(bCancel), NULL /*pcbWritten*/);
+}
+
+
+/**
+ * Executes a TPM request and retrieves the response.
+ *
+ * The request is handed to the device in one write and the response fetched
+ * in one read.  The "request executing" flag is raised for the duration of
+ * the exchange and lowered again on every path, including a failed write, so
+ * RTTpmReqCancel never sees a stale flag.
+ *
+ * @returns IPRT status code.
+ * @retval  VERR_BUFFER_OVERFLOW if the data read is shorter than a response
+ *          header or its size does not match the size stated in the header.
+ * @retval  VERR_NOT_SUPPORTED if @a bLoc is not 0.
+ * @param   hTpm        The TPM handle.
+ * @param   bLoc        The locality to use, must be 0.
+ * @param   pvReq       The request data.
+ * @param   cbReq       Size of the request in bytes, must not be 0.
+ * @param   pvResp      Where to store the response.
+ * @param   cbRespMax   Size of the response buffer in bytes, must not be 0.
+ * @param   pcbResp     Where to return the size of a complete response.
+ *                      Optional; not modified on failure.
+ */
+RTDECL(int) RTTpmReqExec(RTTPM hTpm, uint8_t bLoc, const void *pvReq, size_t cbReq,
+                         void *pvResp, size_t cbRespMax, size_t *pcbResp)
+{
+    PRTTPMINT pThis = hTpm;
+
+    AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+    AssertPtrReturn(pvReq, VERR_INVALID_POINTER);
+    AssertPtrReturn(pvResp, VERR_INVALID_POINTER);
+    AssertReturn(cbReq && cbRespMax, VERR_INVALID_PARAMETER);
+    AssertReturn(bLoc == 0, VERR_NOT_SUPPORTED); /** @todo There doesn't seem to be a way to use a different locality. */
+
+    size_t cbResp = 0;
+
+    /* The request has to be supplied by a single blocking write. */
+    ASMAtomicXchgBool(&pThis->fReqExec, true);
+    int rc = RTFileWrite(pThis->hTpmDev, pvReq, cbReq, NULL /*pcbWritten*/);
+    if (RT_SUCCESS(rc))
+    {
+        /* The response has to be retrieved in a single read as well. */
+        rc = RTFileRead(pThis->hTpmDev, pvResp, cbRespMax, &cbResp);
+    }
+
+    /* The exchange is over whether it worked or not (this includes a failed write). */
+    ASMAtomicXchgBool(&pThis->fReqExec, false);
+
+    if (RT_SUCCESS(rc))
+    {
+        /* Check whether the response is complete. */
+        if (   cbResp >= sizeof(TPMRESPHDR)
+            && RTTpmRespGetSz((PCTPMRESPHDR)pvResp) == cbResp)
+        {
+            if (pcbResp)
+                *pcbResp = cbResp;
+        }
+        else
+            rc = VERR_BUFFER_OVERFLOW;
+    }
+
+    return rc;
+}
+