summaryrefslogtreecommitdiffstats
path: root/src/VBox/Runtime/r0drv/linux
diff options
context:
space:
mode:
Diffstat (limited to 'src/VBox/Runtime/r0drv/linux')
-rw-r--r--src/VBox/Runtime/r0drv/linux/Makefile.kup0
-rw-r--r--src/VBox/Runtime/r0drv/linux/RTLogWriteDebugger-r0drv-linux.c53
-rw-r--r--src/VBox/Runtime/r0drv/linux/alloc-r0drv-linux.c257
-rw-r--r--src/VBox/Runtime/r0drv/linux/assert-r0drv-linux.c84
-rw-r--r--src/VBox/Runtime/r0drv/linux/initterm-r0drv-linux.c138
-rw-r--r--src/VBox/Runtime/r0drv/linux/memobj-r0drv-linux.c2104
-rw-r--r--src/VBox/Runtime/r0drv/linux/memuserkernel-r0drv-linux.c191
-rw-r--r--src/VBox/Runtime/r0drv/linux/mp-r0drv-linux.c640
-rw-r--r--src/VBox/Runtime/r0drv/linux/mpnotification-r0drv-linux.c258
-rw-r--r--src/VBox/Runtime/r0drv/linux/process-r0drv-linux.c59
-rw-r--r--src/VBox/Runtime/r0drv/linux/rtStrFormatKernelAddress-r0drv-linux.c66
-rw-r--r--src/VBox/Runtime/r0drv/linux/semevent-r0drv-linux.c296
-rw-r--r--src/VBox/Runtime/r0drv/linux/semeventmulti-r0drv-linux.c361
-rw-r--r--src/VBox/Runtime/r0drv/linux/semfastmutex-r0drv-linux.c167
-rw-r--r--src/VBox/Runtime/r0drv/linux/semmutex-r0drv-linux.c431
-rw-r--r--src/VBox/Runtime/r0drv/linux/spinlock-r0drv-linux.c196
-rw-r--r--src/VBox/Runtime/r0drv/linux/string.h70
-rw-r--r--src/VBox/Runtime/r0drv/linux/the-linux-kernel.h494
-rw-r--r--src/VBox/Runtime/r0drv/linux/thread-r0drv-linux.c285
-rw-r--r--src/VBox/Runtime/r0drv/linux/thread2-r0drv-linux.c243
-rw-r--r--src/VBox/Runtime/r0drv/linux/threadctxhooks-r0drv-linux.c341
-rw-r--r--src/VBox/Runtime/r0drv/linux/time-r0drv-linux.c221
-rw-r--r--src/VBox/Runtime/r0drv/linux/timer-r0drv-linux.c1739
-rw-r--r--src/VBox/Runtime/r0drv/linux/waitqueue-r0drv-linux.h302
24 files changed, 8996 insertions, 0 deletions
diff --git a/src/VBox/Runtime/r0drv/linux/Makefile.kup b/src/VBox/Runtime/r0drv/linux/Makefile.kup
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/Makefile.kup
diff --git a/src/VBox/Runtime/r0drv/linux/RTLogWriteDebugger-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/RTLogWriteDebugger-r0drv-linux.c
new file mode 100644
index 00000000..7de10556
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/RTLogWriteDebugger-r0drv-linux.c
@@ -0,0 +1,53 @@
+/* $Id: RTLogWriteDebugger-r0drv-linux.c $ */
+/** @file
+ * IPRT - Log To Debugger, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+#include <iprt/log.h>
+
+
+RTDECL(void) RTLogWriteDebugger(const char *pch, size_t cb)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ printk("%.*s", (int)cb, pch);
+ IPRT_LINUX_RESTORE_EFL_AC();
+}
+RT_EXPORT_SYMBOL(RTLogWriteDebugger);
+
diff --git a/src/VBox/Runtime/r0drv/linux/alloc-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/alloc-r0drv-linux.c
new file mode 100644
index 00000000..47410cca
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/alloc-r0drv-linux.c
@@ -0,0 +1,257 @@
+/* $Id: alloc-r0drv-linux.c $ */
+/** @file
+ * IPRT - Memory Allocation, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+#include <iprt/mem.h>
+
+#include <iprt/assert.h>
+#include <iprt/errcore.h>
+#include "r0drv/alloc-r0drv.h"
+
+#include "internal/initterm.h"
+
+
+
+/**
+ * OS specific allocation function.
+ */
+DECLHIDDEN(int) rtR0MemAllocEx(size_t cb, uint32_t fFlags, PRTMEMHDR *ppHdr)
+{
+ PRTMEMHDR pHdr;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Allocate.
+ */
+ if (
+#if 1 /* vmalloc has serious performance issues, avoid it. */
+ cb <= PAGE_SIZE*16 - sizeof(*pHdr)
+#else
+ cb <= PAGE_SIZE
+#endif
+ || (fFlags & RTMEMHDR_FLAG_ANY_CTX)
+ )
+ {
+ fFlags |= RTMEMHDR_FLAG_KMALLOC;
+ pHdr = kmalloc(cb + sizeof(*pHdr),
+ fFlags & RTMEMHDR_FLAG_ANY_CTX_ALLOC ? GFP_ATOMIC | __GFP_NOWARN : GFP_KERNEL | __GFP_NOWARN);
+ if (RT_UNLIKELY( !pHdr
+ && cb > PAGE_SIZE
+ && !(fFlags & RTMEMHDR_FLAG_ANY_CTX) ))
+ {
+ fFlags &= ~RTMEMHDR_FLAG_KMALLOC;
+ pHdr = vmalloc(cb + sizeof(*pHdr));
+ }
+ }
+ else
+ pHdr = vmalloc(cb + sizeof(*pHdr));
+ if (RT_LIKELY(pHdr))
+ {
+ /*
+ * Initialize.
+ */
+ pHdr->u32Magic = RTMEMHDR_MAGIC;
+ pHdr->fFlags = fFlags;
+ pHdr->cb = cb;
+ pHdr->cbReq = cb;
+
+ *ppHdr = pHdr;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_NO_MEMORY;
+}
+
+
+/**
+ * OS specific free function.
+ */
+DECLHIDDEN(void) rtR0MemFree(PRTMEMHDR pHdr)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ pHdr->u32Magic += 1;
+ if (pHdr->fFlags & RTMEMHDR_FLAG_KMALLOC)
+ kfree(pHdr);
+ else
+ vfree(pHdr);
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+}
+
+
+
+/**
+ * Compute order. Some functions allocate 2^order pages.
+ *
+ * @returns order.
+ * @param cPages Number of pages.
+ */
+static int CalcPowerOf2Order(unsigned long cPages)
+{
+ int iOrder;
+ unsigned long cTmp;
+
+ for (iOrder = 0, cTmp = cPages; cTmp >>= 1; ++iOrder)
+ ;
+ if (cPages & ~(1 << iOrder))
+ ++iOrder;
+
+ return iOrder;
+}
+
+
+/**
+ * Allocates physical contiguous memory (below 4GB).
+ * The allocation is page aligned and the content is undefined.
+ *
+ * @returns Pointer to the memory block. This is page aligned.
+ * @param pPhys Where to store the physical address.
+ * @param cb The allocation size in bytes. This is always
+ * rounded up to PAGE_SIZE.
+ */
+RTR0DECL(void *) RTMemContAlloc(PRTCCPHYS pPhys, size_t cb)
+{
+ int cOrder;
+ unsigned cPages;
+ struct page *paPages;
+ void *pvRet;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * validate input.
+ */
+ AssertPtr(pPhys);
+ Assert(cb > 0);
+
+ /*
+ * Allocate page pointer array.
+ */
+ cb = RT_ALIGN_Z(cb, PAGE_SIZE);
+ cPages = cb >> PAGE_SHIFT;
+ cOrder = CalcPowerOf2Order(cPages);
+#if (defined(RT_ARCH_AMD64) || defined(CONFIG_X86_PAE)) && defined(GFP_DMA32)
+ /* ZONE_DMA32: 0-4GB */
+ paPages = alloc_pages(GFP_DMA32 | __GFP_NOWARN, cOrder);
+ if (!paPages)
+#endif
+#ifdef RT_ARCH_AMD64
+ /* ZONE_DMA; 0-16MB */
+ paPages = alloc_pages(GFP_DMA | __GFP_NOWARN, cOrder);
+#else
+ /* ZONE_NORMAL: 0-896MB */
+ paPages = alloc_pages(GFP_USER | __GFP_NOWARN, cOrder);
+#endif
+ if (paPages)
+ {
+ /*
+ * Reserve the pages and mark them executable.
+ */
+ unsigned iPage;
+ for (iPage = 0; iPage < cPages; iPage++)
+ {
+ Assert(!PageHighMem(&paPages[iPage]));
+ if (iPage + 1 < cPages)
+ {
+ AssertMsg( (uintptr_t)phys_to_virt(page_to_phys(&paPages[iPage])) + PAGE_SIZE
+ == (uintptr_t)phys_to_virt(page_to_phys(&paPages[iPage + 1]))
+ && page_to_phys(&paPages[iPage]) + PAGE_SIZE
+ == page_to_phys(&paPages[iPage + 1]),
+ ("iPage=%i cPages=%u [0]=%#llx,%p [1]=%#llx,%p\n", iPage, cPages,
+ (long long)page_to_phys(&paPages[iPage]), phys_to_virt(page_to_phys(&paPages[iPage])),
+ (long long)page_to_phys(&paPages[iPage + 1]), phys_to_virt(page_to_phys(&paPages[iPage + 1])) ));
+ }
+
+ SetPageReserved(&paPages[iPage]);
+ }
+ *pPhys = page_to_phys(paPages);
+ pvRet = phys_to_virt(page_to_phys(paPages));
+ }
+ else
+ pvRet = NULL;
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return pvRet;
+}
+RT_EXPORT_SYMBOL(RTMemContAlloc);
+
+
+/**
+ * Frees memory allocated using RTMemContAlloc().
+ *
+ * @param pv Pointer to return from RTMemContAlloc().
+ * @param cb The cb parameter passed to RTMemContAlloc().
+ */
+RTR0DECL(void) RTMemContFree(void *pv, size_t cb)
+{
+ if (pv)
+ {
+ int cOrder;
+ unsigned cPages;
+ unsigned iPage;
+ struct page *paPages;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /* validate */
+ AssertMsg(!((uintptr_t)pv & PAGE_OFFSET_MASK), ("pv=%p\n", pv));
+ Assert(cb > 0);
+
+ /* calc order and get pages */
+ cb = RT_ALIGN_Z(cb, PAGE_SIZE);
+ cPages = cb >> PAGE_SHIFT;
+ cOrder = CalcPowerOf2Order(cPages);
+ paPages = virt_to_page(pv);
+
+ /*
+ * Restore page attributes freeing the pages.
+ */
+ for (iPage = 0; iPage < cPages; iPage++)
+ {
+ ClearPageReserved(&paPages[iPage]);
+ }
+ __free_pages(paPages, cOrder);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ }
+}
+RT_EXPORT_SYMBOL(RTMemContFree);
+
diff --git a/src/VBox/Runtime/r0drv/linux/assert-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/assert-r0drv-linux.c
new file mode 100644
index 00000000..a7968d06
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/assert-r0drv-linux.c
@@ -0,0 +1,84 @@
+/* $Id: assert-r0drv-linux.c $ */
+/** @file
+ * IPRT - Assertion Workers, Ring-0 Drivers, Linux.
+ */
+
+/*
+ * Copyright (C) 2007-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+
+#include <iprt/assert.h>
+#include <iprt/log.h>
+#include <iprt/string.h>
+#include <iprt/stdarg.h>
+#include <iprt/asm.h>
+
+#include "internal/assert.h"
+
+
+DECLHIDDEN(void) rtR0AssertNativeMsg1(const char *pszExpr, unsigned uLine, const char *pszFile, const char *pszFunction)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ printk(KERN_EMERG
+ "\r\n!!Assertion Failed!!\r\n"
+ "Expression: %s\r\n"
+ "Location : %s(%d) %s\r\n",
+ pszExpr, pszFile, uLine, pszFunction);
+ IPRT_LINUX_RESTORE_EFL_AC();
+}
+
+
+DECLHIDDEN(void) rtR0AssertNativeMsg2V(bool fInitial, const char *pszFormat, va_list va)
+{
+ char szMsg[256];
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ RTStrPrintfV(szMsg, sizeof(szMsg) - 1, pszFormat, va);
+ szMsg[sizeof(szMsg) - 1] = '\0';
+ printk(KERN_EMERG "%s", szMsg);
+
+ NOREF(fInitial);
+ IPRT_LINUX_RESTORE_EFL_AC();
+}
+
+
+RTR0DECL(void) RTR0AssertPanicSystem(void)
+{
+ panic("%s%s", g_szRTAssertMsg1, g_szRTAssertMsg2);
+}
+RT_EXPORT_SYMBOL(RTR0AssertPanicSystem);
+
diff --git a/src/VBox/Runtime/r0drv/linux/initterm-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/initterm-r0drv-linux.c
new file mode 100644
index 00000000..cbdb324a
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/initterm-r0drv-linux.c
@@ -0,0 +1,138 @@
+/* $Id: initterm-r0drv-linux.c $ */
+/** @file
+ * IPRT - Initialization & Termination, R0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+#include <iprt/errcore.h>
+#include <iprt/assert.h>
+#include "internal/initterm.h"
+
+
+/*********************************************************************************************************************************
+* Global Variables *
+*********************************************************************************************************************************/
+/** The IPRT work queue. */
+#if RTLNX_VER_MIN(2,5,41)
+static struct workqueue_struct *g_prtR0LnxWorkQueue;
+#else
+static DECLARE_TASK_QUEUE(g_rtR0LnxWorkQueue);
+#endif
+
+
+/**
+ * Pushes an item onto the IPRT work queue.
+ *
+ * @param pWork The work item.
+ * @param pfnWorker The callback function. It will be called back
+ * with @a pWork as argument.
+ */
+DECLHIDDEN(void) rtR0LnxWorkqueuePush(RTR0LNXWORKQUEUEITEM *pWork, void (*pfnWorker)(RTR0LNXWORKQUEUEITEM *))
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+#if RTLNX_VER_MIN(2,5,41)
+# if RTLNX_VER_MIN(2,6,20)
+ INIT_WORK(pWork, pfnWorker);
+# else
+ INIT_WORK(pWork, (void (*)(void *))pfnWorker, pWork);
+# endif
+ queue_work(g_prtR0LnxWorkQueue, pWork);
+#else
+ INIT_TQUEUE(pWork, (void (*)(void *))pfnWorker, pWork);
+ queue_task(pWork, &g_rtR0LnxWorkQueue);
+#endif
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+}
+
+
+/**
+ * Flushes all items in the IPRT work queue.
+ *
+ * @remarks This is mostly for 2.4.x compatability. Must not be called from
+ * atomic contexts or with unncessary locks held.
+ */
+DECLHIDDEN(void) rtR0LnxWorkqueueFlush(void)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+#if RTLNX_VER_MIN(2,5,41)
+ flush_workqueue(g_prtR0LnxWorkQueue);
+#else
+ run_task_queue(&g_rtR0LnxWorkQueue);
+#endif
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+}
+
+
+DECLHIDDEN(int) rtR0InitNative(void)
+{
+ int rc = VINF_SUCCESS;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+#if RTLNX_VER_MIN(2,5,41)
+ #if RTLNX_VER_MIN(2,6,13)
+ g_prtR0LnxWorkQueue = create_workqueue("iprt-VBoxWQueue");
+ #else
+ g_prtR0LnxWorkQueue = create_workqueue("iprt-VBoxQ");
+ #endif
+ if (!g_prtR0LnxWorkQueue)
+ rc = VERR_NO_MEMORY;
+#endif
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+}
+
+
+DECLHIDDEN(void) rtR0TermNative(void)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ rtR0LnxWorkqueueFlush();
+#if RTLNX_VER_MIN(2,5,41)
+ destroy_workqueue(g_prtR0LnxWorkQueue);
+ g_prtR0LnxWorkQueue = NULL;
+#endif
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+}
+
diff --git a/src/VBox/Runtime/r0drv/linux/memobj-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/memobj-r0drv-linux.c
new file mode 100644
index 00000000..8342fbf8
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/memobj-r0drv-linux.c
@@ -0,0 +1,2104 @@
+/* $Id: memobj-r0drv-linux.c $ */
+/** @file
+ * IPRT - Ring-0 Memory Objects, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+
+#include <iprt/memobj.h>
+#include <iprt/assert.h>
+#include <iprt/err.h>
+#include <iprt/log.h>
+#include <iprt/mem.h>
+#include <iprt/process.h>
+#include <iprt/string.h>
+#include "internal/memobj.h"
+#include "internal/iprt.h"
+
+
+/*********************************************************************************************************************************
+* Defined Constants And Macros *
+*********************************************************************************************************************************/
+/* early 2.6 kernels */
+#ifndef PAGE_SHARED_EXEC
+# define PAGE_SHARED_EXEC PAGE_SHARED
+#endif
+#ifndef PAGE_READONLY_EXEC
+# define PAGE_READONLY_EXEC PAGE_READONLY
+#endif
+
+/** @def IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
+ * Whether we use alloc_vm_area (3.2+) for executable memory.
+ * This is a must for 5.8+, but we enable it all the way back to 3.2.x for
+ * better W^R compliance (fExecutable flag). */
+#if RTLNX_VER_RANGE(3,2,0, 5,10,0) || defined(DOXYGEN_RUNNING)
+# define IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
+#endif
+/** @def IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC
+ * alloc_vm_area was removed with 5.10 so we have to resort to a different way
+ * to allocate executable memory.
+ * It would be possible to remove IPRT_USE_ALLOC_VM_AREA_FOR_EXEC and use
+ * this path execlusively for 3.2+ but no time to test it really works on every
+ * supported kernel, so better play safe for now.
+ */
+#if RTLNX_VER_MIN(5,10,0) || defined(DOXYGEN_RUNNING)
+# define IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC
+#endif
+
+/*
+ * 2.6.29+ kernels don't work with remap_pfn_range() anymore because
+ * track_pfn_vma_new() is apparently not defined for non-RAM pages.
+ * It should be safe to use vm_insert_page() older kernels as well.
+ */
+#if RTLNX_VER_MIN(2,6,23)
+# define VBOX_USE_INSERT_PAGE
+#endif
+#if defined(CONFIG_X86_PAE) \
+ && ( defined(HAVE_26_STYLE_REMAP_PAGE_RANGE) \
+ || RTLNX_VER_RANGE(2,6,0, 2,6,11) )
+# define VBOX_USE_PAE_HACK
+#endif
+
+/* gfp_t was introduced in 2.6.14, define it for earlier. */
+#if RTLNX_VER_MAX(2,6,14)
+# define gfp_t unsigned
+#endif
+
+/*
+ * Wrappers around mmap_lock/mmap_sem difference.
+ */
+#if RTLNX_VER_MIN(5,8,0)
+# define LNX_MM_DOWN_READ(a_pMm) down_read(&(a_pMm)->mmap_lock)
+# define LNX_MM_UP_READ(a_pMm) up_read(&(a_pMm)->mmap_lock)
+# define LNX_MM_DOWN_WRITE(a_pMm) down_write(&(a_pMm)->mmap_lock)
+# define LNX_MM_UP_WRITE(a_pMm) up_write(&(a_pMm)->mmap_lock)
+#else
+# define LNX_MM_DOWN_READ(a_pMm) down_read(&(a_pMm)->mmap_sem)
+# define LNX_MM_UP_READ(a_pMm) up_read(&(a_pMm)->mmap_sem)
+# define LNX_MM_DOWN_WRITE(a_pMm) down_write(&(a_pMm)->mmap_sem)
+# define LNX_MM_UP_WRITE(a_pMm) up_write(&(a_pMm)->mmap_sem)
+#endif
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+/**
+ * The Linux version of the memory object structure.
+ */
+typedef struct RTR0MEMOBJLNX
+{
+ /** The core structure. */
+ RTR0MEMOBJINTERNAL Core;
+ /** Set if the allocation is contiguous.
+ * This means it has to be given back as one chunk. */
+ bool fContiguous;
+ /** Set if executable allocation. */
+ bool fExecutable;
+ /** Set if we've vmap'ed the memory into ring-0. */
+ bool fMappedToRing0;
+ /** This is non-zero if large page allocation. */
+ uint8_t cLargePageOrder;
+#ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
+ /** Return from alloc_vm_area() that we now need to use for executable
+ * memory. */
+ struct vm_struct *pArea;
+ /** PTE array that goes along with pArea (must be freed). */
+ pte_t **papPtesForArea;
+#endif
+ /** The pages in the apPages array. */
+ size_t cPages;
+ /** Array of struct page pointers. (variable size) */
+ struct page *apPages[1];
+} RTR0MEMOBJLNX;
+/** Pointer to the linux memory object. */
+typedef RTR0MEMOBJLNX *PRTR0MEMOBJLNX;
+
+
+static void rtR0MemObjLinuxFreePages(PRTR0MEMOBJLNX pMemLnx);
+
+
+/**
+ * Helper that converts from a RTR0PROCESS handle to a linux task.
+ *
+ * @returns The corresponding Linux task.
+ * @param R0Process IPRT ring-0 process handle.
+ */
+static struct task_struct *rtR0ProcessToLinuxTask(RTR0PROCESS R0Process)
+{
+ /** @todo fix rtR0ProcessToLinuxTask!! */
+ /** @todo many (all?) callers currently assume that we return 'current'! */
+ return R0Process == RTR0ProcHandleSelf() ? current : NULL;
+}
+
+
+/**
+ * Compute order. Some functions allocate 2^order pages.
+ *
+ * @returns order.
+ * @param cPages Number of pages.
+ */
+static int rtR0MemObjLinuxOrder(size_t cPages)
+{
+ int iOrder;
+ size_t cTmp;
+
+ for (iOrder = 0, cTmp = cPages; cTmp >>= 1; ++iOrder)
+ ;
+ if (cPages & ~((size_t)1 << iOrder))
+ ++iOrder;
+
+ return iOrder;
+}
+
+
+/**
+ * Converts from RTMEM_PROT_* to Linux PAGE_*.
+ *
+ * @returns Linux page protection constant.
+ * @param fProt The IPRT protection mask.
+ * @param fKernel Whether it applies to kernel or user space.
+ */
+static pgprot_t rtR0MemObjLinuxConvertProt(unsigned fProt, bool fKernel)
+{
+ switch (fProt)
+ {
+ default:
+ AssertMsgFailed(("%#x %d\n", fProt, fKernel)); RT_FALL_THRU();
+ case RTMEM_PROT_NONE:
+ return PAGE_NONE;
+
+ case RTMEM_PROT_READ:
+ return fKernel ? PAGE_KERNEL_RO : PAGE_READONLY;
+
+ case RTMEM_PROT_WRITE:
+ case RTMEM_PROT_WRITE | RTMEM_PROT_READ:
+ return fKernel ? PAGE_KERNEL : PAGE_SHARED;
+
+ case RTMEM_PROT_EXEC:
+ case RTMEM_PROT_EXEC | RTMEM_PROT_READ:
+#if defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
+ if (fKernel)
+ {
+ pgprot_t fPg = MY_PAGE_KERNEL_EXEC;
+ pgprot_val(fPg) &= ~_PAGE_RW;
+ return fPg;
+ }
+ return PAGE_READONLY_EXEC;
+#else
+ return fKernel ? MY_PAGE_KERNEL_EXEC : PAGE_READONLY_EXEC;
+#endif
+
+ case RTMEM_PROT_WRITE | RTMEM_PROT_EXEC:
+ case RTMEM_PROT_WRITE | RTMEM_PROT_EXEC | RTMEM_PROT_READ:
+ return fKernel ? MY_PAGE_KERNEL_EXEC : PAGE_SHARED_EXEC;
+ }
+}
+
+
+/**
+ * Worker for rtR0MemObjNativeReserveUser and rtR0MemObjNativerMapUser that creates
+ * an empty user space mapping.
+ *
+ * We acquire the mmap_sem/mmap_lock of the task!
+ *
+ * @returns Pointer to the mapping.
+ * (void *)-1 on failure.
+ * @param R3PtrFixed (RTR3PTR)-1 if anywhere, otherwise a specific location.
+ * @param cb The size of the mapping.
+ * @param uAlignment The alignment of the mapping.
+ * @param pTask The Linux task to create this mapping in.
+ * @param fProt The RTMEM_PROT_* mask.
+ */
+static void *rtR0MemObjLinuxDoMmap(RTR3PTR R3PtrFixed, size_t cb, size_t uAlignment, struct task_struct *pTask, unsigned fProt)
+{
+ unsigned fLnxProt;
+ unsigned long ulAddr;
+
+ Assert(pTask == current); /* do_mmap */
+ RT_NOREF_PV(pTask);
+
+ /*
+ * Convert from IPRT protection to mman.h PROT_ and call do_mmap.
+ */
+ fProt &= (RTMEM_PROT_NONE | RTMEM_PROT_READ | RTMEM_PROT_WRITE | RTMEM_PROT_EXEC);
+ if (fProt == RTMEM_PROT_NONE)
+ fLnxProt = PROT_NONE;
+ else
+ {
+ fLnxProt = 0;
+ if (fProt & RTMEM_PROT_READ)
+ fLnxProt |= PROT_READ;
+ if (fProt & RTMEM_PROT_WRITE)
+ fLnxProt |= PROT_WRITE;
+ if (fProt & RTMEM_PROT_EXEC)
+ fLnxProt |= PROT_EXEC;
+ }
+
+ if (R3PtrFixed != (RTR3PTR)-1)
+ {
+#if RTLNX_VER_MIN(3,5,0)
+ ulAddr = vm_mmap(NULL, R3PtrFixed, cb, fLnxProt, MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, 0);
+#else
+ LNX_MM_DOWN_WRITE(pTask->mm);
+ ulAddr = do_mmap(NULL, R3PtrFixed, cb, fLnxProt, MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, 0);
+ LNX_MM_UP_WRITE(pTask->mm);
+#endif
+ }
+ else
+ {
+#if RTLNX_VER_MIN(3,5,0)
+ ulAddr = vm_mmap(NULL, 0, cb, fLnxProt, MAP_SHARED | MAP_ANONYMOUS, 0);
+#else
+ LNX_MM_DOWN_WRITE(pTask->mm);
+ ulAddr = do_mmap(NULL, 0, cb, fLnxProt, MAP_SHARED | MAP_ANONYMOUS, 0);
+ LNX_MM_UP_WRITE(pTask->mm);
+#endif
+ if ( !(ulAddr & ~PAGE_MASK)
+ && (ulAddr & (uAlignment - 1)))
+ {
+ /** @todo implement uAlignment properly... We'll probably need to make some dummy mappings to fill
+ * up alignment gaps. This is of course complicated by fragmentation (which we might have cause
+ * ourselves) and further by there begin two mmap strategies (top / bottom). */
+ /* For now, just ignore uAlignment requirements... */
+ }
+ }
+
+
+ if (ulAddr & ~PAGE_MASK) /* ~PAGE_MASK == PAGE_OFFSET_MASK */
+ return (void *)-1;
+ return (void *)ulAddr;
+}
+
+
+/**
+ * Worker that destroys a user space mapping.
+ * Undoes what rtR0MemObjLinuxDoMmap did.
+ *
+ * We acquire the mmap_sem/mmap_lock of the task!
+ *
+ * @param pv The ring-3 mapping.
+ * @param cb The size of the mapping.
+ * @param pTask The Linux task to destroy this mapping in.
+ */
+static void rtR0MemObjLinuxDoMunmap(void *pv, size_t cb, struct task_struct *pTask)
+{
+#if RTLNX_VER_MIN(3,5,0)
+ Assert(pTask == current); RT_NOREF_PV(pTask);
+ vm_munmap((unsigned long)pv, cb);
+#elif defined(USE_RHEL4_MUNMAP)
+ LNX_MM_DOWN_WRITE(pTask->mm);
+ do_munmap(pTask->mm, (unsigned long)pv, cb, 0); /* should it be 1 or 0? */
+ LNX_MM_UP_WRITE(pTask->mm);
+#else
+ LNX_MM_DOWN_WRITE(pTask->mm);
+ do_munmap(pTask->mm, (unsigned long)pv, cb);
+ LNX_MM_UP_WRITE(pTask->mm);
+#endif
+}
+
+
+/**
+ * Internal worker that allocates physical pages and creates the memory object for them.
+ *
+ * @returns IPRT status code.
+ * @param ppMemLnx Where to store the memory object pointer.
+ * @param enmType The object type.
+ * @param cb The number of bytes to allocate.
+ * @param uAlignment The alignment of the physical memory.
+ * Only valid if fContiguous == true, ignored otherwise.
+ * @param fFlagsLnx The page allocation flags (GPFs).
+ * @param fContiguous Whether the allocation must be contiguous.
+ * @param fExecutable Whether the memory must be executable.
+ * @param rcNoMem What to return when we're out of pages.
+ * @param pszTag Allocation tag used for statistics and such.
+ */
+static int rtR0MemObjLinuxAllocPages(PRTR0MEMOBJLNX *ppMemLnx, RTR0MEMOBJTYPE enmType, size_t cb,
+ size_t uAlignment, gfp_t fFlagsLnx, bool fContiguous, bool fExecutable, int rcNoMem,
+ const char *pszTag)
+{
+ size_t iPage;
+ size_t const cPages = cb >> PAGE_SHIFT;
+ struct page *paPages;
+
+ /*
+ * Allocate a memory object structure that's large enough to contain
+ * the page pointer array.
+ */
+ PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(RT_UOFFSETOF_DYN(RTR0MEMOBJLNX, apPages[cPages]), enmType,
+ NULL, cb, pszTag);
+ if (!pMemLnx)
+ return VERR_NO_MEMORY;
+ pMemLnx->Core.fFlags |= RTR0MEMOBJ_FLAGS_UNINITIALIZED_AT_ALLOC;
+ pMemLnx->cPages = cPages;
+
+ if (cPages > 255)
+ {
+# ifdef __GFP_REPEAT
+ /* Try hard to allocate the memory, but the allocation attempt might fail. */
+ fFlagsLnx |= __GFP_REPEAT;
+# endif
+# ifdef __GFP_NOMEMALLOC
+ /* Introduced with Linux 2.6.12: Don't use emergency reserves */
+ fFlagsLnx |= __GFP_NOMEMALLOC;
+# endif
+ }
+
+ /*
+ * Allocate the pages.
+ * For small allocations we'll try contiguous first and then fall back on page by page.
+ */
+#if RTLNX_VER_MIN(2,4,22)
+ if ( fContiguous
+ || cb <= PAGE_SIZE * 2)
+ {
+# ifdef VBOX_USE_INSERT_PAGE
+ paPages = alloc_pages(fFlagsLnx | __GFP_COMP | __GFP_NOWARN, rtR0MemObjLinuxOrder(cPages));
+# else
+ paPages = alloc_pages(fFlagsLnx | __GFP_NOWARN, rtR0MemObjLinuxOrder(cPages));
+# endif
+ if (paPages)
+ {
+ fContiguous = true;
+ for (iPage = 0; iPage < cPages; iPage++)
+ pMemLnx->apPages[iPage] = &paPages[iPage];
+ }
+ else if (fContiguous)
+ {
+ rtR0MemObjDelete(&pMemLnx->Core);
+ return rcNoMem;
+ }
+ }
+
+ if (!fContiguous)
+ {
+ /** @todo Try use alloc_pages_bulk_array when available, it should be faster
+ * than a alloc_page loop. Put it in #ifdefs similar to
+ * IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC. */
+ for (iPage = 0; iPage < cPages; iPage++)
+ {
+ pMemLnx->apPages[iPage] = alloc_page(fFlagsLnx | __GFP_NOWARN);
+ if (RT_UNLIKELY(!pMemLnx->apPages[iPage]))
+ {
+ while (iPage-- > 0)
+ __free_page(pMemLnx->apPages[iPage]);
+ rtR0MemObjDelete(&pMemLnx->Core);
+ return rcNoMem;
+ }
+ }
+ }
+
+#else /* < 2.4.22 */
+ /** @todo figure out why we didn't allocate page-by-page on 2.4.21 and older... */
+ paPages = alloc_pages(fFlagsLnx, rtR0MemObjLinuxOrder(cPages));
+ if (!paPages)
+ {
+ rtR0MemObjDelete(&pMemLnx->Core);
+ return rcNoMem;
+ }
+ for (iPage = 0; iPage < cPages; iPage++)
+ {
+ pMemLnx->apPages[iPage] = &paPages[iPage];
+ if (fExecutable)
+ MY_SET_PAGES_EXEC(pMemLnx->apPages[iPage], 1);
+ if (PageHighMem(pMemLnx->apPages[iPage]))
+ BUG();
+ }
+
+ fContiguous = true;
+#endif /* < 2.4.22 */
+ pMemLnx->fContiguous = fContiguous;
+ pMemLnx->fExecutable = fExecutable;
+
+#if RTLNX_VER_MAX(4,5,0)
+ /*
+ * Reserve the pages.
+ *
+ * Linux >= 4.5 with CONFIG_DEBUG_VM panics when setting PG_reserved on compound
+ * pages. According to Michal Hocko this shouldn't be necessary anyway because
+ * as pages which are not on the LRU list are never evictable.
+ */
+ for (iPage = 0; iPage < cPages; iPage++)
+ SetPageReserved(pMemLnx->apPages[iPage]);
+#endif
+
+ /*
+ * Note that the physical address of memory allocated with alloc_pages(flags, order)
+ * is always 2^(PAGE_SHIFT+order)-aligned.
+ */
+ if ( fContiguous
+ && uAlignment > PAGE_SIZE)
+ {
+ /*
+ * Check for alignment constraints. The physical address of memory allocated with
+ * alloc_pages(flags, order) is always 2^(PAGE_SHIFT+order)-aligned.
+ */
+ if (RT_UNLIKELY(page_to_phys(pMemLnx->apPages[0]) & (uAlignment - 1)))
+ {
+ /*
+ * This should never happen!
+ */
+ printk("rtR0MemObjLinuxAllocPages(cb=0x%lx, uAlignment=0x%lx): alloc_pages(..., %d) returned physical memory at 0x%lx!\n",
+ (unsigned long)cb, (unsigned long)uAlignment, rtR0MemObjLinuxOrder(cPages), (unsigned long)page_to_phys(pMemLnx->apPages[0]));
+ rtR0MemObjLinuxFreePages(pMemLnx);
+ return rcNoMem;
+ }
+ }
+
+ *ppMemLnx = pMemLnx;
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Frees the physical pages allocated by the rtR0MemObjLinuxAllocPages() call.
+ *
+ * This method does NOT free the object.
+ *
+ * @param pMemLnx The object which physical pages should be freed.
+ */
+static void rtR0MemObjLinuxFreePages(PRTR0MEMOBJLNX pMemLnx)
+{
+ size_t iPage = pMemLnx->cPages;
+ if (iPage > 0)
+ {
+ /*
+ * Restore the page flags.
+ */
+ while (iPage-- > 0)
+ {
+#if RTLNX_VER_MAX(4,5,0)
+ /* See SetPageReserved() in rtR0MemObjLinuxAllocPages() */
+ ClearPageReserved(pMemLnx->apPages[iPage]);
+#endif
+#if RTLNX_VER_MAX(2,4,22)
+ if (pMemLnx->fExecutable)
+ MY_SET_PAGES_NOEXEC(pMemLnx->apPages[iPage], 1);
+#endif
+ }
+
+ /*
+ * Free the pages.
+ */
+#if RTLNX_VER_MIN(2,4,22)
+ if (!pMemLnx->fContiguous)
+ {
+ iPage = pMemLnx->cPages;
+ while (iPage-- > 0)
+ __free_page(pMemLnx->apPages[iPage]);
+ }
+ else
+#endif
+ __free_pages(pMemLnx->apPages[0], rtR0MemObjLinuxOrder(pMemLnx->cPages));
+
+ pMemLnx->cPages = 0;
+ }
+}
+
+
+#ifdef IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC
+/**
+ * User data passed to the apply_to_page_range() callback.
+ */
+typedef struct LNXAPPLYPGRANGE
+{
+ /** Pointer to the memory object. */
+ PRTR0MEMOBJLNX pMemLnx;
+ /** The page protection flags to apply. */
+ pgprot_t fPg;
+} LNXAPPLYPGRANGE;
+/** Pointer to the user data. */
+typedef LNXAPPLYPGRANGE *PLNXAPPLYPGRANGE;
+/** Pointer to the const user data. */
+typedef const LNXAPPLYPGRANGE *PCLNXAPPLYPGRANGE;
+
+/**
+ * Callback called in apply_to_page_range().
+ *
+ * @returns Linux status code.
+ * @param pPte Pointer to the page table entry for the given address.
+ * @param uAddr The address to apply the new protection to.
+ * @param pvUser The opaque user data.
+ */
+static int rtR0MemObjLinuxApplyPageRange(pte_t *pPte, unsigned long uAddr, void *pvUser)
+{
+ PCLNXAPPLYPGRANGE pArgs = (PCLNXAPPLYPGRANGE)pvUser;
+ PRTR0MEMOBJLNX pMemLnx = pArgs->pMemLnx;
+ size_t idxPg = (uAddr - (unsigned long)pMemLnx->Core.pv) >> PAGE_SHIFT;
+
+ set_pte(pPte, mk_pte(pMemLnx->apPages[idxPg], pArgs->fPg));
+ return 0;
+}
+#endif
+
+
+/**
+ * Maps the allocation into ring-0.
+ *
+ * This will update the RTR0MEMOBJLNX::Core.pv and RTR0MEMOBJ::fMappedToRing0 members.
+ *
+ * Contiguous mappings that isn't in 'high' memory will already be mapped into kernel
+ * space, so we'll use that mapping if possible. If execute access is required, we'll
+ * play safe and do our own mapping.
+ *
+ * @returns IPRT status code.
+ * @param pMemLnx The linux memory object to map.
+ * @param fExecutable Whether execute access is required.
+ */
+static int rtR0MemObjLinuxVMap(PRTR0MEMOBJLNX pMemLnx, bool fExecutable)
+{
+ int rc = VINF_SUCCESS;
+
+ /*
+ * Choose mapping strategy.
+ */
+ bool fMustMap = fExecutable
+ || !pMemLnx->fContiguous;
+ if (!fMustMap)
+ {
+ size_t iPage = pMemLnx->cPages;
+ while (iPage-- > 0)
+ if (PageHighMem(pMemLnx->apPages[iPage]))
+ {
+ fMustMap = true;
+ break;
+ }
+ }
+
+ Assert(!pMemLnx->Core.pv);
+ Assert(!pMemLnx->fMappedToRing0);
+
+ if (fMustMap)
+ {
+ /*
+ * Use vmap - 2.4.22 and later.
+ */
+#if RTLNX_VER_MIN(2,4,22)
+ pgprot_t fPg;
+ pgprot_val(fPg) = _PAGE_PRESENT | _PAGE_RW;
+# ifdef _PAGE_NX
+ if (!fExecutable)
+ pgprot_val(fPg) |= _PAGE_NX;
+# endif
+
+# ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
+ if (fExecutable)
+ {
+# if RTLNX_VER_MIN(3,2,51)
+ pte_t **papPtes = (pte_t **)kmalloc_array(pMemLnx->cPages, sizeof(papPtes[0]), GFP_KERNEL);
+# else
+ pte_t **papPtes = (pte_t **)kmalloc(pMemLnx->cPages * sizeof(papPtes[0]), GFP_KERNEL);
+# endif
+ if (papPtes)
+ {
+ pMemLnx->pArea = alloc_vm_area(pMemLnx->Core.cb, papPtes); /* Note! pArea->nr_pages is not set. */
+ if (pMemLnx->pArea)
+ {
+ size_t i;
+ Assert(pMemLnx->pArea->size >= pMemLnx->Core.cb); /* Note! includes guard page. */
+ Assert(pMemLnx->pArea->addr);
+# ifdef _PAGE_NX
+ pgprot_val(fPg) |= _PAGE_NX; /* Uses RTR0MemObjProtect to clear NX when memory ready, W^X fashion. */
+# endif
+ pMemLnx->papPtesForArea = papPtes;
+ for (i = 0; i < pMemLnx->cPages; i++)
+ *papPtes[i] = mk_pte(pMemLnx->apPages[i], fPg);
+ pMemLnx->Core.pv = pMemLnx->pArea->addr;
+ pMemLnx->fMappedToRing0 = true;
+ }
+ else
+ {
+ kfree(papPtes);
+ rc = VERR_MAP_FAILED;
+ }
+ }
+ else
+ rc = VERR_MAP_FAILED;
+ }
+ else
+# endif
+ {
+# if defined(IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC)
+ if (fExecutable)
+ pgprot_val(fPg) |= _PAGE_NX; /* Uses RTR0MemObjProtect to clear NX when memory ready, W^X fashion. */
+# endif
+
+# ifdef VM_MAP
+ pMemLnx->Core.pv = vmap(&pMemLnx->apPages[0], pMemLnx->cPages, VM_MAP, fPg);
+# else
+ pMemLnx->Core.pv = vmap(&pMemLnx->apPages[0], pMemLnx->cPages, VM_ALLOC, fPg);
+# endif
+ if (pMemLnx->Core.pv)
+ pMemLnx->fMappedToRing0 = true;
+ else
+ rc = VERR_MAP_FAILED;
+ }
+#else /* < 2.4.22 */
+ rc = VERR_NOT_SUPPORTED;
+#endif
+ }
+ else
+ {
+ /*
+ * Use the kernel RAM mapping.
+ */
+ pMemLnx->Core.pv = phys_to_virt(page_to_phys(pMemLnx->apPages[0]));
+ Assert(pMemLnx->Core.pv);
+ }
+
+ return rc;
+}
+
+
+/**
+ * Undoes what rtR0MemObjLinuxVMap() did.
+ *
+ * @param pMemLnx The linux memory object.
+ */
+static void rtR0MemObjLinuxVUnmap(PRTR0MEMOBJLNX pMemLnx)
+{
+#if RTLNX_VER_MIN(2,4,22)
+# ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
+ if (pMemLnx->pArea)
+ {
+# if 0
+ pte_t **papPtes = pMemLnx->papPtesForArea;
+ size_t i;
+ for (i = 0; i < pMemLnx->cPages; i++)
+ *papPtes[i] = 0;
+# endif
+ free_vm_area(pMemLnx->pArea);
+ kfree(pMemLnx->papPtesForArea);
+ pMemLnx->pArea = NULL;
+ pMemLnx->papPtesForArea = NULL;
+ }
+ else
+# endif
+ if (pMemLnx->fMappedToRing0)
+ {
+ Assert(pMemLnx->Core.pv);
+ vunmap(pMemLnx->Core.pv);
+ pMemLnx->fMappedToRing0 = false;
+ }
+#else /* < 2.4.22 */
+ Assert(!pMemLnx->fMappedToRing0);
+#endif
+ pMemLnx->Core.pv = NULL;
+}
+
+
+DECLHIDDEN(int) rtR0MemObjNativeFree(RTR0MEMOBJ pMem)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)pMem;
+
+ /*
+ * Release any memory that we've allocated or locked.
+ */
+ switch (pMemLnx->Core.enmType)
+ {
+ case RTR0MEMOBJTYPE_PAGE:
+ case RTR0MEMOBJTYPE_LOW:
+ case RTR0MEMOBJTYPE_CONT:
+ case RTR0MEMOBJTYPE_PHYS:
+ case RTR0MEMOBJTYPE_PHYS_NC:
+ rtR0MemObjLinuxVUnmap(pMemLnx);
+ rtR0MemObjLinuxFreePages(pMemLnx);
+ break;
+
+ case RTR0MEMOBJTYPE_LARGE_PAGE:
+ {
+ uint32_t const cLargePages = pMemLnx->Core.cb >> (pMemLnx->cLargePageOrder + PAGE_SHIFT);
+ uint32_t iLargePage;
+ for (iLargePage = 0; iLargePage < cLargePages; iLargePage++)
+ __free_pages(pMemLnx->apPages[iLargePage << pMemLnx->cLargePageOrder], pMemLnx->cLargePageOrder);
+ pMemLnx->cPages = 0;
+
+#ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
+ Assert(!pMemLnx->pArea);
+ Assert(!pMemLnx->papPtesForArea);
+#endif
+ break;
+ }
+
+ case RTR0MEMOBJTYPE_LOCK:
+ if (pMemLnx->Core.u.Lock.R0Process != NIL_RTR0PROCESS)
+ {
+ struct task_struct *pTask = rtR0ProcessToLinuxTask(pMemLnx->Core.u.Lock.R0Process);
+ size_t iPage;
+ Assert(pTask);
+ if (pTask && pTask->mm)
+ LNX_MM_DOWN_READ(pTask->mm);
+
+ iPage = pMemLnx->cPages;
+ while (iPage-- > 0)
+ {
+ if (!PageReserved(pMemLnx->apPages[iPage]))
+ SetPageDirty(pMemLnx->apPages[iPage]);
+#if RTLNX_VER_MIN(4,6,0)
+ put_page(pMemLnx->apPages[iPage]);
+#else
+ page_cache_release(pMemLnx->apPages[iPage]);
+#endif
+ }
+
+ if (pTask && pTask->mm)
+ LNX_MM_UP_READ(pTask->mm);
+ }
+ /* else: kernel memory - nothing to do here. */
+ break;
+
+ case RTR0MEMOBJTYPE_RES_VIRT:
+ Assert(pMemLnx->Core.pv);
+ if (pMemLnx->Core.u.ResVirt.R0Process != NIL_RTR0PROCESS)
+ {
+ struct task_struct *pTask = rtR0ProcessToLinuxTask(pMemLnx->Core.u.Lock.R0Process);
+ Assert(pTask);
+ if (pTask && pTask->mm)
+ rtR0MemObjLinuxDoMunmap(pMemLnx->Core.pv, pMemLnx->Core.cb, pTask);
+ }
+ else
+ {
+ vunmap(pMemLnx->Core.pv);
+
+ Assert(pMemLnx->cPages == 1 && pMemLnx->apPages[0] != NULL);
+ __free_page(pMemLnx->apPages[0]);
+ pMemLnx->apPages[0] = NULL;
+ pMemLnx->cPages = 0;
+ }
+ pMemLnx->Core.pv = NULL;
+ break;
+
+ case RTR0MEMOBJTYPE_MAPPING:
+ Assert(pMemLnx->cPages == 0); Assert(pMemLnx->Core.pv);
+ if (pMemLnx->Core.u.ResVirt.R0Process != NIL_RTR0PROCESS)
+ {
+ struct task_struct *pTask = rtR0ProcessToLinuxTask(pMemLnx->Core.u.Lock.R0Process);
+ Assert(pTask);
+ if (pTask && pTask->mm)
+ rtR0MemObjLinuxDoMunmap(pMemLnx->Core.pv, pMemLnx->Core.cb, pTask);
+ }
+ else
+ vunmap(pMemLnx->Core.pv);
+ pMemLnx->Core.pv = NULL;
+ break;
+
+ default:
+ AssertMsgFailed(("enmType=%d\n", pMemLnx->Core.enmType));
+ return VERR_INTERNAL_ERROR;
+ }
+ IPRT_LINUX_RESTORE_EFL_ONLY_AC();
+ return VINF_SUCCESS;
+}
+
+
+DECLHIDDEN(int) rtR0MemObjNativeAllocPage(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, bool fExecutable, const char *pszTag)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ PRTR0MEMOBJLNX pMemLnx;
+ int rc;
+
+#if RTLNX_VER_MIN(2,4,22)
+ rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_PAGE, cb, PAGE_SIZE, GFP_HIGHUSER,
+ false /* non-contiguous */, fExecutable, VERR_NO_MEMORY, pszTag);
+#else
+ rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_PAGE, cb, PAGE_SIZE, GFP_USER,
+ false /* non-contiguous */, fExecutable, VERR_NO_MEMORY, pszTag);
+#endif
+ if (RT_SUCCESS(rc))
+ {
+ rc = rtR0MemObjLinuxVMap(pMemLnx, fExecutable);
+ if (RT_SUCCESS(rc))
+ {
+ *ppMem = &pMemLnx->Core;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+ }
+
+ rtR0MemObjLinuxFreePages(pMemLnx);
+ rtR0MemObjDelete(&pMemLnx->Core);
+ }
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+}
+
+
+DECLHIDDEN(int) rtR0MemObjNativeAllocLarge(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, size_t cbLargePage, uint32_t fFlags,
+ const char *pszTag)
+{
+#ifdef GFP_TRANSHUGE
+ /*
+ * Allocate a memory object structure that's large enough to contain
+ * the page pointer array.
+ */
+# ifdef __GFP_MOVABLE
+ unsigned const fGfp = (GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE;
+# else
+ unsigned const fGfp = (GFP_TRANSHUGE | __GFP_ZERO);
+# endif
+ size_t const cPagesPerLarge = cbLargePage >> PAGE_SHIFT;
+ unsigned const cLargePageOrder = rtR0MemObjLinuxOrder(cPagesPerLarge);
+ size_t const cLargePages = cb >> (cLargePageOrder + PAGE_SHIFT);
+ size_t const cPages = cb >> PAGE_SHIFT;
+ PRTR0MEMOBJLNX pMemLnx;
+
+ Assert(RT_BIT_64(cLargePageOrder + PAGE_SHIFT) == cbLargePage);
+ pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(RT_UOFFSETOF_DYN(RTR0MEMOBJLNX, apPages[cPages]),
+ RTR0MEMOBJTYPE_LARGE_PAGE, NULL, cb, pszTag);
+ if (pMemLnx)
+ {
+ size_t iLargePage;
+
+ pMemLnx->Core.fFlags |= RTR0MEMOBJ_FLAGS_ZERO_AT_ALLOC;
+ pMemLnx->cLargePageOrder = cLargePageOrder;
+ pMemLnx->cPages = cPages;
+
+ /*
+ * Allocate the requested number of large pages.
+ */
+ for (iLargePage = 0; iLargePage < cLargePages; iLargePage++)
+ {
+ struct page *paPages = alloc_pages(fGfp, cLargePageOrder);
+ if (paPages)
+ {
+ size_t const iPageBase = iLargePage << cLargePageOrder;
+ size_t iPage = cPagesPerLarge;
+ while (iPage-- > 0)
+ pMemLnx->apPages[iPageBase + iPage] = &paPages[iPage];
+ }
+ else
+ {
+ /*Log(("rtR0MemObjNativeAllocLarge: cb=%#zx cPages=%#zx cLargePages=%#zx cLargePageOrder=%u cPagesPerLarge=%#zx iLargePage=%#zx -> failed!\n",
+ cb, cPages, cLargePages, cLargePageOrder, cPagesPerLarge, iLargePage, paPages));*/
+ while (iLargePage-- > 0)
+ __free_pages(pMemLnx->apPages[iLargePage << (cLargePageOrder - PAGE_SHIFT)], cLargePageOrder);
+ rtR0MemObjDelete(&pMemLnx->Core);
+ return VERR_NO_MEMORY;
+ }
+ }
+ *ppMem = &pMemLnx->Core;
+ return VINF_SUCCESS;
+ }
+ return VERR_NO_MEMORY;
+
+#else
+ /*
+ * We don't call rtR0MemObjFallbackAllocLarge here as it can be a really
+ * bad idea to trigger the swap daemon and whatnot. So, just fail.
+ */
+ RT_NOREF(ppMem, cb, cbLargePage, fFlags, pszTag);
+ return VERR_NOT_SUPPORTED;
+#endif
+}
+
+
+DECLHIDDEN(int) rtR0MemObjNativeAllocLow(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, bool fExecutable, const char *pszTag)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ PRTR0MEMOBJLNX pMemLnx;
+ int rc;
+
+ /* Try to avoid GFP_DMA. GFM_DMA32 was introduced with Linux 2.6.15. */
+#if (defined(RT_ARCH_AMD64) || defined(CONFIG_X86_PAE)) && defined(GFP_DMA32)
+ /* ZONE_DMA32: 0-4GB */
+ rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_LOW, cb, PAGE_SIZE, GFP_DMA32,
+ false /* non-contiguous */, fExecutable, VERR_NO_LOW_MEMORY, pszTag);
+ if (RT_FAILURE(rc))
+#endif
+#ifdef RT_ARCH_AMD64
+ /* ZONE_DMA: 0-16MB */
+ rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_LOW, cb, PAGE_SIZE, GFP_DMA,
+ false /* non-contiguous */, fExecutable, VERR_NO_LOW_MEMORY, pszTag);
+#else
+# ifdef CONFIG_X86_PAE
+# endif
+ /* ZONE_NORMAL: 0-896MB */
+ rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_LOW, cb, PAGE_SIZE, GFP_USER,
+ false /* non-contiguous */, fExecutable, VERR_NO_LOW_MEMORY, pszTag);
+#endif
+ if (RT_SUCCESS(rc))
+ {
+ rc = rtR0MemObjLinuxVMap(pMemLnx, fExecutable);
+ if (RT_SUCCESS(rc))
+ {
+ *ppMem = &pMemLnx->Core;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+ }
+
+ rtR0MemObjLinuxFreePages(pMemLnx);
+ rtR0MemObjDelete(&pMemLnx->Core);
+ }
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+}
+
+
+DECLHIDDEN(int) rtR0MemObjNativeAllocCont(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, bool fExecutable, const char *pszTag)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ PRTR0MEMOBJLNX pMemLnx;
+ int rc;
+
+#if (defined(RT_ARCH_AMD64) || defined(CONFIG_X86_PAE)) && defined(GFP_DMA32)
+ /* ZONE_DMA32: 0-4GB */
+ rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_CONT, cb, PAGE_SIZE, GFP_DMA32,
+ true /* contiguous */, fExecutable, VERR_NO_CONT_MEMORY, pszTag);
+ if (RT_FAILURE(rc))
+#endif
+#ifdef RT_ARCH_AMD64
+ /* ZONE_DMA: 0-16MB */
+ rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_CONT, cb, PAGE_SIZE, GFP_DMA,
+ true /* contiguous */, fExecutable, VERR_NO_CONT_MEMORY, pszTag);
+#else
+ /* ZONE_NORMAL (32-bit hosts): 0-896MB */
+ rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_CONT, cb, PAGE_SIZE, GFP_USER,
+ true /* contiguous */, fExecutable, VERR_NO_CONT_MEMORY, pszTag);
+#endif
+ if (RT_SUCCESS(rc))
+ {
+ rc = rtR0MemObjLinuxVMap(pMemLnx, fExecutable);
+ if (RT_SUCCESS(rc))
+ {
+#if defined(RT_STRICT) && (defined(RT_ARCH_AMD64) || defined(CONFIG_HIGHMEM64G))
+ size_t iPage = pMemLnx->cPages;
+ while (iPage-- > 0)
+ Assert(page_to_phys(pMemLnx->apPages[iPage]) < _4G);
+#endif
+ pMemLnx->Core.u.Cont.Phys = page_to_phys(pMemLnx->apPages[0]);
+ *ppMem = &pMemLnx->Core;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+ }
+
+ rtR0MemObjLinuxFreePages(pMemLnx);
+ rtR0MemObjDelete(&pMemLnx->Core);
+ }
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+}
+
+
+/**
+ * Worker for rtR0MemObjLinuxAllocPhysSub that tries one allocation strategy.
+ *
+ * @returns IPRT status code.
+ * @param ppMemLnx Where to
+ * @param enmType The object type.
+ * @param cb The size of the allocation.
+ * @param uAlignment The alignment of the physical memory.
+ * Only valid for fContiguous == true, ignored otherwise.
+ * @param PhysHighest See rtR0MemObjNativeAllocPhys.
+ * @param pszTag Allocation tag used for statistics and such.
+ * @param fGfp The Linux GFP flags to use for the allocation.
+ */
+static int rtR0MemObjLinuxAllocPhysSub2(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJTYPE enmType,
+ size_t cb, size_t uAlignment, RTHCPHYS PhysHighest, const char *pszTag, gfp_t fGfp)
+{
+ PRTR0MEMOBJLNX pMemLnx;
+ int rc = rtR0MemObjLinuxAllocPages(&pMemLnx, enmType, cb, uAlignment, fGfp,
+ enmType == RTR0MEMOBJTYPE_PHYS /* contiguous / non-contiguous */,
+ false /*fExecutable*/, VERR_NO_PHYS_MEMORY, pszTag);
+ if (RT_FAILURE(rc))
+ return rc;
+
+ /*
+ * Check the addresses if necessary. (Can be optimized a bit for PHYS.)
+ */
+ if (PhysHighest != NIL_RTHCPHYS)
+ {
+ size_t iPage = pMemLnx->cPages;
+ while (iPage-- > 0)
+ if (page_to_phys(pMemLnx->apPages[iPage]) > PhysHighest)
+ {
+ rtR0MemObjLinuxFreePages(pMemLnx);
+ rtR0MemObjDelete(&pMemLnx->Core);
+ return VERR_NO_MEMORY;
+ }
+ }
+
+ /*
+ * Complete the object.
+ */
+ if (enmType == RTR0MEMOBJTYPE_PHYS)
+ {
+ pMemLnx->Core.u.Phys.PhysBase = page_to_phys(pMemLnx->apPages[0]);
+ pMemLnx->Core.u.Phys.fAllocated = true;
+ }
+ *ppMem = &pMemLnx->Core;
+ return rc;
+}
+
+
+/**
+ * Worker for rtR0MemObjNativeAllocPhys and rtR0MemObjNativeAllocPhysNC.
+ *
+ * @returns IPRT status code.
+ * @param ppMem Where to store the memory object pointer on success.
+ * @param enmType The object type.
+ * @param cb The size of the allocation.
+ * @param uAlignment The alignment of the physical memory.
+ * Only valid for enmType == RTR0MEMOBJTYPE_PHYS, ignored otherwise.
+ * @param PhysHighest See rtR0MemObjNativeAllocPhys.
+ * @param pszTag Allocation tag used for statistics and such.
+ */
+static int rtR0MemObjLinuxAllocPhysSub(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJTYPE enmType,
+ size_t cb, size_t uAlignment, RTHCPHYS PhysHighest, const char *pszTag)
+{
+ int rc;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * There are two clear cases and that's the <=16MB and anything-goes ones.
+ * When the physical address limit is somewhere in-between those two we'll
+ * just have to try, starting with HIGHUSER and working our way thru the
+ * different types, hoping we'll get lucky.
+ *
+ * We should probably move this physical address restriction logic up to
+ * the page alloc function as it would be more efficient there. But since
+ * we don't expect this to be a performance issue just yet it can wait.
+ */
+ if (PhysHighest == NIL_RTHCPHYS)
+ /* ZONE_HIGHMEM: the whole physical memory */
+ rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, pszTag, GFP_HIGHUSER);
+ else if (PhysHighest <= _1M * 16)
+ /* ZONE_DMA: 0-16MB */
+ rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, pszTag, GFP_DMA);
+ else
+ {
+ rc = VERR_NO_MEMORY;
+ if (RT_FAILURE(rc))
+ /* ZONE_HIGHMEM: the whole physical memory */
+ rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, pszTag, GFP_HIGHUSER);
+ if (RT_FAILURE(rc))
+ /* ZONE_NORMAL: 0-896MB */
+ rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, pszTag, GFP_USER);
+#ifdef GFP_DMA32
+ if (RT_FAILURE(rc))
+ /* ZONE_DMA32: 0-4GB */
+ rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, pszTag, GFP_DMA32);
+#endif
+ if (RT_FAILURE(rc))
+ /* ZONE_DMA: 0-16MB */
+ rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, pszTag, GFP_DMA);
+ }
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+}
+
+
+/**
+ * Translates a kernel virtual address to a linux page structure by walking the
+ * page tables.
+ *
+ * @note We do assume that the page tables will not change as we are walking
+ * them. This assumption is rather forced by the fact that I could not
+ * immediately see any way of preventing this from happening. So, we
+ * take some extra care when accessing them.
+ *
+ * Because of this, we don't want to use this function on memory where
+ * attribute changes to nearby pages is likely to cause large pages to
+ * be used or split up. So, don't use this for the linear mapping of
+ * physical memory.
+ *
+ * @returns Pointer to the page structur or NULL if it could not be found.
+ * @param pv The kernel virtual address.
+ */
+RTDECL(struct page *) rtR0MemObjLinuxVirtToPage(void *pv)
+{
+ unsigned long ulAddr = (unsigned long)pv;
+ unsigned long pfn;
+ struct page *pPage;
+ pte_t *pEntry;
+ union
+ {
+ pgd_t Global;
+#if RTLNX_VER_MIN(4,12,0)
+ p4d_t Four;
+#endif
+#if RTLNX_VER_MIN(2,6,11)
+ pud_t Upper;
+#endif
+ pmd_t Middle;
+ pte_t Entry;
+ } u;
+
+ /* Should this happen in a situation this code will be called in? And if
+ * so, can it change under our feet? See also
+ * "Documentation/vm/active_mm.txt" in the kernel sources. */
+ if (RT_UNLIKELY(!current->active_mm))
+ return NULL;
+ u.Global = *pgd_offset(current->active_mm, ulAddr);
+ if (RT_UNLIKELY(pgd_none(u.Global)))
+ return NULL;
+#if RTLNX_VER_MIN(2,6,11)
+# if RTLNX_VER_MIN(4,12,0)
+ u.Four = *p4d_offset(&u.Global, ulAddr);
+ if (RT_UNLIKELY(p4d_none(u.Four)))
+ return NULL;
+ if (p4d_large(u.Four))
+ {
+ pPage = p4d_page(u.Four);
+ AssertReturn(pPage, NULL);
+ pfn = page_to_pfn(pPage); /* doing the safe way... */
+ AssertCompile(P4D_SHIFT - PAGE_SHIFT < 31);
+ pfn += (ulAddr >> PAGE_SHIFT) & ((UINT32_C(1) << (P4D_SHIFT - PAGE_SHIFT)) - 1);
+ return pfn_to_page(pfn);
+ }
+ u.Upper = *pud_offset(&u.Four, ulAddr);
+# else /* < 4.12 */
+ u.Upper = *pud_offset(&u.Global, ulAddr);
+# endif /* < 4.12 */
+ if (RT_UNLIKELY(pud_none(u.Upper)))
+ return NULL;
+# if RTLNX_VER_MIN(2,6,25)
+ if (pud_large(u.Upper))
+ {
+ pPage = pud_page(u.Upper);
+ AssertReturn(pPage, NULL);
+ pfn = page_to_pfn(pPage); /* doing the safe way... */
+ pfn += (ulAddr >> PAGE_SHIFT) & ((UINT32_C(1) << (PUD_SHIFT - PAGE_SHIFT)) - 1);
+ return pfn_to_page(pfn);
+ }
+# endif
+ u.Middle = *pmd_offset(&u.Upper, ulAddr);
+#else /* < 2.6.11 */
+ u.Middle = *pmd_offset(&u.Global, ulAddr);
+#endif /* < 2.6.11 */
+ if (RT_UNLIKELY(pmd_none(u.Middle)))
+ return NULL;
+#if RTLNX_VER_MIN(2,6,0)
+ if (pmd_large(u.Middle))
+ {
+ pPage = pmd_page(u.Middle);
+ AssertReturn(pPage, NULL);
+ pfn = page_to_pfn(pPage); /* doing the safe way... */
+ pfn += (ulAddr >> PAGE_SHIFT) & ((UINT32_C(1) << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+ return pfn_to_page(pfn);
+ }
+#endif
+
+#if RTLNX_VER_MIN(6,5,0) || RTLNX_RHEL_RANGE(9,4, 9,99)
+ pEntry = __pte_map(&u.Middle, ulAddr);
+#elif RTLNX_VER_MIN(2,5,5) || defined(pte_offset_map) /* As usual, RHEL 3 had pte_offset_map earlier. */
+ pEntry = pte_offset_map(&u.Middle, ulAddr);
+#else
+ pEntry = pte_offset(&u.Middle, ulAddr);
+#endif
+ if (RT_UNLIKELY(!pEntry))
+ return NULL;
+ u.Entry = *pEntry;
+#if RTLNX_VER_MIN(2,5,5) || defined(pte_offset_map)
+ pte_unmap(pEntry);
+#endif
+
+ if (RT_UNLIKELY(!pte_present(u.Entry)))
+ return NULL;
+ return pte_page(u.Entry);
+}
+RT_EXPORT_SYMBOL(rtR0MemObjLinuxVirtToPage);
+
+
+DECLHIDDEN(int) rtR0MemObjNativeAllocPhys(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, RTHCPHYS PhysHighest, size_t uAlignment,
+ const char *pszTag)
+{
+ return rtR0MemObjLinuxAllocPhysSub(ppMem, RTR0MEMOBJTYPE_PHYS, cb, uAlignment, PhysHighest, pszTag);
+}
+
+
+DECLHIDDEN(int) rtR0MemObjNativeAllocPhysNC(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, RTHCPHYS PhysHighest, const char *pszTag)
+{
+ return rtR0MemObjLinuxAllocPhysSub(ppMem, RTR0MEMOBJTYPE_PHYS_NC, cb, PAGE_SIZE, PhysHighest, pszTag);
+}
+
+
+DECLHIDDEN(int) rtR0MemObjNativeEnterPhys(PPRTR0MEMOBJINTERNAL ppMem, RTHCPHYS Phys, size_t cb, uint32_t uCachePolicy,
+ const char *pszTag)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * All we need to do here is to validate that we can use
+ * ioremap on the specified address (32/64-bit dma_addr_t).
+ */
+ PRTR0MEMOBJLNX pMemLnx;
+ dma_addr_t PhysAddr = Phys;
+ AssertMsgReturn(PhysAddr == Phys, ("%#llx\n", (unsigned long long)Phys), VERR_ADDRESS_TOO_BIG);
+
+ pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_PHYS, NULL, cb, pszTag);
+ if (!pMemLnx)
+ {
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_NO_MEMORY;
+ }
+
+ pMemLnx->Core.u.Phys.PhysBase = PhysAddr;
+ pMemLnx->Core.u.Phys.fAllocated = false;
+ pMemLnx->Core.u.Phys.uCachePolicy = uCachePolicy;
+ Assert(!pMemLnx->cPages);
+ *ppMem = &pMemLnx->Core;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+
+/* openSUSE Leap 42.3 detection :-/ */
+#if RTLNX_VER_RANGE(4,4,0, 4,6,0) && defined(FAULT_FLAG_REMOTE)
+# define GET_USER_PAGES_API KERNEL_VERSION(4, 10, 0) /* no typo! */
+#else
+# define GET_USER_PAGES_API LINUX_VERSION_CODE
+#endif
+
+DECLHIDDEN(int) rtR0MemObjNativeLockUser(PPRTR0MEMOBJINTERNAL ppMem, RTR3PTR R3Ptr, size_t cb, uint32_t fAccess,
+ RTR0PROCESS R0Process, const char *pszTag)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ const int cPages = cb >> PAGE_SHIFT;
+ struct task_struct *pTask = rtR0ProcessToLinuxTask(R0Process);
+# if GET_USER_PAGES_API < KERNEL_VERSION(6, 5, 0)
+ struct vm_area_struct **papVMAs;
+# endif
+ PRTR0MEMOBJLNX pMemLnx;
+ int rc = VERR_NO_MEMORY;
+ int const fWrite = fAccess & RTMEM_PROT_WRITE ? 1 : 0;
+
+ /*
+ * Check for valid task and size overflows.
+ */
+ if (!pTask)
+ return VERR_NOT_SUPPORTED;
+ if (((size_t)cPages << PAGE_SHIFT) != cb)
+ return VERR_OUT_OF_RANGE;
+
+ /*
+ * Allocate the memory object and a temporary buffer for the VMAs.
+ */
+ pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(RT_UOFFSETOF_DYN(RTR0MEMOBJLNX, apPages[cPages]), RTR0MEMOBJTYPE_LOCK,
+ (void *)R3Ptr, cb, pszTag);
+ if (!pMemLnx)
+ {
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_NO_MEMORY;
+ }
+
+# if GET_USER_PAGES_API < KERNEL_VERSION(6, 5, 0)
+ papVMAs = (struct vm_area_struct **)RTMemAlloc(sizeof(*papVMAs) * cPages);
+ if (papVMAs)
+ {
+# endif
+ LNX_MM_DOWN_READ(pTask->mm);
+
+ /*
+ * Get user pages.
+ */
+/** @todo r=bird: Should we not force read access too? */
+#if GET_USER_PAGES_API >= KERNEL_VERSION(4, 6, 0)
+ if (R0Process == RTR0ProcHandleSelf())
+ rc = get_user_pages(R3Ptr, /* Where from. */
+ cPages, /* How many pages. */
+# if GET_USER_PAGES_API >= KERNEL_VERSION(4, 9, 0)
+ fWrite ? FOLL_WRITE | /* Write to memory. */
+ FOLL_FORCE /* force write access. */
+ : 0, /* Write to memory. */
+# else
+ fWrite, /* Write to memory. */
+ fWrite, /* force write access. */
+# endif
+ &pMemLnx->apPages[0] /* Page array. */
+# if GET_USER_PAGES_API < KERNEL_VERSION(6, 5, 0)
+ , papVMAs /* vmas */
+# endif
+ );
+ /*
+ * Actually this should not happen at the moment as call this function
+ * only for our own process.
+ */
+ else
+ rc = get_user_pages_remote(
+# if GET_USER_PAGES_API < KERNEL_VERSION(5, 9, 0)
+ pTask, /* Task for fault accounting. */
+# endif
+ pTask->mm, /* Whose pages. */
+ R3Ptr, /* Where from. */
+ cPages, /* How many pages. */
+# if GET_USER_PAGES_API >= KERNEL_VERSION(4, 9, 0)
+ fWrite ? FOLL_WRITE | /* Write to memory. */
+ FOLL_FORCE /* force write access. */
+ : 0, /* Write to memory. */
+# else
+ fWrite, /* Write to memory. */
+ fWrite, /* force write access. */
+# endif
+ &pMemLnx->apPages[0] /* Page array. */
+# if GET_USER_PAGES_API < KERNEL_VERSION(6, 5, 0)
+ , papVMAs /* vmas */
+# endif
+# if GET_USER_PAGES_API >= KERNEL_VERSION(4, 10, 0)
+ , NULL /* locked */
+# endif
+ );
+#else /* GET_USER_PAGES_API < KERNEL_VERSION(4, 6, 0) */
+ rc = get_user_pages(pTask, /* Task for fault accounting. */
+ pTask->mm, /* Whose pages. */
+ R3Ptr, /* Where from. */
+ cPages, /* How many pages. */
+/* The get_user_pages API change was back-ported to 4.4.168. */
+# if RTLNX_VER_RANGE(4,4,168, 4,5,0)
+ fWrite ? FOLL_WRITE | /* Write to memory. */
+ FOLL_FORCE /* force write access. */
+ : 0, /* Write to memory. */
+# else
+ fWrite, /* Write to memory. */
+ fWrite, /* force write access. */
+# endif
+ &pMemLnx->apPages[0] /* Page array. */
+# if GET_USER_PAGES_API < KERNEL_VERSION(6, 5, 0)
+ , papVMAs /* vmas */
+# endif
+ );
+#endif /* GET_USER_PAGES_API < KERNEL_VERSION(4, 6, 0) */
+ if (rc == cPages)
+ {
+ /*
+ * Flush dcache (required?), protect against fork and _really_ pin the page
+ * table entries. get_user_pages() will protect against swapping out the
+ * pages but it will NOT protect against removing page table entries. This
+ * can be achieved with
+ * - using mlock / mmap(..., MAP_LOCKED, ...) from userland. This requires
+ * an appropriate limit set up with setrlimit(..., RLIMIT_MEMLOCK, ...).
+ * Usual Linux distributions support only a limited size of locked pages
+ * (e.g. 32KB).
+ * - setting the PageReserved bit (as we do in rtR0MemObjLinuxAllocPages()
+ * or by
+ * - setting the VM_LOCKED flag. This is the same as doing mlock() without
+ * a range check.
+ */
+ /** @todo The Linux fork() protection will require more work if this API
+ * is to be used for anything but locking VM pages. */
+ while (rc-- > 0)
+ {
+ flush_dcache_page(pMemLnx->apPages[rc]);
+# if GET_USER_PAGES_API < KERNEL_VERSION(6, 5, 0)
+# if RTLNX_VER_MIN(6,3,0)
+ vm_flags_set(papVMAs[rc], VM_DONTCOPY | VM_LOCKED);
+# else
+ papVMAs[rc]->vm_flags |= VM_DONTCOPY | VM_LOCKED;
+# endif
+# endif
+ }
+
+ LNX_MM_UP_READ(pTask->mm);
+
+# if GET_USER_PAGES_API < KERNEL_VERSION(6, 5, 0)
+ RTMemFree(papVMAs);
+# endif
+
+ pMemLnx->Core.u.Lock.R0Process = R0Process;
+ pMemLnx->cPages = cPages;
+ Assert(!pMemLnx->fMappedToRing0);
+ *ppMem = &pMemLnx->Core;
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+
+ /*
+ * Failed - we need to unlock any pages that we succeeded to lock.
+ */
+ while (rc-- > 0)
+ {
+ if (!PageReserved(pMemLnx->apPages[rc]))
+ SetPageDirty(pMemLnx->apPages[rc]);
+#if RTLNX_VER_MIN(4,6,0)
+ put_page(pMemLnx->apPages[rc]);
+#else
+ page_cache_release(pMemLnx->apPages[rc]);
+#endif
+ }
+
+ LNX_MM_UP_READ(pTask->mm);
+
+ rc = VERR_LOCK_FAILED;
+
+# if GET_USER_PAGES_API < KERNEL_VERSION(6, 5, 0)
+ RTMemFree(papVMAs);
+ }
+# endif
+
+ rtR0MemObjDelete(&pMemLnx->Core);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+}
+
+
+DECLHIDDEN(int) rtR0MemObjNativeLockKernel(PPRTR0MEMOBJINTERNAL ppMem, void *pv, size_t cb, uint32_t fAccess, const char *pszTag)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ void *pvLast = (uint8_t *)pv + cb - 1;
+ size_t const cPages = cb >> PAGE_SHIFT;
+ PRTR0MEMOBJLNX pMemLnx;
+ bool fLinearMapping;
+ int rc;
+ uint8_t *pbPage;
+ size_t iPage;
+ NOREF(fAccess);
+
+ if ( !RTR0MemKernelIsValidAddr(pv)
+ || !RTR0MemKernelIsValidAddr(pv + cb))
+ return VERR_INVALID_PARAMETER;
+
+ /*
+ * The lower part of the kernel memory has a linear mapping between
+ * physical and virtual addresses. So we take a short cut here. This is
+ * assumed to be the cleanest way to handle those addresses (and the code
+ * is well tested, though the test for determining it is not very nice).
+ * If we ever decide it isn't we can still remove it.
+ */
+#if 0
+ fLinearMapping = (unsigned long)pvLast < VMALLOC_START;
+#else
+ fLinearMapping = (unsigned long)pv >= (unsigned long)__va(0)
+ && (unsigned long)pvLast < (unsigned long)high_memory;
+#endif
+
+ /*
+ * Allocate the memory object.
+ */
+ pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(RT_UOFFSETOF_DYN(RTR0MEMOBJLNX, apPages[cPages]), RTR0MEMOBJTYPE_LOCK,
+ pv, cb, pszTag);
+ if (!pMemLnx)
+ {
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_NO_MEMORY;
+ }
+
+ /*
+ * Gather the pages.
+ * We ASSUME all kernel pages are non-swappable and non-movable.
+ */
+ rc = VINF_SUCCESS;
+ pbPage = (uint8_t *)pvLast;
+ iPage = cPages;
+ if (!fLinearMapping)
+ {
+ while (iPage-- > 0)
+ {
+ struct page *pPage = rtR0MemObjLinuxVirtToPage(pbPage);
+ if (RT_UNLIKELY(!pPage))
+ {
+ rc = VERR_LOCK_FAILED;
+ break;
+ }
+ pMemLnx->apPages[iPage] = pPage;
+ pbPage -= PAGE_SIZE;
+ }
+ }
+ else
+ {
+ while (iPage-- > 0)
+ {
+ pMemLnx->apPages[iPage] = virt_to_page(pbPage);
+ pbPage -= PAGE_SIZE;
+ }
+ }
+ if (RT_SUCCESS(rc))
+ {
+ /*
+ * Complete the memory object and return.
+ */
+ pMemLnx->Core.u.Lock.R0Process = NIL_RTR0PROCESS;
+ pMemLnx->cPages = cPages;
+ Assert(!pMemLnx->fMappedToRing0);
+ *ppMem = &pMemLnx->Core;
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+
+ rtR0MemObjDelete(&pMemLnx->Core);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+}
+
+
+DECLHIDDEN(int) rtR0MemObjNativeReserveKernel(PPRTR0MEMOBJINTERNAL ppMem, void *pvFixed, size_t cb, size_t uAlignment,
+ const char *pszTag)
+{
+#if RTLNX_VER_MIN(2,4,22)
+ IPRT_LINUX_SAVE_EFL_AC();
+ const size_t cPages = cb >> PAGE_SHIFT;
+ struct page *pDummyPage;
+ struct page **papPages;
+
+ /* check for unsupported stuff. */
+ AssertMsgReturn(pvFixed == (void *)-1, ("%p\n", pvFixed), VERR_NOT_SUPPORTED);
+ if (uAlignment > PAGE_SIZE)
+ return VERR_NOT_SUPPORTED;
+
+ /*
+ * Allocate a dummy page and create a page pointer array for vmap such that
+ * the dummy page is mapped all over the reserved area.
+ */
+ pDummyPage = alloc_page(GFP_HIGHUSER | __GFP_NOWARN);
+ if (pDummyPage)
+ {
+ papPages = RTMemAlloc(sizeof(*papPages) * cPages);
+ if (papPages)
+ {
+ void *pv;
+ size_t iPage = cPages;
+ while (iPage-- > 0)
+ papPages[iPage] = pDummyPage;
+# ifdef VM_MAP
+ pv = vmap(papPages, cPages, VM_MAP, PAGE_KERNEL_RO);
+# else
+ pv = vmap(papPages, cPages, VM_ALLOC, PAGE_KERNEL_RO);
+# endif
+ RTMemFree(papPages);
+ if (pv)
+ {
+ PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_RES_VIRT, pv, cb, pszTag);
+ if (pMemLnx)
+ {
+ pMemLnx->Core.u.ResVirt.R0Process = NIL_RTR0PROCESS;
+ pMemLnx->cPages = 1;
+ pMemLnx->apPages[0] = pDummyPage;
+ *ppMem = &pMemLnx->Core;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+ vunmap(pv);
+ }
+ }
+ __free_page(pDummyPage);
+ }
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_NO_MEMORY;
+
+#else /* < 2.4.22 */
+ /*
+ * Could probably use ioremap here, but the caller is in a better position than us
+ * to select some safe physical memory.
+ */
+ return VERR_NOT_SUPPORTED;
+#endif
+}
+
+
+DECLHIDDEN(int) rtR0MemObjNativeReserveUser(PPRTR0MEMOBJINTERNAL ppMem, RTR3PTR R3PtrFixed, size_t cb, size_t uAlignment,
+ RTR0PROCESS R0Process, const char *pszTag)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ PRTR0MEMOBJLNX pMemLnx;
+ void *pv;
+ struct task_struct *pTask = rtR0ProcessToLinuxTask(R0Process);
+ if (!pTask)
+ return VERR_NOT_SUPPORTED;
+
+ /*
+ * Check that the specified alignment is supported.
+ */
+ if (uAlignment > PAGE_SIZE)
+ return VERR_NOT_SUPPORTED;
+
+ /*
+ * Let rtR0MemObjLinuxDoMmap do the difficult bits.
+ */
+ pv = rtR0MemObjLinuxDoMmap(R3PtrFixed, cb, uAlignment, pTask, RTMEM_PROT_NONE);
+ if (pv == (void *)-1)
+ {
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_NO_MEMORY;
+ }
+
+ pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_RES_VIRT, pv, cb, pszTag);
+ if (!pMemLnx)
+ {
+ rtR0MemObjLinuxDoMunmap(pv, cb, pTask);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_NO_MEMORY;
+ }
+
+ pMemLnx->Core.u.ResVirt.R0Process = R0Process;
+ *ppMem = &pMemLnx->Core;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+
+
+DECLHIDDEN(int) rtR0MemObjNativeMapKernel(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJ pMemToMap, void *pvFixed, size_t uAlignment,
+ unsigned fProt, size_t offSub, size_t cbSub, const char *pszTag)
+{
+ int rc = VERR_NO_MEMORY;
+ PRTR0MEMOBJLNX pMemLnxToMap = (PRTR0MEMOBJLNX)pMemToMap;
+ PRTR0MEMOBJLNX pMemLnx;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /* Fail if requested to do something we can't. */
+ AssertMsgReturn(pvFixed == (void *)-1, ("%p\n", pvFixed), VERR_NOT_SUPPORTED);
+ if (uAlignment > PAGE_SIZE)
+ return VERR_NOT_SUPPORTED;
+
+ /*
+ * Create the IPRT memory object.
+ */
+ if (!cbSub)
+ cbSub = pMemLnxToMap->Core.cb - offSub;
+ pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_MAPPING, NULL, cbSub, pszTag);
+ if (pMemLnx)
+ {
+ if (pMemLnxToMap->cPages)
+ {
+#if RTLNX_VER_MIN(2,4,22)
+ /*
+ * Use vmap - 2.4.22 and later.
+ */
+ pgprot_t fPg = rtR0MemObjLinuxConvertProt(fProt, true /* kernel */);
+ /** @todo We don't really care too much for EXEC here... 5.8 always adds NX. */
+ Assert(((offSub + cbSub) >> PAGE_SHIFT) <= pMemLnxToMap->cPages);
+# ifdef VM_MAP
+ pMemLnx->Core.pv = vmap(&pMemLnxToMap->apPages[offSub >> PAGE_SHIFT], cbSub >> PAGE_SHIFT, VM_MAP, fPg);
+# else
+ pMemLnx->Core.pv = vmap(&pMemLnxToMap->apPages[offSub >> PAGE_SHIFT], cbSub >> PAGE_SHIFT, VM_ALLOC, fPg);
+# endif
+ if (pMemLnx->Core.pv)
+ {
+ pMemLnx->fMappedToRing0 = true;
+ rc = VINF_SUCCESS;
+ }
+ else
+ rc = VERR_MAP_FAILED;
+
+#else /* < 2.4.22 */
+ /*
+ * Only option here is to share mappings if possible and forget about fProt.
+ */
+ if (rtR0MemObjIsRing3(pMemToMap))
+ rc = VERR_NOT_SUPPORTED;
+ else
+ {
+ rc = VINF_SUCCESS;
+ if (!pMemLnxToMap->Core.pv)
+ rc = rtR0MemObjLinuxVMap(pMemLnxToMap, !!(fProt & RTMEM_PROT_EXEC));
+ if (RT_SUCCESS(rc))
+ {
+ Assert(pMemLnxToMap->Core.pv);
+ pMemLnx->Core.pv = (uint8_t *)pMemLnxToMap->Core.pv + offSub;
+ }
+ }
+#endif
+ }
+ else
+ {
+ /*
+ * MMIO / physical memory.
+ */
+ Assert(pMemLnxToMap->Core.enmType == RTR0MEMOBJTYPE_PHYS && !pMemLnxToMap->Core.u.Phys.fAllocated);
+#if RTLNX_VER_MIN(2,6,25)
+ /*
+ * ioremap() defaults to no caching since the 2.6 kernels.
+ * ioremap_nocache() has been removed finally in 5.6-rc1.
+ */
+ pMemLnx->Core.pv = pMemLnxToMap->Core.u.Phys.uCachePolicy == RTMEM_CACHE_POLICY_MMIO
+ ? ioremap(pMemLnxToMap->Core.u.Phys.PhysBase + offSub, cbSub)
+ : ioremap_cache(pMemLnxToMap->Core.u.Phys.PhysBase + offSub, cbSub);
+#else /* KERNEL_VERSION < 2.6.25 */
+ pMemLnx->Core.pv = pMemLnxToMap->Core.u.Phys.uCachePolicy == RTMEM_CACHE_POLICY_MMIO
+ ? ioremap_nocache(pMemLnxToMap->Core.u.Phys.PhysBase + offSub, cbSub)
+ : ioremap(pMemLnxToMap->Core.u.Phys.PhysBase + offSub, cbSub);
+#endif /* KERNEL_VERSION < 2.6.25 */
+ if (pMemLnx->Core.pv)
+ {
+ /** @todo fix protection. */
+ rc = VINF_SUCCESS;
+ }
+ }
+ if (RT_SUCCESS(rc))
+ {
+ pMemLnx->Core.u.Mapping.R0Process = NIL_RTR0PROCESS;
+ *ppMem = &pMemLnx->Core;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+ rtR0MemObjDelete(&pMemLnx->Core);
+ }
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+}
+
+
+#ifdef VBOX_USE_PAE_HACK
+/**
+ * Replace the PFN of a PTE with the address of the actual page.
+ *
+ * The caller maps a reserved dummy page at the address with the desired access
+ * and flags.
+ *
+ * This hack is required for older Linux kernels which don't provide
+ * remap_pfn_range().
+ *
+ * @returns 0 on success, -ENOMEM on failure.
+ * @param mm The memory context.
+ * @param ulAddr The mapping address.
+ * @param Phys The physical address of the page to map.
+ */
+static int rtR0MemObjLinuxFixPte(struct mm_struct *mm, unsigned long ulAddr, RTHCPHYS Phys)
+{
+ int rc = -ENOMEM;
+ pgd_t *pgd;
+
+ spin_lock(&mm->page_table_lock);
+
+ pgd = pgd_offset(mm, ulAddr);
+ if (!pgd_none(*pgd) && !pgd_bad(*pgd))
+ {
+ pmd_t *pmd = pmd_offset(pgd, ulAddr);
+ if (!pmd_none(*pmd))
+ {
+ pte_t *ptep = pte_offset_map(pmd, ulAddr);
+ if (ptep)
+ {
+ pte_t pte = *ptep;
+ pte.pte_high &= 0xfff00000;
+ pte.pte_high |= ((Phys >> 32) & 0x000fffff);
+ pte.pte_low &= 0x00000fff;
+ pte.pte_low |= (Phys & 0xfffff000);
+ set_pte(ptep, pte);
+ pte_unmap(ptep);
+ rc = 0;
+ }
+ }
+ }
+
+ spin_unlock(&mm->page_table_lock);
+ return rc;
+}
+#endif /* VBOX_USE_PAE_HACK */
+
+
+DECLHIDDEN(int) rtR0MemObjNativeMapUser(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJ pMemToMap, RTR3PTR R3PtrFixed, size_t uAlignment,
+ unsigned fProt, RTR0PROCESS R0Process, size_t offSub, size_t cbSub, const char *pszTag)
+{
+ struct task_struct *pTask = rtR0ProcessToLinuxTask(R0Process);
+ PRTR0MEMOBJLNX pMemLnxToMap = (PRTR0MEMOBJLNX)pMemToMap;
+ int rc = VERR_NO_MEMORY;
+ PRTR0MEMOBJLNX pMemLnx;
+#ifdef VBOX_USE_PAE_HACK
+ struct page *pDummyPage;
+ RTHCPHYS DummyPhys;
+#endif
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Check for restrictions.
+ */
+ if (!pTask)
+ return VERR_NOT_SUPPORTED;
+ if (uAlignment > PAGE_SIZE)
+ return VERR_NOT_SUPPORTED;
+
+#ifdef VBOX_USE_PAE_HACK
+ /*
+ * Allocate a dummy page for use when mapping the memory.
+ */
+ pDummyPage = alloc_page(GFP_USER | __GFP_NOWARN);
+ if (!pDummyPage)
+ {
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_NO_MEMORY;
+ }
+ SetPageReserved(pDummyPage);
+ DummyPhys = page_to_phys(pDummyPage);
+#endif
+
+ /*
+ * Create the IPRT memory object.
+ */
+ Assert(!offSub || cbSub);
+ if (cbSub == 0)
+ cbSub = pMemLnxToMap->Core.cb;
+ pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_MAPPING, NULL, cbSub, pszTag);
+ if (pMemLnx)
+ {
+ /*
+ * Allocate user space mapping.
+ */
+ void *pv;
+ pv = rtR0MemObjLinuxDoMmap(R3PtrFixed, cbSub, uAlignment, pTask, fProt);
+ if (pv != (void *)-1)
+ {
+ /*
+ * Map page by page into the mmap area.
+ * This is generic, paranoid and not very efficient.
+ */
+ pgprot_t fPg = rtR0MemObjLinuxConvertProt(fProt, false /* user */);
+ unsigned long ulAddrCur = (unsigned long)pv;
+ const size_t cPages = (offSub + cbSub) >> PAGE_SHIFT;
+ size_t iPage;
+
+ LNX_MM_DOWN_WRITE(pTask->mm);
+
+ rc = VINF_SUCCESS;
+ if (pMemLnxToMap->cPages)
+ {
+ for (iPage = offSub >> PAGE_SHIFT; iPage < cPages; iPage++, ulAddrCur += PAGE_SIZE)
+ {
+#if RTLNX_VER_MAX(2,6,11)
+ RTHCPHYS Phys = page_to_phys(pMemLnxToMap->apPages[iPage]);
+#endif
+#if RTLNX_VER_MIN(2,6,0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE)
+ struct vm_area_struct *vma = find_vma(pTask->mm, ulAddrCur); /* this is probably the same for all the pages... */
+ AssertBreakStmt(vma, rc = VERR_INTERNAL_ERROR);
+#endif
+#if RTLNX_VER_MAX(2,6,0) && defined(RT_ARCH_X86)
+ /* remap_page_range() limitation on x86 */
+ AssertBreakStmt(Phys < _4G, rc = VERR_NO_MEMORY);
+#endif
+
+#if defined(VBOX_USE_INSERT_PAGE) && RTLNX_VER_MIN(2,6,22)
+ rc = vm_insert_page(vma, ulAddrCur, pMemLnxToMap->apPages[iPage]);
+ /* Thes flags help making 100% sure some bad stuff wont happen (swap, core, ++).
+ * See remap_pfn_range() in mm/memory.c */
+
+#if RTLNX_VER_MIN(6,3,0)
+ vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
+#elif RTLNX_VER_MIN(3,7,0)
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+#else
+ vma->vm_flags |= VM_RESERVED;
+#endif
+#elif RTLNX_VER_MIN(2,6,11)
+ rc = remap_pfn_range(vma, ulAddrCur, page_to_pfn(pMemLnxToMap->apPages[iPage]), PAGE_SIZE, fPg);
+#elif defined(VBOX_USE_PAE_HACK)
+ rc = remap_page_range(vma, ulAddrCur, DummyPhys, PAGE_SIZE, fPg);
+ if (!rc)
+ rc = rtR0MemObjLinuxFixPte(pTask->mm, ulAddrCur, Phys);
+#elif RTLNX_VER_MIN(2,6,0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE)
+ rc = remap_page_range(vma, ulAddrCur, Phys, PAGE_SIZE, fPg);
+#else /* 2.4 */
+ rc = remap_page_range(ulAddrCur, Phys, PAGE_SIZE, fPg);
+#endif
+ if (rc)
+ {
+ rc = VERR_NO_MEMORY;
+ break;
+ }
+ }
+ }
+ else
+ {
+ RTHCPHYS Phys;
+ if (pMemLnxToMap->Core.enmType == RTR0MEMOBJTYPE_PHYS)
+ Phys = pMemLnxToMap->Core.u.Phys.PhysBase;
+ else if (pMemLnxToMap->Core.enmType == RTR0MEMOBJTYPE_CONT)
+ Phys = pMemLnxToMap->Core.u.Cont.Phys;
+ else
+ {
+ AssertMsgFailed(("%d\n", pMemLnxToMap->Core.enmType));
+ Phys = NIL_RTHCPHYS;
+ }
+ if (Phys != NIL_RTHCPHYS)
+ {
+ for (iPage = offSub >> PAGE_SHIFT; iPage < cPages; iPage++, ulAddrCur += PAGE_SIZE, Phys += PAGE_SIZE)
+ {
+#if RTLNX_VER_MIN(2,6,0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE)
+ struct vm_area_struct *vma = find_vma(pTask->mm, ulAddrCur); /* this is probably the same for all the pages... */
+ AssertBreakStmt(vma, rc = VERR_INTERNAL_ERROR);
+#endif
+#if RTLNX_VER_MAX(2,6,0) && defined(RT_ARCH_X86)
+ /* remap_page_range() limitation on x86 */
+ AssertBreakStmt(Phys < _4G, rc = VERR_NO_MEMORY);
+#endif
+
+#if RTLNX_VER_MIN(2,6,11)
+ rc = remap_pfn_range(vma, ulAddrCur, Phys, PAGE_SIZE, fPg);
+#elif defined(VBOX_USE_PAE_HACK)
+ rc = remap_page_range(vma, ulAddrCur, DummyPhys, PAGE_SIZE, fPg);
+ if (!rc)
+ rc = rtR0MemObjLinuxFixPte(pTask->mm, ulAddrCur, Phys);
+#elif RTLNX_VER_MIN(2,6,0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE)
+ rc = remap_page_range(vma, ulAddrCur, Phys, PAGE_SIZE, fPg);
+#else /* 2.4 */
+ rc = remap_page_range(ulAddrCur, Phys, PAGE_SIZE, fPg);
+#endif
+ if (rc)
+ {
+ rc = VERR_NO_MEMORY;
+ break;
+ }
+ }
+ }
+ }
+
+#ifdef CONFIG_NUMA_BALANCING
+# if RTLNX_VER_MAX(3,13,0) && RTLNX_RHEL_MAX(7,0)
+# define VBOX_NUMA_HACK_OLD
+# endif
+ if (RT_SUCCESS(rc))
+ {
+ /** @todo Ugly hack! But right now we have no other means to
+ * disable automatic NUMA page balancing. */
+# ifdef RT_OS_X86
+# ifdef VBOX_NUMA_HACK_OLD
+ pTask->mm->numa_next_reset = jiffies + 0x7fffffffUL;
+# endif
+ pTask->mm->numa_next_scan = jiffies + 0x7fffffffUL;
+# else
+# ifdef VBOX_NUMA_HACK_OLD
+ pTask->mm->numa_next_reset = jiffies + 0x7fffffffffffffffUL;
+# endif
+ pTask->mm->numa_next_scan = jiffies + 0x7fffffffffffffffUL;
+# endif
+ }
+#endif /* CONFIG_NUMA_BALANCING */
+
+ LNX_MM_UP_WRITE(pTask->mm);
+
+ if (RT_SUCCESS(rc))
+ {
+#ifdef VBOX_USE_PAE_HACK
+ __free_page(pDummyPage);
+#endif
+ pMemLnx->Core.pv = pv;
+ pMemLnx->Core.u.Mapping.R0Process = R0Process;
+ *ppMem = &pMemLnx->Core;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+
+ /*
+ * Bail out.
+ */
+ rtR0MemObjLinuxDoMunmap(pv, cbSub, pTask);
+ }
+ rtR0MemObjDelete(&pMemLnx->Core);
+ }
+#ifdef VBOX_USE_PAE_HACK
+ __free_page(pDummyPage);
+#endif
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+}
+
+
+DECLHIDDEN(int) rtR0MemObjNativeProtect(PRTR0MEMOBJINTERNAL pMem, size_t offSub, size_t cbSub, uint32_t fProt)
+{
+# ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
+ /*
+ * Currently only supported when we've got addresses PTEs from the kernel.
+ */
+ PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)pMem;
+ if (pMemLnx->pArea && pMemLnx->papPtesForArea)
+ {
+ pgprot_t const fPg = rtR0MemObjLinuxConvertProt(fProt, true /*fKernel*/);
+ size_t const cPages = (offSub + cbSub) >> PAGE_SHIFT;
+ pte_t **papPtes = pMemLnx->papPtesForArea;
+ size_t i;
+
+ for (i = offSub >> PAGE_SHIFT; i < cPages; i++)
+ {
+ set_pte(papPtes[i], mk_pte(pMemLnx->apPages[i], fPg));
+ }
+ preempt_disable();
+ __flush_tlb_all();
+ preempt_enable();
+ return VINF_SUCCESS;
+ }
+# elif defined(IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC)
+ PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)pMem;
+ if ( pMemLnx->fExecutable
+ && pMemLnx->fMappedToRing0)
+ {
+ LNXAPPLYPGRANGE Args;
+ Args.pMemLnx = pMemLnx;
+ Args.fPg = rtR0MemObjLinuxConvertProt(fProt, true /*fKernel*/);
+ int rcLnx = apply_to_page_range(current->active_mm, (unsigned long)pMemLnx->Core.pv + offSub, cbSub,
+ rtR0MemObjLinuxApplyPageRange, (void *)&Args);
+ if (rcLnx)
+ return VERR_NOT_SUPPORTED;
+
+ return VINF_SUCCESS;
+ }
+# endif
+
+ NOREF(pMem);
+ NOREF(offSub);
+ NOREF(cbSub);
+ NOREF(fProt);
+ return VERR_NOT_SUPPORTED;
+}
+
+
+DECLHIDDEN(RTHCPHYS) rtR0MemObjNativeGetPagePhysAddr(PRTR0MEMOBJINTERNAL pMem, size_t iPage)
+{
+ PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)pMem;
+
+ if (pMemLnx->cPages)
+ return page_to_phys(pMemLnx->apPages[iPage]);
+
+ switch (pMemLnx->Core.enmType)
+ {
+ case RTR0MEMOBJTYPE_CONT:
+ return pMemLnx->Core.u.Cont.Phys + (iPage << PAGE_SHIFT);
+
+ case RTR0MEMOBJTYPE_PHYS:
+ return pMemLnx->Core.u.Phys.PhysBase + (iPage << PAGE_SHIFT);
+
+ /* the parent knows */
+ case RTR0MEMOBJTYPE_MAPPING:
+ return rtR0MemObjNativeGetPagePhysAddr(pMemLnx->Core.uRel.Child.pParent, iPage);
+
+ /* cPages > 0 */
+ case RTR0MEMOBJTYPE_LOW:
+ case RTR0MEMOBJTYPE_LOCK:
+ case RTR0MEMOBJTYPE_PHYS_NC:
+ case RTR0MEMOBJTYPE_PAGE:
+ case RTR0MEMOBJTYPE_LARGE_PAGE:
+ default:
+ AssertMsgFailed(("%d\n", pMemLnx->Core.enmType));
+ RT_FALL_THROUGH();
+
+ case RTR0MEMOBJTYPE_RES_VIRT:
+ return NIL_RTHCPHYS;
+ }
+}
+
diff --git a/src/VBox/Runtime/r0drv/linux/memuserkernel-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/memuserkernel-r0drv-linux.c
new file mode 100644
index 00000000..e526132d
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/memuserkernel-r0drv-linux.c
@@ -0,0 +1,191 @@
+/* $Id: memuserkernel-r0drv-linux.c $ */
+/** @file
+ * IPRT - User & Kernel Memory, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2009-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+
+#include <iprt/mem.h>
+#include <iprt/errcore.h>
+
+
+RTR0DECL(int) RTR0MemUserCopyFrom(void *pvDst, RTR3PTR R3PtrSrc, size_t cb)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ if (RT_LIKELY(copy_from_user(pvDst, (void *)R3PtrSrc, cb) == 0))
+ {
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_ACCESS_DENIED;
+}
+RT_EXPORT_SYMBOL(RTR0MemUserCopyFrom);
+
+
+RTR0DECL(int) RTR0MemUserCopyTo(RTR3PTR R3PtrDst, void const *pvSrc, size_t cb)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ if (RT_LIKELY(copy_to_user((void *)R3PtrDst, pvSrc, cb) == 0))
+ {
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_ACCESS_DENIED;
+}
+RT_EXPORT_SYMBOL(RTR0MemUserCopyTo);
+
+
+RTR0DECL(bool) RTR0MemUserIsValidAddr(RTR3PTR R3Ptr)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+#if RTLNX_VER_MIN(5,0,0) || RTLNX_RHEL_MIN(8,1)
+ bool fRc = access_ok((void *)R3Ptr, 1);
+#else
+ bool fRc = access_ok(VERIFY_READ, (void *)R3Ptr, 1);
+#endif
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return fRc;
+}
+RT_EXPORT_SYMBOL(RTR0MemUserIsValidAddr);
+
+
+RTR0DECL(bool) RTR0MemKernelIsValidAddr(void *pv)
+{
+ /* Couldn't find a straight forward way of doing this... */
+#if defined(RT_ARCH_X86) && defined(CONFIG_X86_HIGH_ENTRY)
+ return true; /* ?? */
+#elif defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
+ return (uintptr_t)pv >= PAGE_OFFSET;
+#else
+# error "PORT ME"
+#if RTLNX_VER_MIN(5,0,0) || RTLNX_RHEL_MIN(8,1)
+ return !access_ok(pv, 1);
+#else
+ return !access_ok(VERIFY_READ, pv, 1);
+#endif
+#endif
+}
+RT_EXPORT_SYMBOL(RTR0MemKernelIsValidAddr);
+
+
+RTR0DECL(bool) RTR0MemAreKrnlAndUsrDifferent(void)
+{
+#if defined(RT_ARCH_X86) && defined(CONFIG_X86_HIGH_ENTRY) /* ?? */
+ return false;
+#else
+ return true;
+#endif
+}
+RT_EXPORT_SYMBOL(RTR0MemAreKrnlAndUsrDifferent);
+
+
+/**
+ * Treats both source and destination as unsafe buffers.
+ */
+static int rtR0MemKernelCopyLnxWorker(void *pvDst, void const *pvSrc, size_t cb)
+{
+#if RTLNX_VER_MIN(2,5,55)
+/* _ASM_EXTABLE was introduced in 2.6.25 from what I can tell. Using #ifndef
+ here since it has to be a macro and you never know what someone might have
+ backported to an earlier kernel release. */
+# ifndef _ASM_EXTABLE
+# if ARCH_BITS == 32
+# define _ASM_EXTABLE(a_Instr, a_Resume) \
+ ".section __ex_table,\"a\"\n" \
+ ".balign 4\n" \
+ ".long " #a_Instr "\n" \
+ ".long " #a_Resume "\n" \
+ ".previous\n"
+# else
+# define _ASM_EXTABLE(a_Instr, a_Resume) \
+ ".section __ex_table,\"a\"\n" \
+ ".balign 8\n" \
+ ".quad " #a_Instr "\n" \
+ ".quad " #a_Resume "\n" \
+ ".previous\n"
+# endif
+# endif /* !_ASM_EXTABLE */
+ int rc;
+ IPRT_LINUX_SAVE_EFL_AC(); /* paranoia */
+ if (!cb)
+ return VINF_SUCCESS;
+
+ __asm__ __volatile__ ("cld\n"
+ "1:\n\t"
+ "rep; movsb\n"
+ "2:\n\t"
+ ".section .fixup,\"ax\"\n"
+ "3:\n\t"
+ "movl %4, %0\n\t"
+ "jmp 2b\n\t"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : "=r" (rc),
+ "=D" (pvDst),
+ "=S" (pvSrc),
+ "=c" (cb)
+ : "i" (VERR_ACCESS_DENIED),
+ "0" (VINF_SUCCESS),
+ "1" (pvDst),
+ "2" (pvSrc),
+ "3" (cb)
+ : "memory");
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+#else
+ return VERR_NOT_SUPPORTED;
+#endif
+}
+
+
+RTR0DECL(int) RTR0MemKernelCopyFrom(void *pvDst, void const *pvSrc, size_t cb)
+{
+ return rtR0MemKernelCopyLnxWorker(pvDst, pvSrc, cb);
+}
+RT_EXPORT_SYMBOL(RTR0MemKernelCopyFrom);
+
+
+RTR0DECL(int) RTR0MemKernelCopyTo(void *pvDst, void const *pvSrc, size_t cb)
+{
+ return rtR0MemKernelCopyLnxWorker(pvDst, pvSrc, cb);
+}
+RT_EXPORT_SYMBOL(RTR0MemKernelCopyTo);
+
diff --git a/src/VBox/Runtime/r0drv/linux/mp-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/mp-r0drv-linux.c
new file mode 100644
index 00000000..5ac05db5
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/mp-r0drv-linux.c
@@ -0,0 +1,640 @@
+/* $Id: mp-r0drv-linux.c $ */
+/** @file
+ * IPRT - Multiprocessor, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2008-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+
+#include <iprt/mp.h>
+#include <iprt/cpuset.h>
+#include <iprt/err.h>
+#include <iprt/asm.h>
+#include <iprt/thread.h>
+#include "r0drv/mp-r0drv.h"
+
+
+/*********************************************************************************************************************************
+* Defined Constants And Macros *
+*********************************************************************************************************************************/
+#if defined(nr_cpumask_bits) || RTLNX_VER_MIN(2,6,28)
+# define VBOX_NR_CPUMASK_BITS (nr_cpumask_bits) /* same as nr_cpu_ids */
+#else
+# define VBOX_NR_CPUMASK_BITS (NR_CPUS)
+#endif
+
+
+RTDECL(RTCPUID) RTMpCpuId(void)
+{
+ return smp_processor_id();
+}
+RT_EXPORT_SYMBOL(RTMpCpuId);
+
+
+RTDECL(int) RTMpCurSetIndex(void)
+{
+ return smp_processor_id();
+}
+RT_EXPORT_SYMBOL(RTMpCurSetIndex);
+
+
+RTDECL(int) RTMpCurSetIndexAndId(PRTCPUID pidCpu)
+{
+ return *pidCpu = smp_processor_id();
+}
+RT_EXPORT_SYMBOL(RTMpCurSetIndexAndId);
+
+
+RTDECL(int) RTMpCpuIdToSetIndex(RTCPUID idCpu)
+{
+ return idCpu < RTCPUSET_MAX_CPUS && idCpu < VBOX_NR_CPUMASK_BITS ? (int)idCpu : -1;
+}
+RT_EXPORT_SYMBOL(RTMpCpuIdToSetIndex);
+
+
+RTDECL(RTCPUID) RTMpCpuIdFromSetIndex(int iCpu)
+{
+ return (unsigned)iCpu < VBOX_NR_CPUMASK_BITS ? (RTCPUID)iCpu : NIL_RTCPUID;
+}
+RT_EXPORT_SYMBOL(RTMpCpuIdFromSetIndex);
+
+
+RTDECL(RTCPUID) RTMpGetMaxCpuId(void)
+{
+ return VBOX_NR_CPUMASK_BITS - 1;
+}
+RT_EXPORT_SYMBOL(RTMpGetMaxCpuId);
+
+
+RTDECL(bool) RTMpIsCpuPossible(RTCPUID idCpu)
+{
+#if defined(CONFIG_SMP)
+# if RTLNX_VER_MIN(2,6,2) || defined(cpu_possible)
+ return idCpu < VBOX_NR_CPUMASK_BITS && cpu_possible(idCpu);
+# else /* < 2.5.29 */
+ return idCpu < (RTCPUID)(smp_num_cpus);
+# endif
+#else
+ return idCpu == RTMpCpuId();
+#endif
+}
+RT_EXPORT_SYMBOL(RTMpIsCpuPossible);
+
+
+RTDECL(PRTCPUSET) RTMpGetSet(PRTCPUSET pSet)
+{
+ RTCPUID idCpu;
+
+ RTCpuSetEmpty(pSet);
+ idCpu = RTMpGetMaxCpuId();
+ do
+ {
+ if (RTMpIsCpuPossible(idCpu))
+ RTCpuSetAdd(pSet, idCpu);
+ } while (idCpu-- > 0);
+ return pSet;
+}
+RT_EXPORT_SYMBOL(RTMpGetSet);
+
+
+RTDECL(RTCPUID) RTMpGetCount(void)
+{
+#ifdef CONFIG_SMP
+# if RTLNX_VER_MIN(2,6,4) || defined(num_possible_cpus)
+ return num_possible_cpus();
+# elif RTLNX_VER_MAX(2,5,0)
+ return smp_num_cpus;
+# else
+ RTCPUSET Set;
+ RTMpGetSet(&Set);
+ return RTCpuSetCount(&Set);
+# endif
+#else
+ return 1;
+#endif
+}
+RT_EXPORT_SYMBOL(RTMpGetCount);
+
+
+RTDECL(bool) RTMpIsCpuOnline(RTCPUID idCpu)
+{
+#ifdef CONFIG_SMP
+# if RTLNX_VER_MIN(2,6,0) || defined(cpu_online)
+ return idCpu < VBOX_NR_CPUMASK_BITS && cpu_online(idCpu);
+# else /* 2.4: */
+ return idCpu < VBOX_NR_CPUMASK_BITS && cpu_online_map & RT_BIT_64(idCpu);
+# endif
+#else
+ return idCpu == RTMpCpuId();
+#endif
+}
+RT_EXPORT_SYMBOL(RTMpIsCpuOnline);
+
+
+RTDECL(PRTCPUSET) RTMpGetOnlineSet(PRTCPUSET pSet)
+{
+#ifdef CONFIG_SMP
+ RTCPUID idCpu;
+
+ RTCpuSetEmpty(pSet);
+ idCpu = RTMpGetMaxCpuId();
+ do
+ {
+ if (RTMpIsCpuOnline(idCpu))
+ RTCpuSetAdd(pSet, idCpu);
+ } while (idCpu-- > 0);
+#else
+ RTCpuSetEmpty(pSet);
+ RTCpuSetAdd(pSet, RTMpCpuId());
+#endif
+ return pSet;
+}
+RT_EXPORT_SYMBOL(RTMpGetOnlineSet);
+
+
+RTDECL(RTCPUID) RTMpGetOnlineCount(void)
+{
+#ifdef CONFIG_SMP
+# if RTLNX_VER_MIN(2,6,0) || defined(num_online_cpus)
+ return num_online_cpus();
+# else
+ RTCPUSET Set;
+ RTMpGetOnlineSet(&Set);
+ return RTCpuSetCount(&Set);
+# endif
+#else
+ return 1;
+#endif
+}
+RT_EXPORT_SYMBOL(RTMpGetOnlineCount);
+
+
+RTDECL(bool) RTMpIsCpuWorkPending(void)
+{
+ /** @todo (not used on non-Windows platforms yet). */
+ return false;
+}
+RT_EXPORT_SYMBOL(RTMpIsCpuWorkPending);
+
+
+/**
+ * Wrapper between the native linux per-cpu callbacks and PFNRTWORKER.
+ *
+ * @param pvInfo Pointer to the RTMPARGS package.
+ */
+static void rtmpLinuxWrapper(void *pvInfo)
+{
+ PRTMPARGS pArgs = (PRTMPARGS)pvInfo;
+ ASMAtomicIncU32(&pArgs->cHits);
+ pArgs->pfnWorker(RTMpCpuId(), pArgs->pvUser1, pArgs->pvUser2);
+}
+
+#ifdef CONFIG_SMP
+
+# if RTLNX_VER_MIN(2,6,27)
+/**
+ * Wrapper between the native linux per-cpu callbacks and PFNRTWORKER, does hit
+ * increment after calling the worker.
+ *
+ * @param pvInfo Pointer to the RTMPARGS package.
+ */
+static void rtmpLinuxWrapperPostInc(void *pvInfo)
+{
+ PRTMPARGS pArgs = (PRTMPARGS)pvInfo;
+ pArgs->pfnWorker(RTMpCpuId(), pArgs->pvUser1, pArgs->pvUser2);
+ ASMAtomicIncU32(&pArgs->cHits);
+}
+# endif
+
+
+/**
+ * Wrapper between the native linux all-cpu callbacks and PFNRTWORKER.
+ *
+ * @param pvInfo Pointer to the RTMPARGS package.
+ */
+static void rtmpLinuxAllWrapper(void *pvInfo)
+{
+ PRTMPARGS pArgs = (PRTMPARGS)pvInfo;
+ PRTCPUSET pWorkerSet = pArgs->pWorkerSet;
+ RTCPUID idCpu = RTMpCpuId();
+ Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));
+
+ if (RTCpuSetIsMember(pWorkerSet, idCpu))
+ {
+ pArgs->pfnWorker(idCpu, pArgs->pvUser1, pArgs->pvUser2);
+ RTCpuSetDel(pWorkerSet, idCpu);
+ }
+}
+
+#endif /* CONFIG_SMP */
+
+RTDECL(int) RTMpOnAll(PFNRTMPWORKER pfnWorker, void *pvUser1, void *pvUser2)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ RTMPARGS Args;
+ RTCPUSET OnlineSet;
+ RTCPUID idCpu;
+#ifdef CONFIG_SMP
+ uint32_t cLoops;
+#endif
+
+ RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER;
+
+ Args.pfnWorker = pfnWorker;
+ Args.pvUser1 = pvUser1;
+ Args.pvUser2 = pvUser2;
+ Args.idCpu = NIL_RTCPUID;
+ Args.cHits = 0;
+
+ RTThreadPreemptDisable(&PreemptState);
+ RTMpGetOnlineSet(&OnlineSet);
+ Args.pWorkerSet = &OnlineSet;
+ idCpu = RTMpCpuId();
+
+#ifdef CONFIG_SMP
+ if (RTCpuSetCount(&OnlineSet) > 1)
+ {
+ /* Fire the function on all other CPUs without waiting for completion. */
+# if RTLNX_VER_MIN(5,3,0)
+ smp_call_function(rtmpLinuxAllWrapper, &Args, 0 /* wait */);
+# elif RTLNX_VER_MIN(2,6,27)
+ int rc = smp_call_function(rtmpLinuxAllWrapper, &Args, 0 /* wait */);
+ Assert(!rc); NOREF(rc);
+# else
+ int rc = smp_call_function(rtmpLinuxAllWrapper, &Args, 0 /* retry */, 0 /* wait */);
+ Assert(!rc); NOREF(rc);
+# endif
+ }
+#endif
+
+ /* Fire the function on this CPU. */
+ Args.pfnWorker(idCpu, Args.pvUser1, Args.pvUser2);
+ RTCpuSetDel(Args.pWorkerSet, idCpu);
+
+#ifdef CONFIG_SMP
+ /* Wait for all of them finish. */
+ cLoops = 64000;
+ while (!RTCpuSetIsEmpty(Args.pWorkerSet))
+ {
+ /* Periodically check if any CPU in the wait set has gone offline, if so update the wait set. */
+ if (!cLoops--)
+ {
+ RTCPUSET OnlineSetNow;
+ RTMpGetOnlineSet(&OnlineSetNow);
+ RTCpuSetAnd(Args.pWorkerSet, &OnlineSetNow);
+
+ cLoops = 64000;
+ }
+
+ ASMNopPause();
+ }
+#endif
+
+ RTThreadPreemptRestore(&PreemptState);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTMpOnAll);
+
+
+RTDECL(int) RTMpOnOthers(PFNRTMPWORKER pfnWorker, void *pvUser1, void *pvUser2)
+{
+#ifdef CONFIG_SMP
+ IPRT_LINUX_SAVE_EFL_AC();
+ RTMPARGS Args;
+
+ RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER;
+ Args.pfnWorker = pfnWorker;
+ Args.pvUser1 = pvUser1;
+ Args.pvUser2 = pvUser2;
+ Args.idCpu = NIL_RTCPUID;
+ Args.cHits = 0;
+
+ RTThreadPreemptDisable(&PreemptState);
+# if RTLNX_VER_MIN(5,3,0)
+ smp_call_function(rtmpLinuxWrapper, &Args, 1 /* wait */);
+# elif RTLNX_VER_MIN(2,6,27)
+ int rc = smp_call_function(rtmpLinuxWrapper, &Args, 1 /* wait */);
+ Assert(rc == 0); NOREF(rc);
+# else /* older kernels */
+ int rc = smp_call_function(rtmpLinuxWrapper, &Args, 0 /* retry */, 1 /* wait */);
+ Assert(rc == 0); NOREF(rc);
+# endif /* older kernels */
+ RTThreadPreemptRestore(&PreemptState);
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+#else
+ RT_NOREF(pfnWorker, pvUser1, pvUser2);
+#endif
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTMpOnOthers);
+
+
+#if RTLNX_VER_MAX(2,6,27) && defined(CONFIG_SMP)
+/**
+ * Wrapper between the native linux per-cpu callbacks and PFNRTWORKER
+ * employed by RTMpOnPair on older kernels that lacks smp_call_function_many.
+ *
+ * @param pvInfo Pointer to the RTMPARGS package.
+ */
+static void rtMpLinuxOnPairWrapper(void *pvInfo)
+{
+ PRTMPARGS pArgs = (PRTMPARGS)pvInfo;
+ RTCPUID idCpu = RTMpCpuId();
+
+ if ( idCpu == pArgs->idCpu
+ || idCpu == pArgs->idCpu2)
+ {
+ pArgs->pfnWorker(idCpu, pArgs->pvUser1, pArgs->pvUser2);
+ ASMAtomicIncU32(&pArgs->cHits);
+ }
+}
+#endif
+
+
+RTDECL(int) RTMpOnPair(RTCPUID idCpu1, RTCPUID idCpu2, uint32_t fFlags, PFNRTMPWORKER pfnWorker, void *pvUser1, void *pvUser2)
+{
+#ifdef CONFIG_SMP
+ IPRT_LINUX_SAVE_EFL_AC();
+ int rc;
+ RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER;
+# if RTLNX_VER_MIN(2,6,28) /* 2.6.28 introduces CONFIG_CPUMASK_OFFSTACK */
+ cpumask_var_t DstCpuMask;
+# elif RTLNX_VER_MIN(2,6,27)
+ cpumask_t DstCpuMask;
+# endif
+
+ AssertReturn(idCpu1 != idCpu2, VERR_INVALID_PARAMETER);
+ AssertReturn(!(fFlags & RTMPON_F_VALID_MASK), VERR_INVALID_FLAGS);
+
+ /*
+ * Prepare the CPU mask before we disable preemption.
+ */
+# if RTLNX_VER_MIN(2,6,30)
+ if (!zalloc_cpumask_var(&DstCpuMask, GFP_KERNEL))
+ return VERR_NO_MEMORY;
+ cpumask_set_cpu(idCpu1, DstCpuMask);
+ cpumask_set_cpu(idCpu2, DstCpuMask);
+# elif RTLNX_VER_MIN(2,6,28)
+ if (!alloc_cpumask_var(&DstCpuMask, GFP_KERNEL))
+ return VERR_NO_MEMORY;
+ cpumask_clear(DstCpuMask);
+ cpumask_set_cpu(idCpu1, DstCpuMask);
+ cpumask_set_cpu(idCpu2, DstCpuMask);
+# elif RTLNX_VER_MIN(2,6,27)
+ cpus_clear(DstCpuMask);
+ cpu_set(idCpu1, DstCpuMask);
+ cpu_set(idCpu2, DstCpuMask);
+# endif
+
+ /*
+ * Check that both CPUs are online before doing the broadcast call.
+ */
+ RTThreadPreemptDisable(&PreemptState);
+ if ( RTMpIsCpuOnline(idCpu1)
+ && RTMpIsCpuOnline(idCpu2))
+ {
+ /*
+ * Use the smp_call_function variant taking a cpu mask where available,
+ * falling back on broadcast with filter. Slight snag if one of the
+ * CPUs is the one we're running on, we must do the call and the post
+ * call wait ourselves.
+ */
+ RTCPUID idCpuSelf = RTMpCpuId();
+ bool const fCallSelf = idCpuSelf == idCpu1 || idCpuSelf == idCpu2;
+ RTMPARGS Args;
+ Args.pfnWorker = pfnWorker;
+ Args.pvUser1 = pvUser1;
+ Args.pvUser2 = pvUser2;
+ Args.idCpu = idCpu1;
+ Args.idCpu2 = idCpu2;
+ Args.cHits = 0;
+
+# if RTLNX_VER_MIN(2,6,28)
+ smp_call_function_many(DstCpuMask, rtmpLinuxWrapperPostInc, &Args, !fCallSelf /* wait */);
+ rc = 0;
+# elif RTLNX_VER_MIN(2,6,27)
+ rc = smp_call_function_mask(DstCpuMask, rtmpLinuxWrapperPostInc, &Args, !fCallSelf /* wait */);
+# else /* older kernels */
+ rc = smp_call_function(rtMpLinuxOnPairWrapper, &Args, 0 /* retry */, !fCallSelf /* wait */);
+# endif /* older kernels */
+ Assert(rc == 0);
+
+ /* Call ourselves if necessary and wait for the other party to be done. */
+ if (fCallSelf)
+ {
+ uint32_t cLoops = 0;
+ rtmpLinuxWrapper(&Args);
+ while (ASMAtomicReadU32(&Args.cHits) < 2)
+ {
+ if ((cLoops & 0x1ff) == 0 && !RTMpIsCpuOnline(idCpuSelf == idCpu1 ? idCpu2 : idCpu1))
+ break;
+ cLoops++;
+ ASMNopPause();
+ }
+ }
+
+ Assert(Args.cHits <= 2);
+ if (Args.cHits == 2)
+ rc = VINF_SUCCESS;
+ else if (Args.cHits == 1)
+ rc = VERR_NOT_ALL_CPUS_SHOWED;
+ else if (Args.cHits == 0)
+ rc = VERR_CPU_OFFLINE;
+ else
+ rc = VERR_CPU_IPE_1;
+ }
+ /*
+ * A CPU must be present to be considered just offline.
+ */
+ else if ( RTMpIsCpuPresent(idCpu1)
+ && RTMpIsCpuPresent(idCpu2))
+ rc = VERR_CPU_OFFLINE;
+ else
+ rc = VERR_CPU_NOT_FOUND;
+
+ RTThreadPreemptRestore(&PreemptState);;
+# if RTLNX_VER_MIN(2,6,28)
+ free_cpumask_var(DstCpuMask);
+# endif
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+
+#else /* !CONFIG_SMP */
+ RT_NOREF(idCpu1, idCpu2, fFlags, pfnWorker, pvUser1, pvUser2);
+ return VERR_CPU_NOT_FOUND;
+#endif /* !CONFIG_SMP */
+}
+RT_EXPORT_SYMBOL(RTMpOnPair);
+
+
+RTDECL(bool) RTMpOnPairIsConcurrentExecSupported(void)
+{
+ return true;
+}
+RT_EXPORT_SYMBOL(RTMpOnPairIsConcurrentExecSupported);
+
+
+#if RTLNX_VER_MAX(2,6,19) && defined(CONFIG_SMP)
+/**
+ * Wrapper between the native linux per-cpu callbacks and PFNRTWORKER
+ * employed by RTMpOnSpecific on older kernels that lacks smp_call_function_single.
+ *
+ * @param pvInfo Pointer to the RTMPARGS package.
+ */
+static void rtmpOnSpecificLinuxWrapper(void *pvInfo)
+{
+ PRTMPARGS pArgs = (PRTMPARGS)pvInfo;
+ RTCPUID idCpu = RTMpCpuId();
+
+ if (idCpu == pArgs->idCpu)
+ {
+ pArgs->pfnWorker(idCpu, pArgs->pvUser1, pArgs->pvUser2);
+ ASMAtomicIncU32(&pArgs->cHits);
+ }
+}
+#endif
+
+
+RTDECL(int) RTMpOnSpecific(RTCPUID idCpu, PFNRTMPWORKER pfnWorker, void *pvUser1, void *pvUser2)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ int rc;
+ RTMPARGS Args;
+
+ RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER;
+ Args.pfnWorker = pfnWorker;
+ Args.pvUser1 = pvUser1;
+ Args.pvUser2 = pvUser2;
+ Args.idCpu = idCpu;
+ Args.cHits = 0;
+
+ if (!RTMpIsCpuPossible(idCpu))
+ return VERR_CPU_NOT_FOUND;
+
+ RTThreadPreemptDisable(&PreemptState);
+ if (idCpu != RTMpCpuId())
+ {
+#ifdef CONFIG_SMP
+ if (RTMpIsCpuOnline(idCpu))
+ {
+# if RTLNX_VER_MIN(2,6,27)
+ rc = smp_call_function_single(idCpu, rtmpLinuxWrapper, &Args, 1 /* wait */);
+# elif RTLNX_VER_MIN(2,6,19)
+ rc = smp_call_function_single(idCpu, rtmpLinuxWrapper, &Args, 0 /* retry */, 1 /* wait */);
+# else /* older kernels */
+ rc = smp_call_function(rtmpOnSpecificLinuxWrapper, &Args, 0 /* retry */, 1 /* wait */);
+# endif /* older kernels */
+ Assert(rc == 0);
+ rc = Args.cHits ? VINF_SUCCESS : VERR_CPU_OFFLINE;
+ }
+ else
+#endif /* CONFIG_SMP */
+ rc = VERR_CPU_OFFLINE;
+ }
+ else
+ {
+ rtmpLinuxWrapper(&Args);
+ rc = VINF_SUCCESS;
+ }
+ RTThreadPreemptRestore(&PreemptState);;
+
+ NOREF(rc);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+}
+RT_EXPORT_SYMBOL(RTMpOnSpecific);
+
+
+#if RTLNX_VER_MIN(2,6,19) && defined(CONFIG_SMP)
+/**
+ * Dummy callback used by RTMpPokeCpu.
+ *
+ * @param pvInfo Ignored.
+ */
+static void rtmpLinuxPokeCpuCallback(void *pvInfo)
+{
+ NOREF(pvInfo);
+}
+#endif
+
+
+RTDECL(int) RTMpPokeCpu(RTCPUID idCpu)
+{
+#if RTLNX_VER_MIN(2,6,19)
+ IPRT_LINUX_SAVE_EFL_AC();
+ int rc;
+ if (RTMpIsCpuPossible(idCpu))
+ {
+ if (RTMpIsCpuOnline(idCpu))
+ {
+# ifdef CONFIG_SMP
+# if RTLNX_VER_MIN(2,6,27)
+ rc = smp_call_function_single(idCpu, rtmpLinuxPokeCpuCallback, NULL, 0 /* wait */);
+# elif RTLNX_VER_MIN(2,6,19)
+ rc = smp_call_function_single(idCpu, rtmpLinuxPokeCpuCallback, NULL, 0 /* retry */, 0 /* wait */);
+# else /* older kernels */
+# error oops
+# endif /* older kernels */
+ Assert(rc == 0);
+# endif /* CONFIG_SMP */
+ rc = VINF_SUCCESS;
+ }
+ else
+ rc = VERR_CPU_OFFLINE;
+ }
+ else
+ rc = VERR_CPU_NOT_FOUND;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+
+#else /* older kernels */
+ /* no unicast here? */
+ return VERR_NOT_SUPPORTED;
+#endif /* older kernels */
+}
+RT_EXPORT_SYMBOL(RTMpPokeCpu);
+
+
+RTDECL(bool) RTMpOnAllIsConcurrentSafe(void)
+{
+ return true;
+}
+RT_EXPORT_SYMBOL(RTMpOnAllIsConcurrentSafe);
+
diff --git a/src/VBox/Runtime/r0drv/linux/mpnotification-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/mpnotification-r0drv-linux.c
new file mode 100644
index 00000000..99b8748e
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/mpnotification-r0drv-linux.c
@@ -0,0 +1,258 @@
+/* $Id: mpnotification-r0drv-linux.c $ */
+/** @file
+ * IPRT - Multiprocessor Event Notifications, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2008-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+
+#include <iprt/asm-amd64-x86.h>
+#include <iprt/errcore.h>
+#include <iprt/cpuset.h>
+#include <iprt/thread.h>
+#include "r0drv/mp-r0drv.h"
+
+#if RTLNX_VER_MIN(4,10,0)
+
+static enum cpuhp_state g_rtR0MpOnline;
+
+/*
+ * Linux 4.10 completely removed CPU notifiers. So let's switch to CPU hotplug
+ * notification.
+ */
+
+static int rtR0MpNotificationLinuxOnline(unsigned int cpu)
+{
+ RTCPUID idCpu = RTMpCpuIdFromSetIndex(cpu);
+ rtMpNotificationDoCallbacks(RTMPEVENT_ONLINE, idCpu);
+ return 0;
+}
+
+static int rtR0MpNotificationLinuxOffline(unsigned int cpu)
+{
+ RTCPUID idCpu = RTMpCpuIdFromSetIndex(cpu);
+ rtMpNotificationDoCallbacks(RTMPEVENT_OFFLINE, idCpu);
+ return 0;
+}
+
+DECLHIDDEN(int) rtR0MpNotificationNativeInit(void)
+{
+ int rc;
+ IPRT_LINUX_SAVE_EFL_AC();
+ rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "vboxdrv:online",
+ rtR0MpNotificationLinuxOnline, rtR0MpNotificationLinuxOffline);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ /*
+ * cpuhp_setup_state_nocalls() returns a positive state number for
+ * CPUHP_AP_ONLINE_DYN or -ENOSPC if there is no free slot available
+ * (see cpuhp_reserve_state / definition of CPUHP_AP_ONLINE_DYN).
+ */
+ AssertMsgReturn(rc > 0, ("%d\n", rc), RTErrConvertFromErrno(rc));
+ g_rtR0MpOnline = rc;
+ return VINF_SUCCESS;
+}
+
+
+DECLHIDDEN(void) rtR0MpNotificationNativeTerm(void)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ cpuhp_remove_state_nocalls(g_rtR0MpOnline);
+ IPRT_LINUX_RESTORE_EFL_AC();
+}
+
+#elif RTLNX_VER_MIN(2,5,71) && defined(CONFIG_SMP)
+
+static int rtMpNotificationLinuxCallback(struct notifier_block *pNotifierBlock, unsigned long ulNativeEvent, void *pvCpu);
+
+/**
+ * The notifier block we use for registering the callback.
+ */
+static struct notifier_block g_NotifierBlock =
+{
+ .notifier_call = rtMpNotificationLinuxCallback,
+ .next = NULL,
+ .priority = 0
+};
+
+# ifdef CPU_DOWN_FAILED
+/**
+ * The set of CPUs we've seen going offline recently.
+ */
+static RTCPUSET g_MpPendingOfflineSet;
+# endif
+
+
+/**
+ * The native callback.
+ *
+ * @returns NOTIFY_DONE.
+ * @param pNotifierBlock Pointer to g_NotifierBlock.
+ * @param ulNativeEvent The native event.
+ * @param pvCpu The cpu id cast into a pointer value.
+ *
+ * @remarks This can fire with preemption enabled and on any CPU.
+ */
+static int rtMpNotificationLinuxCallback(struct notifier_block *pNotifierBlock, unsigned long ulNativeEvent, void *pvCpu)
+{
+ bool fProcessEvent = false;
+ RTCPUID idCpu = (uintptr_t)pvCpu;
+ NOREF(pNotifierBlock);
+
+ /*
+ * Note that redhat/CentOS ported _some_ of the FROZEN macros
+ * back to their 2.6.18-92.1.10.el5 kernel but actually don't
+ * use them. Thus we have to test for both CPU_TASKS_FROZEN and
+ * the individual event variants.
+ */
+ switch (ulNativeEvent)
+ {
+ /*
+ * Pick up online events or failures to go offline.
+ * Ignore failure events for CPUs we didn't see go offline.
+ */
+# ifdef CPU_DOWN_FAILED
+ case CPU_DOWN_FAILED:
+# if defined(CPU_TASKS_FROZEN) && defined(CPU_DOWN_FAILED_FROZEN)
+ case CPU_DOWN_FAILED_FROZEN:
+# endif
+ if (!RTCpuSetIsMember(&g_MpPendingOfflineSet, idCpu))
+ break; /* fProcessEvents = false */
+ /* fall thru */
+# endif
+ case CPU_ONLINE:
+# if defined(CPU_TASKS_FROZEN) && defined(CPU_ONLINE_FROZEN)
+ case CPU_ONLINE_FROZEN:
+# endif
+# ifdef CPU_DOWN_FAILED
+ RTCpuSetDel(&g_MpPendingOfflineSet, idCpu);
+# endif
+ fProcessEvent = true;
+ break;
+
+ /*
+ * Pick the earliest possible offline event.
+ * The only important thing here is that we get the event and that
+ * it's exactly one.
+ */
+# ifdef CPU_DOWN_PREPARE
+ case CPU_DOWN_PREPARE:
+# if defined(CPU_TASKS_FROZEN) && defined(CPU_DOWN_PREPARE_FROZEN)
+ case CPU_DOWN_PREPARE_FROZEN:
+# endif
+ fProcessEvent = true;
+# else
+ case CPU_DEAD:
+# if defined(CPU_TASKS_FROZEN) && defined(CPU_DEAD_FROZEN)
+ case CPU_DEAD_FROZEN:
+# endif
+ /* Don't process CPU_DEAD notifications. */
+# endif
+# ifdef CPU_DOWN_FAILED
+ RTCpuSetAdd(&g_MpPendingOfflineSet, idCpu);
+# endif
+ break;
+ }
+
+ if (!fProcessEvent)
+ return NOTIFY_DONE;
+
+ switch (ulNativeEvent)
+ {
+# ifdef CPU_DOWN_FAILED
+ case CPU_DOWN_FAILED:
+# if defined(CPU_TASKS_FROZEN) && defined(CPU_DOWN_FAILED_FROZEN)
+ case CPU_DOWN_FAILED_FROZEN:
+# endif
+# endif
+ case CPU_ONLINE:
+# if defined(CPU_TASKS_FROZEN) && defined(CPU_ONLINE_FROZEN)
+ case CPU_ONLINE_FROZEN:
+# endif
+ rtMpNotificationDoCallbacks(RTMPEVENT_ONLINE, idCpu);
+ break;
+
+# ifdef CPU_DOWN_PREPARE
+ case CPU_DOWN_PREPARE:
+# if defined(CPU_TASKS_FROZEN) && defined(CPU_DOWN_PREPARE_FROZEN)
+ case CPU_DOWN_PREPARE_FROZEN:
+# endif
+ rtMpNotificationDoCallbacks(RTMPEVENT_OFFLINE, idCpu);
+ break;
+# endif
+ }
+
+ return NOTIFY_DONE;
+}
+
+
+DECLHIDDEN(int) rtR0MpNotificationNativeInit(void)
+{
+ int rc;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+# ifdef CPU_DOWN_FAILED
+ RTCpuSetEmpty(&g_MpPendingOfflineSet);
+# endif
+
+ rc = register_cpu_notifier(&g_NotifierBlock);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ AssertMsgReturn(!rc, ("%d\n", rc), RTErrConvertFromErrno(rc));
+ return VINF_SUCCESS;
+}
+
+
+DECLHIDDEN(void) rtR0MpNotificationNativeTerm(void)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ unregister_cpu_notifier(&g_NotifierBlock);
+ IPRT_LINUX_RESTORE_EFL_AC();
+}
+
+#else /* Not supported / Not needed */
+
+DECLHIDDEN(int) rtR0MpNotificationNativeInit(void)
+{
+ return VINF_SUCCESS;
+}
+
+DECLHIDDEN(void) rtR0MpNotificationNativeTerm(void)
+{
+}
+
+#endif /* Not supported / Not needed */
+
diff --git a/src/VBox/Runtime/r0drv/linux/process-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/process-r0drv-linux.c
new file mode 100644
index 00000000..3179a95e
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/process-r0drv-linux.c
@@ -0,0 +1,59 @@
+/* $Id: process-r0drv-linux.c $ */
+/** @file
+ * IPRT - Process, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+
+#include <iprt/process.h>
+
+
+RTDECL(RTPROCESS) RTProcSelf(void)
+{
+ return (RTPROCESS)current->tgid;
+}
+RT_EXPORT_SYMBOL(RTProcSelf);
+
+
+RTR0DECL(RTR0PROCESS) RTR0ProcHandleSelf(void)
+{
+ return (RTR0PROCESS)current->tgid;
+}
+RT_EXPORT_SYMBOL(RTR0ProcHandleSelf);
+
diff --git a/src/VBox/Runtime/r0drv/linux/rtStrFormatKernelAddress-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/rtStrFormatKernelAddress-r0drv-linux.c
new file mode 100644
index 00000000..14cfc4f7
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/rtStrFormatKernelAddress-r0drv-linux.c
@@ -0,0 +1,66 @@
+/* $Id: rtStrFormatKernelAddress-r0drv-linux.c $ */
+/** @file
+ * IPRT - IPRT String Formatter, ring-0 addresses.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_STRING
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+
+#include <iprt/assert.h>
+#include <iprt/string.h>
+
+#include "internal/string.h"
+
+
+DECLHIDDEN(size_t) rtStrFormatKernelAddress(char *pszBuf, size_t cbBuf, RTR0INTPTR uPtr, signed int cchWidth,
+ signed int cchPrecision, unsigned int fFlags)
+{
+#if !defined(DEBUG) && RTLNX_VER_MIN(2,6,38)
+ RT_NOREF(cchWidth, cchPrecision);
+ /* use the Linux kernel function which is able to handle "%pK" */
+ static const char s_szFmt[] = "0x%pK";
+ const char *pszFmt = s_szFmt;
+ if (!(fFlags & RTSTR_F_SPECIAL))
+ pszFmt += 2;
+ return scnprintf(pszBuf, cbBuf, pszFmt, uPtr);
+#else
+ Assert(cbBuf >= 64);
+ return RTStrFormatNumber(pszBuf, uPtr, 16, cchWidth, cchPrecision, fFlags);
+#endif
+}
diff --git a/src/VBox/Runtime/r0drv/linux/semevent-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/semevent-r0drv-linux.c
new file mode 100644
index 00000000..0b1ca618
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/semevent-r0drv-linux.c
@@ -0,0 +1,296 @@
+/* $Id: semevent-r0drv-linux.c $ */
+/** @file
+ * IPRT - Single Release Event Semaphores, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define RTSEMEVENT_WITHOUT_REMAPPING
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+#include <iprt/semaphore.h>
+
+#include <iprt/asm.h>
+#include <iprt/assert.h>
+#include <iprt/err.h>
+#include <iprt/lockvalidator.h>
+#include <iprt/mem.h>
+
+#include "waitqueue-r0drv-linux.h"
+#include "internal/magics.h"
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+/**
+ * Linux event semaphore.
+ */
+typedef struct RTSEMEVENTINTERNAL
+{
+ /** Magic value (RTSEMEVENT_MAGIC). */
+ uint32_t volatile u32Magic;
+ /** The object status - !0 when signaled and 0 when reset. */
+ uint32_t volatile fState;
+ /** Reference counter. */
+ uint32_t volatile cRefs;
+ /** The wait queue. */
+ wait_queue_head_t Head;
+} RTSEMEVENTINTERNAL, *PRTSEMEVENTINTERNAL;
+
+
+
+RTDECL(int) RTSemEventCreate(PRTSEMEVENT phEventSem)
+{
+ return RTSemEventCreateEx(phEventSem, 0 /*fFlags*/, NIL_RTLOCKVALCLASS, NULL);
+}
+
+
+RTDECL(int) RTSemEventCreateEx(PRTSEMEVENT phEventSem, uint32_t fFlags, RTLOCKVALCLASS hClass, const char *pszNameFmt, ...)
+{
+ PRTSEMEVENTINTERNAL pThis;
+ IPRT_LINUX_SAVE_EFL_AC();
+ RT_NOREF_PV(hClass); RT_NOREF_PV(pszNameFmt);
+
+ AssertReturn(!(fFlags & ~(RTSEMEVENT_FLAGS_NO_LOCK_VAL | RTSEMEVENT_FLAGS_BOOTSTRAP_HACK)), VERR_INVALID_PARAMETER);
+ Assert(!(fFlags & RTSEMEVENT_FLAGS_BOOTSTRAP_HACK) || (fFlags & RTSEMEVENT_FLAGS_NO_LOCK_VAL));
+
+ pThis = (PRTSEMEVENTINTERNAL)RTMemAlloc(sizeof(*pThis));
+ if (!pThis)
+ return VERR_NO_MEMORY;
+
+ pThis->u32Magic = RTSEMEVENT_MAGIC;
+ pThis->fState = 0;
+ pThis->cRefs = 1;
+ init_waitqueue_head(&pThis->Head);
+
+ *phEventSem = pThis;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSemEventCreate);
+
+
+/**
+ * Retains a reference to the event semaphore.
+ *
+ * @param pThis The event semaphore.
+ */
+DECLINLINE(void) rtR0SemEventLnxRetain(PRTSEMEVENTINTERNAL pThis)
+{
+ uint32_t cRefs = ASMAtomicIncU32(&pThis->cRefs);
+ Assert(cRefs < 100000); NOREF(cRefs);
+}
+
+
+/**
+ * Releases a reference to the event semaphore.
+ *
+ * @param pThis The event semaphore.
+ */
+DECLINLINE(void) rtR0SemEventLnxRelease(PRTSEMEVENTINTERNAL pThis)
+{
+ if (RT_UNLIKELY(ASMAtomicDecU32(&pThis->cRefs) == 0))
+ RTMemFree(pThis);
+}
+
+
+RTDECL(int) RTSemEventDestroy(RTSEMEVENT hEventSem)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate input.
+ */
+ PRTSEMEVENTINTERNAL pThis = hEventSem;
+ if (pThis == NIL_RTSEMEVENT)
+ return VINF_SUCCESS;
+ AssertMsgReturn(pThis->u32Magic == RTSEMEVENT_MAGIC, ("pThis->u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE);
+ Assert(pThis->cRefs > 0);
+
+ /*
+ * Invalidate it and signal the object just in case.
+ */
+ ASMAtomicWriteU32(&pThis->u32Magic, ~RTSEMEVENT_MAGIC);
+ ASMAtomicWriteU32(&pThis->fState, 0);
+ Assert(!waitqueue_active(&pThis->Head));
+ wake_up_all(&pThis->Head);
+ rtR0SemEventLnxRelease(pThis);
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSemEventDestroy);
+
+
+RTDECL(int) RTSemEventSignal(RTSEMEVENT hEventSem)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate input.
+ */
+ PRTSEMEVENTINTERNAL pThis = (PRTSEMEVENTINTERNAL)hEventSem;
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertMsgReturn(pThis->u32Magic == RTSEMEVENT_MAGIC, ("pThis->u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE);
+ rtR0SemEventLnxRetain(pThis);
+
+ /*
+ * Signal the event object.
+ */
+ ASMAtomicWriteU32(&pThis->fState, 1);
+ wake_up(&pThis->Head);
+
+ rtR0SemEventLnxRelease(pThis);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSemEventSignal);
+
+
+/**
+ * Worker for RTSemEventWaitEx and RTSemEventWaitExDebug.
+ *
+ * @returns VBox status code.
+ * @param pThis The event semaphore.
+ * @param fFlags See RTSemEventWaitEx.
+ * @param uTimeout See RTSemEventWaitEx.
+ * @param pSrcPos The source code position of the wait.
+ */
+static int rtR0SemEventLnxWait(PRTSEMEVENTINTERNAL pThis, uint32_t fFlags, uint64_t uTimeout,
+ PCRTLOCKVALSRCPOS pSrcPos)
+{
+ int rc;
+ RT_NOREF_PV(pSrcPos);
+
+ /*
+ * Validate the input.
+ */
+ AssertPtrReturn(pThis, VERR_INVALID_PARAMETER);
+ AssertMsgReturn(pThis->u32Magic == RTSEMEVENT_MAGIC, ("%p u32Magic=%RX32\n", pThis, pThis->u32Magic), VERR_INVALID_PARAMETER);
+ AssertReturn(RTSEMWAIT_FLAGS_ARE_VALID(fFlags), VERR_INVALID_PARAMETER);
+ rtR0SemEventLnxRetain(pThis);
+
+ /*
+ * Try grab the event without setting up the wait.
+ */
+ if ( 1 /** @todo check if there are someone waiting already - waitqueue_active, but then what do we do below? */
+ && ASMAtomicCmpXchgU32(&pThis->fState, 0, 1))
+ rc = VINF_SUCCESS;
+ else
+ {
+ /*
+ * We have to wait.
+ */
+ IPRT_LINUX_SAVE_EFL_AC();
+ RTR0SEMLNXWAIT Wait;
+ rc = rtR0SemLnxWaitInit(&Wait, fFlags, uTimeout, &pThis->Head);
+ if (RT_SUCCESS(rc))
+ {
+ IPRT_DEBUG_SEMS_STATE(pThis, 'E');
+ for (;;)
+ {
+ /* The destruction test. */
+ if (RT_UNLIKELY(pThis->u32Magic != RTSEMEVENT_MAGIC))
+ rc = VERR_SEM_DESTROYED;
+ else
+ {
+ rtR0SemLnxWaitPrepare(&Wait);
+
+ /* Check the exit conditions. */
+ if (RT_UNLIKELY(pThis->u32Magic != RTSEMEVENT_MAGIC))
+ rc = VERR_SEM_DESTROYED;
+ else if (ASMAtomicCmpXchgU32(&pThis->fState, 0, 1))
+ rc = VINF_SUCCESS;
+ else if (rtR0SemLnxWaitHasTimedOut(&Wait))
+ rc = VERR_TIMEOUT;
+ else if (rtR0SemLnxWaitWasInterrupted(&Wait))
+ rc = VERR_INTERRUPTED;
+ else
+ {
+ /* Do the wait and then recheck the conditions. */
+ rtR0SemLnxWaitDoIt(&Wait);
+ continue;
+ }
+ }
+ break;
+ }
+
+ rtR0SemLnxWaitDelete(&Wait);
+ IPRT_DEBUG_SEMS_STATE_RC(pThis, 'E', rc);
+ }
+ IPRT_LINUX_RESTORE_EFL_AC();
+ }
+
+ rtR0SemEventLnxRelease(pThis);
+ return rc;
+}
+
+
+RTDECL(int) RTSemEventWaitEx(RTSEMEVENT hEventSem, uint32_t fFlags, uint64_t uTimeout)
+{
+#ifndef RTSEMEVENT_STRICT
+ return rtR0SemEventLnxWait(hEventSem, fFlags, uTimeout, NULL);
+#else
+ RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API();
+ return rtR0SemEventLnxWait(hEventSem, fFlags, uTimeout, &SrcPos);
+#endif
+}
+RT_EXPORT_SYMBOL(RTSemEventWaitEx);
+
+
+RTDECL(int) RTSemEventWaitExDebug(RTSEMEVENT hEventSem, uint32_t fFlags, uint64_t uTimeout,
+ RTHCUINTPTR uId, RT_SRC_POS_DECL)
+{
+ RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_DEBUG_API();
+ return rtR0SemEventLnxWait(hEventSem, fFlags, uTimeout, &SrcPos);
+}
+RT_EXPORT_SYMBOL(RTSemEventWaitExDebug);
+
+
+RTDECL(uint32_t) RTSemEventGetResolution(void)
+{
+ return rtR0SemLnxWaitGetResolution();
+}
+RT_EXPORT_SYMBOL(RTSemEventGetResolution);
+
+
+RTR0DECL(bool) RTSemEventIsSignalSafe(void)
+{
+ return true;
+}
+RT_EXPORT_SYMBOL(RTSemEventIsSignalSafe);
+
diff --git a/src/VBox/Runtime/r0drv/linux/semeventmulti-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/semeventmulti-r0drv-linux.c
new file mode 100644
index 00000000..fbc028a1
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/semeventmulti-r0drv-linux.c
@@ -0,0 +1,361 @@
+/* $Id: semeventmulti-r0drv-linux.c $ */
+/** @file
+ * IPRT - Multiple Release Event Semaphores, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define RTSEMEVENTMULTI_WITHOUT_REMAPPING
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+#include <iprt/semaphore.h>
+
+#include <iprt/assert.h>
+#include <iprt/asm.h>
+#include <iprt/err.h>
+#include <iprt/mem.h>
+#include <iprt/lockvalidator.h>
+
+#include "waitqueue-r0drv-linux.h"
+#include "internal/magics.h"
+
+
+/*********************************************************************************************************************************
+* Defined Constants And Macros *
+*********************************************************************************************************************************/
+/** @name fStateAndGen values
+ * @{ */
+/** The state bit number. */
+#define RTSEMEVENTMULTILNX_STATE_BIT 0
+/** The state mask. */
+#define RTSEMEVENTMULTILNX_STATE_MASK RT_BIT_32(RTSEMEVENTMULTILNX_STATE_BIT)
+/** The generation mask. */
+#define RTSEMEVENTMULTILNX_GEN_MASK ~RTSEMEVENTMULTILNX_STATE_MASK
+/** The generation shift. */
+#define RTSEMEVENTMULTILNX_GEN_SHIFT 1
+/** The initial variable value. */
+#define RTSEMEVENTMULTILNX_STATE_GEN_INIT UINT32_C(0xfffffffc)
+/** @} */
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+/**
+ * Linux event semaphore.
+ */
+typedef struct RTSEMEVENTMULTIINTERNAL
+{
+ /** Magic value (RTSEMEVENTMULTI_MAGIC). */
+ uint32_t volatile u32Magic;
+ /** The object state bit and generation counter.
+ * The generation counter is incremented every time the object is
+ * signalled. */
+ uint32_t volatile fStateAndGen;
+ /** Reference counter. */
+ uint32_t volatile cRefs;
+ /** The wait queue. */
+ wait_queue_head_t Head;
+} RTSEMEVENTMULTIINTERNAL, *PRTSEMEVENTMULTIINTERNAL;
+
+
+
+
+
+RTDECL(int) RTSemEventMultiCreate(PRTSEMEVENTMULTI phEventMultiSem)
+{
+ return RTSemEventMultiCreateEx(phEventMultiSem, 0 /*fFlags*/, NIL_RTLOCKVALCLASS, NULL);
+}
+
+
+RTDECL(int) RTSemEventMultiCreateEx(PRTSEMEVENTMULTI phEventMultiSem, uint32_t fFlags, RTLOCKVALCLASS hClass,
+ const char *pszNameFmt, ...)
+{
+ PRTSEMEVENTMULTIINTERNAL pThis;
+ IPRT_LINUX_SAVE_EFL_AC();
+ RT_NOREF_PV(hClass); RT_NOREF_PV(pszNameFmt);
+
+ AssertReturn(!(fFlags & ~RTSEMEVENTMULTI_FLAGS_NO_LOCK_VAL), VERR_INVALID_PARAMETER);
+ pThis = (PRTSEMEVENTMULTIINTERNAL)RTMemAlloc(sizeof(*pThis));
+ if (pThis)
+ {
+ pThis->u32Magic = RTSEMEVENTMULTI_MAGIC;
+ pThis->fStateAndGen = RTSEMEVENTMULTILNX_STATE_GEN_INIT;
+ pThis->cRefs = 1;
+ init_waitqueue_head(&pThis->Head);
+
+ *phEventMultiSem = pThis;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_NO_MEMORY;
+}
+RT_EXPORT_SYMBOL(RTSemEventMultiCreate);
+
+
+/**
+ * Retain a reference to the semaphore.
+ *
+ * @param pThis The semaphore.
+ */
+DECLINLINE(void) rtR0SemEventMultiLnxRetain(PRTSEMEVENTMULTIINTERNAL pThis)
+{
+ uint32_t cRefs = ASMAtomicIncU32(&pThis->cRefs);
+ NOREF(cRefs);
+ Assert(cRefs && cRefs < 100000);
+}
+
+
+/**
+ * Release a reference, destroy the thing if necessary.
+ *
+ * @param pThis The semaphore.
+ */
+DECLINLINE(void) rtR0SemEventMultiLnxRelease(PRTSEMEVENTMULTIINTERNAL pThis)
+{
+ if (RT_UNLIKELY(ASMAtomicDecU32(&pThis->cRefs) == 0))
+ {
+ Assert(pThis->u32Magic != RTSEMEVENTMULTI_MAGIC);
+ RTMemFree(pThis);
+ }
+}
+
+
+RTDECL(int) RTSemEventMultiDestroy(RTSEMEVENTMULTI hEventMultiSem)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate input.
+ */
+ PRTSEMEVENTMULTIINTERNAL pThis = (PRTSEMEVENTMULTIINTERNAL)hEventMultiSem;
+ if (pThis == NIL_RTSEMEVENTMULTI)
+ return VINF_SUCCESS;
+ AssertPtrReturn(pThis, VERR_INVALID_PARAMETER);
+ AssertMsgReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, ("%p u32Magic=%RX32\n", pThis, pThis->u32Magic), VERR_INVALID_PARAMETER);
+ Assert(pThis->cRefs > 0);
+
+ /*
+ * Invalidate it and signal the object just in case.
+ */
+ ASMAtomicWriteU32(&pThis->u32Magic, ~RTSEMEVENTMULTI_MAGIC);
+ ASMAtomicAndU32(&pThis->fStateAndGen, RTSEMEVENTMULTILNX_GEN_MASK);
+ Assert(!waitqueue_active(&pThis->Head));
+ wake_up_all(&pThis->Head);
+ rtR0SemEventMultiLnxRelease(pThis);
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSemEventMultiDestroy);
+
+
+RTDECL(int) RTSemEventMultiSignal(RTSEMEVENTMULTI hEventMultiSem)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ uint32_t fNew;
+ uint32_t fOld;
+
+ /*
+ * Validate input.
+ */
+ PRTSEMEVENTMULTIINTERNAL pThis = (PRTSEMEVENTMULTIINTERNAL)hEventMultiSem;
+ if (!pThis)
+ return VERR_INVALID_PARAMETER;
+ AssertPtrReturn(pThis, VERR_INVALID_PARAMETER);
+ AssertMsgReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, ("%p u32Magic=%RX32\n", pThis, pThis->u32Magic), VERR_INVALID_PARAMETER);
+ rtR0SemEventMultiLnxRetain(pThis);
+
+ /*
+ * Signal the event object. The cause of the paranoia here is racing to try
+ * deal with racing RTSemEventMultiSignal calls (should probably be
+ * forbidden, but it's relatively easy to handle).
+ */
+ do
+ {
+ fNew = fOld = ASMAtomicUoReadU32(&pThis->fStateAndGen);
+ fNew += 1 << RTSEMEVENTMULTILNX_GEN_SHIFT;
+ fNew |= RTSEMEVENTMULTILNX_STATE_MASK;
+ }
+ while (!ASMAtomicCmpXchgU32(&pThis->fStateAndGen, fNew, fOld));
+
+ wake_up_all(&pThis->Head);
+
+ rtR0SemEventMultiLnxRelease(pThis);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSemEventMultiSignal);
+
+
+RTDECL(int) RTSemEventMultiReset(RTSEMEVENTMULTI hEventMultiSem)
+{
+ /*
+ * Validate input.
+ */
+ PRTSEMEVENTMULTIINTERNAL pThis = (PRTSEMEVENTMULTIINTERNAL)hEventMultiSem;
+ if (!pThis)
+ return VERR_INVALID_PARAMETER;
+ AssertPtrReturn(pThis, VERR_INVALID_PARAMETER);
+ AssertMsgReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, ("%p u32Magic=%RX32\n", pThis, pThis->u32Magic), VERR_INVALID_PARAMETER);
+ rtR0SemEventMultiLnxRetain(pThis);
+
+ /*
+ * Reset it.
+ */
+ ASMAtomicAndU32(&pThis->fStateAndGen, ~RTSEMEVENTMULTILNX_STATE_MASK);
+
+ rtR0SemEventMultiLnxRelease(pThis);
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSemEventMultiReset);
+
+
+/**
+ * Worker for RTSemEventMultiWaitEx and RTSemEventMultiWaitExDebug.
+ *
+ * @returns VBox status code.
+ * @param pThis The event semaphore.
+ * @param fFlags See RTSemEventMultiWaitEx.
+ * @param uTimeout See RTSemEventMultiWaitEx.
+ * @param pSrcPos The source code position of the wait.
+ */
+static int rtR0SemEventMultiLnxWait(PRTSEMEVENTMULTIINTERNAL pThis, uint32_t fFlags, uint64_t uTimeout,
+ PCRTLOCKVALSRCPOS pSrcPos)
+{
+ uint32_t fOrgStateAndGen;
+ int rc;
+ RT_NOREF_PV(pSrcPos);
+
+ /*
+ * Validate the input.
+ */
+ AssertPtrReturn(pThis, VERR_INVALID_PARAMETER);
+ AssertMsgReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, ("%p u32Magic=%RX32\n", pThis, pThis->u32Magic), VERR_INVALID_PARAMETER);
+ AssertReturn(RTSEMWAIT_FLAGS_ARE_VALID(fFlags), VERR_INVALID_PARAMETER);
+ rtR0SemEventMultiLnxRetain(pThis);
+
+ /*
+ * Is the event already signalled or do we have to wait?
+ */
+ fOrgStateAndGen = ASMAtomicUoReadU32(&pThis->fStateAndGen);
+ if (fOrgStateAndGen & RTSEMEVENTMULTILNX_STATE_MASK)
+ rc = VINF_SUCCESS;
+ else
+ {
+ /*
+ * We have to wait.
+ */
+ RTR0SEMLNXWAIT Wait;
+ IPRT_LINUX_SAVE_EFL_AC();
+ rc = rtR0SemLnxWaitInit(&Wait, fFlags, uTimeout, &pThis->Head);
+ if (RT_SUCCESS(rc))
+ {
+ IPRT_DEBUG_SEMS_STATE(pThis, 'E');
+ for (;;)
+ {
+ /* The destruction test. */
+ if (RT_UNLIKELY(pThis->u32Magic != RTSEMEVENTMULTI_MAGIC))
+ rc = VERR_SEM_DESTROYED;
+ else
+ {
+ rtR0SemLnxWaitPrepare(&Wait);
+
+ /* Check the exit conditions. */
+ if (RT_UNLIKELY(pThis->u32Magic != RTSEMEVENTMULTI_MAGIC))
+ rc = VERR_SEM_DESTROYED;
+ else if (ASMAtomicUoReadU32(&pThis->fStateAndGen) != fOrgStateAndGen)
+ rc = VINF_SUCCESS;
+ else if (rtR0SemLnxWaitHasTimedOut(&Wait))
+ rc = VERR_TIMEOUT;
+ else if (rtR0SemLnxWaitWasInterrupted(&Wait))
+ rc = VERR_INTERRUPTED;
+ else
+ {
+ /* Do the wait and then recheck the conditions. */
+ rtR0SemLnxWaitDoIt(&Wait);
+ continue;
+ }
+ }
+ break;
+ }
+
+ rtR0SemLnxWaitDelete(&Wait);
+ IPRT_DEBUG_SEMS_STATE_RC(pThis, 'E', rc);
+ }
+ IPRT_LINUX_RESTORE_EFL_AC();
+ }
+
+ rtR0SemEventMultiLnxRelease(pThis);
+ return rc;
+}
+
+
+RTDECL(int) RTSemEventMultiWaitEx(RTSEMEVENTMULTI hEventMultiSem, uint32_t fFlags, uint64_t uTimeout)
+{
+#ifndef RTSEMEVENT_STRICT
+ return rtR0SemEventMultiLnxWait(hEventMultiSem, fFlags, uTimeout, NULL);
+#else
+ RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API();
+ return rtR0SemEventMultiLnxWait(hEventMultiSem, fFlags, uTimeout, &SrcPos);
+#endif
+}
+RT_EXPORT_SYMBOL(RTSemEventMultiWaitEx);
+
+
+RTDECL(int) RTSemEventMultiWaitExDebug(RTSEMEVENTMULTI hEventMultiSem, uint32_t fFlags, uint64_t uTimeout,
+ RTHCUINTPTR uId, RT_SRC_POS_DECL)
+{
+ RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_DEBUG_API();
+ return rtR0SemEventMultiLnxWait(hEventMultiSem, fFlags, uTimeout, &SrcPos);
+}
+RT_EXPORT_SYMBOL(RTSemEventMultiWaitExDebug);
+
+
+RTDECL(uint32_t) RTSemEventMultiGetResolution(void)
+{
+ return rtR0SemLnxWaitGetResolution();
+}
+RT_EXPORT_SYMBOL(RTSemEventMultiGetResolution);
+
+
+RTR0DECL(bool) RTSemEventMultiIsSignalSafe(void)
+{
+ return true;
+}
+RT_EXPORT_SYMBOL(RTSemEventMultiIsSignalSafe);
+
diff --git a/src/VBox/Runtime/r0drv/linux/semfastmutex-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/semfastmutex-r0drv-linux.c
new file mode 100644
index 00000000..0e2dff35
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/semfastmutex-r0drv-linux.c
@@ -0,0 +1,167 @@
+/* $Id: semfastmutex-r0drv-linux.c $ */
+/** @file
+ * IPRT - Fast Mutex Semaphores, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+#include <iprt/semaphore.h>
+#include <iprt/alloc.h>
+#include <iprt/assert.h>
+#include <iprt/asm.h>
+#include <iprt/errcore.h>
+#if defined(RT_STRICT) || defined(IPRT_DEBUG_SEMS)
+# include <iprt/thread.h>
+#endif
+
+#include "internal/magics.h"
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+/**
+ * Wrapper for the linux semaphore structure.
+ */
+typedef struct RTSEMFASTMUTEXINTERNAL
+{
+ /** Magic value (RTSEMFASTMUTEX_MAGIC). */
+ uint32_t u32Magic;
+ /** the linux semaphore. */
+ struct semaphore Semaphore;
+#if defined(RT_STRICT) || defined(IPRT_DEBUG_SEMS)
+ /** For check. */
+ RTNATIVETHREAD volatile Owner;
+#endif
+} RTSEMFASTMUTEXINTERNAL, *PRTSEMFASTMUTEXINTERNAL;
+
+
+RTDECL(int) RTSemFastMutexCreate(PRTSEMFASTMUTEX phFastMtx)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Allocate.
+ */
+ PRTSEMFASTMUTEXINTERNAL pThis;
+ pThis = (PRTSEMFASTMUTEXINTERNAL)RTMemAlloc(sizeof(*pThis));
+ if (!pThis)
+ return VERR_NO_MEMORY;
+
+ /*
+ * Initialize.
+ */
+ pThis->u32Magic = RTSEMFASTMUTEX_MAGIC;
+ sema_init(&pThis->Semaphore, 1);
+#if defined(RT_STRICT) || defined(IPRT_DEBUG_SEMS)
+ pThis->Owner = NIL_RTNATIVETHREAD;
+#endif
+
+ *phFastMtx = pThis;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSemFastMutexCreate);
+
+
+RTDECL(int) RTSemFastMutexDestroy(RTSEMFASTMUTEX hFastMtx)
+{
+ /*
+ * Validate.
+ */
+ PRTSEMFASTMUTEXINTERNAL pThis = hFastMtx;
+ if (pThis == NIL_RTSEMFASTMUTEX)
+ return VINF_SUCCESS;
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertMsgReturn(pThis->u32Magic == RTSEMFASTMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE);
+
+ ASMAtomicWriteU32(&pThis->u32Magic, RTSEMFASTMUTEX_MAGIC_DEAD);
+ RTMemFree(pThis);
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSemFastMutexDestroy);
+
+
+RTDECL(int) RTSemFastMutexRequest(RTSEMFASTMUTEX hFastMtx)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate.
+ */
+ PRTSEMFASTMUTEXINTERNAL pThis = hFastMtx;
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertMsgReturn(pThis->u32Magic == RTSEMFASTMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE);
+
+ IPRT_DEBUG_SEMS_STATE(pThis, 'd');
+ down(&pThis->Semaphore);
+#if defined(RT_STRICT) || defined(IPRT_DEBUG_SEMS)
+ IPRT_DEBUG_SEMS_STATE(pThis, 'o');
+ AssertRelease(pThis->Owner == NIL_RTNATIVETHREAD);
+ ASMAtomicUoWriteSize(&pThis->Owner, RTThreadNativeSelf());
+#endif
+
+ IPRT_LINUX_RESTORE_EFL_ONLY_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSemFastMutexRequest);
+
+
+RTDECL(int) RTSemFastMutexRelease(RTSEMFASTMUTEX hFastMtx)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate.
+ */
+ PRTSEMFASTMUTEXINTERNAL pThis = hFastMtx;
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertMsgReturn(pThis->u32Magic == RTSEMFASTMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE);
+
+#if defined(RT_STRICT) || defined(IPRT_DEBUG_SEMS)
+ AssertRelease(pThis->Owner == RTThreadNativeSelf());
+ ASMAtomicUoWriteSize(&pThis->Owner, NIL_RTNATIVETHREAD);
+#endif
+ up(&pThis->Semaphore);
+ IPRT_DEBUG_SEMS_STATE(pThis, 'u');
+
+ IPRT_LINUX_RESTORE_EFL_ONLY_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSemFastMutexRelease);
+
diff --git a/src/VBox/Runtime/r0drv/linux/semmutex-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/semmutex-r0drv-linux.c
new file mode 100644
index 00000000..16fcacfe
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/semmutex-r0drv-linux.c
@@ -0,0 +1,431 @@
+/* $Id: semmutex-r0drv-linux.c $ */
+/** @file
+ * IPRT - Mutex Semaphores, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define RTSEMMUTEX_WITHOUT_REMAPPING
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+#include <iprt/semaphore.h>
+
+#include <iprt/assert.h>
+#include <iprt/asm.h>
+#include <iprt/mem.h>
+#include <iprt/err.h>
+#include <iprt/list.h>
+
+#include "internal/magics.h"
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+typedef struct RTSEMMUTEXLNXWAITER
+{
+ /** The list entry. */
+ RTLISTNODE ListEntry;
+ /** The waiting task. */
+ struct task_struct *pTask;
+ /** Why did we wake up? */
+ enum
+ {
+ /** Wakeup to take the semaphore. */
+ RTSEMMUTEXLNXWAITER_WAKEUP,
+ /** Mutex is being destroyed. */
+ RTSEMMUTEXLNXWAITER_DESTROYED,
+ /** Some other reason. */
+ RTSEMMUTEXLNXWAITER_OTHER
+ } volatile enmReason;
+} RTSEMMUTEXLNXWAITER, *PRTSEMMUTEXLNXWAITER;
+
+/**
+ * Wrapper for the linux semaphore structure.
+ */
+typedef struct RTSEMMUTEXINTERNAL
+{
+ /** Magic value (RTSEMMUTEX_MAGIC). */
+ uint32_t u32Magic;
+ /** The number of recursions. */
+ uint32_t cRecursions;
+ /** The list of waiting threads. */
+ RTLISTANCHOR WaiterList;
+ /** The current owner, NULL if none. */
+ struct task_struct *pOwnerTask;
+ /** The number of references to this piece of memory. This is used to
+ * prevent it from being kicked from underneath us while waiting. */
+ uint32_t volatile cRefs;
+ /** The spinlock protecting the members and falling asleep. */
+ spinlock_t Spinlock;
+} RTSEMMUTEXINTERNAL, *PRTSEMMUTEXINTERNAL;
+
+
+RTDECL(int) RTSemMutexCreate(PRTSEMMUTEX phMtx)
+{
+ int rc = VINF_SUCCESS;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Allocate.
+ */
+ PRTSEMMUTEXINTERNAL pThis;
+ pThis = (PRTSEMMUTEXINTERNAL)RTMemAlloc(sizeof(*pThis));
+ if (pThis)
+ {
+ /*
+ * Initialize.
+ */
+ pThis->u32Magic = RTSEMMUTEX_MAGIC;
+ pThis->cRecursions = 0;
+ pThis->pOwnerTask = NULL;
+ pThis->cRefs = 1;
+ RTListInit(&pThis->WaiterList);
+ spin_lock_init(&pThis->Spinlock);
+
+ *phMtx = pThis;
+ }
+ else
+ rc = VERR_NO_MEMORY;
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+}
+RT_EXPORT_SYMBOL(RTSemMutexCreate);
+
+
+RTDECL(int) RTSemMutexDestroy(RTSEMMUTEX hMtx)
+{
+ PRTSEMMUTEXINTERNAL pThis = hMtx;
+ PRTSEMMUTEXLNXWAITER pCur;
+ unsigned long fSavedIrq;
+
+ /*
+ * Validate.
+ */
+ if (pThis == NIL_RTSEMMUTEX)
+ return VINF_SUCCESS;
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertMsgReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE);
+
+ /*
+ * Kill it, kick waiters and release it.
+ */
+ AssertReturn(ASMAtomicCmpXchgU32(&pThis->u32Magic, RTSEMMUTEX_MAGIC_DEAD, RTSEMMUTEX_MAGIC), VERR_INVALID_HANDLE);
+
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ spin_lock_irqsave(&pThis->Spinlock, fSavedIrq);
+ RTListForEach(&pThis->WaiterList, pCur, RTSEMMUTEXLNXWAITER, ListEntry)
+ {
+ pCur->enmReason = RTSEMMUTEXLNXWAITER_DESTROYED;
+ wake_up_process(pCur->pTask);
+ }
+
+ if (ASMAtomicDecU32(&pThis->cRefs) != 0)
+ spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq);
+ else
+ {
+ spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq);
+ RTMemFree(pThis);
+ }
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSemMutexDestroy);
+
+
+/**
+ * Worker for rtSemMutexLinuxRequest that handles the case where we go to sleep.
+ *
+ * @returns VINF_SUCCESS, VERR_INTERRUPTED, VERR_TIMEOUT or VERR_SEM_DESTROYED.
+ * Returns without owning the spinlock.
+ * @param pThis The mutex instance.
+ * @param cMillies The timeout.
+ * @param fInterruptible The wait type.
+ * @param fSavedIrq The saved IRQ flags.
+ */
+static int rtSemMutexLinuxRequestSleep(PRTSEMMUTEXINTERNAL pThis, RTMSINTERVAL cMillies,
+ bool fInterruptible, unsigned long fSavedIrq)
+{
+ struct task_struct *pSelf = current;
+ int rc = VERR_TIMEOUT;
+ long lTimeout = cMillies == RT_INDEFINITE_WAIT ? MAX_SCHEDULE_TIMEOUT : msecs_to_jiffies(cMillies);
+ RTSEMMUTEXLNXWAITER Waiter;
+
+ IPRT_DEBUG_SEMS_STATE(pThis, 'm');
+
+ /*
+ * Grab a reference to the mutex and add ourselves to the waiter list.
+ */
+ ASMAtomicIncU32(&pThis->cRefs);
+
+ Waiter.pTask = pSelf;
+ Waiter.enmReason = RTSEMMUTEXLNXWAITER_OTHER;
+ RTListAppend(&pThis->WaiterList, &Waiter.ListEntry);
+
+ /*
+ * Do the waiting.
+ */
+ for (;;)
+ {
+ /* Check signal and timeout conditions. */
+ if ( fInterruptible
+ && signal_pending(pSelf))
+ {
+ rc = VERR_INTERRUPTED;
+ break;
+ }
+
+ if (!lTimeout)
+ break;
+
+ /* Go to sleep. */
+ set_current_state(fInterruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&pThis->Spinlock);
+
+ lTimeout = schedule_timeout(lTimeout);
+
+ spin_lock_irq(&pThis->Spinlock);
+ set_current_state(TASK_RUNNING);
+
+ /* Did someone wake us up? */
+ if (Waiter.enmReason == RTSEMMUTEXLNXWAITER_WAKEUP)
+ {
+ Assert(pThis->cRecursions == 0);
+ pThis->cRecursions = 1;
+ pThis->pOwnerTask = pSelf;
+ rc = VINF_SUCCESS;
+ break;
+ }
+
+ /* Is the mutex being destroyed? */
+ if (RT_UNLIKELY( Waiter.enmReason == RTSEMMUTEXLNXWAITER_DESTROYED
+ || pThis->u32Magic != RTSEMMUTEX_MAGIC))
+ {
+ rc = VERR_SEM_DESTROYED;
+ break;
+ }
+ }
+
+ /*
+ * Unlink ourself from the waiter list, dereference the mutex and exit the
+ * lock. We might have to free the mutex if it was the destroyed.
+ */
+ RTListNodeRemove(&Waiter.ListEntry);
+ IPRT_DEBUG_SEMS_STATE_RC(pThis, 'M', rc);
+
+ if (RT_LIKELY(ASMAtomicDecU32(&pThis->cRefs) != 0))
+ spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq);
+ else
+ {
+ Assert(RT_FAILURE_NP(rc));
+ spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq);
+ RTMemFree(pThis);
+ }
+ return rc;
+}
+
+
+/**
+ * Internal worker.
+ */
+DECLINLINE(int) rtSemMutexLinuxRequest(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies, bool fInterruptible)
+{
+ PRTSEMMUTEXINTERNAL pThis = hMutexSem;
+ struct task_struct *pSelf = current;
+ unsigned long fSavedIrq;
+ int rc;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate.
+ */
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertMsgReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE);
+ Assert(pThis->cRefs >= 1);
+
+ /*
+ * Lock it and check if it's a recursion.
+ */
+ spin_lock_irqsave(&pThis->Spinlock, fSavedIrq);
+ if (pThis->pOwnerTask == pSelf)
+ {
+ pThis->cRecursions++;
+ Assert(pThis->cRecursions > 1);
+ Assert(pThis->cRecursions < 256);
+ rc = VINF_SUCCESS;
+ }
+ /*
+ * Not a recursion, maybe it's not owned by anyone then?
+ */
+ else if ( pThis->pOwnerTask == NULL
+ && RTListIsEmpty(&pThis->WaiterList))
+ {
+ Assert(pThis->cRecursions == 0);
+ pThis->cRecursions = 1;
+ pThis->pOwnerTask = pSelf;
+ rc = VINF_SUCCESS;
+ }
+ /*
+ * Was it a polling call?
+ */
+ else if (cMillies == 0)
+ rc = VERR_TIMEOUT;
+ /*
+ * No, so go to sleep.
+ */
+ else
+ {
+ rc = rtSemMutexLinuxRequestSleep(pThis, cMillies, fInterruptible, fSavedIrq);
+ IPRT_LINUX_RESTORE_EFL_ONLY_AC();
+ return rc;
+ }
+
+ IPRT_DEBUG_SEMS_STATE_RC(pThis, 'M', rc);
+ spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq);
+ IPRT_LINUX_RESTORE_EFL_ONLY_AC();
+ return rc;
+}
+
+
+RTDECL(int) RTSemMutexRequest(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies)
+{
+ return rtSemMutexLinuxRequest(hMutexSem, cMillies, false /*fInterruptible*/);
+}
+RT_EXPORT_SYMBOL(RTSemMutexRequest);
+
+
+RTDECL(int) RTSemMutexRequestDebug(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies, RTHCUINTPTR uId, RT_SRC_POS_DECL)
+{
+ RT_NOREF_PV(uId); RT_SRC_POS_NOREF();
+ return RTSemMutexRequest(hMutexSem, cMillies);
+}
+RT_EXPORT_SYMBOL(RTSemMutexRequestDebug);
+
+
+RTDECL(int) RTSemMutexRequestNoResume(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies)
+{
+ return rtSemMutexLinuxRequest(hMutexSem, cMillies, true /*fInterruptible*/);
+}
+RT_EXPORT_SYMBOL(RTSemMutexRequestNoResume);
+
+
+RTDECL(int) RTSemMutexRequestNoResumeDebug(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies, RTHCUINTPTR uId, RT_SRC_POS_DECL)
+{
+ RT_NOREF_PV(uId); RT_SRC_POS_NOREF();
+ return RTSemMutexRequestNoResume(hMutexSem, cMillies);
+}
+RT_EXPORT_SYMBOL(RTSemMutexRequestNoResumeDebug);
+
+
+RTDECL(int) RTSemMutexRelease(RTSEMMUTEX hMtx)
+{
+ PRTSEMMUTEXINTERNAL pThis = hMtx;
+ struct task_struct *pSelf = current;
+ unsigned long fSavedIrq;
+ int rc;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate.
+ */
+ AssertPtrReturn(pThis, VERR_INVALID_HANDLE);
+ AssertMsgReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE);
+ Assert(pThis->cRefs >= 1);
+
+ /*
+ * Take the lock and release one recursion.
+ */
+ spin_lock_irqsave(&pThis->Spinlock, fSavedIrq);
+ if (pThis->pOwnerTask == pSelf)
+ {
+ Assert(pThis->cRecursions > 0);
+ if (--pThis->cRecursions == 0)
+ {
+ pThis->pOwnerTask = NULL;
+
+ /* anyone to wake up? */
+ if (!RTListIsEmpty(&pThis->WaiterList))
+ {
+ PRTSEMMUTEXLNXWAITER pWaiter = RTListGetFirst(&pThis->WaiterList, RTSEMMUTEXLNXWAITER, ListEntry);
+ pWaiter->enmReason = RTSEMMUTEXLNXWAITER_WAKEUP;
+ wake_up_process(pWaiter->pTask);
+ }
+ IPRT_DEBUG_SEMS_STATE(pThis, 'u');
+ }
+ rc = VINF_SUCCESS;
+ }
+ else
+ rc = VERR_NOT_OWNER;
+ spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq);
+
+ AssertRC(rc);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+}
+RT_EXPORT_SYMBOL(RTSemMutexRelease);
+
+
+RTDECL(bool) RTSemMutexIsOwned(RTSEMMUTEX hMutexSem)
+{
+ PRTSEMMUTEXINTERNAL pThis = hMutexSem;
+ unsigned long fSavedIrq;
+ bool fOwned;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate.
+ */
+ AssertPtrReturn(pThis, false);
+ AssertMsgReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), false);
+ Assert(pThis->cRefs >= 1);
+
+ /*
+ * Take the lock and release one recursion.
+ */
+ spin_lock_irqsave(&pThis->Spinlock, fSavedIrq);
+ fOwned = pThis->pOwnerTask != NULL;
+ spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq);
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return fOwned;
+
+}
+RT_EXPORT_SYMBOL(RTSemMutexIsOwned);
+
diff --git a/src/VBox/Runtime/r0drv/linux/spinlock-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/spinlock-r0drv-linux.c
new file mode 100644
index 00000000..c5bdded2
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/spinlock-r0drv-linux.c
@@ -0,0 +1,196 @@
+/* $Id: spinlock-r0drv-linux.c $ */
+/** @file
+ * IPRT - Spinlocks, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+#include <iprt/spinlock.h>
+
+#include <iprt/asm.h>
+#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
+# include <iprt/asm-amd64-x86.h>
+#endif
+#include <iprt/assert.h>
+#include <iprt/errcore.h>
+#include <iprt/mem.h>
+#include <iprt/mp.h>
+#include <iprt/thread.h>
+#include "internal/magics.h"
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+/**
+ * Wrapper for the spinlock_t structure.
+ */
+typedef struct RTSPINLOCKINTERNAL
+{
+ /** Spinlock magic value (RTSPINLOCK_MAGIC). */
+ uint32_t volatile u32Magic;
+ /** The spinlock creation flags. */
+ uint32_t fFlags;
+ /** The saved interrupt flag. */
+ unsigned long volatile fIntSaved;
+ /** The linux spinlock structure. */
+ spinlock_t Spinlock;
+#ifdef RT_MORE_STRICT
+ /** The idAssertCpu variable before acquring the lock for asserting after
+ * releasing the spinlock. */
+ RTCPUID volatile idAssertCpu;
+ /** The CPU that owns the lock. */
+ RTCPUID volatile idCpuOwner;
+#endif
+} RTSPINLOCKINTERNAL, *PRTSPINLOCKINTERNAL;
+
+
+
+RTDECL(int) RTSpinlockCreate(PRTSPINLOCK pSpinlock, uint32_t fFlags, const char *pszName)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ PRTSPINLOCKINTERNAL pThis;
+ AssertReturn(fFlags == RTSPINLOCK_FLAGS_INTERRUPT_SAFE || fFlags == RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, VERR_INVALID_PARAMETER);
+ RT_NOREF_PV(pszName);
+
+ /*
+ * Allocate.
+ */
+ Assert(sizeof(RTSPINLOCKINTERNAL) > sizeof(void *));
+ pThis = (PRTSPINLOCKINTERNAL)RTMemAlloc(sizeof(*pThis));
+ if (!pThis)
+ return VERR_NO_MEMORY;
+ /*
+ * Initialize and return.
+ */
+ pThis->u32Magic = RTSPINLOCK_MAGIC;
+ pThis->fFlags = fFlags;
+ pThis->fIntSaved = 0;
+#ifdef RT_MORE_STRICT
+ pThis->idCpuOwner = NIL_RTCPUID;
+ pThis->idAssertCpu = NIL_RTCPUID;
+#endif
+
+ spin_lock_init(&pThis->Spinlock);
+
+ *pSpinlock = pThis;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSpinlockCreate);
+
+
+RTDECL(int) RTSpinlockDestroy(RTSPINLOCK Spinlock)
+{
+ /*
+ * Validate input.
+ */
+ PRTSPINLOCKINTERNAL pThis = (PRTSPINLOCKINTERNAL)Spinlock;
+ if (!pThis)
+ return VERR_INVALID_PARAMETER;
+ if (pThis->u32Magic != RTSPINLOCK_MAGIC)
+ {
+ AssertMsgFailed(("Invalid spinlock %p magic=%#x\n", pThis, pThis->u32Magic));
+ return VERR_INVALID_PARAMETER;
+ }
+
+ ASMAtomicIncU32(&pThis->u32Magic);
+ RTMemFree(pThis);
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTSpinlockDestroy);
+
+
+RTDECL(void) RTSpinlockAcquire(RTSPINLOCK Spinlock)
+{
+ PRTSPINLOCKINTERNAL pThis = (PRTSPINLOCKINTERNAL)Spinlock;
+ IPRT_LINUX_SAVE_EFL_AC();
+ RT_ASSERT_PREEMPT_CPUID_VAR();
+ AssertMsg(pThis && pThis->u32Magic == RTSPINLOCK_MAGIC,
+ ("pThis=%p u32Magic=%08x\n", pThis, pThis ? (int)pThis->u32Magic : 0));
+
+#ifdef CONFIG_PROVE_LOCKING
+ lockdep_off();
+#endif
+ if (pThis->fFlags & RTSPINLOCK_FLAGS_INTERRUPT_SAFE)
+ {
+ unsigned long fIntSaved;
+ spin_lock_irqsave(&pThis->Spinlock, fIntSaved);
+ pThis->fIntSaved = fIntSaved;
+ }
+ else
+ spin_lock(&pThis->Spinlock);
+#ifdef CONFIG_PROVE_LOCKING
+ lockdep_on();
+#endif
+
+ IPRT_LINUX_RESTORE_EFL_ONLY_AC();
+ RT_ASSERT_PREEMPT_CPUID_SPIN_ACQUIRED(pThis);
+}
+RT_EXPORT_SYMBOL(RTSpinlockAcquire);
+
+
+RTDECL(void) RTSpinlockRelease(RTSPINLOCK Spinlock)
+{
+ PRTSPINLOCKINTERNAL pThis = (PRTSPINLOCKINTERNAL)Spinlock;
+ IPRT_LINUX_SAVE_EFL_AC(); /* spin_unlock* may preempt and trash eflags.ac. */
+ RT_ASSERT_PREEMPT_CPUID_SPIN_RELEASE_VARS();
+ AssertMsg(pThis && pThis->u32Magic == RTSPINLOCK_MAGIC,
+ ("pThis=%p u32Magic=%08x\n", pThis, pThis ? (int)pThis->u32Magic : 0));
+ RT_ASSERT_PREEMPT_CPUID_SPIN_RELEASE(pThis);
+
+#ifdef CONFIG_PROVE_LOCKING
+ lockdep_off();
+#endif
+ if (pThis->fFlags & RTSPINLOCK_FLAGS_INTERRUPT_SAFE)
+ {
+ unsigned long fIntSaved = pThis->fIntSaved;
+ pThis->fIntSaved = 0;
+ spin_unlock_irqrestore(&pThis->Spinlock, fIntSaved);
+ }
+ else
+ spin_unlock(&pThis->Spinlock);
+#ifdef CONFIG_PROVE_LOCKING
+ lockdep_on();
+#endif
+
+ IPRT_LINUX_RESTORE_EFL_ONLY_AC();
+ RT_ASSERT_PREEMPT_CPUID();
+}
+RT_EXPORT_SYMBOL(RTSpinlockRelease);
+
diff --git a/src/VBox/Runtime/r0drv/linux/string.h b/src/VBox/Runtime/r0drv/linux/string.h
new file mode 100644
index 00000000..9e5a6941
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/string.h
@@ -0,0 +1,70 @@
+/* $Id: string.h $ */
+/** @file
+ * IPRT - wrapper for the linux kernel asm/string.h.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+#ifndef IPRT_INCLUDED_SRC_r0drv_linux_string_h
+#define IPRT_INCLUDED_SRC_r0drv_linux_string_h
+#ifndef RT_WITHOUT_PRAGMA_ONCE
+# pragma once
+#endif
+
+#include <iprt/cdefs.h>
+
+RT_C_DECLS_BEGIN
+#ifndef bool /* Linux 2.6.19 C++ nightmare */
+#define bool bool_type
+#define true true_type
+#define false false_type
+#define _Bool int
+#define bool_type_r0drv_string_h__
+#endif
+#include <linux/types.h>
+#include <linux/string.h>
+#ifdef bool_type_r0drv_string_h__
+#undef bool
+#undef true
+#undef false
+#undef bool_type_r0drv_string_h__
+#endif
+char *strpbrk(const char *pszStr, const char *pszChars)
+#if defined(__THROW)
+ __THROW
+#endif
+ ;
+
+RT_C_DECLS_END
+
+#endif /* !IPRT_INCLUDED_SRC_r0drv_linux_string_h */
+
diff --git a/src/VBox/Runtime/r0drv/linux/the-linux-kernel.h b/src/VBox/Runtime/r0drv/linux/the-linux-kernel.h
new file mode 100644
index 00000000..8e70f992
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/the-linux-kernel.h
@@ -0,0 +1,494 @@
+/* $Id: the-linux-kernel.h $ */
+/** @file
+ * IPRT - Include all necessary headers for the Linux kernel.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+#ifndef IPRT_INCLUDED_SRC_r0drv_linux_the_linux_kernel_h
+#define IPRT_INCLUDED_SRC_r0drv_linux_the_linux_kernel_h
+#ifndef RT_WITHOUT_PRAGMA_ONCE
+# pragma once
+#endif
+
+/*
+ * Include iprt/types.h to install the bool wrappers.
+ * Then use the linux bool type for all the stuff include here.
+ */
+#include <iprt/types.h>
+#define bool linux_bool
+
+#if RT_GNUC_PREREQ(4, 6)
+# pragma GCC diagnostic push
+#endif
+#if RT_GNUC_PREREQ(4, 2)
+# pragma GCC diagnostic ignored "-Wunused-parameter"
+# if !defined(__cplusplus) && RT_GNUC_PREREQ(4, 3)
+# pragma GCC diagnostic ignored "-Wold-style-declaration" /* 2.6.18-411.0.0.0.1.el5/build/include/asm/apic.h:110: warning: 'inline' is not at beginning of declaration [-Wold-style-declaration] */
+# endif
+#endif
+
+
+#include <iprt/linux/version.h>
+#if RTLNX_VER_MIN(2,6,33)
+# include <generated/autoconf.h>
+#else
+# ifndef AUTOCONF_INCLUDED
+# include <linux/autoconf.h>
+# endif
+#endif
+
+/* We only support 2.4 and 2.6 series kernels */
+#if RTLNX_VER_MAX(2,4,0)
+# error Sorry, we do not support 2.3 and earlier kernels.
+#endif
+#if RTLNX_VER_MIN(2,5,0) && RTLNX_VER_MAX(2,6,0)
+# error Sorry, we do not support 2.5 series kernels (might work though).
+#endif
+
+#if defined(CONFIG_MODVERSIONS) && !defined(MODVERSIONS)
+# define MODVERSIONS
+# if RTLNX_VER_MAX(2,5,71)
+# include <linux/modversions.h>
+# endif
+#endif
+#ifndef KBUILD_STR
+# if RTLNX_VER_MAX(2,6,16)
+# define KBUILD_STR(s) s
+# else
+# define KBUILD_STR(s) #s
+# endif
+#endif
+# if RTLNX_VER_MIN(3,3,0)
+# include <linux/kconfig.h> /* for macro IS_ENABLED */
+# endif
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#if RTLNX_VER_MIN(2,6,27)
+# include <linux/semaphore.h>
+#else /* older kernels */
+# include <asm/semaphore.h>
+#endif /* older kernels */
+#include <linux/module.h>
+#if RTLNX_VER_MIN(2,6,0)
+# include <linux/moduleparam.h>
+#endif
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#if RTLNX_VER_MIN(2,6,0)
+# include <linux/namei.h>
+#endif
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sched.h>
+
+#if RTLNX_VER_RANGE(3,9,23, 3,9,31)
+# include <linux/splice.h>
+#endif
+
+#if RTLNX_VER_MIN(3,9,0)
+# include <linux/sched/rt.h>
+#endif
+#if RTLNX_VER_MIN(4,11,0)
+# include <linux/sched/signal.h>
+# include <linux/sched/types.h>
+#endif
+#if RTLNX_VER_MIN(2,6,7)
+# include <linux/jiffies.h>
+#endif
+#if RTLNX_VER_MIN(2,6,16)
+# include <linux/ktime.h>
+# include <linux/hrtimer.h>
+#endif
+#include <linux/wait.h>
+#if RTLNX_VER_MIN(2,5,71)
+# include <linux/cpu.h>
+# include <linux/notifier.h>
+#endif
+#if RTLNX_VER_MIN(5,1,0)
+# include <uapi/linux/mman.h>
+#endif
+/* For the basic additions module */
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/completion.h>
+#include <linux/compiler.h>
+#if RTLNX_VER_MIN(5,9,0) || RTLNX_SUSE_MAJ_PREREQ(15,3) /* linux/fs.h defined HAVE_UNLOCKED_IOCTL from 2.6.11 up to 5.9 (also 5.3.18-56 in SLES15-SP3), when it became an implicit assumption. */
+# define HAVE_UNLOCKED_IOCTL 1 /* We use this in a couple of places, so for now just define it for 5.9+ too. */
+#endif
+#if !defined(HAVE_UNLOCKED_IOCTL) && RTLNX_VER_MAX(2,6,38)
+# include <linux/smp_lock.h>
+#endif
+/* For the shared folders module */
+#include <linux/vmalloc.h>
+#define wchar_t linux_wchar_t
+#include <linux/nls.h>
+#undef wchar_t
+#include <asm/mman.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+
+/* For thread-context hooks. */
+#if RTLNX_VER_MIN(2,6,18) && defined(CONFIG_PREEMPT_NOTIFIERS)
+# include <linux/preempt.h>
+#endif
+
+/* for workqueue / task queues. */
+#if RTLNX_VER_MIN(2,5,41)
+# include <linux/workqueue.h>
+#else
+# include <linux/tqueue.h>
+#endif
+
+#if RTLNX_VER_MIN(2,6,4)
+# include <linux/kthread.h>
+#endif
+
+/* for cr4_init_shadow() / cpu_tlbstate. */
+#if RTLNX_VER_MIN(3,20,0)
+# include <asm/tlbflush.h>
+#endif
+
+/* for set_pages_x() */
+#if RTLNX_VER_MIN(4,12,0)
+# include <asm/set_memory.h>
+#endif
+
+/* for __flush_tlb_all() */
+#if RTLNX_VER_MIN(2,6,28) && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
+# include <asm/tlbflush.h>
+#endif
+
+/* for kernel_fpu_begin / kernel_fpu_end() */
+#if RTLNX_VER_MIN(4,2,0)
+# include <asm/fpu/api.h>
+#endif
+
+#if RTLNX_VER_MIN(3,7,0)
+# include <asm/smap.h>
+#else
+static inline void clac(void) { }
+static inline void stac(void) { }
+#endif
+
+#if RTLNX_VER_MAX(2,6,0)
+# ifndef page_to_pfn
+# define page_to_pfn(page) ((page) - mem_map)
+# endif
+#endif
+
+#ifndef DEFINE_WAIT
+# define DEFINE_WAIT(name) DECLARE_WAITQUEUE(name, current)
+#endif
+
+#ifndef __GFP_NOWARN
+# define __GFP_NOWARN 0
+#endif
+
+/*
+ * 2.4 / early 2.6 compatibility wrappers
+ */
+#if RTLNX_VER_MAX(2,6,7)
+
+# ifndef MAX_JIFFY_OFFSET
+# define MAX_JIFFY_OFFSET ((~0UL >> 1)-1)
+# endif
+
+# if RTLNX_VER_MAX(2,4,29) || RTLNX_VER_MIN(2,6,0)
+
+DECLINLINE(unsigned int) jiffies_to_msecs(unsigned long cJiffies)
+{
+# if HZ <= 1000 && !(1000 % HZ)
+ return (1000 / HZ) * cJiffies;
+# elif HZ > 1000 && !(HZ % 1000)
+ return (cJiffies + (HZ / 1000) - 1) / (HZ / 1000);
+# else
+ return (cJiffies * 1000) / HZ;
+# endif
+}
+
+DECLINLINE(unsigned long) msecs_to_jiffies(unsigned int cMillies)
+{
+# if HZ > 1000
+ if (cMillies > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+# endif
+# if HZ <= 1000 && !(1000 % HZ)
+ return (cMillies + (1000 / HZ) - 1) / (1000 / HZ);
+# elif HZ > 1000 && !(HZ % 1000)
+ return cMillies * (HZ / 1000);
+# else
+ return (cMillies * HZ + 999) / 1000;
+# endif
+}
+
+# endif /* < 2.4.29 || >= 2.6.0 */
+
+#endif /* < 2.6.7 */
+
+/*
+ * 2.4 compatibility wrappers
+ */
+#if RTLNX_VER_MAX(2,6,0)
+
+# define prepare_to_wait(q, wait, state) \
+ do { \
+ add_wait_queue(q, wait); \
+ set_current_state(state); \
+ } while (0)
+
+# define after_wait(wait) \
+ do { \
+ list_del_init(&(wait)->task_list); \
+ } while (0)
+
+# define finish_wait(q, wait) \
+ do { \
+ set_current_state(TASK_RUNNING); \
+ remove_wait_queue(q, wait); \
+ } while (0)
+
+#else /* >= 2.6.0 */
+
+# define after_wait(wait) do {} while (0)
+
+#endif /* >= 2.6.0 */
+
+/** @def TICK_NSEC
+ * The time between ticks in nsec */
+#ifndef TICK_NSEC
+# define TICK_NSEC (1000000000UL / HZ)
+#endif
+
+/*
+ * This sucks soooo badly on x86! Why don't they export __PAGE_KERNEL_EXEC so PAGE_KERNEL_EXEC would be usable?
+ */
+#if RTLNX_VER_MIN(2,6,8) && defined(RT_ARCH_AMD64)
+# define MY_PAGE_KERNEL_EXEC PAGE_KERNEL_EXEC
+#elif RTLNX_VER_MIN(2,6,8) && defined(PAGE_KERNEL_EXEC) && defined(CONFIG_X86_PAE)
+# ifdef __PAGE_KERNEL_EXEC
+ /* >= 2.6.27 */
+# define MY_PAGE_KERNEL_EXEC __pgprot(boot_cpu_has(X86_FEATURE_PGE) ? __PAGE_KERNEL_EXEC | _PAGE_GLOBAL : __PAGE_KERNEL_EXEC)
+# else
+# define MY_PAGE_KERNEL_EXEC __pgprot(boot_cpu_has(X86_FEATURE_PGE) ? _PAGE_KERNEL_EXEC | _PAGE_GLOBAL : _PAGE_KERNEL_EXEC)
+# endif
+#else
+# define MY_PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
+
+/*
+ * The redhat hack section.
+ * - The current hacks are for 2.4.21-15.EL only.
+ */
+#ifndef NO_REDHAT_HACKS
+/* accounting. */
+# if RTLNX_VER_MAX(2,6,0)
+# ifdef VM_ACCOUNT
+# define USE_RHEL4_MUNMAP
+# endif
+# endif
+
+/* backported remap_page_range. */
+# if RTLNX_VER_MAX(2,6,0)
+# include <asm/tlb.h>
+# ifdef tlb_vma /* probably not good enough... */
+# define HAVE_26_STYLE_REMAP_PAGE_RANGE 1
+# endif
+# endif
+
+# ifndef RT_ARCH_AMD64
+/* In 2.6.9-22.ELsmp we have to call change_page_attr() twice when changing
+ * the page attributes from PAGE_KERNEL to something else, because there appears
+ * to be a bug in one of the many patches that redhat applied.
+ * It should be safe to do this on less buggy linux kernels too. ;-)
+ */
+# define MY_CHANGE_PAGE_ATTR(pPages, cPages, prot) \
+ do { \
+ if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) \
+ change_page_attr(pPages, cPages, prot); \
+ change_page_attr(pPages, cPages, prot); \
+ } while (0)
+# endif /* !RT_ARCH_AMD64 */
+#endif /* !NO_REDHAT_HACKS */
+
+#ifndef MY_CHANGE_PAGE_ATTR
+# ifdef RT_ARCH_AMD64 /** @todo This is a cheap hack, but it'll get around that 'else BUG();' in __change_page_attr(). */
+# define MY_CHANGE_PAGE_ATTR(pPages, cPages, prot) \
+ do { \
+ change_page_attr(pPages, cPages, PAGE_KERNEL_NOCACHE); \
+ change_page_attr(pPages, cPages, prot); \
+ } while (0)
+# else
+# define MY_CHANGE_PAGE_ATTR(pPages, cPages, prot) change_page_attr(pPages, cPages, prot)
+# endif
+#endif
+
+#if RTLNX_VER_MIN(2,6,25)
+# if RTLNX_VER_MAX(5,4,0) /* The interface was removed, but we only need it for < 2.4.22, so who cares. */
+# define MY_SET_PAGES_EXEC(pPages, cPages) set_pages_x(pPages, cPages)
+# define MY_SET_PAGES_NOEXEC(pPages, cPages) set_pages_nx(pPages, cPages)
+# endif
+#else
+# define MY_SET_PAGES_EXEC(pPages, cPages) \
+ do { \
+ if (pgprot_val(MY_PAGE_KERNEL_EXEC) != pgprot_val(PAGE_KERNEL)) \
+ MY_CHANGE_PAGE_ATTR(pPages, cPages, MY_PAGE_KERNEL_EXEC); \
+ } while (0)
+# define MY_SET_PAGES_NOEXEC(pPages, cPages) \
+ do { \
+ if (pgprot_val(MY_PAGE_KERNEL_EXEC) != pgprot_val(PAGE_KERNEL)) \
+ MY_CHANGE_PAGE_ATTR(pPages, cPages, PAGE_KERNEL); \
+ } while (0)
+#endif
+
+/** @def ONE_MSEC_IN_JIFFIES
+ * The number of jiffies that make up 1 millisecond. Must be at least 1! */
+#if HZ <= 1000
+# define ONE_MSEC_IN_JIFFIES 1
+#elif !(HZ % 1000)
+# define ONE_MSEC_IN_JIFFIES (HZ / 1000)
+#else
+# define ONE_MSEC_IN_JIFFIES ((HZ + 999) / 1000)
+# error "HZ is not a multiple of 1000, the GIP stuff won't work right!"
+#endif
+
+/*
+ * Stop using the linux bool type.
+ */
+#undef bool
+
+#if RT_GNUC_PREREQ(4, 6)
+# pragma GCC diagnostic pop
+#endif
+
+/*
+ * There are post-2.6.24 kernels (confusingly with unchanged version number)
+ * which eliminate macros which were marked as deprecated.
+ */
+#ifndef __attribute_used__
+#define __attribute_used__ __used
+#endif
+
+/**
+ * Hack for shortening pointers on linux so we can stuff more stuff into the
+ * task_struct::comm field. This is used by the semaphore code but put here
+ * because we don't have any better place atm. Don't use outside IPRT, please.
+ */
+#ifdef RT_ARCH_AMD64
+# define IPRT_DEBUG_SEMS_ADDRESS(addr) ( ((long)(addr) & (long)~UINT64_C(0xfffffff000000000)) )
+#else
+# define IPRT_DEBUG_SEMS_ADDRESS(addr) ( (long)(addr) )
+#endif
+
+/**
+ * Puts semaphore info into the task_struct::comm field if IPRT_DEBUG_SEMS is
+ * defined.
+ */
+#ifdef IPRT_DEBUG_SEMS
+# define IPRT_DEBUG_SEMS_STATE(pThis, chState) \
+ snprintf(current->comm, sizeof(current->comm), "%c%lx", (chState), IPRT_DEBUG_SEMS_ADDRESS(pThis));
+#else
+# define IPRT_DEBUG_SEMS_STATE(pThis, chState) do { } while (0)
+#endif
+
+/**
+ * Puts semaphore info into the task_struct::comm field if IPRT_DEBUG_SEMS is
+ * defined.
+ */
+#ifdef IPRT_DEBUG_SEMS
+# define IPRT_DEBUG_SEMS_STATE_RC(pThis, chState, rc) \
+ snprintf(current->comm, sizeof(current->comm), "%c%lx:%d", (chState), IPRT_DEBUG_SEMS_ADDRESS(pThis), rc);
+#else
+# define IPRT_DEBUG_SEMS_STATE_RC(pThis, chState, rc) do { } while (0)
+#endif
+
+/** @name Macros for preserving EFLAGS.AC on 3.19+/amd64 paranoid.
+ * The AMD 64 switch_to in macro in arch/x86/include/asm/switch_to.h stopped
+ * restoring flags.
+ * @{ */
+#if (defined(CONFIG_X86_SMAP) || defined(RT_STRICT) || defined(IPRT_WITH_EFLAGS_AC_PRESERVING)) \
+ && !defined(IPRT_WITHOUT_EFLAGS_AC_PRESERVING)
+# include <iprt/asm-amd64-x86.h>
+# define IPRT_X86_EFL_AC RT_BIT(18)
+# define IPRT_LINUX_SAVE_EFL_AC() RTCCUINTREG fSavedEfl = ASMGetFlags()
+# define IPRT_LINUX_RESTORE_EFL_AC() ASMSetFlags(fSavedEfl)
+# define IPRT_LINUX_RESTORE_EFL_ONLY_AC() ASMChangeFlags(~IPRT_X86_EFL_AC, fSavedEfl & IPRT_X86_EFL_AC)
+#else
+# define IPRT_LINUX_SAVE_EFL_AC() do { } while (0)
+# define IPRT_LINUX_RESTORE_EFL_AC() do { } while (0)
+# define IPRT_LINUX_RESTORE_EFL_ONLY_AC() do { } while (0)
+#endif
+/** @} */
+
+/*
+ * There are some conflicting defines in iprt/param.h, sort them out here.
+ */
+#ifndef IPRT_INCLUDED_param_h
+# undef PAGE_SIZE
+# undef PAGE_OFFSET_MASK
+# include <iprt/param.h>
+#endif
+
+/*
+ * Some global indicator macros.
+ */
+/** @def IPRT_LINUX_HAS_HRTIMER
+ * Whether the kernel support high resolution timers (Linux kernel versions
+ * 2.6.28 and later (hrtimer_add_expires_ns() & schedule_hrtimeout). */
+#if RTLNX_VER_MIN(2,6,28) || defined(DOXYGEN_RUNNING)
+# define IPRT_LINUX_HAS_HRTIMER
+#endif
+
+/*
+ * Workqueue stuff, see initterm-r0drv-linux.c.
+ */
+#if RTLNX_VER_MIN(2,5,41)
+typedef struct work_struct RTR0LNXWORKQUEUEITEM;
+#else
+typedef struct tq_struct RTR0LNXWORKQUEUEITEM;
+#endif
+DECLHIDDEN(void) rtR0LnxWorkqueuePush(RTR0LNXWORKQUEUEITEM *pWork, void (*pfnWorker)(RTR0LNXWORKQUEUEITEM *));
+DECLHIDDEN(void) rtR0LnxWorkqueueFlush(void);
+
+/*
+ * Memory hacks from memobj-r0drv-linux.c that shared folders need.
+ */
+RTDECL(struct page *) rtR0MemObjLinuxVirtToPage(void *pv);
+
+#endif /* !IPRT_INCLUDED_SRC_r0drv_linux_the_linux_kernel_h */
diff --git a/src/VBox/Runtime/r0drv/linux/thread-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/thread-r0drv-linux.c
new file mode 100644
index 00000000..2724d091
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/thread-r0drv-linux.c
@@ -0,0 +1,285 @@
+/* $Id: thread-r0drv-linux.c $ */
+/** @file
+ * IPRT - Threads, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+#include <iprt/thread.h>
+
+#include <iprt/asm.h>
+#if RTLNX_VER_MAX(2,5,28) || defined(CONFIG_X86_SMAP)
+# include <iprt/asm-amd64-x86.h>
+#endif
+#include <iprt/assert.h>
+#include <iprt/errcore.h>
+#include <iprt/mp.h>
+
+
+/*********************************************************************************************************************************
+* Defined Constants And Macros *
+*********************************************************************************************************************************/
+#if defined(CONFIG_PREEMPT_COUNT) /* since 3.1 */ \
+ || defined(CONFIG_PREEMPTION) /* since 5.3 */ \
+ || defined(CONFIG_PREEMPT) /* since 2.6.13 - preemption model choice; before that arch specific choice back to 2.5.45. */
+# define IPRT_LNX_HAVE_PREEMPTION
+#endif
+
+
+/*********************************************************************************************************************************
+* Global Variables *
+*********************************************************************************************************************************/
+#ifndef IPRT_LNX_HAVE_PREEMPTION
+/** Per-cpu preemption counters. */
+static int32_t volatile g_acPreemptDisabled[NR_CPUS];
+#endif
+
+
+RTDECL(RTNATIVETHREAD) RTThreadNativeSelf(void)
+{
+ return (RTNATIVETHREAD)current;
+}
+RT_EXPORT_SYMBOL(RTThreadNativeSelf);
+
+
+static int rtR0ThreadLnxSleepCommon(RTMSINTERVAL cMillies)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+ long cJiffies = msecs_to_jiffies(cMillies);
+ set_current_state(TASK_INTERRUPTIBLE);
+ cJiffies = schedule_timeout(cJiffies);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ if (!cJiffies)
+ return VINF_SUCCESS;
+ return VERR_INTERRUPTED;
+}
+
+
+RTDECL(int) RTThreadSleep(RTMSINTERVAL cMillies)
+{
+ return rtR0ThreadLnxSleepCommon(cMillies);
+}
+RT_EXPORT_SYMBOL(RTThreadSleep);
+
+
+RTDECL(int) RTThreadSleepNoLog(RTMSINTERVAL cMillies)
+{
+ return rtR0ThreadLnxSleepCommon(cMillies);
+}
+RT_EXPORT_SYMBOL(RTThreadSleepNoLog);
+
+
+RTDECL(bool) RTThreadYield(void)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+#if RTLNX_VER_MIN(2,4,20)
+ yield();
+#else
+ /** @todo r=ramshankar: Can we use cond_resched() instead? */
+ set_current_state(TASK_RUNNING);
+ sys_sched_yield();
+ schedule();
+#endif
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return true;
+}
+RT_EXPORT_SYMBOL(RTThreadYield);
+
+
+RTDECL(bool) RTThreadPreemptIsEnabled(RTTHREAD hThread)
+{
+#ifdef IPRT_LNX_HAVE_PREEMPTION
+ Assert(hThread == NIL_RTTHREAD); RT_NOREF_PV(hThread);
+# ifdef preemptible
+ return preemptible();
+# else
+ return preempt_count() == 0 && !in_atomic() && !irqs_disabled();
+# endif
+#else
+ int32_t c;
+
+ Assert(hThread == NIL_RTTHREAD);
+ c = g_acPreemptDisabled[smp_processor_id()];
+ AssertMsg(c >= 0 && c < 32, ("%d\n", c));
+ if (c != 0)
+ return false;
+# if RTLNX_VER_MIN(2,5,32)
+ if (in_atomic())
+ return false;
+# endif
+# if RTLNX_VER_MIN(2,5,28)
+ if (irqs_disabled())
+ return false;
+# else
+ if (!ASMIntAreEnabled())
+ return false;
+# endif
+ return true;
+#endif
+}
+RT_EXPORT_SYMBOL(RTThreadPreemptIsEnabled);
+
+
+RTDECL(bool) RTThreadPreemptIsPending(RTTHREAD hThread)
+{
+ Assert(hThread == NIL_RTTHREAD); RT_NOREF_PV(hThread);
+#if RTLNX_VER_MIN(2,5,4)
+ return !!test_tsk_thread_flag(current, TIF_NEED_RESCHED);
+
+#elif RTLNX_VER_MIN(2,4,20)
+ return !!need_resched();
+
+#elif RTLNX_VER_MIN(2,1,110)
+ return current->need_resched != 0;
+
+#else
+ return need_resched != 0;
+#endif
+}
+RT_EXPORT_SYMBOL(RTThreadPreemptIsPending);
+
+
+RTDECL(bool) RTThreadPreemptIsPendingTrusty(void)
+{
+ /* yes, RTThreadPreemptIsPending is reliable. */
+ return true;
+}
+RT_EXPORT_SYMBOL(RTThreadPreemptIsPendingTrusty);
+
+
+RTDECL(bool) RTThreadPreemptIsPossible(void)
+{
+#ifdef IPRT_LNX_HAVE_PREEMPTION
+ return true; /* Yes, kernel preemption is possible. */
+#else
+ return false; /* No kernel preemption (or just CONFIG_PREEMPT_VOLUNTARY). */
+#endif
+}
+RT_EXPORT_SYMBOL(RTThreadPreemptIsPossible);
+
+
+RTDECL(void) RTThreadPreemptDisable(PRTTHREADPREEMPTSTATE pState)
+{
+#ifdef IPRT_LNX_HAVE_PREEMPTION
+ AssertPtr(pState);
+ Assert(pState->u32Reserved == 0);
+ pState->u32Reserved = 42;
+ preempt_disable();
+ RT_ASSERT_PREEMPT_CPUID_DISABLE(pState);
+
+#else /* !IPRT_LNX_HAVE_PREEMPTION */
+ int32_t c;
+ AssertPtr(pState);
+ Assert(pState->u32Reserved == 0);
+
+ /* Do our own accounting. */
+ c = ASMAtomicIncS32(&g_acPreemptDisabled[smp_processor_id()]);
+ AssertMsg(c > 0 && c < 32, ("%d\n", c));
+ pState->u32Reserved = c;
+ RT_ASSERT_PREEMPT_CPUID_DISABLE(pState);
+#endif
+}
+RT_EXPORT_SYMBOL(RTThreadPreemptDisable);
+
+
+RTDECL(void) RTThreadPreemptRestore(PRTTHREADPREEMPTSTATE pState)
+{
+#ifdef IPRT_LNX_HAVE_PREEMPTION
+ IPRT_LINUX_SAVE_EFL_AC(); /* paranoia */
+ AssertPtr(pState);
+ Assert(pState->u32Reserved == 42);
+ RT_ASSERT_PREEMPT_CPUID_RESTORE(pState);
+ preempt_enable();
+ IPRT_LINUX_RESTORE_EFL_ONLY_AC(); /* paranoia */
+
+#else
+ int32_t volatile *pc;
+ AssertPtr(pState);
+ AssertMsg(pState->u32Reserved > 0 && pState->u32Reserved < 32, ("%d\n", pState->u32Reserved));
+ RT_ASSERT_PREEMPT_CPUID_RESTORE(pState);
+
+ /* Do our own accounting. */
+ pc = &g_acPreemptDisabled[smp_processor_id()];
+ AssertMsg(pState->u32Reserved == (uint32_t)*pc, ("u32Reserved=%d *pc=%d \n", pState->u32Reserved, *pc));
+ ASMAtomicUoWriteS32(pc, pState->u32Reserved - 1);
+#endif
+ pState->u32Reserved = 0;
+}
+RT_EXPORT_SYMBOL(RTThreadPreemptRestore);
+
+
+RTDECL(bool) RTThreadIsInInterrupt(RTTHREAD hThread)
+{
+ Assert(hThread == NIL_RTTHREAD); NOREF(hThread);
+
+ return in_interrupt() != 0;
+}
+RT_EXPORT_SYMBOL(RTThreadIsInInterrupt);
+
+
+RTDECL(int) RTThreadQueryTerminationStatus(RTTHREAD hThread)
+{
+ struct task_struct *pTask = current;
+ AssertReturn(hThread == NIL_RTTHREAD, VERR_NOT_SUPPORTED);
+
+ /* Check out pending signals. ASSUMES we can get away w/o locking
+ anything because we're only reading the data. */
+ if (sigismember(&pTask->pending.signal, SIGKILL))
+ return VINF_THREAD_IS_TERMINATING;
+
+#if RTLNX_VER_MIN(2,5,34)
+ /* Check the pending signals shared with other threads in
+ the same process/group. ASSUME since we're alive that
+ the signal_struct won't be freed while we're looking
+ at it here... */
+ {
+# if RTLNX_VER_MIN(2,5,60)
+ struct signal_struct *pSignal = current->signal;
+# else
+ struct signal_struct *pSignal = current->sig;
+# endif
+ if ( pSignal
+ && sigismember(&pSignal->shared_pending.signal, SIGKILL))
+ return VINF_THREAD_IS_TERMINATING;
+ }
+#endif
+
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTThreadQueryTerminationStatus);
+
diff --git a/src/VBox/Runtime/r0drv/linux/thread2-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/thread2-r0drv-linux.c
new file mode 100644
index 00000000..16cebd7f
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/thread2-r0drv-linux.c
@@ -0,0 +1,243 @@
+/* $Id: thread2-r0drv-linux.c $ */
+/** @file
+ * IPRT - Threads (Part 2), Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+
+#include <iprt/assert.h>
+#include <iprt/thread.h>
+#include <iprt/errcore.h>
+#include "internal/thread.h"
+
+#if RTLNX_VER_MIN(4,11,0)
+ #include <uapi/linux/sched/types.h>
+#endif /* >= KERNEL_VERSION(4, 11, 0) */
+
+RTDECL(RTTHREAD) RTThreadSelf(void)
+{
+ return rtThreadGetByNative((RTNATIVETHREAD)current);
+}
+
+
+DECLHIDDEN(int) rtThreadNativeInit(void)
+{
+ return VINF_SUCCESS;
+}
+
+
+DECLHIDDEN(int) rtThreadNativeSetPriority(PRTTHREADINT pThread, RTTHREADTYPE enmType)
+{
+#if RTLNX_VER_MIN(2,5,2)
+ /*
+ * Assignments are partially based on g_aTypesLinuxFree but
+ * scaled up in the high priority end.
+ *
+ * 5.9.0 - :
+ * The sched_set_normal interfaces does not really check the input,
+ * whereas sched_set_fifo & sched_set_fifo_low have fixed assignments.
+ * 2.6.11 - 5.9.0:
+ * Use sched_setscheduler to try effect FIFO scheduling
+ * for IO and TIMER threads, otherwise use set_user_nice.
+ * 2.5.2 - 5.9.0:
+ * Use set_user_nice to renice the thread.
+ */
+ int iNice = 0;
+# if RTLNX_VER_MAX(5,9,0)
+ int rc;
+# if RTLNX_VER_MIN(2,6,11)
+ int iSchedClass = SCHED_NORMAL;
+ struct sched_param Param = { .sched_priority = 0 };
+# endif
+# endif
+ switch (enmType)
+ {
+ case RTTHREADTYPE_INFREQUENT_POLLER:
+ iNice = +3;
+ break;
+
+ case RTTHREADTYPE_MAIN_HEAVY_WORKER:
+ iNice = +2;
+ break;
+
+ case RTTHREADTYPE_EMULATION:
+ iNice = +1;
+ break;
+
+ case RTTHREADTYPE_DEFAULT:
+ case RTTHREADTYPE_GUI:
+ case RTTHREADTYPE_MAIN_WORKER:
+ iNice = 0;
+ break;
+
+ case RTTHREADTYPE_VRDP_IO:
+ case RTTHREADTYPE_DEBUGGER:
+ iNice = -1;
+ break;
+
+ case RTTHREADTYPE_MSG_PUMP:
+ iNice = -2;
+ break;
+
+ case RTTHREADTYPE_IO:
+# if RTLNX_VER_MIN(5,9,0)
+ sched_set_fifo_low(current);
+ return VINF_SUCCESS;
+# else
+ iNice = -12;
+# if RTLNX_VER_MIN(2,6,11)
+ iSchedClass = SCHED_FIFO;
+ Param.sched_priority = 1; /* => prio=98; */
+# endif
+ break;
+# endif
+
+ case RTTHREADTYPE_TIMER:
+# if RTLNX_VER_MIN(5,9,0)
+ sched_set_fifo(current);
+ return VINF_SUCCESS;
+# else
+ iNice = -19;
+# if RTLNX_VER_MIN(2,6,11)
+ iSchedClass = SCHED_FIFO;
+ Param.sched_priority = MAX_RT_PRIO / 2; /* => prio=49 */
+# endif
+ break;
+# endif
+ default:
+ AssertMsgFailedReturn(("enmType=%d\n", enmType), VERR_INVALID_PARAMETER);
+ }
+
+# if RTLNX_VER_MIN(5,9,0)
+ /*
+ * We only get here for renice work.
+ */
+ sched_set_normal(current, iNice);
+
+# else /* < 5.9.0 */
+# if RTLNX_VER_MIN(2,6,11)
+ /*
+ * Try set scheduler parameters.
+ * Fall back on normal + nice if this fails for FIFO policy.*
+ */
+ rc = sched_setscheduler(current, iSchedClass, &Param);
+ if (rc)
+ {
+ Param.sched_priority = 0;
+ iSchedClass = SCHED_NORMAL;
+ rc = sched_setscheduler(current, iSchedClass, &Param);
+ }
+
+ /*
+ * Renice if using normal scheduling class.
+ */
+ if (iSchedClass == SCHED_NORMAL)
+# endif /* >= 2.6.11 */
+ set_user_nice(current, iNice);
+
+# endif /* < 5.9.0 */
+#else /* < 2.5.2 */
+ RT_NOREF_PV(enmType);
+#endif /* < 2.5.2 */
+ RT_NOREF_PV(pThread);
+ return VINF_SUCCESS;
+}
+
+
+DECLHIDDEN(int) rtThreadNativeAdopt(PRTTHREADINT pThread)
+{
+ RT_NOREF_PV(pThread);
+ return VERR_NOT_IMPLEMENTED;
+}
+
+
+DECLHIDDEN(void) rtThreadNativeWaitKludge(PRTTHREADINT pThread)
+{
+ /** @todo fix RTThreadWait/RTR0Term race on linux. */
+ RTThreadSleep(1); NOREF(pThread);
+}
+
+
+DECLHIDDEN(void) rtThreadNativeDestroy(PRTTHREADINT pThread)
+{
+ NOREF(pThread);
+}
+
+
+#if RTLNX_VER_MIN(2,6,4)
+/**
+ * Native kernel thread wrapper function.
+ *
+ * This will forward to rtThreadMain and do termination upon return.
+ *
+ * @param pvArg Pointer to the argument package.
+ */
+static int rtThreadNativeMain(void *pvArg)
+{
+ PRTTHREADINT pThread = (PRTTHREADINT)pvArg;
+
+ rtThreadMain(pThread, (RTNATIVETHREAD)current, &pThread->szName[0]);
+ return 0;
+}
+#endif
+
+
+DECLHIDDEN(int) rtThreadNativeCreate(PRTTHREADINT pThreadInt, PRTNATIVETHREAD pNativeThread)
+{
+#if RTLNX_VER_MIN(2,6,4)
+ struct task_struct *NativeThread;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ RT_ASSERT_PREEMPTIBLE();
+
+ NativeThread = kthread_run(rtThreadNativeMain, pThreadInt, "iprt-%s", pThreadInt->szName);
+
+ if (!IS_ERR(NativeThread))
+ {
+ *pNativeThread = (RTNATIVETHREAD)NativeThread;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_GENERAL_FAILURE;
+#else
+ return VERR_NOT_IMPLEMENTED;
+#endif
+}
+
diff --git a/src/VBox/Runtime/r0drv/linux/threadctxhooks-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/threadctxhooks-r0drv-linux.c
new file mode 100644
index 00000000..a7b7d905
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/threadctxhooks-r0drv-linux.c
@@ -0,0 +1,341 @@
+/* $Id: threadctxhooks-r0drv-linux.c $ */
+/** @file
+ * IPRT - Thread Context Switching Hook, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2013-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+
+#include <iprt/mem.h>
+#include <iprt/assert.h>
+#include <iprt/thread.h>
+#include <iprt/errcore.h>
+#include <iprt/asm.h>
+#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
+# include <iprt/asm-amd64-x86.h>
+#endif
+#include "internal/thread.h"
+
+
+/*
+ * Linux kernel 2.6.23 introduced preemption notifiers but RedHat 2.6.18 kernels
+ * got it backported.
+ */
+#if RTLNX_VER_MIN(2,6,18) && defined(CONFIG_PREEMPT_NOTIFIERS)
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+/**
+ * The internal hook object for linux.
+ */
+typedef struct RTTHREADCTXHOOKINT
+{
+ /** Magic value (RTTHREADCTXHOOKINT_MAGIC). */
+ uint32_t volatile u32Magic;
+ /** The thread handle (owner) for which the hook is registered. */
+ RTNATIVETHREAD hOwner;
+ /** The preemption notifier object. */
+ struct preempt_notifier LnxPreemptNotifier;
+ /** Whether the hook is enabled or not. If enabled, the LnxPreemptNotifier
+ * is linked into the owning thread's list of preemption callouts. */
+ bool fEnabled;
+ /** Pointer to the user callback. */
+ PFNRTTHREADCTXHOOK pfnCallback;
+ /** User argument passed to the callback. */
+ void *pvUser;
+ /** The linux callbacks. */
+ struct preempt_ops PreemptOps;
+#if RTLNX_VER_MIN(3,1,19) && defined(RT_ARCH_AMD64)
+ /** Starting with 3.1.19, the linux kernel doesn't restore kernel RFLAGS during
+ * task switch, so we have to do that ourselves. (x86 code is not affected.) */
+ RTCCUINTREG fSavedRFlags;
+#endif
+} RTTHREADCTXHOOKINT;
+typedef RTTHREADCTXHOOKINT *PRTTHREADCTXHOOKINT;
+
+
+/**
+ * Hook function for the thread schedule out event.
+ *
+ * @param pPreemptNotifier Pointer to the preempt_notifier struct.
+ * @param pNext Pointer to the task that is being scheduled
+ * instead of the current thread.
+ *
+ * @remarks Called with the rq (runqueue) lock held and with preemption and
+ * interrupts disabled!
+ */
+static void rtThreadCtxHooksLnxSchedOut(struct preempt_notifier *pPreemptNotifier, struct task_struct *pNext)
+{
+ PRTTHREADCTXHOOKINT pThis = RT_FROM_MEMBER(pPreemptNotifier, RTTHREADCTXHOOKINT, LnxPreemptNotifier);
+#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
+ RTCCUINTREG fSavedEFlags = ASMGetFlags();
+ stac();
+#endif
+ RT_NOREF_PV(pNext);
+
+ AssertPtr(pThis);
+ AssertPtr(pThis->pfnCallback);
+ Assert(pThis->fEnabled);
+ Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));
+
+ pThis->pfnCallback(RTTHREADCTXEVENT_OUT, pThis->pvUser);
+
+#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
+ ASMSetFlags(fSavedEFlags);
+# if RTLNX_VER_MIN(3,1,19) && defined(RT_ARCH_AMD64)
+ pThis->fSavedRFlags = fSavedEFlags;
+# endif
+#endif
+}
+
+
+/**
+ * Hook function for the thread schedule in event.
+ *
+ * @param pPreemptNotifier Pointer to the preempt_notifier struct.
+ * @param iCpu The CPU this thread is being scheduled on.
+ *
+ * @remarks Called without holding the rq (runqueue) lock and with preemption
+ * enabled!
+ * @todo r=bird: Preemption is of course disabled when it is called.
+ */
+static void rtThreadCtxHooksLnxSchedIn(struct preempt_notifier *pPreemptNotifier, int iCpu)
+{
+ PRTTHREADCTXHOOKINT pThis = RT_FROM_MEMBER(pPreemptNotifier, RTTHREADCTXHOOKINT, LnxPreemptNotifier);
+#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
+ RTCCUINTREG fSavedEFlags = ASMGetFlags();
+ stac();
+#endif
+ RT_NOREF_PV(iCpu);
+
+ AssertPtr(pThis);
+ AssertPtr(pThis->pfnCallback);
+ Assert(pThis->fEnabled);
+
+ pThis->pfnCallback(RTTHREADCTXEVENT_IN, pThis->pvUser);
+
+#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
+# if RTLNX_VER_MIN(3,1,19) && defined(RT_ARCH_AMD64)
+ fSavedEFlags &= ~RT_BIT_64(18) /*X86_EFL_AC*/;
+ fSavedEFlags |= pThis->fSavedRFlags & RT_BIT_64(18) /*X86_EFL_AC*/;
+# endif
+ ASMSetFlags(fSavedEFlags);
+#endif
+}
+
+
+/**
+ * Worker function for RTThreadCtxHooks(Deregister|Release)().
+ *
+ * @param pThis Pointer to the internal thread-context object.
+ */
+DECLINLINE(void) rtThreadCtxHookDisable(PRTTHREADCTXHOOKINT pThis)
+{
+ Assert(pThis->PreemptOps.sched_out == rtThreadCtxHooksLnxSchedOut);
+ Assert(pThis->PreemptOps.sched_in == rtThreadCtxHooksLnxSchedIn);
+ preempt_disable();
+ preempt_notifier_unregister(&pThis->LnxPreemptNotifier);
+ pThis->fEnabled = false;
+ preempt_enable();
+}
+
+
+RTDECL(int) RTThreadCtxHookCreate(PRTTHREADCTXHOOK phCtxHook, uint32_t fFlags, PFNRTTHREADCTXHOOK pfnCallback, void *pvUser)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate input.
+ */
+ PRTTHREADCTXHOOKINT pThis;
+ Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD));
+ AssertPtrReturn(pfnCallback, VERR_INVALID_POINTER);
+ AssertReturn(fFlags == 0, VERR_INVALID_FLAGS);
+
+ /*
+ * Allocate and initialize a new hook. We don't register it yet, just
+ * create it.
+ */
+ pThis = (PRTTHREADCTXHOOKINT)RTMemAllocZ(sizeof(*pThis));
+ if (RT_UNLIKELY(!pThis))
+ {
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_NO_MEMORY;
+ }
+ pThis->u32Magic = RTTHREADCTXHOOKINT_MAGIC;
+ pThis->hOwner = RTThreadNativeSelf();
+ pThis->fEnabled = false;
+ pThis->pfnCallback = pfnCallback;
+ pThis->pvUser = pvUser;
+ preempt_notifier_init(&pThis->LnxPreemptNotifier, &pThis->PreemptOps);
+ pThis->PreemptOps.sched_out = rtThreadCtxHooksLnxSchedOut;
+ pThis->PreemptOps.sched_in = rtThreadCtxHooksLnxSchedIn;
+
+#if RTLNX_VER_MIN(4,2,0)
+ preempt_notifier_inc();
+#endif
+
+ *phCtxHook = pThis;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTThreadCtxHookCreate);
+
+
+RTDECL(int ) RTThreadCtxHookDestroy(RTTHREADCTXHOOK hCtxHook)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate input.
+ */
+ PRTTHREADCTXHOOKINT pThis = hCtxHook;
+ if (pThis == NIL_RTTHREADCTXHOOK)
+ return VINF_SUCCESS;
+ AssertPtr(pThis);
+ AssertMsgReturn(pThis->u32Magic == RTTHREADCTXHOOKINT_MAGIC, ("pThis->u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis),
+ VERR_INVALID_HANDLE);
+ Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD));
+ Assert(!pThis->fEnabled || pThis->hOwner == RTThreadNativeSelf());
+
+ /*
+ * If there's still a registered thread-context hook, deregister it now before destroying the object.
+ */
+ if (pThis->fEnabled)
+ {
+ Assert(pThis->hOwner == RTThreadNativeSelf());
+ rtThreadCtxHookDisable(pThis);
+ Assert(!pThis->fEnabled); /* paranoia */
+ }
+
+#if RTLNX_VER_MIN(4,2,0)
+ preempt_notifier_dec();
+#endif
+
+ ASMAtomicWriteU32(&pThis->u32Magic, ~RTTHREADCTXHOOKINT_MAGIC);
+ RTMemFree(pThis);
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTThreadCtxHookDestroy);
+
+
+RTDECL(int) RTThreadCtxHookEnable(RTTHREADCTXHOOK hCtxHook)
+{
+ /*
+ * Validate input.
+ */
+ PRTTHREADCTXHOOKINT pThis = hCtxHook;
+ AssertPtr(pThis);
+ AssertMsgReturn(pThis->u32Magic == RTTHREADCTXHOOKINT_MAGIC, ("pThis->u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis),
+ VERR_INVALID_HANDLE);
+ Assert(pThis->hOwner == RTThreadNativeSelf());
+ Assert(!pThis->fEnabled);
+ if (!pThis->fEnabled)
+ {
+ IPRT_LINUX_SAVE_EFL_AC();
+ Assert(pThis->PreemptOps.sched_out == rtThreadCtxHooksLnxSchedOut);
+ Assert(pThis->PreemptOps.sched_in == rtThreadCtxHooksLnxSchedIn);
+
+ /*
+ * Register the callback.
+ */
+ preempt_disable();
+ pThis->fEnabled = true;
+ preempt_notifier_register(&pThis->LnxPreemptNotifier);
+ preempt_enable();
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ }
+
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTThreadCtxHookEnable);
+
+
+RTDECL(int) RTThreadCtxHookDisable(RTTHREADCTXHOOK hCtxHook)
+{
+ /*
+ * Validate input.
+ */
+ PRTTHREADCTXHOOKINT pThis = hCtxHook;
+ if (pThis != NIL_RTTHREADCTXHOOK)
+ {
+ AssertPtr(pThis);
+ AssertMsgReturn(pThis->u32Magic == RTTHREADCTXHOOKINT_MAGIC, ("pThis->u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis),
+ VERR_INVALID_HANDLE);
+ Assert(pThis->hOwner == RTThreadNativeSelf());
+
+ /*
+ * Deregister the callback.
+ */
+ if (pThis->fEnabled)
+ {
+ IPRT_LINUX_SAVE_EFL_AC();
+ rtThreadCtxHookDisable(pThis);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ }
+ }
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTThreadCtxHookDisable);
+
+
+RTDECL(bool) RTThreadCtxHookIsEnabled(RTTHREADCTXHOOK hCtxHook)
+{
+ /*
+ * Validate input.
+ */
+ PRTTHREADCTXHOOKINT pThis = hCtxHook;
+ if (pThis == NIL_RTTHREADCTXHOOK)
+ return false;
+ AssertPtr(pThis);
+ AssertMsgReturn(pThis->u32Magic == RTTHREADCTXHOOKINT_MAGIC, ("pThis->u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis),
+ false);
+
+ return pThis->fEnabled;
+}
+RT_EXPORT_SYMBOL(RTThreadCtxHookIsEnabled);
+
+#else /* Not supported / Not needed */
+# include "../generic/threadctxhooks-r0drv-generic.cpp"
+#endif /* Not supported / Not needed */
+
diff --git a/src/VBox/Runtime/r0drv/linux/time-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/time-r0drv-linux.c
new file mode 100644
index 00000000..370afbf1
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/time-r0drv-linux.c
@@ -0,0 +1,221 @@
+/* $Id: time-r0drv-linux.c $ */
+/** @file
+ * IPRT - Time, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#define LOG_GROUP RTLOGGROUP_TIME
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+/* Make sure we have the setting functions we need for RTTimeNow: */
+#if RTLNX_VER_MAX(2,6,16)
+# define RTTIME_INCL_TIMEVAL
+#elif RTLNX_VER_MAX(3,17,0)
+# define RTTIME_INCL_TIMESPEC
+#endif
+#include <iprt/time.h>
+#include <iprt/asm.h>
+
+
+
+DECLINLINE(uint64_t) rtTimeGetSystemNanoTS(void)
+{
+#if RTLNX_VER_MIN(5,6,0)
+ /*
+ * Starting with kernel version 5.6-rc3 only 64-bit time interfaces
+ * are allowed in the kernel.
+ */
+ uint64_t u64;
+ struct timespec64 Ts = { 0, 0 };
+
+ ktime_get_ts64(&Ts);
+ u64 = Ts.tv_sec * RT_NS_1SEC_64 + Ts.tv_nsec;
+ return u64;
+
+#elif RTLNX_VER_MIN(2,6,16) /* This must match timer-r0drv-linux.c! */
+ /*
+ * Use ktime_get_ts, this is also what clock_gettime(CLOCK_MONOTONIC,) is using.
+ */
+ uint64_t u64;
+ struct timespec Ts = { 0, 0 };
+ ktime_get_ts(&Ts);
+ u64 = Ts.tv_sec * RT_NS_1SEC_64 + Ts.tv_nsec;
+ return u64;
+
+#elif RTLNX_VER_MIN(2,5,60)
+ /*
+ * Seems there is no way of getting to the exact source of
+ * sys_clock_gettime(CLOCK_MONOTONIC, &ts) here, I think. But
+ * 64-bit jiffies adjusted for the initial value should be pretty
+ * much the same I hope.
+ */
+ uint64_t u64 = get_jiffies_64();
+# ifdef INITIAL_JIFFIES
+ u64 += INITIAL_JIFFIES;
+# endif
+ u64 *= TICK_NSEC;
+ return u64;
+
+#else /* < 2.5.60 */
+# if BITS_PER_LONG >= 64
+ /*
+ * This is the same as above, except that there is no get_jiffies_64()
+ * here and we rely on long, and therefor jiffies, being 64-bit instead.
+ */
+ uint64_t u64 = jiffies;
+# ifdef INITIAL_JIFFIES
+ u64 += INITIAL_JIFFIES;
+# endif
+ u64 *= TICK_NSEC;
+ return u64;
+
+# else /* 32 bit jiffies */
+ /*
+ * We'll have to try track jiffy rollovers here or we'll be
+ * in trouble every time it flips.
+ *
+ * The high dword of the s_u64Last is the rollover count, the
+ * low dword is the previous jiffies. Updating is done by
+ * atomic compare & exchange of course.
+ */
+ static uint64_t volatile s_u64Last = 0;
+ uint64_t u64;
+
+ for (;;)
+ {
+ uint64_t u64NewLast;
+ int32_t iDelta;
+ uint32_t cRollovers;
+ uint32_t u32LastJiffies;
+
+ /* sample the values */
+ unsigned long ulNow = jiffies;
+ uint64_t u64Last = s_u64Last;
+ if (ulNow != jiffies)
+ continue; /* try again */
+# ifdef INITIAL_JIFFIES
+ ulNow += INITIAL_JIFFIES;
+# endif
+
+ u32LastJiffies = (uint32_t)u64Last;
+ cRollovers = u64Last >> 32;
+
+ /*
+ * Check for rollover and update the static last value.
+ *
+ * We have to make sure we update it successfully to rule out
+ * an underrun because of racing someone.
+ */
+ iDelta = ulNow - u32LastJiffies;
+ if (iDelta < 0)
+ {
+ cRollovers++;
+ u64NewLast = RT_MAKE_U64(ulNow, cRollovers);
+ if (!ASMAtomicCmpXchgU64(&s_u64Last, u64NewLast, u64Last))
+ continue; /* race, try again */
+ }
+ else
+ {
+ u64NewLast = RT_MAKE_U64(ulNow, cRollovers);
+ ASMAtomicCmpXchgU64(&s_u64Last, u64NewLast, u64Last);
+ }
+
+ /* calculate the return value */
+ u64 = ulNow;
+ u64 *= TICK_NSEC;
+ u64 += cRollovers * (_4G * TICK_NSEC);
+ break;
+ }
+
+ return u64;
+# endif /* 32 bit jiffies */
+#endif /* < 2.5.60 */
+}
+
+
+RTDECL(uint64_t) RTTimeNanoTS(void)
+{
+ return rtTimeGetSystemNanoTS();
+}
+RT_EXPORT_SYMBOL(RTTimeNanoTS);
+
+
+RTDECL(uint64_t) RTTimeMilliTS(void)
+{
+ return rtTimeGetSystemNanoTS() / RT_NS_1MS;
+}
+RT_EXPORT_SYMBOL(RTTimeMilliTS);
+
+
+RTDECL(uint64_t) RTTimeSystemNanoTS(void)
+{
+ return rtTimeGetSystemNanoTS();
+}
+RT_EXPORT_SYMBOL(RTTimeSystemNanoTS);
+
+
+RTDECL(uint64_t) RTTimeSystemMilliTS(void)
+{
+ return rtTimeGetSystemNanoTS() / RT_NS_1MS;
+}
+RT_EXPORT_SYMBOL(RTTimeSystemMilliTS);
+
+
+RTDECL(PRTTIMESPEC) RTTimeNow(PRTTIMESPEC pTime)
+{
+ IPRT_LINUX_SAVE_EFL_AC();
+#if RTLNX_VER_MIN(3,17,0)
+ struct timespec64 Ts;
+ ktime_get_real_ts64(&Ts); /* ktime_get_real_ts64 was added as a macro in 3.17, function since 4.18. */
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return RTTimeSpecSetTimespec64(pTime, &Ts);
+
+#elif RTLNX_VER_MIN(2,6,16)
+ struct timespec Ts;
+ ktime_get_real_ts(&Ts); /* ktime_get_real_ts was removed in Linux 4.20. */
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return RTTimeSpecSetTimespec(pTime, &Ts);
+
+#else /* < 2.6.16 */
+ struct timeval Tv;
+ do_gettimeofday(&Tv);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return RTTimeSpecSetTimeval(pTime, &Tv);
+#endif
+}
+RT_EXPORT_SYMBOL(RTTimeNow);
+
diff --git a/src/VBox/Runtime/r0drv/linux/timer-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/timer-r0drv-linux.c
new file mode 100644
index 00000000..68886a8c
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/timer-r0drv-linux.c
@@ -0,0 +1,1739 @@
+/* $Id: timer-r0drv-linux.c $ */
+/** @file
+ * IPRT - Timers, Ring-0 Driver, Linux.
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+
+/*********************************************************************************************************************************
+* Header Files *
+*********************************************************************************************************************************/
+#include "the-linux-kernel.h"
+#include "internal/iprt.h"
+
+#include <iprt/timer.h>
+#include <iprt/time.h>
+#include <iprt/mp.h>
+#include <iprt/cpuset.h>
+#include <iprt/spinlock.h>
+#include <iprt/err.h>
+#include <iprt/asm.h>
+#include <iprt/assert.h>
+#include <iprt/alloc.h>
+
+#include "internal/magics.h"
+
+/** @def RTTIMER_LINUX_WITH_HRTIMER
+ * Whether to use high resolution timers. */
+#if !defined(RTTIMER_LINUX_WITH_HRTIMER) \
+ && defined(IPRT_LINUX_HAS_HRTIMER)
+# define RTTIMER_LINUX_WITH_HRTIMER
+#endif
+
+#if RTLNX_VER_MAX(2,6,31)
+# define mod_timer_pinned mod_timer
+# define HRTIMER_MODE_ABS_PINNED HRTIMER_MODE_ABS
+#endif
+
+
+/*********************************************************************************************************************************
+* Structures and Typedefs *
+*********************************************************************************************************************************/
+/**
+ * Timer state machine.
+ *
+ * This is used to try handle the issues with MP events and
+ * timers that runs on all CPUs. It's relatively nasty :-/
+ */
+typedef enum RTTIMERLNXSTATE
+{
+ /** Stopped. */
+ RTTIMERLNXSTATE_STOPPED = 0,
+ /** Transient state; next ACTIVE. */
+ RTTIMERLNXSTATE_STARTING,
+ /** Transient state; next ACTIVE. (not really necessary) */
+ RTTIMERLNXSTATE_MP_STARTING,
+ /** Active. */
+ RTTIMERLNXSTATE_ACTIVE,
+ /** Active and in callback; next ACTIVE, STOPPED or CALLBACK_DESTROYING. */
+ RTTIMERLNXSTATE_CALLBACK,
+ /** Stopped while in the callback; next STOPPED. */
+ RTTIMERLNXSTATE_CB_STOPPING,
+ /** Restarted while in the callback; next ACTIVE, STOPPED, DESTROYING. */
+ RTTIMERLNXSTATE_CB_RESTARTING,
+ /** The callback shall destroy the timer; next STOPPED. */
+ RTTIMERLNXSTATE_CB_DESTROYING,
+ /** Transient state; next STOPPED. */
+ RTTIMERLNXSTATE_STOPPING,
+ /** Transient state; next STOPPED. */
+ RTTIMERLNXSTATE_MP_STOPPING,
+ /** The usual 32-bit hack. */
+ RTTIMERLNXSTATE_32BIT_HACK = 0x7fffffff
+} RTTIMERLNXSTATE;
+
+
+/**
+ * A Linux sub-timer.
+ */
+typedef struct RTTIMERLNXSUBTIMER
+{
+ /** Timer specific data. */
+ union
+ {
+#if defined(RTTIMER_LINUX_WITH_HRTIMER)
+ /** High resolution timer. */
+ struct
+ {
+ /** The linux timer structure. */
+ struct hrtimer LnxTimer;
+ } Hr;
+#endif
+ /** Standard timer. */
+ struct
+ {
+ /** The linux timer structure. */
+ struct timer_list LnxTimer;
+ /** The start of the current run (ns).
+ * This is used to calculate when the timer ought to fire the next time. */
+ uint64_t u64NextTS;
+ /** When the timer was started. */
+ uint64_t nsStartTS;
+ /** The u64NextTS in jiffies. */
+ unsigned long ulNextJiffies;
+ /** Set when starting or changing the timer so that u64StartTs
+ * and u64NextTS gets reinitialized (eliminating some jitter). */
+ bool volatile fFirstAfterChg;
+ } Std;
+ } u;
+ /** The current tick number. */
+ uint64_t iTick;
+ /** Restart the single shot timer at this specific time.
+ * Used when a single shot timer is restarted from the callback. */
+ uint64_t volatile uNsRestartAt;
+ /** Pointer to the parent timer. */
+ PRTTIMER pParent;
+ /** The current sub-timer state. */
+ RTTIMERLNXSTATE volatile enmState;
+} RTTIMERLNXSUBTIMER;
+/** Pointer to a linux sub-timer. */
+typedef RTTIMERLNXSUBTIMER *PRTTIMERLNXSUBTIMER;
+
+
+/**
+ * The internal representation of an Linux timer handle.
+ */
+typedef struct RTTIMER
+{
+ /** Magic.
+ * This is RTTIMER_MAGIC, but changes to something else before the timer
+ * is destroyed to indicate clearly that thread should exit. */
+ uint32_t volatile u32Magic;
+ /** Spinlock synchronizing the fSuspended and MP event handling.
+ * This is NIL_RTSPINLOCK if cCpus == 1. */
+ RTSPINLOCK hSpinlock;
+ /** Flag indicating that the timer is suspended. */
+ bool volatile fSuspended;
+ /** Whether the timer must run on one specific CPU or not. */
+ bool fSpecificCpu;
+#ifdef CONFIG_SMP
+ /** Whether the timer must run on all CPUs or not. */
+ bool fAllCpus;
+#endif /* else: All -> specific on non-SMP kernels */
+ /** Whether it is a high resolution timer or a standard one. */
+ bool fHighRes;
+ /** The id of the CPU it must run on if fSpecificCpu is set. */
+ RTCPUID idCpu;
+ /** The number of CPUs this timer should run on. */
+ RTCPUID cCpus;
+ /** Callback. */
+ PFNRTTIMER pfnTimer;
+ /** User argument. */
+ void *pvUser;
+ /** The timer interval. 0 if one-shot. */
+ uint64_t volatile u64NanoInterval;
+ /** This is set to the number of jiffies between ticks if the interval is
+ * an exact number of jiffies. (Standard timers only.) */
+ unsigned long volatile cJiffies;
+ /** The change interval spinlock for standard timers only. */
+ spinlock_t ChgIntLock;
+ /** Workqueue item for delayed destruction. */
+ RTR0LNXWORKQUEUEITEM DtorWorkqueueItem;
+ /** Sub-timers.
+ * Normally there is just one, but for RTTIMER_FLAGS_CPU_ALL this will contain
+ * an entry for all possible cpus. In that case the index will be the same as
+ * for the RTCpuSet. */
+ RTTIMERLNXSUBTIMER aSubTimers[1];
+} RTTIMER;
+
+
+/**
+ * A rtTimerLinuxStartOnCpu and rtTimerLinuxStartOnCpu argument package.
+ */
+typedef struct RTTIMERLINUXSTARTONCPUARGS
+{
+ /** The current time (RTTimeSystemNanoTS). */
+ uint64_t u64Now;
+ /** When to start firing (delta). */
+ uint64_t u64First;
+} RTTIMERLINUXSTARTONCPUARGS;
+/** Pointer to a rtTimerLinuxStartOnCpu argument package. */
+typedef RTTIMERLINUXSTARTONCPUARGS *PRTTIMERLINUXSTARTONCPUARGS;
+
+
+/*********************************************************************************************************************************
+* Internal Functions *
+*********************************************************************************************************************************/
+#ifdef CONFIG_SMP
+static DECLCALLBACK(void) rtTimerLinuxMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser);
+#endif
+
+#if 0
+#define DEBUG_HACKING
+#include <iprt/string.h>
+#include <iprt/asm-amd64-x86.h>
+static void myLogBackdoorPrintf(const char *pszFormat, ...)
+{
+ char szTmp[256];
+ va_list args;
+ size_t cb;
+
+ cb = RTStrPrintf(szTmp, sizeof(szTmp) - 10, "%d: ", RTMpCpuId());
+ va_start(args, pszFormat);
+ cb += RTStrPrintfV(&szTmp[cb], sizeof(szTmp) - cb, pszFormat, args);
+ va_end(args);
+
+ ASMOutStrU8(0x504, (uint8_t *)&szTmp[0], cb);
+}
+# define RTAssertMsg1Weak(pszExpr, uLine, pszFile, pszFunction) \
+ myLogBackdoorPrintf("\n!!Guest Assertion failed!!\n%s(%d) %s\n%s\n", uLine, pszFile, pszFunction, (pszExpr))
+# define RTAssertMsg2Weak myLogBackdoorPrintf
+# define RTTIMERLNX_LOG(a) myLogBackdoorPrintf a
+#else
+# define RTTIMERLNX_LOG(a) do { } while (0)
+#endif
+
+/**
+ * Sets the state.
+ */
+DECLINLINE(void) rtTimerLnxSetState(RTTIMERLNXSTATE volatile *penmState, RTTIMERLNXSTATE enmNewState)
+{
+#ifdef DEBUG_HACKING
+ RTTIMERLNX_LOG(("set %d -> %d\n", *penmState, enmNewState));
+#endif
+ ASMAtomicWriteU32((uint32_t volatile *)penmState, enmNewState);
+}
+
+
+/**
+ * Sets the state if it has a certain value.
+ *
+ * @return true if xchg was done.
+ * @return false if xchg wasn't done.
+ */
+#ifdef DEBUG_HACKING
+#define rtTimerLnxCmpXchgState(penmState, enmNewState, enmCurState) rtTimerLnxCmpXchgStateDebug(penmState, enmNewState, enmCurState, __LINE__)
+static bool rtTimerLnxCmpXchgStateDebug(RTTIMERLNXSTATE volatile *penmState, RTTIMERLNXSTATE enmNewState,
+ RTTIMERLNXSTATE enmCurState, uint32_t uLine)
+{
+ RTTIMERLNXSTATE enmOldState = enmCurState;
+ bool fRc = ASMAtomicCmpXchgExU32((uint32_t volatile *)penmState, enmNewState, enmCurState, (uint32_t *)&enmOldState);
+ RTTIMERLNX_LOG(("cxg %d -> %d - %d at %u\n", enmOldState, enmNewState, fRc, uLine));
+ return fRc;
+}
+#else
+DECLINLINE(bool) rtTimerLnxCmpXchgState(RTTIMERLNXSTATE volatile *penmState, RTTIMERLNXSTATE enmNewState,
+ RTTIMERLNXSTATE enmCurState)
+{
+ return ASMAtomicCmpXchgU32((uint32_t volatile *)penmState, enmNewState, enmCurState);
+}
+#endif
+
+
+/**
+ * Gets the state.
+ */
+DECLINLINE(RTTIMERLNXSTATE) rtTimerLnxGetState(RTTIMERLNXSTATE volatile *penmState)
+{
+ return (RTTIMERLNXSTATE)ASMAtomicUoReadU32((uint32_t volatile *)penmState);
+}
+
+#ifdef RTTIMER_LINUX_WITH_HRTIMER
+
+/**
+ * Converts a nano second time stamp to ktime_t.
+ *
+ * ASSUMES RTTimeSystemNanoTS() is implemented using ktime_get_ts().
+ *
+ * @returns ktime_t.
+ * @param cNanoSecs Nanoseconds.
+ */
+DECLINLINE(ktime_t) rtTimerLnxNanoToKt(uint64_t cNanoSecs)
+{
+ /* With some luck the compiler optimizes the division out of this... (Bet it doesn't.) */
+ return ktime_set(cNanoSecs / 1000000000, cNanoSecs % 1000000000);
+}
+
+/**
+ * Converts ktime_t to a nano second time stamp.
+ *
+ * ASSUMES RTTimeSystemNanoTS() is implemented using ktime_get_ts().
+ *
+ * @returns nano second time stamp.
+ * @param Kt ktime_t.
+ */
+DECLINLINE(uint64_t) rtTimerLnxKtToNano(ktime_t Kt)
+{
+ return ktime_to_ns(Kt);
+}
+
+#endif /* RTTIMER_LINUX_WITH_HRTIMER */
+
+/**
+ * Converts a nano second interval to jiffies.
+ *
+ * @returns Jiffies.
+ * @param cNanoSecs Nanoseconds.
+ */
+DECLINLINE(unsigned long) rtTimerLnxNanoToJiffies(uint64_t cNanoSecs)
+{
+ /* this can be made even better... */
+ if (cNanoSecs > (uint64_t)TICK_NSEC * MAX_JIFFY_OFFSET)
+ return MAX_JIFFY_OFFSET;
+# if ARCH_BITS == 32
+ if (RT_LIKELY(cNanoSecs <= UINT32_MAX))
+ return ((uint32_t)cNanoSecs + (TICK_NSEC-1)) / TICK_NSEC;
+# endif
+ return (cNanoSecs + (TICK_NSEC-1)) / TICK_NSEC;
+}
+
+
+/**
+ * Starts a sub-timer (RTTimerStart).
+ *
+ * @param pSubTimer The sub-timer to start.
+ * @param u64Now The current timestamp (RTTimeSystemNanoTS()).
+ * @param u64First The interval from u64Now to the first time the timer should fire.
+ * @param fPinned true = timer pinned to a specific CPU,
+ * false = timer can migrate between CPUs
+ * @param fHighRes Whether the user requested a high resolution timer or not.
+ * @param enmOldState The old timer state.
+ */
+static void rtTimerLnxStartSubTimer(PRTTIMERLNXSUBTIMER pSubTimer, uint64_t u64Now, uint64_t u64First,
+ bool fPinned, bool fHighRes)
+{
+ /*
+ * Calc when it should start firing.
+ */
+ uint64_t u64NextTS = u64Now + u64First;
+ if (!fHighRes)
+ {
+ pSubTimer->u.Std.u64NextTS = u64NextTS;
+ pSubTimer->u.Std.nsStartTS = u64NextTS;
+ }
+ RTTIMERLNX_LOG(("startsubtimer %p\n", pSubTimer->pParent));
+
+ pSubTimer->iTick = 0;
+
+#ifdef RTTIMER_LINUX_WITH_HRTIMER
+ if (fHighRes)
+ hrtimer_start(&pSubTimer->u.Hr.LnxTimer, rtTimerLnxNanoToKt(u64NextTS),
+ fPinned ? HRTIMER_MODE_ABS_PINNED : HRTIMER_MODE_ABS);
+ else
+#endif
+ {
+ unsigned long cJiffies = !u64First ? 0 : rtTimerLnxNanoToJiffies(u64First);
+ pSubTimer->u.Std.ulNextJiffies = jiffies + cJiffies;
+ pSubTimer->u.Std.fFirstAfterChg = true;
+#ifdef CONFIG_SMP
+ if (fPinned)
+ {
+# if RTLNX_VER_MIN(4,8,0)
+ mod_timer(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies);
+# else
+ mod_timer_pinned(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies);
+# endif
+ }
+ else
+#endif
+ mod_timer(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies);
+ }
+
+ /* Be a bit careful here since we could be racing the callback. */
+ if (!rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_STARTING))
+ rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_MP_STARTING);
+}
+
+
+/**
+ * Stops a sub-timer (RTTimerStart and rtTimerLinuxMpEvent()).
+ *
+ * The caller has already changed the state, so we will not be in a callback
+ * situation wrt to the calling thread.
+ *
+ * @param pSubTimer The sub-timer.
+ * @param fHighRes Whether the user requested a high resolution timer or not.
+ */
+static void rtTimerLnxStopSubTimer(PRTTIMERLNXSUBTIMER pSubTimer, bool fHighRes)
+{
+ RTTIMERLNX_LOG(("stopsubtimer %p %d\n", pSubTimer->pParent, fHighRes));
+#ifdef RTTIMER_LINUX_WITH_HRTIMER
+ if (fHighRes)
+ {
+ /* There is no equivalent to del_timer in the hrtimer API,
+ hrtimer_cancel() == del_timer_sync(). Just like the WARN_ON in
+ del_timer_sync() asserts, waiting for a timer callback to complete
+ is deadlock prone, so don't do it. */
+ int rc = hrtimer_try_to_cancel(&pSubTimer->u.Hr.LnxTimer);
+ if (rc < 0)
+ {
+ hrtimer_start(&pSubTimer->u.Hr.LnxTimer, ktime_set(KTIME_SEC_MAX, 0), HRTIMER_MODE_ABS);
+ hrtimer_try_to_cancel(&pSubTimer->u.Hr.LnxTimer);
+ }
+ }
+ else
+#endif
+ del_timer(&pSubTimer->u.Std.LnxTimer);
+
+ rtTimerLnxSetState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED);
+}
+
+
+/**
+ * Used by RTTimerDestroy and rtTimerLnxCallbackDestroy to do the actual work.
+ *
+ * @param pTimer The timer in question.
+ */
+static void rtTimerLnxDestroyIt(PRTTIMER pTimer)
+{
+ RTSPINLOCK hSpinlock = pTimer->hSpinlock;
+ RTCPUID iCpu;
+ Assert(pTimer->fSuspended);
+ RTTIMERLNX_LOG(("destroyit %p\n", pTimer));
+
+ /*
+ * Remove the MP notifications first because it'll reduce the risk of
+ * us overtaking any MP event that might theoretically be racing us here.
+ */
+#ifdef CONFIG_SMP
+ if ( pTimer->cCpus > 1
+ && hSpinlock != NIL_RTSPINLOCK)
+ {
+ int rc = RTMpNotificationDeregister(rtTimerLinuxMpEvent, pTimer);
+ AssertRC(rc);
+ }
+#endif /* CONFIG_SMP */
+
+ /*
+ * Invalidate the handle.
+ */
+ ASMAtomicWriteU32(&pTimer->u32Magic, ~RTTIMER_MAGIC);
+
+ /*
+ * Make sure all timers have stopped executing since we're stopping them in
+ * an asynchronous manner up in rtTimerLnxStopSubTimer.
+ */
+ iCpu = pTimer->cCpus;
+ while (iCpu-- > 0)
+ {
+#ifdef RTTIMER_LINUX_WITH_HRTIMER
+ if (pTimer->fHighRes)
+ hrtimer_cancel(&pTimer->aSubTimers[iCpu].u.Hr.LnxTimer);
+ else
+#endif
+ del_timer_sync(&pTimer->aSubTimers[iCpu].u.Std.LnxTimer);
+ }
+
+ /*
+ * Finally, free the resources.
+ */
+ RTMemFreeEx(pTimer, RT_UOFFSETOF_DYN(RTTIMER, aSubTimers[pTimer->cCpus]));
+ if (hSpinlock != NIL_RTSPINLOCK)
+ RTSpinlockDestroy(hSpinlock);
+}
+
+
+/**
+ * Workqueue callback (no DECLCALLBACK!) for deferred destruction.
+ *
+ * @param pWork Pointer to the DtorWorkqueueItem member of our timer
+ * structure.
+ */
+static void rtTimerLnxDestroyDeferred(RTR0LNXWORKQUEUEITEM *pWork)
+{
+ PRTTIMER pTimer = RT_FROM_MEMBER(pWork, RTTIMER, DtorWorkqueueItem);
+ rtTimerLnxDestroyIt(pTimer);
+}
+
+
+/**
+ * Called when the timer was destroyed by the callback function.
+ *
+ * @param pTimer The timer.
+ * @param pSubTimer The sub-timer which we're handling, the state of this
+ * will be RTTIMERLNXSTATE_CALLBACK_DESTROYING.
+ */
+static void rtTimerLnxCallbackDestroy(PRTTIMER pTimer, PRTTIMERLNXSUBTIMER pSubTimer)
+{
+ /*
+ * If it's an omni timer, the last dude does the destroying.
+ */
+ if (pTimer->cCpus > 1)
+ {
+ uint32_t iCpu = pTimer->cCpus;
+ RTSpinlockAcquire(pTimer->hSpinlock);
+
+ Assert(pSubTimer->enmState == RTTIMERLNXSTATE_CB_DESTROYING);
+ rtTimerLnxSetState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED);
+
+ while (iCpu-- > 0)
+ if (rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState) != RTTIMERLNXSTATE_STOPPED)
+ {
+ RTSpinlockRelease(pTimer->hSpinlock);
+ return;
+ }
+
+ RTSpinlockRelease(pTimer->hSpinlock);
+ }
+
+ /*
+ * Destroying a timer from the callback is unsafe since the callout code
+ * might be touching the timer structure upon return (hrtimer does!). So,
+ * we have to defer the actual destruction to the IRPT workqueue.
+ */
+ rtR0LnxWorkqueuePush(&pTimer->DtorWorkqueueItem, rtTimerLnxDestroyDeferred);
+}
+
+
+#ifdef CONFIG_SMP
+/**
+ * Deal with a sub-timer that has migrated.
+ *
+ * @param pTimer The timer.
+ * @param pSubTimer The sub-timer.
+ */
+static void rtTimerLnxCallbackHandleMigration(PRTTIMER pTimer, PRTTIMERLNXSUBTIMER pSubTimer)
+{
+ RTTIMERLNXSTATE enmState;
+ if (pTimer->cCpus > 1)
+ RTSpinlockAcquire(pTimer->hSpinlock);
+
+ do
+ {
+ enmState = rtTimerLnxGetState(&pSubTimer->enmState);
+ switch (enmState)
+ {
+ case RTTIMERLNXSTATE_STOPPING:
+ case RTTIMERLNXSTATE_MP_STOPPING:
+ enmState = RTTIMERLNXSTATE_STOPPED;
+ RT_FALL_THRU();
+ case RTTIMERLNXSTATE_STOPPED:
+ break;
+
+ default:
+ AssertMsgFailed(("%d\n", enmState));
+ RT_FALL_THRU();
+ case RTTIMERLNXSTATE_STARTING:
+ case RTTIMERLNXSTATE_MP_STARTING:
+ case RTTIMERLNXSTATE_ACTIVE:
+ case RTTIMERLNXSTATE_CALLBACK:
+ case RTTIMERLNXSTATE_CB_STOPPING:
+ case RTTIMERLNXSTATE_CB_RESTARTING:
+ if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, enmState))
+ enmState = RTTIMERLNXSTATE_STOPPED;
+ break;
+
+ case RTTIMERLNXSTATE_CB_DESTROYING:
+ {
+ if (pTimer->cCpus > 1)
+ RTSpinlockRelease(pTimer->hSpinlock);
+
+ rtTimerLnxCallbackDestroy(pTimer, pSubTimer);
+ return;
+ }
+ }
+ } while (enmState != RTTIMERLNXSTATE_STOPPED);
+
+ if (pTimer->cCpus > 1)
+ RTSpinlockRelease(pTimer->hSpinlock);
+}
+#endif /* CONFIG_SMP */
+
+
+/**
+ * The slow path of rtTimerLnxChangeToCallbackState.
+ *
+ * @returns true if changed successfully, false if not.
+ * @param pSubTimer The sub-timer.
+ */
+static bool rtTimerLnxChangeToCallbackStateSlow(PRTTIMERLNXSUBTIMER pSubTimer)
+{
+ for (;;)
+ {
+ RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pSubTimer->enmState);
+ switch (enmState)
+ {
+ case RTTIMERLNXSTATE_ACTIVE:
+ case RTTIMERLNXSTATE_STARTING:
+ case RTTIMERLNXSTATE_MP_STARTING:
+ if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_CALLBACK, enmState))
+ return true;
+ break;
+
+ case RTTIMERLNXSTATE_CALLBACK:
+ case RTTIMERLNXSTATE_CB_STOPPING:
+ case RTTIMERLNXSTATE_CB_RESTARTING:
+ case RTTIMERLNXSTATE_CB_DESTROYING:
+ AssertMsgFailed(("%d\n", enmState)); RT_FALL_THRU();
+ default:
+ return false;
+ }
+ ASMNopPause();
+ }
+}
+
+
+/**
+ * Tries to change the sub-timer state to 'callback'.
+ *
+ * @returns true if changed successfully, false if not.
+ * @param pSubTimer The sub-timer.
+ */
+DECLINLINE(bool) rtTimerLnxChangeToCallbackState(PRTTIMERLNXSUBTIMER pSubTimer)
+{
+ if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_CALLBACK, RTTIMERLNXSTATE_ACTIVE)))
+ return true;
+ return rtTimerLnxChangeToCallbackStateSlow(pSubTimer);
+}
+
+
+#ifdef RTTIMER_LINUX_WITH_HRTIMER
+/**
+ * Timer callback function for high resolution timers.
+ *
+ * @returns HRTIMER_NORESTART or HRTIMER_RESTART depending on whether it's a
+ * one-shot or interval timer.
+ * @param pHrTimer Pointer to the sub-timer structure.
+ */
+static enum hrtimer_restart rtTimerLinuxHrCallback(struct hrtimer *pHrTimer)
+{
+ PRTTIMERLNXSUBTIMER pSubTimer = RT_FROM_MEMBER(pHrTimer, RTTIMERLNXSUBTIMER, u.Hr.LnxTimer);
+ PRTTIMER pTimer = pSubTimer->pParent;
+
+
+ RTTIMERLNX_LOG(("hrcallback %p\n", pTimer));
+ if (RT_UNLIKELY(!rtTimerLnxChangeToCallbackState(pSubTimer)))
+ return HRTIMER_NORESTART;
+
+#ifdef CONFIG_SMP
+ /*
+ * Check for unwanted migration.
+ */
+ if (pTimer->fAllCpus || pTimer->fSpecificCpu)
+ {
+ RTCPUID idCpu = RTMpCpuId();
+ if (RT_UNLIKELY( pTimer->fAllCpus
+ ? (RTCPUID)(pSubTimer - &pTimer->aSubTimers[0]) != idCpu
+ : pTimer->idCpu != idCpu))
+ {
+ rtTimerLnxCallbackHandleMigration(pTimer, pSubTimer);
+ return HRTIMER_NORESTART;
+ }
+ }
+#endif
+
+ if (pTimer->u64NanoInterval)
+ {
+ /*
+ * Periodic timer, run it and update the native timer afterwards so
+ * we can handle RTTimerStop and RTTimerChangeInterval from the
+ * callback as well as a racing control thread.
+ */
+ pTimer->pfnTimer(pTimer, pTimer->pvUser, ++pSubTimer->iTick);
+ hrtimer_add_expires_ns(&pSubTimer->u.Hr.LnxTimer, ASMAtomicReadU64(&pTimer->u64NanoInterval));
+ if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_CALLBACK)))
+ return HRTIMER_RESTART;
+ }
+ else
+ {
+ /*
+ * One shot timer (no omni), stop it before dispatching it.
+ * Allow RTTimerStart as well as RTTimerDestroy to be called from
+ * the callback.
+ */
+ ASMAtomicWriteBool(&pTimer->fSuspended, true);
+ pTimer->pfnTimer(pTimer, pTimer->pvUser, ++pSubTimer->iTick);
+ if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_CALLBACK)))
+ return HRTIMER_NORESTART;
+ }
+
+ /*
+ * Some state change occurred while we were in the callback routine.
+ */
+ for (;;)
+ {
+ RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pSubTimer->enmState);
+ switch (enmState)
+ {
+ case RTTIMERLNXSTATE_CB_DESTROYING:
+ rtTimerLnxCallbackDestroy(pTimer, pSubTimer);
+ return HRTIMER_NORESTART;
+
+ case RTTIMERLNXSTATE_CB_STOPPING:
+ if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_CB_STOPPING))
+ return HRTIMER_NORESTART;
+ break;
+
+ case RTTIMERLNXSTATE_CB_RESTARTING:
+ if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_CB_RESTARTING))
+ {
+ pSubTimer->iTick = 0;
+ hrtimer_set_expires(&pSubTimer->u.Hr.LnxTimer, rtTimerLnxNanoToKt(pSubTimer->uNsRestartAt));
+ return HRTIMER_RESTART;
+ }
+ break;
+
+ default:
+ AssertMsgFailed(("%d\n", enmState));
+ return HRTIMER_NORESTART;
+ }
+ ASMNopPause();
+ }
+}
+#endif /* RTTIMER_LINUX_WITH_HRTIMER */
+
+
+#if RTLNX_VER_MIN(4,15,0)
+/**
+ * Timer callback function for standard timers.
+ *
+ * @param pLnxTimer Pointer to the Linux timer structure.
+ */
+static void rtTimerLinuxStdCallback(struct timer_list *pLnxTimer)
+{
+ PRTTIMERLNXSUBTIMER pSubTimer = from_timer(pSubTimer, pLnxTimer, u.Std.LnxTimer);
+#else
+/**
+ * Timer callback function for standard timers.
+ *
+ * @param ulUser Address of the sub-timer structure.
+ */
+static void rtTimerLinuxStdCallback(unsigned long ulUser)
+{
+ PRTTIMERLNXSUBTIMER pSubTimer = (PRTTIMERLNXSUBTIMER)ulUser;
+#endif
+ PRTTIMER pTimer = pSubTimer->pParent;
+
+ RTTIMERLNX_LOG(("stdcallback %p\n", pTimer));
+ if (RT_UNLIKELY(!rtTimerLnxChangeToCallbackState(pSubTimer)))
+ return;
+
+#ifdef CONFIG_SMP
+ /*
+ * Check for unwanted migration.
+ */
+ if (pTimer->fAllCpus || pTimer->fSpecificCpu)
+ {
+ RTCPUID idCpu = RTMpCpuId();
+ if (RT_UNLIKELY( pTimer->fAllCpus
+ ? (RTCPUID)(pSubTimer - &pTimer->aSubTimers[0]) != idCpu
+ : pTimer->idCpu != idCpu))
+ {
+ rtTimerLnxCallbackHandleMigration(pTimer, pSubTimer);
+ return;
+ }
+ }
+#endif
+
+ if (pTimer->u64NanoInterval)
+ {
+ /*
+ * Interval timer, calculate the next timeout.
+ *
+ * The first time around, we'll re-adjust the u.Std.u64NextTS to
+ * try prevent some jittering if we were started at a bad time.
+ */
+ const uint64_t iTick = ++pSubTimer->iTick;
+ unsigned long uCurJiffies = jiffies;
+ unsigned long ulNextJiffies;
+ uint64_t u64NanoInterval;
+ unsigned long cJiffies;
+ unsigned long flFlags;
+
+ spin_lock_irqsave(&pTimer->ChgIntLock, flFlags);
+ u64NanoInterval = pTimer->u64NanoInterval;
+ cJiffies = pTimer->cJiffies;
+ if (RT_UNLIKELY(pSubTimer->u.Std.fFirstAfterChg))
+ {
+ pSubTimer->u.Std.fFirstAfterChg = false;
+ pSubTimer->u.Std.u64NextTS = RTTimeSystemNanoTS();
+ pSubTimer->u.Std.nsStartTS = pSubTimer->u.Std.u64NextTS - u64NanoInterval * (iTick - 1);
+ pSubTimer->u.Std.ulNextJiffies = uCurJiffies = jiffies;
+ }
+ spin_unlock_irqrestore(&pTimer->ChgIntLock, flFlags);
+
+ pSubTimer->u.Std.u64NextTS += u64NanoInterval;
+ if (cJiffies)
+ {
+ ulNextJiffies = pSubTimer->u.Std.ulNextJiffies + cJiffies;
+ pSubTimer->u.Std.ulNextJiffies = ulNextJiffies;
+ if (time_after_eq(ulNextJiffies, uCurJiffies))
+ { /* likely */ }
+ else
+ {
+ unsigned long cJiffiesBehind = uCurJiffies - ulNextJiffies;
+ ulNextJiffies = uCurJiffies + cJiffies / 2;
+ if (cJiffiesBehind >= HZ / 4) /* Conside if we're lagging too far behind. Screw the u64NextTS member. */
+ pSubTimer->u.Std.ulNextJiffies = ulNextJiffies;
+ /*else: Don't update u.Std.ulNextJiffies so we can continue catching up in the next tick. */
+ }
+ }
+ else
+ {
+ const uint64_t u64NanoTS = RTTimeSystemNanoTS();
+ const int64_t cNsBehind = u64NanoTS - pSubTimer->u.Std.u64NextTS;
+ if (cNsBehind <= 0)
+ ulNextJiffies = uCurJiffies + rtTimerLnxNanoToJiffies(pSubTimer->u.Std.u64NextTS - u64NanoTS);
+ else if (u64NanoInterval >= RT_NS_1SEC_64 * 2 / HZ)
+ {
+ ulNextJiffies = uCurJiffies + rtTimerLnxNanoToJiffies(u64NanoInterval / 2);
+ if (cNsBehind >= RT_NS_1SEC_64 / HZ / 4) /* Conside if we're lagging too far behind. */
+ pSubTimer->u.Std.u64NextTS = u64NanoTS + u64NanoInterval / 2;
+ }
+ else
+ {
+ ulNextJiffies = uCurJiffies + 1;
+ if (cNsBehind >= RT_NS_1SEC_64 / HZ / 4) /* Conside if we're lagging too far behind. */
+ pSubTimer->u.Std.u64NextTS = u64NanoTS + RT_NS_1SEC_64 / HZ;
+ }
+ pSubTimer->u.Std.ulNextJiffies = ulNextJiffies;
+ }
+
+ /*
+ * Run the timer and re-arm it unless the state changed .
+ * .
+ * We must re-arm it afterwards as we're not in a position to undo this .
+ * operation if for instance someone stopped or destroyed us while we .
+ * were in the callback. (Linux takes care of any races here.)
+ */
+ pTimer->pfnTimer(pTimer, pTimer->pvUser, iTick);
+ if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_CALLBACK)))
+ {
+#ifdef CONFIG_SMP
+ if (pTimer->fSpecificCpu || pTimer->fAllCpus)
+ {
+# if RTLNX_VER_MIN(4,8,0)
+ mod_timer(&pSubTimer->u.Std.LnxTimer, ulNextJiffies);
+# else
+ mod_timer_pinned(&pSubTimer->u.Std.LnxTimer, ulNextJiffies);
+# endif
+ }
+ else
+#endif
+ mod_timer(&pSubTimer->u.Std.LnxTimer, ulNextJiffies);
+ return;
+ }
+ }
+ else
+ {
+ /*
+ * One shot timer, stop it before dispatching it.
+ * Allow RTTimerStart as well as RTTimerDestroy to be called from
+ * the callback.
+ */
+ ASMAtomicWriteBool(&pTimer->fSuspended, true);
+ pTimer->pfnTimer(pTimer, pTimer->pvUser, ++pSubTimer->iTick);
+ if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_CALLBACK)))
+ return;
+ }
+
+ /*
+ * Some state change occurred while we were in the callback routine.
+ */
+ for (;;)
+ {
+ RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pSubTimer->enmState);
+ switch (enmState)
+ {
+ case RTTIMERLNXSTATE_CB_DESTROYING:
+ rtTimerLnxCallbackDestroy(pTimer, pSubTimer);
+ return;
+
+ case RTTIMERLNXSTATE_CB_STOPPING:
+ if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_CB_STOPPING))
+ return;
+ break;
+
+ case RTTIMERLNXSTATE_CB_RESTARTING:
+ if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_CB_RESTARTING))
+ {
+ uint64_t u64NanoTS;
+ uint64_t u64NextTS;
+ unsigned long flFlags;
+
+ spin_lock_irqsave(&pTimer->ChgIntLock, flFlags);
+ u64NextTS = pSubTimer->uNsRestartAt;
+ u64NanoTS = RTTimeSystemNanoTS();
+ pSubTimer->iTick = 0;
+ pSubTimer->u.Std.u64NextTS = u64NextTS;
+ pSubTimer->u.Std.fFirstAfterChg = true;
+ pSubTimer->u.Std.ulNextJiffies = u64NextTS > u64NanoTS
+ ? jiffies + rtTimerLnxNanoToJiffies(u64NextTS - u64NanoTS)
+ : jiffies;
+ spin_unlock_irqrestore(&pTimer->ChgIntLock, flFlags);
+
+#ifdef CONFIG_SMP
+ if (pTimer->fSpecificCpu || pTimer->fAllCpus)
+ {
+# if RTLNX_VER_MIN(4,8,0)
+ mod_timer(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies);
+# else
+ mod_timer_pinned(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies);
+# endif
+ }
+ else
+#endif
+ mod_timer(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies);
+ return;
+ }
+ break;
+
+ default:
+ AssertMsgFailed(("%d\n", enmState));
+ return;
+ }
+ ASMNopPause();
+ }
+}
+
+
+#ifdef CONFIG_SMP
+
+/**
+ * Per-cpu callback function (RTMpOnAll/RTMpOnSpecific).
+ *
+ * @param idCpu The current CPU.
+ * @param pvUser1 Pointer to the timer.
+ * @param pvUser2 Pointer to the argument structure.
+ */
+static DECLCALLBACK(void) rtTimerLnxStartAllOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
+{
+ PRTTIMERLINUXSTARTONCPUARGS pArgs = (PRTTIMERLINUXSTARTONCPUARGS)pvUser2;
+ PRTTIMER pTimer = (PRTTIMER)pvUser1;
+ Assert(idCpu < pTimer->cCpus);
+ rtTimerLnxStartSubTimer(&pTimer->aSubTimers[idCpu], pArgs->u64Now, pArgs->u64First, true /*fPinned*/, pTimer->fHighRes);
+}
+
+
+/**
+ * Worker for RTTimerStart() that takes care of the ugly bits.
+ *
+ * @returns RTTimerStart() return value.
+ * @param pTimer The timer.
+ * @param pArgs The argument structure.
+ */
+static int rtTimerLnxOmniStart(PRTTIMER pTimer, PRTTIMERLINUXSTARTONCPUARGS pArgs)
+{
+ RTCPUID iCpu;
+ RTCPUSET OnlineSet;
+ RTCPUSET OnlineSet2;
+ int rc2;
+
+ /*
+ * Prepare all the sub-timers for the startup and then flag the timer
+ * as a whole as non-suspended, make sure we get them all before
+ * clearing fSuspended as the MP handler will be waiting on this
+ * should something happen while we're looping.
+ */
+ RTSpinlockAcquire(pTimer->hSpinlock);
+
+ /* Just make it a omni timer restriction that no stop/start races are allowed. */
+ for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++)
+ if (rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState) != RTTIMERLNXSTATE_STOPPED)
+ {
+ RTSpinlockRelease(pTimer->hSpinlock);
+ return VERR_TIMER_BUSY;
+ }
+
+ do
+ {
+ RTMpGetOnlineSet(&OnlineSet);
+ for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++)
+ {
+ Assert(pTimer->aSubTimers[iCpu].enmState != RTTIMERLNXSTATE_MP_STOPPING);
+ rtTimerLnxSetState(&pTimer->aSubTimers[iCpu].enmState,
+ RTCpuSetIsMember(&OnlineSet, iCpu)
+ ? RTTIMERLNXSTATE_STARTING
+ : RTTIMERLNXSTATE_STOPPED);
+ }
+ } while (!RTCpuSetIsEqual(&OnlineSet, RTMpGetOnlineSet(&OnlineSet2)));
+
+ ASMAtomicWriteBool(&pTimer->fSuspended, false);
+
+ RTSpinlockRelease(pTimer->hSpinlock);
+
+ /*
+ * Start them (can't find any exported function that allows me to
+ * do this without the cross calls).
+ */
+ pArgs->u64Now = RTTimeSystemNanoTS();
+ rc2 = RTMpOnAll(rtTimerLnxStartAllOnCpu, pTimer, pArgs);
+ AssertRC(rc2); /* screw this if it fails. */
+
+ /*
+ * Reset the sub-timers who didn't start up (ALL CPUs case).
+ */
+ RTSpinlockAcquire(pTimer->hSpinlock);
+
+ for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++)
+ if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[iCpu].enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_STARTING))
+ {
+ /** @todo very odd case for a rainy day. Cpus that temporarily went offline while
+ * we were between calls needs to nudged as the MP handler will ignore events for
+ * them because of the STARTING state. This is an extremely unlikely case - not that
+ * that means anything in my experience... ;-) */
+ RTTIMERLNX_LOG(("what!? iCpu=%u -> didn't start\n", iCpu));
+ }
+
+ RTSpinlockRelease(pTimer->hSpinlock);
+
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Worker for RTTimerStop() that takes care of the ugly SMP bits.
+ *
+ * @returns true if there was any active callbacks, false if not.
+ * @param pTimer The timer (valid).
+ * @param fForDestroy Whether this is for RTTimerDestroy or not.
+ */
+static bool rtTimerLnxOmniStop(PRTTIMER pTimer, bool fForDestroy)
+{
+ bool fActiveCallbacks = false;
+ RTCPUID iCpu;
+ RTTIMERLNXSTATE enmState;
+
+
+ /*
+ * Mark the timer as suspended and flag all timers as stopping, except
+ * for those being stopped by an MP event.
+ */
+ RTSpinlockAcquire(pTimer->hSpinlock);
+
+ ASMAtomicWriteBool(&pTimer->fSuspended, true);
+ for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++)
+ {
+ for (;;)
+ {
+ enmState = rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState);
+ if ( enmState == RTTIMERLNXSTATE_STOPPED
+ || enmState == RTTIMERLNXSTATE_MP_STOPPING)
+ break;
+ if ( enmState == RTTIMERLNXSTATE_CALLBACK
+ || enmState == RTTIMERLNXSTATE_CB_STOPPING
+ || enmState == RTTIMERLNXSTATE_CB_RESTARTING)
+ {
+ Assert(enmState != RTTIMERLNXSTATE_CB_STOPPING || fForDestroy);
+ if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[iCpu].enmState,
+ !fForDestroy ? RTTIMERLNXSTATE_CB_STOPPING : RTTIMERLNXSTATE_CB_DESTROYING,
+ enmState))
+ {
+ fActiveCallbacks = true;
+ break;
+ }
+ }
+ else
+ {
+ Assert(enmState == RTTIMERLNXSTATE_ACTIVE);
+ if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[iCpu].enmState, RTTIMERLNXSTATE_STOPPING, enmState))
+ break;
+ }
+ ASMNopPause();
+ }
+ }
+
+ RTSpinlockRelease(pTimer->hSpinlock);
+
+ /*
+ * Do the actual stopping. Fortunately, this doesn't require any IPIs.
+ * Unfortunately it cannot be done synchronously.
+ */
+ for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++)
+ if (rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState) == RTTIMERLNXSTATE_STOPPING)
+ rtTimerLnxStopSubTimer(&pTimer->aSubTimers[iCpu], pTimer->fHighRes);
+
+ return fActiveCallbacks;
+}
+
+
+/**
+ * Per-cpu callback function (RTMpOnSpecific) used by rtTimerLinuxMpEvent()
+ * to start a sub-timer on a cpu that just have come online.
+ *
+ * @param idCpu The current CPU.
+ * @param pvUser1 Pointer to the timer.
+ * @param pvUser2 Pointer to the argument structure.
+ */
+static DECLCALLBACK(void) rtTimerLinuxMpStartOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
+{
+ PRTTIMERLINUXSTARTONCPUARGS pArgs = (PRTTIMERLINUXSTARTONCPUARGS)pvUser2;
+ PRTTIMER pTimer = (PRTTIMER)pvUser1;
+ RTSPINLOCK hSpinlock;
+ Assert(idCpu < pTimer->cCpus);
+
+ /*
+ * We have to be kind of careful here as we might be racing RTTimerStop
+ * (and/or RTTimerDestroy, thus the paranoia.
+ */
+ hSpinlock = pTimer->hSpinlock;
+ if ( hSpinlock != NIL_RTSPINLOCK
+ && pTimer->u32Magic == RTTIMER_MAGIC)
+ {
+ RTSpinlockAcquire(hSpinlock);
+
+ if ( !ASMAtomicUoReadBool(&pTimer->fSuspended)
+ && pTimer->u32Magic == RTTIMER_MAGIC)
+ {
+ /* We're sane and the timer is not suspended yet. */
+ PRTTIMERLNXSUBTIMER pSubTimer = &pTimer->aSubTimers[idCpu];
+ if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_MP_STARTING, RTTIMERLNXSTATE_STOPPED))
+ rtTimerLnxStartSubTimer(pSubTimer, pArgs->u64Now, pArgs->u64First, true /*fPinned*/, pTimer->fHighRes);
+ }
+
+ RTSpinlockRelease(hSpinlock);
+ }
+}
+
+
+/**
+ * MP event notification callback.
+ *
+ * @param enmEvent The event.
+ * @param idCpu The cpu it applies to.
+ * @param pvUser The timer.
+ */
+static DECLCALLBACK(void) rtTimerLinuxMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser)
+{
+ PRTTIMER pTimer = (PRTTIMER)pvUser;
+ PRTTIMERLNXSUBTIMER pSubTimer = &pTimer->aSubTimers[idCpu];
+ RTSPINLOCK hSpinlock;
+
+ Assert(idCpu < pTimer->cCpus);
+
+ /*
+ * Some initial paranoia.
+ */
+ if (pTimer->u32Magic != RTTIMER_MAGIC)
+ return;
+ hSpinlock = pTimer->hSpinlock;
+ if (hSpinlock == NIL_RTSPINLOCK)
+ return;
+
+ RTSpinlockAcquire(hSpinlock);
+
+ /* Is it active? */
+ if ( !ASMAtomicUoReadBool(&pTimer->fSuspended)
+ && pTimer->u32Magic == RTTIMER_MAGIC)
+ {
+ switch (enmEvent)
+ {
+ /*
+ * Try do it without leaving the spin lock, but if we have to, retake it
+ * when we're on the right cpu.
+ */
+ case RTMPEVENT_ONLINE:
+ if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_MP_STARTING, RTTIMERLNXSTATE_STOPPED))
+ {
+ RTTIMERLINUXSTARTONCPUARGS Args;
+ Args.u64Now = RTTimeSystemNanoTS();
+ Args.u64First = 0;
+
+ if (RTMpCpuId() == idCpu)
+ rtTimerLnxStartSubTimer(pSubTimer, Args.u64Now, Args.u64First, true /*fPinned*/, pTimer->fHighRes);
+ else
+ {
+ rtTimerLnxSetState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED); /* we'll recheck it. */
+ RTSpinlockRelease(hSpinlock);
+
+ RTMpOnSpecific(idCpu, rtTimerLinuxMpStartOnCpu, pTimer, &Args);
+ return; /* we've left the spinlock */
+ }
+ }
+ break;
+
+ /*
+ * The CPU is (going) offline, make sure the sub-timer is stopped.
+ *
+ * Linux will migrate it to a different CPU, but we don't want this. The
+ * timer function is checking for this.
+ */
+ case RTMPEVENT_OFFLINE:
+ {
+ RTTIMERLNXSTATE enmState;
+ while ( (enmState = rtTimerLnxGetState(&pSubTimer->enmState)) == RTTIMERLNXSTATE_ACTIVE
+ || enmState == RTTIMERLNXSTATE_CALLBACK
+ || enmState == RTTIMERLNXSTATE_CB_RESTARTING)
+ {
+ if (enmState == RTTIMERLNXSTATE_ACTIVE)
+ {
+ if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_MP_STOPPING, RTTIMERLNXSTATE_ACTIVE))
+ {
+ RTSpinlockRelease(hSpinlock);
+
+ rtTimerLnxStopSubTimer(pSubTimer, pTimer->fHighRes);
+ return; /* we've left the spinlock */
+ }
+ }
+ else if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_CB_STOPPING, enmState))
+ break;
+
+ /* State not stable, try again. */
+ ASMNopPause();
+ }
+ break;
+ }
+ }
+ }
+
+ RTSpinlockRelease(hSpinlock);
+}
+
+#endif /* CONFIG_SMP */
+
+
+/**
+ * Callback function use by RTTimerStart via RTMpOnSpecific to start a timer
+ * running on a specific CPU.
+ *
+ * @param idCpu The current CPU.
+ * @param pvUser1 Pointer to the timer.
+ * @param pvUser2 Pointer to the argument structure.
+ */
+static DECLCALLBACK(void) rtTimerLnxStartOnSpecificCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
+{
+ PRTTIMERLINUXSTARTONCPUARGS pArgs = (PRTTIMERLINUXSTARTONCPUARGS)pvUser2;
+ PRTTIMER pTimer = (PRTTIMER)pvUser1;
+ RT_NOREF_PV(idCpu);
+ rtTimerLnxStartSubTimer(&pTimer->aSubTimers[0], pArgs->u64Now, pArgs->u64First, true /*fPinned*/, pTimer->fHighRes);
+}
+
+
+RTDECL(int) RTTimerStart(PRTTIMER pTimer, uint64_t u64First)
+{
+ RTTIMERLINUXSTARTONCPUARGS Args;
+ int rc2;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate.
+ */
+ AssertPtrReturn(pTimer, VERR_INVALID_HANDLE);
+ AssertReturn(pTimer->u32Magic == RTTIMER_MAGIC, VERR_INVALID_HANDLE);
+
+ if (!ASMAtomicUoReadBool(&pTimer->fSuspended))
+ return VERR_TIMER_ACTIVE;
+ RTTIMERLNX_LOG(("start %p cCpus=%d\n", pTimer, pTimer->cCpus));
+
+ Args.u64First = u64First;
+#ifdef CONFIG_SMP
+ /*
+ * Omni timer?
+ */
+ if (pTimer->fAllCpus)
+ {
+ rc2 = rtTimerLnxOmniStart(pTimer, &Args);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc2;
+ }
+#endif
+
+ /*
+ * Simple timer - Pretty straight forward if it wasn't for restarting.
+ */
+ Args.u64Now = RTTimeSystemNanoTS();
+ ASMAtomicWriteU64(&pTimer->aSubTimers[0].uNsRestartAt, Args.u64Now + u64First);
+ for (;;)
+ {
+ RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pTimer->aSubTimers[0].enmState);
+ switch (enmState)
+ {
+ case RTTIMERLNXSTATE_STOPPED:
+ if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[0].enmState, RTTIMERLNXSTATE_STARTING, RTTIMERLNXSTATE_STOPPED))
+ {
+ ASMAtomicWriteBool(&pTimer->fSuspended, false);
+ if (!pTimer->fSpecificCpu)
+ rtTimerLnxStartSubTimer(&pTimer->aSubTimers[0], Args.u64Now, Args.u64First,
+ false /*fPinned*/, pTimer->fHighRes);
+ else
+ {
+ rc2 = RTMpOnSpecific(pTimer->idCpu, rtTimerLnxStartOnSpecificCpu, pTimer, &Args);
+ if (RT_FAILURE(rc2))
+ {
+ /* Suspend it, the cpu id is probably invalid or offline. */
+ ASMAtomicWriteBool(&pTimer->fSuspended, true);
+ rtTimerLnxSetState(&pTimer->aSubTimers[0].enmState, RTTIMERLNXSTATE_STOPPED);
+ return rc2;
+ }
+ }
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+ break;
+
+ case RTTIMERLNXSTATE_CALLBACK:
+ case RTTIMERLNXSTATE_CB_STOPPING:
+ if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[0].enmState, RTTIMERLNXSTATE_CB_RESTARTING, enmState))
+ {
+ ASMAtomicWriteBool(&pTimer->fSuspended, false);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+ break;
+
+ default:
+ AssertMsgFailed(("%d\n", enmState));
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_INTERNAL_ERROR_4;
+ }
+ ASMNopPause();
+ }
+}
+RT_EXPORT_SYMBOL(RTTimerStart);
+
+
+/**
+ * Common worker for RTTimerStop and RTTimerDestroy.
+ *
+ * @returns true if there was any active callbacks, false if not.
+ * @param pTimer The timer to stop.
+ * @param fForDestroy Whether it's RTTimerDestroy calling or not.
+ */
+static bool rtTimerLnxStop(PRTTIMER pTimer, bool fForDestroy)
+{
+ RTTIMERLNX_LOG(("lnxstop %p %d\n", pTimer, fForDestroy));
+#ifdef CONFIG_SMP
+ /*
+ * Omni timer?
+ */
+ if (pTimer->fAllCpus)
+ return rtTimerLnxOmniStop(pTimer, fForDestroy);
+#endif
+
+ /*
+ * Simple timer.
+ */
+ ASMAtomicWriteBool(&pTimer->fSuspended, true);
+ for (;;)
+ {
+ RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pTimer->aSubTimers[0].enmState);
+ switch (enmState)
+ {
+ case RTTIMERLNXSTATE_ACTIVE:
+ if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[0].enmState, RTTIMERLNXSTATE_STOPPING, RTTIMERLNXSTATE_ACTIVE))
+ {
+ rtTimerLnxStopSubTimer(&pTimer->aSubTimers[0], pTimer->fHighRes);
+ return false;
+ }
+ break;
+
+ case RTTIMERLNXSTATE_CALLBACK:
+ case RTTIMERLNXSTATE_CB_RESTARTING:
+ case RTTIMERLNXSTATE_CB_STOPPING:
+ Assert(enmState != RTTIMERLNXSTATE_CB_STOPPING || fForDestroy);
+ if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[0].enmState,
+ !fForDestroy ? RTTIMERLNXSTATE_CB_STOPPING : RTTIMERLNXSTATE_CB_DESTROYING,
+ enmState))
+ return true;
+ break;
+
+ case RTTIMERLNXSTATE_STOPPED:
+ return VINF_SUCCESS;
+
+ case RTTIMERLNXSTATE_CB_DESTROYING:
+ AssertMsgFailed(("enmState=%d pTimer=%p\n", enmState, pTimer));
+ return true;
+
+ default:
+ case RTTIMERLNXSTATE_STARTING:
+ case RTTIMERLNXSTATE_MP_STARTING:
+ case RTTIMERLNXSTATE_STOPPING:
+ case RTTIMERLNXSTATE_MP_STOPPING:
+ AssertMsgFailed(("enmState=%d pTimer=%p\n", enmState, pTimer));
+ return false;
+ }
+
+ /* State not stable, try again. */
+ ASMNopPause();
+ }
+}
+
+
+RTDECL(int) RTTimerStop(PRTTIMER pTimer)
+{
+ /*
+ * Validate.
+ */
+ IPRT_LINUX_SAVE_EFL_AC();
+ AssertPtrReturn(pTimer, VERR_INVALID_HANDLE);
+ AssertReturn(pTimer->u32Magic == RTTIMER_MAGIC, VERR_INVALID_HANDLE);
+ RTTIMERLNX_LOG(("stop %p\n", pTimer));
+
+ if (ASMAtomicUoReadBool(&pTimer->fSuspended))
+ return VERR_TIMER_SUSPENDED;
+
+ rtTimerLnxStop(pTimer, false /*fForDestroy*/);
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTTimerStop);
+
+
+RTDECL(int) RTTimerChangeInterval(PRTTIMER pTimer, uint64_t u64NanoInterval)
+{
+ unsigned long cJiffies;
+ unsigned long flFlags;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate.
+ */
+ AssertPtrReturn(pTimer, VERR_INVALID_HANDLE);
+ AssertReturn(pTimer->u32Magic == RTTIMER_MAGIC, VERR_INVALID_HANDLE);
+ AssertReturn(u64NanoInterval, VERR_INVALID_PARAMETER);
+ AssertReturn(u64NanoInterval < UINT64_MAX / 8, VERR_INVALID_PARAMETER);
+ AssertReturn(pTimer->u64NanoInterval, VERR_INVALID_STATE);
+ RTTIMERLNX_LOG(("change %p %llu\n", pTimer, u64NanoInterval));
+
+#ifdef RTTIMER_LINUX_WITH_HRTIMER
+ /*
+ * For the high resolution timers it is easy since we don't care so much
+ * about when it is applied to the sub-timers.
+ */
+ if (pTimer->fHighRes)
+ {
+ ASMAtomicWriteU64(&pTimer->u64NanoInterval, u64NanoInterval);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+ }
+#endif
+
+ /*
+ * Standard timers have a bit more complicated way of calculating
+ * their interval and such. So, forget omni timers for now.
+ */
+ if (pTimer->cCpus > 1)
+ return VERR_NOT_SUPPORTED;
+
+ cJiffies = u64NanoInterval / (RT_NS_1SEC / HZ);
+ if (cJiffies * (RT_NS_1SEC / HZ) != u64NanoInterval)
+ cJiffies = 0;
+
+ spin_lock_irqsave(&pTimer->ChgIntLock, flFlags);
+ pTimer->aSubTimers[0].u.Std.fFirstAfterChg = true;
+ pTimer->cJiffies = cJiffies;
+ ASMAtomicWriteU64(&pTimer->u64NanoInterval, u64NanoInterval);
+ spin_unlock_irqrestore(&pTimer->ChgIntLock, flFlags);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTTimerChangeInterval);
+
+
+RTDECL(int) RTTimerDestroy(PRTTIMER pTimer)
+{
+ bool fCanDestroy;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ /*
+ * Validate. It's ok to pass NULL pointer.
+ */
+ if (pTimer == /*NIL_RTTIMER*/ NULL)
+ return VINF_SUCCESS;
+ AssertPtrReturn(pTimer, VERR_INVALID_HANDLE);
+ AssertReturn(pTimer->u32Magic == RTTIMER_MAGIC, VERR_INVALID_HANDLE);
+ RTTIMERLNX_LOG(("destroy %p\n", pTimer));
+/** @todo We should invalidate the magic here! */
+
+ /*
+ * Stop the timer if it's still active, then destroy it if we can.
+ */
+ if (!ASMAtomicUoReadBool(&pTimer->fSuspended))
+ fCanDestroy = rtTimerLnxStop(pTimer, true /*fForDestroy*/);
+ else
+ {
+ uint32_t iCpu = pTimer->cCpus;
+ if (pTimer->cCpus > 1)
+ RTSpinlockAcquire(pTimer->hSpinlock);
+
+ fCanDestroy = true;
+ while (iCpu-- > 0)
+ {
+ for (;;)
+ {
+ RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState);
+ switch (enmState)
+ {
+ case RTTIMERLNXSTATE_CALLBACK:
+ case RTTIMERLNXSTATE_CB_RESTARTING:
+ case RTTIMERLNXSTATE_CB_STOPPING:
+ if (!rtTimerLnxCmpXchgState(&pTimer->aSubTimers[iCpu].enmState, RTTIMERLNXSTATE_CB_DESTROYING, enmState))
+ continue;
+ fCanDestroy = false;
+ break;
+
+ case RTTIMERLNXSTATE_CB_DESTROYING:
+ AssertMsgFailed(("%d\n", enmState));
+ fCanDestroy = false;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ }
+
+ if (pTimer->cCpus > 1)
+ RTSpinlockRelease(pTimer->hSpinlock);
+ }
+
+ if (fCanDestroy)
+ {
+ /* For paranoid reasons, defer actually destroying the semaphore when
+ in atomic or interrupt context. */
+#if RTLNX_VER_MIN(2,5,32)
+ if (in_atomic() || in_interrupt())
+#else
+ if (in_interrupt())
+#endif
+ rtR0LnxWorkqueuePush(&pTimer->DtorWorkqueueItem, rtTimerLnxDestroyDeferred);
+ else
+ rtTimerLnxDestroyIt(pTimer);
+ }
+
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTTimerDestroy);
+
+
+RTDECL(int) RTTimerCreateEx(PRTTIMER *ppTimer, uint64_t u64NanoInterval, uint32_t fFlags, PFNRTTIMER pfnTimer, void *pvUser)
+{
+ PRTTIMER pTimer;
+ RTCPUID iCpu;
+ unsigned cCpus;
+ int rc;
+ IPRT_LINUX_SAVE_EFL_AC();
+
+ rtR0LnxWorkqueueFlush(); /* for 2.4 */
+ *ppTimer = NULL;
+
+ /*
+ * Validate flags.
+ */
+ if (!RTTIMER_FLAGS_ARE_VALID(fFlags))
+ {
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_INVALID_PARAMETER;
+ }
+ if ( (fFlags & RTTIMER_FLAGS_CPU_SPECIFIC)
+ && (fFlags & RTTIMER_FLAGS_CPU_ALL) != RTTIMER_FLAGS_CPU_ALL
+ && !RTMpIsCpuPossible(RTMpCpuIdFromSetIndex(fFlags & RTTIMER_FLAGS_CPU_MASK)))
+ {
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VERR_CPU_NOT_FOUND;
+ }
+
+ /*
+ * Allocate the timer handler.
+ */
+ cCpus = 1;
+#ifdef CONFIG_SMP
+ if ((fFlags & RTTIMER_FLAGS_CPU_ALL) == RTTIMER_FLAGS_CPU_ALL)
+ {
+ cCpus = RTMpGetMaxCpuId() + 1;
+ Assert(cCpus <= RTCPUSET_MAX_CPUS); /* On linux we have a 1:1 relationship between cpuid and set index. */
+ AssertReturnStmt(u64NanoInterval, IPRT_LINUX_RESTORE_EFL_AC(), VERR_NOT_IMPLEMENTED); /* We don't implement single shot on all cpus, sorry. */
+ }
+#endif
+
+ rc = RTMemAllocEx(RT_UOFFSETOF_DYN(RTTIMER, aSubTimers[cCpus]), 0,
+ RTMEMALLOCEX_FLAGS_ZEROED | RTMEMALLOCEX_FLAGS_ANY_CTX_FREE, (void **)&pTimer);
+ if (RT_FAILURE(rc))
+ {
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+ }
+
+ /*
+ * Initialize it.
+ */
+ pTimer->u32Magic = RTTIMER_MAGIC;
+ pTimer->hSpinlock = NIL_RTSPINLOCK;
+ pTimer->fSuspended = true;
+ pTimer->fHighRes = !!(fFlags & RTTIMER_FLAGS_HIGH_RES);
+#ifdef CONFIG_SMP
+ pTimer->fSpecificCpu = (fFlags & RTTIMER_FLAGS_CPU_SPECIFIC) && (fFlags & RTTIMER_FLAGS_CPU_ALL) != RTTIMER_FLAGS_CPU_ALL;
+ pTimer->fAllCpus = (fFlags & RTTIMER_FLAGS_CPU_ALL) == RTTIMER_FLAGS_CPU_ALL;
+ pTimer->idCpu = pTimer->fSpecificCpu
+ ? RTMpCpuIdFromSetIndex(fFlags & RTTIMER_FLAGS_CPU_MASK)
+ : NIL_RTCPUID;
+#else
+ pTimer->fSpecificCpu = !!(fFlags & RTTIMER_FLAGS_CPU_SPECIFIC);
+ pTimer->idCpu = RTMpCpuId();
+#endif
+ pTimer->cCpus = cCpus;
+ pTimer->pfnTimer = pfnTimer;
+ pTimer->pvUser = pvUser;
+ pTimer->u64NanoInterval = u64NanoInterval;
+ pTimer->cJiffies = u64NanoInterval / (RT_NS_1SEC / HZ);
+ if (pTimer->cJiffies * (RT_NS_1SEC / HZ) != u64NanoInterval)
+ pTimer->cJiffies = 0;
+ spin_lock_init(&pTimer->ChgIntLock);
+
+ for (iCpu = 0; iCpu < cCpus; iCpu++)
+ {
+#ifdef RTTIMER_LINUX_WITH_HRTIMER
+ if (pTimer->fHighRes)
+ {
+ hrtimer_init(&pTimer->aSubTimers[iCpu].u.Hr.LnxTimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ pTimer->aSubTimers[iCpu].u.Hr.LnxTimer.function = rtTimerLinuxHrCallback;
+ }
+ else
+#endif
+ {
+#if RTLNX_VER_MIN(4,15,0)
+ timer_setup(&pTimer->aSubTimers[iCpu].u.Std.LnxTimer, rtTimerLinuxStdCallback, TIMER_PINNED);
+#elif RTLNX_VER_MIN(4,8,0)
+ init_timer_pinned(&pTimer->aSubTimers[iCpu].u.Std.LnxTimer);
+#else
+ init_timer(&pTimer->aSubTimers[iCpu].u.Std.LnxTimer);
+#endif
+#if RTLNX_VER_MAX(4,15,0)
+ pTimer->aSubTimers[iCpu].u.Std.LnxTimer.data = (unsigned long)&pTimer->aSubTimers[iCpu];
+ pTimer->aSubTimers[iCpu].u.Std.LnxTimer.function = rtTimerLinuxStdCallback;
+#endif
+ pTimer->aSubTimers[iCpu].u.Std.LnxTimer.expires = jiffies;
+ pTimer->aSubTimers[iCpu].u.Std.u64NextTS = 0;
+ }
+ pTimer->aSubTimers[iCpu].iTick = 0;
+ pTimer->aSubTimers[iCpu].pParent = pTimer;
+ pTimer->aSubTimers[iCpu].enmState = RTTIMERLNXSTATE_STOPPED;
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * If this is running on ALL cpus, we'll have to register a callback
+ * for MP events (so timers can be started/stopped on cpus going
+ * online/offline). We also create the spinlock for synchronizing
+ * stop/start/mp-event.
+ */
+ if (cCpus > 1)
+ {
+ int rc = RTSpinlockCreate(&pTimer->hSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_SAFE, "RTTimerLnx");
+ if (RT_SUCCESS(rc))
+ rc = RTMpNotificationRegister(rtTimerLinuxMpEvent, pTimer);
+ else
+ pTimer->hSpinlock = NIL_RTSPINLOCK;
+ if (RT_FAILURE(rc))
+ {
+ RTTimerDestroy(pTimer);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return rc;
+ }
+ }
+#endif /* CONFIG_SMP */
+
+ RTTIMERLNX_LOG(("create %p hires=%d fFlags=%#x cCpus=%u\n", pTimer, pTimer->fHighRes, fFlags, cCpus));
+ *ppTimer = pTimer;
+ IPRT_LINUX_RESTORE_EFL_AC();
+ return VINF_SUCCESS;
+}
+RT_EXPORT_SYMBOL(RTTimerCreateEx);
+
+
+RTDECL(uint32_t) RTTimerGetSystemGranularity(void)
+{
+#if 0 /** @todo Not sure if this is what we want or not... Add new API for
+ * querying the resolution of the high res timers? */
+ struct timespec Ts;
+ int rc;
+ IPRT_LINUX_SAVE_EFL_AC();
+ rc = hrtimer_get_res(CLOCK_MONOTONIC, &Ts);
+ IPRT_LINUX_RESTORE_EFL_AC();
+ if (!rc)
+ {
+ Assert(!Ts.tv_sec);
+ return Ts.tv_nsec;
+ }
+#endif
+ /* */
+#if RTLNX_VER_MAX(4,9,0) || RTLNX_VER_MIN(4,13,0)
+ /* On 4.9, 4.10 and 4.12 we've observed tstRTR0Timer failures of the omni timer tests
+ where we get about half of the ticks we want. The failing test is using this value
+ as interval. So, this is a very very crude hack to try make omni timers work
+ correctly without actually knowing what's going wrong... */
+ return RT_NS_1SEC * 2 / HZ; /* ns */
+#else
+ return RT_NS_1SEC / HZ; /* ns */
+#endif
+}
+RT_EXPORT_SYMBOL(RTTimerGetSystemGranularity);
+
+
+RTDECL(int) RTTimerRequestSystemGranularity(uint32_t u32Request, uint32_t *pu32Granted)
+{
+ RT_NOREF_PV(u32Request); RT_NOREF_PV(*pu32Granted);
+ return VERR_NOT_SUPPORTED;
+}
+RT_EXPORT_SYMBOL(RTTimerRequestSystemGranularity);
+
+
+RTDECL(int) RTTimerReleaseSystemGranularity(uint32_t u32Granted)
+{
+ RT_NOREF_PV(u32Granted);
+ return VERR_NOT_SUPPORTED;
+}
+RT_EXPORT_SYMBOL(RTTimerReleaseSystemGranularity);
+
+
+RTDECL(bool) RTTimerCanDoHighResolution(void)
+{
+#ifdef RTTIMER_LINUX_WITH_HRTIMER
+ return true;
+#else
+ return false;
+#endif
+}
+RT_EXPORT_SYMBOL(RTTimerCanDoHighResolution);
+
diff --git a/src/VBox/Runtime/r0drv/linux/waitqueue-r0drv-linux.h b/src/VBox/Runtime/r0drv/linux/waitqueue-r0drv-linux.h
new file mode 100644
index 00000000..0ef103fe
--- /dev/null
+++ b/src/VBox/Runtime/r0drv/linux/waitqueue-r0drv-linux.h
@@ -0,0 +1,302 @@
+/* $Id: waitqueue-r0drv-linux.h $ */
+/** @file
+ * IPRT - Linux Ring-0 Driver Helpers for Abstracting Wait Queues,
+ */
+
+/*
+ * Copyright (C) 2006-2023 Oracle and/or its affiliates.
+ *
+ * This file is part of VirtualBox base platform packages, as
+ * available from https://www.virtualbox.org.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, in version 3 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses>.
+ *
+ * The contents of this file may alternatively be used under the terms
+ * of the Common Development and Distribution License Version 1.0
+ * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
+ * in the VirtualBox distribution, in which case the provisions of the
+ * CDDL are applicable instead of those of the GPL.
+ *
+ * You may elect to license modified versions of this file under the
+ * terms and conditions of either the GPL or the CDDL or both.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
+ */
+
+#ifndef IPRT_INCLUDED_SRC_r0drv_linux_waitqueue_r0drv_linux_h
+#define IPRT_INCLUDED_SRC_r0drv_linux_waitqueue_r0drv_linux_h
+#ifndef RT_WITHOUT_PRAGMA_ONCE
+# pragma once
+#endif
+
+#include "the-linux-kernel.h"
+
+#include <iprt/asm-math.h>
+#include <iprt/err.h>
+#include <iprt/string.h>
+#include <iprt/time.h>
+
+/** The resolution (nanoseconds) specified when using
+ * schedule_hrtimeout_range. */
+#define RTR0SEMLNXWAIT_RESOLUTION 50000
+
+
+/**
+ * Kernel mode Linux wait state structure.
+ */
+typedef struct RTR0SEMLNXWAIT
+{
+ /** The wait queue entry. */
+#if RTLNX_VER_MIN(4,13,0) || RTLNX_SUSE_MAJ_PREREQ(12, 4) || RTLNX_SUSE_MAJ_PREREQ(15, 0)
+ wait_queue_entry_t WaitQE;
+#else
+ wait_queue_t WaitQE;
+#endif
+ /** The absolute timeout given as nano seconds since the start of the
+ * monotonic clock. */
+ uint64_t uNsAbsTimeout;
+ /** The timeout in nano seconds relative to the start of the wait. */
+ uint64_t cNsRelTimeout;
+ /** The native timeout value. */
+ union
+ {
+#ifdef IPRT_LINUX_HAS_HRTIMER
+ /** The timeout when fHighRes is true. Absolute, so no updating. */
+ ktime_t KtTimeout;
+#endif
+ /** The timeout when fHighRes is false. Updated after waiting. */
+ long lTimeout;
+ } u;
+ /** Set if we use high resolution timeouts. */
+ bool fHighRes;
+ /** Set if it's an indefinite wait. */
+ bool fIndefinite;
+ /** Set if we've already timed out.
+ * Set by rtR0SemLnxWaitDoIt and read by rtR0SemLnxWaitHasTimedOut. */
+ bool fTimedOut;
+ /** TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE. */
+ int iWaitState;
+ /** The wait queue. */
+ wait_queue_head_t *pWaitQueue;
+} RTR0SEMLNXWAIT;
+/** Pointer to a linux wait state. */
+typedef RTR0SEMLNXWAIT *PRTR0SEMLNXWAIT;
+
+
+/**
+ * Initializes a wait.
+ *
+ * The caller MUST check the wait condition BEFORE calling this function or the
+ * timeout logic will be flawed.
+ *
+ * @returns VINF_SUCCESS or VERR_TIMEOUT.
+ * @param pWait The wait structure.
+ * @param fFlags The wait flags.
+ * @param uTimeout The timeout.
+ * @param pWaitQueue The wait queue head.
+ */
+DECLINLINE(int) rtR0SemLnxWaitInit(PRTR0SEMLNXWAIT pWait, uint32_t fFlags, uint64_t uTimeout,
+ wait_queue_head_t *pWaitQueue)
+{
+ /*
+ * Process the flags and timeout.
+ */
+ if (!(fFlags & RTSEMWAIT_FLAGS_INDEFINITE))
+ {
+/** @todo optimize: millisecs -> nanosecs -> millisec -> jiffies */
+ if (fFlags & RTSEMWAIT_FLAGS_MILLISECS)
+ uTimeout = uTimeout < UINT64_MAX / RT_US_1SEC * RT_US_1SEC
+ ? uTimeout * RT_US_1SEC
+ : UINT64_MAX;
+ if (uTimeout == UINT64_MAX)
+ fFlags |= RTSEMWAIT_FLAGS_INDEFINITE;
+ else
+ {
+ uint64_t u64Now;
+ if (fFlags & RTSEMWAIT_FLAGS_RELATIVE)
+ {
+ if (uTimeout == 0)
+ return VERR_TIMEOUT;
+
+ u64Now = RTTimeSystemNanoTS();
+ pWait->cNsRelTimeout = uTimeout;
+ pWait->uNsAbsTimeout = u64Now + uTimeout;
+ if (pWait->uNsAbsTimeout < u64Now) /* overflow */
+ fFlags |= RTSEMWAIT_FLAGS_INDEFINITE;
+ }
+ else
+ {
+ u64Now = RTTimeSystemNanoTS();
+ if (u64Now >= uTimeout)
+ return VERR_TIMEOUT;
+
+ pWait->cNsRelTimeout = uTimeout - u64Now;
+ pWait->uNsAbsTimeout = uTimeout;
+ }
+ }
+ }
+
+ if (!(fFlags & RTSEMWAIT_FLAGS_INDEFINITE))
+ {
+ pWait->fIndefinite = false;
+#ifdef IPRT_LINUX_HAS_HRTIMER
+ if ( (fFlags & (RTSEMWAIT_FLAGS_NANOSECS | RTSEMWAIT_FLAGS_ABSOLUTE))
+ || pWait->cNsRelTimeout < RT_NS_1SEC / HZ * 4)
+ {
+ pWait->fHighRes = true;
+# if BITS_PER_LONG < 64
+ if ( KTIME_SEC_MAX <= LONG_MAX
+ && pWait->uNsAbsTimeout >= KTIME_SEC_MAX * RT_NS_1SEC_64 + (RT_NS_1SEC - 1))
+ fFlags |= RTSEMWAIT_FLAGS_INDEFINITE;
+ else
+# endif
+ pWait->u.KtTimeout = ns_to_ktime(pWait->uNsAbsTimeout);
+ }
+ else
+#endif
+ {
+ uint64_t cJiffies = ASMMultU64ByU32DivByU32(pWait->cNsRelTimeout, HZ, RT_NS_1SEC);
+ if (cJiffies >= MAX_JIFFY_OFFSET)
+ fFlags |= RTSEMWAIT_FLAGS_INDEFINITE;
+ else
+ {
+ pWait->u.lTimeout = (long)cJiffies;
+ pWait->fHighRes = false;
+ }
+ }
+ }
+
+ if (fFlags & RTSEMWAIT_FLAGS_INDEFINITE)
+ {
+ pWait->fIndefinite = true;
+ pWait->fHighRes = false;
+ pWait->uNsAbsTimeout = UINT64_MAX;
+ pWait->cNsRelTimeout = UINT64_MAX;
+ pWait->u.lTimeout = LONG_MAX;
+ }
+
+ pWait->fTimedOut = false;
+
+ /*
+ * Initialize the wait queue related bits.
+ */
+#if RTLNX_VER_MIN(2,5,39)
+ init_wait((&pWait->WaitQE));
+#else
+ RT_ZERO(pWait->WaitQE);
+ init_waitqueue_entry((&pWait->WaitQE), current);
+#endif
+ pWait->pWaitQueue = pWaitQueue;
+ pWait->iWaitState = fFlags & RTSEMWAIT_FLAGS_INTERRUPTIBLE
+ ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
+
+ return VINF_SUCCESS;
+}
+
+
+/**
+ * Prepares the next wait.
+ *
+ * This must be called before rtR0SemLnxWaitDoIt, and the caller should check
+ * the exit conditions in-between the two calls.
+ *
+ * @param pWait The wait structure.
+ */
+DECLINLINE(void) rtR0SemLnxWaitPrepare(PRTR0SEMLNXWAIT pWait)
+{
+ /* Make everything thru schedule*() atomic scheduling wise. (Is this correct?) */
+ prepare_to_wait(pWait->pWaitQueue, &pWait->WaitQE, pWait->iWaitState);
+}
+
+
+/**
+ * Do the actual wait.
+ *
+ * @param pWait The wait structure.
+ */
+DECLINLINE(void) rtR0SemLnxWaitDoIt(PRTR0SEMLNXWAIT pWait)
+{
+ if (pWait->fIndefinite)
+ schedule();
+#ifdef IPRT_LINUX_HAS_HRTIMER
+ else if (pWait->fHighRes)
+ {
+ int rc = schedule_hrtimeout_range(&pWait->u.KtTimeout, HRTIMER_MODE_ABS, RTR0SEMLNXWAIT_RESOLUTION);
+ if (!rc)
+ pWait->fTimedOut = true;
+ }
+#endif
+ else
+ {
+ pWait->u.lTimeout = schedule_timeout(pWait->u.lTimeout);
+ if (pWait->u.lTimeout <= 0)
+ pWait->fTimedOut = true;
+ }
+ after_wait((&pWait->WaitQE));
+}
+
+
+/**
+ * Checks if a linux wait was interrupted.
+ *
+ * @returns true / false
+ * @param pWait The wait structure.
+ * @remarks This shall be called before the first rtR0SemLnxWaitDoIt().
+ */
+DECLINLINE(bool) rtR0SemLnxWaitWasInterrupted(PRTR0SEMLNXWAIT pWait)
+{
+ return pWait->iWaitState == TASK_INTERRUPTIBLE
+ && signal_pending(current);
+}
+
+
+/**
+ * Checks if a linux wait has timed out.
+ *
+ * @returns true / false
+ * @param pWait The wait structure.
+ */
+DECLINLINE(bool) rtR0SemLnxWaitHasTimedOut(PRTR0SEMLNXWAIT pWait)
+{
+ return pWait->fTimedOut;
+}
+
+
+/**
+ * Deletes a linux wait.
+ *
+ * @param pWait The wait structure.
+ */
+DECLINLINE(void) rtR0SemLnxWaitDelete(PRTR0SEMLNXWAIT pWait)
+{
+ finish_wait(pWait->pWaitQueue, &pWait->WaitQE);
+}
+
+
+/**
+ * Gets the max resolution of the timeout machinery.
+ *
+ * @returns Resolution specified in nanoseconds.
+ */
+DECLINLINE(uint32_t) rtR0SemLnxWaitGetResolution(void)
+{
+#ifdef IPRT_LINUX_HAS_HRTIMER
+ return RTR0SEMLNXWAIT_RESOLUTION;
+#else
+ return RT_NS_1SEC / HZ; /* ns */
+#endif
+}
+
+#endif /* !IPRT_INCLUDED_SRC_r0drv_linux_waitqueue_r0drv_linux_h */
+