diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 14:19:18 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 14:19:18 +0000 |
commit | 4035b1bfb1e5843a539a8b624d21952b756974d1 (patch) | |
tree | f1e9cd5bf548cbc57ff2fddfb2b4aa9ae95587e2 /src/VBox/Runtime/r0drv/linux | |
parent | Initial commit. (diff) | |
download | virtualbox-upstream.tar.xz virtualbox-upstream.zip |
Adding upstream version 6.1.22-dfsg.upstream/6.1.22-dfsgupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
24 files changed, 8827 insertions, 0 deletions
diff --git a/src/VBox/Runtime/r0drv/linux/Makefile.kup b/src/VBox/Runtime/r0drv/linux/Makefile.kup new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/Makefile.kup diff --git a/src/VBox/Runtime/r0drv/linux/RTLogWriteDebugger-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/RTLogWriteDebugger-r0drv-linux.c new file mode 100644 index 00000000..460094ef --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/RTLogWriteDebugger-r0drv-linux.c @@ -0,0 +1,43 @@ +/* $Id: RTLogWriteDebugger-r0drv-linux.c $ */ +/** @file + * IPRT - Log To Debugger, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" +#include <iprt/log.h> + + +RTDECL(void) RTLogWriteDebugger(const char *pch, size_t cb) +{ + IPRT_LINUX_SAVE_EFL_AC(); + printk("%.*s", (int)cb, pch); + IPRT_LINUX_RESTORE_EFL_AC(); +} +RT_EXPORT_SYMBOL(RTLogWriteDebugger); + diff --git a/src/VBox/Runtime/r0drv/linux/alloc-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/alloc-r0drv-linux.c new file mode 100644 index 00000000..1ae48f9f --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/alloc-r0drv-linux.c @@ -0,0 +1,497 @@ +/* $Id: alloc-r0drv-linux.c $ */ +/** @file + * IPRT - Memory Allocation, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" +#include <iprt/mem.h> + +#include <iprt/assert.h> +#include <iprt/errcore.h> +#include "r0drv/alloc-r0drv.h" + + +#if (defined(RT_ARCH_AMD64) || defined(DOXYGEN_RUNNING)) && !defined(RTMEMALLOC_EXEC_HEAP) +# if RTLNX_VER_MIN(2,6,23) && RTLNX_VER_MAX(5,8,0) +/** + * Starting with 2.6.23 we can use __get_vm_area and map_vm_area to allocate + * memory in the moduel range. This is preferrable to the exec heap below. + */ +# define RTMEMALLOC_EXEC_VM_AREA +# else +/** + * We need memory in the module range (~2GB to ~0) this can only be obtained + * thru APIs that are not exported (see module_alloc()). + * + * So, we'll have to create a quick and dirty heap here using BSS memory. + * Very annoying and it's going to restrict us! + */ +# define RTMEMALLOC_EXEC_HEAP +# endif +#endif + +#ifdef RTMEMALLOC_EXEC_HEAP +# include <iprt/heap.h> +# include <iprt/spinlock.h> +# include <iprt/errcore.h> +#endif + +#include "internal/initterm.h" + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +#ifdef RTMEMALLOC_EXEC_VM_AREA +/** + * Extended header used for headers marked with RTMEMHDR_FLAG_EXEC_VM_AREA. + * + * This is used with allocating executable memory, for things like generated + * code and loaded modules. + */ +typedef struct RTMEMLNXHDREX +{ + /** The VM area for this allocation. */ + struct vm_struct *pVmArea; + void *pvDummy; + /** The header we present to the generic API. */ + RTMEMHDR Hdr; +} RTMEMLNXHDREX; +AssertCompileSize(RTMEMLNXHDREX, 32); +/** Pointer to an extended memory header. */ +typedef RTMEMLNXHDREX *PRTMEMLNXHDREX; +#endif + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +#ifdef RTMEMALLOC_EXEC_HEAP +/** The heap. */ +static RTHEAPSIMPLE g_HeapExec = NIL_RTHEAPSIMPLE; +/** Spinlock protecting the heap. */ +static RTSPINLOCK g_HeapExecSpinlock = NIL_RTSPINLOCK; +#endif + + +/** + * API for cleaning up the heap spinlock on IPRT termination. + * This is as RTMemExecDonate specific to AMD64 Linux/GNU. + */ +DECLHIDDEN(void) rtR0MemExecCleanup(void) +{ +#ifdef RTMEMALLOC_EXEC_HEAP + RTSpinlockDestroy(g_HeapExecSpinlock); + g_HeapExecSpinlock = NIL_RTSPINLOCK; +#endif +} + + +/** + * Donate read+write+execute memory to the exec heap. + * + * This API is specific to AMD64 and Linux/GNU. A kernel module that desires to + * use RTMemExecAlloc on AMD64 Linux/GNU will have to donate some statically + * allocated memory in the module if it wishes for GCC generated code to work. + * GCC can only generate modules that work in the address range ~2GB to ~0 + * currently. + * + * The API only accept one single donation. + * + * @returns IPRT status code. + * @retval VERR_NOT_SUPPORTED if the code isn't enabled. + * @param pvMemory Pointer to the memory block. + * @param cb The size of the memory block. + */ +RTR0DECL(int) RTR0MemExecDonate(void *pvMemory, size_t cb) +{ +#ifdef RTMEMALLOC_EXEC_HEAP + int rc; + AssertReturn(g_HeapExec == NIL_RTHEAPSIMPLE, VERR_WRONG_ORDER); + + rc = RTSpinlockCreate(&g_HeapExecSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_SAFE, "RTR0MemExecDonate"); + if (RT_SUCCESS(rc)) + { + rc = RTHeapSimpleInit(&g_HeapExec, pvMemory, cb); + if (RT_FAILURE(rc)) + rtR0MemExecCleanup(); + } + return rc; +#else + RT_NOREF_PV(pvMemory); RT_NOREF_PV(cb); + return VERR_NOT_SUPPORTED; +#endif +} +RT_EXPORT_SYMBOL(RTR0MemExecDonate); + + + +#ifdef RTMEMALLOC_EXEC_VM_AREA +/** + * Allocate executable kernel memory in the module range. + * + * @returns Pointer to a allocation header success. NULL on failure. + * + * @param cb The size the user requested. + */ +static PRTMEMHDR rtR0MemAllocExecVmArea(size_t cb) +{ + size_t const cbAlloc = RT_ALIGN_Z(sizeof(RTMEMLNXHDREX) + cb, PAGE_SIZE); + size_t const cPages = cbAlloc >> PAGE_SHIFT; + struct page **papPages; + struct vm_struct *pVmArea; + size_t iPage; + + pVmArea = __get_vm_area(cbAlloc, VM_ALLOC, MODULES_VADDR, MODULES_END); + if (!pVmArea) + return NULL; + pVmArea->nr_pages = 0; /* paranoia? */ + pVmArea->pages = NULL; /* paranoia? */ + + papPages = (struct page **)kmalloc(cPages * sizeof(papPages[0]), GFP_KERNEL | __GFP_NOWARN); + if (!papPages) + { + vunmap(pVmArea->addr); + return NULL; + } + + for (iPage = 0; iPage < cPages; iPage++) + { + papPages[iPage] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN); + if (!papPages[iPage]) + break; + } + if (iPage == cPages) + { + /* + * Map the pages. + * + * Not entirely sure we really need to set nr_pages and pages here, but + * they provide a very convenient place for storing something we need + * in the free function, if nothing else... + */ +# if RTLNX_VER_MAX(3,17,0) + struct page **papPagesIterator = papPages; +# endif + pVmArea->nr_pages = cPages; + pVmArea->pages = papPages; + if (!map_vm_area(pVmArea, PAGE_KERNEL_EXEC, +# if RTLNX_VER_MAX(3,17,0) + &papPagesIterator +# else + papPages +# endif + )) + { + PRTMEMLNXHDREX pHdrEx = (PRTMEMLNXHDREX)pVmArea->addr; + pHdrEx->pVmArea = pVmArea; + pHdrEx->pvDummy = NULL; + return &pHdrEx->Hdr; + } + /* bail out */ +# if RTLNX_VER_MAX(3,17,0) + pVmArea->nr_pages = papPagesIterator - papPages; +# endif + } + + vunmap(pVmArea->addr); + + while (iPage-- > 0) + __free_page(papPages[iPage]); + kfree(papPages); + + return NULL; +} +#endif /* RTMEMALLOC_EXEC_VM_AREA */ + + +/** + * OS specific allocation function. + */ +DECLHIDDEN(int) rtR0MemAllocEx(size_t cb, uint32_t fFlags, PRTMEMHDR *ppHdr) +{ + PRTMEMHDR pHdr; + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Allocate. + */ + if (fFlags & RTMEMHDR_FLAG_EXEC) + { + if (fFlags & RTMEMHDR_FLAG_ANY_CTX) + return VERR_NOT_SUPPORTED; + +#if defined(RT_ARCH_AMD64) +# ifdef RTMEMALLOC_EXEC_HEAP + if (g_HeapExec != NIL_RTHEAPSIMPLE) + { + RTSpinlockAcquire(g_HeapExecSpinlock); + pHdr = (PRTMEMHDR)RTHeapSimpleAlloc(g_HeapExec, cb + sizeof(*pHdr), 0); + RTSpinlockRelease(g_HeapExecSpinlock); + fFlags |= RTMEMHDR_FLAG_EXEC_HEAP; + } + else + pHdr = NULL; + +# elif defined(RTMEMALLOC_EXEC_VM_AREA) + pHdr = rtR0MemAllocExecVmArea(cb); + fFlags |= RTMEMHDR_FLAG_EXEC_VM_AREA; + +# else /* !RTMEMALLOC_EXEC_HEAP */ +# error "you don not want to go here..." + pHdr = (PRTMEMHDR)__vmalloc(cb + sizeof(*pHdr), GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, MY_PAGE_KERNEL_EXEC); +# endif /* !RTMEMALLOC_EXEC_HEAP */ + +#elif defined(PAGE_KERNEL_EXEC) && defined(CONFIG_X86_PAE) + pHdr = (PRTMEMHDR)__vmalloc(cb + sizeof(*pHdr), GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, MY_PAGE_KERNEL_EXEC); +#else + pHdr = (PRTMEMHDR)vmalloc(cb + sizeof(*pHdr)); +#endif + } + else + { + if ( +#if 1 /* vmalloc has serious performance issues, avoid it. */ + cb <= PAGE_SIZE*16 - sizeof(*pHdr) +#else + cb <= PAGE_SIZE +#endif + || (fFlags & RTMEMHDR_FLAG_ANY_CTX) + ) + { + fFlags |= RTMEMHDR_FLAG_KMALLOC; + pHdr = kmalloc(cb + sizeof(*pHdr), + (fFlags & RTMEMHDR_FLAG_ANY_CTX_ALLOC) ? (GFP_ATOMIC | __GFP_NOWARN) + : (GFP_KERNEL | __GFP_NOWARN)); + if (RT_UNLIKELY( !pHdr + && cb > PAGE_SIZE + && !(fFlags & RTMEMHDR_FLAG_ANY_CTX) )) + { + fFlags &= ~RTMEMHDR_FLAG_KMALLOC; + pHdr = vmalloc(cb + sizeof(*pHdr)); + } + } + else + pHdr = vmalloc(cb + sizeof(*pHdr)); + } + if (RT_UNLIKELY(!pHdr)) + { + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_NO_MEMORY; + } + + /* + * Initialize. + */ + pHdr->u32Magic = RTMEMHDR_MAGIC; + pHdr->fFlags = fFlags; + pHdr->cb = cb; + pHdr->cbReq = cb; + + *ppHdr = pHdr; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} + + +/** + * OS specific free function. + */ +DECLHIDDEN(void) rtR0MemFree(PRTMEMHDR pHdr) +{ + IPRT_LINUX_SAVE_EFL_AC(); + + pHdr->u32Magic += 1; + if (pHdr->fFlags & RTMEMHDR_FLAG_KMALLOC) + kfree(pHdr); +#ifdef RTMEMALLOC_EXEC_HEAP + else if (pHdr->fFlags & RTMEMHDR_FLAG_EXEC_HEAP) + { + RTSpinlockAcquire(g_HeapExecSpinlock); + RTHeapSimpleFree(g_HeapExec, pHdr); + RTSpinlockRelease(g_HeapExecSpinlock); + } +#endif +#ifdef RTMEMALLOC_EXEC_VM_AREA + else if (pHdr->fFlags & RTMEMHDR_FLAG_EXEC_VM_AREA) + { + PRTMEMLNXHDREX pHdrEx = RT_FROM_MEMBER(pHdr, RTMEMLNXHDREX, Hdr); + size_t iPage = pHdrEx->pVmArea->nr_pages; + struct page **papPages = pHdrEx->pVmArea->pages; + void *pvMapping = pHdrEx->pVmArea->addr; + + vunmap(pvMapping); + + while (iPage-- > 0) + __free_page(papPages[iPage]); + kfree(papPages); + } +#endif + else + vfree(pHdr); + + IPRT_LINUX_RESTORE_EFL_AC(); +} + + + +/** + * Compute order. Some functions allocate 2^order pages. + * + * @returns order. + * @param cPages Number of pages. + */ +static int CalcPowerOf2Order(unsigned long cPages) +{ + int iOrder; + unsigned long cTmp; + + for (iOrder = 0, cTmp = cPages; cTmp >>= 1; ++iOrder) + ; + if (cPages & ~(1 << iOrder)) + ++iOrder; + + return iOrder; +} + + +/** + * Allocates physical contiguous memory (below 4GB). + * The allocation is page aligned and the content is undefined. + * + * @returns Pointer to the memory block. This is page aligned. + * @param pPhys Where to store the physical address. + * @param cb The allocation size in bytes. This is always + * rounded up to PAGE_SIZE. + */ +RTR0DECL(void *) RTMemContAlloc(PRTCCPHYS pPhys, size_t cb) +{ + int cOrder; + unsigned cPages; + struct page *paPages; + void *pvRet; + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * validate input. + */ + Assert(VALID_PTR(pPhys)); + Assert(cb > 0); + + /* + * Allocate page pointer array. + */ + cb = RT_ALIGN_Z(cb, PAGE_SIZE); + cPages = cb >> PAGE_SHIFT; + cOrder = CalcPowerOf2Order(cPages); +#if (defined(RT_ARCH_AMD64) || defined(CONFIG_X86_PAE)) && defined(GFP_DMA32) + /* ZONE_DMA32: 0-4GB */ + paPages = alloc_pages(GFP_DMA32 | __GFP_NOWARN, cOrder); + if (!paPages) +#endif +#ifdef RT_ARCH_AMD64 + /* ZONE_DMA; 0-16MB */ + paPages = alloc_pages(GFP_DMA | __GFP_NOWARN, cOrder); +#else + /* ZONE_NORMAL: 0-896MB */ + paPages = alloc_pages(GFP_USER | __GFP_NOWARN, cOrder); +#endif + if (paPages) + { + /* + * Reserve the pages and mark them executable. + */ + unsigned iPage; + for (iPage = 0; iPage < cPages; iPage++) + { + Assert(!PageHighMem(&paPages[iPage])); + if (iPage + 1 < cPages) + { + AssertMsg( (uintptr_t)phys_to_virt(page_to_phys(&paPages[iPage])) + PAGE_SIZE + == (uintptr_t)phys_to_virt(page_to_phys(&paPages[iPage + 1])) + && page_to_phys(&paPages[iPage]) + PAGE_SIZE + == page_to_phys(&paPages[iPage + 1]), + ("iPage=%i cPages=%u [0]=%#llx,%p [1]=%#llx,%p\n", iPage, cPages, + (long long)page_to_phys(&paPages[iPage]), phys_to_virt(page_to_phys(&paPages[iPage])), + (long long)page_to_phys(&paPages[iPage + 1]), phys_to_virt(page_to_phys(&paPages[iPage + 1])) )); + } + + SetPageReserved(&paPages[iPage]); + } + *pPhys = page_to_phys(paPages); + pvRet = phys_to_virt(page_to_phys(paPages)); + } + else + pvRet = NULL; + + IPRT_LINUX_RESTORE_EFL_AC(); + return pvRet; +} +RT_EXPORT_SYMBOL(RTMemContAlloc); + + +/** + * Frees memory allocated using RTMemContAlloc(). + * + * @param pv Pointer to return from RTMemContAlloc(). + * @param cb The cb parameter passed to RTMemContAlloc(). + */ +RTR0DECL(void) RTMemContFree(void *pv, size_t cb) +{ + if (pv) + { + int cOrder; + unsigned cPages; + unsigned iPage; + struct page *paPages; + IPRT_LINUX_SAVE_EFL_AC(); + + /* validate */ + AssertMsg(!((uintptr_t)pv & PAGE_OFFSET_MASK), ("pv=%p\n", pv)); + Assert(cb > 0); + + /* calc order and get pages */ + cb = RT_ALIGN_Z(cb, PAGE_SIZE); + cPages = cb >> PAGE_SHIFT; + cOrder = CalcPowerOf2Order(cPages); + paPages = virt_to_page(pv); + + /* + * Restore page attributes freeing the pages. + */ + for (iPage = 0; iPage < cPages; iPage++) + { + ClearPageReserved(&paPages[iPage]); + } + __free_pages(paPages, cOrder); + IPRT_LINUX_RESTORE_EFL_AC(); + } +} +RT_EXPORT_SYMBOL(RTMemContFree); + diff --git a/src/VBox/Runtime/r0drv/linux/assert-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/assert-r0drv-linux.c new file mode 100644 index 00000000..ba78b437 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/assert-r0drv-linux.c @@ -0,0 +1,74 @@ +/* $Id: assert-r0drv-linux.c $ */ +/** @file + * IPRT - Assertion Workers, Ring-0 Drivers, Linux. + */ + +/* + * Copyright (C) 2007-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" + +#include <iprt/assert.h> +#include <iprt/log.h> +#include <iprt/string.h> +#include <iprt/stdarg.h> +#include <iprt/asm.h> + +#include "internal/assert.h" + + +DECLHIDDEN(void) rtR0AssertNativeMsg1(const char *pszExpr, unsigned uLine, const char *pszFile, const char *pszFunction) +{ + IPRT_LINUX_SAVE_EFL_AC(); + printk(KERN_EMERG + "\r\n!!Assertion Failed!!\r\n" + "Expression: %s\r\n" + "Location : %s(%d) %s\r\n", + pszExpr, pszFile, uLine, pszFunction); + IPRT_LINUX_RESTORE_EFL_AC(); +} + + +DECLHIDDEN(void) rtR0AssertNativeMsg2V(bool fInitial, const char *pszFormat, va_list va) +{ + char szMsg[256]; + IPRT_LINUX_SAVE_EFL_AC(); + + RTStrPrintfV(szMsg, sizeof(szMsg) - 1, pszFormat, va); + szMsg[sizeof(szMsg) - 1] = '\0'; + printk(KERN_EMERG "%s", szMsg); + + NOREF(fInitial); + IPRT_LINUX_RESTORE_EFL_AC(); +} + + +RTR0DECL(void) RTR0AssertPanicSystem(void) +{ + panic("%s%s", g_szRTAssertMsg1, g_szRTAssertMsg2); +} +RT_EXPORT_SYMBOL(RTR0AssertPanicSystem); + diff --git a/src/VBox/Runtime/r0drv/linux/initterm-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/initterm-r0drv-linux.c new file mode 100644 index 00000000..9d6b4d4e --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/initterm-r0drv-linux.c @@ -0,0 +1,130 @@ +/* $Id: initterm-r0drv-linux.c $ */ +/** @file + * IPRT - Initialization & Termination, R0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" +#include <iprt/errcore.h> +#include <iprt/assert.h> +#include "internal/initterm.h" + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +/** The IPRT work queue. */ +#if RTLNX_VER_MIN(2,5,41) +static struct workqueue_struct *g_prtR0LnxWorkQueue; +#else +static DECLARE_TASK_QUEUE(g_rtR0LnxWorkQueue); +#endif + + +/** + * Pushes an item onto the IPRT work queue. + * + * @param pWork The work item. + * @param pfnWorker The callback function. It will be called back + * with @a pWork as argument. + */ +DECLHIDDEN(void) rtR0LnxWorkqueuePush(RTR0LNXWORKQUEUEITEM *pWork, void (*pfnWorker)(RTR0LNXWORKQUEUEITEM *)) +{ + IPRT_LINUX_SAVE_EFL_AC(); + +#if RTLNX_VER_MIN(2,5,41) +# if RTLNX_VER_MIN(2,6,20) + INIT_WORK(pWork, pfnWorker); +# else + INIT_WORK(pWork, (void (*)(void *))pfnWorker, pWork); +# endif + queue_work(g_prtR0LnxWorkQueue, pWork); +#else + INIT_TQUEUE(pWork, (void (*)(void *))pfnWorker, pWork); + queue_task(pWork, &g_rtR0LnxWorkQueue); +#endif + + IPRT_LINUX_RESTORE_EFL_AC(); +} + + +/** + * Flushes all items in the IPRT work queue. + * + * @remarks This is mostly for 2.4.x compatability. Must not be called from + * atomic contexts or with unncessary locks held. + */ +DECLHIDDEN(void) rtR0LnxWorkqueueFlush(void) +{ + IPRT_LINUX_SAVE_EFL_AC(); + +#if RTLNX_VER_MIN(2,5,41) + flush_workqueue(g_prtR0LnxWorkQueue); +#else + run_task_queue(&g_rtR0LnxWorkQueue); +#endif + + IPRT_LINUX_RESTORE_EFL_AC(); +} + + +DECLHIDDEN(int) rtR0InitNative(void) +{ + int rc = VINF_SUCCESS; + IPRT_LINUX_SAVE_EFL_AC(); + +#if RTLNX_VER_MIN(2,5,41) + #if RTLNX_VER_MIN(2,6,13) + g_prtR0LnxWorkQueue = create_workqueue("iprt-VBoxWQueue"); + #else + g_prtR0LnxWorkQueue = create_workqueue("iprt-VBoxQ"); + #endif + if (!g_prtR0LnxWorkQueue) + rc = VERR_NO_MEMORY; +#endif + + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +} + + +DECLHIDDEN(void) rtR0TermNative(void) +{ + IPRT_LINUX_SAVE_EFL_AC(); + + rtR0LnxWorkqueueFlush(); +#if RTLNX_VER_MIN(2,5,41) + destroy_workqueue(g_prtR0LnxWorkQueue); + g_prtR0LnxWorkQueue = NULL; +#endif + + rtR0MemExecCleanup(); + + IPRT_LINUX_RESTORE_EFL_AC(); +} + diff --git a/src/VBox/Runtime/r0drv/linux/memobj-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/memobj-r0drv-linux.c new file mode 100644 index 00000000..6ae59089 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/memobj-r0drv-linux.c @@ -0,0 +1,1966 @@ +/* $Id: memobj-r0drv-linux.c $ */ +/** @file + * IPRT - Ring-0 Memory Objects, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" + +#include <iprt/memobj.h> +#include <iprt/assert.h> +#include <iprt/err.h> +#include <iprt/log.h> +#include <iprt/mem.h> +#include <iprt/process.h> +#include <iprt/string.h> +#include "internal/memobj.h" +#include "internal/iprt.h" + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ +/* early 2.6 kernels */ +#ifndef PAGE_SHARED_EXEC +# define PAGE_SHARED_EXEC PAGE_SHARED +#endif +#ifndef PAGE_READONLY_EXEC +# define PAGE_READONLY_EXEC PAGE_READONLY +#endif + +/** @def IPRT_USE_ALLOC_VM_AREA_FOR_EXEC + * Whether we use alloc_vm_area (3.2+) for executable memory. + * This is a must for 5.8+, but we enable it all the way back to 3.2.x for + * better W^R compliance (fExecutable flag). */ +#if RTLNX_VER_RANGE(3,2,0, 5,10,0) || defined(DOXYGEN_RUNNING) +# define IPRT_USE_ALLOC_VM_AREA_FOR_EXEC +#endif +/** @def IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC + * alloc_vm_area was removed with 5.10 so we have to resort to a different way + * to allocate executable memory. + * It would be possible to remove IPRT_USE_ALLOC_VM_AREA_FOR_EXEC and use + * this path execlusively for 3.2+ but no time to test it really works on every + * supported kernel, so better play safe for now. + */ +#if RTLNX_VER_MIN(5,10,0) || defined(DOXYGEN_RUNNING) +# define IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC +#endif + +/* + * 2.6.29+ kernels don't work with remap_pfn_range() anymore because + * track_pfn_vma_new() is apparently not defined for non-RAM pages. + * It should be safe to use vm_insert_page() older kernels as well. + */ +#if RTLNX_VER_MIN(2,6,23) +# define VBOX_USE_INSERT_PAGE +#endif +#if defined(CONFIG_X86_PAE) \ + && ( defined(HAVE_26_STYLE_REMAP_PAGE_RANGE) \ + || RTLNX_VER_RANGE(2,6,0, 2,6,11) ) +# define VBOX_USE_PAE_HACK +#endif + +/* gfp_t was introduced in 2.6.14, define it for earlier. */ +#if RTLNX_VER_MAX(2,6,14) +# define gfp_t unsigned +#endif + +/* + * Wrappers around mmap_lock/mmap_sem difference. + */ +#if RTLNX_VER_MIN(5,8,0) +# define LNX_MM_DOWN_READ(a_pMm) down_read(&(a_pMm)->mmap_lock) +# define LNX_MM_UP_READ(a_pMm) up_read(&(a_pMm)->mmap_lock) +# define LNX_MM_DOWN_WRITE(a_pMm) down_write(&(a_pMm)->mmap_lock) +# define LNX_MM_UP_WRITE(a_pMm) up_write(&(a_pMm)->mmap_lock) +#else +# define LNX_MM_DOWN_READ(a_pMm) down_read(&(a_pMm)->mmap_sem) +# define LNX_MM_UP_READ(a_pMm) up_read(&(a_pMm)->mmap_sem) +# define LNX_MM_DOWN_WRITE(a_pMm) down_write(&(a_pMm)->mmap_sem) +# define LNX_MM_UP_WRITE(a_pMm) up_write(&(a_pMm)->mmap_sem) +#endif + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * The Linux version of the memory object structure. + */ +typedef struct RTR0MEMOBJLNX +{ + /** The core structure. */ + RTR0MEMOBJINTERNAL Core; + /** Set if the allocation is contiguous. + * This means it has to be given back as one chunk. */ + bool fContiguous; + /** Set if executable allocation. */ + bool fExecutable; + /** Set if we've vmap'ed the memory into ring-0. */ + bool fMappedToRing0; +#ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC + /** Return from alloc_vm_area() that we now need to use for executable + * memory. */ + struct vm_struct *pArea; + /** PTE array that goes along with pArea (must be freed). */ + pte_t **papPtesForArea; +#endif + /** The pages in the apPages array. */ + size_t cPages; + /** Array of struct page pointers. (variable size) */ + struct page *apPages[1]; +} RTR0MEMOBJLNX; +/** Pointer to the linux memory object. */ +typedef RTR0MEMOBJLNX *PRTR0MEMOBJLNX; + + +static void rtR0MemObjLinuxFreePages(PRTR0MEMOBJLNX pMemLnx); + + +/** + * Helper that converts from a RTR0PROCESS handle to a linux task. + * + * @returns The corresponding Linux task. + * @param R0Process IPRT ring-0 process handle. + */ +static struct task_struct *rtR0ProcessToLinuxTask(RTR0PROCESS R0Process) +{ + /** @todo fix rtR0ProcessToLinuxTask!! */ + /** @todo many (all?) callers currently assume that we return 'current'! */ + return R0Process == RTR0ProcHandleSelf() ? current : NULL; +} + + +/** + * Compute order. Some functions allocate 2^order pages. + * + * @returns order. + * @param cPages Number of pages. + */ +static int rtR0MemObjLinuxOrder(size_t cPages) +{ + int iOrder; + size_t cTmp; + + for (iOrder = 0, cTmp = cPages; cTmp >>= 1; ++iOrder) + ; + if (cPages & ~((size_t)1 << iOrder)) + ++iOrder; + + return iOrder; +} + + +/** + * Converts from RTMEM_PROT_* to Linux PAGE_*. + * + * @returns Linux page protection constant. + * @param fProt The IPRT protection mask. + * @param fKernel Whether it applies to kernel or user space. + */ +static pgprot_t rtR0MemObjLinuxConvertProt(unsigned fProt, bool fKernel) +{ + switch (fProt) + { + default: + AssertMsgFailed(("%#x %d\n", fProt, fKernel)); RT_FALL_THRU(); + case RTMEM_PROT_NONE: + return PAGE_NONE; + + case RTMEM_PROT_READ: + return fKernel ? PAGE_KERNEL_RO : PAGE_READONLY; + + case RTMEM_PROT_WRITE: + case RTMEM_PROT_WRITE | RTMEM_PROT_READ: + return fKernel ? PAGE_KERNEL : PAGE_SHARED; + + case RTMEM_PROT_EXEC: + case RTMEM_PROT_EXEC | RTMEM_PROT_READ: +#if defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64) + if (fKernel) + { + pgprot_t fPg = MY_PAGE_KERNEL_EXEC; + pgprot_val(fPg) &= ~_PAGE_RW; + return fPg; + } + return PAGE_READONLY_EXEC; +#else + return fKernel ? MY_PAGE_KERNEL_EXEC : PAGE_READONLY_EXEC; +#endif + + case RTMEM_PROT_WRITE | RTMEM_PROT_EXEC: + case RTMEM_PROT_WRITE | RTMEM_PROT_EXEC | RTMEM_PROT_READ: + return fKernel ? MY_PAGE_KERNEL_EXEC : PAGE_SHARED_EXEC; + } +} + + +/** + * Worker for rtR0MemObjNativeReserveUser and rtR0MemObjNativerMapUser that creates + * an empty user space mapping. + * + * We acquire the mmap_sem/mmap_lock of the task! + * + * @returns Pointer to the mapping. + * (void *)-1 on failure. + * @param R3PtrFixed (RTR3PTR)-1 if anywhere, otherwise a specific location. + * @param cb The size of the mapping. + * @param uAlignment The alignment of the mapping. + * @param pTask The Linux task to create this mapping in. + * @param fProt The RTMEM_PROT_* mask. + */ +static void *rtR0MemObjLinuxDoMmap(RTR3PTR R3PtrFixed, size_t cb, size_t uAlignment, struct task_struct *pTask, unsigned fProt) +{ + unsigned fLnxProt; + unsigned long ulAddr; + + Assert(pTask == current); /* do_mmap */ + RT_NOREF_PV(pTask); + + /* + * Convert from IPRT protection to mman.h PROT_ and call do_mmap. + */ + fProt &= (RTMEM_PROT_NONE | RTMEM_PROT_READ | RTMEM_PROT_WRITE | RTMEM_PROT_EXEC); + if (fProt == RTMEM_PROT_NONE) + fLnxProt = PROT_NONE; + else + { + fLnxProt = 0; + if (fProt & RTMEM_PROT_READ) + fLnxProt |= PROT_READ; + if (fProt & RTMEM_PROT_WRITE) + fLnxProt |= PROT_WRITE; + if (fProt & RTMEM_PROT_EXEC) + fLnxProt |= PROT_EXEC; + } + + if (R3PtrFixed != (RTR3PTR)-1) + { +#if RTLNX_VER_MIN(3,5,0) + ulAddr = vm_mmap(NULL, R3PtrFixed, cb, fLnxProt, MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, 0); +#else + LNX_MM_DOWN_WRITE(pTask->mm); + ulAddr = do_mmap(NULL, R3PtrFixed, cb, fLnxProt, MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, 0); + LNX_MM_UP_WRITE(pTask->mm); +#endif + } + else + { +#if RTLNX_VER_MIN(3,5,0) + ulAddr = vm_mmap(NULL, 0, cb, fLnxProt, MAP_SHARED | MAP_ANONYMOUS, 0); +#else + LNX_MM_DOWN_WRITE(pTask->mm); + ulAddr = do_mmap(NULL, 0, cb, fLnxProt, MAP_SHARED | MAP_ANONYMOUS, 0); + LNX_MM_UP_WRITE(pTask->mm); +#endif + if ( !(ulAddr & ~PAGE_MASK) + && (ulAddr & (uAlignment - 1))) + { + /** @todo implement uAlignment properly... We'll probably need to make some dummy mappings to fill + * up alignment gaps. This is of course complicated by fragmentation (which we might have cause + * ourselves) and further by there begin two mmap strategies (top / bottom). */ + /* For now, just ignore uAlignment requirements... */ + } + } + + + if (ulAddr & ~PAGE_MASK) /* ~PAGE_MASK == PAGE_OFFSET_MASK */ + return (void *)-1; + return (void *)ulAddr; +} + + +/** + * Worker that destroys a user space mapping. + * Undoes what rtR0MemObjLinuxDoMmap did. + * + * We acquire the mmap_sem/mmap_lock of the task! + * + * @param pv The ring-3 mapping. + * @param cb The size of the mapping. + * @param pTask The Linux task to destroy this mapping in. + */ +static void rtR0MemObjLinuxDoMunmap(void *pv, size_t cb, struct task_struct *pTask) +{ +#if RTLNX_VER_MIN(3,5,0) + Assert(pTask == current); RT_NOREF_PV(pTask); + vm_munmap((unsigned long)pv, cb); +#elif defined(USE_RHEL4_MUNMAP) + LNX_MM_DOWN_WRITE(pTask->mm); + do_munmap(pTask->mm, (unsigned long)pv, cb, 0); /* should it be 1 or 0? */ + LNX_MM_UP_WRITE(pTask->mm); +#else + LNX_MM_DOWN_WRITE(pTask->mm); + do_munmap(pTask->mm, (unsigned long)pv, cb); + LNX_MM_UP_WRITE(pTask->mm); +#endif +} + + +/** + * Internal worker that allocates physical pages and creates the memory object for them. + * + * @returns IPRT status code. + * @param ppMemLnx Where to store the memory object pointer. + * @param enmType The object type. + * @param cb The number of bytes to allocate. + * @param uAlignment The alignment of the physical memory. + * Only valid if fContiguous == true, ignored otherwise. + * @param fFlagsLnx The page allocation flags (GPFs). + * @param fContiguous Whether the allocation must be contiguous. + * @param fExecutable Whether the memory must be executable. + * @param rcNoMem What to return when we're out of pages. + */ +static int rtR0MemObjLinuxAllocPages(PRTR0MEMOBJLNX *ppMemLnx, RTR0MEMOBJTYPE enmType, size_t cb, + size_t uAlignment, gfp_t fFlagsLnx, bool fContiguous, bool fExecutable, int rcNoMem) +{ + size_t iPage; + size_t const cPages = cb >> PAGE_SHIFT; + struct page *paPages; + + /* + * Allocate a memory object structure that's large enough to contain + * the page pointer array. + */ + PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(RT_UOFFSETOF_DYN(RTR0MEMOBJLNX, apPages[cPages]), enmType, NULL, cb); + if (!pMemLnx) + return VERR_NO_MEMORY; + pMemLnx->cPages = cPages; + + if (cPages > 255) + { +# ifdef __GFP_REPEAT + /* Try hard to allocate the memory, but the allocation attempt might fail. */ + fFlagsLnx |= __GFP_REPEAT; +# endif +# ifdef __GFP_NOMEMALLOC + /* Introduced with Linux 2.6.12: Don't use emergency reserves */ + fFlagsLnx |= __GFP_NOMEMALLOC; +# endif + } + + /* + * Allocate the pages. + * For small allocations we'll try contiguous first and then fall back on page by page. + */ +#if RTLNX_VER_MIN(2,4,22) + if ( fContiguous + || cb <= PAGE_SIZE * 2) + { +# ifdef VBOX_USE_INSERT_PAGE + paPages = alloc_pages(fFlagsLnx | __GFP_COMP | __GFP_NOWARN, rtR0MemObjLinuxOrder(cPages)); +# else + paPages = alloc_pages(fFlagsLnx | __GFP_NOWARN, rtR0MemObjLinuxOrder(cPages)); +# endif + if (paPages) + { + fContiguous = true; + for (iPage = 0; iPage < cPages; iPage++) + pMemLnx->apPages[iPage] = &paPages[iPage]; + } + else if (fContiguous) + { + rtR0MemObjDelete(&pMemLnx->Core); + return rcNoMem; + } + } + + if (!fContiguous) + { + for (iPage = 0; iPage < cPages; iPage++) + { + pMemLnx->apPages[iPage] = alloc_page(fFlagsLnx | __GFP_NOWARN); + if (RT_UNLIKELY(!pMemLnx->apPages[iPage])) + { + while (iPage-- > 0) + __free_page(pMemLnx->apPages[iPage]); + rtR0MemObjDelete(&pMemLnx->Core); + return rcNoMem; + } + } + } + +#else /* < 2.4.22 */ + /** @todo figure out why we didn't allocate page-by-page on 2.4.21 and older... */ + paPages = alloc_pages(fFlagsLnx, rtR0MemObjLinuxOrder(cPages)); + if (!paPages) + { + rtR0MemObjDelete(&pMemLnx->Core); + return rcNoMem; + } + for (iPage = 0; iPage < cPages; iPage++) + { + pMemLnx->apPages[iPage] = &paPages[iPage]; + if (fExecutable) + MY_SET_PAGES_EXEC(pMemLnx->apPages[iPage], 1); + if (PageHighMem(pMemLnx->apPages[iPage])) + BUG(); + } + + fContiguous = true; +#endif /* < 2.4.22 */ + pMemLnx->fContiguous = fContiguous; + pMemLnx->fExecutable = fExecutable; + +#if RTLNX_VER_MAX(4,5,0) + /* + * Reserve the pages. + * + * Linux >= 4.5 with CONFIG_DEBUG_VM panics when setting PG_reserved on compound + * pages. According to Michal Hocko this shouldn't be necessary anyway because + * as pages which are not on the LRU list are never evictable. + */ + for (iPage = 0; iPage < cPages; iPage++) + SetPageReserved(pMemLnx->apPages[iPage]); +#endif + + /* + * Note that the physical address of memory allocated with alloc_pages(flags, order) + * is always 2^(PAGE_SHIFT+order)-aligned. + */ + if ( fContiguous + && uAlignment > PAGE_SIZE) + { + /* + * Check for alignment constraints. The physical address of memory allocated with + * alloc_pages(flags, order) is always 2^(PAGE_SHIFT+order)-aligned. + */ + if (RT_UNLIKELY(page_to_phys(pMemLnx->apPages[0]) & (uAlignment - 1))) + { + /* + * This should never happen! + */ + printk("rtR0MemObjLinuxAllocPages(cb=0x%lx, uAlignment=0x%lx): alloc_pages(..., %d) returned physical memory at 0x%lx!\n", + (unsigned long)cb, (unsigned long)uAlignment, rtR0MemObjLinuxOrder(cPages), (unsigned long)page_to_phys(pMemLnx->apPages[0])); + rtR0MemObjLinuxFreePages(pMemLnx); + return rcNoMem; + } + } + + *ppMemLnx = pMemLnx; + return VINF_SUCCESS; +} + + +/** + * Frees the physical pages allocated by the rtR0MemObjLinuxAllocPages() call. + * + * This method does NOT free the object. + * + * @param pMemLnx The object which physical pages should be freed. + */ +static void rtR0MemObjLinuxFreePages(PRTR0MEMOBJLNX pMemLnx) +{ + size_t iPage = pMemLnx->cPages; + if (iPage > 0) + { + /* + * Restore the page flags. + */ + while (iPage-- > 0) + { +#if RTLNX_VER_MAX(4,5,0) + /* See SetPageReserved() in rtR0MemObjLinuxAllocPages() */ + ClearPageReserved(pMemLnx->apPages[iPage]); +#endif +#if RTLNX_VER_MAX(2,4,22) + if (pMemLnx->fExecutable) + MY_SET_PAGES_NOEXEC(pMemLnx->apPages[iPage], 1); +#endif + } + + /* + * Free the pages. + */ +#if RTLNX_VER_MIN(2,4,22) + if (!pMemLnx->fContiguous) + { + iPage = pMemLnx->cPages; + while (iPage-- > 0) + __free_page(pMemLnx->apPages[iPage]); + } + else +#endif + __free_pages(pMemLnx->apPages[0], rtR0MemObjLinuxOrder(pMemLnx->cPages)); + + pMemLnx->cPages = 0; + } +} + + +#ifdef IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC +/** + * User data passed to the apply_to_page_range() callback. + */ +typedef struct LNXAPPLYPGRANGE +{ + /** Pointer to the memory object. */ + PRTR0MEMOBJLNX pMemLnx; + /** The page protection flags to apply. */ + pgprot_t fPg; +} LNXAPPLYPGRANGE; +/** Pointer to the user data. */ +typedef LNXAPPLYPGRANGE *PLNXAPPLYPGRANGE; +/** Pointer to the const user data. */ +typedef const LNXAPPLYPGRANGE *PCLNXAPPLYPGRANGE; + +/** + * Callback called in apply_to_page_range(). + * + * @returns Linux status code. + * @param pPte Pointer to the page table entry for the given address. + * @param uAddr The address to apply the new protection to. + * @param pvUser The opaque user data. + */ +static int rtR0MemObjLinuxApplyPageRange(pte_t *pPte, unsigned long uAddr, void *pvUser) +{ + PCLNXAPPLYPGRANGE pArgs = (PCLNXAPPLYPGRANGE)pvUser; + PRTR0MEMOBJLNX pMemLnx = pArgs->pMemLnx; + size_t idxPg = (uAddr - (unsigned long)pMemLnx->Core.pv) >> PAGE_SHIFT; + + set_pte(pPte, mk_pte(pMemLnx->apPages[idxPg], pArgs->fPg)); + return 0; +} +#endif + + +/** + * Maps the allocation into ring-0. + * + * This will update the RTR0MEMOBJLNX::Core.pv and RTR0MEMOBJ::fMappedToRing0 members. + * + * Contiguous mappings that isn't in 'high' memory will already be mapped into kernel + * space, so we'll use that mapping if possible. If execute access is required, we'll + * play safe and do our own mapping. + * + * @returns IPRT status code. + * @param pMemLnx The linux memory object to map. + * @param fExecutable Whether execute access is required. + */ +static int rtR0MemObjLinuxVMap(PRTR0MEMOBJLNX pMemLnx, bool fExecutable) +{ + int rc = VINF_SUCCESS; + + /* + * Choose mapping strategy. + */ + bool fMustMap = fExecutable + || !pMemLnx->fContiguous; + if (!fMustMap) + { + size_t iPage = pMemLnx->cPages; + while (iPage-- > 0) + if (PageHighMem(pMemLnx->apPages[iPage])) + { + fMustMap = true; + break; + } + } + + Assert(!pMemLnx->Core.pv); + Assert(!pMemLnx->fMappedToRing0); + + if (fMustMap) + { + /* + * Use vmap - 2.4.22 and later. + */ +#if RTLNX_VER_MIN(2,4,22) + pgprot_t fPg; + pgprot_val(fPg) = _PAGE_PRESENT | _PAGE_RW; +# ifdef _PAGE_NX + if (!fExecutable) + pgprot_val(fPg) |= _PAGE_NX; +# endif + +# ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC + if (fExecutable) + { +# if RTLNX_VER_MIN(3,2,51) + pte_t **papPtes = (pte_t **)kmalloc_array(pMemLnx->cPages, sizeof(papPtes[0]), GFP_KERNEL); +# else + pte_t **papPtes = (pte_t **)kmalloc(pMemLnx->cPages * sizeof(papPtes[0]), GFP_KERNEL); +# endif + if (papPtes) + { + pMemLnx->pArea = alloc_vm_area(pMemLnx->Core.cb, papPtes); /* Note! pArea->nr_pages is not set. */ + if (pMemLnx->pArea) + { + size_t i; + Assert(pMemLnx->pArea->size >= pMemLnx->Core.cb); /* Note! includes guard page. */ + Assert(pMemLnx->pArea->addr); +# ifdef _PAGE_NX + pgprot_val(fPg) |= _PAGE_NX; /* Uses RTR0MemObjProtect to clear NX when memory ready, W^X fashion. */ +# endif + pMemLnx->papPtesForArea = papPtes; + for (i = 0; i < pMemLnx->cPages; i++) + *papPtes[i] = mk_pte(pMemLnx->apPages[i], fPg); + pMemLnx->Core.pv = pMemLnx->pArea->addr; + pMemLnx->fMappedToRing0 = true; + } + else + { + kfree(papPtes); + rc = VERR_MAP_FAILED; + } + } + else + rc = VERR_MAP_FAILED; + } + else +# endif + { +# if defined(IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC) + if (fExecutable) + pgprot_val(fPg) |= _PAGE_NX; /* Uses RTR0MemObjProtect to clear NX when memory ready, W^X fashion. */ +# endif + +# ifdef VM_MAP + pMemLnx->Core.pv = vmap(&pMemLnx->apPages[0], pMemLnx->cPages, VM_MAP, fPg); +# else + pMemLnx->Core.pv = vmap(&pMemLnx->apPages[0], pMemLnx->cPages, VM_ALLOC, fPg); +# endif + if (pMemLnx->Core.pv) + pMemLnx->fMappedToRing0 = true; + else + rc = VERR_MAP_FAILED; + } +#else /* < 2.4.22 */ + rc = VERR_NOT_SUPPORTED; +#endif + } + else + { + /* + * Use the kernel RAM mapping. + */ + pMemLnx->Core.pv = phys_to_virt(page_to_phys(pMemLnx->apPages[0])); + Assert(pMemLnx->Core.pv); + } + + return rc; +} + + +/** + * Undoes what rtR0MemObjLinuxVMap() did. + * + * @param pMemLnx The linux memory object. + */ +static void rtR0MemObjLinuxVUnmap(PRTR0MEMOBJLNX pMemLnx) +{ +#if RTLNX_VER_MIN(2,4,22) +# ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC + if (pMemLnx->pArea) + { +# if 0 + pte_t **papPtes = pMemLnx->papPtesForArea; + size_t i; + for (i = 0; i < pMemLnx->cPages; i++) + *papPtes[i] = 0; +# endif + free_vm_area(pMemLnx->pArea); + kfree(pMemLnx->papPtesForArea); + pMemLnx->pArea = NULL; + pMemLnx->papPtesForArea = NULL; + } + else +# endif + if (pMemLnx->fMappedToRing0) + { + Assert(pMemLnx->Core.pv); + vunmap(pMemLnx->Core.pv); + pMemLnx->fMappedToRing0 = false; + } +#else /* < 2.4.22 */ + Assert(!pMemLnx->fMappedToRing0); +#endif + pMemLnx->Core.pv = NULL; +} + + +DECLHIDDEN(int) rtR0MemObjNativeFree(RTR0MEMOBJ pMem) +{ + IPRT_LINUX_SAVE_EFL_AC(); + PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)pMem; + + /* + * Release any memory that we've allocated or locked. + */ + switch (pMemLnx->Core.enmType) + { + case RTR0MEMOBJTYPE_LOW: + case RTR0MEMOBJTYPE_PAGE: + case RTR0MEMOBJTYPE_CONT: + case RTR0MEMOBJTYPE_PHYS: + case RTR0MEMOBJTYPE_PHYS_NC: + rtR0MemObjLinuxVUnmap(pMemLnx); + rtR0MemObjLinuxFreePages(pMemLnx); + break; + + case RTR0MEMOBJTYPE_LOCK: + if (pMemLnx->Core.u.Lock.R0Process != NIL_RTR0PROCESS) + { + struct task_struct *pTask = rtR0ProcessToLinuxTask(pMemLnx->Core.u.Lock.R0Process); + size_t iPage; + Assert(pTask); + if (pTask && pTask->mm) + LNX_MM_DOWN_READ(pTask->mm); + + iPage = pMemLnx->cPages; + while (iPage-- > 0) + { + if (!PageReserved(pMemLnx->apPages[iPage])) + SetPageDirty(pMemLnx->apPages[iPage]); +#if RTLNX_VER_MIN(4,6,0) + put_page(pMemLnx->apPages[iPage]); +#else + page_cache_release(pMemLnx->apPages[iPage]); +#endif + } + + if (pTask && pTask->mm) + LNX_MM_UP_READ(pTask->mm); + } + /* else: kernel memory - nothing to do here. */ + break; + + case RTR0MEMOBJTYPE_RES_VIRT: + Assert(pMemLnx->Core.pv); + if (pMemLnx->Core.u.ResVirt.R0Process != NIL_RTR0PROCESS) + { + struct task_struct *pTask = rtR0ProcessToLinuxTask(pMemLnx->Core.u.Lock.R0Process); + Assert(pTask); + if (pTask && pTask->mm) + rtR0MemObjLinuxDoMunmap(pMemLnx->Core.pv, pMemLnx->Core.cb, pTask); + } + else + { + vunmap(pMemLnx->Core.pv); + + Assert(pMemLnx->cPages == 1 && pMemLnx->apPages[0] != NULL); + __free_page(pMemLnx->apPages[0]); + pMemLnx->apPages[0] = NULL; + pMemLnx->cPages = 0; + } + pMemLnx->Core.pv = NULL; + break; + + case RTR0MEMOBJTYPE_MAPPING: + Assert(pMemLnx->cPages == 0); Assert(pMemLnx->Core.pv); + if (pMemLnx->Core.u.ResVirt.R0Process != NIL_RTR0PROCESS) + { + struct task_struct *pTask = rtR0ProcessToLinuxTask(pMemLnx->Core.u.Lock.R0Process); + Assert(pTask); + if (pTask && pTask->mm) + rtR0MemObjLinuxDoMunmap(pMemLnx->Core.pv, pMemLnx->Core.cb, pTask); + } + else + vunmap(pMemLnx->Core.pv); + pMemLnx->Core.pv = NULL; + break; + + default: + AssertMsgFailed(("enmType=%d\n", pMemLnx->Core.enmType)); + return VERR_INTERNAL_ERROR; + } + IPRT_LINUX_RESTORE_EFL_ONLY_AC(); + return VINF_SUCCESS; +} + + +DECLHIDDEN(int) rtR0MemObjNativeAllocPage(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, bool fExecutable) +{ + IPRT_LINUX_SAVE_EFL_AC(); + PRTR0MEMOBJLNX pMemLnx; + int rc; + +#if RTLNX_VER_MIN(2,4,22) + rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_PAGE, cb, PAGE_SIZE, GFP_HIGHUSER, + false /* non-contiguous */, fExecutable, VERR_NO_MEMORY); +#else + rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_PAGE, cb, PAGE_SIZE, GFP_USER, + false /* non-contiguous */, fExecutable, VERR_NO_MEMORY); +#endif + if (RT_SUCCESS(rc)) + { + rc = rtR0MemObjLinuxVMap(pMemLnx, fExecutable); + if (RT_SUCCESS(rc)) + { + *ppMem = &pMemLnx->Core; + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; + } + + rtR0MemObjLinuxFreePages(pMemLnx); + rtR0MemObjDelete(&pMemLnx->Core); + } + + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +} + + +DECLHIDDEN(int) rtR0MemObjNativeAllocLow(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, bool fExecutable) +{ + IPRT_LINUX_SAVE_EFL_AC(); + PRTR0MEMOBJLNX pMemLnx; + int rc; + + /* Try to avoid GFP_DMA. GFM_DMA32 was introduced with Linux 2.6.15. */ +#if (defined(RT_ARCH_AMD64) || defined(CONFIG_X86_PAE)) && defined(GFP_DMA32) + /* ZONE_DMA32: 0-4GB */ + rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_LOW, cb, PAGE_SIZE, GFP_DMA32, + false /* non-contiguous */, fExecutable, VERR_NO_LOW_MEMORY); + if (RT_FAILURE(rc)) +#endif +#ifdef RT_ARCH_AMD64 + /* ZONE_DMA: 0-16MB */ + rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_LOW, cb, PAGE_SIZE, GFP_DMA, + false /* non-contiguous */, fExecutable, VERR_NO_LOW_MEMORY); +#else +# ifdef CONFIG_X86_PAE +# endif + /* ZONE_NORMAL: 0-896MB */ + rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_LOW, cb, PAGE_SIZE, GFP_USER, + false /* non-contiguous */, fExecutable, VERR_NO_LOW_MEMORY); +#endif + if (RT_SUCCESS(rc)) + { + rc = rtR0MemObjLinuxVMap(pMemLnx, fExecutable); + if (RT_SUCCESS(rc)) + { + *ppMem = &pMemLnx->Core; + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; + } + + rtR0MemObjLinuxFreePages(pMemLnx); + rtR0MemObjDelete(&pMemLnx->Core); + } + + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +} + + +DECLHIDDEN(int) rtR0MemObjNativeAllocCont(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, bool fExecutable) +{ + IPRT_LINUX_SAVE_EFL_AC(); + PRTR0MEMOBJLNX pMemLnx; + int rc; + +#if (defined(RT_ARCH_AMD64) || defined(CONFIG_X86_PAE)) && defined(GFP_DMA32) + /* ZONE_DMA32: 0-4GB */ + rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_CONT, cb, PAGE_SIZE, GFP_DMA32, + true /* contiguous */, fExecutable, VERR_NO_CONT_MEMORY); + if (RT_FAILURE(rc)) +#endif +#ifdef RT_ARCH_AMD64 + /* ZONE_DMA: 0-16MB */ + rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_CONT, cb, PAGE_SIZE, GFP_DMA, + true /* contiguous */, fExecutable, VERR_NO_CONT_MEMORY); +#else + /* ZONE_NORMAL (32-bit hosts): 0-896MB */ + rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_CONT, cb, PAGE_SIZE, GFP_USER, + true /* contiguous */, fExecutable, VERR_NO_CONT_MEMORY); +#endif + if (RT_SUCCESS(rc)) + { + rc = rtR0MemObjLinuxVMap(pMemLnx, fExecutable); + if (RT_SUCCESS(rc)) + { +#if defined(RT_STRICT) && (defined(RT_ARCH_AMD64) || defined(CONFIG_HIGHMEM64G)) + size_t iPage = pMemLnx->cPages; + while (iPage-- > 0) + Assert(page_to_phys(pMemLnx->apPages[iPage]) < _4G); +#endif + pMemLnx->Core.u.Cont.Phys = page_to_phys(pMemLnx->apPages[0]); + *ppMem = &pMemLnx->Core; + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; + } + + rtR0MemObjLinuxFreePages(pMemLnx); + rtR0MemObjDelete(&pMemLnx->Core); + } + + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +} + + +/** + * Worker for rtR0MemObjLinuxAllocPhysSub that tries one allocation strategy. + * + * @returns IPRT status code. + * @param ppMemLnx Where to + * @param enmType The object type. + * @param cb The size of the allocation. + * @param uAlignment The alignment of the physical memory. + * Only valid for fContiguous == true, ignored otherwise. + * @param PhysHighest See rtR0MemObjNativeAllocPhys. + * @param fGfp The Linux GFP flags to use for the allocation. + */ +static int rtR0MemObjLinuxAllocPhysSub2(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJTYPE enmType, + size_t cb, size_t uAlignment, RTHCPHYS PhysHighest, gfp_t fGfp) +{ + PRTR0MEMOBJLNX pMemLnx; + int rc; + + rc = rtR0MemObjLinuxAllocPages(&pMemLnx, enmType, cb, uAlignment, fGfp, + enmType == RTR0MEMOBJTYPE_PHYS /* contiguous / non-contiguous */, + false /*fExecutable*/, VERR_NO_PHYS_MEMORY); + if (RT_FAILURE(rc)) + return rc; + + /* + * Check the addresses if necessary. (Can be optimized a bit for PHYS.) + */ + if (PhysHighest != NIL_RTHCPHYS) + { + size_t iPage = pMemLnx->cPages; + while (iPage-- > 0) + if (page_to_phys(pMemLnx->apPages[iPage]) > PhysHighest) + { + rtR0MemObjLinuxFreePages(pMemLnx); + rtR0MemObjDelete(&pMemLnx->Core); + return VERR_NO_MEMORY; + } + } + + /* + * Complete the object. + */ + if (enmType == RTR0MEMOBJTYPE_PHYS) + { + pMemLnx->Core.u.Phys.PhysBase = page_to_phys(pMemLnx->apPages[0]); + pMemLnx->Core.u.Phys.fAllocated = true; + } + *ppMem = &pMemLnx->Core; + return rc; +} + + +/** + * Worker for rtR0MemObjNativeAllocPhys and rtR0MemObjNativeAllocPhysNC. + * + * @returns IPRT status code. + * @param ppMem Where to store the memory object pointer on success. + * @param enmType The object type. + * @param cb The size of the allocation. + * @param uAlignment The alignment of the physical memory. + * Only valid for enmType == RTR0MEMOBJTYPE_PHYS, ignored otherwise. + * @param PhysHighest See rtR0MemObjNativeAllocPhys. + */ +static int rtR0MemObjLinuxAllocPhysSub(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJTYPE enmType, + size_t cb, size_t uAlignment, RTHCPHYS PhysHighest) +{ + int rc; + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * There are two clear cases and that's the <=16MB and anything-goes ones. + * When the physical address limit is somewhere in-between those two we'll + * just have to try, starting with HIGHUSER and working our way thru the + * different types, hoping we'll get lucky. + * + * We should probably move this physical address restriction logic up to + * the page alloc function as it would be more efficient there. But since + * we don't expect this to be a performance issue just yet it can wait. + */ + if (PhysHighest == NIL_RTHCPHYS) + /* ZONE_HIGHMEM: the whole physical memory */ + rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, GFP_HIGHUSER); + else if (PhysHighest <= _1M * 16) + /* ZONE_DMA: 0-16MB */ + rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, GFP_DMA); + else + { + rc = VERR_NO_MEMORY; + if (RT_FAILURE(rc)) + /* ZONE_HIGHMEM: the whole physical memory */ + rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, GFP_HIGHUSER); + if (RT_FAILURE(rc)) + /* ZONE_NORMAL: 0-896MB */ + rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, GFP_USER); +#ifdef GFP_DMA32 + if (RT_FAILURE(rc)) + /* ZONE_DMA32: 0-4GB */ + rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, GFP_DMA32); +#endif + if (RT_FAILURE(rc)) + /* ZONE_DMA: 0-16MB */ + rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, GFP_DMA); + } + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +} + + +/** + * Translates a kernel virtual address to a linux page structure by walking the + * page tables. + * + * @note We do assume that the page tables will not change as we are walking + * them. This assumption is rather forced by the fact that I could not + * immediately see any way of preventing this from happening. So, we + * take some extra care when accessing them. + * + * Because of this, we don't want to use this function on memory where + * attribute changes to nearby pages is likely to cause large pages to + * be used or split up. So, don't use this for the linear mapping of + * physical memory. + * + * @returns Pointer to the page structur or NULL if it could not be found. + * @param pv The kernel virtual address. + */ +RTDECL(struct page *) rtR0MemObjLinuxVirtToPage(void *pv) +{ + unsigned long ulAddr = (unsigned long)pv; + unsigned long pfn; + struct page *pPage; + pte_t *pEntry; + union + { + pgd_t Global; +#if RTLNX_VER_MIN(4,12,0) + p4d_t Four; +#endif +#if RTLNX_VER_MIN(2,6,11) + pud_t Upper; +#endif + pmd_t Middle; + pte_t Entry; + } u; + + /* Should this happen in a situation this code will be called in? And if + * so, can it change under our feet? See also + * "Documentation/vm/active_mm.txt" in the kernel sources. */ + if (RT_UNLIKELY(!current->active_mm)) + return NULL; + u.Global = *pgd_offset(current->active_mm, ulAddr); + if (RT_UNLIKELY(pgd_none(u.Global))) + return NULL; +#if RTLNX_VER_MIN(2,6,11) +# if RTLNX_VER_MIN(4,12,0) + u.Four = *p4d_offset(&u.Global, ulAddr); + if (RT_UNLIKELY(p4d_none(u.Four))) + return NULL; + if (p4d_large(u.Four)) + { + pPage = p4d_page(u.Four); + AssertReturn(pPage, NULL); + pfn = page_to_pfn(pPage); /* doing the safe way... */ + AssertCompile(P4D_SHIFT - PAGE_SHIFT < 31); + pfn += (ulAddr >> PAGE_SHIFT) & ((UINT32_C(1) << (P4D_SHIFT - PAGE_SHIFT)) - 1); + return pfn_to_page(pfn); + } + u.Upper = *pud_offset(&u.Four, ulAddr); +# else /* < 4.12 */ + u.Upper = *pud_offset(&u.Global, ulAddr); +# endif /* < 4.12 */ + if (RT_UNLIKELY(pud_none(u.Upper))) + return NULL; +# if RTLNX_VER_MIN(2,6,25) + if (pud_large(u.Upper)) + { + pPage = pud_page(u.Upper); + AssertReturn(pPage, NULL); + pfn = page_to_pfn(pPage); /* doing the safe way... */ + pfn += (ulAddr >> PAGE_SHIFT) & ((UINT32_C(1) << (PUD_SHIFT - PAGE_SHIFT)) - 1); + return pfn_to_page(pfn); + } +# endif + u.Middle = *pmd_offset(&u.Upper, ulAddr); +#else /* < 2.6.11 */ + u.Middle = *pmd_offset(&u.Global, ulAddr); +#endif /* < 2.6.11 */ + if (RT_UNLIKELY(pmd_none(u.Middle))) + return NULL; +#if RTLNX_VER_MIN(2,6,0) + if (pmd_large(u.Middle)) + { + pPage = pmd_page(u.Middle); + AssertReturn(pPage, NULL); + pfn = page_to_pfn(pPage); /* doing the safe way... */ + pfn += (ulAddr >> PAGE_SHIFT) & ((UINT32_C(1) << (PMD_SHIFT - PAGE_SHIFT)) - 1); + return pfn_to_page(pfn); + } +#endif + +#if RTLNX_VER_MIN(2,5,5) || defined(pte_offset_map) /* As usual, RHEL 3 had pte_offset_map earlier. */ + pEntry = pte_offset_map(&u.Middle, ulAddr); +#else + pEntry = pte_offset(&u.Middle, ulAddr); +#endif + if (RT_UNLIKELY(!pEntry)) + return NULL; + u.Entry = *pEntry; +#if RTLNX_VER_MIN(2,5,5) || defined(pte_offset_map) + pte_unmap(pEntry); +#endif + + if (RT_UNLIKELY(!pte_present(u.Entry))) + return NULL; + return pte_page(u.Entry); +} +RT_EXPORT_SYMBOL(rtR0MemObjLinuxVirtToPage); + + +DECLHIDDEN(int) rtR0MemObjNativeAllocPhys(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, RTHCPHYS PhysHighest, size_t uAlignment) +{ + return rtR0MemObjLinuxAllocPhysSub(ppMem, RTR0MEMOBJTYPE_PHYS, cb, uAlignment, PhysHighest); +} + + +DECLHIDDEN(int) rtR0MemObjNativeAllocPhysNC(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, RTHCPHYS PhysHighest) +{ + return rtR0MemObjLinuxAllocPhysSub(ppMem, RTR0MEMOBJTYPE_PHYS_NC, cb, PAGE_SIZE, PhysHighest); +} + + +DECLHIDDEN(int) rtR0MemObjNativeEnterPhys(PPRTR0MEMOBJINTERNAL ppMem, RTHCPHYS Phys, size_t cb, uint32_t uCachePolicy) +{ + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * All we need to do here is to validate that we can use + * ioremap on the specified address (32/64-bit dma_addr_t). + */ + PRTR0MEMOBJLNX pMemLnx; + dma_addr_t PhysAddr = Phys; + AssertMsgReturn(PhysAddr == Phys, ("%#llx\n", (unsigned long long)Phys), VERR_ADDRESS_TOO_BIG); + + pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_PHYS, NULL, cb); + if (!pMemLnx) + { + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_NO_MEMORY; + } + + pMemLnx->Core.u.Phys.PhysBase = PhysAddr; + pMemLnx->Core.u.Phys.fAllocated = false; + pMemLnx->Core.u.Phys.uCachePolicy = uCachePolicy; + Assert(!pMemLnx->cPages); + *ppMem = &pMemLnx->Core; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} + +/* openSUSE Leap 42.3 detection :-/ */ +#if RTLNX_VER_RANGE(4,4,0, 4,6,0) && defined(FAULT_FLAG_REMOTE) +# define GET_USER_PAGES_API KERNEL_VERSION(4, 10, 0) /* no typo! */ +#else +# define GET_USER_PAGES_API LINUX_VERSION_CODE +#endif + +DECLHIDDEN(int) rtR0MemObjNativeLockUser(PPRTR0MEMOBJINTERNAL ppMem, RTR3PTR R3Ptr, size_t cb, uint32_t fAccess, RTR0PROCESS R0Process) +{ + IPRT_LINUX_SAVE_EFL_AC(); + const int cPages = cb >> PAGE_SHIFT; + struct task_struct *pTask = rtR0ProcessToLinuxTask(R0Process); + struct vm_area_struct **papVMAs; + PRTR0MEMOBJLNX pMemLnx; + int rc = VERR_NO_MEMORY; + int const fWrite = fAccess & RTMEM_PROT_WRITE ? 1 : 0; + + /* + * Check for valid task and size overflows. + */ + if (!pTask) + return VERR_NOT_SUPPORTED; + if (((size_t)cPages << PAGE_SHIFT) != cb) + return VERR_OUT_OF_RANGE; + + /* + * Allocate the memory object and a temporary buffer for the VMAs. + */ + pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(RT_UOFFSETOF_DYN(RTR0MEMOBJLNX, apPages[cPages]), RTR0MEMOBJTYPE_LOCK, (void *)R3Ptr, cb); + if (!pMemLnx) + { + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_NO_MEMORY; + } + + papVMAs = (struct vm_area_struct **)RTMemAlloc(sizeof(*papVMAs) * cPages); + if (papVMAs) + { + LNX_MM_DOWN_READ(pTask->mm); + + /* + * Get user pages. + */ +/** @todo r=bird: Should we not force read access too? */ +#if GET_USER_PAGES_API >= KERNEL_VERSION(4, 6, 0) + if (R0Process == RTR0ProcHandleSelf()) + rc = get_user_pages(R3Ptr, /* Where from. */ + cPages, /* How many pages. */ +# if GET_USER_PAGES_API >= KERNEL_VERSION(4, 9, 0) + fWrite ? FOLL_WRITE | /* Write to memory. */ + FOLL_FORCE /* force write access. */ + : 0, /* Write to memory. */ +# else + fWrite, /* Write to memory. */ + fWrite, /* force write access. */ +# endif + &pMemLnx->apPages[0], /* Page array. */ + papVMAs); /* vmas */ + /* + * Actually this should not happen at the moment as call this function + * only for our own process. + */ + else + rc = get_user_pages_remote( +# if GET_USER_PAGES_API < KERNEL_VERSION(5, 9, 0) + pTask, /* Task for fault accounting. */ +# endif + pTask->mm, /* Whose pages. */ + R3Ptr, /* Where from. */ + cPages, /* How many pages. */ +# if GET_USER_PAGES_API >= KERNEL_VERSION(4, 9, 0) + fWrite ? FOLL_WRITE | /* Write to memory. */ + FOLL_FORCE /* force write access. */ + : 0, /* Write to memory. */ +# else + fWrite, /* Write to memory. */ + fWrite, /* force write access. */ +# endif + &pMemLnx->apPages[0], /* Page array. */ + papVMAs /* vmas */ +# if GET_USER_PAGES_API >= KERNEL_VERSION(4, 10, 0) + , NULL /* locked */ +# endif + ); +#else /* GET_USER_PAGES_API < KERNEL_VERSION(4, 6, 0) */ + rc = get_user_pages(pTask, /* Task for fault accounting. */ + pTask->mm, /* Whose pages. */ + R3Ptr, /* Where from. */ + cPages, /* How many pages. */ +/* The get_user_pages API change was back-ported to 4.4.168. */ +# if RTLNX_VER_RANGE(4,4,168, 4,5,0) + fWrite ? FOLL_WRITE | /* Write to memory. */ + FOLL_FORCE /* force write access. */ + : 0, /* Write to memory. */ +# else + fWrite, /* Write to memory. */ + fWrite, /* force write access. */ +# endif + &pMemLnx->apPages[0], /* Page array. */ + papVMAs); /* vmas */ +#endif /* GET_USER_PAGES_API < KERNEL_VERSION(4, 6, 0) */ + if (rc == cPages) + { + /* + * Flush dcache (required?), protect against fork and _really_ pin the page + * table entries. get_user_pages() will protect against swapping out the + * pages but it will NOT protect against removing page table entries. This + * can be achieved with + * - using mlock / mmap(..., MAP_LOCKED, ...) from userland. This requires + * an appropriate limit set up with setrlimit(..., RLIMIT_MEMLOCK, ...). + * Usual Linux distributions support only a limited size of locked pages + * (e.g. 32KB). + * - setting the PageReserved bit (as we do in rtR0MemObjLinuxAllocPages() + * or by + * - setting the VM_LOCKED flag. This is the same as doing mlock() without + * a range check. + */ + /** @todo The Linux fork() protection will require more work if this API + * is to be used for anything but locking VM pages. */ + while (rc-- > 0) + { + flush_dcache_page(pMemLnx->apPages[rc]); + papVMAs[rc]->vm_flags |= VM_DONTCOPY | VM_LOCKED; + } + + LNX_MM_UP_READ(pTask->mm); + + RTMemFree(papVMAs); + + pMemLnx->Core.u.Lock.R0Process = R0Process; + pMemLnx->cPages = cPages; + Assert(!pMemLnx->fMappedToRing0); + *ppMem = &pMemLnx->Core; + + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; + } + + /* + * Failed - we need to unlock any pages that we succeeded to lock. + */ + while (rc-- > 0) + { + if (!PageReserved(pMemLnx->apPages[rc])) + SetPageDirty(pMemLnx->apPages[rc]); +#if RTLNX_VER_MIN(4,6,0) + put_page(pMemLnx->apPages[rc]); +#else + page_cache_release(pMemLnx->apPages[rc]); +#endif + } + + LNX_MM_UP_READ(pTask->mm); + + RTMemFree(papVMAs); + rc = VERR_LOCK_FAILED; + } + + rtR0MemObjDelete(&pMemLnx->Core); + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +} + + +DECLHIDDEN(int) rtR0MemObjNativeLockKernel(PPRTR0MEMOBJINTERNAL ppMem, void *pv, size_t cb, uint32_t fAccess) +{ + IPRT_LINUX_SAVE_EFL_AC(); + void *pvLast = (uint8_t *)pv + cb - 1; + size_t const cPages = cb >> PAGE_SHIFT; + PRTR0MEMOBJLNX pMemLnx; + bool fLinearMapping; + int rc; + uint8_t *pbPage; + size_t iPage; + NOREF(fAccess); + + if ( !RTR0MemKernelIsValidAddr(pv) + || !RTR0MemKernelIsValidAddr(pv + cb)) + return VERR_INVALID_PARAMETER; + + /* + * The lower part of the kernel memory has a linear mapping between + * physical and virtual addresses. So we take a short cut here. This is + * assumed to be the cleanest way to handle those addresses (and the code + * is well tested, though the test for determining it is not very nice). + * If we ever decide it isn't we can still remove it. + */ +#if 0 + fLinearMapping = (unsigned long)pvLast < VMALLOC_START; +#else + fLinearMapping = (unsigned long)pv >= (unsigned long)__va(0) + && (unsigned long)pvLast < (unsigned long)high_memory; +#endif + + /* + * Allocate the memory object. + */ + pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(RT_UOFFSETOF_DYN(RTR0MEMOBJLNX, apPages[cPages]), RTR0MEMOBJTYPE_LOCK, pv, cb); + if (!pMemLnx) + { + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_NO_MEMORY; + } + + /* + * Gather the pages. + * We ASSUME all kernel pages are non-swappable and non-movable. + */ + rc = VINF_SUCCESS; + pbPage = (uint8_t *)pvLast; + iPage = cPages; + if (!fLinearMapping) + { + while (iPage-- > 0) + { + struct page *pPage = rtR0MemObjLinuxVirtToPage(pbPage); + if (RT_UNLIKELY(!pPage)) + { + rc = VERR_LOCK_FAILED; + break; + } + pMemLnx->apPages[iPage] = pPage; + pbPage -= PAGE_SIZE; + } + } + else + { + while (iPage-- > 0) + { + pMemLnx->apPages[iPage] = virt_to_page(pbPage); + pbPage -= PAGE_SIZE; + } + } + if (RT_SUCCESS(rc)) + { + /* + * Complete the memory object and return. + */ + pMemLnx->Core.u.Lock.R0Process = NIL_RTR0PROCESS; + pMemLnx->cPages = cPages; + Assert(!pMemLnx->fMappedToRing0); + *ppMem = &pMemLnx->Core; + + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; + } + + rtR0MemObjDelete(&pMemLnx->Core); + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +} + + +DECLHIDDEN(int) rtR0MemObjNativeReserveKernel(PPRTR0MEMOBJINTERNAL ppMem, void *pvFixed, size_t cb, size_t uAlignment) +{ +#if RTLNX_VER_MIN(2,4,22) + IPRT_LINUX_SAVE_EFL_AC(); + const size_t cPages = cb >> PAGE_SHIFT; + struct page *pDummyPage; + struct page **papPages; + + /* check for unsupported stuff. */ + AssertMsgReturn(pvFixed == (void *)-1, ("%p\n", pvFixed), VERR_NOT_SUPPORTED); + if (uAlignment > PAGE_SIZE) + return VERR_NOT_SUPPORTED; + + /* + * Allocate a dummy page and create a page pointer array for vmap such that + * the dummy page is mapped all over the reserved area. + */ + pDummyPage = alloc_page(GFP_HIGHUSER | __GFP_NOWARN); + if (pDummyPage) + { + papPages = RTMemAlloc(sizeof(*papPages) * cPages); + if (papPages) + { + void *pv; + size_t iPage = cPages; + while (iPage-- > 0) + papPages[iPage] = pDummyPage; +# ifdef VM_MAP + pv = vmap(papPages, cPages, VM_MAP, PAGE_KERNEL_RO); +# else + pv = vmap(papPages, cPages, VM_ALLOC, PAGE_KERNEL_RO); +# endif + RTMemFree(papPages); + if (pv) + { + PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_RES_VIRT, pv, cb); + if (pMemLnx) + { + pMemLnx->Core.u.ResVirt.R0Process = NIL_RTR0PROCESS; + pMemLnx->cPages = 1; + pMemLnx->apPages[0] = pDummyPage; + *ppMem = &pMemLnx->Core; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; + } + vunmap(pv); + } + } + __free_page(pDummyPage); + } + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_NO_MEMORY; + +#else /* < 2.4.22 */ + /* + * Could probably use ioremap here, but the caller is in a better position than us + * to select some safe physical memory. + */ + return VERR_NOT_SUPPORTED; +#endif +} + + +DECLHIDDEN(int) rtR0MemObjNativeReserveUser(PPRTR0MEMOBJINTERNAL ppMem, RTR3PTR R3PtrFixed, size_t cb, size_t uAlignment, RTR0PROCESS R0Process) +{ + IPRT_LINUX_SAVE_EFL_AC(); + PRTR0MEMOBJLNX pMemLnx; + void *pv; + struct task_struct *pTask = rtR0ProcessToLinuxTask(R0Process); + if (!pTask) + return VERR_NOT_SUPPORTED; + + /* + * Check that the specified alignment is supported. + */ + if (uAlignment > PAGE_SIZE) + return VERR_NOT_SUPPORTED; + + /* + * Let rtR0MemObjLinuxDoMmap do the difficult bits. + */ + pv = rtR0MemObjLinuxDoMmap(R3PtrFixed, cb, uAlignment, pTask, RTMEM_PROT_NONE); + if (pv == (void *)-1) + { + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_NO_MEMORY; + } + + pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_RES_VIRT, pv, cb); + if (!pMemLnx) + { + rtR0MemObjLinuxDoMunmap(pv, cb, pTask); + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_NO_MEMORY; + } + + pMemLnx->Core.u.ResVirt.R0Process = R0Process; + *ppMem = &pMemLnx->Core; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} + + +DECLHIDDEN(int) rtR0MemObjNativeMapKernel(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJ pMemToMap, + void *pvFixed, size_t uAlignment, + unsigned fProt, size_t offSub, size_t cbSub) +{ + int rc = VERR_NO_MEMORY; + PRTR0MEMOBJLNX pMemLnxToMap = (PRTR0MEMOBJLNX)pMemToMap; + PRTR0MEMOBJLNX pMemLnx; + IPRT_LINUX_SAVE_EFL_AC(); + + /* Fail if requested to do something we can't. */ + AssertMsgReturn(pvFixed == (void *)-1, ("%p\n", pvFixed), VERR_NOT_SUPPORTED); + if (uAlignment > PAGE_SIZE) + return VERR_NOT_SUPPORTED; + + /* + * Create the IPRT memory object. + */ + if (!cbSub) + cbSub = pMemLnxToMap->Core.cb - offSub; + pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_MAPPING, NULL, cbSub); + if (pMemLnx) + { + if (pMemLnxToMap->cPages) + { +#if RTLNX_VER_MIN(2,4,22) + /* + * Use vmap - 2.4.22 and later. + */ + pgprot_t fPg = rtR0MemObjLinuxConvertProt(fProt, true /* kernel */); + /** @todo We don't really care too much for EXEC here... 5.8 always adds NX. */ + Assert(((offSub + cbSub) >> PAGE_SHIFT) <= pMemLnxToMap->cPages); +# ifdef VM_MAP + pMemLnx->Core.pv = vmap(&pMemLnxToMap->apPages[offSub >> PAGE_SHIFT], cbSub >> PAGE_SHIFT, VM_MAP, fPg); +# else + pMemLnx->Core.pv = vmap(&pMemLnxToMap->apPages[offSub >> PAGE_SHIFT], cbSub >> PAGE_SHIFT, VM_ALLOC, fPg); +# endif + if (pMemLnx->Core.pv) + { + pMemLnx->fMappedToRing0 = true; + rc = VINF_SUCCESS; + } + else + rc = VERR_MAP_FAILED; + +#else /* < 2.4.22 */ + /* + * Only option here is to share mappings if possible and forget about fProt. + */ + if (rtR0MemObjIsRing3(pMemToMap)) + rc = VERR_NOT_SUPPORTED; + else + { + rc = VINF_SUCCESS; + if (!pMemLnxToMap->Core.pv) + rc = rtR0MemObjLinuxVMap(pMemLnxToMap, !!(fProt & RTMEM_PROT_EXEC)); + if (RT_SUCCESS(rc)) + { + Assert(pMemLnxToMap->Core.pv); + pMemLnx->Core.pv = (uint8_t *)pMemLnxToMap->Core.pv + offSub; + } + } +#endif + } + else + { + /* + * MMIO / physical memory. + */ + Assert(pMemLnxToMap->Core.enmType == RTR0MEMOBJTYPE_PHYS && !pMemLnxToMap->Core.u.Phys.fAllocated); +#if RTLNX_VER_MIN(2,6,25) + /* + * ioremap() defaults to no caching since the 2.6 kernels. + * ioremap_nocache() has been removed finally in 5.6-rc1. + */ + pMemLnx->Core.pv = pMemLnxToMap->Core.u.Phys.uCachePolicy == RTMEM_CACHE_POLICY_MMIO + ? ioremap(pMemLnxToMap->Core.u.Phys.PhysBase + offSub, cbSub) + : ioremap_cache(pMemLnxToMap->Core.u.Phys.PhysBase + offSub, cbSub); +#else /* KERNEL_VERSION < 2.6.25 */ + pMemLnx->Core.pv = pMemLnxToMap->Core.u.Phys.uCachePolicy == RTMEM_CACHE_POLICY_MMIO + ? ioremap_nocache(pMemLnxToMap->Core.u.Phys.PhysBase + offSub, cbSub) + : ioremap(pMemLnxToMap->Core.u.Phys.PhysBase + offSub, cbSub); +#endif /* KERNEL_VERSION < 2.6.25 */ + if (pMemLnx->Core.pv) + { + /** @todo fix protection. */ + rc = VINF_SUCCESS; + } + } + if (RT_SUCCESS(rc)) + { + pMemLnx->Core.u.Mapping.R0Process = NIL_RTR0PROCESS; + *ppMem = &pMemLnx->Core; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; + } + rtR0MemObjDelete(&pMemLnx->Core); + } + + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +} + + +#ifdef VBOX_USE_PAE_HACK +/** + * Replace the PFN of a PTE with the address of the actual page. + * + * The caller maps a reserved dummy page at the address with the desired access + * and flags. + * + * This hack is required for older Linux kernels which don't provide + * remap_pfn_range(). + * + * @returns 0 on success, -ENOMEM on failure. + * @param mm The memory context. + * @param ulAddr The mapping address. + * @param Phys The physical address of the page to map. + */ +static int rtR0MemObjLinuxFixPte(struct mm_struct *mm, unsigned long ulAddr, RTHCPHYS Phys) +{ + int rc = -ENOMEM; + pgd_t *pgd; + + spin_lock(&mm->page_table_lock); + + pgd = pgd_offset(mm, ulAddr); + if (!pgd_none(*pgd) && !pgd_bad(*pgd)) + { + pmd_t *pmd = pmd_offset(pgd, ulAddr); + if (!pmd_none(*pmd)) + { + pte_t *ptep = pte_offset_map(pmd, ulAddr); + if (ptep) + { + pte_t pte = *ptep; + pte.pte_high &= 0xfff00000; + pte.pte_high |= ((Phys >> 32) & 0x000fffff); + pte.pte_low &= 0x00000fff; + pte.pte_low |= (Phys & 0xfffff000); + set_pte(ptep, pte); + pte_unmap(ptep); + rc = 0; + } + } + } + + spin_unlock(&mm->page_table_lock); + return rc; +} +#endif /* VBOX_USE_PAE_HACK */ + + +DECLHIDDEN(int) rtR0MemObjNativeMapUser(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJ pMemToMap, RTR3PTR R3PtrFixed, size_t uAlignment, + unsigned fProt, RTR0PROCESS R0Process, size_t offSub, size_t cbSub) +{ + struct task_struct *pTask = rtR0ProcessToLinuxTask(R0Process); + PRTR0MEMOBJLNX pMemLnxToMap = (PRTR0MEMOBJLNX)pMemToMap; + int rc = VERR_NO_MEMORY; + PRTR0MEMOBJLNX pMemLnx; +#ifdef VBOX_USE_PAE_HACK + struct page *pDummyPage; + RTHCPHYS DummyPhys; +#endif + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Check for restrictions. + */ + if (!pTask) + return VERR_NOT_SUPPORTED; + if (uAlignment > PAGE_SIZE) + return VERR_NOT_SUPPORTED; + +#ifdef VBOX_USE_PAE_HACK + /* + * Allocate a dummy page for use when mapping the memory. + */ + pDummyPage = alloc_page(GFP_USER | __GFP_NOWARN); + if (!pDummyPage) + { + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_NO_MEMORY; + } + SetPageReserved(pDummyPage); + DummyPhys = page_to_phys(pDummyPage); +#endif + + /* + * Create the IPRT memory object. + */ + Assert(!offSub || cbSub); + if (cbSub == 0) + cbSub = pMemLnxToMap->Core.cb; + pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_MAPPING, NULL, cbSub); + if (pMemLnx) + { + /* + * Allocate user space mapping. + */ + void *pv; + pv = rtR0MemObjLinuxDoMmap(R3PtrFixed, cbSub, uAlignment, pTask, fProt); + if (pv != (void *)-1) + { + /* + * Map page by page into the mmap area. + * This is generic, paranoid and not very efficient. + */ + pgprot_t fPg = rtR0MemObjLinuxConvertProt(fProt, false /* user */); + unsigned long ulAddrCur = (unsigned long)pv; + const size_t cPages = (offSub + cbSub) >> PAGE_SHIFT; + size_t iPage; + + LNX_MM_DOWN_WRITE(pTask->mm); + + rc = VINF_SUCCESS; + if (pMemLnxToMap->cPages) + { + for (iPage = offSub >> PAGE_SHIFT; iPage < cPages; iPage++, ulAddrCur += PAGE_SIZE) + { +#if RTLNX_VER_MAX(2,6,11) + RTHCPHYS Phys = page_to_phys(pMemLnxToMap->apPages[iPage]); +#endif +#if RTLNX_VER_MIN(2,6,0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE) + struct vm_area_struct *vma = find_vma(pTask->mm, ulAddrCur); /* this is probably the same for all the pages... */ + AssertBreakStmt(vma, rc = VERR_INTERNAL_ERROR); +#endif +#if RTLNX_VER_MAX(2,6,0) && defined(RT_ARCH_X86) + /* remap_page_range() limitation on x86 */ + AssertBreakStmt(Phys < _4G, rc = VERR_NO_MEMORY); +#endif + +#if defined(VBOX_USE_INSERT_PAGE) && RTLNX_VER_MIN(2,6,22) + rc = vm_insert_page(vma, ulAddrCur, pMemLnxToMap->apPages[iPage]); + /* Thes flags help making 100% sure some bad stuff wont happen (swap, core, ++). + * See remap_pfn_range() in mm/memory.c */ +#if RTLNX_VER_MIN(3,7,0) + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; +#else + vma->vm_flags |= VM_RESERVED; +#endif +#elif RTLNX_VER_MIN(2,6,11) + rc = remap_pfn_range(vma, ulAddrCur, page_to_pfn(pMemLnxToMap->apPages[iPage]), PAGE_SIZE, fPg); +#elif defined(VBOX_USE_PAE_HACK) + rc = remap_page_range(vma, ulAddrCur, DummyPhys, PAGE_SIZE, fPg); + if (!rc) + rc = rtR0MemObjLinuxFixPte(pTask->mm, ulAddrCur, Phys); +#elif RTLNX_VER_MIN(2,6,0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE) + rc = remap_page_range(vma, ulAddrCur, Phys, PAGE_SIZE, fPg); +#else /* 2.4 */ + rc = remap_page_range(ulAddrCur, Phys, PAGE_SIZE, fPg); +#endif + if (rc) + { + rc = VERR_NO_MEMORY; + break; + } + } + } + else + { + RTHCPHYS Phys; + if (pMemLnxToMap->Core.enmType == RTR0MEMOBJTYPE_PHYS) + Phys = pMemLnxToMap->Core.u.Phys.PhysBase; + else if (pMemLnxToMap->Core.enmType == RTR0MEMOBJTYPE_CONT) + Phys = pMemLnxToMap->Core.u.Cont.Phys; + else + { + AssertMsgFailed(("%d\n", pMemLnxToMap->Core.enmType)); + Phys = NIL_RTHCPHYS; + } + if (Phys != NIL_RTHCPHYS) + { + for (iPage = offSub >> PAGE_SHIFT; iPage < cPages; iPage++, ulAddrCur += PAGE_SIZE, Phys += PAGE_SIZE) + { +#if RTLNX_VER_MIN(2,6,0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE) + struct vm_area_struct *vma = find_vma(pTask->mm, ulAddrCur); /* this is probably the same for all the pages... */ + AssertBreakStmt(vma, rc = VERR_INTERNAL_ERROR); +#endif +#if RTLNX_VER_MAX(2,6,0) && defined(RT_ARCH_X86) + /* remap_page_range() limitation on x86 */ + AssertBreakStmt(Phys < _4G, rc = VERR_NO_MEMORY); +#endif + +#if RTLNX_VER_MIN(2,6,11) + rc = remap_pfn_range(vma, ulAddrCur, Phys, PAGE_SIZE, fPg); +#elif defined(VBOX_USE_PAE_HACK) + rc = remap_page_range(vma, ulAddrCur, DummyPhys, PAGE_SIZE, fPg); + if (!rc) + rc = rtR0MemObjLinuxFixPte(pTask->mm, ulAddrCur, Phys); +#elif RTLNX_VER_MIN(2,6,0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE) + rc = remap_page_range(vma, ulAddrCur, Phys, PAGE_SIZE, fPg); +#else /* 2.4 */ + rc = remap_page_range(ulAddrCur, Phys, PAGE_SIZE, fPg); +#endif + if (rc) + { + rc = VERR_NO_MEMORY; + break; + } + } + } + } + +#ifdef CONFIG_NUMA_BALANCING +# if RTLNX_VER_MAX(3,13,0) && RTLNX_RHEL_MAX(7,0) +# define VBOX_NUMA_HACK_OLD +# endif + if (RT_SUCCESS(rc)) + { + /** @todo Ugly hack! But right now we have no other means to + * disable automatic NUMA page balancing. */ +# ifdef RT_OS_X86 +# ifdef VBOX_NUMA_HACK_OLD + pTask->mm->numa_next_reset = jiffies + 0x7fffffffUL; +# endif + pTask->mm->numa_next_scan = jiffies + 0x7fffffffUL; +# else +# ifdef VBOX_NUMA_HACK_OLD + pTask->mm->numa_next_reset = jiffies + 0x7fffffffffffffffUL; +# endif + pTask->mm->numa_next_scan = jiffies + 0x7fffffffffffffffUL; +# endif + } +#endif /* CONFIG_NUMA_BALANCING */ + + LNX_MM_UP_WRITE(pTask->mm); + + if (RT_SUCCESS(rc)) + { +#ifdef VBOX_USE_PAE_HACK + __free_page(pDummyPage); +#endif + pMemLnx->Core.pv = pv; + pMemLnx->Core.u.Mapping.R0Process = R0Process; + *ppMem = &pMemLnx->Core; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; + } + + /* + * Bail out. + */ + rtR0MemObjLinuxDoMunmap(pv, cbSub, pTask); + } + rtR0MemObjDelete(&pMemLnx->Core); + } +#ifdef VBOX_USE_PAE_HACK + __free_page(pDummyPage); +#endif + + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +} + + +DECLHIDDEN(int) rtR0MemObjNativeProtect(PRTR0MEMOBJINTERNAL pMem, size_t offSub, size_t cbSub, uint32_t fProt) +{ +# ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC + /* + * Currently only supported when we've got addresses PTEs from the kernel. + */ + PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)pMem; + if (pMemLnx->pArea && pMemLnx->papPtesForArea) + { + pgprot_t const fPg = rtR0MemObjLinuxConvertProt(fProt, true /*fKernel*/); + size_t const cPages = (offSub + cbSub) >> PAGE_SHIFT; + pte_t **papPtes = pMemLnx->papPtesForArea; + size_t i; + + for (i = offSub >> PAGE_SHIFT; i < cPages; i++) + { + set_pte(papPtes[i], mk_pte(pMemLnx->apPages[i], fPg)); + } + preempt_disable(); + __flush_tlb_all(); + preempt_enable(); + return VINF_SUCCESS; + } +# elif defined(IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC) + PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)pMem; + if ( pMemLnx->fExecutable + && pMemLnx->fMappedToRing0) + { + LNXAPPLYPGRANGE Args; + Args.pMemLnx = pMemLnx; + Args.fPg = rtR0MemObjLinuxConvertProt(fProt, true /*fKernel*/); + int rcLnx = apply_to_page_range(current->active_mm, (unsigned long)pMemLnx->Core.pv + offSub, cbSub, + rtR0MemObjLinuxApplyPageRange, (void *)&Args); + if (rcLnx) + return VERR_NOT_SUPPORTED; + + return VINF_SUCCESS; + } +# endif + + NOREF(pMem); + NOREF(offSub); + NOREF(cbSub); + NOREF(fProt); + return VERR_NOT_SUPPORTED; +} + + +DECLHIDDEN(RTHCPHYS) rtR0MemObjNativeGetPagePhysAddr(PRTR0MEMOBJINTERNAL pMem, size_t iPage) +{ + PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)pMem; + + if (pMemLnx->cPages) + return page_to_phys(pMemLnx->apPages[iPage]); + + switch (pMemLnx->Core.enmType) + { + case RTR0MEMOBJTYPE_CONT: + return pMemLnx->Core.u.Cont.Phys + (iPage << PAGE_SHIFT); + + case RTR0MEMOBJTYPE_PHYS: + return pMemLnx->Core.u.Phys.PhysBase + (iPage << PAGE_SHIFT); + + /* the parent knows */ + case RTR0MEMOBJTYPE_MAPPING: + return rtR0MemObjNativeGetPagePhysAddr(pMemLnx->Core.uRel.Child.pParent, iPage); + + /* cPages > 0 */ + case RTR0MEMOBJTYPE_LOW: + case RTR0MEMOBJTYPE_LOCK: + case RTR0MEMOBJTYPE_PHYS_NC: + case RTR0MEMOBJTYPE_PAGE: + default: + AssertMsgFailed(("%d\n", pMemLnx->Core.enmType)); + /* fall thru */ + + case RTR0MEMOBJTYPE_RES_VIRT: + return NIL_RTHCPHYS; + } +} + diff --git a/src/VBox/Runtime/r0drv/linux/memuserkernel-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/memuserkernel-r0drv-linux.c new file mode 100644 index 00000000..f23bd091 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/memuserkernel-r0drv-linux.c @@ -0,0 +1,181 @@ +/* $Id: memuserkernel-r0drv-linux.c $ */ +/** @file + * IPRT - User & Kernel Memory, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2009-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" + +#include <iprt/mem.h> +#include <iprt/errcore.h> + + +RTR0DECL(int) RTR0MemUserCopyFrom(void *pvDst, RTR3PTR R3PtrSrc, size_t cb) +{ + IPRT_LINUX_SAVE_EFL_AC(); + if (RT_LIKELY(copy_from_user(pvDst, (void *)R3PtrSrc, cb) == 0)) + { + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; + } + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_ACCESS_DENIED; +} +RT_EXPORT_SYMBOL(RTR0MemUserCopyFrom); + + +RTR0DECL(int) RTR0MemUserCopyTo(RTR3PTR R3PtrDst, void const *pvSrc, size_t cb) +{ + IPRT_LINUX_SAVE_EFL_AC(); + if (RT_LIKELY(copy_to_user((void *)R3PtrDst, pvSrc, cb) == 0)) + { + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; + } + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_ACCESS_DENIED; +} +RT_EXPORT_SYMBOL(RTR0MemUserCopyTo); + + +RTR0DECL(bool) RTR0MemUserIsValidAddr(RTR3PTR R3Ptr) +{ + IPRT_LINUX_SAVE_EFL_AC(); +#if RTLNX_VER_MIN(5,0,0) || RTLNX_RHEL_MIN(8,1) + bool fRc = access_ok((void *)R3Ptr, 1); +#else + bool fRc = access_ok(VERIFY_READ, (void *)R3Ptr, 1); +#endif + IPRT_LINUX_RESTORE_EFL_AC(); + return fRc; +} +RT_EXPORT_SYMBOL(RTR0MemUserIsValidAddr); + + +RTR0DECL(bool) RTR0MemKernelIsValidAddr(void *pv) +{ + /* Couldn't find a straight forward way of doing this... */ +#if defined(RT_ARCH_X86) && defined(CONFIG_X86_HIGH_ENTRY) + return true; /* ?? */ +#elif defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64) + return (uintptr_t)pv >= PAGE_OFFSET; +#else +# error "PORT ME" +#if RTLNX_VER_MIN(5,0,0) || RTLNX_RHEL_MIN(8,1) + return !access_ok(pv, 1); +#else + return !access_ok(VERIFY_READ, pv, 1); +#endif +#endif +} +RT_EXPORT_SYMBOL(RTR0MemKernelIsValidAddr); + + +RTR0DECL(bool) RTR0MemAreKrnlAndUsrDifferent(void) +{ +#if defined(RT_ARCH_X86) && defined(CONFIG_X86_HIGH_ENTRY) /* ?? */ + return false; +#else + return true; +#endif +} +RT_EXPORT_SYMBOL(RTR0MemAreKrnlAndUsrDifferent); + + +/** + * Treats both source and destination as unsafe buffers. + */ +static int rtR0MemKernelCopyLnxWorker(void *pvDst, void const *pvSrc, size_t cb) +{ +#if RTLNX_VER_MIN(2,5,55) +/* _ASM_EXTABLE was introduced in 2.6.25 from what I can tell. Using #ifndef + here since it has to be a macro and you never know what someone might have + backported to an earlier kernel release. */ +# ifndef _ASM_EXTABLE +# if ARCH_BITS == 32 +# define _ASM_EXTABLE(a_Instr, a_Resume) \ + ".section __ex_table,\"a\"\n" \ + ".balign 4\n" \ + ".long " #a_Instr "\n" \ + ".long " #a_Resume "\n" \ + ".previous\n" +# else +# define _ASM_EXTABLE(a_Instr, a_Resume) \ + ".section __ex_table,\"a\"\n" \ + ".balign 8\n" \ + ".quad " #a_Instr "\n" \ + ".quad " #a_Resume "\n" \ + ".previous\n" +# endif +# endif /* !_ASM_EXTABLE */ + int rc; + IPRT_LINUX_SAVE_EFL_AC(); /* paranoia */ + if (!cb) + return VINF_SUCCESS; + + __asm__ __volatile__ ("cld\n" + "1:\n\t" + "rep; movsb\n" + "2:\n\t" + ".section .fixup,\"ax\"\n" + "3:\n\t" + "movl %4, %0\n\t" + "jmp 2b\n\t" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : "=r" (rc), + "=D" (pvDst), + "=S" (pvSrc), + "=c" (cb) + : "i" (VERR_ACCESS_DENIED), + "0" (VINF_SUCCESS), + "1" (pvDst), + "2" (pvSrc), + "3" (cb) + : "memory"); + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +#else + return VERR_NOT_SUPPORTED; +#endif +} + + +RTR0DECL(int) RTR0MemKernelCopyFrom(void *pvDst, void const *pvSrc, size_t cb) +{ + return rtR0MemKernelCopyLnxWorker(pvDst, pvSrc, cb); +} +RT_EXPORT_SYMBOL(RTR0MemKernelCopyFrom); + + +RTR0DECL(int) RTR0MemKernelCopyTo(void *pvDst, void const *pvSrc, size_t cb) +{ + return rtR0MemKernelCopyLnxWorker(pvDst, pvSrc, cb); +} +RT_EXPORT_SYMBOL(RTR0MemKernelCopyTo); + diff --git a/src/VBox/Runtime/r0drv/linux/mp-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/mp-r0drv-linux.c new file mode 100644 index 00000000..d76fa314 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/mp-r0drv-linux.c @@ -0,0 +1,630 @@ +/* $Id: mp-r0drv-linux.c $ */ +/** @file + * IPRT - Multiprocessor, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2008-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" + +#include <iprt/mp.h> +#include <iprt/cpuset.h> +#include <iprt/err.h> +#include <iprt/asm.h> +#include <iprt/thread.h> +#include "r0drv/mp-r0drv.h" + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ +#if defined(nr_cpumask_bits) || RTLNX_VER_MIN(2,6,28) +# define VBOX_NR_CPUMASK_BITS (nr_cpumask_bits) /* same as nr_cpu_ids */ +#else +# define VBOX_NR_CPUMASK_BITS (NR_CPUS) +#endif + + +RTDECL(RTCPUID) RTMpCpuId(void) +{ + return smp_processor_id(); +} +RT_EXPORT_SYMBOL(RTMpCpuId); + + +RTDECL(int) RTMpCurSetIndex(void) +{ + return smp_processor_id(); +} +RT_EXPORT_SYMBOL(RTMpCurSetIndex); + + +RTDECL(int) RTMpCurSetIndexAndId(PRTCPUID pidCpu) +{ + return *pidCpu = smp_processor_id(); +} +RT_EXPORT_SYMBOL(RTMpCurSetIndexAndId); + + +RTDECL(int) RTMpCpuIdToSetIndex(RTCPUID idCpu) +{ + return idCpu < RTCPUSET_MAX_CPUS && idCpu < VBOX_NR_CPUMASK_BITS ? (int)idCpu : -1; +} +RT_EXPORT_SYMBOL(RTMpCpuIdToSetIndex); + + +RTDECL(RTCPUID) RTMpCpuIdFromSetIndex(int iCpu) +{ + return (unsigned)iCpu < VBOX_NR_CPUMASK_BITS ? (RTCPUID)iCpu : NIL_RTCPUID; +} +RT_EXPORT_SYMBOL(RTMpCpuIdFromSetIndex); + + +RTDECL(RTCPUID) RTMpGetMaxCpuId(void) +{ + return VBOX_NR_CPUMASK_BITS - 1; +} +RT_EXPORT_SYMBOL(RTMpGetMaxCpuId); + + +RTDECL(bool) RTMpIsCpuPossible(RTCPUID idCpu) +{ +#if defined(CONFIG_SMP) +# if RTLNX_VER_MIN(2,6,2) || defined(cpu_possible) + return idCpu < VBOX_NR_CPUMASK_BITS && cpu_possible(idCpu); +# else /* < 2.5.29 */ + return idCpu < (RTCPUID)(smp_num_cpus); +# endif +#else + return idCpu == RTMpCpuId(); +#endif +} +RT_EXPORT_SYMBOL(RTMpIsCpuPossible); + + +RTDECL(PRTCPUSET) RTMpGetSet(PRTCPUSET pSet) +{ + RTCPUID idCpu; + + RTCpuSetEmpty(pSet); + idCpu = RTMpGetMaxCpuId(); + do + { + if (RTMpIsCpuPossible(idCpu)) + RTCpuSetAdd(pSet, idCpu); + } while (idCpu-- > 0); + return pSet; +} +RT_EXPORT_SYMBOL(RTMpGetSet); + + +RTDECL(RTCPUID) RTMpGetCount(void) +{ +#ifdef CONFIG_SMP +# if RTLNX_VER_MIN(2,6,4) || defined(num_possible_cpus) + return num_possible_cpus(); +# elif RTLNX_VER_MAX(2,5,0) + return smp_num_cpus; +# else + RTCPUSET Set; + RTMpGetSet(&Set); + return RTCpuSetCount(&Set); +# endif +#else + return 1; +#endif +} +RT_EXPORT_SYMBOL(RTMpGetCount); + + +RTDECL(bool) RTMpIsCpuOnline(RTCPUID idCpu) +{ +#ifdef CONFIG_SMP +# if RTLNX_VER_MIN(2,6,0) || defined(cpu_online) + return idCpu < VBOX_NR_CPUMASK_BITS && cpu_online(idCpu); +# else /* 2.4: */ + return idCpu < VBOX_NR_CPUMASK_BITS && cpu_online_map & RT_BIT_64(idCpu); +# endif +#else + return idCpu == RTMpCpuId(); +#endif +} +RT_EXPORT_SYMBOL(RTMpIsCpuOnline); + + +RTDECL(PRTCPUSET) RTMpGetOnlineSet(PRTCPUSET pSet) +{ +#ifdef CONFIG_SMP + RTCPUID idCpu; + + RTCpuSetEmpty(pSet); + idCpu = RTMpGetMaxCpuId(); + do + { + if (RTMpIsCpuOnline(idCpu)) + RTCpuSetAdd(pSet, idCpu); + } while (idCpu-- > 0); +#else + RTCpuSetEmpty(pSet); + RTCpuSetAdd(pSet, RTMpCpuId()); +#endif + return pSet; +} +RT_EXPORT_SYMBOL(RTMpGetOnlineSet); + + +RTDECL(RTCPUID) RTMpGetOnlineCount(void) +{ +#ifdef CONFIG_SMP +# if RTLNX_VER_MIN(2,6,0) || defined(num_online_cpus) + return num_online_cpus(); +# else + RTCPUSET Set; + RTMpGetOnlineSet(&Set); + return RTCpuSetCount(&Set); +# endif +#else + return 1; +#endif +} +RT_EXPORT_SYMBOL(RTMpGetOnlineCount); + + +RTDECL(bool) RTMpIsCpuWorkPending(void) +{ + /** @todo (not used on non-Windows platforms yet). */ + return false; +} +RT_EXPORT_SYMBOL(RTMpIsCpuWorkPending); + + +/** + * Wrapper between the native linux per-cpu callbacks and PFNRTWORKER. + * + * @param pvInfo Pointer to the RTMPARGS package. + */ +static void rtmpLinuxWrapper(void *pvInfo) +{ + PRTMPARGS pArgs = (PRTMPARGS)pvInfo; + ASMAtomicIncU32(&pArgs->cHits); + pArgs->pfnWorker(RTMpCpuId(), pArgs->pvUser1, pArgs->pvUser2); +} + +#ifdef CONFIG_SMP + +# if RTLNX_VER_MIN(2,6,27) +/** + * Wrapper between the native linux per-cpu callbacks and PFNRTWORKER, does hit + * increment after calling the worker. + * + * @param pvInfo Pointer to the RTMPARGS package. + */ +static void rtmpLinuxWrapperPostInc(void *pvInfo) +{ + PRTMPARGS pArgs = (PRTMPARGS)pvInfo; + pArgs->pfnWorker(RTMpCpuId(), pArgs->pvUser1, pArgs->pvUser2); + ASMAtomicIncU32(&pArgs->cHits); +} +# endif + + +/** + * Wrapper between the native linux all-cpu callbacks and PFNRTWORKER. + * + * @param pvInfo Pointer to the RTMPARGS package. + */ +static void rtmpLinuxAllWrapper(void *pvInfo) +{ + PRTMPARGS pArgs = (PRTMPARGS)pvInfo; + PRTCPUSET pWorkerSet = pArgs->pWorkerSet; + RTCPUID idCpu = RTMpCpuId(); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + if (RTCpuSetIsMember(pWorkerSet, idCpu)) + { + pArgs->pfnWorker(idCpu, pArgs->pvUser1, pArgs->pvUser2); + RTCpuSetDel(pWorkerSet, idCpu); + } +} + +#endif /* CONFIG_SMP */ + +RTDECL(int) RTMpOnAll(PFNRTMPWORKER pfnWorker, void *pvUser1, void *pvUser2) +{ + IPRT_LINUX_SAVE_EFL_AC(); + RTMPARGS Args; + RTCPUSET OnlineSet; + RTCPUID idCpu; +#ifdef CONFIG_SMP + uint32_t cLoops; +#endif + + RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER; + + Args.pfnWorker = pfnWorker; + Args.pvUser1 = pvUser1; + Args.pvUser2 = pvUser2; + Args.idCpu = NIL_RTCPUID; + Args.cHits = 0; + + RTThreadPreemptDisable(&PreemptState); + RTMpGetOnlineSet(&OnlineSet); + Args.pWorkerSet = &OnlineSet; + idCpu = RTMpCpuId(); + +#ifdef CONFIG_SMP + if (RTCpuSetCount(&OnlineSet) > 1) + { + /* Fire the function on all other CPUs without waiting for completion. */ +# if RTLNX_VER_MIN(5,3,0) + smp_call_function(rtmpLinuxAllWrapper, &Args, 0 /* wait */); +# elif RTLNX_VER_MIN(2,6,27) + int rc = smp_call_function(rtmpLinuxAllWrapper, &Args, 0 /* wait */); + Assert(!rc); NOREF(rc); +# else + int rc = smp_call_function(rtmpLinuxAllWrapper, &Args, 0 /* retry */, 0 /* wait */); + Assert(!rc); NOREF(rc); +# endif + } +#endif + + /* Fire the function on this CPU. */ + Args.pfnWorker(idCpu, Args.pvUser1, Args.pvUser2); + RTCpuSetDel(Args.pWorkerSet, idCpu); + +#ifdef CONFIG_SMP + /* Wait for all of them finish. */ + cLoops = 64000; + while (!RTCpuSetIsEmpty(Args.pWorkerSet)) + { + /* Periodically check if any CPU in the wait set has gone offline, if so update the wait set. */ + if (!cLoops--) + { + RTCPUSET OnlineSetNow; + RTMpGetOnlineSet(&OnlineSetNow); + RTCpuSetAnd(Args.pWorkerSet, &OnlineSetNow); + + cLoops = 64000; + } + + ASMNopPause(); + } +#endif + + RTThreadPreemptRestore(&PreemptState); + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTMpOnAll); + + +RTDECL(int) RTMpOnOthers(PFNRTMPWORKER pfnWorker, void *pvUser1, void *pvUser2) +{ +#ifdef CONFIG_SMP + IPRT_LINUX_SAVE_EFL_AC(); + RTMPARGS Args; + + RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER; + Args.pfnWorker = pfnWorker; + Args.pvUser1 = pvUser1; + Args.pvUser2 = pvUser2; + Args.idCpu = NIL_RTCPUID; + Args.cHits = 0; + + RTThreadPreemptDisable(&PreemptState); +# if RTLNX_VER_MIN(5,3,0) + smp_call_function(rtmpLinuxWrapper, &Args, 1 /* wait */); +# elif RTLNX_VER_MIN(2,6,27) + int rc = smp_call_function(rtmpLinuxWrapper, &Args, 1 /* wait */); + Assert(rc == 0); NOREF(rc); +# else /* older kernels */ + int rc = smp_call_function(rtmpLinuxWrapper, &Args, 0 /* retry */, 1 /* wait */); + Assert(rc == 0); NOREF(rc); +# endif /* older kernels */ + RTThreadPreemptRestore(&PreemptState); + + IPRT_LINUX_RESTORE_EFL_AC(); +#else + RT_NOREF(pfnWorker, pvUser1, pvUser2); +#endif + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTMpOnOthers); + + +#if RTLNX_VER_MAX(2,6,27) && defined(CONFIG_SMP) +/** + * Wrapper between the native linux per-cpu callbacks and PFNRTWORKER + * employed by RTMpOnPair on older kernels that lacks smp_call_function_many. + * + * @param pvInfo Pointer to the RTMPARGS package. + */ +static void rtMpLinuxOnPairWrapper(void *pvInfo) +{ + PRTMPARGS pArgs = (PRTMPARGS)pvInfo; + RTCPUID idCpu = RTMpCpuId(); + + if ( idCpu == pArgs->idCpu + || idCpu == pArgs->idCpu2) + { + pArgs->pfnWorker(idCpu, pArgs->pvUser1, pArgs->pvUser2); + ASMAtomicIncU32(&pArgs->cHits); + } +} +#endif + + +RTDECL(int) RTMpOnPair(RTCPUID idCpu1, RTCPUID idCpu2, uint32_t fFlags, PFNRTMPWORKER pfnWorker, void *pvUser1, void *pvUser2) +{ +#ifdef CONFIG_SMP + IPRT_LINUX_SAVE_EFL_AC(); + int rc; + RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER; +# if RTLNX_VER_MIN(2,6,28) /* 2.6.28 introduces CONFIG_CPUMASK_OFFSTACK */ + cpumask_var_t DstCpuMask; +# elif RTLNX_VER_MIN(2,6,27) + cpumask_t DstCpuMask; +# endif + + AssertReturn(idCpu1 != idCpu2, VERR_INVALID_PARAMETER); + AssertReturn(!(fFlags & RTMPON_F_VALID_MASK), VERR_INVALID_FLAGS); + + /* + * Prepare the CPU mask before we disable preemption. + */ +# if RTLNX_VER_MIN(2,6,30) + if (!zalloc_cpumask_var(&DstCpuMask, GFP_KERNEL)) + return VERR_NO_MEMORY; + cpumask_set_cpu(idCpu1, DstCpuMask); + cpumask_set_cpu(idCpu2, DstCpuMask); +# elif RTLNX_VER_MIN(2,6,28) + if (!alloc_cpumask_var(&DstCpuMask, GFP_KERNEL)) + return VERR_NO_MEMORY; + cpumask_clear(DstCpuMask); + cpumask_set_cpu(idCpu1, DstCpuMask); + cpumask_set_cpu(idCpu2, DstCpuMask); +# elif RTLNX_VER_MIN(2,6,27) + cpus_clear(DstCpuMask); + cpu_set(idCpu1, DstCpuMask); + cpu_set(idCpu2, DstCpuMask); +# endif + + /* + * Check that both CPUs are online before doing the broadcast call. + */ + RTThreadPreemptDisable(&PreemptState); + if ( RTMpIsCpuOnline(idCpu1) + && RTMpIsCpuOnline(idCpu2)) + { + /* + * Use the smp_call_function variant taking a cpu mask where available, + * falling back on broadcast with filter. Slight snag if one of the + * CPUs is the one we're running on, we must do the call and the post + * call wait ourselves. + */ + RTCPUID idCpuSelf = RTMpCpuId(); + bool const fCallSelf = idCpuSelf == idCpu1 || idCpuSelf == idCpu2; + RTMPARGS Args; + Args.pfnWorker = pfnWorker; + Args.pvUser1 = pvUser1; + Args.pvUser2 = pvUser2; + Args.idCpu = idCpu1; + Args.idCpu2 = idCpu2; + Args.cHits = 0; + +# if RTLNX_VER_MIN(2,6,28) + smp_call_function_many(DstCpuMask, rtmpLinuxWrapperPostInc, &Args, !fCallSelf /* wait */); + rc = 0; +# elif RTLNX_VER_MIN(2,6,27) + rc = smp_call_function_mask(DstCpuMask, rtmpLinuxWrapperPostInc, &Args, !fCallSelf /* wait */); +# else /* older kernels */ + rc = smp_call_function(rtMpLinuxOnPairWrapper, &Args, 0 /* retry */, !fCallSelf /* wait */); +# endif /* older kernels */ + Assert(rc == 0); + + /* Call ourselves if necessary and wait for the other party to be done. */ + if (fCallSelf) + { + uint32_t cLoops = 0; + rtmpLinuxWrapper(&Args); + while (ASMAtomicReadU32(&Args.cHits) < 2) + { + if ((cLoops & 0x1ff) == 0 && !RTMpIsCpuOnline(idCpuSelf == idCpu1 ? idCpu2 : idCpu1)) + break; + cLoops++; + ASMNopPause(); + } + } + + Assert(Args.cHits <= 2); + if (Args.cHits == 2) + rc = VINF_SUCCESS; + else if (Args.cHits == 1) + rc = VERR_NOT_ALL_CPUS_SHOWED; + else if (Args.cHits == 0) + rc = VERR_CPU_OFFLINE; + else + rc = VERR_CPU_IPE_1; + } + /* + * A CPU must be present to be considered just offline. + */ + else if ( RTMpIsCpuPresent(idCpu1) + && RTMpIsCpuPresent(idCpu2)) + rc = VERR_CPU_OFFLINE; + else + rc = VERR_CPU_NOT_FOUND; + + RTThreadPreemptRestore(&PreemptState);; +# if RTLNX_VER_MIN(2,6,28) + free_cpumask_var(DstCpuMask); +# endif + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; + +#else /* !CONFIG_SMP */ + RT_NOREF(idCpu1, idCpu2, fFlags, pfnWorker, pvUser1, pvUser2); + return VERR_CPU_NOT_FOUND; +#endif /* !CONFIG_SMP */ +} +RT_EXPORT_SYMBOL(RTMpOnPair); + + +RTDECL(bool) RTMpOnPairIsConcurrentExecSupported(void) +{ + return true; +} +RT_EXPORT_SYMBOL(RTMpOnPairIsConcurrentExecSupported); + + +#if RTLNX_VER_MAX(2,6,19) && defined(CONFIG_SMP) +/** + * Wrapper between the native linux per-cpu callbacks and PFNRTWORKER + * employed by RTMpOnSpecific on older kernels that lacks smp_call_function_single. + * + * @param pvInfo Pointer to the RTMPARGS package. + */ +static void rtmpOnSpecificLinuxWrapper(void *pvInfo) +{ + PRTMPARGS pArgs = (PRTMPARGS)pvInfo; + RTCPUID idCpu = RTMpCpuId(); + + if (idCpu == pArgs->idCpu) + { + pArgs->pfnWorker(idCpu, pArgs->pvUser1, pArgs->pvUser2); + ASMAtomicIncU32(&pArgs->cHits); + } +} +#endif + + +RTDECL(int) RTMpOnSpecific(RTCPUID idCpu, PFNRTMPWORKER pfnWorker, void *pvUser1, void *pvUser2) +{ + IPRT_LINUX_SAVE_EFL_AC(); + int rc; + RTMPARGS Args; + + RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER; + Args.pfnWorker = pfnWorker; + Args.pvUser1 = pvUser1; + Args.pvUser2 = pvUser2; + Args.idCpu = idCpu; + Args.cHits = 0; + + if (!RTMpIsCpuPossible(idCpu)) + return VERR_CPU_NOT_FOUND; + + RTThreadPreemptDisable(&PreemptState); + if (idCpu != RTMpCpuId()) + { +#ifdef CONFIG_SMP + if (RTMpIsCpuOnline(idCpu)) + { +# if RTLNX_VER_MIN(2,6,27) + rc = smp_call_function_single(idCpu, rtmpLinuxWrapper, &Args, 1 /* wait */); +# elif RTLNX_VER_MIN(2,6,19) + rc = smp_call_function_single(idCpu, rtmpLinuxWrapper, &Args, 0 /* retry */, 1 /* wait */); +# else /* older kernels */ + rc = smp_call_function(rtmpOnSpecificLinuxWrapper, &Args, 0 /* retry */, 1 /* wait */); +# endif /* older kernels */ + Assert(rc == 0); + rc = Args.cHits ? VINF_SUCCESS : VERR_CPU_OFFLINE; + } + else +#endif /* CONFIG_SMP */ + rc = VERR_CPU_OFFLINE; + } + else + { + rtmpLinuxWrapper(&Args); + rc = VINF_SUCCESS; + } + RTThreadPreemptRestore(&PreemptState);; + + NOREF(rc); + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +} +RT_EXPORT_SYMBOL(RTMpOnSpecific); + + +#if RTLNX_VER_MIN(2,6,19) && defined(CONFIG_SMP) +/** + * Dummy callback used by RTMpPokeCpu. + * + * @param pvInfo Ignored. + */ +static void rtmpLinuxPokeCpuCallback(void *pvInfo) +{ + NOREF(pvInfo); +} +#endif + + +RTDECL(int) RTMpPokeCpu(RTCPUID idCpu) +{ +#if RTLNX_VER_MIN(2,6,19) + IPRT_LINUX_SAVE_EFL_AC(); + int rc; + if (RTMpIsCpuPossible(idCpu)) + { + if (RTMpIsCpuOnline(idCpu)) + { +# ifdef CONFIG_SMP +# if RTLNX_VER_MIN(2,6,27) + rc = smp_call_function_single(idCpu, rtmpLinuxPokeCpuCallback, NULL, 0 /* wait */); +# elif RTLNX_VER_MIN(2,6,19) + rc = smp_call_function_single(idCpu, rtmpLinuxPokeCpuCallback, NULL, 0 /* retry */, 0 /* wait */); +# else /* older kernels */ +# error oops +# endif /* older kernels */ + Assert(rc == 0); +# endif /* CONFIG_SMP */ + rc = VINF_SUCCESS; + } + else + rc = VERR_CPU_OFFLINE; + } + else + rc = VERR_CPU_NOT_FOUND; + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; + +#else /* older kernels */ + /* no unicast here? */ + return VERR_NOT_SUPPORTED; +#endif /* older kernels */ +} +RT_EXPORT_SYMBOL(RTMpPokeCpu); + + +RTDECL(bool) RTMpOnAllIsConcurrentSafe(void) +{ + return true; +} +RT_EXPORT_SYMBOL(RTMpOnAllIsConcurrentSafe); + diff --git a/src/VBox/Runtime/r0drv/linux/mpnotification-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/mpnotification-r0drv-linux.c new file mode 100644 index 00000000..9b649b85 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/mpnotification-r0drv-linux.c @@ -0,0 +1,248 @@ +/* $Id: mpnotification-r0drv-linux.c $ */ +/** @file + * IPRT - Multiprocessor Event Notifications, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2008-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" + +#include <iprt/asm-amd64-x86.h> +#include <iprt/errcore.h> +#include <iprt/cpuset.h> +#include <iprt/thread.h> +#include "r0drv/mp-r0drv.h" + +#if RTLNX_VER_MIN(4,10,0) + +static enum cpuhp_state g_rtR0MpOnline; + +/* + * Linux 4.10 completely removed CPU notifiers. So let's switch to CPU hotplug + * notification. + */ + +static int rtR0MpNotificationLinuxOnline(unsigned int cpu) +{ + RTCPUID idCpu = RTMpCpuIdFromSetIndex(cpu); + rtMpNotificationDoCallbacks(RTMPEVENT_ONLINE, idCpu); + return 0; +} + +static int rtR0MpNotificationLinuxOffline(unsigned int cpu) +{ + RTCPUID idCpu = RTMpCpuIdFromSetIndex(cpu); + rtMpNotificationDoCallbacks(RTMPEVENT_OFFLINE, idCpu); + return 0; +} + +DECLHIDDEN(int) rtR0MpNotificationNativeInit(void) +{ + int rc; + IPRT_LINUX_SAVE_EFL_AC(); + rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "vboxdrv:online", + rtR0MpNotificationLinuxOnline, rtR0MpNotificationLinuxOffline); + IPRT_LINUX_RESTORE_EFL_AC(); + /* + * cpuhp_setup_state_nocalls() returns a positive state number for + * CPUHP_AP_ONLINE_DYN or -ENOSPC if there is no free slot available + * (see cpuhp_reserve_state / definition of CPUHP_AP_ONLINE_DYN). + */ + AssertMsgReturn(rc > 0, ("%d\n", rc), RTErrConvertFromErrno(rc)); + g_rtR0MpOnline = rc; + return VINF_SUCCESS; +} + + +DECLHIDDEN(void) rtR0MpNotificationNativeTerm(void) +{ + IPRT_LINUX_SAVE_EFL_AC(); + cpuhp_remove_state_nocalls(g_rtR0MpOnline); + IPRT_LINUX_RESTORE_EFL_AC(); +} + +#elif RTLNX_VER_MIN(2,5,71) && defined(CONFIG_SMP) + +static int rtMpNotificationLinuxCallback(struct notifier_block *pNotifierBlock, unsigned long ulNativeEvent, void *pvCpu); + +/** + * The notifier block we use for registering the callback. + */ +static struct notifier_block g_NotifierBlock = +{ + .notifier_call = rtMpNotificationLinuxCallback, + .next = NULL, + .priority = 0 +}; + +# ifdef CPU_DOWN_FAILED +/** + * The set of CPUs we've seen going offline recently. + */ +static RTCPUSET g_MpPendingOfflineSet; +# endif + + +/** + * The native callback. + * + * @returns NOTIFY_DONE. + * @param pNotifierBlock Pointer to g_NotifierBlock. + * @param ulNativeEvent The native event. + * @param pvCpu The cpu id cast into a pointer value. + * + * @remarks This can fire with preemption enabled and on any CPU. + */ +static int rtMpNotificationLinuxCallback(struct notifier_block *pNotifierBlock, unsigned long ulNativeEvent, void *pvCpu) +{ + bool fProcessEvent = false; + RTCPUID idCpu = (uintptr_t)pvCpu; + NOREF(pNotifierBlock); + + /* + * Note that redhat/CentOS ported _some_ of the FROZEN macros + * back to their 2.6.18-92.1.10.el5 kernel but actually don't + * use them. Thus we have to test for both CPU_TASKS_FROZEN and + * the individual event variants. + */ + switch (ulNativeEvent) + { + /* + * Pick up online events or failures to go offline. + * Ignore failure events for CPUs we didn't see go offline. + */ +# ifdef CPU_DOWN_FAILED + case CPU_DOWN_FAILED: +# if defined(CPU_TASKS_FROZEN) && defined(CPU_DOWN_FAILED_FROZEN) + case CPU_DOWN_FAILED_FROZEN: +# endif + if (!RTCpuSetIsMember(&g_MpPendingOfflineSet, idCpu)) + break; /* fProcessEvents = false */ + /* fall thru */ +# endif + case CPU_ONLINE: +# if defined(CPU_TASKS_FROZEN) && defined(CPU_ONLINE_FROZEN) + case CPU_ONLINE_FROZEN: +# endif +# ifdef CPU_DOWN_FAILED + RTCpuSetDel(&g_MpPendingOfflineSet, idCpu); +# endif + fProcessEvent = true; + break; + + /* + * Pick the earliest possible offline event. + * The only important thing here is that we get the event and that + * it's exactly one. + */ +# ifdef CPU_DOWN_PREPARE + case CPU_DOWN_PREPARE: +# if defined(CPU_TASKS_FROZEN) && defined(CPU_DOWN_PREPARE_FROZEN) + case CPU_DOWN_PREPARE_FROZEN: +# endif + fProcessEvent = true; +# else + case CPU_DEAD: +# if defined(CPU_TASKS_FROZEN) && defined(CPU_DEAD_FROZEN) + case CPU_DEAD_FROZEN: +# endif + /* Don't process CPU_DEAD notifications. */ +# endif +# ifdef CPU_DOWN_FAILED + RTCpuSetAdd(&g_MpPendingOfflineSet, idCpu); +# endif + break; + } + + if (!fProcessEvent) + return NOTIFY_DONE; + + switch (ulNativeEvent) + { +# ifdef CPU_DOWN_FAILED + case CPU_DOWN_FAILED: +# if defined(CPU_TASKS_FROZEN) && defined(CPU_DOWN_FAILED_FROZEN) + case CPU_DOWN_FAILED_FROZEN: +# endif +# endif + case CPU_ONLINE: +# if defined(CPU_TASKS_FROZEN) && defined(CPU_ONLINE_FROZEN) + case CPU_ONLINE_FROZEN: +# endif + rtMpNotificationDoCallbacks(RTMPEVENT_ONLINE, idCpu); + break; + +# ifdef CPU_DOWN_PREPARE + case CPU_DOWN_PREPARE: +# if defined(CPU_TASKS_FROZEN) && defined(CPU_DOWN_PREPARE_FROZEN) + case CPU_DOWN_PREPARE_FROZEN: +# endif + rtMpNotificationDoCallbacks(RTMPEVENT_OFFLINE, idCpu); + break; +# endif + } + + return NOTIFY_DONE; +} + + +DECLHIDDEN(int) rtR0MpNotificationNativeInit(void) +{ + int rc; + IPRT_LINUX_SAVE_EFL_AC(); + +# ifdef CPU_DOWN_FAILED + RTCpuSetEmpty(&g_MpPendingOfflineSet); +# endif + + rc = register_cpu_notifier(&g_NotifierBlock); + IPRT_LINUX_RESTORE_EFL_AC(); + AssertMsgReturn(!rc, ("%d\n", rc), RTErrConvertFromErrno(rc)); + return VINF_SUCCESS; +} + + +DECLHIDDEN(void) rtR0MpNotificationNativeTerm(void) +{ + IPRT_LINUX_SAVE_EFL_AC(); + unregister_cpu_notifier(&g_NotifierBlock); + IPRT_LINUX_RESTORE_EFL_AC(); +} + +#else /* Not supported / Not needed */ + +DECLHIDDEN(int) rtR0MpNotificationNativeInit(void) +{ + return VINF_SUCCESS; +} + +DECLHIDDEN(void) rtR0MpNotificationNativeTerm(void) +{ +} + +#endif /* Not supported / Not needed */ + diff --git a/src/VBox/Runtime/r0drv/linux/process-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/process-r0drv-linux.c new file mode 100644 index 00000000..9c2bcd93 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/process-r0drv-linux.c @@ -0,0 +1,49 @@ +/* $Id: process-r0drv-linux.c $ */ +/** @file + * IPRT - Process, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" + +#include <iprt/process.h> + + +RTDECL(RTPROCESS) RTProcSelf(void) +{ + return (RTPROCESS)current->tgid; +} +RT_EXPORT_SYMBOL(RTProcSelf); + + +RTR0DECL(RTR0PROCESS) RTR0ProcHandleSelf(void) +{ + return (RTR0PROCESS)current->tgid; +} +RT_EXPORT_SYMBOL(RTR0ProcHandleSelf); + diff --git a/src/VBox/Runtime/r0drv/linux/rtStrFormatKernelAddress-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/rtStrFormatKernelAddress-r0drv-linux.c new file mode 100644 index 00000000..b9839e64 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/rtStrFormatKernelAddress-r0drv-linux.c @@ -0,0 +1,56 @@ +/* $Id: rtStrFormatKernelAddress-r0drv-linux.c $ */ +/** @file + * IPRT - IPRT String Formatter, ring-0 addresses. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_STRING +#include "the-linux-kernel.h" +#include "internal/iprt.h" + +#include <iprt/assert.h> +#include <iprt/string.h> + +#include "internal/string.h" + + +DECLHIDDEN(size_t) rtStrFormatKernelAddress(char *pszBuf, size_t cbBuf, RTR0INTPTR uPtr, signed int cchWidth, + signed int cchPrecision, unsigned int fFlags) +{ +#if !defined(DEBUG) && RTLNX_VER_MIN(2,6,38) + RT_NOREF(cchWidth, cchPrecision); + /* use the Linux kernel function which is able to handle "%pK" */ + static const char s_szFmt[] = "0x%pK"; + const char *pszFmt = s_szFmt; + if (!(fFlags & RTSTR_F_SPECIAL)) + pszFmt += 2; + return scnprintf(pszBuf, cbBuf, pszFmt, uPtr); +#else + Assert(cbBuf >= 64); + return RTStrFormatNumber(pszBuf, uPtr, 16, cchWidth, cchPrecision, fFlags); +#endif +} diff --git a/src/VBox/Runtime/r0drv/linux/semevent-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/semevent-r0drv-linux.c new file mode 100644 index 00000000..1ccc9eae --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/semevent-r0drv-linux.c @@ -0,0 +1,279 @@ +/* $Id: semevent-r0drv-linux.c $ */ +/** @file + * IPRT - Single Release Event Semaphores, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define RTSEMEVENT_WITHOUT_REMAPPING +#include "the-linux-kernel.h" +#include "internal/iprt.h" +#include <iprt/semaphore.h> + +#include <iprt/asm.h> +#include <iprt/assert.h> +#include <iprt/err.h> +#include <iprt/lockvalidator.h> +#include <iprt/mem.h> + +#include "waitqueue-r0drv-linux.h" +#include "internal/magics.h" + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * Linux event semaphore. + */ +typedef struct RTSEMEVENTINTERNAL +{ + /** Magic value (RTSEMEVENT_MAGIC). */ + uint32_t volatile u32Magic; + /** The object status - !0 when signaled and 0 when reset. */ + uint32_t volatile fState; + /** Reference counter. */ + uint32_t volatile cRefs; + /** The wait queue. */ + wait_queue_head_t Head; +} RTSEMEVENTINTERNAL, *PRTSEMEVENTINTERNAL; + + + +RTDECL(int) RTSemEventCreate(PRTSEMEVENT phEventSem) +{ + return RTSemEventCreateEx(phEventSem, 0 /*fFlags*/, NIL_RTLOCKVALCLASS, NULL); +} + + +RTDECL(int) RTSemEventCreateEx(PRTSEMEVENT phEventSem, uint32_t fFlags, RTLOCKVALCLASS hClass, const char *pszNameFmt, ...) +{ + PRTSEMEVENTINTERNAL pThis; + IPRT_LINUX_SAVE_EFL_AC(); + RT_NOREF_PV(hClass); RT_NOREF_PV(pszNameFmt); + + AssertReturn(!(fFlags & ~(RTSEMEVENT_FLAGS_NO_LOCK_VAL | RTSEMEVENT_FLAGS_BOOTSTRAP_HACK)), VERR_INVALID_PARAMETER); + Assert(!(fFlags & RTSEMEVENT_FLAGS_BOOTSTRAP_HACK) || (fFlags & RTSEMEVENT_FLAGS_NO_LOCK_VAL)); + + pThis = (PRTSEMEVENTINTERNAL)RTMemAlloc(sizeof(*pThis)); + if (!pThis) + return VERR_NO_MEMORY; + + pThis->u32Magic = RTSEMEVENT_MAGIC; + pThis->fState = 0; + pThis->cRefs = 1; + init_waitqueue_head(&pThis->Head); + + *phEventSem = pThis; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSemEventCreate); + + +/** + * Retains a reference to the event semaphore. + * + * @param pThis The event semaphore. + */ +DECLINLINE(void) rtR0SemEventLnxRetain(PRTSEMEVENTINTERNAL pThis) +{ + uint32_t cRefs = ASMAtomicIncU32(&pThis->cRefs); + Assert(cRefs < 100000); NOREF(cRefs); +} + + +/** + * Releases a reference to the event semaphore. + * + * @param pThis The event semaphore. + */ +DECLINLINE(void) rtR0SemEventLnxRelease(PRTSEMEVENTINTERNAL pThis) +{ + if (RT_UNLIKELY(ASMAtomicDecU32(&pThis->cRefs) == 0)) + RTMemFree(pThis); +} + + +RTDECL(int) RTSemEventDestroy(RTSEMEVENT hEventSem) +{ + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate input. + */ + PRTSEMEVENTINTERNAL pThis = hEventSem; + if (pThis == NIL_RTSEMEVENT) + return VINF_SUCCESS; + AssertMsgReturn(pThis->u32Magic == RTSEMEVENT_MAGIC, ("pThis->u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE); + Assert(pThis->cRefs > 0); + + /* + * Invalidate it and signal the object just in case. + */ + ASMAtomicWriteU32(&pThis->u32Magic, ~RTSEMEVENT_MAGIC); + ASMAtomicWriteU32(&pThis->fState, 0); + Assert(!waitqueue_active(&pThis->Head)); + wake_up_all(&pThis->Head); + rtR0SemEventLnxRelease(pThis); + + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSemEventDestroy); + + +RTDECL(int) RTSemEventSignal(RTSEMEVENT hEventSem) +{ + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate input. + */ + PRTSEMEVENTINTERNAL pThis = (PRTSEMEVENTINTERNAL)hEventSem; + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertMsgReturn(pThis->u32Magic == RTSEMEVENT_MAGIC, ("pThis->u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE); + rtR0SemEventLnxRetain(pThis); + + /* + * Signal the event object. + */ + ASMAtomicWriteU32(&pThis->fState, 1); + wake_up(&pThis->Head); + + rtR0SemEventLnxRelease(pThis); + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSemEventSignal); + + +/** + * Worker for RTSemEventWaitEx and RTSemEventWaitExDebug. + * + * @returns VBox status code. + * @param pThis The event semaphore. + * @param fFlags See RTSemEventWaitEx. + * @param uTimeout See RTSemEventWaitEx. + * @param pSrcPos The source code position of the wait. + */ +static int rtR0SemEventLnxWait(PRTSEMEVENTINTERNAL pThis, uint32_t fFlags, uint64_t uTimeout, + PCRTLOCKVALSRCPOS pSrcPos) +{ + int rc; + RT_NOREF_PV(pSrcPos); + + /* + * Validate the input. + */ + AssertPtrReturn(pThis, VERR_INVALID_PARAMETER); + AssertMsgReturn(pThis->u32Magic == RTSEMEVENT_MAGIC, ("%p u32Magic=%RX32\n", pThis, pThis->u32Magic), VERR_INVALID_PARAMETER); + AssertReturn(RTSEMWAIT_FLAGS_ARE_VALID(fFlags), VERR_INVALID_PARAMETER); + rtR0SemEventLnxRetain(pThis); + + /* + * Try grab the event without setting up the wait. + */ + if ( 1 /** @todo check if there are someone waiting already - waitqueue_active, but then what do we do below? */ + && ASMAtomicCmpXchgU32(&pThis->fState, 0, 1)) + rc = VINF_SUCCESS; + else + { + /* + * We have to wait. + */ + IPRT_LINUX_SAVE_EFL_AC(); + RTR0SEMLNXWAIT Wait; + rc = rtR0SemLnxWaitInit(&Wait, fFlags, uTimeout, &pThis->Head); + if (RT_SUCCESS(rc)) + { + IPRT_DEBUG_SEMS_STATE(pThis, 'E'); + for (;;) + { + /* The destruction test. */ + if (RT_UNLIKELY(pThis->u32Magic != RTSEMEVENT_MAGIC)) + rc = VERR_SEM_DESTROYED; + else + { + rtR0SemLnxWaitPrepare(&Wait); + + /* Check the exit conditions. */ + if (RT_UNLIKELY(pThis->u32Magic != RTSEMEVENT_MAGIC)) + rc = VERR_SEM_DESTROYED; + else if (ASMAtomicCmpXchgU32(&pThis->fState, 0, 1)) + rc = VINF_SUCCESS; + else if (rtR0SemLnxWaitHasTimedOut(&Wait)) + rc = VERR_TIMEOUT; + else if (rtR0SemLnxWaitWasInterrupted(&Wait)) + rc = VERR_INTERRUPTED; + else + { + /* Do the wait and then recheck the conditions. */ + rtR0SemLnxWaitDoIt(&Wait); + continue; + } + } + break; + } + + rtR0SemLnxWaitDelete(&Wait); + IPRT_DEBUG_SEMS_STATE_RC(pThis, 'E', rc); + } + IPRT_LINUX_RESTORE_EFL_AC(); + } + + rtR0SemEventLnxRelease(pThis); + return rc; +} + + +RTDECL(int) RTSemEventWaitEx(RTSEMEVENT hEventSem, uint32_t fFlags, uint64_t uTimeout) +{ +#ifndef RTSEMEVENT_STRICT + return rtR0SemEventLnxWait(hEventSem, fFlags, uTimeout, NULL); +#else + RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API(); + return rtR0SemEventLnxWait(hEventSem, fFlags, uTimeout, &SrcPos); +#endif +} +RT_EXPORT_SYMBOL(RTSemEventWaitEx); + + +RTDECL(int) RTSemEventWaitExDebug(RTSEMEVENT hEventSem, uint32_t fFlags, uint64_t uTimeout, + RTHCUINTPTR uId, RT_SRC_POS_DECL) +{ + RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_DEBUG_API(); + return rtR0SemEventLnxWait(hEventSem, fFlags, uTimeout, &SrcPos); +} +RT_EXPORT_SYMBOL(RTSemEventWaitExDebug); + + +RTDECL(uint32_t) RTSemEventGetResolution(void) +{ + return rtR0SemLnxWaitGetResolution(); +} +RT_EXPORT_SYMBOL(RTSemEventGetResolution); + diff --git a/src/VBox/Runtime/r0drv/linux/semeventmulti-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/semeventmulti-r0drv-linux.c new file mode 100644 index 00000000..78ff6572 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/semeventmulti-r0drv-linux.c @@ -0,0 +1,344 @@ +/* $Id: semeventmulti-r0drv-linux.c $ */ +/** @file + * IPRT - Multiple Release Event Semaphores, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define RTSEMEVENTMULTI_WITHOUT_REMAPPING +#include "the-linux-kernel.h" +#include "internal/iprt.h" +#include <iprt/semaphore.h> + +#include <iprt/assert.h> +#include <iprt/asm.h> +#include <iprt/err.h> +#include <iprt/mem.h> +#include <iprt/lockvalidator.h> + +#include "waitqueue-r0drv-linux.h" +#include "internal/magics.h" + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ +/** @name fStateAndGen values + * @{ */ +/** The state bit number. */ +#define RTSEMEVENTMULTILNX_STATE_BIT 0 +/** The state mask. */ +#define RTSEMEVENTMULTILNX_STATE_MASK RT_BIT_32(RTSEMEVENTMULTILNX_STATE_BIT) +/** The generation mask. */ +#define RTSEMEVENTMULTILNX_GEN_MASK ~RTSEMEVENTMULTILNX_STATE_MASK +/** The generation shift. */ +#define RTSEMEVENTMULTILNX_GEN_SHIFT 1 +/** The initial variable value. */ +#define RTSEMEVENTMULTILNX_STATE_GEN_INIT UINT32_C(0xfffffffc) +/** @} */ + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * Linux event semaphore. + */ +typedef struct RTSEMEVENTMULTIINTERNAL +{ + /** Magic value (RTSEMEVENTMULTI_MAGIC). */ + uint32_t volatile u32Magic; + /** The object state bit and generation counter. + * The generation counter is incremented every time the object is + * signalled. */ + uint32_t volatile fStateAndGen; + /** Reference counter. */ + uint32_t volatile cRefs; + /** The wait queue. */ + wait_queue_head_t Head; +} RTSEMEVENTMULTIINTERNAL, *PRTSEMEVENTMULTIINTERNAL; + + + + + +RTDECL(int) RTSemEventMultiCreate(PRTSEMEVENTMULTI phEventMultiSem) +{ + return RTSemEventMultiCreateEx(phEventMultiSem, 0 /*fFlags*/, NIL_RTLOCKVALCLASS, NULL); +} + + +RTDECL(int) RTSemEventMultiCreateEx(PRTSEMEVENTMULTI phEventMultiSem, uint32_t fFlags, RTLOCKVALCLASS hClass, + const char *pszNameFmt, ...) +{ + PRTSEMEVENTMULTIINTERNAL pThis; + IPRT_LINUX_SAVE_EFL_AC(); + RT_NOREF_PV(hClass); RT_NOREF_PV(pszNameFmt); + + AssertReturn(!(fFlags & ~RTSEMEVENTMULTI_FLAGS_NO_LOCK_VAL), VERR_INVALID_PARAMETER); + pThis = (PRTSEMEVENTMULTIINTERNAL)RTMemAlloc(sizeof(*pThis)); + if (pThis) + { + pThis->u32Magic = RTSEMEVENTMULTI_MAGIC; + pThis->fStateAndGen = RTSEMEVENTMULTILNX_STATE_GEN_INIT; + pThis->cRefs = 1; + init_waitqueue_head(&pThis->Head); + + *phEventMultiSem = pThis; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; + } + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_NO_MEMORY; +} +RT_EXPORT_SYMBOL(RTSemEventMultiCreate); + + +/** + * Retain a reference to the semaphore. + * + * @param pThis The semaphore. + */ +DECLINLINE(void) rtR0SemEventMultiLnxRetain(PRTSEMEVENTMULTIINTERNAL pThis) +{ + uint32_t cRefs = ASMAtomicIncU32(&pThis->cRefs); + NOREF(cRefs); + Assert(cRefs && cRefs < 100000); +} + + +/** + * Release a reference, destroy the thing if necessary. + * + * @param pThis The semaphore. + */ +DECLINLINE(void) rtR0SemEventMultiLnxRelease(PRTSEMEVENTMULTIINTERNAL pThis) +{ + if (RT_UNLIKELY(ASMAtomicDecU32(&pThis->cRefs) == 0)) + { + Assert(pThis->u32Magic != RTSEMEVENTMULTI_MAGIC); + RTMemFree(pThis); + } +} + + +RTDECL(int) RTSemEventMultiDestroy(RTSEMEVENTMULTI hEventMultiSem) +{ + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate input. + */ + PRTSEMEVENTMULTIINTERNAL pThis = (PRTSEMEVENTMULTIINTERNAL)hEventMultiSem; + if (pThis == NIL_RTSEMEVENTMULTI) + return VINF_SUCCESS; + AssertPtrReturn(pThis, VERR_INVALID_PARAMETER); + AssertMsgReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, ("%p u32Magic=%RX32\n", pThis, pThis->u32Magic), VERR_INVALID_PARAMETER); + Assert(pThis->cRefs > 0); + + /* + * Invalidate it and signal the object just in case. + */ + ASMAtomicWriteU32(&pThis->u32Magic, ~RTSEMEVENTMULTI_MAGIC); + ASMAtomicAndU32(&pThis->fStateAndGen, RTSEMEVENTMULTILNX_GEN_MASK); + Assert(!waitqueue_active(&pThis->Head)); + wake_up_all(&pThis->Head); + rtR0SemEventMultiLnxRelease(pThis); + + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSemEventMultiDestroy); + + +RTDECL(int) RTSemEventMultiSignal(RTSEMEVENTMULTI hEventMultiSem) +{ + IPRT_LINUX_SAVE_EFL_AC(); + uint32_t fNew; + uint32_t fOld; + + /* + * Validate input. + */ + PRTSEMEVENTMULTIINTERNAL pThis = (PRTSEMEVENTMULTIINTERNAL)hEventMultiSem; + if (!pThis) + return VERR_INVALID_PARAMETER; + AssertPtrReturn(pThis, VERR_INVALID_PARAMETER); + AssertMsgReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, ("%p u32Magic=%RX32\n", pThis, pThis->u32Magic), VERR_INVALID_PARAMETER); + rtR0SemEventMultiLnxRetain(pThis); + + /* + * Signal the event object. The cause of the paranoia here is racing to try + * deal with racing RTSemEventMultiSignal calls (should probably be + * forbidden, but it's relatively easy to handle). + */ + do + { + fNew = fOld = ASMAtomicUoReadU32(&pThis->fStateAndGen); + fNew += 1 << RTSEMEVENTMULTILNX_GEN_SHIFT; + fNew |= RTSEMEVENTMULTILNX_STATE_MASK; + } + while (!ASMAtomicCmpXchgU32(&pThis->fStateAndGen, fNew, fOld)); + + wake_up_all(&pThis->Head); + + rtR0SemEventMultiLnxRelease(pThis); + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSemEventMultiSignal); + + +RTDECL(int) RTSemEventMultiReset(RTSEMEVENTMULTI hEventMultiSem) +{ + /* + * Validate input. + */ + PRTSEMEVENTMULTIINTERNAL pThis = (PRTSEMEVENTMULTIINTERNAL)hEventMultiSem; + if (!pThis) + return VERR_INVALID_PARAMETER; + AssertPtrReturn(pThis, VERR_INVALID_PARAMETER); + AssertMsgReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, ("%p u32Magic=%RX32\n", pThis, pThis->u32Magic), VERR_INVALID_PARAMETER); + rtR0SemEventMultiLnxRetain(pThis); + + /* + * Reset it. + */ + ASMAtomicAndU32(&pThis->fStateAndGen, ~RTSEMEVENTMULTILNX_STATE_MASK); + + rtR0SemEventMultiLnxRelease(pThis); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSemEventMultiReset); + + +/** + * Worker for RTSemEventMultiWaitEx and RTSemEventMultiWaitExDebug. + * + * @returns VBox status code. + * @param pThis The event semaphore. + * @param fFlags See RTSemEventMultiWaitEx. + * @param uTimeout See RTSemEventMultiWaitEx. + * @param pSrcPos The source code position of the wait. + */ +static int rtR0SemEventMultiLnxWait(PRTSEMEVENTMULTIINTERNAL pThis, uint32_t fFlags, uint64_t uTimeout, + PCRTLOCKVALSRCPOS pSrcPos) +{ + uint32_t fOrgStateAndGen; + int rc; + RT_NOREF_PV(pSrcPos); + + /* + * Validate the input. + */ + AssertPtrReturn(pThis, VERR_INVALID_PARAMETER); + AssertMsgReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, ("%p u32Magic=%RX32\n", pThis, pThis->u32Magic), VERR_INVALID_PARAMETER); + AssertReturn(RTSEMWAIT_FLAGS_ARE_VALID(fFlags), VERR_INVALID_PARAMETER); + rtR0SemEventMultiLnxRetain(pThis); + + /* + * Is the event already signalled or do we have to wait? + */ + fOrgStateAndGen = ASMAtomicUoReadU32(&pThis->fStateAndGen); + if (fOrgStateAndGen & RTSEMEVENTMULTILNX_STATE_MASK) + rc = VINF_SUCCESS; + else + { + /* + * We have to wait. + */ + RTR0SEMLNXWAIT Wait; + IPRT_LINUX_SAVE_EFL_AC(); + rc = rtR0SemLnxWaitInit(&Wait, fFlags, uTimeout, &pThis->Head); + if (RT_SUCCESS(rc)) + { + IPRT_DEBUG_SEMS_STATE(pThis, 'E'); + for (;;) + { + /* The destruction test. */ + if (RT_UNLIKELY(pThis->u32Magic != RTSEMEVENTMULTI_MAGIC)) + rc = VERR_SEM_DESTROYED; + else + { + rtR0SemLnxWaitPrepare(&Wait); + + /* Check the exit conditions. */ + if (RT_UNLIKELY(pThis->u32Magic != RTSEMEVENTMULTI_MAGIC)) + rc = VERR_SEM_DESTROYED; + else if (ASMAtomicUoReadU32(&pThis->fStateAndGen) != fOrgStateAndGen) + rc = VINF_SUCCESS; + else if (rtR0SemLnxWaitHasTimedOut(&Wait)) + rc = VERR_TIMEOUT; + else if (rtR0SemLnxWaitWasInterrupted(&Wait)) + rc = VERR_INTERRUPTED; + else + { + /* Do the wait and then recheck the conditions. */ + rtR0SemLnxWaitDoIt(&Wait); + continue; + } + } + break; + } + + rtR0SemLnxWaitDelete(&Wait); + IPRT_DEBUG_SEMS_STATE_RC(pThis, 'E', rc); + } + IPRT_LINUX_RESTORE_EFL_AC(); + } + + rtR0SemEventMultiLnxRelease(pThis); + return rc; +} + + +RTDECL(int) RTSemEventMultiWaitEx(RTSEMEVENTMULTI hEventMultiSem, uint32_t fFlags, uint64_t uTimeout) +{ +#ifndef RTSEMEVENT_STRICT + return rtR0SemEventMultiLnxWait(hEventMultiSem, fFlags, uTimeout, NULL); +#else + RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API(); + return rtR0SemEventMultiLnxWait(hEventMultiSem, fFlags, uTimeout, &SrcPos); +#endif +} +RT_EXPORT_SYMBOL(RTSemEventMultiWaitEx); + + +RTDECL(int) RTSemEventMultiWaitExDebug(RTSEMEVENTMULTI hEventMultiSem, uint32_t fFlags, uint64_t uTimeout, + RTHCUINTPTR uId, RT_SRC_POS_DECL) +{ + RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_DEBUG_API(); + return rtR0SemEventMultiLnxWait(hEventMultiSem, fFlags, uTimeout, &SrcPos); +} +RT_EXPORT_SYMBOL(RTSemEventMultiWaitExDebug); + + +RTDECL(uint32_t) RTSemEventMultiGetResolution(void) +{ + return rtR0SemLnxWaitGetResolution(); +} +RT_EXPORT_SYMBOL(RTSemEventMultiGetResolution); + diff --git a/src/VBox/Runtime/r0drv/linux/semfastmutex-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/semfastmutex-r0drv-linux.c new file mode 100644 index 00000000..affcbe15 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/semfastmutex-r0drv-linux.c @@ -0,0 +1,157 @@ +/* $Id: semfastmutex-r0drv-linux.c $ */ +/** @file + * IPRT - Fast Mutex Semaphores, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" +#include <iprt/semaphore.h> +#include <iprt/alloc.h> +#include <iprt/assert.h> +#include <iprt/asm.h> +#include <iprt/errcore.h> +#if defined(RT_STRICT) || defined(IPRT_DEBUG_SEMS) +# include <iprt/thread.h> +#endif + +#include "internal/magics.h" + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * Wrapper for the linux semaphore structure. + */ +typedef struct RTSEMFASTMUTEXINTERNAL +{ + /** Magic value (RTSEMFASTMUTEX_MAGIC). */ + uint32_t u32Magic; + /** the linux semaphore. */ + struct semaphore Semaphore; +#if defined(RT_STRICT) || defined(IPRT_DEBUG_SEMS) + /** For check. */ + RTNATIVETHREAD volatile Owner; +#endif +} RTSEMFASTMUTEXINTERNAL, *PRTSEMFASTMUTEXINTERNAL; + + +RTDECL(int) RTSemFastMutexCreate(PRTSEMFASTMUTEX phFastMtx) +{ + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Allocate. + */ + PRTSEMFASTMUTEXINTERNAL pThis; + pThis = (PRTSEMFASTMUTEXINTERNAL)RTMemAlloc(sizeof(*pThis)); + if (!pThis) + return VERR_NO_MEMORY; + + /* + * Initialize. + */ + pThis->u32Magic = RTSEMFASTMUTEX_MAGIC; + sema_init(&pThis->Semaphore, 1); +#if defined(RT_STRICT) || defined(IPRT_DEBUG_SEMS) + pThis->Owner = NIL_RTNATIVETHREAD; +#endif + + *phFastMtx = pThis; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSemFastMutexCreate); + + +RTDECL(int) RTSemFastMutexDestroy(RTSEMFASTMUTEX hFastMtx) +{ + /* + * Validate. + */ + PRTSEMFASTMUTEXINTERNAL pThis = hFastMtx; + if (pThis == NIL_RTSEMFASTMUTEX) + return VINF_SUCCESS; + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertMsgReturn(pThis->u32Magic == RTSEMFASTMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE); + + ASMAtomicWriteU32(&pThis->u32Magic, RTSEMFASTMUTEX_MAGIC_DEAD); + RTMemFree(pThis); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSemFastMutexDestroy); + + +RTDECL(int) RTSemFastMutexRequest(RTSEMFASTMUTEX hFastMtx) +{ + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate. + */ + PRTSEMFASTMUTEXINTERNAL pThis = hFastMtx; + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertMsgReturn(pThis->u32Magic == RTSEMFASTMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE); + + IPRT_DEBUG_SEMS_STATE(pThis, 'd'); + down(&pThis->Semaphore); +#if defined(RT_STRICT) || defined(IPRT_DEBUG_SEMS) + IPRT_DEBUG_SEMS_STATE(pThis, 'o'); + AssertRelease(pThis->Owner == NIL_RTNATIVETHREAD); + ASMAtomicUoWriteSize(&pThis->Owner, RTThreadNativeSelf()); +#endif + + IPRT_LINUX_RESTORE_EFL_ONLY_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSemFastMutexRequest); + + +RTDECL(int) RTSemFastMutexRelease(RTSEMFASTMUTEX hFastMtx) +{ + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate. + */ + PRTSEMFASTMUTEXINTERNAL pThis = hFastMtx; + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertMsgReturn(pThis->u32Magic == RTSEMFASTMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE); + +#if defined(RT_STRICT) || defined(IPRT_DEBUG_SEMS) + AssertRelease(pThis->Owner == RTThreadNativeSelf()); + ASMAtomicUoWriteSize(&pThis->Owner, NIL_RTNATIVETHREAD); +#endif + up(&pThis->Semaphore); + IPRT_DEBUG_SEMS_STATE(pThis, 'u'); + + IPRT_LINUX_RESTORE_EFL_ONLY_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSemFastMutexRelease); + diff --git a/src/VBox/Runtime/r0drv/linux/semmutex-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/semmutex-r0drv-linux.c new file mode 100644 index 00000000..8f14778a --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/semmutex-r0drv-linux.c @@ -0,0 +1,421 @@ +/* $Id: semmutex-r0drv-linux.c $ */ +/** @file + * IPRT - Mutex Semaphores, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define RTSEMMUTEX_WITHOUT_REMAPPING +#include "the-linux-kernel.h" +#include "internal/iprt.h" +#include <iprt/semaphore.h> + +#include <iprt/assert.h> +#include <iprt/asm.h> +#include <iprt/mem.h> +#include <iprt/err.h> +#include <iprt/list.h> + +#include "internal/magics.h" + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +typedef struct RTSEMMUTEXLNXWAITER +{ + /** The list entry. */ + RTLISTNODE ListEntry; + /** The waiting task. */ + struct task_struct *pTask; + /** Why did we wake up? */ + enum + { + /** Wakeup to take the semaphore. */ + RTSEMMUTEXLNXWAITER_WAKEUP, + /** Mutex is being destroyed. */ + RTSEMMUTEXLNXWAITER_DESTROYED, + /** Some other reason. */ + RTSEMMUTEXLNXWAITER_OTHER + } volatile enmReason; +} RTSEMMUTEXLNXWAITER, *PRTSEMMUTEXLNXWAITER; + +/** + * Wrapper for the linux semaphore structure. + */ +typedef struct RTSEMMUTEXINTERNAL +{ + /** Magic value (RTSEMMUTEX_MAGIC). */ + uint32_t u32Magic; + /** The number of recursions. */ + uint32_t cRecursions; + /** The list of waiting threads. */ + RTLISTANCHOR WaiterList; + /** The current owner, NULL if none. */ + struct task_struct *pOwnerTask; + /** The number of references to this piece of memory. This is used to + * prevent it from being kicked from underneath us while waiting. */ + uint32_t volatile cRefs; + /** The spinlock protecting the members and falling asleep. */ + spinlock_t Spinlock; +} RTSEMMUTEXINTERNAL, *PRTSEMMUTEXINTERNAL; + + +RTDECL(int) RTSemMutexCreate(PRTSEMMUTEX phMtx) +{ + int rc = VINF_SUCCESS; + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Allocate. + */ + PRTSEMMUTEXINTERNAL pThis; + pThis = (PRTSEMMUTEXINTERNAL)RTMemAlloc(sizeof(*pThis)); + if (pThis) + { + /* + * Initialize. + */ + pThis->u32Magic = RTSEMMUTEX_MAGIC; + pThis->cRecursions = 0; + pThis->pOwnerTask = NULL; + pThis->cRefs = 1; + RTListInit(&pThis->WaiterList); + spin_lock_init(&pThis->Spinlock); + + *phMtx = pThis; + } + else + rc = VERR_NO_MEMORY; + + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +} +RT_EXPORT_SYMBOL(RTSemMutexCreate); + + +RTDECL(int) RTSemMutexDestroy(RTSEMMUTEX hMtx) +{ + PRTSEMMUTEXINTERNAL pThis = hMtx; + PRTSEMMUTEXLNXWAITER pCur; + unsigned long fSavedIrq; + + /* + * Validate. + */ + if (pThis == NIL_RTSEMMUTEX) + return VINF_SUCCESS; + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertMsgReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE); + + /* + * Kill it, kick waiters and release it. + */ + AssertReturn(ASMAtomicCmpXchgU32(&pThis->u32Magic, RTSEMMUTEX_MAGIC_DEAD, RTSEMMUTEX_MAGIC), VERR_INVALID_HANDLE); + + IPRT_LINUX_SAVE_EFL_AC(); + + spin_lock_irqsave(&pThis->Spinlock, fSavedIrq); + RTListForEach(&pThis->WaiterList, pCur, RTSEMMUTEXLNXWAITER, ListEntry) + { + pCur->enmReason = RTSEMMUTEXLNXWAITER_DESTROYED; + wake_up_process(pCur->pTask); + } + + if (ASMAtomicDecU32(&pThis->cRefs) != 0) + spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq); + else + { + spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq); + RTMemFree(pThis); + } + + IPRT_LINUX_RESTORE_EFL_AC(); + + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSemMutexDestroy); + + +/** + * Worker for rtSemMutexLinuxRequest that handles the case where we go to sleep. + * + * @returns VINF_SUCCESS, VERR_INTERRUPTED, VERR_TIMEOUT or VERR_SEM_DESTROYED. + * Returns without owning the spinlock. + * @param pThis The mutex instance. + * @param cMillies The timeout. + * @param fInterruptible The wait type. + * @param fSavedIrq The saved IRQ flags. + */ +static int rtSemMutexLinuxRequestSleep(PRTSEMMUTEXINTERNAL pThis, RTMSINTERVAL cMillies, + bool fInterruptible, unsigned long fSavedIrq) +{ + struct task_struct *pSelf = current; + int rc = VERR_TIMEOUT; + long lTimeout = cMillies == RT_INDEFINITE_WAIT ? MAX_SCHEDULE_TIMEOUT : msecs_to_jiffies(cMillies); + RTSEMMUTEXLNXWAITER Waiter; + + IPRT_DEBUG_SEMS_STATE(pThis, 'm'); + + /* + * Grab a reference to the mutex and add ourselves to the waiter list. + */ + ASMAtomicIncU32(&pThis->cRefs); + + Waiter.pTask = pSelf; + Waiter.enmReason = RTSEMMUTEXLNXWAITER_OTHER; + RTListAppend(&pThis->WaiterList, &Waiter.ListEntry); + + /* + * Do the waiting. + */ + for (;;) + { + /* Check signal and timeout conditions. */ + if ( fInterruptible + && signal_pending(pSelf)) + { + rc = VERR_INTERRUPTED; + break; + } + + if (!lTimeout) + break; + + /* Go to sleep. */ + set_current_state(fInterruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&pThis->Spinlock); + + lTimeout = schedule_timeout(lTimeout); + + spin_lock_irq(&pThis->Spinlock); + set_current_state(TASK_RUNNING); + + /* Did someone wake us up? */ + if (Waiter.enmReason == RTSEMMUTEXLNXWAITER_WAKEUP) + { + Assert(pThis->cRecursions == 0); + pThis->cRecursions = 1; + pThis->pOwnerTask = pSelf; + rc = VINF_SUCCESS; + break; + } + + /* Is the mutex being destroyed? */ + if (RT_UNLIKELY( Waiter.enmReason == RTSEMMUTEXLNXWAITER_DESTROYED + || pThis->u32Magic != RTSEMMUTEX_MAGIC)) + { + rc = VERR_SEM_DESTROYED; + break; + } + } + + /* + * Unlink ourself from the waiter list, dereference the mutex and exit the + * lock. We might have to free the mutex if it was the destroyed. + */ + RTListNodeRemove(&Waiter.ListEntry); + IPRT_DEBUG_SEMS_STATE_RC(pThis, 'M', rc); + + if (RT_LIKELY(ASMAtomicDecU32(&pThis->cRefs) != 0)) + spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq); + else + { + Assert(RT_FAILURE_NP(rc)); + spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq); + RTMemFree(pThis); + } + return rc; +} + + +/** + * Internal worker. + */ +DECLINLINE(int) rtSemMutexLinuxRequest(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies, bool fInterruptible) +{ + PRTSEMMUTEXINTERNAL pThis = hMutexSem; + struct task_struct *pSelf = current; + unsigned long fSavedIrq; + int rc; + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate. + */ + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertMsgReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE); + Assert(pThis->cRefs >= 1); + + /* + * Lock it and check if it's a recursion. + */ + spin_lock_irqsave(&pThis->Spinlock, fSavedIrq); + if (pThis->pOwnerTask == pSelf) + { + pThis->cRecursions++; + Assert(pThis->cRecursions > 1); + Assert(pThis->cRecursions < 256); + rc = VINF_SUCCESS; + } + /* + * Not a recursion, maybe it's not owned by anyone then? + */ + else if ( pThis->pOwnerTask == NULL + && RTListIsEmpty(&pThis->WaiterList)) + { + Assert(pThis->cRecursions == 0); + pThis->cRecursions = 1; + pThis->pOwnerTask = pSelf; + rc = VINF_SUCCESS; + } + /* + * Was it a polling call? + */ + else if (cMillies == 0) + rc = VERR_TIMEOUT; + /* + * No, so go to sleep. + */ + else + { + rc = rtSemMutexLinuxRequestSleep(pThis, cMillies, fInterruptible, fSavedIrq); + IPRT_LINUX_RESTORE_EFL_ONLY_AC(); + return rc; + } + + IPRT_DEBUG_SEMS_STATE_RC(pThis, 'M', rc); + spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq); + IPRT_LINUX_RESTORE_EFL_ONLY_AC(); + return rc; +} + + +RTDECL(int) RTSemMutexRequest(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies) +{ + return rtSemMutexLinuxRequest(hMutexSem, cMillies, false /*fInterruptible*/); +} +RT_EXPORT_SYMBOL(RTSemMutexRequest); + + +RTDECL(int) RTSemMutexRequestDebug(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies, RTHCUINTPTR uId, RT_SRC_POS_DECL) +{ + RT_NOREF_PV(uId); RT_SRC_POS_NOREF(); + return RTSemMutexRequest(hMutexSem, cMillies); +} +RT_EXPORT_SYMBOL(RTSemMutexRequestDebug); + + +RTDECL(int) RTSemMutexRequestNoResume(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies) +{ + return rtSemMutexLinuxRequest(hMutexSem, cMillies, true /*fInterruptible*/); +} +RT_EXPORT_SYMBOL(RTSemMutexRequestNoResume); + + +RTDECL(int) RTSemMutexRequestNoResumeDebug(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies, RTHCUINTPTR uId, RT_SRC_POS_DECL) +{ + RT_NOREF_PV(uId); RT_SRC_POS_NOREF(); + return RTSemMutexRequestNoResume(hMutexSem, cMillies); +} +RT_EXPORT_SYMBOL(RTSemMutexRequestNoResumeDebug); + + +RTDECL(int) RTSemMutexRelease(RTSEMMUTEX hMtx) +{ + PRTSEMMUTEXINTERNAL pThis = hMtx; + struct task_struct *pSelf = current; + unsigned long fSavedIrq; + int rc; + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate. + */ + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertMsgReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), VERR_INVALID_HANDLE); + Assert(pThis->cRefs >= 1); + + /* + * Take the lock and release one recursion. + */ + spin_lock_irqsave(&pThis->Spinlock, fSavedIrq); + if (pThis->pOwnerTask == pSelf) + { + Assert(pThis->cRecursions > 0); + if (--pThis->cRecursions == 0) + { + pThis->pOwnerTask = NULL; + + /* anyone to wake up? */ + if (!RTListIsEmpty(&pThis->WaiterList)) + { + PRTSEMMUTEXLNXWAITER pWaiter = RTListGetFirst(&pThis->WaiterList, RTSEMMUTEXLNXWAITER, ListEntry); + pWaiter->enmReason = RTSEMMUTEXLNXWAITER_WAKEUP; + wake_up_process(pWaiter->pTask); + } + IPRT_DEBUG_SEMS_STATE(pThis, 'u'); + } + rc = VINF_SUCCESS; + } + else + rc = VERR_NOT_OWNER; + spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq); + + AssertRC(rc); + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; +} +RT_EXPORT_SYMBOL(RTSemMutexRelease); + + +RTDECL(bool) RTSemMutexIsOwned(RTSEMMUTEX hMutexSem) +{ + PRTSEMMUTEXINTERNAL pThis = hMutexSem; + unsigned long fSavedIrq; + bool fOwned; + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate. + */ + AssertPtrReturn(pThis, false); + AssertMsgReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, ("u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), false); + Assert(pThis->cRefs >= 1); + + /* + * Take the lock and release one recursion. + */ + spin_lock_irqsave(&pThis->Spinlock, fSavedIrq); + fOwned = pThis->pOwnerTask != NULL; + spin_unlock_irqrestore(&pThis->Spinlock, fSavedIrq); + + IPRT_LINUX_RESTORE_EFL_AC(); + return fOwned; + +} +RT_EXPORT_SYMBOL(RTSemMutexIsOwned); + diff --git a/src/VBox/Runtime/r0drv/linux/spinlock-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/spinlock-r0drv-linux.c new file mode 100644 index 00000000..c6ba0ed7 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/spinlock-r0drv-linux.c @@ -0,0 +1,186 @@ +/* $Id: spinlock-r0drv-linux.c $ */ +/** @file + * IPRT - Spinlocks, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" +#include <iprt/spinlock.h> + +#include <iprt/asm.h> +#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86) +# include <iprt/asm-amd64-x86.h> +#endif +#include <iprt/assert.h> +#include <iprt/errcore.h> +#include <iprt/mem.h> +#include <iprt/mp.h> +#include <iprt/thread.h> +#include "internal/magics.h" + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * Wrapper for the spinlock_t structure. + */ +typedef struct RTSPINLOCKINTERNAL +{ + /** Spinlock magic value (RTSPINLOCK_MAGIC). */ + uint32_t volatile u32Magic; + /** The spinlock creation flags. */ + uint32_t fFlags; + /** The saved interrupt flag. */ + unsigned long volatile fIntSaved; + /** The linux spinlock structure. */ + spinlock_t Spinlock; +#ifdef RT_MORE_STRICT + /** The idAssertCpu variable before acquring the lock for asserting after + * releasing the spinlock. */ + RTCPUID volatile idAssertCpu; + /** The CPU that owns the lock. */ + RTCPUID volatile idCpuOwner; +#endif +} RTSPINLOCKINTERNAL, *PRTSPINLOCKINTERNAL; + + + +RTDECL(int) RTSpinlockCreate(PRTSPINLOCK pSpinlock, uint32_t fFlags, const char *pszName) +{ + IPRT_LINUX_SAVE_EFL_AC(); + PRTSPINLOCKINTERNAL pThis; + AssertReturn(fFlags == RTSPINLOCK_FLAGS_INTERRUPT_SAFE || fFlags == RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, VERR_INVALID_PARAMETER); + RT_NOREF_PV(pszName); + + /* + * Allocate. + */ + Assert(sizeof(RTSPINLOCKINTERNAL) > sizeof(void *)); + pThis = (PRTSPINLOCKINTERNAL)RTMemAlloc(sizeof(*pThis)); + if (!pThis) + return VERR_NO_MEMORY; + /* + * Initialize and return. + */ + pThis->u32Magic = RTSPINLOCK_MAGIC; + pThis->fFlags = fFlags; + pThis->fIntSaved = 0; +#ifdef RT_MORE_STRICT + pThis->idCpuOwner = NIL_RTCPUID; + pThis->idAssertCpu = NIL_RTCPUID; +#endif + + spin_lock_init(&pThis->Spinlock); + + *pSpinlock = pThis; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSpinlockCreate); + + +RTDECL(int) RTSpinlockDestroy(RTSPINLOCK Spinlock) +{ + /* + * Validate input. + */ + PRTSPINLOCKINTERNAL pThis = (PRTSPINLOCKINTERNAL)Spinlock; + if (!pThis) + return VERR_INVALID_PARAMETER; + if (pThis->u32Magic != RTSPINLOCK_MAGIC) + { + AssertMsgFailed(("Invalid spinlock %p magic=%#x\n", pThis, pThis->u32Magic)); + return VERR_INVALID_PARAMETER; + } + + ASMAtomicIncU32(&pThis->u32Magic); + RTMemFree(pThis); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSpinlockDestroy); + + +RTDECL(void) RTSpinlockAcquire(RTSPINLOCK Spinlock) +{ + PRTSPINLOCKINTERNAL pThis = (PRTSPINLOCKINTERNAL)Spinlock; + IPRT_LINUX_SAVE_EFL_AC(); + RT_ASSERT_PREEMPT_CPUID_VAR(); + AssertMsg(pThis && pThis->u32Magic == RTSPINLOCK_MAGIC, + ("pThis=%p u32Magic=%08x\n", pThis, pThis ? (int)pThis->u32Magic : 0)); + +#ifdef CONFIG_PROVE_LOCKING + lockdep_off(); +#endif + if (pThis->fFlags & RTSPINLOCK_FLAGS_INTERRUPT_SAFE) + { + unsigned long fIntSaved; + spin_lock_irqsave(&pThis->Spinlock, fIntSaved); + pThis->fIntSaved = fIntSaved; + } + else + spin_lock(&pThis->Spinlock); +#ifdef CONFIG_PROVE_LOCKING + lockdep_on(); +#endif + + IPRT_LINUX_RESTORE_EFL_ONLY_AC(); + RT_ASSERT_PREEMPT_CPUID_SPIN_ACQUIRED(pThis); +} +RT_EXPORT_SYMBOL(RTSpinlockAcquire); + + +RTDECL(void) RTSpinlockRelease(RTSPINLOCK Spinlock) +{ + PRTSPINLOCKINTERNAL pThis = (PRTSPINLOCKINTERNAL)Spinlock; + IPRT_LINUX_SAVE_EFL_AC(); /* spin_unlock* may preempt and trash eflags.ac. */ + RT_ASSERT_PREEMPT_CPUID_SPIN_RELEASE_VARS(); + AssertMsg(pThis && pThis->u32Magic == RTSPINLOCK_MAGIC, + ("pThis=%p u32Magic=%08x\n", pThis, pThis ? (int)pThis->u32Magic : 0)); + RT_ASSERT_PREEMPT_CPUID_SPIN_RELEASE(pThis); + +#ifdef CONFIG_PROVE_LOCKING + lockdep_off(); +#endif + if (pThis->fFlags & RTSPINLOCK_FLAGS_INTERRUPT_SAFE) + { + unsigned long fIntSaved = pThis->fIntSaved; + pThis->fIntSaved = 0; + spin_unlock_irqrestore(&pThis->Spinlock, fIntSaved); + } + else + spin_unlock(&pThis->Spinlock); +#ifdef CONFIG_PROVE_LOCKING + lockdep_on(); +#endif + + IPRT_LINUX_RESTORE_EFL_ONLY_AC(); + RT_ASSERT_PREEMPT_CPUID(); +} +RT_EXPORT_SYMBOL(RTSpinlockRelease); + diff --git a/src/VBox/Runtime/r0drv/linux/string.h b/src/VBox/Runtime/r0drv/linux/string.h new file mode 100644 index 00000000..0560fa1f --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/string.h @@ -0,0 +1,60 @@ +/* $Id: string.h $ */ +/** @file + * IPRT - wrapper for the linux kernel asm/string.h. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + +#ifndef IPRT_INCLUDED_SRC_r0drv_linux_string_h +#define IPRT_INCLUDED_SRC_r0drv_linux_string_h +#ifndef RT_WITHOUT_PRAGMA_ONCE +# pragma once +#endif + +#include <iprt/cdefs.h> + +RT_C_DECLS_BEGIN +#ifndef bool /* Linux 2.6.19 C++ nightmare */ +#define bool bool_type +#define true true_type +#define false false_type +#define _Bool int +#define bool_type_r0drv_string_h__ +#endif +#include <linux/types.h> +#include <linux/string.h> +#ifdef bool_type_r0drv_string_h__ +#undef bool +#undef true +#undef false +#undef bool_type_r0drv_string_h__ +#endif +char *strpbrk(const char *pszStr, const char *pszChars) +#if defined(__THROW) + __THROW +#endif + ; + +RT_C_DECLS_END + +#endif /* !IPRT_INCLUDED_SRC_r0drv_linux_string_h */ + diff --git a/src/VBox/Runtime/r0drv/linux/the-linux-kernel.h b/src/VBox/Runtime/r0drv/linux/the-linux-kernel.h new file mode 100644 index 00000000..138072ad --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/the-linux-kernel.h @@ -0,0 +1,479 @@ +/* $Id: the-linux-kernel.h $ */ +/** @file + * IPRT - Include all necessary headers for the Linux kernel. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + +#ifndef IPRT_INCLUDED_SRC_r0drv_linux_the_linux_kernel_h +#define IPRT_INCLUDED_SRC_r0drv_linux_the_linux_kernel_h +#ifndef RT_WITHOUT_PRAGMA_ONCE +# pragma once +#endif + +/* + * Include iprt/types.h to install the bool wrappers. + * Then use the linux bool type for all the stuff include here. + */ +#include <iprt/types.h> +#define bool linux_bool + +#if RT_GNUC_PREREQ(4, 6) +# pragma GCC diagnostic push +#endif +#if RT_GNUC_PREREQ(4, 2) +# pragma GCC diagnostic ignored "-Wunused-parameter" +# if !defined(__cplusplus) && RT_GNUC_PREREQ(4, 3) +# pragma GCC diagnostic ignored "-Wold-style-declaration" /* 2.6.18-411.0.0.0.1.el5/build/include/asm/apic.h:110: warning: 'inline' is not at beginning of declaration [-Wold-style-declaration] */ +# endif +#endif + + +#include <iprt/linux/version.h> +#if RTLNX_VER_MIN(2,6,33) +# include <generated/autoconf.h> +#else +# ifndef AUTOCONF_INCLUDED +# include <linux/autoconf.h> +# endif +#endif + +/* We only support 2.4 and 2.6 series kernels */ +#if RTLNX_VER_MAX(2,4,0) +# error Sorry, we do not support 2.3 and earlier kernels. +#endif +#if RTLNX_VER_MIN(2,5,0) && RTLNX_VER_MAX(2,6,0) +# error Sorry, we do not support 2.5 series kernels (might work though). +#endif + +#if defined(CONFIG_MODVERSIONS) && !defined(MODVERSIONS) +# define MODVERSIONS +# if RTLNX_VER_MAX(2,5,71) +# include <linux/modversions.h> +# endif +#endif +#ifndef KBUILD_STR +# if RTLNX_VER_MAX(2,6,16) +# define KBUILD_STR(s) s +# else +# define KBUILD_STR(s) #s +# endif +#endif +# if RTLNX_VER_MIN(3,3,0) +# include <linux/kconfig.h> /* for macro IS_ENABLED */ +# endif +#include <linux/string.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#if RTLNX_VER_MIN(2,6,27) +# include <linux/semaphore.h> +#else /* older kernels */ +# include <asm/semaphore.h> +#endif /* older kernels */ +#include <linux/module.h> +#if RTLNX_VER_MIN(2,6,0) +# include <linux/moduleparam.h> +#endif +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/fs.h> +#if RTLNX_VER_MIN(2,6,0) +# include <linux/namei.h> +#endif +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/sched.h> + +#if RTLNX_VER_RANGE(3,9,23, 3,9,31) +# include <linux/splice.h> +#endif + +#if RTLNX_VER_MIN(3,9,0) +# include <linux/sched/rt.h> +#endif +#if RTLNX_VER_MIN(4,11,0) +# include <linux/sched/signal.h> +# include <linux/sched/types.h> +#endif +#if RTLNX_VER_MIN(2,6,7) +# include <linux/jiffies.h> +#endif +#if RTLNX_VER_MIN(2,6,16) +# include <linux/ktime.h> +# include <linux/hrtimer.h> +#endif +#include <linux/wait.h> +#if RTLNX_VER_MIN(2,5,71) +# include <linux/cpu.h> +# include <linux/notifier.h> +#endif +#if RTLNX_VER_MIN(5,1,0) +# include <uapi/linux/mman.h> +#endif +/* For the basic additions module */ +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/completion.h> +#include <linux/compiler.h> +#if RTLNX_VER_MIN(5,9,0) /* linux/fs.h defined HAVE_UNLOCKED_IOCTL from 2.6.11 up to 5.9, when it became an implicit assumption. */ +# define HAVE_UNLOCKED_IOCTL 1 /* We use this in a couple of places, so for now just define it for 5.9+ too. */ +#endif +#if !defined(HAVE_UNLOCKED_IOCTL) && RTLNX_VER_MAX(2,6,38) +# include <linux/smp_lock.h> +#endif +/* For the shared folders module */ +#include <linux/vmalloc.h> +#define wchar_t linux_wchar_t +#include <linux/nls.h> +#undef wchar_t +#include <asm/mman.h> +#include <asm/io.h> +#include <asm/uaccess.h> +#include <asm/div64.h> + +/* For thread-context hooks. */ +#if RTLNX_VER_MIN(2,6,18) && defined(CONFIG_PREEMPT_NOTIFIERS) +# include <linux/preempt.h> +#endif + +/* for workqueue / task queues. */ +#if RTLNX_VER_MIN(2,5,41) +# include <linux/workqueue.h> +#else +# include <linux/tqueue.h> +#endif + +#if RTLNX_VER_MIN(2,6,4) +# include <linux/kthread.h> +#endif + +/* for cr4_init_shadow() / cpu_tlbstate. */ +#if RTLNX_VER_MIN(3,20,0) +# include <asm/tlbflush.h> +#endif + +/* for set_pages_x() */ +#if RTLNX_VER_MIN(4,12,0) +# include <asm/set_memory.h> +#endif + +/* for __flush_tlb_all() */ +#if RTLNX_VER_MIN(2,6,28) && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)) +# include <asm/tlbflush.h> +#endif + +#if RTLNX_VER_MIN(3,7,0) +# include <asm/smap.h> +#else +static inline void clac(void) { } +static inline void stac(void) { } +#endif + +#if RTLNX_VER_MAX(2,6,0) +# ifndef page_to_pfn +# define page_to_pfn(page) ((page) - mem_map) +# endif +#endif + +#ifndef DEFINE_WAIT +# define DEFINE_WAIT(name) DECLARE_WAITQUEUE(name, current) +#endif + +#ifndef __GFP_NOWARN +# define __GFP_NOWARN 0 +#endif + +/* + * 2.4 / early 2.6 compatibility wrappers + */ +#if RTLNX_VER_MAX(2,6,7) + +# ifndef MAX_JIFFY_OFFSET +# define MAX_JIFFY_OFFSET ((~0UL >> 1)-1) +# endif + +# if RTLNX_VER_MAX(2,4,29) || RTLNX_VER_MIN(2,6,0) + +DECLINLINE(unsigned int) jiffies_to_msecs(unsigned long cJiffies) +{ +# if HZ <= 1000 && !(1000 % HZ) + return (1000 / HZ) * cJiffies; +# elif HZ > 1000 && !(HZ % 1000) + return (cJiffies + (HZ / 1000) - 1) / (HZ / 1000); +# else + return (cJiffies * 1000) / HZ; +# endif +} + +DECLINLINE(unsigned long) msecs_to_jiffies(unsigned int cMillies) +{ +# if HZ > 1000 + if (cMillies > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; +# endif +# if HZ <= 1000 && !(1000 % HZ) + return (cMillies + (1000 / HZ) - 1) / (1000 / HZ); +# elif HZ > 1000 && !(HZ % 1000) + return cMillies * (HZ / 1000); +# else + return (cMillies * HZ + 999) / 1000; +# endif +} + +# endif /* < 2.4.29 || >= 2.6.0 */ + +#endif /* < 2.6.7 */ + +/* + * 2.4 compatibility wrappers + */ +#if RTLNX_VER_MAX(2,6,0) + +# define prepare_to_wait(q, wait, state) \ + do { \ + add_wait_queue(q, wait); \ + set_current_state(state); \ + } while (0) + +# define after_wait(wait) \ + do { \ + list_del_init(&(wait)->task_list); \ + } while (0) + +# define finish_wait(q, wait) \ + do { \ + set_current_state(TASK_RUNNING); \ + remove_wait_queue(q, wait); \ + } while (0) + +#else /* >= 2.6.0 */ + +# define after_wait(wait) do {} while (0) + +#endif /* >= 2.6.0 */ + +/** @def TICK_NSEC + * The time between ticks in nsec */ +#ifndef TICK_NSEC +# define TICK_NSEC (1000000000UL / HZ) +#endif + +/* + * This sucks soooo badly on x86! Why don't they export __PAGE_KERNEL_EXEC so PAGE_KERNEL_EXEC would be usable? + */ +#if RTLNX_VER_MIN(2,6,8) && defined(RT_ARCH_AMD64) +# define MY_PAGE_KERNEL_EXEC PAGE_KERNEL_EXEC +#elif RTLNX_VER_MIN(2,6,8) && defined(PAGE_KERNEL_EXEC) && defined(CONFIG_X86_PAE) +# ifdef __PAGE_KERNEL_EXEC + /* >= 2.6.27 */ +# define MY_PAGE_KERNEL_EXEC __pgprot(boot_cpu_has(X86_FEATURE_PGE) ? __PAGE_KERNEL_EXEC | _PAGE_GLOBAL : __PAGE_KERNEL_EXEC) +# else +# define MY_PAGE_KERNEL_EXEC __pgprot(boot_cpu_has(X86_FEATURE_PGE) ? _PAGE_KERNEL_EXEC | _PAGE_GLOBAL : _PAGE_KERNEL_EXEC) +# endif +#else +# define MY_PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + + +/* + * The redhat hack section. + * - The current hacks are for 2.4.21-15.EL only. + */ +#ifndef NO_REDHAT_HACKS +/* accounting. */ +# if RTLNX_VER_MAX(2,6,0) +# ifdef VM_ACCOUNT +# define USE_RHEL4_MUNMAP +# endif +# endif + +/* backported remap_page_range. */ +# if RTLNX_VER_MAX(2,6,0) +# include <asm/tlb.h> +# ifdef tlb_vma /* probably not good enough... */ +# define HAVE_26_STYLE_REMAP_PAGE_RANGE 1 +# endif +# endif + +# ifndef RT_ARCH_AMD64 +/* In 2.6.9-22.ELsmp we have to call change_page_attr() twice when changing + * the page attributes from PAGE_KERNEL to something else, because there appears + * to be a bug in one of the many patches that redhat applied. + * It should be safe to do this on less buggy linux kernels too. ;-) + */ +# define MY_CHANGE_PAGE_ATTR(pPages, cPages, prot) \ + do { \ + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) \ + change_page_attr(pPages, cPages, prot); \ + change_page_attr(pPages, cPages, prot); \ + } while (0) +# endif /* !RT_ARCH_AMD64 */ +#endif /* !NO_REDHAT_HACKS */ + +#ifndef MY_CHANGE_PAGE_ATTR +# ifdef RT_ARCH_AMD64 /** @todo This is a cheap hack, but it'll get around that 'else BUG();' in __change_page_attr(). */ +# define MY_CHANGE_PAGE_ATTR(pPages, cPages, prot) \ + do { \ + change_page_attr(pPages, cPages, PAGE_KERNEL_NOCACHE); \ + change_page_attr(pPages, cPages, prot); \ + } while (0) +# else +# define MY_CHANGE_PAGE_ATTR(pPages, cPages, prot) change_page_attr(pPages, cPages, prot) +# endif +#endif + +#if RTLNX_VER_MIN(2,6,25) +# if RTLNX_VER_MAX(5,4,0) /* The interface was removed, but we only need it for < 2.4.22, so who cares. */ +# define MY_SET_PAGES_EXEC(pPages, cPages) set_pages_x(pPages, cPages) +# define MY_SET_PAGES_NOEXEC(pPages, cPages) set_pages_nx(pPages, cPages) +# endif +#else +# define MY_SET_PAGES_EXEC(pPages, cPages) \ + do { \ + if (pgprot_val(MY_PAGE_KERNEL_EXEC) != pgprot_val(PAGE_KERNEL)) \ + MY_CHANGE_PAGE_ATTR(pPages, cPages, MY_PAGE_KERNEL_EXEC); \ + } while (0) +# define MY_SET_PAGES_NOEXEC(pPages, cPages) \ + do { \ + if (pgprot_val(MY_PAGE_KERNEL_EXEC) != pgprot_val(PAGE_KERNEL)) \ + MY_CHANGE_PAGE_ATTR(pPages, cPages, PAGE_KERNEL); \ + } while (0) +#endif + +/** @def ONE_MSEC_IN_JIFFIES + * The number of jiffies that make up 1 millisecond. Must be at least 1! */ +#if HZ <= 1000 +# define ONE_MSEC_IN_JIFFIES 1 +#elif !(HZ % 1000) +# define ONE_MSEC_IN_JIFFIES (HZ / 1000) +#else +# define ONE_MSEC_IN_JIFFIES ((HZ + 999) / 1000) +# error "HZ is not a multiple of 1000, the GIP stuff won't work right!" +#endif + +/* + * Stop using the linux bool type. + */ +#undef bool + +#if RT_GNUC_PREREQ(4, 6) +# pragma GCC diagnostic pop +#endif + +/* + * There are post-2.6.24 kernels (confusingly with unchanged version number) + * which eliminate macros which were marked as deprecated. + */ +#ifndef __attribute_used__ +#define __attribute_used__ __used +#endif + +/** + * Hack for shortening pointers on linux so we can stuff more stuff into the + * task_struct::comm field. This is used by the semaphore code but put here + * because we don't have any better place atm. Don't use outside IPRT, please. + */ +#ifdef RT_ARCH_AMD64 +# define IPRT_DEBUG_SEMS_ADDRESS(addr) ( ((long)(addr) & (long)~UINT64_C(0xfffffff000000000)) ) +#else +# define IPRT_DEBUG_SEMS_ADDRESS(addr) ( (long)(addr) ) +#endif + +/** + * Puts semaphore info into the task_struct::comm field if IPRT_DEBUG_SEMS is + * defined. + */ +#ifdef IPRT_DEBUG_SEMS +# define IPRT_DEBUG_SEMS_STATE(pThis, chState) \ + snprintf(current->comm, sizeof(current->comm), "%c%lx", (chState), IPRT_DEBUG_SEMS_ADDRESS(pThis)); +#else +# define IPRT_DEBUG_SEMS_STATE(pThis, chState) do { } while (0) +#endif + +/** + * Puts semaphore info into the task_struct::comm field if IPRT_DEBUG_SEMS is + * defined. + */ +#ifdef IPRT_DEBUG_SEMS +# define IPRT_DEBUG_SEMS_STATE_RC(pThis, chState, rc) \ + snprintf(current->comm, sizeof(current->comm), "%c%lx:%d", (chState), IPRT_DEBUG_SEMS_ADDRESS(pThis), rc); +#else +# define IPRT_DEBUG_SEMS_STATE_RC(pThis, chState, rc) do { } while (0) +#endif + +/** @name Macros for preserving EFLAGS.AC on 3.19+/amd64 paranoid. + * The AMD 64 switch_to in macro in arch/x86/include/asm/switch_to.h stopped + * restoring flags. + * @{ */ +#if (defined(CONFIG_X86_SMAP) || defined(RT_STRICT) || defined(IPRT_WITH_EFLAGS_AC_PRESERVING)) \ + && !defined(IPRT_WITHOUT_EFLAGS_AC_PRESERVING) +# include <iprt/asm-amd64-x86.h> +# define IPRT_X86_EFL_AC RT_BIT(18) +# define IPRT_LINUX_SAVE_EFL_AC() RTCCUINTREG fSavedEfl = ASMGetFlags() +# define IPRT_LINUX_RESTORE_EFL_AC() ASMSetFlags(fSavedEfl) +# define IPRT_LINUX_RESTORE_EFL_ONLY_AC() ASMChangeFlags(~IPRT_X86_EFL_AC, fSavedEfl & IPRT_X86_EFL_AC) +#else +# define IPRT_LINUX_SAVE_EFL_AC() do { } while (0) +# define IPRT_LINUX_RESTORE_EFL_AC() do { } while (0) +# define IPRT_LINUX_RESTORE_EFL_ONLY_AC() do { } while (0) +#endif +/** @} */ + +/* + * There are some conflicting defines in iprt/param.h, sort them out here. + */ +#ifndef IPRT_INCLUDED_param_h +# undef PAGE_SIZE +# undef PAGE_OFFSET_MASK +# include <iprt/param.h> +#endif + +/* + * Some global indicator macros. + */ +/** @def IPRT_LINUX_HAS_HRTIMER + * Whether the kernel support high resolution timers (Linux kernel versions + * 2.6.28 and later (hrtimer_add_expires_ns() & schedule_hrtimeout). */ +#if RTLNX_VER_MIN(2,6,28) +# define IPRT_LINUX_HAS_HRTIMER +#endif + +/* + * Workqueue stuff, see initterm-r0drv-linux.c. + */ +#if RTLNX_VER_MIN(2,5,41) +typedef struct work_struct RTR0LNXWORKQUEUEITEM; +#else +typedef struct tq_struct RTR0LNXWORKQUEUEITEM; +#endif +DECLHIDDEN(void) rtR0LnxWorkqueuePush(RTR0LNXWORKQUEUEITEM *pWork, void (*pfnWorker)(RTR0LNXWORKQUEUEITEM *)); +DECLHIDDEN(void) rtR0LnxWorkqueueFlush(void); + +/* + * Memory hacks from memobj-r0drv-linux.c that shared folders need. + */ +RTDECL(struct page *) rtR0MemObjLinuxVirtToPage(void *pv); + +#endif /* !IPRT_INCLUDED_SRC_r0drv_linux_the_linux_kernel_h */ diff --git a/src/VBox/Runtime/r0drv/linux/thread-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/thread-r0drv-linux.c new file mode 100644 index 00000000..bda9c789 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/thread-r0drv-linux.c @@ -0,0 +1,234 @@ +/* $Id: thread-r0drv-linux.c $ */ +/** @file + * IPRT - Threads, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" +#include <iprt/thread.h> + +#include <iprt/asm.h> +#if RTLNX_VER_MAX(2,5,28) || defined(CONFIG_X86_SMAP) +# include <iprt/asm-amd64-x86.h> +#endif +#include <iprt/assert.h> +#include <iprt/errcore.h> +#include <iprt/mp.h> + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +#ifndef CONFIG_PREEMPT +/** Per-cpu preemption counters. */ +static int32_t volatile g_acPreemptDisabled[NR_CPUS]; +#endif + + +RTDECL(RTNATIVETHREAD) RTThreadNativeSelf(void) +{ + return (RTNATIVETHREAD)current; +} +RT_EXPORT_SYMBOL(RTThreadNativeSelf); + + +static int rtR0ThreadLnxSleepCommon(RTMSINTERVAL cMillies) +{ + IPRT_LINUX_SAVE_EFL_AC(); + long cJiffies = msecs_to_jiffies(cMillies); + set_current_state(TASK_INTERRUPTIBLE); + cJiffies = schedule_timeout(cJiffies); + IPRT_LINUX_RESTORE_EFL_AC(); + if (!cJiffies) + return VINF_SUCCESS; + return VERR_INTERRUPTED; +} + + +RTDECL(int) RTThreadSleep(RTMSINTERVAL cMillies) +{ + return rtR0ThreadLnxSleepCommon(cMillies); +} +RT_EXPORT_SYMBOL(RTThreadSleep); + + +RTDECL(int) RTThreadSleepNoLog(RTMSINTERVAL cMillies) +{ + return rtR0ThreadLnxSleepCommon(cMillies); +} +RT_EXPORT_SYMBOL(RTThreadSleepNoLog); + + +RTDECL(bool) RTThreadYield(void) +{ + IPRT_LINUX_SAVE_EFL_AC(); +#if RTLNX_VER_MIN(2,4,20) + yield(); +#else + /** @todo r=ramshankar: Can we use cond_resched() instead? */ + set_current_state(TASK_RUNNING); + sys_sched_yield(); + schedule(); +#endif + IPRT_LINUX_RESTORE_EFL_AC(); + return true; +} +RT_EXPORT_SYMBOL(RTThreadYield); + + +RTDECL(bool) RTThreadPreemptIsEnabled(RTTHREAD hThread) +{ +#ifdef CONFIG_PREEMPT + Assert(hThread == NIL_RTTHREAD); RT_NOREF_PV(hThread); +# ifdef preemptible + return preemptible(); +# else + return preempt_count() == 0 && !in_atomic() && !irqs_disabled(); +# endif +#else + int32_t c; + + Assert(hThread == NIL_RTTHREAD); + c = g_acPreemptDisabled[smp_processor_id()]; + AssertMsg(c >= 0 && c < 32, ("%d\n", c)); + if (c != 0) + return false; +# if RTLNX_VER_MIN(2,5,32) + if (in_atomic()) + return false; +# endif +# if RTLNX_VER_MIN(2,5,28) + if (irqs_disabled()) + return false; +# else + if (!ASMIntAreEnabled()) + return false; +# endif + return true; +#endif +} +RT_EXPORT_SYMBOL(RTThreadPreemptIsEnabled); + + +RTDECL(bool) RTThreadPreemptIsPending(RTTHREAD hThread) +{ + Assert(hThread == NIL_RTTHREAD); RT_NOREF_PV(hThread); +#if RTLNX_VER_MIN(2,5,4) + return !!test_tsk_thread_flag(current, TIF_NEED_RESCHED); + +#elif RTLNX_VER_MIN(2,4,20) + return !!need_resched(); + +#elif RTLNX_VER_MIN(2,1,110) + return current->need_resched != 0; + +#else + return need_resched != 0; +#endif +} +RT_EXPORT_SYMBOL(RTThreadPreemptIsPending); + + +RTDECL(bool) RTThreadPreemptIsPendingTrusty(void) +{ + /* yes, RTThreadPreemptIsPending is reliable. */ + return true; +} +RT_EXPORT_SYMBOL(RTThreadPreemptIsPendingTrusty); + + +RTDECL(bool) RTThreadPreemptIsPossible(void) +{ +#ifdef CONFIG_PREEMPT + return true; /* Yes, kernel preemption is possible. */ +#else + return false; /* No kernel preemption (or CONFIG_PREEMPT_VOLUNTARY). */ +#endif +} +RT_EXPORT_SYMBOL(RTThreadPreemptIsPossible); + + +RTDECL(void) RTThreadPreemptDisable(PRTTHREADPREEMPTSTATE pState) +{ +#ifdef CONFIG_PREEMPT + AssertPtr(pState); + Assert(pState->u32Reserved == 0); + pState->u32Reserved = 42; + /* This ASSUMES that CONFIG_PREEMPT_COUNT is always defined with CONFIG_PREEMPT. */ + preempt_disable(); + RT_ASSERT_PREEMPT_CPUID_DISABLE(pState); + +#else /* !CONFIG_PREEMPT */ + int32_t c; + AssertPtr(pState); + Assert(pState->u32Reserved == 0); + + /* Do our own accounting. */ + c = ASMAtomicIncS32(&g_acPreemptDisabled[smp_processor_id()]); + AssertMsg(c > 0 && c < 32, ("%d\n", c)); + pState->u32Reserved = c; + RT_ASSERT_PREEMPT_CPUID_DISABLE(pState); +#endif +} +RT_EXPORT_SYMBOL(RTThreadPreemptDisable); + + +RTDECL(void) RTThreadPreemptRestore(PRTTHREADPREEMPTSTATE pState) +{ +#ifdef CONFIG_PREEMPT + IPRT_LINUX_SAVE_EFL_AC(); /* paranoia */ + AssertPtr(pState); + Assert(pState->u32Reserved == 42); + RT_ASSERT_PREEMPT_CPUID_RESTORE(pState); + preempt_enable(); + IPRT_LINUX_RESTORE_EFL_ONLY_AC(); /* paranoia */ + +#else + int32_t volatile *pc; + AssertPtr(pState); + AssertMsg(pState->u32Reserved > 0 && pState->u32Reserved < 32, ("%d\n", pState->u32Reserved)); + RT_ASSERT_PREEMPT_CPUID_RESTORE(pState); + + /* Do our own accounting. */ + pc = &g_acPreemptDisabled[smp_processor_id()]; + AssertMsg(pState->u32Reserved == (uint32_t)*pc, ("u32Reserved=%d *pc=%d \n", pState->u32Reserved, *pc)); + ASMAtomicUoWriteS32(pc, pState->u32Reserved - 1); +#endif + pState->u32Reserved = 0; +} +RT_EXPORT_SYMBOL(RTThreadPreemptRestore); + + +RTDECL(bool) RTThreadIsInInterrupt(RTTHREAD hThread) +{ + Assert(hThread == NIL_RTTHREAD); NOREF(hThread); + + return in_interrupt() != 0; +} +RT_EXPORT_SYMBOL(RTThreadIsInInterrupt); + diff --git a/src/VBox/Runtime/r0drv/linux/thread2-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/thread2-r0drv-linux.c new file mode 100644 index 00000000..cd01f32a --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/thread2-r0drv-linux.c @@ -0,0 +1,233 @@ +/* $Id: thread2-r0drv-linux.c $ */ +/** @file + * IPRT - Threads (Part 2), Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" + +#include <iprt/assert.h> +#include <iprt/thread.h> +#include <iprt/errcore.h> +#include "internal/thread.h" + +#if RTLNX_VER_MIN(4,11,0) + #include <uapi/linux/sched/types.h> +#endif /* >= KERNEL_VERSION(4, 11, 0) */ + +RTDECL(RTTHREAD) RTThreadSelf(void) +{ + return rtThreadGetByNative((RTNATIVETHREAD)current); +} + + +DECLHIDDEN(int) rtThreadNativeInit(void) +{ + return VINF_SUCCESS; +} + + +DECLHIDDEN(int) rtThreadNativeSetPriority(PRTTHREADINT pThread, RTTHREADTYPE enmType) +{ +#if RTLNX_VER_MIN(2,5,2) + /* + * Assignments are partially based on g_aTypesLinuxFree but + * scaled up in the high priority end. + * + * 5.9.0 - : + * The sched_set_normal interfaces does not really check the input, + * whereas sched_set_fifo & sched_set_fifo_low have fixed assignments. + * 2.6.11 - 5.9.0: + * Use sched_setscheduler to try effect FIFO scheduling + * for IO and TIMER threads, otherwise use set_user_nice. + * 2.5.2 - 5.9.0: + * Use set_user_nice to renice the thread. + */ + int iNice = 0; +# if RTLNX_VER_MAX(5,9,0) + int rc; +# if RTLNX_VER_MIN(2,6,11) + int iSchedClass = SCHED_NORMAL; + struct sched_param Param = { .sched_priority = 0 }; +# endif +# endif + switch (enmType) + { + case RTTHREADTYPE_INFREQUENT_POLLER: + iNice = +3; + break; + + case RTTHREADTYPE_MAIN_HEAVY_WORKER: + iNice = +2; + break; + + case RTTHREADTYPE_EMULATION: + iNice = +1; + break; + + case RTTHREADTYPE_DEFAULT: + case RTTHREADTYPE_GUI: + case RTTHREADTYPE_MAIN_WORKER: + iNice = 0; + break; + + case RTTHREADTYPE_VRDP_IO: + case RTTHREADTYPE_DEBUGGER: + iNice = -1; + break; + + case RTTHREADTYPE_MSG_PUMP: + iNice = -2; + break; + + case RTTHREADTYPE_IO: +# if RTLNX_VER_MIN(5,9,0) + sched_set_fifo_low(current); + return VINF_SUCCESS; +# else + iNice = -12; +# if RTLNX_VER_MIN(2,6,11) + iSchedClass = SCHED_FIFO; + Param.sched_priority = 1; /* => prio=98; */ +# endif + break; +# endif + + case RTTHREADTYPE_TIMER: +# if RTLNX_VER_MIN(5,9,0) + sched_set_fifo(current); + return VINF_SUCCESS; +# else + iNice = -19; +# if RTLNX_VER_MIN(2,6,11) + iSchedClass = SCHED_FIFO; + Param.sched_priority = MAX_RT_PRIO / 2; /* => prio=49 */ +# endif + break; +# endif + default: + AssertMsgFailedReturn(("enmType=%d\n", enmType), VERR_INVALID_PARAMETER); + } + +# if RTLNX_VER_MIN(5,9,0) + /* + * We only get here for renice work. + */ + sched_set_normal(current, iNice); + +# else /* < 5.9.0 */ +# if RTLNX_VER_MIN(2,6,11) + /* + * Try set scheduler parameters. + * Fall back on normal + nice if this fails for FIFO policy.* + */ + rc = sched_setscheduler(current, iSchedClass, &Param); + if (rc) + { + Param.sched_priority = 0; + iSchedClass = SCHED_NORMAL; + rc = sched_setscheduler(current, iSchedClass, &Param); + } + + /* + * Renice if using normal scheduling class. + */ + if (iSchedClass == SCHED_NORMAL) +# endif /* >= 2.6.11 */ + set_user_nice(current, iNice); + +# endif /* < 5.9.0 */ +#else /* < 2.5.2 */ + RT_NOREF_PV(enmType); +#endif /* < 2.5.2 */ + RT_NOREF_PV(pThread); + return VINF_SUCCESS; +} + + +DECLHIDDEN(int) rtThreadNativeAdopt(PRTTHREADINT pThread) +{ + RT_NOREF_PV(pThread); + return VERR_NOT_IMPLEMENTED; +} + + +DECLHIDDEN(void) rtThreadNativeWaitKludge(PRTTHREADINT pThread) +{ + /** @todo fix RTThreadWait/RTR0Term race on linux. */ + RTThreadSleep(1); NOREF(pThread); +} + + +DECLHIDDEN(void) rtThreadNativeDestroy(PRTTHREADINT pThread) +{ + NOREF(pThread); +} + + +#if RTLNX_VER_MIN(2,6,4) +/** + * Native kernel thread wrapper function. + * + * This will forward to rtThreadMain and do termination upon return. + * + * @param pvArg Pointer to the argument package. + */ +static int rtThreadNativeMain(void *pvArg) +{ + PRTTHREADINT pThread = (PRTTHREADINT)pvArg; + + rtThreadMain(pThread, (RTNATIVETHREAD)current, &pThread->szName[0]); + return 0; +} +#endif + + +DECLHIDDEN(int) rtThreadNativeCreate(PRTTHREADINT pThreadInt, PRTNATIVETHREAD pNativeThread) +{ +#if RTLNX_VER_MIN(2,6,4) + struct task_struct *NativeThread; + IPRT_LINUX_SAVE_EFL_AC(); + + RT_ASSERT_PREEMPTIBLE(); + + NativeThread = kthread_run(rtThreadNativeMain, pThreadInt, "iprt-%s", pThreadInt->szName); + + if (!IS_ERR(NativeThread)) + { + *pNativeThread = (RTNATIVETHREAD)NativeThread; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; + } + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_GENERAL_FAILURE; +#else + return VERR_NOT_IMPLEMENTED; +#endif +} + diff --git a/src/VBox/Runtime/r0drv/linux/threadctxhooks-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/threadctxhooks-r0drv-linux.c new file mode 100644 index 00000000..089e2ca6 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/threadctxhooks-r0drv-linux.c @@ -0,0 +1,330 @@ +/* $Id: threadctxhooks-r0drv-linux.c $ */ +/** @file + * IPRT - Thread Context Switching Hook, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2013-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" + +#include <iprt/mem.h> +#include <iprt/assert.h> +#include <iprt/thread.h> +#include <iprt/errcore.h> +#include <iprt/asm.h> +#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86) +# include <iprt/asm-amd64-x86.h> +#endif +#include "internal/thread.h" + + +/* + * Linux kernel 2.6.23 introduced preemption notifiers but RedHat 2.6.18 kernels + * got it backported. + */ +#if RTLNX_VER_MIN(2,6,18) && defined(CONFIG_PREEMPT_NOTIFIERS) + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * The internal hook object for linux. + */ +typedef struct RTTHREADCTXHOOKINT +{ + /** Magic value (RTTHREADCTXHOOKINT_MAGIC). */ + uint32_t volatile u32Magic; + /** The thread handle (owner) for which the hook is registered. */ + RTNATIVETHREAD hOwner; + /** The preemption notifier object. */ + struct preempt_notifier LnxPreemptNotifier; + /** Whether the hook is enabled or not. If enabled, the LnxPreemptNotifier + * is linked into the owning thread's list of preemption callouts. */ + bool fEnabled; + /** Pointer to the user callback. */ + PFNRTTHREADCTXHOOK pfnCallback; + /** User argument passed to the callback. */ + void *pvUser; + /** The linux callbacks. */ + struct preempt_ops PreemptOps; +#if RTLNX_VER_MIN(3,1,19) && defined(RT_ARCH_AMD64) + /** Starting with 3.1.19, the linux kernel doesn't restore kernel RFLAGS during + * task switch, so we have to do that ourselves. (x86 code is not affected.) */ + RTCCUINTREG fSavedRFlags; +#endif +} RTTHREADCTXHOOKINT; +typedef RTTHREADCTXHOOKINT *PRTTHREADCTXHOOKINT; + + +/** + * Hook function for the thread schedule out event. + * + * @param pPreemptNotifier Pointer to the preempt_notifier struct. + * @param pNext Pointer to the task that is being scheduled + * instead of the current thread. + * + * @remarks Called with the rq (runqueue) lock held and with preemption and + * interrupts disabled! + */ +static void rtThreadCtxHooksLnxSchedOut(struct preempt_notifier *pPreemptNotifier, struct task_struct *pNext) +{ + PRTTHREADCTXHOOKINT pThis = RT_FROM_MEMBER(pPreemptNotifier, RTTHREADCTXHOOKINT, LnxPreemptNotifier); +#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86) + RTCCUINTREG fSavedEFlags = ASMGetFlags(); + stac(); +#endif + RT_NOREF_PV(pNext); + + AssertPtr(pThis); + AssertPtr(pThis->pfnCallback); + Assert(pThis->fEnabled); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + pThis->pfnCallback(RTTHREADCTXEVENT_OUT, pThis->pvUser); + +#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86) + ASMSetFlags(fSavedEFlags); +# if RTLNX_VER_MIN(3,1,19) && defined(RT_ARCH_AMD64) + pThis->fSavedRFlags = fSavedEFlags; +# endif +#endif +} + + +/** + * Hook function for the thread schedule in event. + * + * @param pPreemptNotifier Pointer to the preempt_notifier struct. + * @param iCpu The CPU this thread is being scheduled on. + * + * @remarks Called without holding the rq (runqueue) lock and with preemption + * enabled! + * @todo r=bird: Preemption is of course disabled when it is called. + */ +static void rtThreadCtxHooksLnxSchedIn(struct preempt_notifier *pPreemptNotifier, int iCpu) +{ + PRTTHREADCTXHOOKINT pThis = RT_FROM_MEMBER(pPreemptNotifier, RTTHREADCTXHOOKINT, LnxPreemptNotifier); +#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86) + RTCCUINTREG fSavedEFlags = ASMGetFlags(); + stac(); +#endif + RT_NOREF_PV(iCpu); + + AssertPtr(pThis); + AssertPtr(pThis->pfnCallback); + Assert(pThis->fEnabled); + + pThis->pfnCallback(RTTHREADCTXEVENT_IN, pThis->pvUser); + +#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86) +# if RTLNX_VER_MIN(3,1,19) && defined(RT_ARCH_AMD64) + fSavedEFlags &= ~RT_BIT_64(18) /*X86_EFL_AC*/; + fSavedEFlags |= pThis->fSavedRFlags & RT_BIT_64(18) /*X86_EFL_AC*/; +# endif + ASMSetFlags(fSavedEFlags); +#endif +} + + +/** + * Worker function for RTThreadCtxHooks(Deregister|Release)(). + * + * @param pThis Pointer to the internal thread-context object. + */ +DECLINLINE(void) rtThreadCtxHookDisable(PRTTHREADCTXHOOKINT pThis) +{ + Assert(pThis->PreemptOps.sched_out == rtThreadCtxHooksLnxSchedOut); + Assert(pThis->PreemptOps.sched_in == rtThreadCtxHooksLnxSchedIn); + preempt_disable(); + preempt_notifier_unregister(&pThis->LnxPreemptNotifier); + pThis->fEnabled = false; + preempt_enable(); +} + + +RTDECL(int) RTThreadCtxHookCreate(PRTTHREADCTXHOOK phCtxHook, uint32_t fFlags, PFNRTTHREADCTXHOOK pfnCallback, void *pvUser) +{ + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate input. + */ + PRTTHREADCTXHOOKINT pThis; + Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + AssertPtrReturn(pfnCallback, VERR_INVALID_POINTER); + AssertReturn(fFlags == 0, VERR_INVALID_FLAGS); + + /* + * Allocate and initialize a new hook. We don't register it yet, just + * create it. + */ + pThis = (PRTTHREADCTXHOOKINT)RTMemAllocZ(sizeof(*pThis)); + if (RT_UNLIKELY(!pThis)) + { + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_NO_MEMORY; + } + pThis->u32Magic = RTTHREADCTXHOOKINT_MAGIC; + pThis->hOwner = RTThreadNativeSelf(); + pThis->fEnabled = false; + pThis->pfnCallback = pfnCallback; + pThis->pvUser = pvUser; + preempt_notifier_init(&pThis->LnxPreemptNotifier, &pThis->PreemptOps); + pThis->PreemptOps.sched_out = rtThreadCtxHooksLnxSchedOut; + pThis->PreemptOps.sched_in = rtThreadCtxHooksLnxSchedIn; + +#if RTLNX_VER_MIN(4,2,0) + preempt_notifier_inc(); +#endif + + *phCtxHook = pThis; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTThreadCtxHookCreate); + + +RTDECL(int ) RTThreadCtxHookDestroy(RTTHREADCTXHOOK hCtxHook) +{ + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate input. + */ + PRTTHREADCTXHOOKINT pThis = hCtxHook; + if (pThis == NIL_RTTHREADCTXHOOK) + return VINF_SUCCESS; + AssertPtr(pThis); + AssertMsgReturn(pThis->u32Magic == RTTHREADCTXHOOKINT_MAGIC, ("pThis->u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), + VERR_INVALID_HANDLE); + Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(!pThis->fEnabled || pThis->hOwner == RTThreadNativeSelf()); + + /* + * If there's still a registered thread-context hook, deregister it now before destroying the object. + */ + if (pThis->fEnabled) + { + Assert(pThis->hOwner == RTThreadNativeSelf()); + rtThreadCtxHookDisable(pThis); + Assert(!pThis->fEnabled); /* paranoia */ + } + +#if RTLNX_VER_MIN(4,2,0) + preempt_notifier_dec(); +#endif + + ASMAtomicWriteU32(&pThis->u32Magic, ~RTTHREADCTXHOOKINT_MAGIC); + RTMemFree(pThis); + + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTThreadCtxHookDestroy); + + +RTDECL(int) RTThreadCtxHookEnable(RTTHREADCTXHOOK hCtxHook) +{ + /* + * Validate input. + */ + PRTTHREADCTXHOOKINT pThis = hCtxHook; + AssertPtr(pThis); + AssertMsgReturn(pThis->u32Magic == RTTHREADCTXHOOKINT_MAGIC, ("pThis->u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), + VERR_INVALID_HANDLE); + Assert(pThis->hOwner == RTThreadNativeSelf()); + Assert(!pThis->fEnabled); + if (!pThis->fEnabled) + { + IPRT_LINUX_SAVE_EFL_AC(); + Assert(pThis->PreemptOps.sched_out == rtThreadCtxHooksLnxSchedOut); + Assert(pThis->PreemptOps.sched_in == rtThreadCtxHooksLnxSchedIn); + + /* + * Register the callback. + */ + preempt_disable(); + pThis->fEnabled = true; + preempt_notifier_register(&pThis->LnxPreemptNotifier); + preempt_enable(); + + IPRT_LINUX_RESTORE_EFL_AC(); + } + + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTThreadCtxHookEnable); + + +RTDECL(int) RTThreadCtxHookDisable(RTTHREADCTXHOOK hCtxHook) +{ + /* + * Validate input. + */ + PRTTHREADCTXHOOKINT pThis = hCtxHook; + if (pThis != NIL_RTTHREADCTXHOOK) + { + AssertPtr(pThis); + AssertMsgReturn(pThis->u32Magic == RTTHREADCTXHOOKINT_MAGIC, ("pThis->u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), + VERR_INVALID_HANDLE); + Assert(pThis->hOwner == RTThreadNativeSelf()); + + /* + * Deregister the callback. + */ + if (pThis->fEnabled) + { + IPRT_LINUX_SAVE_EFL_AC(); + rtThreadCtxHookDisable(pThis); + IPRT_LINUX_RESTORE_EFL_AC(); + } + } + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTThreadCtxHookDisable); + + +RTDECL(bool) RTThreadCtxHookIsEnabled(RTTHREADCTXHOOK hCtxHook) +{ + /* + * Validate input. + */ + PRTTHREADCTXHOOKINT pThis = hCtxHook; + if (pThis == NIL_RTTHREADCTXHOOK) + return false; + AssertPtr(pThis); + AssertMsgReturn(pThis->u32Magic == RTTHREADCTXHOOKINT_MAGIC, ("pThis->u32Magic=%RX32 pThis=%p\n", pThis->u32Magic, pThis), + false); + + return pThis->fEnabled; +} + +#else /* Not supported / Not needed */ +# include "../generic/threadctxhooks-r0drv-generic.cpp" +#endif /* Not supported / Not needed */ + diff --git a/src/VBox/Runtime/r0drv/linux/time-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/time-r0drv-linux.c new file mode 100644 index 00000000..c72a4e6d --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/time-r0drv-linux.c @@ -0,0 +1,211 @@ +/* $Id: time-r0drv-linux.c $ */ +/** @file + * IPRT - Time, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_TIME +#include "the-linux-kernel.h" +#include "internal/iprt.h" +/* Make sure we have the setting functions we need for RTTimeNow: */ +#if RTLNX_VER_MAX(2,6,16) +# define RTTIME_INCL_TIMEVAL +#elif RTLNX_VER_MAX(3,17,0) +# define RTTIME_INCL_TIMESPEC +#endif +#include <iprt/time.h> +#include <iprt/asm.h> + + + +DECLINLINE(uint64_t) rtTimeGetSystemNanoTS(void) +{ +#if RTLNX_VER_MIN(5,6,0) + /* + * Starting with kernel version 5.6-rc3 only 64-bit time interfaces + * are allowed in the kernel. + */ + uint64_t u64; + struct timespec64 Ts = { 0, 0 }; + + ktime_get_ts64(&Ts); + u64 = Ts.tv_sec * RT_NS_1SEC_64 + Ts.tv_nsec; + return u64; + +#elif RTLNX_VER_MIN(2,6,16) /* This must match timer-r0drv-linux.c! */ + /* + * Use ktime_get_ts, this is also what clock_gettime(CLOCK_MONOTONIC,) is using. + */ + uint64_t u64; + struct timespec Ts = { 0, 0 }; + ktime_get_ts(&Ts); + u64 = Ts.tv_sec * RT_NS_1SEC_64 + Ts.tv_nsec; + return u64; + +#elif RTLNX_VER_MIN(2,5,60) + /* + * Seems there is no way of getting to the exact source of + * sys_clock_gettime(CLOCK_MONOTONIC, &ts) here, I think. But + * 64-bit jiffies adjusted for the initial value should be pretty + * much the same I hope. + */ + uint64_t u64 = get_jiffies_64(); +# ifdef INITIAL_JIFFIES + u64 += INITIAL_JIFFIES; +# endif + u64 *= TICK_NSEC; + return u64; + +#else /* < 2.5.60 */ +# if BITS_PER_LONG >= 64 + /* + * This is the same as above, except that there is no get_jiffies_64() + * here and we rely on long, and therefor jiffies, being 64-bit instead. + */ + uint64_t u64 = jiffies; +# ifdef INITIAL_JIFFIES + u64 += INITIAL_JIFFIES; +# endif + u64 *= TICK_NSEC; + return u64; + +# else /* 32 bit jiffies */ + /* + * We'll have to try track jiffy rollovers here or we'll be + * in trouble every time it flips. + * + * The high dword of the s_u64Last is the rollover count, the + * low dword is the previous jiffies. Updating is done by + * atomic compare & exchange of course. + */ + static uint64_t volatile s_u64Last = 0; + uint64_t u64; + + for (;;) + { + uint64_t u64NewLast; + int32_t iDelta; + uint32_t cRollovers; + uint32_t u32LastJiffies; + + /* sample the values */ + unsigned long ulNow = jiffies; + uint64_t u64Last = s_u64Last; + if (ulNow != jiffies) + continue; /* try again */ +# ifdef INITIAL_JIFFIES + ulNow += INITIAL_JIFFIES; +# endif + + u32LastJiffies = (uint32_t)u64Last; + cRollovers = u64Last >> 32; + + /* + * Check for rollover and update the static last value. + * + * We have to make sure we update it successfully to rule out + * an underrun because of racing someone. + */ + iDelta = ulNow - u32LastJiffies; + if (iDelta < 0) + { + cRollovers++; + u64NewLast = RT_MAKE_U64(ulNow, cRollovers); + if (!ASMAtomicCmpXchgU64(&s_u64Last, u64NewLast, u64Last)) + continue; /* race, try again */ + } + else + { + u64NewLast = RT_MAKE_U64(ulNow, cRollovers); + ASMAtomicCmpXchgU64(&s_u64Last, u64NewLast, u64Last); + } + + /* calculate the return value */ + u64 = ulNow; + u64 *= TICK_NSEC; + u64 += cRollovers * (_4G * TICK_NSEC); + break; + } + + return u64; +# endif /* 32 bit jiffies */ +#endif /* < 2.5.60 */ +} + + +RTDECL(uint64_t) RTTimeNanoTS(void) +{ + return rtTimeGetSystemNanoTS(); +} +RT_EXPORT_SYMBOL(RTTimeNanoTS); + + +RTDECL(uint64_t) RTTimeMilliTS(void) +{ + return rtTimeGetSystemNanoTS() / RT_NS_1MS; +} +RT_EXPORT_SYMBOL(RTTimeMilliTS); + + +RTDECL(uint64_t) RTTimeSystemNanoTS(void) +{ + return rtTimeGetSystemNanoTS(); +} +RT_EXPORT_SYMBOL(RTTimeSystemNanoTS); + + +RTDECL(uint64_t) RTTimeSystemMilliTS(void) +{ + return rtTimeGetSystemNanoTS() / RT_NS_1MS; +} +RT_EXPORT_SYMBOL(RTTimeSystemMilliTS); + + +RTDECL(PRTTIMESPEC) RTTimeNow(PRTTIMESPEC pTime) +{ + IPRT_LINUX_SAVE_EFL_AC(); +#if RTLNX_VER_MIN(3,17,0) + struct timespec64 Ts; + ktime_get_real_ts64(&Ts); /* ktime_get_real_ts64 was added as a macro in 3.17, function since 4.18. */ + IPRT_LINUX_RESTORE_EFL_AC(); + return RTTimeSpecSetTimespec64(pTime, &Ts); + +#elif RTLNX_VER_MIN(2,6,16) + struct timespec Ts; + ktime_get_real_ts(&Ts); /* ktime_get_real_ts was removed in Linux 4.20. */ + IPRT_LINUX_RESTORE_EFL_AC(); + return RTTimeSpecSetTimespec(pTime, &Ts); + +#else /* < 2.6.16 */ + struct timeval Tv; + do_gettimeofday(&Tv); + IPRT_LINUX_RESTORE_EFL_AC(); + return RTTimeSpecSetTimeval(pTime, &Tv); +#endif +} +RT_EXPORT_SYMBOL(RTTimeNow); + diff --git a/src/VBox/Runtime/r0drv/linux/timer-r0drv-linux.c b/src/VBox/Runtime/r0drv/linux/timer-r0drv-linux.c new file mode 100644 index 00000000..8d70c1ab --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/timer-r0drv-linux.c @@ -0,0 +1,1727 @@ +/* $Id: timer-r0drv-linux.c $ */ +/** @file + * IPRT - Timers, Ring-0 Driver, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "the-linux-kernel.h" +#include "internal/iprt.h" + +#include <iprt/timer.h> +#include <iprt/time.h> +#include <iprt/mp.h> +#include <iprt/cpuset.h> +#include <iprt/spinlock.h> +#include <iprt/err.h> +#include <iprt/asm.h> +#include <iprt/assert.h> +#include <iprt/alloc.h> + +#include "internal/magics.h" + +/** @def RTTIMER_LINUX_WITH_HRTIMER + * Whether to use high resolution timers. */ +#if !defined(RTTIMER_LINUX_WITH_HRTIMER) \ + && defined(IPRT_LINUX_HAS_HRTIMER) +# define RTTIMER_LINUX_WITH_HRTIMER +#endif + +#if RTLNX_VER_MAX(2,6,31) +# define mod_timer_pinned mod_timer +# define HRTIMER_MODE_ABS_PINNED HRTIMER_MODE_ABS +#endif + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * Timer state machine. + * + * This is used to try handle the issues with MP events and + * timers that runs on all CPUs. It's relatively nasty :-/ + */ +typedef enum RTTIMERLNXSTATE +{ + /** Stopped. */ + RTTIMERLNXSTATE_STOPPED = 0, + /** Transient state; next ACTIVE. */ + RTTIMERLNXSTATE_STARTING, + /** Transient state; next ACTIVE. (not really necessary) */ + RTTIMERLNXSTATE_MP_STARTING, + /** Active. */ + RTTIMERLNXSTATE_ACTIVE, + /** Active and in callback; next ACTIVE, STOPPED or CALLBACK_DESTROYING. */ + RTTIMERLNXSTATE_CALLBACK, + /** Stopped while in the callback; next STOPPED. */ + RTTIMERLNXSTATE_CB_STOPPING, + /** Restarted while in the callback; next ACTIVE, STOPPED, DESTROYING. */ + RTTIMERLNXSTATE_CB_RESTARTING, + /** The callback shall destroy the timer; next STOPPED. */ + RTTIMERLNXSTATE_CB_DESTROYING, + /** Transient state; next STOPPED. */ + RTTIMERLNXSTATE_STOPPING, + /** Transient state; next STOPPED. */ + RTTIMERLNXSTATE_MP_STOPPING, + /** The usual 32-bit hack. */ + RTTIMERLNXSTATE_32BIT_HACK = 0x7fffffff +} RTTIMERLNXSTATE; + + +/** + * A Linux sub-timer. + */ +typedef struct RTTIMERLNXSUBTIMER +{ + /** Timer specific data. */ + union + { +#if defined(RTTIMER_LINUX_WITH_HRTIMER) + /** High resolution timer. */ + struct + { + /** The linux timer structure. */ + struct hrtimer LnxTimer; + } Hr; +#endif + /** Standard timer. */ + struct + { + /** The linux timer structure. */ + struct timer_list LnxTimer; + /** The start of the current run (ns). + * This is used to calculate when the timer ought to fire the next time. */ + uint64_t u64NextTS; + /** When the timer was started. */ + uint64_t nsStartTS; + /** The u64NextTS in jiffies. */ + unsigned long ulNextJiffies; + /** Set when starting or changing the timer so that u64StartTs + * and u64NextTS gets reinitialized (eliminating some jitter). */ + bool volatile fFirstAfterChg; + } Std; + } u; + /** The current tick number. */ + uint64_t iTick; + /** Restart the single shot timer at this specific time. + * Used when a single shot timer is restarted from the callback. */ + uint64_t volatile uNsRestartAt; + /** Pointer to the parent timer. */ + PRTTIMER pParent; + /** The current sub-timer state. */ + RTTIMERLNXSTATE volatile enmState; +} RTTIMERLNXSUBTIMER; +/** Pointer to a linux sub-timer. */ +typedef RTTIMERLNXSUBTIMER *PRTTIMERLNXSUBTIMER; + + +/** + * The internal representation of an Linux timer handle. + */ +typedef struct RTTIMER +{ + /** Magic. + * This is RTTIMER_MAGIC, but changes to something else before the timer + * is destroyed to indicate clearly that thread should exit. */ + uint32_t volatile u32Magic; + /** Spinlock synchronizing the fSuspended and MP event handling. + * This is NIL_RTSPINLOCK if cCpus == 1. */ + RTSPINLOCK hSpinlock; + /** Flag indicating that the timer is suspended. */ + bool volatile fSuspended; + /** Whether the timer must run on one specific CPU or not. */ + bool fSpecificCpu; +#ifdef CONFIG_SMP + /** Whether the timer must run on all CPUs or not. */ + bool fAllCpus; +#endif /* else: All -> specific on non-SMP kernels */ + /** Whether it is a high resolution timer or a standard one. */ + bool fHighRes; + /** The id of the CPU it must run on if fSpecificCpu is set. */ + RTCPUID idCpu; + /** The number of CPUs this timer should run on. */ + RTCPUID cCpus; + /** Callback. */ + PFNRTTIMER pfnTimer; + /** User argument. */ + void *pvUser; + /** The timer interval. 0 if one-shot. */ + uint64_t volatile u64NanoInterval; + /** This is set to the number of jiffies between ticks if the interval is + * an exact number of jiffies. (Standard timers only.) */ + unsigned long volatile cJiffies; + /** The change interval spinlock for standard timers only. */ + spinlock_t ChgIntLock; + /** Workqueue item for delayed destruction. */ + RTR0LNXWORKQUEUEITEM DtorWorkqueueItem; + /** Sub-timers. + * Normally there is just one, but for RTTIMER_FLAGS_CPU_ALL this will contain + * an entry for all possible cpus. In that case the index will be the same as + * for the RTCpuSet. */ + RTTIMERLNXSUBTIMER aSubTimers[1]; +} RTTIMER; + + +/** + * A rtTimerLinuxStartOnCpu and rtTimerLinuxStartOnCpu argument package. + */ +typedef struct RTTIMERLINUXSTARTONCPUARGS +{ + /** The current time (RTTimeSystemNanoTS). */ + uint64_t u64Now; + /** When to start firing (delta). */ + uint64_t u64First; +} RTTIMERLINUXSTARTONCPUARGS; +/** Pointer to a rtTimerLinuxStartOnCpu argument package. */ +typedef RTTIMERLINUXSTARTONCPUARGS *PRTTIMERLINUXSTARTONCPUARGS; + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +#ifdef CONFIG_SMP +static DECLCALLBACK(void) rtTimerLinuxMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser); +#endif + +#if 0 +#define DEBUG_HACKING +#include <iprt/string.h> +#include <iprt/asm-amd64-x86.h> +static void myLogBackdoorPrintf(const char *pszFormat, ...) +{ + char szTmp[256]; + va_list args; + size_t cb; + + cb = RTStrPrintf(szTmp, sizeof(szTmp) - 10, "%d: ", RTMpCpuId()); + va_start(args, pszFormat); + cb += RTStrPrintfV(&szTmp[cb], sizeof(szTmp) - cb, pszFormat, args); + va_end(args); + + ASMOutStrU8(0x504, (uint8_t *)&szTmp[0], cb); +} +# define RTAssertMsg1Weak(pszExpr, uLine, pszFile, pszFunction) \ + myLogBackdoorPrintf("\n!!Guest Assertion failed!!\n%s(%d) %s\n%s\n", uLine, pszFile, pszFunction, (pszExpr)) +# define RTAssertMsg2Weak myLogBackdoorPrintf +# define RTTIMERLNX_LOG(a) myLogBackdoorPrintf a +#else +# define RTTIMERLNX_LOG(a) do { } while (0) +#endif + +/** + * Sets the state. + */ +DECLINLINE(void) rtTimerLnxSetState(RTTIMERLNXSTATE volatile *penmState, RTTIMERLNXSTATE enmNewState) +{ +#ifdef DEBUG_HACKING + RTTIMERLNX_LOG(("set %d -> %d\n", *penmState, enmNewState)); +#endif + ASMAtomicWriteU32((uint32_t volatile *)penmState, enmNewState); +} + + +/** + * Sets the state if it has a certain value. + * + * @return true if xchg was done. + * @return false if xchg wasn't done. + */ +#ifdef DEBUG_HACKING +#define rtTimerLnxCmpXchgState(penmState, enmNewState, enmCurState) rtTimerLnxCmpXchgStateDebug(penmState, enmNewState, enmCurState, __LINE__) +static bool rtTimerLnxCmpXchgStateDebug(RTTIMERLNXSTATE volatile *penmState, RTTIMERLNXSTATE enmNewState, + RTTIMERLNXSTATE enmCurState, uint32_t uLine) +{ + RTTIMERLNXSTATE enmOldState = enmCurState; + bool fRc = ASMAtomicCmpXchgExU32((uint32_t volatile *)penmState, enmNewState, enmCurState, (uint32_t *)&enmOldState); + RTTIMERLNX_LOG(("cxg %d -> %d - %d at %u\n", enmOldState, enmNewState, fRc, uLine)); + return fRc; +} +#else +DECLINLINE(bool) rtTimerLnxCmpXchgState(RTTIMERLNXSTATE volatile *penmState, RTTIMERLNXSTATE enmNewState, + RTTIMERLNXSTATE enmCurState) +{ + return ASMAtomicCmpXchgU32((uint32_t volatile *)penmState, enmNewState, enmCurState); +} +#endif + + +/** + * Gets the state. + */ +DECLINLINE(RTTIMERLNXSTATE) rtTimerLnxGetState(RTTIMERLNXSTATE volatile *penmState) +{ + return (RTTIMERLNXSTATE)ASMAtomicUoReadU32((uint32_t volatile *)penmState); +} + +#ifdef RTTIMER_LINUX_WITH_HRTIMER + +/** + * Converts a nano second time stamp to ktime_t. + * + * ASSUMES RTTimeSystemNanoTS() is implemented using ktime_get_ts(). + * + * @returns ktime_t. + * @param cNanoSecs Nanoseconds. + */ +DECLINLINE(ktime_t) rtTimerLnxNanoToKt(uint64_t cNanoSecs) +{ + /* With some luck the compiler optimizes the division out of this... (Bet it doesn't.) */ + return ktime_set(cNanoSecs / 1000000000, cNanoSecs % 1000000000); +} + +/** + * Converts ktime_t to a nano second time stamp. + * + * ASSUMES RTTimeSystemNanoTS() is implemented using ktime_get_ts(). + * + * @returns nano second time stamp. + * @param Kt ktime_t. + */ +DECLINLINE(uint64_t) rtTimerLnxKtToNano(ktime_t Kt) +{ + return ktime_to_ns(Kt); +} + +#endif /* RTTIMER_LINUX_WITH_HRTIMER */ + +/** + * Converts a nano second interval to jiffies. + * + * @returns Jiffies. + * @param cNanoSecs Nanoseconds. + */ +DECLINLINE(unsigned long) rtTimerLnxNanoToJiffies(uint64_t cNanoSecs) +{ + /* this can be made even better... */ + if (cNanoSecs > (uint64_t)TICK_NSEC * MAX_JIFFY_OFFSET) + return MAX_JIFFY_OFFSET; +# if ARCH_BITS == 32 + if (RT_LIKELY(cNanoSecs <= UINT32_MAX)) + return ((uint32_t)cNanoSecs + (TICK_NSEC-1)) / TICK_NSEC; +# endif + return (cNanoSecs + (TICK_NSEC-1)) / TICK_NSEC; +} + + +/** + * Starts a sub-timer (RTTimerStart). + * + * @param pSubTimer The sub-timer to start. + * @param u64Now The current timestamp (RTTimeSystemNanoTS()). + * @param u64First The interval from u64Now to the first time the timer should fire. + * @param fPinned true = timer pinned to a specific CPU, + * false = timer can migrate between CPUs + * @param fHighRes Whether the user requested a high resolution timer or not. + * @param enmOldState The old timer state. + */ +static void rtTimerLnxStartSubTimer(PRTTIMERLNXSUBTIMER pSubTimer, uint64_t u64Now, uint64_t u64First, + bool fPinned, bool fHighRes) +{ + /* + * Calc when it should start firing. + */ + uint64_t u64NextTS = u64Now + u64First; + if (!fHighRes) + { + pSubTimer->u.Std.u64NextTS = u64NextTS; + pSubTimer->u.Std.nsStartTS = u64NextTS; + } + RTTIMERLNX_LOG(("startsubtimer %p\n", pSubTimer->pParent)); + + pSubTimer->iTick = 0; + +#ifdef RTTIMER_LINUX_WITH_HRTIMER + if (fHighRes) + hrtimer_start(&pSubTimer->u.Hr.LnxTimer, rtTimerLnxNanoToKt(u64NextTS), + fPinned ? HRTIMER_MODE_ABS_PINNED : HRTIMER_MODE_ABS); + else +#endif + { + unsigned long cJiffies = !u64First ? 0 : rtTimerLnxNanoToJiffies(u64First); + pSubTimer->u.Std.ulNextJiffies = jiffies + cJiffies; + pSubTimer->u.Std.fFirstAfterChg = true; +#ifdef CONFIG_SMP + if (fPinned) + { +# if RTLNX_VER_MIN(4,8,0) + mod_timer(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies); +# else + mod_timer_pinned(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies); +# endif + } + else +#endif + mod_timer(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies); + } + + /* Be a bit careful here since we could be racing the callback. */ + if (!rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_STARTING)) + rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_MP_STARTING); +} + + +/** + * Stops a sub-timer (RTTimerStart and rtTimerLinuxMpEvent()). + * + * The caller has already changed the state, so we will not be in a callback + * situation wrt to the calling thread. + * + * @param pSubTimer The sub-timer. + * @param fHighRes Whether the user requested a high resolution timer or not. + */ +static void rtTimerLnxStopSubTimer(PRTTIMERLNXSUBTIMER pSubTimer, bool fHighRes) +{ + RTTIMERLNX_LOG(("stopsubtimer %p %d\n", pSubTimer->pParent, fHighRes)); +#ifdef RTTIMER_LINUX_WITH_HRTIMER + if (fHighRes) + { + /* There is no equivalent to del_timer in the hrtimer API, + hrtimer_cancel() == del_timer_sync(). Just like the WARN_ON in + del_timer_sync() asserts, waiting for a timer callback to complete + is deadlock prone, so don't do it. */ + int rc = hrtimer_try_to_cancel(&pSubTimer->u.Hr.LnxTimer); + if (rc < 0) + { + hrtimer_start(&pSubTimer->u.Hr.LnxTimer, ktime_set(KTIME_SEC_MAX, 0), HRTIMER_MODE_ABS); + hrtimer_try_to_cancel(&pSubTimer->u.Hr.LnxTimer); + } + } + else +#endif + del_timer(&pSubTimer->u.Std.LnxTimer); + + rtTimerLnxSetState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED); +} + + +/** + * Used by RTTimerDestroy and rtTimerLnxCallbackDestroy to do the actual work. + * + * @param pTimer The timer in question. + */ +static void rtTimerLnxDestroyIt(PRTTIMER pTimer) +{ + RTSPINLOCK hSpinlock = pTimer->hSpinlock; + RTCPUID iCpu; + Assert(pTimer->fSuspended); + RTTIMERLNX_LOG(("destroyit %p\n", pTimer)); + + /* + * Remove the MP notifications first because it'll reduce the risk of + * us overtaking any MP event that might theoretically be racing us here. + */ +#ifdef CONFIG_SMP + if ( pTimer->cCpus > 1 + && hSpinlock != NIL_RTSPINLOCK) + { + int rc = RTMpNotificationDeregister(rtTimerLinuxMpEvent, pTimer); + AssertRC(rc); + } +#endif /* CONFIG_SMP */ + + /* + * Invalidate the handle. + */ + ASMAtomicWriteU32(&pTimer->u32Magic, ~RTTIMER_MAGIC); + + /* + * Make sure all timers have stopped executing since we're stopping them in + * an asynchronous manner up in rtTimerLnxStopSubTimer. + */ + iCpu = pTimer->cCpus; + while (iCpu-- > 0) + { +#ifdef RTTIMER_LINUX_WITH_HRTIMER + if (pTimer->fHighRes) + hrtimer_cancel(&pTimer->aSubTimers[iCpu].u.Hr.LnxTimer); + else +#endif + del_timer_sync(&pTimer->aSubTimers[iCpu].u.Std.LnxTimer); + } + + /* + * Finally, free the resources. + */ + RTMemFreeEx(pTimer, RT_UOFFSETOF_DYN(RTTIMER, aSubTimers[pTimer->cCpus])); + if (hSpinlock != NIL_RTSPINLOCK) + RTSpinlockDestroy(hSpinlock); +} + + +/** + * Workqueue callback (no DECLCALLBACK!) for deferred destruction. + * + * @param pWork Pointer to the DtorWorkqueueItem member of our timer + * structure. + */ +static void rtTimerLnxDestroyDeferred(RTR0LNXWORKQUEUEITEM *pWork) +{ + PRTTIMER pTimer = RT_FROM_MEMBER(pWork, RTTIMER, DtorWorkqueueItem); + rtTimerLnxDestroyIt(pTimer); +} + + +/** + * Called when the timer was destroyed by the callback function. + * + * @param pTimer The timer. + * @param pSubTimer The sub-timer which we're handling, the state of this + * will be RTTIMERLNXSTATE_CALLBACK_DESTROYING. + */ +static void rtTimerLnxCallbackDestroy(PRTTIMER pTimer, PRTTIMERLNXSUBTIMER pSubTimer) +{ + /* + * If it's an omni timer, the last dude does the destroying. + */ + if (pTimer->cCpus > 1) + { + uint32_t iCpu = pTimer->cCpus; + RTSpinlockAcquire(pTimer->hSpinlock); + + Assert(pSubTimer->enmState == RTTIMERLNXSTATE_CB_DESTROYING); + rtTimerLnxSetState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED); + + while (iCpu-- > 0) + if (rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState) != RTTIMERLNXSTATE_STOPPED) + { + RTSpinlockRelease(pTimer->hSpinlock); + return; + } + + RTSpinlockRelease(pTimer->hSpinlock); + } + + /* + * Destroying a timer from the callback is unsafe since the callout code + * might be touching the timer structure upon return (hrtimer does!). So, + * we have to defer the actual destruction to the IRPT workqueue. + */ + rtR0LnxWorkqueuePush(&pTimer->DtorWorkqueueItem, rtTimerLnxDestroyDeferred); +} + + +#ifdef CONFIG_SMP +/** + * Deal with a sub-timer that has migrated. + * + * @param pTimer The timer. + * @param pSubTimer The sub-timer. + */ +static void rtTimerLnxCallbackHandleMigration(PRTTIMER pTimer, PRTTIMERLNXSUBTIMER pSubTimer) +{ + RTTIMERLNXSTATE enmState; + if (pTimer->cCpus > 1) + RTSpinlockAcquire(pTimer->hSpinlock); + + do + { + enmState = rtTimerLnxGetState(&pSubTimer->enmState); + switch (enmState) + { + case RTTIMERLNXSTATE_STOPPING: + case RTTIMERLNXSTATE_MP_STOPPING: + enmState = RTTIMERLNXSTATE_STOPPED; + case RTTIMERLNXSTATE_STOPPED: + break; + + default: + AssertMsgFailed(("%d\n", enmState)); RT_FALL_THRU(); + case RTTIMERLNXSTATE_STARTING: + case RTTIMERLNXSTATE_MP_STARTING: + case RTTIMERLNXSTATE_ACTIVE: + case RTTIMERLNXSTATE_CALLBACK: + case RTTIMERLNXSTATE_CB_STOPPING: + case RTTIMERLNXSTATE_CB_RESTARTING: + if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, enmState)) + enmState = RTTIMERLNXSTATE_STOPPED; + break; + + case RTTIMERLNXSTATE_CB_DESTROYING: + { + if (pTimer->cCpus > 1) + RTSpinlockRelease(pTimer->hSpinlock); + + rtTimerLnxCallbackDestroy(pTimer, pSubTimer); + return; + } + } + } while (enmState != RTTIMERLNXSTATE_STOPPED); + + if (pTimer->cCpus > 1) + RTSpinlockRelease(pTimer->hSpinlock); +} +#endif /* CONFIG_SMP */ + + +/** + * The slow path of rtTimerLnxChangeToCallbackState. + * + * @returns true if changed successfully, false if not. + * @param pSubTimer The sub-timer. + */ +static bool rtTimerLnxChangeToCallbackStateSlow(PRTTIMERLNXSUBTIMER pSubTimer) +{ + for (;;) + { + RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pSubTimer->enmState); + switch (enmState) + { + case RTTIMERLNXSTATE_ACTIVE: + case RTTIMERLNXSTATE_STARTING: + case RTTIMERLNXSTATE_MP_STARTING: + if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_CALLBACK, enmState)) + return true; + break; + + case RTTIMERLNXSTATE_CALLBACK: + case RTTIMERLNXSTATE_CB_STOPPING: + case RTTIMERLNXSTATE_CB_RESTARTING: + case RTTIMERLNXSTATE_CB_DESTROYING: + AssertMsgFailed(("%d\n", enmState)); RT_FALL_THRU(); + default: + return false; + } + ASMNopPause(); + } +} + + +/** + * Tries to change the sub-timer state to 'callback'. + * + * @returns true if changed successfully, false if not. + * @param pSubTimer The sub-timer. + */ +DECLINLINE(bool) rtTimerLnxChangeToCallbackState(PRTTIMERLNXSUBTIMER pSubTimer) +{ + if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_CALLBACK, RTTIMERLNXSTATE_ACTIVE))) + return true; + return rtTimerLnxChangeToCallbackStateSlow(pSubTimer); +} + + +#ifdef RTTIMER_LINUX_WITH_HRTIMER +/** + * Timer callback function for high resolution timers. + * + * @returns HRTIMER_NORESTART or HRTIMER_RESTART depending on whether it's a + * one-shot or interval timer. + * @param pHrTimer Pointer to the sub-timer structure. + */ +static enum hrtimer_restart rtTimerLinuxHrCallback(struct hrtimer *pHrTimer) +{ + PRTTIMERLNXSUBTIMER pSubTimer = RT_FROM_MEMBER(pHrTimer, RTTIMERLNXSUBTIMER, u.Hr.LnxTimer); + PRTTIMER pTimer = pSubTimer->pParent; + + + RTTIMERLNX_LOG(("hrcallback %p\n", pTimer)); + if (RT_UNLIKELY(!rtTimerLnxChangeToCallbackState(pSubTimer))) + return HRTIMER_NORESTART; + +#ifdef CONFIG_SMP + /* + * Check for unwanted migration. + */ + if (pTimer->fAllCpus || pTimer->fSpecificCpu) + { + RTCPUID idCpu = RTMpCpuId(); + if (RT_UNLIKELY( pTimer->fAllCpus + ? (RTCPUID)(pSubTimer - &pTimer->aSubTimers[0]) != idCpu + : pTimer->idCpu != idCpu)) + { + rtTimerLnxCallbackHandleMigration(pTimer, pSubTimer); + return HRTIMER_NORESTART; + } + } +#endif + + if (pTimer->u64NanoInterval) + { + /* + * Periodic timer, run it and update the native timer afterwards so + * we can handle RTTimerStop and RTTimerChangeInterval from the + * callback as well as a racing control thread. + */ + pTimer->pfnTimer(pTimer, pTimer->pvUser, ++pSubTimer->iTick); + hrtimer_add_expires_ns(&pSubTimer->u.Hr.LnxTimer, ASMAtomicReadU64(&pTimer->u64NanoInterval)); + if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_CALLBACK))) + return HRTIMER_RESTART; + } + else + { + /* + * One shot timer (no omni), stop it before dispatching it. + * Allow RTTimerStart as well as RTTimerDestroy to be called from + * the callback. + */ + ASMAtomicWriteBool(&pTimer->fSuspended, true); + pTimer->pfnTimer(pTimer, pTimer->pvUser, ++pSubTimer->iTick); + if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_CALLBACK))) + return HRTIMER_NORESTART; + } + + /* + * Some state change occurred while we were in the callback routine. + */ + for (;;) + { + RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pSubTimer->enmState); + switch (enmState) + { + case RTTIMERLNXSTATE_CB_DESTROYING: + rtTimerLnxCallbackDestroy(pTimer, pSubTimer); + return HRTIMER_NORESTART; + + case RTTIMERLNXSTATE_CB_STOPPING: + if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_CB_STOPPING)) + return HRTIMER_NORESTART; + break; + + case RTTIMERLNXSTATE_CB_RESTARTING: + if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_CB_RESTARTING)) + { + pSubTimer->iTick = 0; + hrtimer_set_expires(&pSubTimer->u.Hr.LnxTimer, rtTimerLnxNanoToKt(pSubTimer->uNsRestartAt)); + return HRTIMER_RESTART; + } + break; + + default: + AssertMsgFailed(("%d\n", enmState)); + return HRTIMER_NORESTART; + } + ASMNopPause(); + } +} +#endif /* RTTIMER_LINUX_WITH_HRTIMER */ + + +#if RTLNX_VER_MIN(4,15,0) +/** + * Timer callback function for standard timers. + * + * @param pLnxTimer Pointer to the Linux timer structure. + */ +static void rtTimerLinuxStdCallback(struct timer_list *pLnxTimer) +{ + PRTTIMERLNXSUBTIMER pSubTimer = from_timer(pSubTimer, pLnxTimer, u.Std.LnxTimer); +#else +/** + * Timer callback function for standard timers. + * + * @param ulUser Address of the sub-timer structure. + */ +static void rtTimerLinuxStdCallback(unsigned long ulUser) +{ + PRTTIMERLNXSUBTIMER pSubTimer = (PRTTIMERLNXSUBTIMER)ulUser; +#endif + PRTTIMER pTimer = pSubTimer->pParent; + + RTTIMERLNX_LOG(("stdcallback %p\n", pTimer)); + if (RT_UNLIKELY(!rtTimerLnxChangeToCallbackState(pSubTimer))) + return; + +#ifdef CONFIG_SMP + /* + * Check for unwanted migration. + */ + if (pTimer->fAllCpus || pTimer->fSpecificCpu) + { + RTCPUID idCpu = RTMpCpuId(); + if (RT_UNLIKELY( pTimer->fAllCpus + ? (RTCPUID)(pSubTimer - &pTimer->aSubTimers[0]) != idCpu + : pTimer->idCpu != idCpu)) + { + rtTimerLnxCallbackHandleMigration(pTimer, pSubTimer); + return; + } + } +#endif + + if (pTimer->u64NanoInterval) + { + /* + * Interval timer, calculate the next timeout. + * + * The first time around, we'll re-adjust the u.Std.u64NextTS to + * try prevent some jittering if we were started at a bad time. + */ + const uint64_t iTick = ++pSubTimer->iTick; + unsigned long uCurJiffies = jiffies; + unsigned long ulNextJiffies; + uint64_t u64NanoInterval; + unsigned long cJiffies; + unsigned long flFlags; + + spin_lock_irqsave(&pTimer->ChgIntLock, flFlags); + u64NanoInterval = pTimer->u64NanoInterval; + cJiffies = pTimer->cJiffies; + if (RT_UNLIKELY(pSubTimer->u.Std.fFirstAfterChg)) + { + pSubTimer->u.Std.fFirstAfterChg = false; + pSubTimer->u.Std.u64NextTS = RTTimeSystemNanoTS(); + pSubTimer->u.Std.nsStartTS = pSubTimer->u.Std.u64NextTS - u64NanoInterval * (iTick - 1); + pSubTimer->u.Std.ulNextJiffies = uCurJiffies = jiffies; + } + spin_unlock_irqrestore(&pTimer->ChgIntLock, flFlags); + + pSubTimer->u.Std.u64NextTS += u64NanoInterval; + if (cJiffies) + { + ulNextJiffies = pSubTimer->u.Std.ulNextJiffies + cJiffies; + pSubTimer->u.Std.ulNextJiffies = ulNextJiffies; + if (time_after_eq(ulNextJiffies, uCurJiffies)) + { /* likely */ } + else + { + unsigned long cJiffiesBehind = uCurJiffies - ulNextJiffies; + ulNextJiffies = uCurJiffies + cJiffies / 2; + if (cJiffiesBehind >= HZ / 4) /* Conside if we're lagging too far behind. Screw the u64NextTS member. */ + pSubTimer->u.Std.ulNextJiffies = ulNextJiffies; + /*else: Don't update u.Std.ulNextJiffies so we can continue catching up in the next tick. */ + } + } + else + { + const uint64_t u64NanoTS = RTTimeSystemNanoTS(); + const int64_t cNsBehind = u64NanoTS - pSubTimer->u.Std.u64NextTS; + if (cNsBehind <= 0) + ulNextJiffies = uCurJiffies + rtTimerLnxNanoToJiffies(pSubTimer->u.Std.u64NextTS - u64NanoTS); + else if (u64NanoInterval >= RT_NS_1SEC_64 * 2 / HZ) + { + ulNextJiffies = uCurJiffies + rtTimerLnxNanoToJiffies(u64NanoInterval / 2); + if (cNsBehind >= RT_NS_1SEC_64 / HZ / 4) /* Conside if we're lagging too far behind. */ + pSubTimer->u.Std.u64NextTS = u64NanoTS + u64NanoInterval / 2; + } + else + { + ulNextJiffies = uCurJiffies + 1; + if (cNsBehind >= RT_NS_1SEC_64 / HZ / 4) /* Conside if we're lagging too far behind. */ + pSubTimer->u.Std.u64NextTS = u64NanoTS + RT_NS_1SEC_64 / HZ; + } + pSubTimer->u.Std.ulNextJiffies = ulNextJiffies; + } + + /* + * Run the timer and re-arm it unless the state changed . + * . + * We must re-arm it afterwards as we're not in a position to undo this . + * operation if for instance someone stopped or destroyed us while we . + * were in the callback. (Linux takes care of any races here.) + */ + pTimer->pfnTimer(pTimer, pTimer->pvUser, iTick); + if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_CALLBACK))) + { +#ifdef CONFIG_SMP + if (pTimer->fSpecificCpu || pTimer->fAllCpus) + { +# if RTLNX_VER_MIN(4,8,0) + mod_timer(&pSubTimer->u.Std.LnxTimer, ulNextJiffies); +# else + mod_timer_pinned(&pSubTimer->u.Std.LnxTimer, ulNextJiffies); +# endif + } + else +#endif + mod_timer(&pSubTimer->u.Std.LnxTimer, ulNextJiffies); + return; + } + } + else + { + /* + * One shot timer, stop it before dispatching it. + * Allow RTTimerStart as well as RTTimerDestroy to be called from + * the callback. + */ + ASMAtomicWriteBool(&pTimer->fSuspended, true); + pTimer->pfnTimer(pTimer, pTimer->pvUser, ++pSubTimer->iTick); + if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_CALLBACK))) + return; + } + + /* + * Some state change occurred while we were in the callback routine. + */ + for (;;) + { + RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pSubTimer->enmState); + switch (enmState) + { + case RTTIMERLNXSTATE_CB_DESTROYING: + rtTimerLnxCallbackDestroy(pTimer, pSubTimer); + return; + + case RTTIMERLNXSTATE_CB_STOPPING: + if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_CB_STOPPING)) + return; + break; + + case RTTIMERLNXSTATE_CB_RESTARTING: + if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_CB_RESTARTING)) + { + uint64_t u64NanoTS; + uint64_t u64NextTS; + unsigned long flFlags; + + spin_lock_irqsave(&pTimer->ChgIntLock, flFlags); + u64NextTS = pSubTimer->uNsRestartAt; + u64NanoTS = RTTimeSystemNanoTS(); + pSubTimer->iTick = 0; + pSubTimer->u.Std.u64NextTS = u64NextTS; + pSubTimer->u.Std.fFirstAfterChg = true; + pSubTimer->u.Std.ulNextJiffies = u64NextTS > u64NanoTS + ? jiffies + rtTimerLnxNanoToJiffies(u64NextTS - u64NanoTS) + : jiffies; + spin_unlock_irqrestore(&pTimer->ChgIntLock, flFlags); + +#ifdef CONFIG_SMP + if (pTimer->fSpecificCpu || pTimer->fAllCpus) + { +# if RTLNX_VER_MIN(4,8,0) + mod_timer(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies); +# else + mod_timer_pinned(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies); +# endif + } + else +#endif + mod_timer(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies); + return; + } + break; + + default: + AssertMsgFailed(("%d\n", enmState)); + return; + } + ASMNopPause(); + } +} + + +#ifdef CONFIG_SMP + +/** + * Per-cpu callback function (RTMpOnAll/RTMpOnSpecific). + * + * @param idCpu The current CPU. + * @param pvUser1 Pointer to the timer. + * @param pvUser2 Pointer to the argument structure. + */ +static DECLCALLBACK(void) rtTimerLnxStartAllOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + PRTTIMERLINUXSTARTONCPUARGS pArgs = (PRTTIMERLINUXSTARTONCPUARGS)pvUser2; + PRTTIMER pTimer = (PRTTIMER)pvUser1; + Assert(idCpu < pTimer->cCpus); + rtTimerLnxStartSubTimer(&pTimer->aSubTimers[idCpu], pArgs->u64Now, pArgs->u64First, true /*fPinned*/, pTimer->fHighRes); +} + + +/** + * Worker for RTTimerStart() that takes care of the ugly bits. + * + * @returns RTTimerStart() return value. + * @param pTimer The timer. + * @param pArgs The argument structure. + */ +static int rtTimerLnxOmniStart(PRTTIMER pTimer, PRTTIMERLINUXSTARTONCPUARGS pArgs) +{ + RTCPUID iCpu; + RTCPUSET OnlineSet; + RTCPUSET OnlineSet2; + int rc2; + + /* + * Prepare all the sub-timers for the startup and then flag the timer + * as a whole as non-suspended, make sure we get them all before + * clearing fSuspended as the MP handler will be waiting on this + * should something happen while we're looping. + */ + RTSpinlockAcquire(pTimer->hSpinlock); + + /* Just make it a omni timer restriction that no stop/start races are allowed. */ + for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++) + if (rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState) != RTTIMERLNXSTATE_STOPPED) + { + RTSpinlockRelease(pTimer->hSpinlock); + return VERR_TIMER_BUSY; + } + + do + { + RTMpGetOnlineSet(&OnlineSet); + for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++) + { + Assert(pTimer->aSubTimers[iCpu].enmState != RTTIMERLNXSTATE_MP_STOPPING); + rtTimerLnxSetState(&pTimer->aSubTimers[iCpu].enmState, + RTCpuSetIsMember(&OnlineSet, iCpu) + ? RTTIMERLNXSTATE_STARTING + : RTTIMERLNXSTATE_STOPPED); + } + } while (!RTCpuSetIsEqual(&OnlineSet, RTMpGetOnlineSet(&OnlineSet2))); + + ASMAtomicWriteBool(&pTimer->fSuspended, false); + + RTSpinlockRelease(pTimer->hSpinlock); + + /* + * Start them (can't find any exported function that allows me to + * do this without the cross calls). + */ + pArgs->u64Now = RTTimeSystemNanoTS(); + rc2 = RTMpOnAll(rtTimerLnxStartAllOnCpu, pTimer, pArgs); + AssertRC(rc2); /* screw this if it fails. */ + + /* + * Reset the sub-timers who didn't start up (ALL CPUs case). + */ + RTSpinlockAcquire(pTimer->hSpinlock); + + for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++) + if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[iCpu].enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_STARTING)) + { + /** @todo very odd case for a rainy day. Cpus that temporarily went offline while + * we were between calls needs to nudged as the MP handler will ignore events for + * them because of the STARTING state. This is an extremely unlikely case - not that + * that means anything in my experience... ;-) */ + RTTIMERLNX_LOG(("what!? iCpu=%u -> didn't start\n", iCpu)); + } + + RTSpinlockRelease(pTimer->hSpinlock); + + return VINF_SUCCESS; +} + + +/** + * Worker for RTTimerStop() that takes care of the ugly SMP bits. + * + * @returns true if there was any active callbacks, false if not. + * @param pTimer The timer (valid). + * @param fForDestroy Whether this is for RTTimerDestroy or not. + */ +static bool rtTimerLnxOmniStop(PRTTIMER pTimer, bool fForDestroy) +{ + bool fActiveCallbacks = false; + RTCPUID iCpu; + RTTIMERLNXSTATE enmState; + + + /* + * Mark the timer as suspended and flag all timers as stopping, except + * for those being stopped by an MP event. + */ + RTSpinlockAcquire(pTimer->hSpinlock); + + ASMAtomicWriteBool(&pTimer->fSuspended, true); + for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++) + { + for (;;) + { + enmState = rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState); + if ( enmState == RTTIMERLNXSTATE_STOPPED + || enmState == RTTIMERLNXSTATE_MP_STOPPING) + break; + if ( enmState == RTTIMERLNXSTATE_CALLBACK + || enmState == RTTIMERLNXSTATE_CB_STOPPING + || enmState == RTTIMERLNXSTATE_CB_RESTARTING) + { + Assert(enmState != RTTIMERLNXSTATE_CB_STOPPING || fForDestroy); + if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[iCpu].enmState, + !fForDestroy ? RTTIMERLNXSTATE_CB_STOPPING : RTTIMERLNXSTATE_CB_DESTROYING, + enmState)) + { + fActiveCallbacks = true; + break; + } + } + else + { + Assert(enmState == RTTIMERLNXSTATE_ACTIVE); + if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[iCpu].enmState, RTTIMERLNXSTATE_STOPPING, enmState)) + break; + } + ASMNopPause(); + } + } + + RTSpinlockRelease(pTimer->hSpinlock); + + /* + * Do the actual stopping. Fortunately, this doesn't require any IPIs. + * Unfortunately it cannot be done synchronously. + */ + for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++) + if (rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState) == RTTIMERLNXSTATE_STOPPING) + rtTimerLnxStopSubTimer(&pTimer->aSubTimers[iCpu], pTimer->fHighRes); + + return fActiveCallbacks; +} + + +/** + * Per-cpu callback function (RTMpOnSpecific) used by rtTimerLinuxMpEvent() + * to start a sub-timer on a cpu that just have come online. + * + * @param idCpu The current CPU. + * @param pvUser1 Pointer to the timer. + * @param pvUser2 Pointer to the argument structure. + */ +static DECLCALLBACK(void) rtTimerLinuxMpStartOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + PRTTIMERLINUXSTARTONCPUARGS pArgs = (PRTTIMERLINUXSTARTONCPUARGS)pvUser2; + PRTTIMER pTimer = (PRTTIMER)pvUser1; + RTSPINLOCK hSpinlock; + Assert(idCpu < pTimer->cCpus); + + /* + * We have to be kind of careful here as we might be racing RTTimerStop + * (and/or RTTimerDestroy, thus the paranoia. + */ + hSpinlock = pTimer->hSpinlock; + if ( hSpinlock != NIL_RTSPINLOCK + && pTimer->u32Magic == RTTIMER_MAGIC) + { + RTSpinlockAcquire(hSpinlock); + + if ( !ASMAtomicUoReadBool(&pTimer->fSuspended) + && pTimer->u32Magic == RTTIMER_MAGIC) + { + /* We're sane and the timer is not suspended yet. */ + PRTTIMERLNXSUBTIMER pSubTimer = &pTimer->aSubTimers[idCpu]; + if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_MP_STARTING, RTTIMERLNXSTATE_STOPPED)) + rtTimerLnxStartSubTimer(pSubTimer, pArgs->u64Now, pArgs->u64First, true /*fPinned*/, pTimer->fHighRes); + } + + RTSpinlockRelease(hSpinlock); + } +} + + +/** + * MP event notification callback. + * + * @param enmEvent The event. + * @param idCpu The cpu it applies to. + * @param pvUser The timer. + */ +static DECLCALLBACK(void) rtTimerLinuxMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser) +{ + PRTTIMER pTimer = (PRTTIMER)pvUser; + PRTTIMERLNXSUBTIMER pSubTimer = &pTimer->aSubTimers[idCpu]; + RTSPINLOCK hSpinlock; + + Assert(idCpu < pTimer->cCpus); + + /* + * Some initial paranoia. + */ + if (pTimer->u32Magic != RTTIMER_MAGIC) + return; + hSpinlock = pTimer->hSpinlock; + if (hSpinlock == NIL_RTSPINLOCK) + return; + + RTSpinlockAcquire(hSpinlock); + + /* Is it active? */ + if ( !ASMAtomicUoReadBool(&pTimer->fSuspended) + && pTimer->u32Magic == RTTIMER_MAGIC) + { + switch (enmEvent) + { + /* + * Try do it without leaving the spin lock, but if we have to, retake it + * when we're on the right cpu. + */ + case RTMPEVENT_ONLINE: + if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_MP_STARTING, RTTIMERLNXSTATE_STOPPED)) + { + RTTIMERLINUXSTARTONCPUARGS Args; + Args.u64Now = RTTimeSystemNanoTS(); + Args.u64First = 0; + + if (RTMpCpuId() == idCpu) + rtTimerLnxStartSubTimer(pSubTimer, Args.u64Now, Args.u64First, true /*fPinned*/, pTimer->fHighRes); + else + { + rtTimerLnxSetState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED); /* we'll recheck it. */ + RTSpinlockRelease(hSpinlock); + + RTMpOnSpecific(idCpu, rtTimerLinuxMpStartOnCpu, pTimer, &Args); + return; /* we've left the spinlock */ + } + } + break; + + /* + * The CPU is (going) offline, make sure the sub-timer is stopped. + * + * Linux will migrate it to a different CPU, but we don't want this. The + * timer function is checking for this. + */ + case RTMPEVENT_OFFLINE: + { + RTTIMERLNXSTATE enmState; + while ( (enmState = rtTimerLnxGetState(&pSubTimer->enmState)) == RTTIMERLNXSTATE_ACTIVE + || enmState == RTTIMERLNXSTATE_CALLBACK + || enmState == RTTIMERLNXSTATE_CB_RESTARTING) + { + if (enmState == RTTIMERLNXSTATE_ACTIVE) + { + if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_MP_STOPPING, RTTIMERLNXSTATE_ACTIVE)) + { + RTSpinlockRelease(hSpinlock); + + rtTimerLnxStopSubTimer(pSubTimer, pTimer->fHighRes); + return; /* we've left the spinlock */ + } + } + else if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_CB_STOPPING, enmState)) + break; + + /* State not stable, try again. */ + ASMNopPause(); + } + break; + } + } + } + + RTSpinlockRelease(hSpinlock); +} + +#endif /* CONFIG_SMP */ + + +/** + * Callback function use by RTTimerStart via RTMpOnSpecific to start a timer + * running on a specific CPU. + * + * @param idCpu The current CPU. + * @param pvUser1 Pointer to the timer. + * @param pvUser2 Pointer to the argument structure. + */ +static DECLCALLBACK(void) rtTimerLnxStartOnSpecificCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + PRTTIMERLINUXSTARTONCPUARGS pArgs = (PRTTIMERLINUXSTARTONCPUARGS)pvUser2; + PRTTIMER pTimer = (PRTTIMER)pvUser1; + RT_NOREF_PV(idCpu); + rtTimerLnxStartSubTimer(&pTimer->aSubTimers[0], pArgs->u64Now, pArgs->u64First, true /*fPinned*/, pTimer->fHighRes); +} + + +RTDECL(int) RTTimerStart(PRTTIMER pTimer, uint64_t u64First) +{ + RTTIMERLINUXSTARTONCPUARGS Args; + int rc2; + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate. + */ + AssertPtrReturn(pTimer, VERR_INVALID_HANDLE); + AssertReturn(pTimer->u32Magic == RTTIMER_MAGIC, VERR_INVALID_HANDLE); + + if (!ASMAtomicUoReadBool(&pTimer->fSuspended)) + return VERR_TIMER_ACTIVE; + RTTIMERLNX_LOG(("start %p cCpus=%d\n", pTimer, pTimer->cCpus)); + + Args.u64First = u64First; +#ifdef CONFIG_SMP + /* + * Omni timer? + */ + if (pTimer->fAllCpus) + { + rc2 = rtTimerLnxOmniStart(pTimer, &Args); + IPRT_LINUX_RESTORE_EFL_AC(); + return rc2; + } +#endif + + /* + * Simple timer - Pretty straight forward if it wasn't for restarting. + */ + Args.u64Now = RTTimeSystemNanoTS(); + ASMAtomicWriteU64(&pTimer->aSubTimers[0].uNsRestartAt, Args.u64Now + u64First); + for (;;) + { + RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pTimer->aSubTimers[0].enmState); + switch (enmState) + { + case RTTIMERLNXSTATE_STOPPED: + if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[0].enmState, RTTIMERLNXSTATE_STARTING, RTTIMERLNXSTATE_STOPPED)) + { + ASMAtomicWriteBool(&pTimer->fSuspended, false); + if (!pTimer->fSpecificCpu) + rtTimerLnxStartSubTimer(&pTimer->aSubTimers[0], Args.u64Now, Args.u64First, + false /*fPinned*/, pTimer->fHighRes); + else + { + rc2 = RTMpOnSpecific(pTimer->idCpu, rtTimerLnxStartOnSpecificCpu, pTimer, &Args); + if (RT_FAILURE(rc2)) + { + /* Suspend it, the cpu id is probably invalid or offline. */ + ASMAtomicWriteBool(&pTimer->fSuspended, true); + rtTimerLnxSetState(&pTimer->aSubTimers[0].enmState, RTTIMERLNXSTATE_STOPPED); + return rc2; + } + } + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; + } + break; + + case RTTIMERLNXSTATE_CALLBACK: + case RTTIMERLNXSTATE_CB_STOPPING: + if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[0].enmState, RTTIMERLNXSTATE_CB_RESTARTING, enmState)) + { + ASMAtomicWriteBool(&pTimer->fSuspended, false); + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; + } + break; + + default: + AssertMsgFailed(("%d\n", enmState)); + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_INTERNAL_ERROR_4; + } + ASMNopPause(); + } +} +RT_EXPORT_SYMBOL(RTTimerStart); + + +/** + * Common worker for RTTimerStop and RTTimerDestroy. + * + * @returns true if there was any active callbacks, false if not. + * @param pTimer The timer to stop. + * @param fForDestroy Whether it's RTTimerDestroy calling or not. + */ +static bool rtTimerLnxStop(PRTTIMER pTimer, bool fForDestroy) +{ + RTTIMERLNX_LOG(("lnxstop %p %d\n", pTimer, fForDestroy)); +#ifdef CONFIG_SMP + /* + * Omni timer? + */ + if (pTimer->fAllCpus) + return rtTimerLnxOmniStop(pTimer, fForDestroy); +#endif + + /* + * Simple timer. + */ + ASMAtomicWriteBool(&pTimer->fSuspended, true); + for (;;) + { + RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pTimer->aSubTimers[0].enmState); + switch (enmState) + { + case RTTIMERLNXSTATE_ACTIVE: + if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[0].enmState, RTTIMERLNXSTATE_STOPPING, RTTIMERLNXSTATE_ACTIVE)) + { + rtTimerLnxStopSubTimer(&pTimer->aSubTimers[0], pTimer->fHighRes); + return false; + } + break; + + case RTTIMERLNXSTATE_CALLBACK: + case RTTIMERLNXSTATE_CB_RESTARTING: + case RTTIMERLNXSTATE_CB_STOPPING: + Assert(enmState != RTTIMERLNXSTATE_CB_STOPPING || fForDestroy); + if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[0].enmState, + !fForDestroy ? RTTIMERLNXSTATE_CB_STOPPING : RTTIMERLNXSTATE_CB_DESTROYING, + enmState)) + return true; + break; + + case RTTIMERLNXSTATE_STOPPED: + return VINF_SUCCESS; + + case RTTIMERLNXSTATE_CB_DESTROYING: + AssertMsgFailed(("enmState=%d pTimer=%p\n", enmState, pTimer)); + return true; + + default: + case RTTIMERLNXSTATE_STARTING: + case RTTIMERLNXSTATE_MP_STARTING: + case RTTIMERLNXSTATE_STOPPING: + case RTTIMERLNXSTATE_MP_STOPPING: + AssertMsgFailed(("enmState=%d pTimer=%p\n", enmState, pTimer)); + return false; + } + + /* State not stable, try again. */ + ASMNopPause(); + } +} + + +RTDECL(int) RTTimerStop(PRTTIMER pTimer) +{ + /* + * Validate. + */ + IPRT_LINUX_SAVE_EFL_AC(); + AssertPtrReturn(pTimer, VERR_INVALID_HANDLE); + AssertReturn(pTimer->u32Magic == RTTIMER_MAGIC, VERR_INVALID_HANDLE); + RTTIMERLNX_LOG(("stop %p\n", pTimer)); + + if (ASMAtomicUoReadBool(&pTimer->fSuspended)) + return VERR_TIMER_SUSPENDED; + + rtTimerLnxStop(pTimer, false /*fForDestroy*/); + + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTTimerStop); + + +RTDECL(int) RTTimerChangeInterval(PRTTIMER pTimer, uint64_t u64NanoInterval) +{ + unsigned long cJiffies; + unsigned long flFlags; + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate. + */ + AssertPtrReturn(pTimer, VERR_INVALID_HANDLE); + AssertReturn(pTimer->u32Magic == RTTIMER_MAGIC, VERR_INVALID_HANDLE); + AssertReturn(u64NanoInterval, VERR_INVALID_PARAMETER); + AssertReturn(u64NanoInterval < UINT64_MAX / 8, VERR_INVALID_PARAMETER); + AssertReturn(pTimer->u64NanoInterval, VERR_INVALID_STATE); + RTTIMERLNX_LOG(("change %p %llu\n", pTimer, u64NanoInterval)); + +#ifdef RTTIMER_LINUX_WITH_HRTIMER + /* + * For the high resolution timers it is easy since we don't care so much + * about when it is applied to the sub-timers. + */ + if (pTimer->fHighRes) + { + ASMAtomicWriteU64(&pTimer->u64NanoInterval, u64NanoInterval); + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; + } +#endif + + /* + * Standard timers have a bit more complicated way of calculating + * their interval and such. So, forget omni timers for now. + */ + if (pTimer->cCpus > 1) + return VERR_NOT_SUPPORTED; + + cJiffies = u64NanoInterval / (RT_NS_1SEC / HZ); + if (cJiffies * (RT_NS_1SEC / HZ) != u64NanoInterval) + cJiffies = 0; + + spin_lock_irqsave(&pTimer->ChgIntLock, flFlags); + pTimer->aSubTimers[0].u.Std.fFirstAfterChg = true; + pTimer->cJiffies = cJiffies; + ASMAtomicWriteU64(&pTimer->u64NanoInterval, u64NanoInterval); + spin_unlock_irqrestore(&pTimer->ChgIntLock, flFlags); + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTTimerChangeInterval); + + +RTDECL(int) RTTimerDestroy(PRTTIMER pTimer) +{ + bool fCanDestroy; + IPRT_LINUX_SAVE_EFL_AC(); + + /* + * Validate. It's ok to pass NULL pointer. + */ + if (pTimer == /*NIL_RTTIMER*/ NULL) + return VINF_SUCCESS; + AssertPtrReturn(pTimer, VERR_INVALID_HANDLE); + AssertReturn(pTimer->u32Magic == RTTIMER_MAGIC, VERR_INVALID_HANDLE); + RTTIMERLNX_LOG(("destroy %p\n", pTimer)); +/** @todo We should invalidate the magic here! */ + + /* + * Stop the timer if it's still active, then destroy it if we can. + */ + if (!ASMAtomicUoReadBool(&pTimer->fSuspended)) + fCanDestroy = rtTimerLnxStop(pTimer, true /*fForDestroy*/); + else + { + uint32_t iCpu = pTimer->cCpus; + if (pTimer->cCpus > 1) + RTSpinlockAcquire(pTimer->hSpinlock); + + fCanDestroy = true; + while (iCpu-- > 0) + { + for (;;) + { + RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState); + switch (enmState) + { + case RTTIMERLNXSTATE_CALLBACK: + case RTTIMERLNXSTATE_CB_RESTARTING: + case RTTIMERLNXSTATE_CB_STOPPING: + if (!rtTimerLnxCmpXchgState(&pTimer->aSubTimers[iCpu].enmState, RTTIMERLNXSTATE_CB_DESTROYING, enmState)) + continue; + fCanDestroy = false; + break; + + case RTTIMERLNXSTATE_CB_DESTROYING: + AssertMsgFailed(("%d\n", enmState)); + fCanDestroy = false; + break; + default: + break; + } + break; + } + } + + if (pTimer->cCpus > 1) + RTSpinlockRelease(pTimer->hSpinlock); + } + + if (fCanDestroy) + { + /* For paranoid reasons, defer actually destroying the semaphore when + in atomic or interrupt context. */ +#if RTLNX_VER_MIN(2,5,32) + if (in_atomic() || in_interrupt()) +#else + if (in_interrupt()) +#endif + rtR0LnxWorkqueuePush(&pTimer->DtorWorkqueueItem, rtTimerLnxDestroyDeferred); + else + rtTimerLnxDestroyIt(pTimer); + } + + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTTimerDestroy); + + +RTDECL(int) RTTimerCreateEx(PRTTIMER *ppTimer, uint64_t u64NanoInterval, uint32_t fFlags, PFNRTTIMER pfnTimer, void *pvUser) +{ + PRTTIMER pTimer; + RTCPUID iCpu; + unsigned cCpus; + int rc; + IPRT_LINUX_SAVE_EFL_AC(); + + rtR0LnxWorkqueueFlush(); /* for 2.4 */ + *ppTimer = NULL; + + /* + * Validate flags. + */ + if (!RTTIMER_FLAGS_ARE_VALID(fFlags)) + { + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_INVALID_PARAMETER; + } + if ( (fFlags & RTTIMER_FLAGS_CPU_SPECIFIC) + && (fFlags & RTTIMER_FLAGS_CPU_ALL) != RTTIMER_FLAGS_CPU_ALL + && !RTMpIsCpuPossible(RTMpCpuIdFromSetIndex(fFlags & RTTIMER_FLAGS_CPU_MASK))) + { + IPRT_LINUX_RESTORE_EFL_AC(); + return VERR_CPU_NOT_FOUND; + } + + /* + * Allocate the timer handler. + */ + cCpus = 1; +#ifdef CONFIG_SMP + if ((fFlags & RTTIMER_FLAGS_CPU_ALL) == RTTIMER_FLAGS_CPU_ALL) + { + cCpus = RTMpGetMaxCpuId() + 1; + Assert(cCpus <= RTCPUSET_MAX_CPUS); /* On linux we have a 1:1 relationship between cpuid and set index. */ + AssertReturnStmt(u64NanoInterval, IPRT_LINUX_RESTORE_EFL_AC(), VERR_NOT_IMPLEMENTED); /* We don't implement single shot on all cpus, sorry. */ + } +#endif + + rc = RTMemAllocEx(RT_UOFFSETOF_DYN(RTTIMER, aSubTimers[cCpus]), 0, + RTMEMALLOCEX_FLAGS_ZEROED | RTMEMALLOCEX_FLAGS_ANY_CTX_FREE, (void **)&pTimer); + if (RT_FAILURE(rc)) + { + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; + } + + /* + * Initialize it. + */ + pTimer->u32Magic = RTTIMER_MAGIC; + pTimer->hSpinlock = NIL_RTSPINLOCK; + pTimer->fSuspended = true; + pTimer->fHighRes = !!(fFlags & RTTIMER_FLAGS_HIGH_RES); +#ifdef CONFIG_SMP + pTimer->fSpecificCpu = (fFlags & RTTIMER_FLAGS_CPU_SPECIFIC) && (fFlags & RTTIMER_FLAGS_CPU_ALL) != RTTIMER_FLAGS_CPU_ALL; + pTimer->fAllCpus = (fFlags & RTTIMER_FLAGS_CPU_ALL) == RTTIMER_FLAGS_CPU_ALL; + pTimer->idCpu = pTimer->fSpecificCpu + ? RTMpCpuIdFromSetIndex(fFlags & RTTIMER_FLAGS_CPU_MASK) + : NIL_RTCPUID; +#else + pTimer->fSpecificCpu = !!(fFlags & RTTIMER_FLAGS_CPU_SPECIFIC); + pTimer->idCpu = RTMpCpuId(); +#endif + pTimer->cCpus = cCpus; + pTimer->pfnTimer = pfnTimer; + pTimer->pvUser = pvUser; + pTimer->u64NanoInterval = u64NanoInterval; + pTimer->cJiffies = u64NanoInterval / (RT_NS_1SEC / HZ); + if (pTimer->cJiffies * (RT_NS_1SEC / HZ) != u64NanoInterval) + pTimer->cJiffies = 0; + spin_lock_init(&pTimer->ChgIntLock); + + for (iCpu = 0; iCpu < cCpus; iCpu++) + { +#ifdef RTTIMER_LINUX_WITH_HRTIMER + if (pTimer->fHighRes) + { + hrtimer_init(&pTimer->aSubTimers[iCpu].u.Hr.LnxTimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + pTimer->aSubTimers[iCpu].u.Hr.LnxTimer.function = rtTimerLinuxHrCallback; + } + else +#endif + { +#if RTLNX_VER_MIN(4,15,0) + timer_setup(&pTimer->aSubTimers[iCpu].u.Std.LnxTimer, rtTimerLinuxStdCallback, TIMER_PINNED); +#elif RTLNX_VER_MIN(4,8,0) + init_timer_pinned(&pTimer->aSubTimers[iCpu].u.Std.LnxTimer); +#else + init_timer(&pTimer->aSubTimers[iCpu].u.Std.LnxTimer); +#endif +#if RTLNX_VER_MAX(4,15,0) + pTimer->aSubTimers[iCpu].u.Std.LnxTimer.data = (unsigned long)&pTimer->aSubTimers[iCpu]; + pTimer->aSubTimers[iCpu].u.Std.LnxTimer.function = rtTimerLinuxStdCallback; +#endif + pTimer->aSubTimers[iCpu].u.Std.LnxTimer.expires = jiffies; + pTimer->aSubTimers[iCpu].u.Std.u64NextTS = 0; + } + pTimer->aSubTimers[iCpu].iTick = 0; + pTimer->aSubTimers[iCpu].pParent = pTimer; + pTimer->aSubTimers[iCpu].enmState = RTTIMERLNXSTATE_STOPPED; + } + +#ifdef CONFIG_SMP + /* + * If this is running on ALL cpus, we'll have to register a callback + * for MP events (so timers can be started/stopped on cpus going + * online/offline). We also create the spinlock for synchronizing + * stop/start/mp-event. + */ + if (cCpus > 1) + { + int rc = RTSpinlockCreate(&pTimer->hSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_SAFE, "RTTimerLnx"); + if (RT_SUCCESS(rc)) + rc = RTMpNotificationRegister(rtTimerLinuxMpEvent, pTimer); + else + pTimer->hSpinlock = NIL_RTSPINLOCK; + if (RT_FAILURE(rc)) + { + RTTimerDestroy(pTimer); + IPRT_LINUX_RESTORE_EFL_AC(); + return rc; + } + } +#endif /* CONFIG_SMP */ + + RTTIMERLNX_LOG(("create %p hires=%d fFlags=%#x cCpus=%u\n", pTimer, pTimer->fHighRes, fFlags, cCpus)); + *ppTimer = pTimer; + IPRT_LINUX_RESTORE_EFL_AC(); + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTTimerCreateEx); + + +RTDECL(uint32_t) RTTimerGetSystemGranularity(void) +{ +#if 0 /** @todo Not sure if this is what we want or not... Add new API for + * querying the resolution of the high res timers? */ + struct timespec Ts; + int rc; + IPRT_LINUX_SAVE_EFL_AC(); + rc = hrtimer_get_res(CLOCK_MONOTONIC, &Ts); + IPRT_LINUX_RESTORE_EFL_AC(); + if (!rc) + { + Assert(!Ts.tv_sec); + return Ts.tv_nsec; + } +#endif + /* */ +#if RTLNX_VER_MAX(4,9,0) || RTLNX_VER_MIN(4,13,0) + /* On 4.9, 4.10 and 4.12 we've observed tstRTR0Timer failures of the omni timer tests + where we get about half of the ticks we want. The failing test is using this value + as interval. So, this is a very very crude hack to try make omni timers work + correctly without actually knowing what's going wrong... */ + return RT_NS_1SEC * 2 / HZ; /* ns */ +#else + return RT_NS_1SEC / HZ; /* ns */ +#endif +} +RT_EXPORT_SYMBOL(RTTimerGetSystemGranularity); + + +RTDECL(int) RTTimerRequestSystemGranularity(uint32_t u32Request, uint32_t *pu32Granted) +{ + RT_NOREF_PV(u32Request); RT_NOREF_PV(*pu32Granted); + return VERR_NOT_SUPPORTED; +} +RT_EXPORT_SYMBOL(RTTimerRequestSystemGranularity); + + +RTDECL(int) RTTimerReleaseSystemGranularity(uint32_t u32Granted) +{ + RT_NOREF_PV(u32Granted); + return VERR_NOT_SUPPORTED; +} +RT_EXPORT_SYMBOL(RTTimerReleaseSystemGranularity); + + +RTDECL(bool) RTTimerCanDoHighResolution(void) +{ +#ifdef RTTIMER_LINUX_WITH_HRTIMER + return true; +#else + return false; +#endif +} +RT_EXPORT_SYMBOL(RTTimerCanDoHighResolution); + diff --git a/src/VBox/Runtime/r0drv/linux/waitqueue-r0drv-linux.h b/src/VBox/Runtime/r0drv/linux/waitqueue-r0drv-linux.h new file mode 100644 index 00000000..e0f2faf8 --- /dev/null +++ b/src/VBox/Runtime/r0drv/linux/waitqueue-r0drv-linux.h @@ -0,0 +1,292 @@ +/* $Id: waitqueue-r0drv-linux.h $ */ +/** @file + * IPRT - Linux Ring-0 Driver Helpers for Abstracting Wait Queues, + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + +#ifndef IPRT_INCLUDED_SRC_r0drv_linux_waitqueue_r0drv_linux_h +#define IPRT_INCLUDED_SRC_r0drv_linux_waitqueue_r0drv_linux_h +#ifndef RT_WITHOUT_PRAGMA_ONCE +# pragma once +#endif + +#include "the-linux-kernel.h" + +#include <iprt/asm-math.h> +#include <iprt/err.h> +#include <iprt/string.h> +#include <iprt/time.h> + +/** The resolution (nanoseconds) specified when using + * schedule_hrtimeout_range. */ +#define RTR0SEMLNXWAIT_RESOLUTION 50000 + + +/** + * Kernel mode Linux wait state structure. + */ +typedef struct RTR0SEMLNXWAIT +{ + /** The wait queue entry. */ +#if RTLNX_VER_MIN(4,13,0) || RTLNX_SUSE_MAJ_PREREQ(12, 4) || RTLNX_SUSE_MAJ_PREREQ(15, 0) + wait_queue_entry_t WaitQE; +#else + wait_queue_t WaitQE; +#endif + /** The absolute timeout given as nano seconds since the start of the + * monotonic clock. */ + uint64_t uNsAbsTimeout; + /** The timeout in nano seconds relative to the start of the wait. */ + uint64_t cNsRelTimeout; + /** The native timeout value. */ + union + { +#ifdef IPRT_LINUX_HAS_HRTIMER + /** The timeout when fHighRes is true. Absolute, so no updating. */ + ktime_t KtTimeout; +#endif + /** The timeout when fHighRes is false. Updated after waiting. */ + long lTimeout; + } u; + /** Set if we use high resolution timeouts. */ + bool fHighRes; + /** Set if it's an indefinite wait. */ + bool fIndefinite; + /** Set if we've already timed out. + * Set by rtR0SemLnxWaitDoIt and read by rtR0SemLnxWaitHasTimedOut. */ + bool fTimedOut; + /** TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE. */ + int iWaitState; + /** The wait queue. */ + wait_queue_head_t *pWaitQueue; +} RTR0SEMLNXWAIT; +/** Pointer to a linux wait state. */ +typedef RTR0SEMLNXWAIT *PRTR0SEMLNXWAIT; + + +/** + * Initializes a wait. + * + * The caller MUST check the wait condition BEFORE calling this function or the + * timeout logic will be flawed. + * + * @returns VINF_SUCCESS or VERR_TIMEOUT. + * @param pWait The wait structure. + * @param fFlags The wait flags. + * @param uTimeout The timeout. + * @param pWaitQueue The wait queue head. + */ +DECLINLINE(int) rtR0SemLnxWaitInit(PRTR0SEMLNXWAIT pWait, uint32_t fFlags, uint64_t uTimeout, + wait_queue_head_t *pWaitQueue) +{ + /* + * Process the flags and timeout. + */ + if (!(fFlags & RTSEMWAIT_FLAGS_INDEFINITE)) + { +/** @todo optimize: millisecs -> nanosecs -> millisec -> jiffies */ + if (fFlags & RTSEMWAIT_FLAGS_MILLISECS) + uTimeout = uTimeout < UINT64_MAX / RT_US_1SEC * RT_US_1SEC + ? uTimeout * RT_US_1SEC + : UINT64_MAX; + if (uTimeout == UINT64_MAX) + fFlags |= RTSEMWAIT_FLAGS_INDEFINITE; + else + { + uint64_t u64Now; + if (fFlags & RTSEMWAIT_FLAGS_RELATIVE) + { + if (uTimeout == 0) + return VERR_TIMEOUT; + + u64Now = RTTimeSystemNanoTS(); + pWait->cNsRelTimeout = uTimeout; + pWait->uNsAbsTimeout = u64Now + uTimeout; + if (pWait->uNsAbsTimeout < u64Now) /* overflow */ + fFlags |= RTSEMWAIT_FLAGS_INDEFINITE; + } + else + { + u64Now = RTTimeSystemNanoTS(); + if (u64Now >= uTimeout) + return VERR_TIMEOUT; + + pWait->cNsRelTimeout = uTimeout - u64Now; + pWait->uNsAbsTimeout = uTimeout; + } + } + } + + if (!(fFlags & RTSEMWAIT_FLAGS_INDEFINITE)) + { + pWait->fIndefinite = false; +#ifdef IPRT_LINUX_HAS_HRTIMER + if ( (fFlags & (RTSEMWAIT_FLAGS_NANOSECS | RTSEMWAIT_FLAGS_ABSOLUTE)) + || pWait->cNsRelTimeout < RT_NS_1SEC / HZ * 4) + { + pWait->fHighRes = true; +# if BITS_PER_LONG < 64 + if ( KTIME_SEC_MAX <= LONG_MAX + && pWait->uNsAbsTimeout >= KTIME_SEC_MAX * RT_NS_1SEC_64 + (RT_NS_1SEC - 1)) + fFlags |= RTSEMWAIT_FLAGS_INDEFINITE; + else +# endif + pWait->u.KtTimeout = ns_to_ktime(pWait->uNsAbsTimeout); + } + else +#endif + { + uint64_t cJiffies = ASMMultU64ByU32DivByU32(pWait->cNsRelTimeout, HZ, RT_NS_1SEC); + if (cJiffies >= MAX_JIFFY_OFFSET) + fFlags |= RTSEMWAIT_FLAGS_INDEFINITE; + else + { + pWait->u.lTimeout = (long)cJiffies; + pWait->fHighRes = false; + } + } + } + + if (fFlags & RTSEMWAIT_FLAGS_INDEFINITE) + { + pWait->fIndefinite = true; + pWait->fHighRes = false; + pWait->uNsAbsTimeout = UINT64_MAX; + pWait->cNsRelTimeout = UINT64_MAX; + pWait->u.lTimeout = LONG_MAX; + } + + pWait->fTimedOut = false; + + /* + * Initialize the wait queue related bits. + */ +#if RTLNX_VER_MIN(2,5,39) + init_wait((&pWait->WaitQE)); +#else + RT_ZERO(pWait->WaitQE); + init_waitqueue_entry((&pWait->WaitQE), current); +#endif + pWait->pWaitQueue = pWaitQueue; + pWait->iWaitState = fFlags & RTSEMWAIT_FLAGS_INTERRUPTIBLE + ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; + + return VINF_SUCCESS; +} + + +/** + * Prepares the next wait. + * + * This must be called before rtR0SemLnxWaitDoIt, and the caller should check + * the exit conditions in-between the two calls. + * + * @param pWait The wait structure. + */ +DECLINLINE(void) rtR0SemLnxWaitPrepare(PRTR0SEMLNXWAIT pWait) +{ + /* Make everything thru schedule*() atomic scheduling wise. (Is this correct?) */ + prepare_to_wait(pWait->pWaitQueue, &pWait->WaitQE, pWait->iWaitState); +} + + +/** + * Do the actual wait. + * + * @param pWait The wait structure. + */ +DECLINLINE(void) rtR0SemLnxWaitDoIt(PRTR0SEMLNXWAIT pWait) +{ + if (pWait->fIndefinite) + schedule(); +#ifdef IPRT_LINUX_HAS_HRTIMER + else if (pWait->fHighRes) + { + int rc = schedule_hrtimeout_range(&pWait->u.KtTimeout, HRTIMER_MODE_ABS, RTR0SEMLNXWAIT_RESOLUTION); + if (!rc) + pWait->fTimedOut = true; + } +#endif + else + { + pWait->u.lTimeout = schedule_timeout(pWait->u.lTimeout); + if (pWait->u.lTimeout <= 0) + pWait->fTimedOut = true; + } + after_wait((&pWait->WaitQE)); +} + + +/** + * Checks if a linux wait was interrupted. + * + * @returns true / false + * @param pWait The wait structure. + * @remarks This shall be called before the first rtR0SemLnxWaitDoIt(). + */ +DECLINLINE(bool) rtR0SemLnxWaitWasInterrupted(PRTR0SEMLNXWAIT pWait) +{ + return pWait->iWaitState == TASK_INTERRUPTIBLE + && signal_pending(current); +} + + +/** + * Checks if a linux wait has timed out. + * + * @returns true / false + * @param pWait The wait structure. + */ +DECLINLINE(bool) rtR0SemLnxWaitHasTimedOut(PRTR0SEMLNXWAIT pWait) +{ + return pWait->fTimedOut; +} + + +/** + * Deletes a linux wait. + * + * @param pWait The wait structure. + */ +DECLINLINE(void) rtR0SemLnxWaitDelete(PRTR0SEMLNXWAIT pWait) +{ + finish_wait(pWait->pWaitQueue, &pWait->WaitQE); +} + + +/** + * Gets the max resolution of the timeout machinery. + * + * @returns Resolution specified in nanoseconds. + */ +DECLINLINE(uint32_t) rtR0SemLnxWaitGetResolution(void) +{ +#ifdef IPRT_LINUX_HAS_HRTIMER + return RTR0SEMLNXWAIT_RESOLUTION; +#else + return RT_NS_1SEC / HZ; /* ns */ +#endif +} + +#endif /* !IPRT_INCLUDED_SRC_r0drv_linux_waitqueue_r0drv_linux_h */ + |