diff options
Diffstat (limited to 'src/VBox/Runtime/r3/linux')
21 files changed, 6392 insertions, 0 deletions
diff --git a/src/VBox/Runtime/r3/linux/Makefile.kup b/src/VBox/Runtime/r3/linux/Makefile.kup new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/VBox/Runtime/r3/linux/Makefile.kup diff --git a/src/VBox/Runtime/r3/linux/RTFileCopyPartEx-linux.cpp b/src/VBox/Runtime/r3/linux/RTFileCopyPartEx-linux.cpp new file mode 100644 index 00000000..198df368 --- /dev/null +++ b/src/VBox/Runtime/r3/linux/RTFileCopyPartEx-linux.cpp @@ -0,0 +1,186 @@ +/* $Id: RTFileCopyPartEx-linux.cpp $ */ +/** @file + * IPRT - RTFileCopyPartEx, linux specific implementation. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. 
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include <iprt/file.h> +#include "internal/iprt.h" + +#include <iprt/asm.h> +#include <iprt/assert.h> +#include <iprt/err.h> + +#include <errno.h> +#include <unistd.h> +#include <sys/syscall.h> + +#ifndef __NR_copy_file_range +# if defined(RT_ARCH_X86) +# define __NR_copy_file_range 377 +# elif defined(RT_ARCH_AMD64) +# define __NR_copy_file_range 326 +# endif +#endif + + +#ifndef __NR_copy_file_range +# include "../../generic/RTFileCopyPartEx-generic.cpp" +#else /* __NR_copy_file_range - whole file */ +/* Include the generic code as a fallback since copy_file_range is rather new . */ +# define IPRT_FALLBACK_VERSION +# include "../../generic/RTFileCopyPartEx-generic.cpp" +# undef IPRT_FALLBACK_VERSION + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +static int32_t volatile g_fCopyFileRangeSupported = -1; + + +DECLINLINE(loff_t) +MyCopyFileRangeSysCall(int fdIn, loff_t *poffIn, int fdOut, loff_t *poffOut, size_t cbChunk, unsigned int fFlags) +{ + return syscall(__NR_copy_file_range, fdIn, poffIn, fdOut, poffOut, cbChunk, fFlags); +} + + +DECL_NO_INLINE(static, bool) HasCopyFileRangeSyscallSlow(void) +{ + errno = 0; + MyCopyFileRangeSysCall(-1, NULL, -1, NULL, 4096, 0); + if (errno != ENOSYS) + { + ASMAtomicWriteS32(&g_fCopyFileRangeSupported, 1); + return true; + } + ASMAtomicWriteS32(&g_fCopyFileRangeSupported, 0); + return false; +} + +DECLINLINE(bool) HasCopyFileRangeSyscall(void) +{ + int32_t i = 
ASMAtomicUoReadS32(&g_fCopyFileRangeSupported); + if (i != -1) + return i == 1; + return HasCopyFileRangeSyscallSlow(); +} + + + +RTDECL(int) RTFileCopyPartPrep(PRTFILECOPYPARTBUFSTATE pBufState, uint64_t cbToCopy) +{ + if (HasCopyFileRangeSyscall()) + { + pBufState->iAllocType = -42; + pBufState->pbBuf = NULL; + pBufState->cbBuf = 0; + pBufState->uMagic = RTFILECOPYPARTBUFSTATE_MAGIC; + return VINF_SUCCESS; + } + return rtFileCopyPartPrepFallback(pBufState, cbToCopy); +} + + +RTDECL(void) RTFileCopyPartCleanup(PRTFILECOPYPARTBUFSTATE pBufState) +{ + return rtFileCopyPartCleanupFallback(pBufState); +} + + +RTDECL(int) RTFileCopyPartEx(RTFILE hFileSrc, RTFOFF offSrc, RTFILE hFileDst, RTFOFF offDst, uint64_t cbToCopy, + uint32_t fFlags, PRTFILECOPYPARTBUFSTATE pBufState, uint64_t *pcbCopied) +{ + /* + * Validate input. + */ + if (pcbCopied) + *pcbCopied = 0; + AssertReturn(pBufState->uMagic == RTFILECOPYPARTBUFSTATE_MAGIC, VERR_INVALID_FLAGS); + if (pBufState->iAllocType == -42) + { /* more and more likely as time goes */ } + else + return rtFileCopyPartExFallback(hFileSrc, offSrc, hFileDst, offDst, cbToCopy, fFlags, pBufState, pcbCopied); + AssertReturn(offSrc >= 0, VERR_NEGATIVE_SEEK); + AssertReturn(offDst >= 0, VERR_NEGATIVE_SEEK); + AssertReturn(!fFlags, VERR_INVALID_FLAGS); + + /* + * If nothing to copy, return right away. + */ + if (!cbToCopy) + return VINF_SUCCESS; + + /* + * Do the copying. + */ + uint64_t cbCopied = 0; + int rc = VINF_SUCCESS; + do + { + size_t cbThisCopy = (size_t)RT_MIN(cbToCopy - cbCopied, _1G); + loff_t offThisDst = offSrc + cbCopied; + loff_t offThisSrc = offDst + cbCopied; + ssize_t cbActual = MyCopyFileRangeSysCall((int)RTFileToNative(hFileSrc), &offThisSrc, + (int)RTFileToNative(hFileDst), &offThisDst, + cbThisCopy, 0); + if (cbActual < 0) + { + rc = errno; + Assert(rc != 0); + rc = rc != 0 ? 
RTErrConvertFromErrno(rc) : VERR_READ_ERROR; + if (rc != VERR_NOT_SAME_DEVICE || cbCopied != 0) + break; + + /* Fall back to generic implementation if the syscall refuses to handle the case. */ + rc = rtFileCopyPartPrepFallback(pBufState, cbToCopy); + if (RT_SUCCESS(rc)) + return rtFileCopyPartExFallback(hFileSrc, offSrc, hFileDst, offDst, cbToCopy, fFlags, pBufState, pcbCopied); + return rc; + } + Assert(offThisSrc == offSrc + (int64_t)cbCopied + cbActual); + Assert(offThisDst == offDst + (int64_t)cbCopied + cbActual); + + if (cbActual == 0) + { + if (!pcbCopied) + rc = VERR_EOF; + break; + } + + cbCopied += cbActual; + } while (cbCopied < cbToCopy); + + if (pcbCopied) + *pcbCopied = cbCopied; + + return rc; +} + +#endif /* __NR_copy_file_range */ + diff --git a/src/VBox/Runtime/r3/linux/RTFileSetAllocationSize-linux.cpp b/src/VBox/Runtime/r3/linux/RTFileSetAllocationSize-linux.cpp new file mode 100644 index 00000000..ce042eeb --- /dev/null +++ b/src/VBox/Runtime/r3/linux/RTFileSetAllocationSize-linux.cpp @@ -0,0 +1,77 @@ +/* $Id: RTFileSetAllocationSize-linux.cpp $ */ +/** @file + * IPRT - RTFileSetAllocationSize, linux implementation. + */ + +/* + * Copyright (C) 2016-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_FILE +#include <iprt/file.h> +#include "internal/iprt.h" + +#include <iprt/assert.h> +#include <iprt/errcore.h> + +#include <dlfcn.h> +#include <errno.h> +#include <unistd.h> +#include <sys/fcntl.h> + +/** + * The Linux specific fallocate() method. + */ +typedef int (*PFNLNXFALLOCATE) (int iFd, int fMode, off_t offStart, off_t cb); +/** Flag to specify that the file size should not be extended. */ +#define LNX_FALLOC_FL_KEEP_SIZE 1 + +RTDECL(int) RTFileSetAllocationSize(RTFILE hFile, uint64_t cbSize, uint32_t fFlags) +{ + AssertReturn(hFile != NIL_RTFILE, VERR_INVALID_PARAMETER); + AssertReturn(!(fFlags & ~RTFILE_ALLOC_SIZE_F_VALID), VERR_INVALID_PARAMETER); + AssertMsgReturn(sizeof(off_t) >= sizeof(cbSize) || RT_HIDWORD(cbSize) == 0, + ("64-bit filesize not supported! cbSize=%lld\n", cbSize), + VERR_NOT_SUPPORTED); + + int rc = VINF_SUCCESS; + PFNLNXFALLOCATE pfnLnxFAllocate = (PFNLNXFALLOCATE)(uintptr_t)dlsym(RTLD_DEFAULT, "fallocate64"); + if (VALID_PTR(pfnLnxFAllocate)) + { + int fLnxFlags = (fFlags & RTFILE_ALLOC_SIZE_F_KEEP_SIZE) ? 
LNX_FALLOC_FL_KEEP_SIZE : 0; + int rcLnx = pfnLnxFAllocate(RTFileToNative(hFile), fLnxFlags, 0, cbSize); + if (rcLnx != 0) + { + if (errno == EOPNOTSUPP) + rc = VERR_NOT_SUPPORTED; + else + rc = RTErrConvertFromErrno(errno); + } + } + else + rc = VERR_NOT_SUPPORTED; + + return rc; +} +RT_EXPORT_SYMBOL(RTFileSetAllocationSize); diff --git a/src/VBox/Runtime/r3/linux/RTProcIsRunningByName-linux.cpp b/src/VBox/Runtime/r3/linux/RTProcIsRunningByName-linux.cpp new file mode 100644 index 00000000..bf706808 --- /dev/null +++ b/src/VBox/Runtime/r3/linux/RTProcIsRunningByName-linux.cpp @@ -0,0 +1,118 @@ +/* $Id: RTProcIsRunningByName-linux.cpp $ */ +/** @file + * IPRT - RTProcIsRunningByName, Linux implementation. + */ + +/* + * Copyright (C) 2009-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. 
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_PROCESS +#include <iprt/process.h> +#include <iprt/string.h> +#include <iprt/dir.h> +#include <iprt/path.h> +#include <iprt/stream.h> +#include <iprt/param.h> +#include <iprt/assert.h> + +#include <unistd.h> + + +RTR3DECL(bool) RTProcIsRunningByName(const char *pszName) +{ + /* + * Quick validation. + */ + if (!pszName) + return false; + + bool const fWithPath = RTPathHavePath(pszName); + + /* + * Enumerate /proc. + */ + RTDIR hDir; + int rc = RTDirOpen(&hDir, "/proc"); + AssertMsgRCReturn(rc, ("RTDirOpen on /proc failed: rc=%Rrc\n", rc), false); + if (RT_SUCCESS(rc)) + { + RTDIRENTRY DirEntry; + while (RT_SUCCESS(RTDirRead(hDir, &DirEntry, NULL))) + { + /* + * Filter numeric directory entries only. + */ + if ( ( DirEntry.enmType == RTDIRENTRYTYPE_DIRECTORY + || DirEntry.enmType == RTDIRENTRYTYPE_UNKNOWN) + && RTStrToUInt32(DirEntry.szName) > 0) + { + /* + * Try readlink on exe first since it's more faster and reliable. + * Fall back on reading the first line in cmdline if that fails + * (access errors typically). cmdline is unreliable as it might + * contain whatever the execv caller passes as argv[0]. 
+ */ + char szName[RTPATH_MAX]; + RTStrPrintf(szName, sizeof(szName), "/proc/%s/exe", &DirEntry.szName[0]); + char szExe[RTPATH_MAX]; + int cchLink = readlink(szName, szExe, sizeof(szExe) - 1); + if ( cchLink > 0 + && (size_t)cchLink < sizeof(szExe)) + { + szExe[cchLink] = '\0'; + rc = VINF_SUCCESS; + } + else + { + RTStrPrintf(szName, sizeof(szName), "/proc/%s/cmdline", &DirEntry.szName[0]); + PRTSTREAM pStream; + rc = RTStrmOpen(szName, "r", &pStream); + if (RT_SUCCESS(rc)) + { + rc = RTStrmGetLine(pStream, szExe, sizeof(szExe)); + RTStrmClose(pStream); + } + } + if (RT_SUCCESS(rc)) + { + /* + * We are interested on the file name part only. + */ + char const *pszProcName = fWithPath ? szExe : RTPathFilename(szExe); + if (RTStrCmp(pszProcName, pszName) == 0) + { + /* Found it! */ + RTDirClose(hDir); + return true; + } + } + } + } + RTDirClose(hDir); + } + + return false; +} + diff --git a/src/VBox/Runtime/r3/linux/RTSystemFirmware-linux.cpp b/src/VBox/Runtime/r3/linux/RTSystemFirmware-linux.cpp new file mode 100644 index 00000000..673b3583 --- /dev/null +++ b/src/VBox/Runtime/r3/linux/RTSystemFirmware-linux.cpp @@ -0,0 +1,105 @@ +/* $Id: RTSystemFirmware-linux.cpp $ */ +/** @file + * IPRT - System firmware information, linux. + */ + +/* + * Copyright (C) 2019-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include "internal/iprt.h" +#include <iprt/system.h> + +#include <iprt/err.h> +#include <iprt/file.h> +#include <iprt/string.h> +#include <iprt/linux/sysfs.h> + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ +/** Defines the UEFI Globals UUID that is used here as filename suffix (case sensitive). */ +#define VBOX_UEFI_UUID_GLOBALS "8be4df61-93ca-11d2-aa0d-00e098032b8c" + + +RTDECL(int) RTSystemQueryFirmwareType(PRTSYSFWTYPE penmFirmwareType) +{ + if (RTLinuxSysFsExists("firmware/efi/")) + *penmFirmwareType = RTSYSFWTYPE_UEFI; + else if (RTLinuxSysFsExists("")) + *penmFirmwareType = RTSYSFWTYPE_BIOS; + else + { + *penmFirmwareType = RTSYSFWTYPE_INVALID; + return VERR_NOT_SUPPORTED; + } + return VINF_SUCCESS; +} +RT_EXPORT_SYMBOL(RTSystemQueryFirmwareType); + + +RTDECL(int) RTSystemQueryFirmwareBoolean(RTSYSFWBOOL enmBoolean, bool *pfValue) +{ + *pfValue = false; + + /* + * Translate the property to variable base filename. 
+ */ + const char *pszName; + switch (enmBoolean) + { + case RTSYSFWBOOL_SECURE_BOOT: + pszName = "firmware/efi/efivars/SecureBoot"; + break; + + default: + AssertReturn(enmBoolean > RTSYSFWBOOL_INVALID && enmBoolean < RTSYSFWBOOL_END, VERR_INVALID_PARAMETER); + return VERR_SYS_UNSUPPORTED_FIRMWARE_PROPERTY; + + } + + /* + * Try open and read the variable value. + */ + RTFILE hFile; + int rc = RTLinuxSysFsOpen(&hFile, "%s-" VBOX_UEFI_UUID_GLOBALS, pszName); + /** @todo try other suffixes if file-not-found. */ + if (RT_SUCCESS(rc)) + { + uint8_t abBuf[16]; + size_t cbRead = 0; + rc = RTLinuxSysFsReadFile(hFile, abBuf, sizeof(abBuf), &cbRead); + *pfValue = cbRead > 1 && abBuf[cbRead - 1] != 0; + RTFileClose(hFile); + } + else if (rc == VERR_FILE_NOT_FOUND || rc == VERR_PATH_NOT_FOUND) + rc = VINF_SUCCESS; + else if (rc == VERR_PERMISSION_DENIED) + rc = VERR_NOT_SUPPORTED; + + return rc; +} +RT_EXPORT_SYMBOL(RTSystemQueryFirmwareBoolean); + diff --git a/src/VBox/Runtime/r3/linux/RTSystemQueryDmiString-linux.cpp b/src/VBox/Runtime/r3/linux/RTSystemQueryDmiString-linux.cpp new file mode 100644 index 00000000..3b81ceab --- /dev/null +++ b/src/VBox/Runtime/r3/linux/RTSystemQueryDmiString-linux.cpp @@ -0,0 +1,86 @@ +/* $Id: RTSystemQueryDmiString-linux.cpp $ */ +/** @file + * IPRT - RTSystemQueryDmiString, linux ring-3. + */ + +/* + * Copyright (C) 2010-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include <iprt/system.h> +#include "internal/iprt.h" + +#include <iprt/err.h> +#include <iprt/assert.h> +#include <iprt/linux/sysfs.h> + +#include <errno.h> + + +RTDECL(int) RTSystemQueryDmiString(RTSYSDMISTR enmString, char *pszBuf, size_t cbBuf) +{ + AssertPtrReturn(pszBuf, VERR_INVALID_POINTER); + AssertReturn(cbBuf > 0, VERR_INVALID_PARAMETER); + *pszBuf = '\0'; + AssertReturn(enmString > RTSYSDMISTR_INVALID && enmString < RTSYSDMISTR_END, VERR_INVALID_PARAMETER); + + const char *pszSysFsName; + switch (enmString) + { + case RTSYSDMISTR_PRODUCT_NAME: pszSysFsName = "id/product_name"; break; + case RTSYSDMISTR_PRODUCT_VERSION: pszSysFsName = "id/product_version"; break; + case RTSYSDMISTR_PRODUCT_UUID: pszSysFsName = "id/product_uuid"; break; + case RTSYSDMISTR_PRODUCT_SERIAL: pszSysFsName = "id/product_serial"; break; + case RTSYSDMISTR_MANUFACTURER: pszSysFsName = "id/sys_vendor"; break; + default: + return VERR_NOT_SUPPORTED; + } + + size_t cbRead = 0; + int rc = RTLinuxSysFsReadStrFile(pszBuf, cbBuf, &cbRead, "devices/virtual/dmi/%s", pszSysFsName); + if (RT_FAILURE(rc) && rc != VERR_BUFFER_OVERFLOW) + rc = RTLinuxSysFsReadStrFile(pszBuf, cbBuf, &cbRead, "class/dmi/%s", pszSysFsName); + if (RT_FAILURE(rc) && rc != 
VERR_BUFFER_OVERFLOW) + { + switch (rc) + { + case VINF_SUCCESS: + AssertFailed(); + break; + case VERR_FILE_NOT_FOUND: + case VERR_PATH_NOT_FOUND: + case VERR_IS_A_DIRECTORY: + rc = VERR_NOT_SUPPORTED; + break; + case VERR_PERMISSION_DENIED: + case VERR_ACCESS_DENIED: + rc = VERR_ACCESS_DENIED; + break; + } + } + + return rc; +} +RT_EXPORT_SYMBOL(RTSystemQueryDmiString); + diff --git a/src/VBox/Runtime/r3/linux/RTSystemShutdown-linux.cpp b/src/VBox/Runtime/r3/linux/RTSystemShutdown-linux.cpp new file mode 100644 index 00000000..0e8c0e21 --- /dev/null +++ b/src/VBox/Runtime/r3/linux/RTSystemShutdown-linux.cpp @@ -0,0 +1,101 @@ +/* $Id: RTSystemShutdown-linux.cpp $ */ +/** @file + * IPRT - RTSystemShutdown, linux implementation. + */ + +/* + * Copyright (C) 2012-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. 
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include <iprt/system.h> +#include "internal/iprt.h" + +#include <iprt/assert.h> +#include <iprt/env.h> +#include <iprt/err.h> +#include <iprt/process.h> +#include <iprt/string.h> + + +RTDECL(int) RTSystemShutdown(RTMSINTERVAL cMsDelay, uint32_t fFlags, const char *pszLogMsg) +{ + AssertPtrReturn(pszLogMsg, VERR_INVALID_POINTER); + AssertReturn(!(fFlags & ~RTSYSTEM_SHUTDOWN_VALID_MASK), VERR_INVALID_PARAMETER); + + /* + * Assemble the argument vector. + */ + int iArg = 0; + const char *apszArgs[6]; + + RT_BZERO(apszArgs, sizeof(apszArgs)); + + apszArgs[iArg++] = "/sbin/shutdown"; + switch (fFlags & RTSYSTEM_SHUTDOWN_ACTION_MASK) + { + case RTSYSTEM_SHUTDOWN_HALT: + apszArgs[iArg++] = "-h"; + apszArgs[iArg++] = "-H"; + break; + case RTSYSTEM_SHUTDOWN_REBOOT: + apszArgs[iArg++] = "-r"; + break; + case RTSYSTEM_SHUTDOWN_POWER_OFF: + case RTSYSTEM_SHUTDOWN_POWER_OFF_HALT: + apszArgs[iArg++] = "-h"; + apszArgs[iArg++] = "-P"; + break; + } + + char szWhen[80]; + if (cMsDelay < 500) + strcpy(szWhen, "now"); + else + RTStrPrintf(szWhen, sizeof(szWhen), "%u", (unsigned)((cMsDelay + 499) / 1000)); + apszArgs[iArg++] = szWhen; + + apszArgs[iArg++] = pszLogMsg; + + + /* + * Start the shutdown process and wait for it to complete. 
+ */ + RTPROCESS hProc; + int rc = RTProcCreate(apszArgs[0], apszArgs, RTENV_DEFAULT, 0 /*fFlags*/, &hProc); + if (RT_FAILURE(rc)) + return rc; + + RTPROCSTATUS ProcStatus; + rc = RTProcWait(hProc, RTPROCWAIT_FLAGS_BLOCK, &ProcStatus); + if (RT_SUCCESS(rc)) + { + if ( ProcStatus.enmReason != RTPROCEXITREASON_NORMAL + || ProcStatus.iStatus != 0) + rc = VERR_SYS_SHUTDOWN_FAILED; + } + + return rc; +} +RT_EXPORT_SYMBOL(RTSystemShutdown); + diff --git a/src/VBox/Runtime/r3/linux/RTThreadGetNativeState-linux.cpp b/src/VBox/Runtime/r3/linux/RTThreadGetNativeState-linux.cpp new file mode 100644 index 00000000..cd0c2e85 --- /dev/null +++ b/src/VBox/Runtime/r3/linux/RTThreadGetNativeState-linux.cpp @@ -0,0 +1,111 @@ +/* $Id: RTThreadGetNativeState-linux.cpp $ */ +/** @file + * IPRT - RTThreadGetNativeState, linux implementation. + */ + +/* + * Copyright (C) 2010-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. 
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_PROCESS +#include <iprt/thread.h> +#include "internal/iprt.h" + +#include <iprt/assert.h> +#include <iprt/ctype.h> +#include <iprt/errcore.h> +#include <iprt/string.h> + +#include "internal/thread.h" + +#include <unistd.h> +#include <sys/fcntl.h> + + +RTDECL(RTTHREADNATIVESTATE) RTThreadGetNativeState(RTTHREAD hThread) +{ + RTTHREADNATIVESTATE enmRet = RTTHREADNATIVESTATE_INVALID; + PRTTHREADINT pThread = rtThreadGet(hThread); + if (pThread) + { + enmRet = RTTHREADNATIVESTATE_UNKNOWN; + + char szName[512]; + RTStrPrintf(szName, sizeof(szName), "/proc/self/task/%u/stat", pThread->tid); + int fd = open(szName, O_RDONLY, 0); + if (fd >= 0) + { + ssize_t cch = read(fd, szName, sizeof(szName) - 1); + close(fd); + if (cch > 0) + { + szName[cch] = '\0'; + + /* skip the pid, the (comm name) and stop at the status char. 
*/ + const char *psz = szName; + while ( *psz + && ( *psz != ')' + || !RT_C_IS_SPACE(psz[1]) + || !RT_C_IS_ALPHA(psz[2]) + || !RT_C_IS_SPACE(psz[3]) + ) + ) + psz++; + if (*psz == ')') + { + switch (psz[2]) + { + case 'R': /* running */ + enmRet = RTTHREADNATIVESTATE_RUNNING; + break; + + case 'S': /* sleeping */ + case 'D': /* disk sleeping */ + enmRet = RTTHREADNATIVESTATE_BLOCKED; + break; + + case 'T': /* stopped or tracking stop */ + enmRet = RTTHREADNATIVESTATE_SUSPENDED; + break; + + case 'Z': /* zombie */ + case 'X': /* dead */ + enmRet = RTTHREADNATIVESTATE_TERMINATED; + break; + + default: + AssertMsgFailed(("state=%c\n", psz[2])); + enmRet = RTTHREADNATIVESTATE_UNKNOWN; + break; + } + } + else + AssertMsgFailed(("stat='%s'\n", szName)); + } + } + rtThreadRelease(pThread); + } + return enmRet; +} + diff --git a/src/VBox/Runtime/r3/linux/fileaio-linux.cpp b/src/VBox/Runtime/r3/linux/fileaio-linux.cpp new file mode 100644 index 00000000..16b08857 --- /dev/null +++ b/src/VBox/Runtime/r3/linux/fileaio-linux.cpp @@ -0,0 +1,838 @@ +/* $Id: fileaio-linux.cpp $ */ +/** @file + * IPRT - File async I/O, native implementation for the Linux host platform. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + +/** @page pg_rtfileaio_linux RTFile Async I/O - Linux Implementation Notes + * @internal + * + * Linux implements the kernel async I/O API through the io_* syscalls. They are + * not exposed in the glibc (the aio_* API uses userspace threads and blocking + * I/O operations to simulate async behavior). There is an external library + * called libaio which implements these syscalls but because we don't want to + * have another dependency and this library is not installed by default and the + * interface is really simple we use the kernel interface directly using wrapper + * functions. + * + * The interface has some limitations. The first one is that the file must be + * opened with O_DIRECT. This disables caching done by the kernel which can be + * compensated if the user of this API implements caching itself. The next + * limitation is that data buffers must be aligned at a 512 byte boundary or the + * request will fail. + */ +/** @todo r=bird: What's this about "must be opened with O_DIRECT"? An + * explanation would be nice, esp. seeing what Linus is quoted saying + * about it in the open man page... 
*/ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_FILE +#include <iprt/asm.h> +#include <iprt/mem.h> +#include <iprt/assert.h> +#include <iprt/string.h> +#include <iprt/err.h> +#include <iprt/log.h> +#include <iprt/thread.h> +#include "internal/fileaio.h" + +#include <unistd.h> +#include <sys/syscall.h> +#include <errno.h> + +#include <iprt/file.h> + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** The async I/O context handle */ +typedef unsigned long LNXKAIOCONTEXT; + +/** + * Supported commands for the iocbs + */ +enum +{ + LNXKAIO_IOCB_CMD_READ = 0, + LNXKAIO_IOCB_CMD_WRITE = 1, + LNXKAIO_IOCB_CMD_FSYNC = 2, + LNXKAIO_IOCB_CMD_FDSYNC = 3 +}; + +/** + * The iocb structure of a request which is passed to the kernel. + * + * We redefined this here because the version in the header lacks padding + * for 32bit. + */ +typedef struct LNXKAIOIOCB +{ + /** Opaque pointer to data which is returned on an I/O event. */ + void *pvUser; +#ifdef RT_ARCH_X86 + uint32_t u32Padding0; +#endif + /** Contains the request number and is set by the kernel. */ + uint32_t u32Key; + /** Reserved. */ + uint32_t u32Reserved0; + /** The I/O opcode. */ + uint16_t u16IoOpCode; + /** Request priority. */ + int16_t i16Priority; + /** The file descriptor. */ + uint32_t uFileDesc; + /** The userspace pointer to the buffer containing/receiving the data. */ + void *pvBuf; +#ifdef RT_ARCH_X86 + uint32_t u32Padding1; +#endif + /** How many bytes to transfer. 
*/ +#ifdef RT_ARCH_X86 + uint32_t cbTransfer; + uint32_t u32Padding2; +#elif defined(RT_ARCH_AMD64) + uint64_t cbTransfer; +#else +# error "Unknown architecture" +#endif + /** At which offset to start the transfer. */ + int64_t off; + /** Reserved. */ + uint64_t u64Reserved1; + /** Flags */ + uint32_t fFlags; + /** Readyness signal file descriptor. */ + uint32_t u32ResFd; +} LNXKAIOIOCB, *PLNXKAIOIOCB; + +/** + * I/O event structure to notify about completed requests. + * Redefined here too because of the padding. + */ +typedef struct LNXKAIOIOEVENT +{ + /** The pvUser field from the iocb. */ + void *pvUser; +#ifdef RT_ARCH_X86 + uint32_t u32Padding0; +#endif + /** The LNXKAIOIOCB object this event is for. */ + PLNXKAIOIOCB *pIoCB; +#ifdef RT_ARCH_X86 + uint32_t u32Padding1; +#endif + /** The result code of the operation .*/ +#ifdef RT_ARCH_X86 + int32_t rc; + uint32_t u32Padding2; +#elif defined(RT_ARCH_AMD64) + int64_t rc; +#else +# error "Unknown architecture" +#endif + /** Secondary result code. */ +#ifdef RT_ARCH_X86 + int32_t rc2; + uint32_t u32Padding3; +#elif defined(RT_ARCH_AMD64) + int64_t rc2; +#else +# error "Unknown architecture" +#endif +} LNXKAIOIOEVENT, *PLNXKAIOIOEVENT; + + +/** + * Async I/O completion context state. + */ +typedef struct RTFILEAIOCTXINTERNAL +{ + /** Handle to the async I/O context. */ + LNXKAIOCONTEXT AioContext; + /** Maximum number of requests this context can handle. */ + int cRequestsMax; + /** Current number of requests active on this context. */ + volatile int32_t cRequests; + /** The ID of the thread which is currently waiting for requests. */ + volatile RTTHREAD hThreadWait; + /** Flag whether the thread was woken up. */ + volatile bool fWokenUp; + /** Flag whether the thread is currently waiting in the syscall. */ + volatile bool fWaiting; + /** Flags given during creation. */ + uint32_t fFlags; + /** Magic value (RTFILEAIOCTX_MAGIC). 
*/ + uint32_t u32Magic; +} RTFILEAIOCTXINTERNAL; +/** Pointer to an internal context structure. */ +typedef RTFILEAIOCTXINTERNAL *PRTFILEAIOCTXINTERNAL; + +/** + * Async I/O request state. + */ +typedef struct RTFILEAIOREQINTERNAL +{ + /** The aio control block. This must be the FIRST elment in + * the structure! (see notes below) */ + LNXKAIOIOCB AioCB; + /** Current state the request is in. */ + RTFILEAIOREQSTATE enmState; + /** The I/O context this request is associated with. */ + LNXKAIOCONTEXT AioContext; + /** Return code the request completed with. */ + int Rc; + /** Number of bytes actually transferred. */ + size_t cbTransfered; + /** Completion context we are assigned to. */ + PRTFILEAIOCTXINTERNAL pCtxInt; + /** Magic value (RTFILEAIOREQ_MAGIC). */ + uint32_t u32Magic; +} RTFILEAIOREQINTERNAL; +/** Pointer to an internal request structure. */ +typedef RTFILEAIOREQINTERNAL *PRTFILEAIOREQINTERNAL; + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ +/** The max number of events to get in one call. */ +#define AIO_MAXIMUM_REQUESTS_PER_CONTEXT 64 + + +/** + * Creates a new async I/O context. + */ +DECLINLINE(int) rtFileAsyncIoLinuxCreate(unsigned cEvents, LNXKAIOCONTEXT *pAioContext) +{ + int rc = syscall(__NR_io_setup, cEvents, pAioContext); + if (RT_UNLIKELY(rc == -1)) + { + if (errno == EAGAIN) + return VERR_FILE_AIO_INSUFFICIENT_EVENTS; + else + return RTErrConvertFromErrno(errno); + } + + return VINF_SUCCESS; +} + +/** + * Destroys a async I/O context. 
+ */ +DECLINLINE(int) rtFileAsyncIoLinuxDestroy(LNXKAIOCONTEXT AioContext) +{ + int rc = syscall(__NR_io_destroy, AioContext); + if (RT_UNLIKELY(rc == -1)) + return RTErrConvertFromErrno(errno); + + return VINF_SUCCESS; +} + +/** + * Submits an array of I/O requests to the kernel. + */ +DECLINLINE(int) rtFileAsyncIoLinuxSubmit(LNXKAIOCONTEXT AioContext, long cReqs, LNXKAIOIOCB **ppIoCB, int *pcSubmitted) +{ + int rc = syscall(__NR_io_submit, AioContext, cReqs, ppIoCB); + if (RT_UNLIKELY(rc == -1)) + return RTErrConvertFromErrno(errno); + + *pcSubmitted = rc; + + return VINF_SUCCESS; +} + +/** + * Cancels a I/O request. + */ +DECLINLINE(int) rtFileAsyncIoLinuxCancel(LNXKAIOCONTEXT AioContext, PLNXKAIOIOCB pIoCB, PLNXKAIOIOEVENT pIoResult) +{ + int rc = syscall(__NR_io_cancel, AioContext, pIoCB, pIoResult); + if (RT_UNLIKELY(rc == -1)) + return RTErrConvertFromErrno(errno); + + return VINF_SUCCESS; +} + +/** + * Waits for I/O events. + * @returns Number of events (natural number w/ 0), IPRT error code (negative). + */ +DECLINLINE(int) rtFileAsyncIoLinuxGetEvents(LNXKAIOCONTEXT AioContext, long cReqsMin, long cReqs, + PLNXKAIOIOEVENT paIoResults, struct timespec *pTimeout) +{ + int rc = syscall(__NR_io_getevents, AioContext, cReqsMin, cReqs, paIoResults, pTimeout); + if (RT_UNLIKELY(rc == -1)) + return RTErrConvertFromErrno(errno); + + return rc; +} + +RTR3DECL(int) RTFileAioGetLimits(PRTFILEAIOLIMITS pAioLimits) +{ + int rc = VINF_SUCCESS; + AssertPtrReturn(pAioLimits, VERR_INVALID_POINTER); + + /* + * Check if the API is implemented by creating a + * completion port. + */ + LNXKAIOCONTEXT AioContext = 0; + rc = rtFileAsyncIoLinuxCreate(1, &AioContext); + if (RT_FAILURE(rc)) + return rc; + + rc = rtFileAsyncIoLinuxDestroy(AioContext); + if (RT_FAILURE(rc)) + return rc; + + /* Supported - fill in the limits. The alignment is the only restriction. 
*/ + pAioLimits->cReqsOutstandingMax = RTFILEAIO_UNLIMITED_REQS; + pAioLimits->cbBufferAlignment = 512; + + return VINF_SUCCESS; +} + + +RTR3DECL(int) RTFileAioReqCreate(PRTFILEAIOREQ phReq) +{ + AssertPtrReturn(phReq, VERR_INVALID_POINTER); + + /* + * Allocate a new request and initialize it. + */ + PRTFILEAIOREQINTERNAL pReqInt = (PRTFILEAIOREQINTERNAL)RTMemAllocZ(sizeof(*pReqInt)); + if (RT_UNLIKELY(!pReqInt)) + return VERR_NO_MEMORY; + + pReqInt->pCtxInt = NULL; + pReqInt->u32Magic = RTFILEAIOREQ_MAGIC; + RTFILEAIOREQ_SET_STATE(pReqInt, COMPLETED); + + *phReq = (RTFILEAIOREQ)pReqInt; + return VINF_SUCCESS; +} + + +RTDECL(int) RTFileAioReqDestroy(RTFILEAIOREQ hReq) +{ + /* + * Validate the handle and ignore nil. + */ + if (hReq == NIL_RTFILEAIOREQ) + return VINF_SUCCESS; + PRTFILEAIOREQINTERNAL pReqInt = hReq; + RTFILEAIOREQ_VALID_RETURN(pReqInt); + RTFILEAIOREQ_NOT_STATE_RETURN_RC(pReqInt, SUBMITTED, VERR_FILE_AIO_IN_PROGRESS); + + /* + * Trash the magic and free it. + */ + ASMAtomicUoWriteU32(&pReqInt->u32Magic, ~RTFILEAIOREQ_MAGIC); + RTMemFree(pReqInt); + return VINF_SUCCESS; +} + + +/** + * Worker setting up the request. + */ +DECLINLINE(int) rtFileAioReqPrepareTransfer(RTFILEAIOREQ hReq, RTFILE hFile, + uint16_t uTransferDirection, + RTFOFF off, void *pvBuf, size_t cbTransfer, + void *pvUser) +{ + /* + * Validate the input. + */ + PRTFILEAIOREQINTERNAL pReqInt = hReq; + RTFILEAIOREQ_VALID_RETURN(pReqInt); + RTFILEAIOREQ_NOT_STATE_RETURN_RC(pReqInt, SUBMITTED, VERR_FILE_AIO_IN_PROGRESS); + Assert(hFile != NIL_RTFILE); + + if (uTransferDirection != LNXKAIO_IOCB_CMD_FSYNC) + { + AssertPtr(pvBuf); + Assert(off >= 0); + Assert(cbTransfer > 0); + } + + /* + * Setup the control block and clear the finished flag. 
+ */ + pReqInt->AioCB.u16IoOpCode = uTransferDirection; + pReqInt->AioCB.uFileDesc = RTFileToNative(hFile); + pReqInt->AioCB.off = off; + pReqInt->AioCB.cbTransfer = cbTransfer; + pReqInt->AioCB.pvBuf = pvBuf; + pReqInt->AioCB.pvUser = pvUser; + + pReqInt->pCtxInt = NULL; + RTFILEAIOREQ_SET_STATE(pReqInt, PREPARED); + + return VINF_SUCCESS; +} + + +RTDECL(int) RTFileAioReqPrepareRead(RTFILEAIOREQ hReq, RTFILE hFile, RTFOFF off, + void *pvBuf, size_t cbRead, void *pvUser) +{ + return rtFileAioReqPrepareTransfer(hReq, hFile, LNXKAIO_IOCB_CMD_READ, + off, pvBuf, cbRead, pvUser); +} + + +RTDECL(int) RTFileAioReqPrepareWrite(RTFILEAIOREQ hReq, RTFILE hFile, RTFOFF off, + void const *pvBuf, size_t cbWrite, void *pvUser) +{ + return rtFileAioReqPrepareTransfer(hReq, hFile, LNXKAIO_IOCB_CMD_WRITE, + off, (void *)pvBuf, cbWrite, pvUser); +} + + +RTDECL(int) RTFileAioReqPrepareFlush(RTFILEAIOREQ hReq, RTFILE hFile, void *pvUser) +{ + PRTFILEAIOREQINTERNAL pReqInt = hReq; + RTFILEAIOREQ_VALID_RETURN(pReqInt); + AssertReturn(hFile != NIL_RTFILE, VERR_INVALID_HANDLE); + RTFILEAIOREQ_NOT_STATE_RETURN_RC(pReqInt, SUBMITTED, VERR_FILE_AIO_IN_PROGRESS); + + return rtFileAioReqPrepareTransfer(pReqInt, hFile, LNXKAIO_IOCB_CMD_FSYNC, + 0, NULL, 0, pvUser); +} + + +RTDECL(void *) RTFileAioReqGetUser(RTFILEAIOREQ hReq) +{ + PRTFILEAIOREQINTERNAL pReqInt = hReq; + RTFILEAIOREQ_VALID_RETURN_RC(pReqInt, NULL); + + return pReqInt->AioCB.pvUser; +} + + +RTDECL(int) RTFileAioReqCancel(RTFILEAIOREQ hReq) +{ + PRTFILEAIOREQINTERNAL pReqInt = hReq; + RTFILEAIOREQ_VALID_RETURN(pReqInt); + RTFILEAIOREQ_STATE_RETURN_RC(pReqInt, SUBMITTED, VERR_FILE_AIO_NOT_SUBMITTED); + + LNXKAIOIOEVENT AioEvent; + int rc = rtFileAsyncIoLinuxCancel(pReqInt->AioContext, &pReqInt->AioCB, &AioEvent); + if (RT_SUCCESS(rc)) + { + /* + * Decrement request count because the request will never arrive at the + * completion port. + */ + AssertMsg(VALID_PTR(pReqInt->pCtxInt), + ("Invalid state. 
Request was canceled but wasn't submitted\n")); + + ASMAtomicDecS32(&pReqInt->pCtxInt->cRequests); + pReqInt->Rc = VERR_FILE_AIO_CANCELED; + RTFILEAIOREQ_SET_STATE(pReqInt, COMPLETED); + return VINF_SUCCESS; + } + if (rc == VERR_TRY_AGAIN) + return VERR_FILE_AIO_IN_PROGRESS; + return rc; +} + + +RTDECL(int) RTFileAioReqGetRC(RTFILEAIOREQ hReq, size_t *pcbTransfered) +{ + PRTFILEAIOREQINTERNAL pReqInt = hReq; + RTFILEAIOREQ_VALID_RETURN(pReqInt); + AssertPtrNull(pcbTransfered); + RTFILEAIOREQ_NOT_STATE_RETURN_RC(pReqInt, SUBMITTED, VERR_FILE_AIO_IN_PROGRESS); + RTFILEAIOREQ_NOT_STATE_RETURN_RC(pReqInt, PREPARED, VERR_FILE_AIO_NOT_SUBMITTED); + + if ( pcbTransfered + && RT_SUCCESS(pReqInt->Rc)) + *pcbTransfered = pReqInt->cbTransfered; + + return pReqInt->Rc; +} + + +RTDECL(int) RTFileAioCtxCreate(PRTFILEAIOCTX phAioCtx, uint32_t cAioReqsMax, + uint32_t fFlags) +{ + PRTFILEAIOCTXINTERNAL pCtxInt; + AssertPtrReturn(phAioCtx, VERR_INVALID_POINTER); + AssertReturn(!(fFlags & ~RTFILEAIOCTX_FLAGS_VALID_MASK), VERR_INVALID_PARAMETER); + + /* The kernel interface needs a maximum. */ + if (cAioReqsMax == RTFILEAIO_UNLIMITED_REQS) + return VERR_OUT_OF_RANGE; + + pCtxInt = (PRTFILEAIOCTXINTERNAL)RTMemAllocZ(sizeof(RTFILEAIOCTXINTERNAL)); + if (RT_UNLIKELY(!pCtxInt)) + return VERR_NO_MEMORY; + + /* Init the event handle. */ + int rc = rtFileAsyncIoLinuxCreate(cAioReqsMax, &pCtxInt->AioContext); + if (RT_SUCCESS(rc)) + { + pCtxInt->fWokenUp = false; + pCtxInt->fWaiting = false; + pCtxInt->hThreadWait = NIL_RTTHREAD; + pCtxInt->cRequestsMax = cAioReqsMax; + pCtxInt->fFlags = fFlags; + pCtxInt->u32Magic = RTFILEAIOCTX_MAGIC; + *phAioCtx = (RTFILEAIOCTX)pCtxInt; + } + else + RTMemFree(pCtxInt); + + return rc; +} + + +RTDECL(int) RTFileAioCtxDestroy(RTFILEAIOCTX hAioCtx) +{ + /* Validate the handle and ignore nil. 
*/ + if (hAioCtx == NIL_RTFILEAIOCTX) + return VINF_SUCCESS; + PRTFILEAIOCTXINTERNAL pCtxInt = hAioCtx; + RTFILEAIOCTX_VALID_RETURN(pCtxInt); + + /* Cannot destroy a busy context. */ + if (RT_UNLIKELY(pCtxInt->cRequests)) + return VERR_FILE_AIO_BUSY; + + /* The native bit first, then mark it as dead and free it. */ + int rc = rtFileAsyncIoLinuxDestroy(pCtxInt->AioContext); + if (RT_FAILURE(rc)) + return rc; + ASMAtomicUoWriteU32(&pCtxInt->u32Magic, RTFILEAIOCTX_MAGIC_DEAD); + RTMemFree(pCtxInt); + + return VINF_SUCCESS; +} + + +RTDECL(uint32_t) RTFileAioCtxGetMaxReqCount(RTFILEAIOCTX hAioCtx) +{ + /* Nil means global here. */ + if (hAioCtx == NIL_RTFILEAIOCTX) + return RTFILEAIO_UNLIMITED_REQS; /** @todo r=bird: I'm a bit puzzled by this return value since it + * is completely useless in RTFileAioCtxCreate. */ + + /* Return 0 if the handle is invalid, it's better than garbage I think... */ + PRTFILEAIOCTXINTERNAL pCtxInt = hAioCtx; + RTFILEAIOCTX_VALID_RETURN_RC(pCtxInt, 0); + + return pCtxInt->cRequestsMax; +} + +RTDECL(int) RTFileAioCtxAssociateWithFile(RTFILEAIOCTX hAioCtx, RTFILE hFile) +{ + /* Nothing to do. */ + NOREF(hAioCtx); NOREF(hFile); + return VINF_SUCCESS; +} + +RTDECL(int) RTFileAioCtxSubmit(RTFILEAIOCTX hAioCtx, PRTFILEAIOREQ pahReqs, size_t cReqs) +{ + int rc = VINF_SUCCESS; + + /* + * Parameter validation. + */ + PRTFILEAIOCTXINTERNAL pCtxInt = hAioCtx; + RTFILEAIOCTX_VALID_RETURN(pCtxInt); + AssertReturn(cReqs > 0, VERR_INVALID_PARAMETER); + AssertPtrReturn(pahReqs, VERR_INVALID_POINTER); + uint32_t i = cReqs; + PRTFILEAIOREQINTERNAL pReqInt = NULL; + + /* + * Validate requests and associate with the context. + */ + while (i-- > 0) + { + pReqInt = pahReqs[i]; + if (RTFILEAIOREQ_IS_NOT_VALID(pReqInt)) + { + /* Undo everything and stop submitting. 
*/ + size_t iUndo = cReqs; + while (iUndo-- > i) + { + pReqInt = pahReqs[iUndo]; + RTFILEAIOREQ_SET_STATE(pReqInt, PREPARED); + pReqInt->pCtxInt = NULL; + } + return VERR_INVALID_HANDLE; + } + + pReqInt->AioContext = pCtxInt->AioContext; + pReqInt->pCtxInt = pCtxInt; + RTFILEAIOREQ_SET_STATE(pReqInt, SUBMITTED); + } + + do + { + /* + * We cast pahReqs to the Linux iocb structure to avoid copying the requests + * into a temporary array. This is possible because the iocb structure is + * the first element in the request structure (see PRTFILEAIOCTXINTERNAL). + */ + int cReqsSubmitted = 0; + rc = rtFileAsyncIoLinuxSubmit(pCtxInt->AioContext, cReqs, + (PLNXKAIOIOCB *)pahReqs, + &cReqsSubmitted); + if (RT_FAILURE(rc)) + { + /* + * We encountered an error. + * This means that the first IoCB + * is not correctly initialized + * (invalid buffer alignment or bad file descriptor). + * Revert every request into the prepared state except + * the first one which will switch to completed. + * Another reason could be insufficient resources. + */ + i = cReqs; + while (i-- > 0) + { + /* Already validated. */ + pReqInt = pahReqs[i]; + pReqInt->pCtxInt = NULL; + pReqInt->AioContext = 0; + RTFILEAIOREQ_SET_STATE(pReqInt, PREPARED); + } + + if (rc == VERR_TRY_AGAIN) + return VERR_FILE_AIO_INSUFFICIENT_RESSOURCES; + else + { + /* The first request failed. */ + pReqInt = pahReqs[0]; + RTFILEAIOREQ_SET_STATE(pReqInt, COMPLETED); + pReqInt->Rc = rc; + pReqInt->cbTransfered = 0; + return rc; + } + } + + /* Advance. */ + cReqs -= cReqsSubmitted; + pahReqs += cReqsSubmitted; + ASMAtomicAddS32(&pCtxInt->cRequests, cReqsSubmitted); + + } while (cReqs); + + return rc; +} + + +RTDECL(int) RTFileAioCtxWait(RTFILEAIOCTX hAioCtx, size_t cMinReqs, RTMSINTERVAL cMillies, + PRTFILEAIOREQ pahReqs, size_t cReqs, uint32_t *pcReqs) +{ + /* + * Validate the parameters, making sure to always set pcReqs. 
+ */ + AssertPtrReturn(pcReqs, VERR_INVALID_POINTER); + *pcReqs = 0; /* always set */ + PRTFILEAIOCTXINTERNAL pCtxInt = hAioCtx; + RTFILEAIOCTX_VALID_RETURN(pCtxInt); + AssertPtrReturn(pahReqs, VERR_INVALID_POINTER); + AssertReturn(cReqs != 0, VERR_INVALID_PARAMETER); + AssertReturn(cReqs >= cMinReqs, VERR_OUT_OF_RANGE); + + /* + * Can't wait if there are not requests around. + */ + if ( RT_UNLIKELY(ASMAtomicUoReadS32(&pCtxInt->cRequests) == 0) + && !(pCtxInt->fFlags & RTFILEAIOCTX_FLAGS_WAIT_WITHOUT_PENDING_REQUESTS)) + return VERR_FILE_AIO_NO_REQUEST; + + /* + * Convert the timeout if specified. + */ + struct timespec *pTimeout = NULL; + struct timespec Timeout = {0,0}; + uint64_t StartNanoTS = 0; + if (cMillies != RT_INDEFINITE_WAIT) + { + Timeout.tv_sec = cMillies / 1000; + Timeout.tv_nsec = cMillies % 1000 * 1000000; + pTimeout = &Timeout; + StartNanoTS = RTTimeNanoTS(); + } + + /* Wait for at least one. */ + if (!cMinReqs) + cMinReqs = 1; + + /* For the wakeup call. */ + Assert(pCtxInt->hThreadWait == NIL_RTTHREAD); + ASMAtomicWriteHandle(&pCtxInt->hThreadWait, RTThreadSelf()); + + /* + * Loop until we're woken up, hit an error (incl timeout), or + * have collected the desired number of requests. + */ + int rc = VINF_SUCCESS; + int cRequestsCompleted = 0; + while (!pCtxInt->fWokenUp) + { + LNXKAIOIOEVENT aPortEvents[AIO_MAXIMUM_REQUESTS_PER_CONTEXT]; + int cRequestsToWait = RT_MIN(cReqs, AIO_MAXIMUM_REQUESTS_PER_CONTEXT); + ASMAtomicXchgBool(&pCtxInt->fWaiting, true); + rc = rtFileAsyncIoLinuxGetEvents(pCtxInt->AioContext, cMinReqs, cRequestsToWait, &aPortEvents[0], pTimeout); + ASMAtomicXchgBool(&pCtxInt->fWaiting, false); + if (RT_FAILURE(rc)) + break; + uint32_t const cDone = rc; + rc = VINF_SUCCESS; + + /* + * Process received events / requests. + */ + for (uint32_t i = 0; i < cDone; i++) + { + /* + * The iocb is the first element in our request structure. 
+ * So we can safely cast it directly to the handle (see above) + */ + PRTFILEAIOREQINTERNAL pReqInt = (PRTFILEAIOREQINTERNAL)aPortEvents[i].pIoCB; + AssertPtr(pReqInt); + Assert(pReqInt->u32Magic == RTFILEAIOREQ_MAGIC); + + /** @todo aeichner: The rc field contains the result code + * like you can find in errno for the normal read/write ops. + * But there is a second field called rc2. I don't know the + * purpose for it yet. + */ + if (RT_UNLIKELY(aPortEvents[i].rc < 0)) + pReqInt->Rc = RTErrConvertFromErrno(-aPortEvents[i].rc); /* Convert to positive value. */ + else + { + pReqInt->Rc = VINF_SUCCESS; + pReqInt->cbTransfered = aPortEvents[i].rc; + } + + /* Mark the request as finished. */ + RTFILEAIOREQ_SET_STATE(pReqInt, COMPLETED); + + pahReqs[cRequestsCompleted++] = (RTFILEAIOREQ)pReqInt; + } + + /* + * Done Yet? If not advance and try again. + */ + if (cDone >= cMinReqs) + break; + cMinReqs -= cDone; + cReqs -= cDone; + + if (cMillies != RT_INDEFINITE_WAIT) + { + /* The API doesn't return ETIMEDOUT, so we have to fix that ourselves. */ + uint64_t NanoTS = RTTimeNanoTS(); + uint64_t cMilliesElapsed = (NanoTS - StartNanoTS) / 1000000; + if (cMilliesElapsed >= cMillies) + { + rc = VERR_TIMEOUT; + break; + } + + /* The syscall supposedly updates it, but we're paranoid. :-) */ + Timeout.tv_sec = (cMillies - (RTMSINTERVAL)cMilliesElapsed) / 1000; + Timeout.tv_nsec = (cMillies - (RTMSINTERVAL)cMilliesElapsed) % 1000 * 1000000; + } + } + + /* + * Update the context state and set the return value. + */ + *pcReqs = cRequestsCompleted; + ASMAtomicSubS32(&pCtxInt->cRequests, cRequestsCompleted); + Assert(pCtxInt->hThreadWait == RTThreadSelf()); + ASMAtomicWriteHandle(&pCtxInt->hThreadWait, NIL_RTTHREAD); + + /* + * Clear the wakeup flag and set rc. 
+ */ + if ( pCtxInt->fWokenUp + && RT_SUCCESS(rc)) + { + ASMAtomicXchgBool(&pCtxInt->fWokenUp, false); + rc = VERR_INTERRUPTED; + } + + return rc; +} + + +RTDECL(int) RTFileAioCtxWakeup(RTFILEAIOCTX hAioCtx) +{ + PRTFILEAIOCTXINTERNAL pCtxInt = hAioCtx; + RTFILEAIOCTX_VALID_RETURN(pCtxInt); + + /** @todo r=bird: Define the protocol for how to resume work after calling + * this function. */ + + bool fWokenUp = ASMAtomicXchgBool(&pCtxInt->fWokenUp, true); + + /* + * Read the thread handle before the status flag. + * If we read the handle after the flag we might + * end up with an invalid handle because the thread + * waiting in RTFileAioCtxWakeup() might get scheduled + * before we read the flag and returns. + * We can ensure that the handle is valid if fWaiting is true + * when reading the handle before the status flag. + */ + RTTHREAD hThread; + ASMAtomicReadHandle(&pCtxInt->hThreadWait, &hThread); + bool fWaiting = ASMAtomicReadBool(&pCtxInt->fWaiting); + if ( !fWokenUp + && fWaiting) + { + /* + * If a thread waits the handle must be valid. + * It is possible that the thread returns from + * rtFileAsyncIoLinuxGetEvents() before the signal + * is send. + * This is no problem because we already set fWokenUp + * to true which will let the thread return VERR_INTERRUPTED + * and the next call to RTFileAioCtxWait() will not + * return VERR_INTERRUPTED because signals are not saved + * and will simply vanish if the destination thread can't + * receive it. + */ + Assert(hThread != NIL_RTTHREAD); + RTThreadPoke(hThread); + } + + return VINF_SUCCESS; +} + diff --git a/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp b/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp new file mode 100644 index 00000000..36d69c4d --- /dev/null +++ b/src/VBox/Runtime/r3/linux/ioqueue-iouringfile-provider.cpp @@ -0,0 +1,934 @@ +/* $Id: ioqueue-iouringfile-provider.cpp $ */ +/** @file + * IPRT - I/O queue, Linux io_uring interface I/O file provider. 
+ */ + +/* + * Copyright (C) 2019-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + +/** @page pg_rtioqueue_linux RTIoQueue - Linux io_uring implementation notes + * @internal + * + * The io_uring interface is the most recent interface added to the Linux kernel + * to deliver fast and efficient I/O. It was first added with kernel version 5.1 and is + * thus not available on most systems as of writing this backend (July 2019). + * It supersedes the old async I/O interface and cleans up with some restrictions like + * having to disable caching for the file. + * The interface is centered around a submission and completion queue to queue multiple new + * requests for the kernel to process and get notified about completions to reduce the amount + * of context switches to an absolute minimum. It also offers advanced features like + * registering a fixed set of memory buffers for I/O upfront to reduce the processing overhead + * even more. 
+ * + * The first implementation will only make use of the basic features and more advanced features + * will be added later. + * The adept developer probably noticed that the public IPRT I/O queue API resembles the io_uring + * interface in many aspects. This is not by accident but to reduce our own overhead as much as possible + * while still keeping a consistent platform independent API which allows efficient implementations on + * other hosts when they come up. + * + * The public kernel io_uring interface is completely defined in this file to avoid dragging in additional + * dependencies and to avoid compile problems on older hosts missing the interface just like it is done + * for the Linux RTFileAio* API. The necessary interface definitions and descriptions were retrieved from: + * * http://kernel.dk/io_uring.pdf + * * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/io_uring.h + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_IOQUEUE +#include <iprt/ioqueue.h> + +#include <iprt/assertcompile.h> +#include <iprt/asm.h> +#include <iprt/errcore.h> +#include <iprt/file.h> +#include <iprt/log.h> +#include <iprt/mem.h> +#include <iprt/string.h> + +#include <errno.h> +#include <unistd.h> +#include <signal.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/uio.h> + +#include "internal/ioqueue.h" + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ + +/** The syscall number of io_uring_setup().
*/ +#define LNX_IOURING_SYSCALL_SETUP 425 +/** The syscall number of io_uring_enter(). */ +#define LNX_IOURING_SYSCALL_ENTER 426 +/** The syscall number of io_uring_register(). */ +#define LNX_IOURING_SYSCALL_REGISTER 427 +/** eventfd2() syscall not associated with io_uring but used for kicking waiters. */ +#define LNX_SYSCALL_EVENTFD2 19 + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ + +/** + * Linux io_uring completion event. + */ +typedef struct LNXIOURINGCQE +{ + /** Opaque user data associated with the completed request. */ + uint64_t u64User; + /** The status code of the request. */ + int32_t rcLnx; + /** Some flags which are not used as of now. */ + uint32_t fFlags; +} LNXIOURINGCQE; +AssertCompileSize(LNXIOURINGCQE, 16); +/** Pointer to a Linux io_uring completion event. */ +typedef LNXIOURINGCQE *PLNXIOURINGCQE; +/** Pointer to a constant linux io_uring completion event. */ +typedef const LNXIOURINGCQE *PCLNXIOURINGCQE; + + +/** + * Linux io_uring submission queue entry. + */ +typedef struct LNXIOURINGSQE +{ + /** The opcode for the request. */ + uint8_t u8Opc; + /** Common flags for the request. */ + uint8_t u8Flags; + /** Assigned I/O priority. */ + uint16_t u16IoPrio; + /** The file descriptor the request is for. */ + int32_t i32Fd; + /** The start offset into the file for the request. */ + uint64_t u64OffStart; + /** Buffer pointer or Pointer to io vector array depending on opcode. */ + uint64_t u64AddrBufIoVec; + /** Size of the buffer in bytes or number of io vectors. */ + uint32_t u32BufIoVecSz; + /** Opcode dependent data. */ + union + { + /** Flags for read/write requests. */ + uint32_t u32KrnlRwFlags; + /** Flags for fsync() like requests. 
*/ + uint32_t u32FsyncFlags; + /** Flags for poll() like requests. */ + uint16_t u16PollFlags; + /** Flags for sync_file_range() like requests. */ + uint32_t u32SyncFileRangeFlags; + /** Flags for requests requiring a msg structure. */ + uint32_t u32MsgFlags; + } uOpc; + /** Opaque user data associated with the request and returned durign completion. */ + uint64_t u64User; + /** Request type dependent data. */ + union + { + /** Fixed buffer index if indicated by the request flags. */ + uint16_t u16FixedBufIdx; + /** Padding to align the structure to 64 bytes. */ + uint64_t au64Padding[3]; + } uReq; +} LNXIOURINGSQE; +AssertCompileSize(LNXIOURINGSQE, 64); +/** Pointer to a Linux io_uring submission queue entry. */ +typedef LNXIOURINGSQE *PLNXIOURINGSQE; +/** Pointer to a constant Linux io_uring submission queue entry. */ +typedef const LNXIOURINGSQE *PCLNXIOURINGSQE; + + +/** + * Linux u_ioring SQ ring header structure to maintain the queue. + */ +typedef struct LNXIOURINGSQ +{ + /** The current head position to fill in new requests. */ + uint32_t u32OffHead; + /** The current tail position the kernel starts processing from. */ + uint32_t u32OffTail; + /** The mask for the head and tail counters to apply to retrieve the index. */ + uint32_t u32OffRingMask; + /** Number of entries in the SQ ring. */ + uint32_t u32OffRingEntries; + /** Flags set asychronously by the kernel. */ + uint32_t u32OffFlags; + /** Counter of dropped requests. */ + uint32_t u32OffDroppedReqs; + /** Offset where to find the array of SQ entries. */ + uint32_t u32OffArray; + /** Reserved. */ + uint32_t u32Rsvd0; + /** Reserved. */ + uint64_t u64Rsvd1; +} LNXIOURINGSQ; +AssertCompileSize(LNXIOURINGSQ, 40); +/** Pointer to a Linux u_ioring SQ ring header. */ +typedef LNXIOURINGSQ *PLNXIOURINGSQ; +/** Pointer to a constant Linux u_ioring SQ ring header. */ +typedef const LNXIOURINGSQ *PCLNXIOURINGSQ; + + +/** + * Linux io_uring CQ ring header structure to maintain the queue. 
+ */ +typedef struct LNXIOURINGCQ +{ + /** The current head position the kernel modifies when completion events happen. */ + uint32_t u32OffHead; + /** The current tail position to read completion events from. */ + uint32_t u32OffTail; + /** The mask for the head and tail counters to apply to retrieve the index. */ + uint32_t u32OffRingMask; + /** Number of entries in the CQ ring. */ + uint32_t u32OffRingEntries; + /** Number of CQ overflows happened. */ + uint32_t u32OffOverflowCnt; + /** */ + uint32_t u32OffCqes; + /** Reserved. */ + uint64_t au64Rsvd0[2]; +} LNXIOURINGCQ; +AssertCompileSize(LNXIOURINGCQ, 40); +/** Pointer to a Linux u_ioring CQ ring header. */ +typedef LNXIOURINGCQ *PLNXIOURINGCQ; +/** Pointer to a constant Linux u_ioring CQ ring header. */ +typedef const LNXIOURINGCQ *PCLNXIOURINGCQ; + + +/** + * Linux io_uring parameters passed to io_uring_setup(). + */ +typedef struct LNXIOURINGPARAMS +{ + /** Number of SQ entries requested, must be power of 2. */ + uint32_t u32SqEntriesCnt; + /** Number of CQ entries requested, must be power of 2. */ + uint32_t u32CqEntriesCnt; + /** Flags for the ring, , see LNX_IOURING_SETUP_F_*. */ + uint32_t u32Flags; + /** Affinity of the kernel side SQ polling thread if enabled. */ + uint32_t u32SqPollCpu; + /** Milliseconds after the kernel side SQ polling thread goes to sleep + * if there is are no requests to process. */ + uint32_t u32SqPollIdleMs; + /** Reserved. */ + uint32_t au32Rsvd0[5]; + /** Offsets returned for the submission queue. */ + LNXIOURINGSQ SqOffsets; + /** Offsets returned for the completion queue. */ + LNXIOURINGCQ CqOffsets; +} LNXIOURINGPARAMS; +/** Pointer to Linux io_uring parameters. */ +typedef LNXIOURINGPARAMS *PLNXIOURINGPARAMS; +/** Pointer to constant Linux io_uring parameters. */ +typedef const LNXIOURINGPARAMS *PCLNXIOURINGPARAMS; + + +/** + * @name LNXIOURINGSQE::u8Opc defined opcodes. + * @{ */ +/** Opcode to profile the interface, does nothing. 
*/ +#define LNX_IOURING_OPC_NOP 0 +/** preadv() like request. */ +#define LNX_IOURING_OPC_READV 1 +/** pwritev() like request. */ +#define LNX_IOURING_OPC_WRITEV 2 +/** fsync() like request. */ +#define LNX_IOURING_OPC_FSYNC 3 +/** Read request using a fixed preset buffer. */ +#define LNX_IOURING_OPC_READ_FIXED 4 +/** Write request using a fixed preset buffer. */ +#define LNX_IOURING_OPC_WRITE_FIXED 5 +/** Add file descriptor to pollset. */ +#define LNX_IOURING_OPC_POLL_ADD 6 +/** Remove file descriptor from pollset. */ +#define LNX_IOURING_OPC_POLL_REMOVE 7 +/** sync_file_range() like request. */ +#define LNX_IOURING_OPC_SYNC_FILE_RANGE 8 +/** sendmsg() like request. */ +#define LNX_IOURING_OPC_SENDMSG 9 +/** recvmsg() like request. */ +#define LNX_IOURING_OPC_RECVMSG 10 +/** @} */ + + +/** + * @name Additional flags for LNX_IOURING_OPC_FSYNC requests. + * @{ */ +/** Sync userdata as well instead of metadata only. */ +#define LNX_IOURING_OPC_FSYNC_DATASYNC RT_BIT_32(0) +/** @} */ + + +/** + * @name Flags for the LNX_IOURING_SYSCALL_SETUP syscall. + * @{ */ +/** The I/O context is polled. */ +#define LNX_IOURING_SETUP_F_IOPOLL RT_BIT_32(0) +/** The kernel should poll the submission queue. */ +#define LNX_IOURING_SETUP_F_SQPOLL RT_BIT_32(1) +/** Sets the CPU affinity of the kernel thread polling the submission queue. */ +#define LNX_IOURING_SETUP_F_SQAFF RT_BIT_32(2) +/** @} */ + + +/** + * @name Flags for LNXIOURINGSQE::u8Flags. + * @{ */ +/** The file descriptor was registered before use. */ +#define LNX_IOURING_SQE_F_FIXED_FILE RT_BIT(0) +/** Complete all active requests before issuing the request with the flag set. */ +#define LNX_IOURING_SQE_F_IO_DRAIN RT_BIT(1) +/** Links the request with the flag set to the next one. */ +#define LNX_IOURING_SQE_F_IO_LINK RT_BIT(2) +/** @} */ + + +/** + * @name Magic mmap offsets to map submission and completion queues. + * @{ */ +/** Used to map the submission queue. 
*/ +#define LNX_IOURING_MMAP_OFF_SQ UINT64_C(0) +/** Used to map the completion queue. */ +#define LNX_IOURING_MMAP_OFF_CQ UINT64_C(0x8000000) +/** Used to map the submission queue entries array. */ +#define LNX_IOURING_MMAP_OFF_SQES UINT64_C(0x10000000) +/** @} */ + + +/** + * @name Flags used for the SQ ring structure. + * @{ */ +/** The kernel thread needs a io_uring_enter() wakeup to continue processing requests. */ +#define LNX_IOURING_SQ_RING_F_NEED_WAKEUP RT_BIT_32(0) +/** @} */ + + +/** + * @name Flags for the LNX_IOURING_SYSCALL_ENTER syscall. + * { */ +/** Retrieve completion events for the completion queue. */ +#define LNX_IOURING_ENTER_F_GETEVENTS RT_BIT_32(0) +/** Wakes the suspended kernel thread processing the requests. */ +#define LNX_IOURING_ENTER_F_SQ_WAKEUP RT_BIT_32(1) +/** @} */ + + +/** + * @name Opcodes for the LNX_IOURING_SYSCALL_REGISTER syscall. + * { */ +/** Register a fixed set of buffers. */ +#define LNX_IOURING_REGISTER_OPC_BUFFERS_REGISTER 0 +/** Unregisters a fixed set of buffers registered previously. */ +#define LNX_IOURING_REGISTER_OPC_BUFFERS_UNREGISTER 1 +/** Register a fixed set of files. */ +#define LNX_IOURING_REGISTER_OPC_FILES_REGISTER 2 +/** Unregisters a fixed set of files registered previously. */ +#define LNX_IOURING_REGISTER_OPC_FILES_UNREGISTER 3 +/** Register an eventfd associated with the I/O ring. */ +#define LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER 4 +/** Unregisters an eventfd registered previously. */ +#define LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER 5 +/** @} */ + + +/** + * SQ ring structure. + * + * @note Some members of this structure point to memory shared with the kernel, + * hence the volatile keyword. + */ +typedef struct RTIOQUEUESQ +{ + /** Pointer to the head counter. */ + volatile uint32_t *pidxHead; + /** Pointer to the tail counter. */ + volatile uint32_t *pidxTail; + /** Mask to apply for the counters to get to the index. */ + uint32_t fRingMask; + /** Number of entries in the ring. 
*/ + uint32_t cEntries; + /** Pointer to the global flags. */ + volatile uint32_t *pfFlags; + /** Pointer to the indirection array used for indexing the real SQ entries. */ + volatile uint32_t *paidxSqes; +} RTIOQUEUESQ; + + +/** + * CQ ring structure. + * + * @note Some members of this structure point to memory shared with the kernel, + * hence the volatile keyword. + */ +typedef struct RTIOQUEUECQ +{ + /** Pointer to the head counter. */ + volatile uint32_t *pidxHead; + /** Pointer to the tail counter. */ + volatile uint32_t *pidxTail; + /** Mask to apply for the counters to get to the index. */ + uint32_t fRingMask; + /** Number of entries in the ring. */ + uint32_t cEntries; + /** Pointer to the completion entry ring. */ + volatile LNXIOURINGCQE *paCqes; +} RTIOQUEUECQ; + + +/** + * Internal I/O queue provider instance data. + */ +typedef struct RTIOQUEUEPROVINT +{ + /** The io_uring file descriptor. */ + int iFdIoCtx; + /** The eventfd file descriptor registered with the ring. */ + int iFdEvt; + /** The submission queue. */ + RTIOQUEUESQ Sq; + /** The currently uncommitted tail for the SQ. */ + uint32_t idxSqTail; + /** Numbere of uncommitted SQEs. */ + uint32_t cSqesToCommit; + /** The completion queue. */ + RTIOQUEUECQ Cq; + /** Pointer to the mapped SQES entries. */ + PLNXIOURINGSQE paSqes; + /** Pointer to the iovec structure used for non S/G requests. */ + struct iovec *paIoVecs; + /** Pointer returned by mmap() for the SQ ring, used for unmapping. */ + void *pvMMapSqRing; + /** Pointer returned by mmap() for the CQ ring, used for unmapping. */ + void *pvMMapCqRing; + /** Pointer returned by mmap() for the SQ entries array, used for unmapping. */ + void *pvMMapSqes; + /** Size of the mapped SQ ring, used for unmapping. */ + size_t cbMMapSqRing; + /** Size of the mapped CQ ring, used for unmapping. */ + size_t cbMMapCqRing; + /** Size of the mapped SQ entries array, used for unmapping. 
*/ + size_t cbMMapSqes; + /** Flag whether the waiter was woken up externally. */ + volatile bool fExtIntr; +} RTIOQUEUEPROVINT; +/** Pointer to the internal I/O queue provider instance data. */ +typedef RTIOQUEUEPROVINT *PRTIOQUEUEPROVINT; + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ + +/** + * Syscall wrapper for io_uring_setup(). + * + * @returns IPRT status code. + * @param cEntries Number of entries for submission and completion queues. + * @param pParams Additional parameters for the I/O ring and updated return values + * on success. + * @param piFdIoCtx Where to store the file descriptor of the I/O ring on success. + */ +DECLINLINE(int) rtIoQueueLnxIoURingSetup(uint32_t cEntries, PLNXIOURINGPARAMS pParams, int32_t *piFdIoCtx) +{ + int rcLnx = syscall(LNX_IOURING_SYSCALL_SETUP, cEntries, pParams); + if (RT_UNLIKELY(rcLnx == -1)) + return RTErrConvertFromErrno(errno); + + *piFdIoCtx = rcLnx; + return VINF_SUCCESS; +} + + +/** + * Syscall wrapper for io_uring_enter(). + * + * @returns IPRT status code. + * @param iFdIoCtx The I/O ring file descriptor. + * @param cToSubmit Maximum number of requests waiting for processing. + * @param cMinComplete Minimum number of completion events to accumulate before returning. + * @param fFlags Flags for io_uring_enter(), see LNX_IOURING_ENTER_F_*. + */ +DECLINLINE(int) rtIoQueueLnxIoURingEnter(int32_t iFdIoCtx, uint32_t cToSubmit, uint32_t cMinComplete, + uint32_t fFlags) +{ + int rcLnx = syscall(LNX_IOURING_SYSCALL_ENTER, iFdIoCtx, cToSubmit, cMinComplete, fFlags, + NULL, 0); + if (RT_UNLIKELY(rcLnx == -1)) + return RTErrConvertFromErrno(errno); + + return VINF_SUCCESS; +} + + +/** + * Syscall wrapper for io_uring_register(). + * + * @returns IPRT status code. 
+ * @param iFdIoCtx The I/O ring file descriptor. + * @param uOpc Operation to perform, see LNX_IOURING_REGISTER_OPC_*. + * @param pvArg Opaque arguments. + * @param cArgs Number of arguments. + */ +DECLINLINE(int) rtIoQueueLnxIoURingRegister(int32_t iFdIoCtx, uint32_t uOpc, void *pvArg, + uint32_t cArgs) +{ + int rcLnx = syscall(LNX_IOURING_SYSCALL_REGISTER, iFdIoCtx, uOpc, pvArg, cArgs); + if (RT_UNLIKELY(rcLnx == -1)) + return RTErrConvertFromErrno(errno); + + return VINF_SUCCESS; +} + + +/** + * mmap() wrapper for the common bits and returning an IPRT status code. + * + * @returns IPRT status code. + * @param iFdIoCtx The I/O ring file descriptor. + * @param offMmap The mmap() offset. + * @param cbMmap How much to map. + * @param ppv Where to store the pointer to the mapping on success. + */ +DECLINLINE(int) rtIoQueueLnxIoURingMmap(int iFdIoCtx, off_t offMmap, size_t cbMmap, void **ppv) +{ + void *pv = mmap(0, cbMmap, PROT_READ | PROT_WRITE , MAP_SHARED | MAP_POPULATE, iFdIoCtx, offMmap); + if (pv != MAP_FAILED) + { + *ppv = pv; + return VINF_SUCCESS; + } + + return RTErrConvertFromErrno(errno); +} + + +/** + * eventfd2() syscall wrapper. + * + * @returns IPRT status code. + * @param uValInit The initial value of the maintained counter. + * @param fFlags Flags controlling the eventfd behavior. + * @param piFdEvt Where to store the file descriptor of the eventfd object on success. + */ +DECLINLINE(int) rtIoQueueLnxEventfd2(uint32_t uValInit, uint32_t fFlags, int *piFdEvt) +{ + int rcLnx = syscall(LNX_SYSCALL_EVENTFD2, uValInit, fFlags); + if (RT_UNLIKELY(rcLnx == -1)) + return RTErrConvertFromErrno(errno); + + *piFdEvt = rcLnx; + return VINF_SUCCESS; +} + + +/** + * Checks the completion event queue for pending events. + * + * @returns nothing. + * @param pThis The provider instance. + * @param paCEvt Pointer to the array of completion events. + * @param cCEvt Maximum number of completion events the array can hold. 
+ * @param pcCEvtSeen Where to store the number of completion events processed. + */ +static void rtIoQueueLnxIoURingFileProvCqCheck(PRTIOQUEUEPROVINT pThis, PRTIOQUEUECEVT paCEvt, + uint32_t cCEvt, uint32_t *pcCEvtSeen) +{ + /* The fencing and atomic accesses are kind of overkill and probably not required (dev paranoia). */ + ASMReadFence(); + uint32_t idxCqHead = ASMAtomicReadU32(pThis->Cq.pidxHead); + uint32_t idxCqTail = ASMAtomicReadU32(pThis->Cq.pidxTail); + ASMReadFence(); + + uint32_t cCEvtSeen = 0; + + while ( idxCqTail != idxCqHead + && cCEvtSeen < cCEvt) + { + /* Get the index. */ + uint32_t idxCqe = idxCqHead & pThis->Cq.fRingMask; + volatile LNXIOURINGCQE *pCqe = &pThis->Cq.paCqes[idxCqe]; + + paCEvt->pvUser = (void *)(uintptr_t)pCqe->u64User; + if (pCqe->rcLnx >= 0) + { + paCEvt->rcReq = VINF_SUCCESS; + paCEvt->cbXfered = (size_t)pCqe->rcLnx; + } + else + paCEvt->rcReq = RTErrConvertFromErrno(-pCqe->rcLnx); + + paCEvt++; + cCEvtSeen++; + idxCqHead++; + } + + *pcCEvtSeen = cCEvtSeen; + + /* Paranoia strikes again. */ + ASMWriteFence(); + ASMAtomicWriteU32(pThis->Cq.pidxHead, idxCqHead); + ASMWriteFence(); +} + + +/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnIsSupported} */ +static DECLCALLBACK(bool) rtIoQueueLnxIoURingFileProv_IsSupported(void) +{ + /* + * Try to create a simple I/O ring and close it again. + * The common code/public API already checked for the proper handle type. + */ + int iFdIoCtx = 0; + bool fSupp = false; + LNXIOURINGPARAMS Params; + RT_ZERO(Params); + + int rc = rtIoQueueLnxIoURingSetup(16, &Params, &iFdIoCtx); + if (RT_SUCCESS(rc)) + { + /* + * Check that we can register an eventfd descriptor to get notified about + * completion events while being able to kick the waiter externally out of the wait. 
+ */ + int iFdEvt = 0; + rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &iFdEvt); + if (RT_SUCCESS(rc)) + { + rc = rtIoQueueLnxIoURingRegister(iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER, + &iFdEvt, 1 /*cArgs*/); + if (RT_SUCCESS(rc)) + fSupp = true; + + int rcLnx = close(iFdEvt); Assert(!rcLnx); RT_NOREF(rcLnx); + } + int rcLnx = close(iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx); + } + + return fSupp; +} + + +/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueInit} */ +static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_QueueInit(RTIOQUEUEPROV hIoQueueProv, uint32_t fFlags, + uint32_t cSqEntries, uint32_t cCqEntries) +{ + RT_NOREF(fFlags, cCqEntries); + + PRTIOQUEUEPROVINT pThis = hIoQueueProv; + LNXIOURINGPARAMS Params; + RT_ZERO(Params); + + pThis->cSqesToCommit = 0; + pThis->fExtIntr = false; + + int rc = rtIoQueueLnxIoURingSetup(cSqEntries, &Params, &pThis->iFdIoCtx); + if (RT_SUCCESS(rc)) + { + /* Map the rings into userspace. */ + pThis->cbMMapSqRing = Params.SqOffsets.u32OffArray + Params.u32SqEntriesCnt * sizeof(uint32_t); + pThis->cbMMapCqRing = Params.CqOffsets.u32OffCqes + Params.u32CqEntriesCnt * sizeof(LNXIOURINGCQE); + pThis->cbMMapSqes = Params.u32SqEntriesCnt * sizeof(LNXIOURINGSQE); + + pThis->paIoVecs = (struct iovec *)RTMemAllocZ(Params.u32SqEntriesCnt * sizeof(struct iovec)); + if (RT_LIKELY(pThis->paIoVecs)) + { + rc = rtIoQueueLnxEventfd2(0 /*uValInit*/, 0 /*fFlags*/, &pThis->iFdEvt); + if (RT_SUCCESS(rc)) + { + rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_REGISTER, &pThis->iFdEvt, 1 /*cArgs*/); + if (RT_SUCCESS(rc)) + { + rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQ, pThis->cbMMapSqRing, &pThis->pvMMapSqRing); + if (RT_SUCCESS(rc)) + { + rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_CQ, pThis->cbMMapCqRing, &pThis->pvMMapCqRing); + if (RT_SUCCESS(rc)) + { + rc = rtIoQueueLnxIoURingMmap(pThis->iFdIoCtx, LNX_IOURING_MMAP_OFF_SQES, 
pThis->cbMMapSqes, &pThis->pvMMapSqes); + if (RT_SUCCESS(rc)) + { + uint8_t *pbTmp = (uint8_t *)pThis->pvMMapSqRing; + + pThis->Sq.pidxHead = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffHead); + pThis->Sq.pidxTail = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffTail); + pThis->Sq.fRingMask = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingMask); + pThis->Sq.cEntries = *(uint32_t *)(pbTmp + Params.SqOffsets.u32OffRingEntries); + pThis->Sq.pfFlags = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffFlags); + pThis->Sq.paidxSqes = (uint32_t *)(pbTmp + Params.SqOffsets.u32OffArray); + pThis->idxSqTail = *pThis->Sq.pidxTail; + + pThis->paSqes = (PLNXIOURINGSQE)pThis->pvMMapSqes; + + pbTmp = (uint8_t *)pThis->pvMMapCqRing; + + pThis->Cq.pidxHead = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffHead); + pThis->Cq.pidxTail = (uint32_t *)(pbTmp + Params.CqOffsets.u32OffTail); + pThis->Cq.fRingMask = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingMask); + pThis->Cq.cEntries = *(uint32_t *)(pbTmp + Params.CqOffsets.u32OffRingEntries); + pThis->Cq.paCqes = (PLNXIOURINGCQE)(pbTmp + Params.CqOffsets.u32OffCqes); + return VINF_SUCCESS; + } + + munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing); + } + + munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing); + } + + rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0); + AssertRC(rc); + } + + close(pThis->iFdEvt); + } + + RTMemFree(pThis->paIoVecs); + } + + int rcLnx = close(pThis->iFdIoCtx); Assert(!rcLnx); RT_NOREF(rcLnx); + } + + return rc; +} + + +/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnQueueDestroy} */ +static DECLCALLBACK(void) rtIoQueueLnxIoURingFileProv_QueueDestroy(RTIOQUEUEPROV hIoQueueProv) +{ + PRTIOQUEUEPROVINT pThis = hIoQueueProv; + + int rcLnx = munmap(pThis->pvMMapSqRing, pThis->cbMMapSqRing); Assert(!rcLnx); RT_NOREF(rcLnx); + rcLnx = munmap(pThis->pvMMapCqRing, pThis->cbMMapCqRing); Assert(!rcLnx); RT_NOREF(rcLnx); + rcLnx = munmap(pThis->pvMMapSqes, 
pThis->cbMMapSqes); Assert(!rcLnx); RT_NOREF(rcLnx); + + int rc = rtIoQueueLnxIoURingRegister(pThis->iFdIoCtx, LNX_IOURING_REGISTER_OPC_EVENTFD_UNREGISTER, NULL, 0); + AssertRC(rc); + + close(pThis->iFdEvt); + close(pThis->iFdIoCtx); + RTMemFree(pThis->paIoVecs); + + RT_ZERO(pThis); +} + + +/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleRegister} */ +static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleRegister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle) +{ + RT_NOREF(hIoQueueProv, pHandle); + /** @todo Add support for fixed file sets later. */ + return VINF_SUCCESS; +} + + +/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnHandleDeregister} */ +static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_HandleDeregister(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle) +{ + RT_NOREF(hIoQueueProv, pHandle); + /** @todo Add support for fixed file sets later. */ + return VINF_SUCCESS; +} + + +/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnReqPrepare} */ +static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_ReqPrepare(RTIOQUEUEPROV hIoQueueProv, PCRTHANDLE pHandle, RTIOQUEUEOP enmOp, + uint64_t off, void *pvBuf, size_t cbBuf, uint32_t fReqFlags, + void *pvUser) +{ + PRTIOQUEUEPROVINT pThis = hIoQueueProv; + RT_NOREF(fReqFlags); + + uint32_t idx = pThis->idxSqTail & pThis->Sq.fRingMask; + PLNXIOURINGSQE pSqe = &pThis->paSqes[idx]; + struct iovec *pIoVec = &pThis->paIoVecs[idx]; + + pIoVec->iov_base = pvBuf; + pIoVec->iov_len = cbBuf; + + pSqe->u8Flags = 0; + pSqe->u16IoPrio = 0; + pSqe->i32Fd = (int32_t)RTFileToNative(pHandle->u.hFile); + pSqe->u64OffStart = off; + pSqe->u64AddrBufIoVec = (uint64_t)(uintptr_t)pIoVec; + pSqe->u64User = (uint64_t)(uintptr_t)pvUser; + + switch (enmOp) + { + case RTIOQUEUEOP_READ: + pSqe->u8Opc = LNX_IOURING_OPC_READV; + pSqe->uOpc.u32KrnlRwFlags = 0; + break; + case RTIOQUEUEOP_WRITE: + pSqe->u8Opc = LNX_IOURING_OPC_WRITEV; + pSqe->uOpc.u32KrnlRwFlags = 0; + break; + case RTIOQUEUEOP_SYNC: + pSqe->u8Opc = 
LNX_IOURING_OPC_FSYNC; + pSqe->uOpc.u32FsyncFlags = 0; + break; + default: + AssertMsgFailedReturn(("Invalid I/O queue operation: %d\n", enmOp), + VERR_INVALID_PARAMETER); + } + + pThis->idxSqTail++; + pThis->cSqesToCommit++; + return VINF_SUCCESS; +} + + +/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnCommit} */ +static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_Commit(RTIOQUEUEPROV hIoQueueProv, uint32_t *pcReqsCommitted) +{ + PRTIOQUEUEPROVINT pThis = hIoQueueProv; + RT_NOREF(pThis, pcReqsCommitted); + + ASMWriteFence(); + ASMAtomicWriteU32(pThis->Sq.pidxTail, pThis->idxSqTail); + ASMWriteFence(); + + int rc = rtIoQueueLnxIoURingEnter(pThis->iFdIoCtx, pThis->cSqesToCommit, 0, 0 /*fFlags*/); + if (RT_SUCCESS(rc)) + { + *pcReqsCommitted = pThis->cSqesToCommit; + pThis->cSqesToCommit = 0; + } + + return rc; +} + + +/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWait} */ +static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWait(RTIOQUEUEPROV hIoQueueProv, PRTIOQUEUECEVT paCEvt, uint32_t cCEvt, + uint32_t cMinWait, uint32_t *pcCEvt, uint32_t fFlags) +{ + PRTIOQUEUEPROVINT pThis = hIoQueueProv; + int rc = VINF_SUCCESS; + uint32_t cCEvtSeen = 0; + + RT_NOREF(fFlags); + + /* + * Check the completion queue first for any completed events which might save us a + * context switch later on. + */ + rtIoQueueLnxIoURingFileProvCqCheck(pThis, paCEvt, cCEvt, &cCEvtSeen); + + while ( cCEvtSeen < cMinWait + && RT_SUCCESS(rc)) + { + /* + * We can employ a blocking read on the event file descriptor, it will return + * either when woken up externally or when there are completion events pending. + */ + uint64_t uCnt = 0; /**< The counter value returned upon a successful read(). 
*/ + ssize_t rcLnx = read(pThis->iFdEvt, &uCnt, sizeof(uCnt)); + if (rcLnx == sizeof(uCnt)) + { + uint32_t cCEvtThisSeen = 0; + rtIoQueueLnxIoURingFileProvCqCheck(pThis, &paCEvt[cCEvtSeen], cCEvt - cCEvtSeen, &cCEvtThisSeen); + cCEvtSeen += cCEvtThisSeen; + + /* Whether we got woken up externally. */ + if (ASMAtomicXchgBool(&pThis->fExtIntr, false)) + rc = VERR_INTERRUPTED; + } + else if (rcLnx == -1) + rc = RTErrConvertFromErrno(errno); + else + AssertMsgFailed(("Unexpected read() -> 0\n")); + } + + *pcCEvt = cCEvtSeen; + return rc; +} + + +/** @interface_method_impl{RTIOQUEUEPROVVTABLE,pfnEvtWaitWakeup} */ +static DECLCALLBACK(int) rtIoQueueLnxIoURingFileProv_EvtWaitWakeup(RTIOQUEUEPROV hIoQueueProv) +{ + PRTIOQUEUEPROVINT pThis = hIoQueueProv; + int rc = VINF_SUCCESS; + + if (!ASMAtomicXchgBool(&pThis->fExtIntr, true)) + { + const uint64_t uValAdd = 1; + ssize_t rcLnx = write(pThis->iFdEvt, &uValAdd, sizeof(uValAdd)); + + Assert(rcLnx == -1 || rcLnx == sizeof(uValAdd)); + if (rcLnx == -1) + rc = RTErrConvertFromErrno(errno); + } + + return rc; +} + + +/** + * Async file I/O queue provider virtual method table. 
+ */ +RT_DECL_DATA_CONST(RTIOQUEUEPROVVTABLE const) g_RTIoQueueLnxIoURingProv = +{ + /** uVersion */ + RTIOQUEUEPROVVTABLE_VERSION, + /** pszId */ + "LnxIoURingFile", + /** cbIoQueueProv */ + sizeof(RTIOQUEUEPROVINT), + /** enmHnd */ + RTHANDLETYPE_FILE, + /** fFlags */ + 0, + /** pfnIsSupported */ + rtIoQueueLnxIoURingFileProv_IsSupported, + /** pfnQueueInit */ + rtIoQueueLnxIoURingFileProv_QueueInit, + /** pfnQueueDestroy */ + rtIoQueueLnxIoURingFileProv_QueueDestroy, + /** pfnHandleRegister */ + rtIoQueueLnxIoURingFileProv_HandleRegister, + /** pfnHandleDeregister */ + rtIoQueueLnxIoURingFileProv_HandleDeregister, + /** pfnReqPrepare */ + rtIoQueueLnxIoURingFileProv_ReqPrepare, + /** pfnReqPrepareSg */ + NULL, + /** pfnCommit */ + rtIoQueueLnxIoURingFileProv_Commit, + /** pfnEvtWait */ + rtIoQueueLnxIoURingFileProv_EvtWait, + /** pfnEvtWaitWakeup */ + rtIoQueueLnxIoURingFileProv_EvtWaitWakeup, + /** uEndMarker */ + RTIOQUEUEPROVVTABLE_VERSION +}; + diff --git a/src/VBox/Runtime/r3/linux/krnlmod-linux.cpp b/src/VBox/Runtime/r3/linux/krnlmod-linux.cpp new file mode 100644 index 00000000..27bdeabe --- /dev/null +++ b/src/VBox/Runtime/r3/linux/krnlmod-linux.cpp @@ -0,0 +1,324 @@ +/* $Id: krnlmod-linux.cpp $ */ +/** @file + * IPRT - Kernel module, Linux. + */ + +/* + * Copyright (C) 2017-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_SYSTEM +#include <iprt/krnlmod.h> +#include <iprt/linux/sysfs.h> +#include <iprt/asm.h> +#include <iprt/assert.h> +#include <iprt/dir.h> +#include <iprt/err.h> +#include <iprt/mem.h> +#include <iprt/string.h> +#include <iprt/types.h> + + +/** + * Internal kernel information record state. + */ +typedef struct RTKRNLMODINFOINT +{ + /** Reference counter. */ + volatile uint32_t cRefs; + /** Reference count for the kernel module. */ + uint32_t cRefKrnlMod; + /** Load address of the kernel module. */ + RTR0UINTPTR uLoadAddr; + /** Size of the kernel module. */ + size_t cbKrnlMod; + /** Size of the name in characters including the zero terminator. */ + size_t cchName; + /** Module name - variable in size. */ + char achName[1]; +} RTKRNLMODINFOINT; +/** Pointer to the internal kernel module information record. */ +typedef RTKRNLMODINFOINT *PRTKRNLMODINFOINT; +/** Pointer to a const internal kernel module information record. */ +typedef const RTKRNLMODINFOINT *PCRTKRNLMODINFOINT; + + + +/** + * Destroy the given kernel module information record. + * + * @returns nothing. + * @param pThis The record to destroy. 
+ */ +static void rtKrnlModInfoDestroy(PRTKRNLMODINFOINT pThis) +{ + RTMemFree(pThis); +} + + +static int rtKrnlModLinuxReadIntFileDef(unsigned uBase, int64_t *pi64, int64_t i64Def, + const char *pszName, const char *pszPath) +{ + int rc = RTLinuxSysFsReadIntFile(uBase, pi64, "module/%s/%s", pszName, pszPath); + if (rc == VERR_FILE_NOT_FOUND) + { + *pi64 = i64Def; + rc = VINF_SUCCESS; + } + + return rc; +} + +/** + * Creates a new kernel module information record for the given module. + * + * @returns IPRT status code. + * @param pszName The kernel module name. + * @param phKrnlModInfo Where to store the handle to the kernel module information record + * on success. + */ +static int rtKrnlModLinuxInfoCreate(const char *pszName, PRTKRNLMODINFO phKrnlModInfo) +{ + int rc = VINF_SUCCESS; + size_t cchName = strlen(pszName) + 1; + PRTKRNLMODINFOINT pThis = (PRTKRNLMODINFOINT)RTMemAllocZ(RT_UOFFSETOF_DYN(RTKRNLMODINFOINT, achName[cchName])); + if (RT_LIKELY(pThis)) + { + memcpy(&pThis->achName[0], pszName, cchName); + pThis->cchName = cchName; + pThis->cRefs = 1; + + int64_t iTmp = 0; + rc = rtKrnlModLinuxReadIntFileDef(10, &iTmp, 0, pszName, "refcnt"); + if (RT_SUCCESS(rc)) + pThis->cRefKrnlMod = (uint32_t)iTmp; + + rc = rtKrnlModLinuxReadIntFileDef(10, &iTmp, 0, pszName, "coresize"); + if (RT_SUCCESS(rc)) + pThis->cbKrnlMod = iTmp; + + rc = rtKrnlModLinuxReadIntFileDef(16, &iTmp, 0, pszName, "sections/.text"); + if (RT_SUCCESS(rc)) + pThis->uLoadAddr = iTmp; + + if (RT_SUCCESS(rc)) + *phKrnlModInfo = pThis; + else + RTMemFree(pThis); + } + else + rc = VERR_NO_MEMORY; + + return rc; +} + + +RTDECL(int) RTKrnlModQueryLoaded(const char *pszName, bool *pfLoaded) +{ + AssertPtrReturn(pszName, VERR_INVALID_POINTER); + AssertPtrReturn(pfLoaded, VERR_INVALID_POINTER); + + int rc = RTLinuxSysFsExists("module/%s", pszName); + if (rc == VINF_SUCCESS) + *pfLoaded = true; + else if (rc == VERR_FILE_NOT_FOUND) + { + *pfLoaded = false; + rc = VINF_SUCCESS; + } + + return rc; +} + + 
+RTDECL(int) RTKrnlModLoadedQueryInfo(const char *pszName, PRTKRNLMODINFO phKrnlModInfo) +{ + AssertPtrReturn(pszName, VERR_INVALID_POINTER); + AssertPtrReturn(phKrnlModInfo, VERR_INVALID_POINTER); + + int rc = RTLinuxSysFsExists("module/%s", pszName); + if (rc == VINF_SUCCESS) + rc = rtKrnlModLinuxInfoCreate(pszName, phKrnlModInfo); + else if (rc == VERR_FILE_NOT_FOUND) + rc = VERR_NOT_FOUND; + + return rc; +} + + +RTDECL(uint32_t) RTKrnlModLoadedGetCount(void) +{ + uint32_t cKmodsLoaded = 0; + + RTDIR hDir = NULL; + int rc = RTDirOpen(&hDir, "/sys/module"); + if (RT_SUCCESS(rc)) + { + RTDIRENTRY DirEnt; + rc = RTDirRead(hDir, &DirEnt, NULL); + while (RT_SUCCESS(rc)) + { + if (!RTDirEntryIsStdDotLink(&DirEnt)) + cKmodsLoaded++; + rc = RTDirRead(hDir, &DirEnt, NULL); + } + + RTDirClose(hDir); + } + + + return cKmodsLoaded; +} + + +RTDECL(int) RTKrnlModLoadedQueryInfoAll(PRTKRNLMODINFO pahKrnlModInfo, uint32_t cEntriesMax, + uint32_t *pcEntries) +{ + AssertReturn(VALID_PTR(pahKrnlModInfo) || cEntriesMax == 0, VERR_INVALID_PARAMETER); + + uint32_t cKmodsLoaded = RTKrnlModLoadedGetCount(); + if (cEntriesMax < cKmodsLoaded) + { + if (*pcEntries) + *pcEntries = cKmodsLoaded; + return VERR_BUFFER_OVERFLOW; + } + + RTDIR hDir = NULL; + int rc = RTDirOpen(&hDir, "/sys/module"); + if (RT_SUCCESS(rc)) + { + unsigned idxKrnlModInfo = 0; + RTDIRENTRY DirEnt; + + rc = RTDirRead(hDir, &DirEnt, NULL); + while (RT_SUCCESS(rc)) + { + if (!RTDirEntryIsStdDotLink(&DirEnt)) + { + rc = rtKrnlModLinuxInfoCreate(DirEnt.szName, &pahKrnlModInfo[idxKrnlModInfo]); + if (RT_SUCCESS(rc)) + idxKrnlModInfo++; + } + + if (RT_SUCCESS(rc)) + rc = RTDirRead(hDir, &DirEnt, NULL); + } + + if (rc == VERR_NO_MORE_FILES) + rc = VINF_SUCCESS; + else if (RT_FAILURE(rc)) + { + /* Rollback */ + while (idxKrnlModInfo-- > 0) + RTKrnlModInfoRelease(pahKrnlModInfo[idxKrnlModInfo]); + } + + if (*pcEntries) + *pcEntries = cKmodsLoaded; + + RTDirClose(hDir); + } + + return rc; +} + + +RTDECL(uint32_t) 
RTKrnlModInfoRetain(RTKRNLMODINFO hKrnlModInfo) +{ + PRTKRNLMODINFOINT pThis = hKrnlModInfo; + AssertPtrReturn(pThis, UINT32_MAX); + + uint32_t cRefs = ASMAtomicIncU32(&pThis->cRefs); + AssertMsg(cRefs > 1 && cRefs < _1M, ("%#x %p\n", cRefs, pThis)); + return cRefs; +} + + +RTDECL(uint32_t) RTKrnlModInfoRelease(RTKRNLMODINFO hKrnlModInfo) +{ + PRTKRNLMODINFOINT pThis = hKrnlModInfo; + if (!pThis) + return 0; + AssertPtrReturn(pThis, UINT32_MAX); + + uint32_t cRefs = ASMAtomicDecU32(&pThis->cRefs); + AssertMsg(cRefs < _1M, ("%#x %p\n", cRefs, pThis)); + if (cRefs == 0) + rtKrnlModInfoDestroy(pThis); + return cRefs; +} + + +RTDECL(uint32_t) RTKrnlModInfoGetRefCnt(RTKRNLMODINFO hKrnlModInfo) +{ + PRTKRNLMODINFOINT pThis = hKrnlModInfo; + AssertPtrReturn(pThis, 0); + + return pThis->cRefKrnlMod; +} + + +RTDECL(const char *) RTKrnlModInfoGetName(RTKRNLMODINFO hKrnlModInfo) +{ + PRTKRNLMODINFOINT pThis = hKrnlModInfo; + AssertPtrReturn(pThis, NULL); + + return &pThis->achName[0]; +} + + +RTDECL(const char *) RTKrnlModInfoGetFilePath(RTKRNLMODINFO hKrnlModInfo) +{ + PRTKRNLMODINFOINT pThis = hKrnlModInfo; + AssertPtrReturn(pThis, NULL); + + return NULL; +} + + +RTDECL(size_t) RTKrnlModInfoGetSize(RTKRNLMODINFO hKrnlModInfo) +{ + PRTKRNLMODINFOINT pThis = hKrnlModInfo; + AssertPtrReturn(pThis, 0); + + return pThis->cbKrnlMod; +} + + +RTDECL(RTR0UINTPTR) RTKrnlModInfoGetLoadAddr(RTKRNLMODINFO hKrnlModInfo) +{ + PRTKRNLMODINFOINT pThis = hKrnlModInfo; + AssertPtrReturn(pThis, 0); + + return pThis->uLoadAddr; +} + + +RTDECL(int) RTKrnlModInfoQueryRefModInfo(RTKRNLMODINFO hKrnlModInfo, uint32_t idx, + PRTKRNLMODINFO phKrnlModInfoRef) +{ + RT_NOREF3(hKrnlModInfo, idx, phKrnlModInfoRef); + return VERR_NOT_IMPLEMENTED; +} diff --git a/src/VBox/Runtime/r3/linux/mp-linux.cpp b/src/VBox/Runtime/r3/linux/mp-linux.cpp new file mode 100644 index 00000000..b952ae04 --- /dev/null +++ b/src/VBox/Runtime/r3/linux/mp-linux.cpp @@ -0,0 +1,318 @@ +/* $Id: mp-linux.cpp $ */ +/** @file + * IPRT 
- Multiprocessor, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_SYSTEM +#include <stdio.h> +#include <errno.h> + +#include <iprt/mp.h> +#include "internal/iprt.h" + +#include <iprt/alloca.h> +#include <iprt/cpuset.h> +#include <iprt/assert.h> +#include <iprt/string.h> +#include <iprt/linux/sysfs.h> + + +/** + * Internal worker that determines the max possible CPU count. + * + * @returns Max cpus. 
+ */ +static RTCPUID rtMpLinuxMaxCpus(void) +{ +#if 0 /* this doesn't do the right thing :-/ */ + int cMax = sysconf(_SC_NPROCESSORS_CONF); + Assert(cMax >= 1); + return cMax; +#else + static uint32_t s_cMax = 0; + if (!s_cMax) + { + int cMax = 1; + for (unsigned iCpu = 0; iCpu < RTCPUSET_MAX_CPUS; iCpu++) + if (RTLinuxSysFsExists("devices/system/cpu/cpu%d", iCpu)) + cMax = iCpu + 1; + ASMAtomicUoWriteU32((uint32_t volatile *)&s_cMax, cMax); + return cMax; + } + return s_cMax; +#endif +} + +/** + * Internal worker that picks the processor speed in MHz from /proc/cpuinfo. + * + * @returns CPU frequency. + */ +static uint32_t rtMpLinuxGetFrequency(RTCPUID idCpu) +{ + FILE *pFile = fopen("/proc/cpuinfo", "r"); + if (!pFile) + return 0; + + char sz[256]; + RTCPUID idCpuFound = NIL_RTCPUID; + uint32_t Frequency = 0; + while (fgets(sz, sizeof(sz), pFile)) + { + char *psz; + if ( !strncmp(sz, RT_STR_TUPLE("processor")) + && (sz[10] == ' ' || sz[10] == '\t' || sz[10] == ':') + && (psz = strchr(sz, ':'))) + { + psz += 2; + int64_t iCpu; + int rc = RTStrToInt64Ex(psz, NULL, 0, &iCpu); + if (RT_SUCCESS(rc)) + idCpuFound = iCpu; + } + else if ( idCpu == idCpuFound + && !strncmp(sz, RT_STR_TUPLE("cpu MHz")) + && (sz[10] == ' ' || sz[10] == '\t' || sz[10] == ':') + && (psz = strchr(sz, ':'))) + { + psz += 2; + int64_t v; + int rc = RTStrToInt64Ex(psz, &psz, 0, &v); + if (RT_SUCCESS(rc)) + { + Frequency = v; + break; + } + } + } + fclose(pFile); + return Frequency; +} + + +/** @todo RTmpCpuId(). */ + +RTDECL(int) RTMpCpuIdToSetIndex(RTCPUID idCpu) +{ + return idCpu < rtMpLinuxMaxCpus() ? (int)idCpu : -1; +} + + +RTDECL(RTCPUID) RTMpCpuIdFromSetIndex(int iCpu) +{ + return (unsigned)iCpu < rtMpLinuxMaxCpus() ? iCpu : NIL_RTCPUID; +} + + +RTDECL(RTCPUID) RTMpGetMaxCpuId(void) +{ + return rtMpLinuxMaxCpus() - 1; +} + + +RTDECL(bool) RTMpIsCpuOnline(RTCPUID idCpu) +{ + /** @todo check if there is a simpler interface than this... 
*/ + int64_t i = 0; + int rc = RTLinuxSysFsReadIntFile(0, &i, "devices/system/cpu/cpu%d/online", (int)idCpu); + if ( RT_FAILURE(rc) + && RTLinuxSysFsExists("devices/system/cpu/cpu%d", (int)idCpu)) + { + /** @todo Assert(!RTLinuxSysFsExists("devices/system/cpu/cpu%d/online", + * (int)idCpu)); + * Unfortunately, the online file wasn't always world readable (centos + * 2.6.18-164). */ + i = 1; + rc = VINF_SUCCESS; + } + + AssertMsg(i == 0 || i == -1 || i == 1, ("i=%d\n", i)); + return RT_SUCCESS(rc) && i != 0; +} + + +RTDECL(bool) RTMpIsCpuPossible(RTCPUID idCpu) +{ + /** @todo check this up with hotplugging! */ + return RTLinuxSysFsExists("devices/system/cpu/cpu%d", (int)idCpu); +} + + +RTDECL(PRTCPUSET) RTMpGetSet(PRTCPUSET pSet) +{ + RTCpuSetEmpty(pSet); + RTCPUID cMax = rtMpLinuxMaxCpus(); + for (RTCPUID idCpu = 0; idCpu < cMax; idCpu++) + if (RTMpIsCpuPossible(idCpu)) + RTCpuSetAdd(pSet, idCpu); + return pSet; +} + + +RTDECL(RTCPUID) RTMpGetCount(void) +{ + RTCPUSET Set; + RTMpGetSet(&Set); + return RTCpuSetCount(&Set); +} + + +RTDECL(RTCPUID) RTMpGetCoreCount(void) +{ + RTCPUID cMax = rtMpLinuxMaxCpus(); + uint32_t *paidCores = (uint32_t *)alloca(sizeof(paidCores[0]) * (cMax + 1)); + uint32_t *paidPckgs = (uint32_t *)alloca(sizeof(paidPckgs[0]) * (cMax + 1)); + uint32_t cCores = 0; + for (RTCPUID idCpu = 0; idCpu < cMax; idCpu++) + { + if (RTMpIsCpuPossible(idCpu)) + { + int64_t idCore = 0; + int64_t idPckg = 0; + + int rc = RTLinuxSysFsReadIntFile(0, &idCore, "devices/system/cpu/cpu%d/topology/core_id", (int)idCpu); + if (RT_SUCCESS(rc)) + rc = RTLinuxSysFsReadIntFile(0, &idPckg, "devices/system/cpu/cpu%d/topology/physical_package_id", (int)idCpu); + + if (RT_SUCCESS(rc)) + { + uint32_t i; + + for (i = 0; i < cCores; i++) + if ( paidCores[i] == (uint32_t)idCore + && paidPckgs[i] == (uint32_t)idPckg) + break; + if (i >= cCores) + { + paidCores[cCores] = (uint32_t)idCore; + paidPckgs[cCores] = (uint32_t)idPckg; + cCores++; + } + } + } + } + Assert(cCores > 0); + 
return cCores; +} + + +RTDECL(PRTCPUSET) RTMpGetOnlineSet(PRTCPUSET pSet) +{ + RTCpuSetEmpty(pSet); + RTCPUID cMax = rtMpLinuxMaxCpus(); + for (RTCPUID idCpu = 0; idCpu < cMax; idCpu++) + if (RTMpIsCpuOnline(idCpu)) + RTCpuSetAdd(pSet, idCpu); + return pSet; +} + + +RTDECL(RTCPUID) RTMpGetOnlineCount(void) +{ + RTCPUSET Set; + RTMpGetOnlineSet(&Set); + return RTCpuSetCount(&Set); +} + + +RTDECL(RTCPUID) RTMpGetOnlineCoreCount(void) +{ + RTCPUID cMax = rtMpLinuxMaxCpus(); + uint32_t *paidCores = (uint32_t *)alloca(sizeof(paidCores[0]) * (cMax + 1)); + uint32_t *paidPckgs = (uint32_t *)alloca(sizeof(paidPckgs[0]) * (cMax + 1)); + uint32_t cCores = 0; + for (RTCPUID idCpu = 0; idCpu < cMax; idCpu++) + { + if (RTMpIsCpuOnline(idCpu)) + { + int64_t idCore = 0; + int64_t idPckg = 0; + + int rc = RTLinuxSysFsReadIntFile(0, &idCore, "devices/system/cpu/cpu%d/topology/core_id", (int)idCpu); + if (RT_SUCCESS(rc)) + rc = RTLinuxSysFsReadIntFile(0, &idPckg, "devices/system/cpu/cpu%d/topology/physical_package_id", (int)idCpu); + + if (RT_SUCCESS(rc)) + { + uint32_t i; + + for (i = 0; i < cCores; i++) + if ( paidCores[i] == idCore + && paidPckgs[i] == idPckg) + break; + if (i >= cCores) + { + paidCores[cCores] = idCore; + paidPckgs[cCores] = idPckg; + cCores++; + } + } + } + } + Assert(cCores > 0); + return cCores; +} + + + +RTDECL(uint32_t) RTMpGetCurFrequency(RTCPUID idCpu) +{ + int64_t kHz = 0; + int rc = RTLinuxSysFsReadIntFile(0, &kHz, "devices/system/cpu/cpu%d/cpufreq/cpuinfo_cur_freq", (int)idCpu); + if (RT_FAILURE(rc)) + { + /* + * The file may be just unreadable - in that case use plan B, i.e. + * /proc/cpuinfo to get the data we want. The assumption is that if + * cpuinfo_cur_freq doesn't exist then the speed won't change, and + * thus cur == max. If it does exist then cpuinfo contains the + * current frequency. 
+ */ + kHz = rtMpLinuxGetFrequency(idCpu) * 1000; + } + return (kHz + 999) / 1000; +} + + +RTDECL(uint32_t) RTMpGetMaxFrequency(RTCPUID idCpu) +{ + int64_t kHz = 0; + int rc = RTLinuxSysFsReadIntFile(0, &kHz, "devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", (int)idCpu); + if (RT_FAILURE(rc)) + { + /* + * Check if the file isn't there - if it is there, then /proc/cpuinfo + * would provide current frequency information, which is wrong. + */ + if (!RTLinuxSysFsExists("devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", (int)idCpu)) + kHz = rtMpLinuxGetFrequency(idCpu) * 1000; + else + kHz = 0; + } + return (kHz + 999) / 1000; +} diff --git a/src/VBox/Runtime/r3/linux/rtProcInitExePath-linux.cpp b/src/VBox/Runtime/r3/linux/rtProcInitExePath-linux.cpp new file mode 100644 index 00000000..1a0d5aee --- /dev/null +++ b/src/VBox/Runtime/r3/linux/rtProcInitExePath-linux.cpp @@ -0,0 +1,69 @@ +/* $Id: rtProcInitExePath-linux.cpp $ */ +/** @file + * IPRT - rtProcInitName, Linux. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. 
+ * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_PROCESS +#include <unistd.h> +#include <errno.h> + +#include <iprt/string.h> +#include <iprt/assert.h> +#include <iprt/errcore.h> +#include <iprt/path.h> +#include "internal/process.h" +#include "internal/path.h" + + +DECLHIDDEN(int) rtProcInitExePath(char *pszPath, size_t cchPath) +{ + /* + * Read the /proc/self/exe link, convert to native and return it. + */ + int cchLink = readlink("/proc/self/exe", pszPath, cchPath - 1); + if (cchLink > 0 && (size_t)cchLink <= cchPath - 1) + { + pszPath[cchLink] = '\0'; + + char const *pszTmp; + int rc = rtPathFromNative(&pszTmp, pszPath, NULL); + AssertMsgRCReturn(rc, ("rc=%Rrc pszLink=\"%s\"\nhex: %.*Rhxs\n", rc, pszPath, cchLink, pszPath), rc); + if (pszTmp != pszPath) + { + rc = RTStrCopy(pszPath, cchPath, pszTmp); + rtPathFreeIprt(pszTmp, pszPath); + } + return rc; + } + + int err = errno; + int rc = RTErrConvertFromErrno(err); + AssertMsgFailed(("rc=%Rrc err=%d cchLink=%d\n", rc, err, cchLink)); + return rc; +} + diff --git a/src/VBox/Runtime/r3/linux/sched-linux.cpp b/src/VBox/Runtime/r3/linux/sched-linux.cpp new file mode 100644 index 00000000..5bd223ac --- /dev/null +++ b/src/VBox/Runtime/r3/linux/sched-linux.cpp @@ -0,0 +1,717 @@ +/* $Id: sched-linux.cpp $ */ +/** @file + * IPRT - Scheduling, POSIX. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. 
This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + +/* + * !WARNING! + * + * When talking about lowering and raising priority, we do *NOT* refer to + * the common direction priority values takes on unix systems (lower means + * higher). So, when we raise the priority of a linux thread the nice + * value will decrease, and when we lower the priority the nice value + * will increase. Confusing, right? + * + * !WARNING! + */ + + + +/** @def THREAD_LOGGING + * Be very careful with enabling this, it may cause deadlocks when combined + * with the 'thread' logging prefix. 
+ */ +#ifdef DOXYGEN_RUNNING +# define THREAD_LOGGING +#endif + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_THREAD +#include <errno.h> +#include <pthread.h> +#include <limits.h> +#include <sched.h> +#include <unistd.h> +#include <sys/resource.h> + +#include <iprt/thread.h> +#include <iprt/process.h> +#include <iprt/semaphore.h> +#include <iprt/string.h> +#include <iprt/assert.h> +#include <iprt/log.h> +#include <iprt/errcore.h> +#include "internal/sched.h" +#include "internal/thread.h" + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ + +/** Array scheduler attributes corresponding to each of the thread types. + * @internal */ +typedef struct PROCPRIORITYTYPE +{ + /** For sanity include the array index. */ + RTTHREADTYPE enmType; + /** The thread priority or nice delta - depends on which priority type. */ + int iPriority; +} PROCPRIORITYTYPE; + + +/** + * Configuration of one priority. + * @internal + */ +typedef struct +{ + /** The priority. */ + RTPROCPRIORITY enmPriority; + /** The name of this priority. */ + const char *pszName; + /** The process nice value. */ + int iNice; + /** The delta applied to the iPriority value. */ + int iDelta; + /** Array scheduler attributes corresponding to each of the thread types. */ + const PROCPRIORITYTYPE *paTypes; +} PROCPRIORITY; + + +/** + * Saved priority settings + * @internal + */ +typedef struct +{ + /** Process priority. */ + int iPriority; + /** Process level. 
*/ + struct sched_param SchedParam; + /** Process level. */ + int iPolicy; + /** pthread level. */ + struct sched_param PthreadSchedParam; + /** pthread level. */ + int iPthreadPolicy; +} SAVEDPRIORITY, *PSAVEDPRIORITY; + + +/** + * Priorities for checking by separate thread + * @internal + */ +typedef struct +{ + /** The current thread priority to assume first. */ + int iCurrent; + /** The thread priority to try set afterwards. */ + int iNew; +} VALIDATORPRIORITYPAIR, *PVALIDATORPRIORITYPAIR; + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +/** + * Deltas for a process in which we are not restricted + * to only be lowering the priority. + */ +static const PROCPRIORITYTYPE g_aTypesLinuxFree[RTTHREADTYPE_END] = +{ + { RTTHREADTYPE_INVALID, -999999999 }, + { RTTHREADTYPE_INFREQUENT_POLLER, +3 }, + { RTTHREADTYPE_MAIN_HEAVY_WORKER, +2 }, + { RTTHREADTYPE_EMULATION, +1 }, + { RTTHREADTYPE_DEFAULT, 0 }, + { RTTHREADTYPE_GUI, 0 }, + { RTTHREADTYPE_MAIN_WORKER, 0 }, + { RTTHREADTYPE_VRDP_IO, -1 }, + { RTTHREADTYPE_DEBUGGER, -1 }, + { RTTHREADTYPE_MSG_PUMP, -2 }, + { RTTHREADTYPE_IO, -3 }, + { RTTHREADTYPE_TIMER, -4 } +}; + +/** + * Deltas for a process in which we are restricted and can only lower the priority. + */ +static const PROCPRIORITYTYPE g_aTypesLinuxRestricted[RTTHREADTYPE_END] = +{ + { RTTHREADTYPE_INVALID, -999999999 }, + { RTTHREADTYPE_INFREQUENT_POLLER, +3 }, + { RTTHREADTYPE_MAIN_HEAVY_WORKER, +2 }, + { RTTHREADTYPE_EMULATION, +1 }, + { RTTHREADTYPE_DEFAULT, 0 }, + { RTTHREADTYPE_GUI, 0 }, + { RTTHREADTYPE_MAIN_WORKER, 0 }, + { RTTHREADTYPE_VRDP_IO, 0 }, + { RTTHREADTYPE_DEBUGGER, 0 }, + { RTTHREADTYPE_MSG_PUMP, 0 }, + { RTTHREADTYPE_IO, 0 }, + { RTTHREADTYPE_TIMER, 0 } +}; + +/** + * All threads have the same priority. 
+ * + * This is typically chosen when we find that we can't raise the priority + * to the process default of a thread created by a low priority thread. + */ +static const PROCPRIORITYTYPE g_aTypesLinuxFlat[RTTHREADTYPE_END] = +{ + { RTTHREADTYPE_INVALID, -999999999 }, + { RTTHREADTYPE_INFREQUENT_POLLER, 0 }, + { RTTHREADTYPE_MAIN_HEAVY_WORKER, 0 }, + { RTTHREADTYPE_EMULATION, 0 }, + { RTTHREADTYPE_DEFAULT, 0 }, + { RTTHREADTYPE_GUI, 0 }, + { RTTHREADTYPE_MAIN_WORKER, 0 }, + { RTTHREADTYPE_VRDP_IO, 0 }, + { RTTHREADTYPE_DEBUGGER, 0 }, + { RTTHREADTYPE_MSG_PUMP, 0 }, + { RTTHREADTYPE_IO, 0 }, + { RTTHREADTYPE_TIMER, 0 } +}; + +/** + * Process and thread level priority, full access at thread level. + */ +static const PROCPRIORITY g_aUnixConfigs[] = +{ + { RTPROCPRIORITY_FLAT, "Flat", 0, 0, g_aTypesLinuxFlat }, + { RTPROCPRIORITY_LOW, "Low", 9, 9, g_aTypesLinuxFree }, + { RTPROCPRIORITY_LOW, "Low", 9, 9, g_aTypesLinuxFlat }, + { RTPROCPRIORITY_LOW, "Low", 15, 15, g_aTypesLinuxFree }, + { RTPROCPRIORITY_LOW, "Low", 15, 15, g_aTypesLinuxFlat }, + { RTPROCPRIORITY_LOW, "Low", 17, 17, g_aTypesLinuxFree }, + { RTPROCPRIORITY_LOW, "Low", 17, 17, g_aTypesLinuxFlat }, + { RTPROCPRIORITY_LOW, "Low", 19, 19, g_aTypesLinuxFlat }, + { RTPROCPRIORITY_LOW, "Low", 9, 9, g_aTypesLinuxRestricted }, + { RTPROCPRIORITY_LOW, "Low", 15, 15, g_aTypesLinuxRestricted }, + { RTPROCPRIORITY_LOW, "Low", 17, 17, g_aTypesLinuxRestricted }, + { RTPROCPRIORITY_NORMAL, "Normal", 0, 0, g_aTypesLinuxFree }, + { RTPROCPRIORITY_NORMAL, "Normal", 0, 0, g_aTypesLinuxRestricted }, + { RTPROCPRIORITY_NORMAL, "Normal", 0, 0, g_aTypesLinuxFlat }, + { RTPROCPRIORITY_HIGH, "High", -9, -9, g_aTypesLinuxFree }, + { RTPROCPRIORITY_HIGH, "High", -7, -7, g_aTypesLinuxFree }, + { RTPROCPRIORITY_HIGH, "High", -5, -5, g_aTypesLinuxFree }, + { RTPROCPRIORITY_HIGH, "High", -3, -3, g_aTypesLinuxFree }, + { RTPROCPRIORITY_HIGH, "High", -1, -1, g_aTypesLinuxFree }, + { RTPROCPRIORITY_HIGH, "High", -9, -9, 
g_aTypesLinuxRestricted }, + { RTPROCPRIORITY_HIGH, "High", -7, -7, g_aTypesLinuxRestricted }, + { RTPROCPRIORITY_HIGH, "High", -5, -5, g_aTypesLinuxRestricted }, + { RTPROCPRIORITY_HIGH, "High", -3, -3, g_aTypesLinuxRestricted }, + { RTPROCPRIORITY_HIGH, "High", -1, -1, g_aTypesLinuxRestricted }, + { RTPROCPRIORITY_HIGH, "High", -9, -9, g_aTypesLinuxFlat }, + { RTPROCPRIORITY_HIGH, "High", -7, -7, g_aTypesLinuxFlat }, + { RTPROCPRIORITY_HIGH, "High", -5, -5, g_aTypesLinuxFlat }, + { RTPROCPRIORITY_HIGH, "High", -3, -3, g_aTypesLinuxFlat }, + { RTPROCPRIORITY_HIGH, "High", -1, -1, g_aTypesLinuxFlat } +}; + +/** + * The dynamic default priority configuration. + * + * This will be recalulated at runtime depending on what the + * system allow us to do and what the current priority is. + */ +static PROCPRIORITY g_aDefaultPriority = +{ + RTPROCPRIORITY_LOW, "Default", 0, 0, g_aTypesLinuxRestricted +}; + +/** Pointer to the current priority configuration. */ +static const PROCPRIORITY *g_pProcessPriority = &g_aDefaultPriority; + +/** Set if we can raise the priority of a thread beyond the default. + * + * It might mean we have the CAP_SYS_NICE capability or that the + * process's RLIMIT_NICE is higher than the priority of the thread + * calculating the defaults. + */ +static bool g_fCanRaisePriority = false; + +/** Set if we can restore the priority after having temporarily lowered or raised it. */ +static bool g_fCanRestorePriority = false; + +/** Set if we can NOT raise the priority to the process default in a thread + * created by a thread running below the process default. + */ +static bool g_fScrewedUpMaxPriorityLimitInheritance = true; + +/** The highest priority we can set. */ +static int g_iMaxPriority = 0; + +/** The lower priority we can set. */ +static int g_iMinPriority = 19; + +/** Set when we've successfully determined the capabilities of the process and kernel. 
*/ +static bool g_fInitialized = false; + + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ + + +/** + * Saves all the scheduling attributes we can think of. + */ +static void rtSchedNativeSave(PSAVEDPRIORITY pSave) +{ + memset(pSave, 0xff, sizeof(*pSave)); + + errno = 0; + pSave->iPriority = getpriority(PRIO_PROCESS, 0 /* current process */); + Assert(errno == 0); + + errno = 0; + sched_getparam(0 /* current process */, &pSave->SchedParam); + Assert(errno == 0); + + errno = 0; + pSave->iPolicy = sched_getscheduler(0 /* current process */); + Assert(errno == 0); + + int rc = pthread_getschedparam(pthread_self(), &pSave->iPthreadPolicy, &pSave->PthreadSchedParam); + Assert(rc == 0); NOREF(rc); +} + + +/** + * Restores scheduling attributes. + * Most of this won't work right, but anyway... + */ +static void rtSchedNativeRestore(PSAVEDPRIORITY pSave) +{ + setpriority(PRIO_PROCESS, 0, pSave->iPriority); + sched_setscheduler(0, pSave->iPolicy, &pSave->SchedParam); + sched_setparam(0, &pSave->SchedParam); + pthread_setschedparam(pthread_self(), pSave->iPthreadPolicy, &pSave->PthreadSchedParam); +} + + +/** + * Called on the priority proxy thread if requested running, otherwise + * rtSchedRunThread() calls it directly. + */ +static DECLCALLBACK(int) rtSchedRunThreadCallback(pthread_t *pThread, void *(*pfnThread)(void *pvArg), void *pvArg) +{ + int rc = pthread_create(pThread, NULL, pfnThread, pvArg); + if (!rc) + return VINF_SUCCESS; + return RTErrConvertFromErrno(rc); +} + + +/** + * Starts a worker thread and wait for it to complete. + * + * We cannot use RTThreadCreate since we're already owner of the RW lock. 
+ */ +static int rtSchedRunThread(void *(*pfnThread)(void *pvArg), void *pvArg, bool fUsePriorityProxy) +{ + /* + * Create the thread. + */ + pthread_t Thread; + int rc; +#ifndef RTTHREAD_POSIX_WITH_CREATE_PRIORITY_PROXY + RT_NOREF(fUsePriorityProxy); +#else + if ( fUsePriorityProxy + && rtThreadPosixPriorityProxyStart()) + rc = rtThreadPosixPriorityProxyCall(NULL, (PFNRT)rtSchedRunThreadCallback, 3, &Thread, pfnThread, pvArg); + else +#endif + rc = rtSchedRunThreadCallback(&Thread, pfnThread, pvArg); + if (RT_SUCCESS(rc)) + { + /* + * Wait for the thread to finish. + */ + void *pvRet = (void *)-1; + do + { + rc = pthread_join(Thread, &pvRet); + } while (rc == EINTR); + if (rc) + return RTErrConvertFromErrno(rc); + return (int)(uintptr_t)pvRet; + } + return rc; +} + + +static void rtSchedDumpPriority(void) +{ +#ifdef THREAD_LOGGING + Log(("Priority: g_fCanRaisePriority=%RTbool g_fCanRestorePriority=%RTbool g_fScrewedUpMaxPriorityLimitInheritance=%RTbool\n", + g_fCanRaisePriority, g_fCanRestorePriority, g_fScrewedUpMaxPriorityLimitInheritance)); + Log(("Priority: g_iMaxPriority=%d g_iMinPriority=%d\n", g_iMaxPriority, g_iMinPriority)); + Log(("Priority: enmPriority=%d \"%s\" iNice=%d iDelta=%d\n", + g_pProcessPriority->enmPriority, + g_pProcessPriority->pszName, + g_pProcessPriority->iNice, + g_pProcessPriority->iDelta)); + Log(("Priority: %2d INFREQUENT_POLLER = %d\n", RTTHREADTYPE_INFREQUENT_POLLER, g_pProcessPriority->paTypes[RTTHREADTYPE_INFREQUENT_POLLER].iPriority)); + Log(("Priority: %2d MAIN_HEAVY_WORKER = %d\n", RTTHREADTYPE_MAIN_HEAVY_WORKER, g_pProcessPriority->paTypes[RTTHREADTYPE_MAIN_HEAVY_WORKER].iPriority)); + Log(("Priority: %2d EMULATION = %d\n", RTTHREADTYPE_EMULATION , g_pProcessPriority->paTypes[RTTHREADTYPE_EMULATION ].iPriority)); + Log(("Priority: %2d DEFAULT = %d\n", RTTHREADTYPE_DEFAULT , g_pProcessPriority->paTypes[RTTHREADTYPE_DEFAULT ].iPriority)); + Log(("Priority: %2d GUI = %d\n", RTTHREADTYPE_GUI , 
g_pProcessPriority->paTypes[RTTHREADTYPE_GUI ].iPriority)); + Log(("Priority: %2d MAIN_WORKER = %d\n", RTTHREADTYPE_MAIN_WORKER , g_pProcessPriority->paTypes[RTTHREADTYPE_MAIN_WORKER ].iPriority)); + Log(("Priority: %2d VRDP_IO = %d\n", RTTHREADTYPE_VRDP_IO , g_pProcessPriority->paTypes[RTTHREADTYPE_VRDP_IO ].iPriority)); + Log(("Priority: %2d DEBUGGER = %d\n", RTTHREADTYPE_DEBUGGER , g_pProcessPriority->paTypes[RTTHREADTYPE_DEBUGGER ].iPriority)); + Log(("Priority: %2d MSG_PUMP = %d\n", RTTHREADTYPE_MSG_PUMP , g_pProcessPriority->paTypes[RTTHREADTYPE_MSG_PUMP ].iPriority)); + Log(("Priority: %2d IO = %d\n", RTTHREADTYPE_IO , g_pProcessPriority->paTypes[RTTHREADTYPE_IO ].iPriority)); + Log(("Priority: %2d TIMER = %d\n", RTTHREADTYPE_TIMER , g_pProcessPriority->paTypes[RTTHREADTYPE_TIMER ].iPriority)); +#endif +} + + +/** + * This just checks if it can raise the priority after having been + * created by a thread with a low priority. + * + * @returns zero on success, non-zero on failure. + * @param pvUser The priority of the parent before it was lowered (cast to int). + */ +static void *rtSchedNativeSubProberThread(void *pvUser) +{ + int iPriority = getpriority(PRIO_PROCESS, 0); + Assert(iPriority == g_iMinPriority); + + if (setpriority(PRIO_PROCESS, 0, iPriority + 1)) + return (void *)-1; + if (setpriority(PRIO_PROCESS, 0, (int)(intptr_t)pvUser)) + return (void *)-1; + return (void *)0; +} + + +/** + * The prober thread. + * We don't want to mess with the priority of the calling thread. + * + * @remark This is pretty presumptive stuff, but if it works on Linux and + * FreeBSD it does what I want. + */ +static void *rtSchedNativeProberThread(void *pvUser) +{ + NOREF(pvUser); + SAVEDPRIORITY SavedPriority; + rtSchedNativeSave(&SavedPriority); + + /* + * Check if we can get higher priority (typically only root can do this). + * (Won't work right if our priority is -19 to start with, but what the heck.) + * + * We assume that the priority range is -19 to 19. 
Should probably find the right + * define for this. + */ + int iStart = getpriority(PRIO_PROCESS, 0); + int i = iStart; + while (i-- > -20) + if (setpriority(PRIO_PROCESS, 0, i)) + break; + g_iMaxPriority = getpriority(PRIO_PROCESS, 0); + g_fCanRaisePriority = g_iMaxPriority < iStart; + g_fCanRestorePriority = setpriority(PRIO_PROCESS, 0, iStart) == 0; + + /* + * Check if we temporarily lower the thread priority. + * Again, we assume we're not at the extreme end of the priority scale. + */ + iStart = getpriority(PRIO_PROCESS, 0); + i = iStart; + while (i++ < 19) + if (setpriority(PRIO_PROCESS, 0, i)) + break; + g_iMinPriority = getpriority(PRIO_PROCESS, 0); + if ( setpriority(PRIO_PROCESS, 0, iStart) + || getpriority(PRIO_PROCESS, 0) != iStart) + g_fCanRestorePriority = false; + if (g_iMinPriority == g_iMaxPriority) + g_fCanRestorePriority = g_fCanRaisePriority = false; + + /* + * Check what happens to child threads when the parent lowers the + * priority when it's being created. + */ + iStart = getpriority(PRIO_PROCESS, 0); + g_fScrewedUpMaxPriorityLimitInheritance = true; + if ( g_fCanRestorePriority + && !setpriority(PRIO_PROCESS, 0, g_iMinPriority) + && iStart != g_iMinPriority) + { + if (rtSchedRunThread(rtSchedNativeSubProberThread, (void *)(intptr_t)iStart, false /*fUsePriorityProxy*/) == 0) + g_fScrewedUpMaxPriorityLimitInheritance = false; + } + + /* done */ + rtSchedNativeRestore(&SavedPriority); + return (void *)VINF_SUCCESS; +} + + +/** + * Calculate the scheduling properties for all the threads in the default + * process priority, assuming the current thread have the type enmType. + * + * @returns iprt status code. + * @param enmType The thread type to be assumed for the current thread. + */ +DECLHIDDEN(int) rtSchedNativeCalcDefaultPriority(RTTHREADTYPE enmType) +{ + Assert(enmType > RTTHREADTYPE_INVALID && enmType < RTTHREADTYPE_END); + + /* + * First figure out what's we're allowed to do in this process. 
+ */ + if (!g_fInitialized) + { + int iPriority = getpriority(PRIO_PROCESS, 0); +#ifdef RLIMIT_RTPRIO + /** @todo */ +#endif + int rc = rtSchedRunThread(rtSchedNativeProberThread, NULL, false /*fUsePriorityProxy*/); + if (RT_FAILURE(rc)) + return rc; + Assert(getpriority(PRIO_PROCESS, 0) == iPriority); NOREF(iPriority); + g_fInitialized = true; + } + + /* + * Select the right priority type table and update the default + * process priority structure. + */ + if (g_fCanRaisePriority && g_fCanRestorePriority && !g_fScrewedUpMaxPriorityLimitInheritance) + g_aDefaultPriority.paTypes = &g_aTypesLinuxFree[0]; + else if (!g_fCanRaisePriority && g_fCanRestorePriority && !g_fScrewedUpMaxPriorityLimitInheritance) + g_aDefaultPriority.paTypes = &g_aTypesLinuxRestricted[0]; + else + g_aDefaultPriority.paTypes = &g_aTypesLinuxFlat[0]; + Assert(enmType == g_aDefaultPriority.paTypes[enmType].enmType); + + int iPriority = getpriority(PRIO_PROCESS, 0 /* current process */); + g_aDefaultPriority.iNice = iPriority - g_aDefaultPriority.paTypes[enmType].iPriority; + g_aDefaultPriority.iDelta = g_aDefaultPriority.iNice; + + rtSchedDumpPriority(); + return VINF_SUCCESS; +} + + +/** + * The process priority validator thread. + * (We don't want to mess with the priority of the calling thread.) + */ +static void *rtSchedNativeValidatorThread(void *pvUser) +{ + PVALIDATORPRIORITYPAIR pPrioPair = (PVALIDATORPRIORITYPAIR)pvUser; + SAVEDPRIORITY SavedPriority; + rtSchedNativeSave(&SavedPriority); + + int rc = VINF_SUCCESS; + + /* + * Set the priority to the current value for specified thread type, but + * only if we have any threads of this type (caller checked - INT_MAX). + */ + if (pPrioPair->iCurrent != INT_MAX) + if (setpriority(PRIO_PROCESS, 0, pPrioPair->iCurrent)) + rc = RTErrConvertFromErrno(errno); + + /* + * Try set the new priority. 
+ */ + if (RT_SUCCESS(rc) && setpriority(PRIO_PROCESS, 0, pPrioPair->iNew)) + rc = RTErrConvertFromErrno(errno); + + /* done */ + rtSchedNativeRestore(&SavedPriority); + return (void *)(intptr_t)rc; +} + + +/** + * Validates the ability to apply suggested priority scheme. + * + * The function checks that we're able to apply all the thread types in the + * suggested priority scheme. + * + * @returns iprt status code. + * @param pCfg The priority scheme to validate. + * @param fHavePriorityProxy Set if we've got a priority proxy thread, + * otherwise clear. + */ +static int rtSchedNativeCheckThreadTypes(const PROCPRIORITY *pCfg, bool fHavePriorityProxy) +{ + int i = RTTHREADTYPE_END; + while (--i > RTTHREADTYPE_INVALID) + { + VALIDATORPRIORITYPAIR PrioPair; + PrioPair.iCurrent = g_pProcessPriority->paTypes[i].iPriority + g_pProcessPriority->iDelta; + PrioPair.iNew = pCfg->paTypes[i].iPriority + pCfg->iDelta; + if (g_acRTThreadTypeStats[i] == 0) + PrioPair.iCurrent = INT_MAX; + +#ifdef RT_STRICT + int const iPriority = getpriority(PRIO_PROCESS, 0); +#endif + int rc = rtSchedRunThread(rtSchedNativeValidatorThread, &PrioPair, fHavePriorityProxy /*fUsePriorityProxy*/); + Assert(getpriority(PRIO_PROCESS, 0) == iPriority); + + if (RT_FAILURE(rc)) + return rc; + } + return VINF_SUCCESS; +} + + +/** + * Validates and sets the process priority. + * + * This will check that all rtThreadNativeSetPriority() will success for all the + * thread types when applied to the current thread. + * + * @returns iprt status code. + * @param enmPriority The priority to validate and set. + */ +DECLHIDDEN(int) rtProcNativeSetPriority(RTPROCPRIORITY enmPriority) +{ + Assert(enmPriority > RTPROCPRIORITY_INVALID && enmPriority < RTPROCPRIORITY_LAST); + +#ifdef RTTHREAD_POSIX_WITH_CREATE_PRIORITY_PROXY + /* + * Make sure the proxy creation thread is started so we don't 'lose' our + * initial priority if it's lowered. 
+ */ + bool const fHavePriorityProxy = rtThreadPosixPriorityProxyStart(); +#else + bool const fHavePriorityProxy = false; +#endif + + int rc; + if (enmPriority == RTPROCPRIORITY_DEFAULT) + { + /* + * If we've lowered priority since the process started, it may be impossible + * to raise it again for existing thread (new threads will work fine). + */ + rc = rtSchedNativeCheckThreadTypes(&g_aDefaultPriority, fHavePriorityProxy); + if (RT_SUCCESS(rc)) + g_pProcessPriority = &g_aDefaultPriority; + } + else + { + /* + * Find a configuration which matches and can be applied. + */ + rc = VERR_NOT_FOUND; + for (unsigned i = 0; i < RT_ELEMENTS(g_aUnixConfigs); i++) + if (g_aUnixConfigs[i].enmPriority == enmPriority) + { + int rc2 = rtSchedNativeCheckThreadTypes(&g_aUnixConfigs[i], fHavePriorityProxy); + if (RT_SUCCESS(rc2)) + { + g_pProcessPriority = &g_aUnixConfigs[i]; + rc = VINF_SUCCESS; + break; + } + if (rc == VERR_NOT_FOUND || rc == VERR_ACCESS_DENIED) + rc = rc2; + } + } + +#ifdef THREAD_LOGGING + LogFlow(("rtProcNativeSetPriority: returns %Rrc enmPriority=%d\n", rc, enmPriority)); + rtSchedDumpPriority(); +#endif + return rc; +} + + +/** + * Called on the priority proxy thread if it's running, otherwise + * rtThreadNativeSetPriority calls it directly. + */ +static DECLCALLBACK(int) rtThreadLinuxSetPriorityCallback(PRTTHREADINT pThread, int iPriority) +{ + if (!setpriority(PRIO_PROCESS, pThread->tid, iPriority)) + { + AssertMsg(iPriority == getpriority(PRIO_PROCESS, pThread->tid), + ("iPriority=%d getpriority()=%d\n", iPriority, getpriority(PRIO_PROCESS, pThread->tid))); +#ifdef THREAD_LOGGING + Log(("rtThreadNativeSetPriority: Thread=%p enmType=%d iPriority=%d pid=%d tid=%d\n", + pThread->Core.Key, enmType, iPriority, getpid(), pThread->tid)); +#endif + return VINF_SUCCESS; + } + AssertMsgFailed(("setpriority(,, %d) -> errno=%d rc=%Rrc\n", iPriority, errno, RTErrConvertFromErrno(errno))); + return VINF_SUCCESS; //non-fatal for now. 
+} + + +/** + * Sets the priority of the thread according to the thread type + * and current process priority. + * + * The RTTHREADINT::enmType member has not yet been updated and will be updated by + * the caller on a successful return. + * + * @returns iprt status code. + * @param pThread The thread in question. + * @param enmType The thread type. + */ +DECLHIDDEN(int) rtThreadNativeSetPriority(PRTTHREADINT pThread, RTTHREADTYPE enmType) +{ + /* sanity */ + Assert(enmType > RTTHREADTYPE_INVALID && enmType < RTTHREADTYPE_END); + Assert(enmType == g_pProcessPriority->paTypes[enmType].enmType); + + /* + * The thread ID is zero for alien threads, so skip these or we'd risk + * modifying our own priority. + */ + if (!pThread->tid) + return VINF_SUCCESS; + + /* + * Calculate the thread priority and apply it, preferrably via the priority proxy thread. + */ + int const iPriority = g_pProcessPriority->paTypes[enmType].iPriority + g_pProcessPriority->iDelta; +#ifdef RTTHREAD_POSIX_WITH_CREATE_PRIORITY_PROXY + if (rtThreadPosixPriorityProxyStart()) + return rtThreadPosixPriorityProxyCall(pThread, (PFNRT)rtThreadLinuxSetPriorityCallback, 2, pThread, iPriority); +#endif + return rtThreadLinuxSetPriorityCallback(pThread, iPriority); +} + diff --git a/src/VBox/Runtime/r3/linux/semevent-linux.cpp b/src/VBox/Runtime/r3/linux/semevent-linux.cpp new file mode 100644 index 00000000..941b07e5 --- /dev/null +++ b/src/VBox/Runtime/r3/linux/semevent-linux.cpp @@ -0,0 +1,417 @@ +/* $Id: semevent-linux.cpp $ */ +/** @file + * IPRT - Event Semaphore, Linux (2.6.x+). + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. 
This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + +#include <features.h> +#if __GLIBC_PREREQ(2,6) && !defined(IPRT_WITH_FUTEX_BASED_SEMS) + +/* + * glibc 2.6 fixed a serious bug in the mutex implementation. We wrote this + * linux specific event semaphores code in order to work around the bug. We + * will fall back on the pthread-based implementation if glibc is known to + * contain the bug fix. + * + * The external reference to epoll_pwait is a hack which prevents that we link + * against glibc < 2.6. 
+ */ +#include "../posix/semevent-posix.cpp" +__asm__ (".global epoll_pwait"); + +#else /* glibc < 2.6 */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include <iprt/semaphore.h> +#include "internal/iprt.h" + +#include <iprt/asm.h> +#include <iprt/assert.h> +#include <iprt/err.h> +#include <iprt/lockvalidator.h> +#include <iprt/mem.h> +#include <iprt/time.h> +#include "internal/magics.h" +#include "internal/mem.h" +#include "internal/strict.h" + +#include <errno.h> +#include <limits.h> +#include <pthread.h> +#include <unistd.h> +#include <sys/time.h> +#include <sys/syscall.h> +#if 0 /* With 2.6.17 futex.h has become C++ unfriendly. */ +# include <linux/futex.h> +#else +# define FUTEX_WAIT 0 +# define FUTEX_WAKE 1 +#endif + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * Linux (single wakup) event semaphore. + */ +struct RTSEMEVENTINTERNAL +{ + /** Magic value. */ + intptr_t volatile iMagic; + /** The futex state variable. + * 0 means not signalled. + 1 means signalled. */ + uint32_t volatile fSignalled; + /** The number of waiting threads */ + int32_t volatile cWaiters; +#ifdef RTSEMEVENT_STRICT + /** Signallers. */ + RTLOCKVALRECSHRD Signallers; + /** Indicates that lock validation should be performed. */ + bool volatile fEverHadSignallers; +#endif + /** The creation flags. */ + uint32_t fFlags; +}; + + +/** + * Wrapper for the futex syscall. 
+ */ +static long sys_futex(uint32_t volatile *uaddr, int op, int val, struct timespec *utime, int32_t *uaddr2, int val3) +{ + errno = 0; + long rc = syscall(__NR_futex, uaddr, op, val, utime, uaddr2, val3); + if (rc < 0) + { + Assert(rc == -1); + rc = -errno; + } + return rc; +} + + + +RTDECL(int) RTSemEventCreate(PRTSEMEVENT phEventSem) +{ + return RTSemEventCreateEx(phEventSem, 0 /*fFlags*/, NIL_RTLOCKVALCLASS, NULL); +} + + +RTDECL(int) RTSemEventCreateEx(PRTSEMEVENT phEventSem, uint32_t fFlags, RTLOCKVALCLASS hClass, const char *pszNameFmt, ...) +{ + AssertReturn(!(fFlags & ~(RTSEMEVENT_FLAGS_NO_LOCK_VAL | RTSEMEVENT_FLAGS_BOOTSTRAP_HACK)), VERR_INVALID_PARAMETER); + Assert(!(fFlags & RTSEMEVENT_FLAGS_BOOTSTRAP_HACK) || (fFlags & RTSEMEVENT_FLAGS_NO_LOCK_VAL)); + + /* + * Allocate semaphore handle. + */ + struct RTSEMEVENTINTERNAL *pThis; + if (!(fFlags & RTSEMEVENT_FLAGS_BOOTSTRAP_HACK)) + pThis = (struct RTSEMEVENTINTERNAL *)RTMemAlloc(sizeof(struct RTSEMEVENTINTERNAL)); + else + pThis = (struct RTSEMEVENTINTERNAL *)rtMemBaseAlloc(sizeof(struct RTSEMEVENTINTERNAL)); + if (pThis) + { + pThis->iMagic = RTSEMEVENT_MAGIC; + pThis->cWaiters = 0; + pThis->fSignalled = 0; + pThis->fFlags = fFlags; +#ifdef RTSEMEVENT_STRICT + if (!pszNameFmt) + { + static uint32_t volatile s_iSemEventAnon = 0; + RTLockValidatorRecSharedInit(&pThis->Signallers, hClass, RTLOCKVAL_SUB_CLASS_ANY, pThis, + true /*fSignaller*/, !(fFlags & RTSEMEVENT_FLAGS_NO_LOCK_VAL), + "RTSemEvent-%u", ASMAtomicIncU32(&s_iSemEventAnon) - 1); + } + else + { + va_list va; + va_start(va, pszNameFmt); + RTLockValidatorRecSharedInitV(&pThis->Signallers, hClass, RTLOCKVAL_SUB_CLASS_ANY, pThis, + true /*fSignaller*/, !(fFlags & RTSEMEVENT_FLAGS_NO_LOCK_VAL), + pszNameFmt, va); + va_end(va); + } + pThis->fEverHadSignallers = false; +#else + RT_NOREF(hClass, pszNameFmt); +#endif + + *phEventSem = pThis; + return VINF_SUCCESS; + } + return VERR_NO_MEMORY; +} + + +RTDECL(int) RTSemEventDestroy(RTSEMEVENT 
hEventSem) +{ + /* + * Validate input. + */ + struct RTSEMEVENTINTERNAL *pThis = hEventSem; + if (pThis == NIL_RTSEMEVENT) + return VINF_SUCCESS; + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertReturn(pThis->iMagic == RTSEMEVENT_MAGIC, VERR_INVALID_HANDLE); + + /* + * Invalidate the semaphore and wake up anyone waiting on it. + */ + ASMAtomicXchgSize(&pThis->iMagic, RTSEMEVENT_MAGIC | UINT32_C(0x80000000)); + if (ASMAtomicXchgS32(&pThis->cWaiters, INT32_MIN / 2) > 0) + { + sys_futex(&pThis->fSignalled, FUTEX_WAKE, INT_MAX, NULL, NULL, 0); + usleep(1000); + } + + /* + * Free the semaphore memory and be gone. + */ +#ifdef RTSEMEVENT_STRICT + RTLockValidatorRecSharedDelete(&pThis->Signallers); +#endif + if (!(pThis->fFlags & RTSEMEVENT_FLAGS_BOOTSTRAP_HACK)) + RTMemFree(pThis); + else + rtMemBaseFree(pThis); + return VINF_SUCCESS; +} + + +RTDECL(int) RTSemEventSignal(RTSEMEVENT hEventSem) +{ + /* + * Validate input. + */ + struct RTSEMEVENTINTERNAL *pThis = hEventSem; + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertReturn(pThis->iMagic == RTSEMEVENT_MAGIC, VERR_INVALID_HANDLE); + +#ifdef RTSEMEVENT_STRICT + if (pThis->fEverHadSignallers) + { + int rc9 = RTLockValidatorRecSharedCheckSignaller(&pThis->Signallers, NIL_RTTHREAD); + if (RT_FAILURE(rc9)) + return rc9; + } +#endif + + ASMAtomicWriteU32(&pThis->fSignalled, 1); + if (ASMAtomicReadS32(&pThis->cWaiters) < 1) + return VINF_SUCCESS; + + /* somebody is waiting, try wake up one of them. */ + long cWoken = sys_futex(&pThis->fSignalled, FUTEX_WAKE, 1, NULL, NULL, 0); + if (RT_LIKELY(cWoken >= 0)) + return VINF_SUCCESS; + + if (RT_UNLIKELY(pThis->iMagic != RTSEMEVENT_MAGIC)) + return VERR_SEM_DESTROYED; + + return VERR_INVALID_PARAMETER; +} + + +static int rtSemEventWait(RTSEMEVENT hEventSem, RTMSINTERVAL cMillies, bool fAutoResume) +{ +#ifdef RTSEMEVENT_STRICT + PCRTLOCKVALSRCPOS pSrcPos = NULL; +#endif + + /* + * Validate input. 
+ */ + struct RTSEMEVENTINTERNAL *pThis = hEventSem; + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertReturn(pThis->iMagic == RTSEMEVENT_MAGIC, VERR_INVALID_HANDLE); + + /* + * Quickly check whether it's signaled. + */ + /** @todo this isn't fair if someone is already waiting on it. They should + * have the first go at it! + * (ASMAtomicReadS32(&pThis->cWaiters) == 0 || !cMillies) && ... */ + if (ASMAtomicCmpXchgU32(&pThis->fSignalled, 0, 1)) + return VINF_SUCCESS; + + /* + * Convert the timeout value. + */ + struct timespec ts; + struct timespec *pTimeout = NULL; + uint64_t u64End = 0; /* shut up gcc */ + if (cMillies != RT_INDEFINITE_WAIT) + { + if (!cMillies) + return VERR_TIMEOUT; + ts.tv_sec = cMillies / 1000; + ts.tv_nsec = (cMillies % 1000) * UINT32_C(1000000); + u64End = RTTimeSystemNanoTS() + cMillies * UINT64_C(1000000); + pTimeout = &ts; + } + + ASMAtomicIncS32(&pThis->cWaiters); + + /* + * The wait loop. + */ +#ifdef RTSEMEVENT_STRICT + RTTHREAD hThreadSelf = !(pThis->fFlags & RTSEMEVENT_FLAGS_BOOTSTRAP_HACK) + ? 
RTThreadSelfAutoAdopt() + : RTThreadSelf(); +#else + RTTHREAD hThreadSelf = RTThreadSelf(); +#endif + int rc = VINF_SUCCESS; + for (;;) + { +#ifdef RTSEMEVENT_STRICT + if (pThis->fEverHadSignallers) + { + rc = RTLockValidatorRecSharedCheckBlocking(&pThis->Signallers, hThreadSelf, pSrcPos, false, + cMillies, RTTHREADSTATE_EVENT, true); + if (RT_FAILURE(rc)) + break; + } +#endif + RTThreadBlocking(hThreadSelf, RTTHREADSTATE_EVENT, true); + long lrc = sys_futex(&pThis->fSignalled, FUTEX_WAIT, 0, pTimeout, NULL, 0); + RTThreadUnblocked(hThreadSelf, RTTHREADSTATE_EVENT); + if (RT_UNLIKELY(pThis->iMagic != RTSEMEVENT_MAGIC)) + { + rc = VERR_SEM_DESTROYED; + break; + } + + if (RT_LIKELY(lrc == 0 || lrc == -EWOULDBLOCK)) + { + /* successful wakeup or fSignalled > 0 in the meantime */ + if (ASMAtomicCmpXchgU32(&pThis->fSignalled, 0, 1)) + break; + } + else if (lrc == -ETIMEDOUT) + { + rc = VERR_TIMEOUT; + break; + } + else if (lrc == -EINTR) + { + if (!fAutoResume) + { + rc = VERR_INTERRUPTED; + break; + } + } + else + { + /* this shouldn't happen! 
*/ + AssertMsgFailed(("rc=%ld errno=%d\n", lrc, errno)); + rc = RTErrConvertFromErrno(lrc); + break; + } + /* adjust the relative timeout */ + if (pTimeout) + { + int64_t i64Diff = u64End - RTTimeSystemNanoTS(); + if (i64Diff < 1000) + { + rc = VERR_TIMEOUT; + break; + } + ts.tv_sec = (uint64_t)i64Diff / UINT32_C(1000000000); + ts.tv_nsec = (uint64_t)i64Diff % UINT32_C(1000000000); + } + } + + ASMAtomicDecS32(&pThis->cWaiters); + return rc; +} + + +RTDECL(int) RTSemEventWait(RTSEMEVENT hEventSem, RTMSINTERVAL cMillies) +{ + int rc = rtSemEventWait(hEventSem, cMillies, true); + Assert(rc != VERR_INTERRUPTED); + Assert(rc != VERR_TIMEOUT || cMillies != RT_INDEFINITE_WAIT); + return rc; +} + + +RTDECL(int) RTSemEventWaitNoResume(RTSEMEVENT hEventSem, RTMSINTERVAL cMillies) +{ + return rtSemEventWait(hEventSem, cMillies, false); +} + + +RTDECL(void) RTSemEventSetSignaller(RTSEMEVENT hEventSem, RTTHREAD hThread) +{ +#ifdef RTSEMEVENT_STRICT + struct RTSEMEVENTINTERNAL *pThis = hEventSem; + AssertPtrReturnVoid(pThis); + AssertReturnVoid(pThis->iMagic == RTSEMEVENT_MAGIC); + + ASMAtomicWriteBool(&pThis->fEverHadSignallers, true); + RTLockValidatorRecSharedResetOwner(&pThis->Signallers, hThread, NULL); +#else + RT_NOREF(hEventSem, hThread); +#endif +} + + +RTDECL(void) RTSemEventAddSignaller(RTSEMEVENT hEventSem, RTTHREAD hThread) +{ +#ifdef RTSEMEVENT_STRICT + struct RTSEMEVENTINTERNAL *pThis = hEventSem; + AssertPtrReturnVoid(pThis); + AssertReturnVoid(pThis->iMagic == RTSEMEVENT_MAGIC); + + ASMAtomicWriteBool(&pThis->fEverHadSignallers, true); + RTLockValidatorRecSharedAddOwner(&pThis->Signallers, hThread, NULL); +#else + RT_NOREF(hEventSem, hThread); +#endif +} + + +RTDECL(void) RTSemEventRemoveSignaller(RTSEMEVENT hEventSem, RTTHREAD hThread) +{ +#ifdef RTSEMEVENT_STRICT + struct RTSEMEVENTINTERNAL *pThis = hEventSem; + AssertPtrReturnVoid(pThis); + AssertReturnVoid(pThis->iMagic == RTSEMEVENT_MAGIC); + + RTLockValidatorRecSharedRemoveOwner(&pThis->Signallers, 
hThread); +#else + RT_NOREF(hEventSem, hThread); +#endif +} + +#endif /* glibc < 2.6 || IPRT_WITH_FUTEX_BASED_SEMS */ + diff --git a/src/VBox/Runtime/r3/linux/semeventmulti-linux.cpp b/src/VBox/Runtime/r3/linux/semeventmulti-linux.cpp new file mode 100644 index 00000000..4ce2db02 --- /dev/null +++ b/src/VBox/Runtime/r3/linux/semeventmulti-linux.cpp @@ -0,0 +1,453 @@ +/* $Id: semeventmulti-linux.cpp $ */ +/** @file + * IPRT - Multiple Release Event Semaphore, Linux (2.6.x+). + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +#include <features.h> +#if __GLIBC_PREREQ(2,6) && !defined(IPRT_WITH_FUTEX_BASED_SEMS) + +/* + * glibc 2.6 fixed a serious bug in the mutex implementation. We wrote this + * linux specific event semaphores code in order to work around the bug. As it + * turns out, this code seems to have an unresolved issue (@bugref{2599}), so we'll + * fall back on the pthread based implementation if glibc is known to contain + * the bug fix. 
+ * + * The external reference to epoll_pwait is a hack which prevents that we link + * against glibc < 2.6. + */ +#include "../posix/semeventmulti-posix.cpp" +__asm__ (".global epoll_pwait"); + +#else /* glibc < 2.6 */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include <iprt/semaphore.h> +#include "internal/iprt.h" + +#include <iprt/assert.h> +#include <iprt/asm.h> +#include <iprt/err.h> +#include <iprt/lockvalidator.h> +#include <iprt/mem.h> +#include <iprt/time.h> +#include "internal/magics.h" +#include "internal/strict.h" + + +#include <errno.h> +#include <limits.h> +#include <pthread.h> +#include <unistd.h> +#include <sys/time.h> +#include <sys/syscall.h> +#if 0 /* With 2.6.17 futex.h has become C++ unfriendly. */ +# include <linux/futex.h> +#else +# define FUTEX_WAIT 0 +# define FUTEX_WAKE 1 +#endif + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * Linux multiple wakup event semaphore. + */ +struct RTSEMEVENTMULTIINTERNAL +{ + /** Magic value. */ + uint32_t volatile u32Magic; + /** The futex state variable. + * -1 means signaled. + * 0 means not signaled, no waiters. + * 1 means not signaled and that someone is waiting. + */ + int32_t volatile iState; +#ifdef RTSEMEVENTMULTI_STRICT + /** Signallers. */ + RTLOCKVALRECSHRD Signallers; + /** Indicates that lock validation should be performed. */ + bool volatile fEverHadSignallers; +#endif +}; + + +/** + * Wrapper for the futex syscall. 
+ */ +static long sys_futex(int32_t volatile *uaddr, int op, int val, struct timespec *utime, int32_t *uaddr2, int val3) +{ + errno = 0; + long rc = syscall(__NR_futex, uaddr, op, val, utime, uaddr2, val3); + if (rc < 0) + { + Assert(rc == -1); + rc = -errno; + } + return rc; +} + + +RTDECL(int) RTSemEventMultiCreate(PRTSEMEVENTMULTI phEventMultiSem) +{ + return RTSemEventMultiCreateEx(phEventMultiSem, 0 /*fFlags*/, NIL_RTLOCKVALCLASS, NULL); +} + + +RTDECL(int) RTSemEventMultiCreateEx(PRTSEMEVENTMULTI phEventMultiSem, uint32_t fFlags, RTLOCKVALCLASS hClass, + const char *pszNameFmt, ...) +{ + AssertReturn(!(fFlags & ~RTSEMEVENTMULTI_FLAGS_NO_LOCK_VAL), VERR_INVALID_PARAMETER); + + /* + * Allocate semaphore handle. + */ + struct RTSEMEVENTMULTIINTERNAL *pThis = (struct RTSEMEVENTMULTIINTERNAL *)RTMemAlloc(sizeof(struct RTSEMEVENTMULTIINTERNAL)); + if (pThis) + { + pThis->u32Magic = RTSEMEVENTMULTI_MAGIC; + pThis->iState = 0; +#ifdef RTSEMEVENTMULTI_STRICT + if (!pszNameFmt) + { + static uint32_t volatile s_iSemEventMultiAnon = 0; + RTLockValidatorRecSharedInit(&pThis->Signallers, hClass, RTLOCKVAL_SUB_CLASS_ANY, pThis, + true /*fSignaller*/, !(fFlags & RTSEMEVENTMULTI_FLAGS_NO_LOCK_VAL), + "RTSemEventMulti-%u", ASMAtomicIncU32(&s_iSemEventMultiAnon) - 1); + } + else + { + va_list va; + va_start(va, pszNameFmt); + RTLockValidatorRecSharedInitV(&pThis->Signallers, hClass, RTLOCKVAL_SUB_CLASS_ANY, pThis, + true /*fSignaller*/, !(fFlags & RTSEMEVENTMULTI_FLAGS_NO_LOCK_VAL), + pszNameFmt, va); + va_end(va); + } + pThis->fEverHadSignallers = false; +#else + RT_NOREF(hClass, pszNameFmt); +#endif + + *phEventMultiSem = pThis; + return VINF_SUCCESS; + } + return VERR_NO_MEMORY; +} + + +RTDECL(int) RTSemEventMultiDestroy(RTSEMEVENTMULTI hEventMultiSem) +{ + /* + * Validate input. 
+ */ + struct RTSEMEVENTMULTIINTERNAL *pThis = hEventMultiSem; + if (pThis == NIL_RTSEMEVENTMULTI) + return VINF_SUCCESS; + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, VERR_INVALID_HANDLE); + + /* + * Invalidate the semaphore and wake up anyone waiting on it. + */ + ASMAtomicWriteU32(&pThis->u32Magic, RTSEMEVENTMULTI_MAGIC + 1); + if (ASMAtomicXchgS32(&pThis->iState, -1) == 1) + { + sys_futex(&pThis->iState, FUTEX_WAKE, INT_MAX, NULL, NULL, 0); + usleep(1000); + } + + /* + * Free the semaphore memory and be gone. + */ +#ifdef RTSEMEVENTMULTI_STRICT + RTLockValidatorRecSharedDelete(&pThis->Signallers); +#endif + RTMemFree(pThis); + return VINF_SUCCESS; +} + + +RTDECL(int) RTSemEventMultiSignal(RTSEMEVENTMULTI hEventMultiSem) +{ + /* + * Validate input. + */ + struct RTSEMEVENTMULTIINTERNAL *pThis = hEventMultiSem; + AssertReturn(VALID_PTR(pThis) && pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, + VERR_INVALID_HANDLE); + +#ifdef RTSEMEVENTMULTI_STRICT + if (pThis->fEverHadSignallers) + { + int rc9 = RTLockValidatorRecSharedCheckSignaller(&pThis->Signallers, NIL_RTTHREAD); + if (RT_FAILURE(rc9)) + return rc9; + } +#endif + + + /* + * Signal it. + */ + int32_t iOld = ASMAtomicXchgS32(&pThis->iState, -1); + if (iOld > 0) + { + /* wake up sleeping threads. */ + long cWoken = sys_futex(&pThis->iState, FUTEX_WAKE, INT_MAX, NULL, NULL, 0); + AssertMsg(cWoken >= 0, ("%ld\n", cWoken)); NOREF(cWoken); + } + Assert(iOld == 0 || iOld == -1 || iOld == 1); + return VINF_SUCCESS; +} + + +RTDECL(int) RTSemEventMultiReset(RTSEMEVENTMULTI hEventMultiSem) +{ + /* + * Validate input. + */ + struct RTSEMEVENTMULTIINTERNAL *pThis = hEventMultiSem; + AssertReturn(VALID_PTR(pThis) && pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, + VERR_INVALID_HANDLE); +#ifdef RT_STRICT + int32_t i = pThis->iState; + Assert(i == 0 || i == -1 || i == 1); +#endif + + /* + * Reset it. 
+ */ + ASMAtomicCmpXchgS32(&pThis->iState, 0, -1); + return VINF_SUCCESS; +} + + + +DECLINLINE(int) rtSemEventLnxMultiWait(struct RTSEMEVENTMULTIINTERNAL *pThis, uint32_t fFlags, uint64_t uTimeout, + PCRTLOCKVALSRCPOS pSrcPos) +{ + RT_NOREF(pSrcPos); + + /* + * Validate input. + */ + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertReturn(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC, VERR_INVALID_HANDLE); + AssertReturn(RTSEMWAIT_FLAGS_ARE_VALID(fFlags), VERR_INVALID_PARAMETER); + + /* + * Quickly check whether it's signaled. + */ + int32_t iCur = ASMAtomicUoReadS32(&pThis->iState); + Assert(iCur == 0 || iCur == -1 || iCur == 1); + if (iCur == -1) + return VINF_SUCCESS; + + /* + * Check and convert the timeout value. + */ + struct timespec ts; + struct timespec *pTimeout = NULL; + uint64_t u64Deadline = 0; /* shut up gcc */ + if (!(fFlags & RTSEMWAIT_FLAGS_INDEFINITE)) + { + /* If the timeout is zero, then we're done. */ + if (!uTimeout) + return VERR_TIMEOUT; + + /* Convert it to a deadline + interval timespec. */ + if (fFlags & RTSEMWAIT_FLAGS_MILLISECS) + uTimeout = uTimeout < UINT64_MAX / UINT32_C(1000000) * UINT32_C(1000000) + ? uTimeout * UINT32_C(1000000) + : UINT64_MAX; + if (uTimeout != UINT64_MAX) /* unofficial way of indicating an indefinite wait */ + { + if (fFlags & RTSEMWAIT_FLAGS_RELATIVE) + u64Deadline = RTTimeSystemNanoTS() + uTimeout; + else + { + uint64_t u64Now = RTTimeSystemNanoTS(); + if (uTimeout <= u64Now) + return VERR_TIMEOUT; + u64Deadline = uTimeout; + uTimeout -= u64Now; + } + if ( sizeof(ts.tv_sec) >= sizeof(uint64_t) + || uTimeout <= UINT64_C(1000000000) * UINT32_MAX) + { + ts.tv_nsec = uTimeout % UINT32_C(1000000000); + ts.tv_sec = uTimeout / UINT32_C(1000000000); + pTimeout = &ts; + } + } + } + + /* + * The wait loop. + */ +#ifdef RTSEMEVENTMULTI_STRICT + RTTHREAD hThreadSelf = RTThreadSelfAutoAdopt(); +#else + RTTHREAD hThreadSelf = RTThreadSelf(); +#endif + for (unsigned i = 0;; i++) + { + /* + * Start waiting. 
We only account for there being or having been + * threads waiting on the semaphore to keep things simple. + */ + iCur = ASMAtomicUoReadS32(&pThis->iState); + Assert(iCur == 0 || iCur == -1 || iCur == 1); + if ( iCur == 1 + || ASMAtomicCmpXchgS32(&pThis->iState, 1, 0)) + { + /* adjust the relative timeout */ + if (pTimeout) + { + int64_t i64Diff = u64Deadline - RTTimeSystemNanoTS(); + if (i64Diff < 1000) + return VERR_TIMEOUT; + ts.tv_sec = (uint64_t)i64Diff / UINT32_C(1000000000); + ts.tv_nsec = (uint64_t)i64Diff % UINT32_C(1000000000); + } +#ifdef RTSEMEVENTMULTI_STRICT + if (pThis->fEverHadSignallers) + { + int rc9 = RTLockValidatorRecSharedCheckBlocking(&pThis->Signallers, hThreadSelf, pSrcPos, false, + uTimeout / UINT32_C(1000000), RTTHREADSTATE_EVENT_MULTI, true); + if (RT_FAILURE(rc9)) + return rc9; + } +#endif + RTThreadBlocking(hThreadSelf, RTTHREADSTATE_EVENT_MULTI, true); + long rc = sys_futex(&pThis->iState, FUTEX_WAIT, 1, pTimeout, NULL, 0); + RTThreadUnblocked(hThreadSelf, RTTHREADSTATE_EVENT_MULTI); + if (RT_UNLIKELY(pThis->u32Magic != RTSEMEVENTMULTI_MAGIC)) + return VERR_SEM_DESTROYED; + if (rc == 0) + return VINF_SUCCESS; + + /* + * Act on the wakup code. + */ + if (rc == -ETIMEDOUT) + { +/** @todo something is broken here. shows up every now and again in the ata + * code. Should try to run the timeout against RTTimeMilliTS to + * check that it's doing the right thing... */ + Assert(pTimeout); + return VERR_TIMEOUT; + } + if (rc == -EWOULDBLOCK) + /* retry, the value changed. */; + else if (rc == -EINTR) + { + if (fFlags & RTSEMWAIT_FLAGS_NORESUME) + return VERR_INTERRUPTED; + } + else + { + /* this shouldn't happen! 
*/ + AssertMsgFailed(("rc=%ld errno=%d\n", rc, errno)); + return RTErrConvertFromErrno(rc); + } + } + else if (iCur == -1) + return VINF_SUCCESS; + } +} + + +#undef RTSemEventMultiWaitEx +RTDECL(int) RTSemEventMultiWaitEx(RTSEMEVENTMULTI hEventMultiSem, uint32_t fFlags, uint64_t uTimeout) +{ +#ifndef RTSEMEVENT_STRICT + return rtSemEventLnxMultiWait(hEventMultiSem, fFlags, uTimeout, NULL); +#else + RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API(); + return rtSemEventLnxMultiWait(hEventMultiSem, fFlags, uTimeout, &SrcPos); +#endif +} + + +RTDECL(int) RTSemEventMultiWaitExDebug(RTSEMEVENTMULTI hEventMultiSem, uint32_t fFlags, uint64_t uTimeout, + RTHCUINTPTR uId, RT_SRC_POS_DECL) +{ + RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_DEBUG_API(); + return rtSemEventLnxMultiWait(hEventMultiSem, fFlags, uTimeout, &SrcPos); +} + + +RTDECL(void) RTSemEventMultiSetSignaller(RTSEMEVENTMULTI hEventMultiSem, RTTHREAD hThread) +{ +#ifdef RTSEMEVENTMULTI_STRICT + struct RTSEMEVENTMULTIINTERNAL *pThis = hEventMultiSem; + AssertPtrReturnVoid(pThis); + AssertReturnVoid(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC); + + ASMAtomicWriteBool(&pThis->fEverHadSignallers, true); + RTLockValidatorRecSharedResetOwner(&pThis->Signallers, hThread, NULL); +#else + RT_NOREF(hEventMultiSem, hThread); +#endif +} + + +RTDECL(void) RTSemEventMultiAddSignaller(RTSEMEVENTMULTI hEventMultiSem, RTTHREAD hThread) +{ +#ifdef RTSEMEVENTMULTI_STRICT + struct RTSEMEVENTMULTIINTERNAL *pThis = hEventMultiSem; + AssertPtrReturnVoid(pThis); + AssertReturnVoid(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC); + + ASMAtomicWriteBool(&pThis->fEverHadSignallers, true); + RTLockValidatorRecSharedAddOwner(&pThis->Signallers, hThread, NULL); +#else + RT_NOREF(hEventMultiSem, hThread); +#endif +} + + +RTDECL(void) RTSemEventMultiRemoveSignaller(RTSEMEVENTMULTI hEventMultiSem, RTTHREAD hThread) +{ +#ifdef RTSEMEVENTMULTI_STRICT + struct RTSEMEVENTMULTIINTERNAL *pThis = hEventMultiSem; + AssertPtrReturnVoid(pThis); + 
AssertReturnVoid(pThis->u32Magic == RTSEMEVENTMULTI_MAGIC); + + RTLockValidatorRecSharedRemoveOwner(&pThis->Signallers, hThread); +#else + RT_NOREF(hEventMultiSem, hThread); +#endif +} + +#endif /* glibc < 2.6 || IPRT_WITH_FUTEX_BASED_SEMS */ + diff --git a/src/VBox/Runtime/r3/linux/semmutex-linux.cpp b/src/VBox/Runtime/r3/linux/semmutex-linux.cpp new file mode 100644 index 00000000..4fa67bfc --- /dev/null +++ b/src/VBox/Runtime/r3/linux/semmutex-linux.cpp @@ -0,0 +1,465 @@ +/* $Id: semmutex-linux.cpp $ */ +/** @file + * IPRT - Mutex Semaphore, Linux (2.6.x+). + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. 
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include <iprt/semaphore.h> +#include "internal/iprt.h" + +#include <iprt/alloc.h> +#include <iprt/asm.h> +#include <iprt/assert.h> +#include <iprt/err.h> +#include <iprt/lockvalidator.h> +#include <iprt/thread.h> +#include <iprt/time.h> +#include "internal/magics.h" +#include "internal/strict.h" + +#include <errno.h> +#include <limits.h> +#include <pthread.h> +#include <unistd.h> +#include <sys/time.h> +#include <sys/syscall.h> +#if 0 /* With 2.6.17 futex.h has become C++ unfriendly. */ +# include <linux/futex.h> +#else +# define FUTEX_WAIT 0 +# define FUTEX_WAKE 1 +#endif + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * Linux internal representation of a Mutex semaphore. + */ +struct RTSEMMUTEXINTERNAL +{ + /** The futex state variable. + * 0 means unlocked. + * 1 means locked, no waiters. + * 2 means locked, one or more waiters. + */ + int32_t volatile iState; + /** Nesting count. */ + uint32_t volatile cNestings; + /** The owner of the mutex. */ + pthread_t volatile Owner; + /** Magic value (RTSEMMUTEX_MAGIC). */ + uint32_t volatile u32Magic; +#ifdef RTSEMMUTEX_STRICT + /** Lock validator record associated with this mutex. */ + RTLOCKVALRECEXCL ValidatorRec; +#endif +}; + + + +/** + * Wrapper for the futex syscall. 
+ */ +static long sys_futex(int32_t volatile *uaddr, int op, int val, struct timespec *utime, int32_t *uaddr2, int val3) +{ + errno = 0; + long rc = syscall(__NR_futex, uaddr, op, val, utime, uaddr2, val3); + if (rc < 0) + { + Assert(rc == -1); + rc = -errno; + } + return rc; +} + + +#undef RTSemMutexCreate +RTDECL(int) RTSemMutexCreate(PRTSEMMUTEX phMutexSem) +{ + return RTSemMutexCreateEx(phMutexSem, 0 /*fFlags*/, NIL_RTLOCKVALCLASS, RTLOCKVAL_SUB_CLASS_NONE, NULL); +} + + +RTDECL(int) RTSemMutexCreateEx(PRTSEMMUTEX phMutexSem, uint32_t fFlags, + RTLOCKVALCLASS hClass, uint32_t uSubClass, const char *pszNameFmt, ...) +{ + AssertReturn(!(fFlags & ~RTSEMMUTEX_FLAGS_NO_LOCK_VAL), VERR_INVALID_PARAMETER); + + /* + * Allocate semaphore handle. + */ + struct RTSEMMUTEXINTERNAL *pThis = (struct RTSEMMUTEXINTERNAL *)RTMemAlloc(sizeof(struct RTSEMMUTEXINTERNAL)); + if (pThis) + { + pThis->u32Magic = RTSEMMUTEX_MAGIC; + pThis->iState = 0; + pThis->Owner = (pthread_t)~0; + pThis->cNestings = 0; +#ifdef RTSEMMUTEX_STRICT + if (!pszNameFmt) + { + static uint32_t volatile s_iMutexAnon = 0; + RTLockValidatorRecExclInit(&pThis->ValidatorRec, hClass, uSubClass, pThis, + !(fFlags & RTSEMMUTEX_FLAGS_NO_LOCK_VAL), + "RTSemMutex-%u", ASMAtomicIncU32(&s_iMutexAnon) - 1); + } + else + { + va_list va; + va_start(va, pszNameFmt); + RTLockValidatorRecExclInitV(&pThis->ValidatorRec, hClass, uSubClass, pThis, + !(fFlags & RTSEMMUTEX_FLAGS_NO_LOCK_VAL), pszNameFmt, va); + va_end(va); + } +#else + RT_NOREF(hClass, uSubClass, pszNameFmt); +#endif + + *phMutexSem = pThis; + return VINF_SUCCESS; + } + + return VERR_NO_MEMORY; +} + + +RTDECL(int) RTSemMutexDestroy(RTSEMMUTEX hMutexSem) +{ + /* + * Validate input. 
+ */ + if (hMutexSem == NIL_RTSEMMUTEX) + return VINF_SUCCESS; + struct RTSEMMUTEXINTERNAL *pThis = hMutexSem; + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertMsgReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, + ("hMutexSem=%p u32Magic=%#x\n", pThis, pThis->u32Magic), + VERR_INVALID_HANDLE); + + /* + * Invalidate the semaphore and wake up anyone waiting on it. + */ + ASMAtomicWriteU32(&pThis->u32Magic, RTSEMMUTEX_MAGIC_DEAD); + if (ASMAtomicXchgS32(&pThis->iState, 0) > 0) + { + sys_futex(&pThis->iState, FUTEX_WAKE, INT_MAX, NULL, NULL, 0); + usleep(1000); + } + pThis->Owner = (pthread_t)~0; + pThis->cNestings = 0; +#ifdef RTSEMMUTEX_STRICT + RTLockValidatorRecExclDelete(&pThis->ValidatorRec); +#endif + + /* + * Free the semaphore memory and be gone. + */ + RTMemFree(pThis); + return VINF_SUCCESS; +} + + +RTDECL(uint32_t) RTSemMutexSetSubClass(RTSEMMUTEX hMutexSem, uint32_t uSubClass) +{ +#ifdef RTSEMMUTEX_STRICT + /* + * Validate. + */ + RTSEMMUTEXINTERNAL *pThis = hMutexSem; + AssertPtrReturn(pThis, RTLOCKVAL_SUB_CLASS_INVALID); + AssertReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, RTLOCKVAL_SUB_CLASS_INVALID); + + return RTLockValidatorRecExclSetSubClass(&pThis->ValidatorRec, uSubClass); +#else + RT_NOREF(hMutexSem, uSubClass); + return RTLOCKVAL_SUB_CLASS_INVALID; +#endif +} + + +DECL_FORCE_INLINE(int) rtSemMutexRequest(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies, bool fAutoResume, PCRTLOCKVALSRCPOS pSrcPos) +{ + RT_NOREF(pSrcPos); + + /* + * Validate input. + */ + struct RTSEMMUTEXINTERNAL *pThis = hMutexSem; + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, VERR_INVALID_HANDLE); + + /* + * Check if nested request. 
+ */ + pthread_t Self = pthread_self(); + if ( pThis->Owner == Self + && pThis->cNestings > 0) + { +#ifdef RTSEMMUTEX_STRICT + int rc9 = RTLockValidatorRecExclRecursion(&pThis->ValidatorRec, pSrcPos); + if (RT_FAILURE(rc9)) + return rc9; +#endif + ASMAtomicIncU32(&pThis->cNestings); + return VINF_SUCCESS; + } + +#ifdef RTSEMMUTEX_STRICT + RTTHREAD hThreadSelf = RTThreadSelfAutoAdopt(); + if (cMillies) + { + int rc9 = RTLockValidatorRecExclCheckOrder(&pThis->ValidatorRec, hThreadSelf, pSrcPos, cMillies); + if (RT_FAILURE(rc9)) + return rc9; + } +#else + RTTHREAD hThreadSelf = RTThreadSelf(); +#endif + + /* + * Convert timeout value. + */ + struct timespec ts; + struct timespec *pTimeout = NULL; + uint64_t u64End = 0; /* shut up gcc */ + if (cMillies != RT_INDEFINITE_WAIT) + { + ts.tv_sec = cMillies / 1000; + ts.tv_nsec = (cMillies % 1000) * UINT32_C(1000000); + u64End = RTTimeSystemNanoTS() + cMillies * UINT64_C(1000000); + pTimeout = &ts; + } + + /* + * Lock the mutex. + * Optimize for the uncontended case (makes 1-2 ns difference). + */ + if (RT_UNLIKELY(!ASMAtomicCmpXchgS32(&pThis->iState, 1, 0))) + { + for (;;) + { + int32_t iOld = ASMAtomicXchgS32(&pThis->iState, 2); + + /* + * Was the lock released in the meantime? This is unlikely (but possible) + */ + if (RT_UNLIKELY(iOld == 0)) + break; + + /* + * Go to sleep. + */ + if (pTimeout && ( pTimeout->tv_sec || pTimeout->tv_nsec )) + { +#ifdef RTSEMMUTEX_STRICT + int rc9 = RTLockValidatorRecExclCheckBlocking(&pThis->ValidatorRec, hThreadSelf, pSrcPos, true, + cMillies, RTTHREADSTATE_MUTEX, true); + if (RT_FAILURE(rc9)) + return rc9; +#else + RTThreadBlocking(hThreadSelf, RTTHREADSTATE_MUTEX, true); +#endif + } + + long rc = sys_futex(&pThis->iState, FUTEX_WAIT, 2, pTimeout, NULL, 0); + + RTThreadUnblocked(hThreadSelf, RTTHREADSTATE_MUTEX); + if (RT_UNLIKELY(pThis->u32Magic != RTSEMMUTEX_MAGIC)) + return VERR_SEM_DESTROYED; + + /* + * Act on the wakup code. 
+ */ + if (rc == -ETIMEDOUT) + { + Assert(pTimeout); + return VERR_TIMEOUT; + } + if (rc == 0) + /* we'll leave the loop now unless another thread is faster */; + else if (rc == -EWOULDBLOCK) + /* retry with new value. */; + else if (rc == -EINTR) + { + if (!fAutoResume) + return VERR_INTERRUPTED; + } + else + { + /* this shouldn't happen! */ + AssertMsgFailed(("rc=%ld errno=%d\n", rc, errno)); + return RTErrConvertFromErrno(rc); + } + + /* adjust the relative timeout */ + if (pTimeout) + { + int64_t i64Diff = u64End - RTTimeSystemNanoTS(); + if (i64Diff < 1000) + { + rc = VERR_TIMEOUT; + break; + } + ts.tv_sec = (uint64_t)i64Diff / UINT32_C(1000000000); + ts.tv_nsec = (uint64_t)i64Diff % UINT32_C(1000000000); + } + } + + /* + * When leaving this loop, iState is set to 2. This means that we gained the + * lock and there are _possibly_ some waiters. We don't know exactly as another + * thread might entered this loop at nearly the same time. Therefore we will + * call futex_wakeup once too often (if _no_ other thread entered this loop). + * The key problem is the simple futex_wait test for x != y (iState != 2) in + * our case). + */ + } + + /* + * Set the owner and nesting. 
+ */ + pThis->Owner = Self; + ASMAtomicWriteU32(&pThis->cNestings, 1); +#ifdef RTSEMMUTEX_STRICT + RTLockValidatorRecExclSetOwner(&pThis->ValidatorRec, hThreadSelf, pSrcPos, true); +#endif + return VINF_SUCCESS; +} + + +#undef RTSemMutexRequest +RTDECL(int) RTSemMutexRequest(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies) +{ +#ifndef RTSEMMUTEX_STRICT + int rc = rtSemMutexRequest(hMutexSem, cMillies, true, NULL); +#else + RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API(); + int rc = rtSemMutexRequest(hMutexSem, cMillies, true, &SrcPos); +#endif + Assert(rc != VERR_INTERRUPTED); + return rc; +} + + +RTDECL(int) RTSemMutexRequestDebug(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies, RTHCUINTPTR uId, RT_SRC_POS_DECL) +{ + RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_DEBUG_API(); + int rc = rtSemMutexRequest(hMutexSem, cMillies, true, &SrcPos); + Assert(rc != VERR_INTERRUPTED); + return rc; +} + + +#undef RTSemMutexRequestNoResume +RTDECL(int) RTSemMutexRequestNoResume(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies) +{ +#ifndef RTSEMMUTEX_STRICT + return rtSemMutexRequest(hMutexSem, cMillies, false, NULL); +#else + RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_NORMAL_API(); + return rtSemMutexRequest(hMutexSem, cMillies, false, &SrcPos); +#endif +} + + +RTDECL(int) RTSemMutexRequestNoResumeDebug(RTSEMMUTEX hMutexSem, RTMSINTERVAL cMillies, RTHCUINTPTR uId, RT_SRC_POS_DECL) +{ + RTLOCKVALSRCPOS SrcPos = RTLOCKVALSRCPOS_INIT_DEBUG_API(); + return rtSemMutexRequest(hMutexSem, cMillies, false, &SrcPos); +} + + +RTDECL(int) RTSemMutexRelease(RTSEMMUTEX hMutexSem) +{ + /* + * Validate input. + */ + struct RTSEMMUTEXINTERNAL *pThis = hMutexSem; + AssertPtrReturn(pThis, VERR_INVALID_HANDLE); + AssertReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, VERR_INVALID_HANDLE); + +#ifdef RTSEMMUTEX_STRICT + int rc9 = RTLockValidatorRecExclReleaseOwner(&pThis->ValidatorRec, pThis->cNestings == 1); + if (RT_FAILURE(rc9)) + return rc9; +#endif + + /* + * Check if nested. 
+ */ + pthread_t Self = pthread_self(); + if (RT_UNLIKELY( pThis->Owner != Self + || pThis->cNestings == 0)) + { + AssertMsgFailed(("Not owner of mutex %p!! Self=%08x Owner=%08x cNestings=%d\n", + pThis, Self, pThis->Owner, pThis->cNestings)); + return VERR_NOT_OWNER; + } + + /* + * If nested we'll just pop a nesting. + */ + if (pThis->cNestings > 1) + { + ASMAtomicDecU32(&pThis->cNestings); + return VINF_SUCCESS; + } + + /* + * Clear the state. (cNestings == 1) + */ + pThis->Owner = (pthread_t)~0; + ASMAtomicWriteU32(&pThis->cNestings, 0); + + /* + * Release the mutex. + */ + int32_t iNew = ASMAtomicDecS32(&pThis->iState); + if (RT_UNLIKELY(iNew != 0)) + { + /* somebody is waiting, try wake up one of them. */ + ASMAtomicXchgS32(&pThis->iState, 0); + (void)sys_futex(&pThis->iState, FUTEX_WAKE, 1, NULL, NULL, 0); + } + return VINF_SUCCESS; +} + + +RTDECL(bool) RTSemMutexIsOwned(RTSEMMUTEX hMutexSem) +{ + /* + * Validate. + */ + RTSEMMUTEXINTERNAL *pThis = hMutexSem; + AssertPtrReturn(pThis, false); + AssertReturn(pThis->u32Magic == RTSEMMUTEX_MAGIC, false); + + return pThis->Owner != (pthread_t)~0; +} + diff --git a/src/VBox/Runtime/r3/linux/sysfs.cpp b/src/VBox/Runtime/r3/linux/sysfs.cpp new file mode 100644 index 00000000..198a98c3 --- /dev/null +++ b/src/VBox/Runtime/r3/linux/sysfs.cpp @@ -0,0 +1,710 @@ +/* $Id: sysfs.cpp $ */ +/** @file + * IPRT - Linux sysfs access. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_SYSTEM +#include <iprt/assert.h> +#include <iprt/dir.h> +#include <iprt/err.h> +#include <iprt/file.h> +#include <iprt/fs.h> +#include <iprt/param.h> +#include <iprt/path.h> +#include <iprt/string.h> +#include <iprt/symlink.h> + +#include <iprt/linux/sysfs.h> + +#include <unistd.h> +#include <stdio.h> +#include <sys/stat.h> +#include <sys/fcntl.h> +#include <sys/sysmacros.h> +#include <errno.h>



/**
 * Constructs a path from the format parameters passed, prepending a prefix
 * if the formatted path is relative (does not start with '/').
 *
 * @returns IPRT status code.
 * @retval  VERR_INVALID_PARAMETER if the prefix is empty or does not end
 *          with '/', or if the buffer cannot even hold the prefix.
 * @retval  VERR_BUFFER_OVERFLOW if the formatted path, or the prefixed
 *          result, does not fit into the buffer.
 * @param   pszBuf     Where to write the path.  Must be larger than
 *                     strlen(@a pszPrefix) + 1 bytes.
 * @param   cchBuf     The size of the buffer pointed to by @a pszBuf.
 * @param   pszPrefix  The prefix to prepend if the path is relative.  Must be
 *                     non-empty and end in '/'.
 * @param   pszFormat  The name format, either absolute or relative to the
 *                     prefix specified by @a pszPrefix.
 * @param   va         The format args.
 */
static int rtLinuxConstructPathV(char *pszBuf, size_t cchBuf,
                                 const char *pszPrefix,
                                 const char *pszFormat, va_list va)
{
    size_t const cchPrefix = strlen(pszPrefix);
    /* Check for an empty prefix first so that the [cchPrefix - 1] access
       below cannot index before the start of the string. */
    AssertReturn(cchPrefix > 0 && pszPrefix[cchPrefix - 1] == '/', VERR_INVALID_PARAMETER);
    AssertReturn(cchBuf > cchPrefix + 1, VERR_INVALID_PARAMETER);

    /* Format the name in place; a negative return means it did not fit. */
    ssize_t const cchName = RTStrPrintf2V(pszBuf, cchBuf, pszFormat, va);
    AssertReturn(cchName >= 0, VERR_BUFFER_OVERFLOW);

    /* Absolute paths are used as they are. */
    if (*pszBuf == '/')
        return VINF_SUCCESS;

    /* Relative path: shift it up (including the terminator) and put the
       prefix in front. */
    AssertReturn(cchBuf >= (size_t)cchName + cchPrefix + 1, VERR_BUFFER_OVERFLOW);
    memmove(pszBuf + cchPrefix, pszBuf, (size_t)cchName + 1);
    memcpy(pszBuf, pszPrefix, cchPrefix);
    return VINF_SUCCESS;
}


/**
 * Variadic front end for rtLinuxConstructPathV.
 *
 * @returns IPRT status code, see rtLinuxConstructPathV.
 * @param   pszBuf     Where to write the path.  Must be larger than
 *                     strlen(@a pszPrefix) + 1 bytes.
 * @param   cchBuf     The size of the buffer pointed to by @a pszBuf.
 * @param   pszPrefix  The prefix to prepend if the path is relative.  Must be
 *                     non-empty and end in '/'.
 * @param   pszFormat  The name format, either absolute or relative to the
 *                     prefix specified by @a pszPrefix.
 * @param   ...        The format args.
 */
DECLINLINE(int) rtLinuxConstructPath(char *pszBuf, size_t cchBuf,
                                     const char *pszPrefix,
                                     const char *pszFormat, ...)
{
    va_list va;
    va_start(va, pszFormat);
    int rc = rtLinuxConstructPathV(pszBuf, cchBuf, pszPrefix, pszFormat, va);
    va_end(va);
    return rc;
}


+/** + * Constructs the path of a sysfs file from the format parameters passed, + * prepending "/sys/" if the path is relative. + * + * @returns IPRT status code. + * @param pszBuf Where to write the path. Must be at least + * sizeof("/sys/") characters long + * @param cchBuf The size of the buffer pointed to by @a pszBuf. + * @param pszFormat The name format, either absolute or relative to "/sys/". + * @param va The format args.
+ */ +DECLINLINE(int) rtLinuxSysFsConstructPath(char *pszBuf, size_t cchBuf, const char *pszFormat, va_list va) +{ + return rtLinuxConstructPathV(pszBuf, cchBuf, "/sys/", pszFormat, va); +} + + +RTDECL(int) RTLinuxSysFsExistsExV(const char *pszFormat, va_list va) +{ + int iSavedErrno = errno; + + /* + * Construct the filename and call stat. + */ + char szFilename[RTPATH_MAX]; + int rc = rtLinuxSysFsConstructPath(szFilename, sizeof(szFilename), pszFormat, va); + if (RT_SUCCESS(rc)) + { + struct stat st; + int rcStat = stat(szFilename, &st); + if (rcStat != 0) + rc = RTErrConvertFromErrno(errno); + } + + errno = iSavedErrno; + return rc; +} + + +RTDECL(bool) RTLinuxSysFsExistsV(const char *pszFormat, va_list va) +{ + return RT_SUCCESS(RTLinuxSysFsExistsExV(pszFormat, va)); +} + + +RTDECL(int) RTLinuxSysFsExistsEx(const char *pszFormat, ...) +{ + va_list va; + va_start(va, pszFormat); + int rc = RTLinuxSysFsExistsExV(pszFormat, va); + va_end(va); + return rc; +} + + +RTDECL(bool) RTLinuxSysFsExists(const char *pszFormat, ...) +{ + va_list va; + va_start(va, pszFormat); + bool fRet = RTLinuxSysFsExistsV(pszFormat, va); + va_end(va); + return fRet; +} + + +RTDECL(int) RTLinuxSysFsOpenV(PRTFILE phFile, const char *pszFormat, va_list va) +{ + /* + * Construct the filename and call open. + */ + char szFilename[RTPATH_MAX]; + int rc = rtLinuxSysFsConstructPath(szFilename, sizeof(szFilename), pszFormat, va); + if (RT_SUCCESS(rc)) + rc = RTFileOpen(phFile, szFilename, RTFILE_O_OPEN | RTFILE_O_READ | RTFILE_O_DENY_NONE); + return rc; +} + + +RTDECL(int) RTLinuxSysFsOpenExV(PRTFILE phFile, uint64_t fOpen, const char *pszFormat, va_list va) +{ + /* + * Construct the filename and call open. + */ + char szFilename[RTPATH_MAX]; + int rc = rtLinuxSysFsConstructPath(szFilename, sizeof(szFilename), pszFormat, va); + if (RT_SUCCESS(rc)) + rc = RTFileOpen(phFile, szFilename, fOpen); + return rc; +} + + +RTDECL(int) RTLinuxSysFsOpen(PRTFILE phFile, const char *pszFormat, ...) 
+{ + va_list va; + va_start(va, pszFormat); + int rc = RTLinuxSysFsOpenV(phFile, pszFormat, va); + va_end(va); + return rc; +} + + +RTDECL(int) RTLinuxSysFsOpenEx(PRTFILE phFile, uint64_t fOpen, const char *pszFormat, ...) +{ + va_list va; + va_start(va, pszFormat); + int rc = RTLinuxSysFsOpenExV(phFile, fOpen, pszFormat, va); + va_end(va); + return rc; +} + + +RTDECL(int) RTLinuxSysFsReadStr(RTFILE hFile, char *pszBuf, size_t cchBuf, size_t *pcchRead) +{ + Assert(cchBuf > 1); /* not mandatory */ + + int rc; + size_t cchRead; + rc = RTFileRead(hFile, pszBuf, cchBuf, &cchRead); + if (RT_SUCCESS(rc)) + { + /* + * ASSUME that if we've read less than we asked for, we've reached the + * end of the file. Otherwise, we've been given a buffer too small for + * the entire remainder of the file. + */ + if (cchRead < cchBuf) + pszBuf[cchRead] = '\0'; + else if (cchBuf) + { + rc = RTFileSeek(hFile, -1, RTFILE_SEEK_CURRENT, NULL); + if (RT_SUCCESS(rc)) + rc = VERR_BUFFER_OVERFLOW; + cchRead = cchBuf - 1; + pszBuf[cchRead] = '\0'; + } + else + rc = VERR_BUFFER_OVERFLOW; + } + else + { + if (cchBuf > 0) + *pszBuf = '\0'; + cchRead = 0; + } + + if (pcchRead) + *pcchRead = cchRead; + return rc; +} + + +RTDECL(int) RTLinuxSysFsWriteStr(RTFILE hFile, const char *pszBuf, size_t cchBuf, size_t *pcchWritten) +{ + if (!cchBuf) + cchBuf = strlen(pszBuf) + 1; /* Include the terminator */ + return RTFileWrite(hFile, pszBuf, cchBuf, pcchWritten); +} + + +RTDECL(int) RTLinuxSysFsReadFile(RTFILE hFile, void *pvBuf, size_t cbBuf, size_t *pcbRead) +{ + int rc; + size_t cbRead = 0; + + rc = RTFileRead(hFile, pvBuf, cbBuf, &cbRead); + if (RT_SUCCESS(rc)) + { + if (pcbRead) + *pcbRead = cbRead; + if (cbRead < cbBuf) + rc = VINF_SUCCESS; + else + { + /* Check for EOF */ + uint64_t offCur = 0; + uint8_t bRead; + rc = RTFileSeek(hFile, 0, RTFILE_SEEK_CURRENT, &offCur); + if (RT_SUCCESS(rc)) + { + int rc2 = RTFileRead(hFile, &bRead, 1, NULL); + if (RT_SUCCESS(rc2)) + { + rc = VERR_BUFFER_OVERFLOW; + + 
rc2 = RTFileSeek(hFile, offCur, RTFILE_SEEK_BEGIN, NULL); + if (RT_FAILURE(rc2)) + rc = rc2; + } + else if (rc2 != VERR_EOF) + rc = rc2; + } + } + } + + return rc; +} + + +RTDECL(int) RTLinuxSysFsWriteFile(RTFILE hFile, void *pvBuf, size_t cbBuf, size_t *pcbWritten) +{ + return RTFileWrite(hFile, pvBuf, cbBuf, pcbWritten); +} + + +RTDECL(int) RTLinuxSysFsReadIntFileV(unsigned uBase, int64_t *pi64, const char *pszFormat, va_list va) +{ + RTFILE hFile; + + AssertPtrReturn(pi64, VERR_INVALID_POINTER); + + int rc = RTLinuxSysFsOpenV(&hFile, pszFormat, va); + if (RT_SUCCESS(rc)) + { + char szNum[128]; + size_t cchNum; + rc = RTLinuxSysFsReadStr(hFile, szNum, sizeof(szNum), &cchNum); + if (RT_SUCCESS(rc)) + { + if (cchNum > 0) + { + int64_t i64Ret = -1; + rc = RTStrToInt64Ex(szNum, NULL, uBase, &i64Ret); + if (RT_SUCCESS(rc)) + *pi64 = i64Ret; + } + else + rc = VERR_INVALID_PARAMETER; + } + + RTFileClose(hFile); + } + + return rc; +} + + +RTDECL(int) RTLinuxSysFsReadIntFile(unsigned uBase, int64_t *pi64, const char *pszFormat, ...) +{ + va_list va; + va_start(va, pszFormat); + int rc = RTLinuxSysFsReadIntFileV(uBase, pi64, pszFormat, va); + va_end(va); + return rc; +} + + +RTDECL(int) RTLinuxSysFsWriteU8FileV(unsigned uBase, uint8_t u8, const char *pszFormat, va_list va) +{ + return RTLinuxSysFsWriteU64FileV(uBase, u8, pszFormat, va); +} + + +RTDECL(int) RTLinuxSysFsWriteU8File(unsigned uBase, uint8_t u8, const char *pszFormat, ...) +{ + va_list va; + va_start(va, pszFormat); + int rc = RTLinuxSysFsWriteU64FileV(uBase, u8, pszFormat, va); + va_end(va); + return rc; +} + + +RTDECL(int) RTLinuxSysFsWriteU16FileV(unsigned uBase, uint16_t u16, const char *pszFormat, va_list va) +{ + return RTLinuxSysFsWriteU64FileV(uBase, u16, pszFormat, va); +} + + +RTDECL(int) RTLinuxSysFsWriteU16File(unsigned uBase, uint16_t u16, const char *pszFormat, ...) 
+{ + va_list va; + va_start(va, pszFormat); + int rc = RTLinuxSysFsWriteU64FileV(uBase, u16, pszFormat, va); + va_end(va); + return rc; +} + + +RTDECL(int) RTLinuxSysFsWriteU32FileV(unsigned uBase, uint32_t u32, const char *pszFormat, va_list va) +{ + return RTLinuxSysFsWriteU64FileV(uBase, u32, pszFormat, va); +} + + +RTDECL(int) RTLinuxSysFsWriteU32File(unsigned uBase, uint32_t u32, const char *pszFormat, ...) +{ + va_list va; + va_start(va, pszFormat); + int rc = RTLinuxSysFsWriteU64FileV(uBase, u32, pszFormat, va); + va_end(va); + return rc; +} + + +RTDECL(int) RTLinuxSysFsWriteU64FileV(unsigned uBase, uint64_t u64, const char *pszFormat, va_list va) +{ + RTFILE hFile; + + const char *pszFmt = NULL; + switch (uBase) + { + case 8: + pszFmt = "%#llo"; + break; + case 10: + pszFmt = "%llu"; + break; + case 16: + pszFmt = "%#llx"; + break; + default: + return VERR_INVALID_PARAMETER; + } + + int rc = RTLinuxSysFsOpenExV(&hFile, RTFILE_O_OPEN | RTFILE_O_WRITE | RTFILE_O_DENY_NONE, pszFormat, va); + if (RT_SUCCESS(rc)) + { + char szNum[128]; + size_t cchNum = RTStrPrintf(szNum, sizeof(szNum), pszFmt, u64); + if (cchNum > 0) + { + size_t cbWritten = 0; + rc = RTLinuxSysFsWriteStr(hFile, &szNum[0], cchNum, &cbWritten); + if ( RT_SUCCESS(rc) + && cbWritten != cchNum) + rc = VERR_BUFFER_OVERFLOW; + } + else + rc = VERR_INVALID_PARAMETER; + + RTFileClose(hFile); + } + + return rc; +} + + +RTDECL(int) RTLinuxSysFsWriteU64File(unsigned uBase, uint32_t u64, const char *pszFormat, ...) 
+{ + va_list va; + va_start(va, pszFormat); + int rc = RTLinuxSysFsWriteU64FileV(uBase, u64, pszFormat, va); + va_end(va); + return rc; +} + + +RTDECL(int) RTLinuxSysFsReadDevNumFileV(dev_t *pDevNum, const char *pszFormat, va_list va) +{ + RTFILE hFile; + + AssertPtrReturn(pDevNum, VERR_INVALID_POINTER); + + int rc = RTLinuxSysFsOpenV(&hFile, pszFormat, va); + if (RT_SUCCESS(rc)) + { + size_t cchNum = 0; + char szNum[128]; + rc = RTLinuxSysFsReadStr(hFile, szNum, sizeof(szNum), &cchNum); + if (RT_SUCCESS(rc)) + { + if (cchNum > 0) + { + uint32_t u32Maj = 0; + uint32_t u32Min = 0; + char *pszNext = NULL; + rc = RTStrToUInt32Ex(szNum, &pszNext, 10, &u32Maj); + if (RT_FAILURE(rc) || (rc != VWRN_TRAILING_CHARS) || (*pszNext != ':')) + rc = VERR_INVALID_PARAMETER; + else + { + rc = RTStrToUInt32Ex(pszNext + 1, NULL, 10, &u32Min); + if ( rc != VINF_SUCCESS + && rc != VWRN_TRAILING_CHARS + && rc != VWRN_TRAILING_SPACES) + rc = VERR_INVALID_PARAMETER; + else + *pDevNum = makedev(u32Maj, u32Min); + } + } + else + rc = VERR_INVALID_PARAMETER; + } + + RTFileClose(hFile); + } + + return rc; +} + + +RTDECL(int) RTLinuxSysFsReadDevNumFile(dev_t *pDevNum, const char *pszFormat, ...) +{ + va_list va; + va_start(va, pszFormat); + int rc = RTLinuxSysFsReadDevNumFileV(pDevNum, pszFormat, va); + va_end(va); + return rc; +} + + +RTDECL(int) RTLinuxSysFsReadStrFileV(char *pszBuf, size_t cchBuf, size_t *pcchRead, const char *pszFormat, va_list va) +{ + RTFILE hFile; + + AssertPtrReturn(pszBuf, VERR_INVALID_POINTER); + + int rc = RTLinuxSysFsOpenV(&hFile, pszFormat, va); + if (RT_SUCCESS(rc)) + { + /* + * Note! We cannot use RTLinuxSysFsReadStr here as it has different + * semantics wrt to newline characters. It is not known why + * the semantics has to differ... Michael, any clues? 
+ */ + size_t cchRead; + rc = RTFileRead(hFile, pszBuf, cchBuf, &cchRead); + if (RT_SUCCESS(rc)) + { + char *pchNewLine = (char *)memchr(pszBuf, '\n', cchRead); + if (pchNewLine) + { + *pchNewLine = '\0'; + cchRead = pchNewLine - pszBuf; + } + else if (cchRead < cchBuf) + pszBuf[cchRead] = '\0'; + else + { + if (cchBuf) + { + cchRead = cchBuf - 1; + pszBuf[cchRead] = '\0'; + } + else + cchRead = 0; + rc = VERR_BUFFER_OVERFLOW; + } + } + else + cchRead = 0; + + RTFileClose(hFile); + + if (pcchRead) + *pcchRead = cchRead; + } + else + { + if (cchBuf) + *pszBuf = '\0'; + if (pcchRead) + *pcchRead = 0; + } + return rc; +} + + +RTDECL(int) RTLinuxSysFsReadStrFile(char *pszBuf, size_t cchBuf, size_t *pcchRead, const char *pszFormat, ...) +{ + va_list va; + va_start(va, pszFormat); + int rc = RTLinuxSysFsReadStrFileV(pszBuf, cchBuf, pcchRead, pszFormat, va); + va_end(va); + return rc; +} + + +RTDECL(int) RTLinuxSysFsWriteStrFileV(const char *pszBuf, size_t cchBuf, size_t *pcchWritten, const char *pszFormat, va_list va) +{ + RTFILE hFile; + + AssertPtrReturn(pszBuf, VERR_INVALID_POINTER); + + int rc = RTLinuxSysFsOpenExV(&hFile, RTFILE_O_OPEN | RTFILE_O_WRITE | RTFILE_O_DENY_NONE, pszFormat, va); + if (RT_SUCCESS(rc)) + { + rc = RTLinuxSysFsWriteStr(hFile, pszBuf, cchBuf, pcchWritten); + RTFileClose(hFile); + } + return rc; +} + + +RTDECL(int) RTLinuxSysFsWriteStrFile(const char *pszBuf, size_t cchBuf, size_t *pcchWritten, const char *pszFormat, ...) +{ + va_list va; + va_start(va, pszFormat); + int rc = RTLinuxSysFsWriteStrFileV(pszBuf, cchBuf, pcchWritten, pszFormat, va); + va_end(va); + return rc; +} + +RTDECL(int) RTLinuxSysFsGetLinkDestV(char *pszBuf, size_t cchBuf, size_t *pchBuf, const char *pszFormat, va_list va) +{ + AssertReturn(cchBuf >= 2, VERR_INVALID_PARAMETER); + + /* + * Construct the filename and read the link. 
+ */ + char szFilename[RTPATH_MAX]; + int rc = rtLinuxSysFsConstructPath(szFilename, sizeof(szFilename), pszFormat, va); + if (RT_SUCCESS(rc)) + { + char szLink[RTPATH_MAX]; + rc = RTSymlinkRead(szFilename, szLink, sizeof(szLink), 0); + if (RT_SUCCESS(rc)) + { + /* + * Extract the file name component and copy it into the return buffer. + */ + size_t cchName; + const char *pszName = RTPathFilename(szLink); + if (pszName) + { + cchName = strlen(pszName); + if (cchName < cchBuf) + memcpy(pszBuf, pszName, cchName + 1); + else + rc = VERR_BUFFER_OVERFLOW; + } + else + { + *pszBuf = '\0'; + cchName = 0; + } + + if (pchBuf) + *pchBuf = cchName; + } + } + + return rc; +} + + +RTDECL(int) RTLinuxSysFsGetLinkDest(char *pszBuf, size_t cchBuf, size_t *pchBuf, const char *pszFormat, ...) +{ + va_list va; + va_start(va, pszFormat); + int rc = RTLinuxSysFsGetLinkDestV(pszBuf, cchBuf, pchBuf, pszFormat, va); + va_end(va); + return rc; +} + + +RTDECL(int) RTLinuxCheckDevicePathV(dev_t DevNum, RTFMODE fMode, char *pszBuf, + size_t cchBuf, const char *pszPattern, + va_list va) +{ + AssertReturn(cchBuf >= 2, VERR_INVALID_PARAMETER); + AssertReturn( fMode == RTFS_TYPE_DEV_CHAR + || fMode == RTFS_TYPE_DEV_BLOCK, + VERR_INVALID_PARAMETER); + AssertPtrReturn(pszPattern, VERR_INVALID_PARAMETER); + + /* + * Construct the filename and read the link. 
+ */ + char szFilename[RTPATH_MAX]; + int rc = rtLinuxConstructPathV(szFilename, sizeof(szFilename), "/dev/", + pszPattern, va); + if (RT_SUCCESS(rc)) + { + RTFSOBJINFO Info; + rc = RTPathQueryInfo(szFilename, &Info, RTFSOBJATTRADD_UNIX); + if ( rc == VERR_PATH_NOT_FOUND + || ( RT_SUCCESS(rc) + && ( Info.Attr.u.Unix.Device != DevNum + || (Info.Attr.fMode & RTFS_TYPE_MASK) != fMode))) + rc = VERR_FILE_NOT_FOUND; + + if (RT_SUCCESS(rc)) + { + size_t cchPath = strlen(szFilename); + if (cchPath < cchBuf) + memcpy(pszBuf, szFilename, cchPath + 1); + else + rc = VERR_BUFFER_OVERFLOW; + } + } + + return rc; +} + + +RTDECL(int) RTLinuxCheckDevicePath(dev_t DevNum, RTFMODE fMode, char *pszBuf, + size_t cchBuf, const char *pszPattern, + ...) +{ + va_list va; + va_start(va, pszPattern); + int rc = RTLinuxCheckDevicePathV(DevNum, fMode, pszBuf, cchBuf, + pszPattern, va); + va_end(va); + return rc; +} + diff --git a/src/VBox/Runtime/r3/linux/systemmem-linux.cpp b/src/VBox/Runtime/r3/linux/systemmem-linux.cpp new file mode 100644 index 00000000..ef853196 --- /dev/null +++ b/src/VBox/Runtime/r3/linux/systemmem-linux.cpp @@ -0,0 +1,109 @@ +/* $Id: systemmem-linux.cpp $ */ +/** @file + * IPRT - RTSystemQueryTotalRam, Linux ring-3. + */ + +/* + * Copyright (C) 2012-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#include <iprt/system.h> +#include "internal/iprt.h" + +#include <iprt/errcore.h> +#include <iprt/assert.h> +#include <iprt/string.h> + +#include <stdio.h> +#include <errno.h> + +/* Satisfy compiler warning */ +#define __EXPORTED_HEADERS__ +#include <sys/sysinfo.h> +#undef __EXPORTED_HEADERS__


/**
 * Queries the total amount of RAM using sysinfo().
 *
 * @returns VINF_SUCCESS on success, VERR_INVALID_POINTER if @a pcb is
 *          invalid, otherwise the converted errno from sysinfo().
 * @param   pcb     Where to return the size in bytes (totalram * mem_unit).
 */
RTDECL(int) RTSystemQueryTotalRam(uint64_t *pcb)
{
    AssertPtrReturn(pcb, VERR_INVALID_POINTER);

    struct sysinfo SysInfo;
    if (sysinfo(&SysInfo) != 0)
        return RTErrConvertFromErrno(errno);

    *pcb = (uint64_t)SysInfo.totalram * SysInfo.mem_unit;
    return VINF_SUCCESS;
}


/**
 * Queries the amount of RAM that is available, i.e. free plus buffers plus
 * page cache.
 *
 * The primary source is /proc/meminfo ("MemFree:", "Buffers:" and
 * "Cached:", all multiplied by 1 KiB).  If the file cannot be opened, one of
 * the three fields is missing, or a value fails to parse, sysinfo() is used
 * instead, which knows nothing about the cached memory.
 *
 * @returns VINF_SUCCESS on success, VERR_INVALID_POINTER if @a pcb is
 *          invalid, otherwise the converted errno from sysinfo().
 * @param   pcb     Where to return the size in bytes.
 */
RTDECL(int) RTSystemQueryAvailableRam(uint64_t *pcb)
{
    AssertPtrReturn(pcb, VERR_INVALID_POINTER);

    FILE *pFile = fopen("/proc/meminfo", "r");
    if (pFile)
    {
        /* One bit per field, so we know when all of them have been seen and
           never report a sum with a silently missing component. */
        unsigned const fGotFree    = 1;
        unsigned const fGotBuffers = 2;
        unsigned const fGotCached  = 4;
        unsigned const fGotAll     = fGotFree | fGotBuffers | fGotCached;

        unsigned fGot      = 0;
        int      rc        = VINF_SUCCESS;
        uint64_t cKbFree    = 0;
        uint64_t cKbBuffers = 0;
        uint64_t cKbCached  = 0;
        char     szLine[256];
        while (   fGot != fGotAll
               && fgets(szLine, sizeof(szLine), pFile))
        {
            uint64_t *pcKb;
            unsigned  fField;
            size_t    cchKey;
            if (!strncmp(szLine, RT_STR_TUPLE("MemFree:")))
            {
                pcKb   = &cKbFree;
                fField = fGotFree;
                cchKey = sizeof("MemFree:") - 1;
            }
            else if (!strncmp(szLine, RT_STR_TUPLE("Buffers:")))
            {
                pcKb   = &cKbBuffers;
                fField = fGotBuffers;
                cchKey = sizeof("Buffers:") - 1;
            }
            else if (!strncmp(szLine, RT_STR_TUPLE("Cached:")))
            {
                pcKb   = &cKbCached;
                fField = fGotCached;
                cchKey = sizeof("Cached:") - 1;
            }
            else
                continue; /* Not a line we care about, keep scanning. */

            /* The value follows the key after some blanks; the trailing
               " kB" only yields a warning status from the conversion. */
            rc = RTStrToUInt64Ex(RTStrStripL(&szLine[cchKey]), NULL, 0, pcKb);
            if (RT_FAILURE(rc))
                break;
            fGot |= fField;
        }
        fclose(pFile);

        if (   RT_SUCCESS(rc)
            && fGot == fGotAll)
        {
            *pcb = (cKbFree + cKbBuffers + cKbCached) * _1K;
            return VINF_SUCCESS;
        }
    }

    /*
     * Fallback (e.g. /proc not mapped) to sysinfo.  Less accurate because
     * there is no information about the cached memory.  'Cached:' from above
     * is only accessible through proc :-(
     */
    struct sysinfo SysInfo;
    if (sysinfo(&SysInfo) != 0)
        return RTErrConvertFromErrno(errno);

    *pcb = ((uint64_t)SysInfo.freeram + SysInfo.bufferram) * SysInfo.mem_unit;
    return VINF_SUCCESS;
}

diff --git a/src/VBox/Runtime/r3/linux/thread-affinity-linux.cpp b/src/VBox/Runtime/r3/linux/thread-affinity-linux.cpp new file mode 100644 index 00000000..88dbe99c --- /dev/null +++ b/src/VBox/Runtime/r3/linux/thread-affinity-linux.cpp @@ -0,0 +1,95 @@ +/* $Id: thread-affinity-linux.cpp $ */ +/** @file + * IPRT - Thread Affinity, Linux ring-3 implementation. + */ + +/* + * Copyright (C) 2011-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif +#include <features.h> +#if __GLIBC_PREREQ(2,4) + +#include <sched.h> +#include <unistd.h> +#include <errno.h> +#include <pthread.h> + +#include <iprt/thread.h> +#include "internal/iprt.h" + +#include <iprt/assert.h> +#include <iprt/cpuset.h> +#include <iprt/err.h> +#include <iprt/mp.h>



/**
 * Sets the CPU affinity of the calling thread.
 *
 * @returns VINF_SUCCESS on success, VERR_CPU_NOT_FOUND if the system call
 *          reports ENOENT, otherwise the converted error number.
 * @param   pCpuSet     The set of CPUs the thread may run on.  NULL selects
 *                      every CPU the Linux cpu_set_t can express.
 */
RTR3DECL(int) RTThreadSetAffinity(PCRTCPUSET pCpuSet)
{
    /* convert */
    cpu_set_t LnxCpuSet;
    CPU_ZERO(&LnxCpuSet);
    if (!pCpuSet)
        for (unsigned iCpu = 0; iCpu < CPU_SETSIZE; iCpu++)
            CPU_SET(iCpu, &LnxCpuSet);
    else
        for (unsigned iCpu = 0; iCpu < RT_MIN(CPU_SETSIZE, RTCPUSET_MAX_CPUS); iCpu++)
            if (RTCpuSetIsMemberByIndex(pCpuSet, iCpu))
                CPU_SET(iCpu, &LnxCpuSet);

    /*
     * pthread_setaffinity_np returns the error number directly and does not
     * set errno, so the return value is what must be examined and converted.
     */
    int const iErr = pthread_setaffinity_np(pthread_self(), sizeof(LnxCpuSet), &LnxCpuSet);
    if (!iErr)
        return VINF_SUCCESS;
    if (iErr == ENOENT)
        return VERR_CPU_NOT_FOUND;
    return RTErrConvertFromErrno(iErr);
}


/**
 * Gets the CPU affinity of the calling thread.
 *
 * @returns VINF_SUCCESS on success, otherwise the converted error number
 *          returned by pthread_getaffinity_np.
 * @param   pCpuSet     Where to return the affinity set.  Only the first
 *                      RT_MIN(CPU_SETSIZE, RTCPUSET_MAX_CPUS) CPUs are
 *                      transferred.
 */
RTR3DECL(int) RTThreadGetAffinity(PRTCPUSET pCpuSet)
{
    /* As above: the error number is the return value, not errno. */
    cpu_set_t LnxCpuSet;
    int const iErr = pthread_getaffinity_np(pthread_self(), sizeof(LnxCpuSet), &LnxCpuSet);
    if (iErr != 0)
        return RTErrConvertFromErrno(iErr);

    /* convert */
    RTCpuSetEmpty(pCpuSet);
    for (unsigned iCpu = 0; iCpu < RT_MIN(CPU_SETSIZE, RTCPUSET_MAX_CPUS); iCpu++)
        if (CPU_ISSET(iCpu, &LnxCpuSet))
            RTCpuSetAddByIndex(pCpuSet, iCpu);

    return VINF_SUCCESS;
}

#else
# include "../../generic/RTThreadGetAffinity-stub-generic.cpp"
# include "../../generic/RTThreadSetAffinity-stub-generic.cpp"
#endif

diff --git a/src/VBox/Runtime/r3/linux/time-linux.cpp b/src/VBox/Runtime/r3/linux/time-linux.cpp new file mode 100644 index 00000000..fddbd004 --- /dev/null +++ b/src/VBox/Runtime/r3/linux/time-linux.cpp @@ -0,0 +1,159 @@ +/* $Id: time-linux.cpp $ */ +/** @file + * IPRT - Time, POSIX. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + * + * The contents of this file may alternatively be used under the terms + * of the Common Development and Distribution License Version 1.0 + * (CDDL) only, as it comes in the "COPYING.CDDL" file of the + * VirtualBox OSE distribution, in which case the provisions of the + * CDDL are applicable instead of those of the GPL. + * + * You may elect to license modified versions of this file under the + * terms and conditions of either the GPL or the CDDL or both.
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP RTLOGGROUP_TIME +#define RTTIME_INCL_TIMEVAL +#include <sys/time.h> +#include <time.h> +#include <sys/syscall.h> +#include <unistd.h> +#ifndef __NR_clock_gettime +# define __NR_timer_create 259 +# define __NR_clock_gettime (__NR_timer_create+6) +#endif + +#include <iprt/time.h> +#include "internal/time.h"


/**
 * Invokes the clock_gettime system call directly via syscall().
 *
 * @returns The (non-negative) syscall result on success, -1 on failure.
 * @param   id      The clock to query.
 * @param   ts      Where to store the time.
 */
DECLINLINE(int) sys_clock_gettime(clockid_t id, struct timespec *ts)
{
    int const rcSys = syscall(__NR_clock_gettime, id, ts);
    return rcSys >= 0 ? rcSys : -1;
}


/**
 * Wrapper around various monotone time sources.
 *
 * The first call probes which source works and remembers the choice in a
 * function local static:  0 = libc clock_gettime(), 1 = raw syscall,
 * -1 = not probed yet, -2 = nothing works.
 *
 * @returns 0 on success, non-zero (-1 once probing has given up) on failure.
 * @param   ts      Where to store the time.
 */
DECLINLINE(int) mono_clock(struct timespec *ts)
{
    static int s_iWorking = -1;

#ifdef CLOCK_MONOTONIC
    /* Already settled on the standard clock_gettime(). */
    if (s_iWorking == 0)
        return clock_gettime(CLOCK_MONOTONIC, ts);

    /* Already settled on the syscall variant. */
    if (s_iWorking == 1)
        return sys_clock_gettime(CLOCK_MONOTONIC, ts);
#endif /* CLOCK_MONOTONIC */

    /*
     * Figure out what's working.
     */
    if (s_iWorking == -1)
    {
#ifdef CLOCK_MONOTONIC
        /* Real-Time API first ... */
        if (!clock_gettime(CLOCK_MONOTONIC, ts))
        {
            s_iWorking = 0;
            return 0;
        }

        /* ... then the raw system call. */
        if (!sys_clock_gettime(CLOCK_MONOTONIC, ts))
        {
            s_iWorking = 1;
            return 0;
        }
#endif /* CLOCK_MONOTONIC */

        /* give up */
        s_iWorking = -2;
    }
    return -1;
}


/**
 * Gets a nanosecond timestamp, preferring the monotonic clock and falling
 * back on gettimeofday() for good once the monotonic clock has failed.
 *
 * @returns Nanosecond timestamp.
 */
DECLINLINE(uint64_t) rtTimeGetSystemNanoTS(void)
{
    /* check monotonic clock first. */
    static bool s_fMonoClock = true;
    if (s_fMonoClock)
    {
        struct timespec TimeSpec;
        if (mono_clock(&TimeSpec) == 0)
            return (uint64_t)TimeSpec.tv_sec * RT_NS_1SEC_64
                 + TimeSpec.tv_nsec;
        s_fMonoClock = false;
    }

    /* fallback to gettimeofday(). */
    struct timeval TimeVal;
    gettimeofday(&TimeVal, NULL);
    return (uint64_t)TimeVal.tv_sec * RT_NS_1SEC_64
         + (uint64_t)(TimeVal.tv_usec * RT_NS_1US);
}


/**
 * Gets the current nanosecond timestamp.
 *
 * This differs from RTTimeNanoTS in that it will use system APIs and not do any
 * resolution or performance optimizations.
 *
 * @returns nanosecond timestamp.
 */
RTDECL(uint64_t) RTTimeSystemNanoTS(void)
{
    return rtTimeGetSystemNanoTS();
}


/**
 * Gets the current millisecond timestamp.
 *
 * This differs from RTTimeMilliTS in that it will use system APIs and not do any
 * resolution or performance optimizations.
 *
 * @returns millisecond timestamp.
 */
RTDECL(uint64_t) RTTimeSystemMilliTS(void)
{
    uint64_t const cNanoSecs = rtTimeGetSystemNanoTS();
    return cNanoSecs / RT_NS_1MS;
}

+ |