Diffstat (limited to 'src/VBox/VMM/VMMRZ')
-rw-r--r--  src/VBox/VMM/VMMRZ/CPUMRZ.cpp      |  144
-rw-r--r--  src/VBox/VMM/VMMRZ/CPUMRZA.asm     |  384
-rw-r--r--  src/VBox/VMM/VMMRZ/DBGFRZ.cpp      |  240
-rw-r--r--  src/VBox/VMM/VMMRZ/Makefile.kup    |    0
-rw-r--r--  src/VBox/VMM/VMMRZ/PGMRZDynMap.cpp | 2692
-rw-r--r--  src/VBox/VMM/VMMRZ/VMMRZ.cpp       |  253
6 files changed, 3713 insertions, 0 deletions
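
For orientation, a minimal hypothetical caller sketch (not part of this change) of how the CPUMRZFpuState* helpers added by CPUMRZ.cpp below are meant to be used around emulated FPU work; iemExecSomeFpuWork() is an invented placeholder, only the CPUMRZ* calls come from the patch.

/* Hypothetical usage sketch -- iemExecSomeFpuWork() is invented for
 * illustration; the CPUMRZ* calls are the ones introduced below. */
static int iemFpuWorkExample(PVMCPUCC pVCpu)
{
    /* Make sure the guest FPU/SSE/AVX state is _not_ loaded on the CPU and
     * the host state is saved, so CPUMCPU::Guest can be used safely. */
    CPUMRZFpuStatePrepareHostCpuForUse(pVCpu);

    int rc = iemExecSomeFpuWork(pVCpu);     /* emulate against CPUMCPU::Guest */

    /* Before reading guest XMM0..XMM15/MXCSR (e.g. for a write-back), flush
     * them from the CPU into CPUMCPU::Guest. */
    CPUMRZFpuStateActualizeSseForRead(pVCpu);
    return rc;
}
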
diff --git a/src/VBox/VMM/VMMRZ/CPUMRZ.cpp b/src/VBox/VMM/VMMRZ/CPUMRZ.cpp new file mode 100644 index 00000000..4e765df8 --- /dev/null +++ b/src/VBox/VMM/VMMRZ/CPUMRZ.cpp @@ -0,0 +1,144 @@ +/* $Id: CPUMRZ.cpp $ */ +/** @file + * CPUM - Raw-mode and ring-0 context. + */ + +/* + * Copyright (C) 2016-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_CPUM +#include <VBox/vmm/cpum.h> +#include "CPUMInternal.h" +#include <VBox/vmm/vmcc.h> + +#include <VBox/err.h> +#include <VBox/log.h> +#include <VBox/vmm/hm.h> +#include <iprt/assert.h> +#include <iprt/x86.h> + + + + +/** + * Prepares the host FPU/SSE/AVX stuff for IEM action. + * + * This will make sure the FPU/SSE/AVX guest state is _not_ loaded in the CPU. + * This will make sure the FPU/SSE/AVX host state is saved. + * Finally, it will make sure the FPU/SSE/AVX host features can be safely + * accessed. + * + * @param pVCpu The cross context virtual CPU structure. + */ +VMMRZ_INT_DECL(void) CPUMRZFpuStatePrepareHostCpuForUse(PVMCPUCC pVCpu) +{ + pVCpu->cpum.s.fChanged |= CPUM_CHANGED_FPU_REM; + switch (pVCpu->cpum.s.fUseFlags & (CPUM_USED_FPU_GUEST | CPUM_USED_FPU_HOST)) + { + case 0: + if (cpumRZSaveHostFPUState(&pVCpu->cpum.s) == VINF_CPUM_HOST_CR0_MODIFIED) + HMR0NotifyCpumModifiedHostCr0(pVCpu); + Log6(("CPUMRZFpuStatePrepareHostCpuForUse: #0 - %#x\n", ASMGetCR0())); + break; + + case CPUM_USED_FPU_HOST: + Log6(("CPUMRZFpuStatePrepareHostCpuForUse: #1 - %#x\n", ASMGetCR0())); + break; + + case CPUM_USED_FPU_GUEST | CPUM_USED_FPU_HOST: + cpumRZSaveGuestFpuState(&pVCpu->cpum.s, true /*fLeaveFpuAccessible*/); +#ifdef IN_RING0 + HMR0NotifyCpumUnloadedGuestFpuState(pVCpu); +#endif + Log6(("CPUMRZFpuStatePrepareHostCpuForUse: #2 - %#x\n", ASMGetCR0())); + break; + + default: + AssertFailed(); + } + +} + + +/** + * Makes sure the FPU/SSE/AVX guest state is saved in CPUMCPU::Guest and will be + * reloaded before direct use. + * + * No promisses about the FPU/SSE/AVX host features are made. + * + * @param pVCpu The cross context virtual CPU structure. + */ +VMMRZ_INT_DECL(void) CPUMRZFpuStateActualizeForChange(PVMCPUCC pVCpu) +{ + CPUMRZFpuStatePrepareHostCpuForUse(pVCpu); +} + + +/** + * Makes sure the FPU/SSE/AVX state in CPUMCPU::Guest is up to date. + * + * This will not cause CPUM_USED_FPU_GUEST to change. + * + * @param pVCpu The cross context virtual CPU structure. + */ +VMMRZ_INT_DECL(void) CPUMRZFpuStateActualizeForRead(PVMCPUCC pVCpu) +{ + if (pVCpu->cpum.s.fUseFlags & CPUM_USED_FPU_GUEST) + { + cpumRZSaveGuestFpuState(&pVCpu->cpum.s, false /*fLeaveFpuAccessible*/); + pVCpu->cpum.s.fUseFlags |= CPUM_USED_FPU_GUEST; + Log7(("CPUMRZFpuStateActualizeForRead\n")); + } +} + + +/** + * Makes sure the XMM0..XMM15 and MXCSR state in CPUMCPU::Guest is up to date. 
+ * + * This will not cause CPUM_USED_FPU_GUEST to change. + * + * @param pVCpu The cross context virtual CPU structure. + */ +VMMRZ_INT_DECL(void) CPUMRZFpuStateActualizeSseForRead(PVMCPUCC pVCpu) +{ +#if defined(VBOX_WITH_KERNEL_USING_XMM) && HC_ARCH_BITS == 64 + NOREF(pVCpu); +#else + if (pVCpu->cpum.s.fUseFlags & CPUM_USED_FPU_GUEST) + { + cpumRZSaveGuestSseRegisters(&pVCpu->cpum.s); + Log7(("CPUMRZFpuStateActualizeSseForRead\n")); + } +#endif +} + + +/** + * Makes sure the YMM0..YMM15 and MXCSR state in CPUMCPU::Guest is up to date. + * + * This will not cause CPUM_USED_FPU_GUEST to change. + * + * @param pVCpu The cross context virtual CPU structure. + */ +VMMRZ_INT_DECL(void) CPUMRZFpuStateActualizeAvxForRead(PVMCPUCC pVCpu) +{ + if (pVCpu->cpum.s.fUseFlags & CPUM_USED_FPU_GUEST) + { + cpumRZSaveGuestAvxRegisters(&pVCpu->cpum.s); + Log7(("CPUMRZFpuStateActualizeAvxForRead\n")); + } +} + diff --git a/src/VBox/VMM/VMMRZ/CPUMRZA.asm b/src/VBox/VMM/VMMRZ/CPUMRZA.asm new file mode 100644 index 00000000..14309a9f --- /dev/null +++ b/src/VBox/VMM/VMMRZ/CPUMRZA.asm @@ -0,0 +1,384 @@ + ; $Id: CPUMRZA.asm $ +;; @file +; CPUM - Raw-mode and Ring-0 Context Assembly Routines. +; + +; +; Copyright (C) 2006-2020 Oracle Corporation +; +; This file is part of VirtualBox Open Source Edition (OSE), as +; available from http://www.virtualbox.org. This file is free software; +; you can redistribute it and/or modify it under the terms of the GNU +; General Public License (GPL) as published by the Free Software +; Foundation, in version 2 as it comes in the "COPYING" file of the +; VirtualBox OSE distribution. VirtualBox OSE is distributed in the +; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. +; + + +;******************************************************************************* +;* Header Files * +;******************************************************************************* +%define RT_ASM_WITH_SEH64 +%include "VBox/asmdefs.mac" +%include "CPUMInternal.mac" +%include "iprt/x86.mac" +%include "VBox/vmm/cpum.mac" +%include "VBox/err.mac" + + + +BEGINCODE + + +;; +; Saves the host FPU/SSE/AVX state. +; +; Will return with CR0.EM and CR0.TS cleared! This is the normal state in ring-0. +; +; @returns VINF_SUCCESS (0) or VINF_CPUM_HOST_CR0_MODIFIED. (EAX) +; @param pCpumCpu x86:[ebp+8] gcc:rdi msc:rcx CPUMCPU pointer +; +align 16 +BEGINPROC cpumRZSaveHostFPUState + push xBP + SEH64_PUSH_xBP + mov xBP, xSP + SEH64_SET_FRAME_xBP 0 +SEH64_END_PROLOGUE + + ; + ; Prologue - xAX+xDX must be free for XSAVE/XRSTOR input. + ; +%ifdef RT_ARCH_AMD64 + %ifdef ASM_CALL64_MSC + mov r11, rcx + %else + mov r11, rdi + %endif + %define pCpumCpu r11 + %define pXState r10 +%else + push ebx + push esi + mov ebx, dword [ebp + 8] + %define pCpumCpu ebx + %define pXState esi +%endif + + pushf ; The darwin kernel can get upset or upset things if an + cli ; interrupt occurs while we're doing fxsave/fxrstor/cr0. + + ; + ; We may have to update CR0, indirectly or directly. We must report any + ; changes to the VT-x code. + ; + CPUMRZ_TOUCH_FPU_CLEAR_CR0_FPU_TRAPS_SET_RC xCX, xAX, pCpumCpu ; xCX is the return value (xAX scratch) + + ; + ; Save the host state (xsave/fxsave will cause thread FPU state to be + ; loaded on systems where we are allowed to use it in ring-0. + ; + CPUMR0_SAVE_HOST + + or dword [pCpumCpu + CPUMCPU.fUseFlags], (CPUM_USED_FPU_HOST | CPUM_USED_FPU_SINCE_REM) ; Latter is not necessarily true, but normally yes. + popf + + mov eax, ecx ; The return value from above. 
+%ifdef RT_ARCH_X86 + pop esi + pop ebx +%endif + leave + ret +%undef pCpumCpu +%undef pXState +ENDPROC cpumRZSaveHostFPUState + + +;; +; Saves the guest FPU/SSE/AVX state. +; +; @param pCpumCpu x86:[ebp+8] gcc:rdi msc:rcx CPUMCPU pointer +; @param fLeaveFpuAccessible x86:[ebp+c] gcc:sil msc:dl Whether to restore CR0 and XCR0 on +; the way out. Only really applicable to RC. +; +; @remarks 64-bit Windows drivers shouldn't use AVX registers without saving+loading: +; https://msdn.microsoft.com/en-us/library/windows/hardware/ff545910%28v=vs.85%29.aspx?f=255&MSPPError=-2147217396 +; However the compiler docs have different idea: +; https://msdn.microsoft.com/en-us/library/9z1stfyw.aspx +; We'll go with the former for now. +; +align 16 +BEGINPROC cpumRZSaveGuestFpuState + push xBP + SEH64_PUSH_xBP + mov xBP, xSP + SEH64_SET_FRAME_xBP 0 +SEH64_END_PROLOGUE + + ; + ; Prologue - xAX+xDX must be free for XSAVE/XRSTOR input. + ; +%ifdef RT_ARCH_AMD64 + %ifdef ASM_CALL64_MSC + mov r11, rcx + %else + mov r11, rdi + %endif + %define pCpumCpu r11 + %define pXState r10 +%else + push ebx + push esi + mov ebx, dword [ebp + 8] + %define pCpumCpu ebx + %define pXState esi +%endif + pushf ; The darwin kernel can get upset or upset things if an + cli ; interrupt occurs while we're doing fxsave/fxrstor/cr0. + + %ifdef IN_RC + mov ecx, cr0 ; ecx = saved cr0 + test ecx, X86_CR0_TS | X86_CR0_EM + jz .skip_cr0_write + mov eax, ecx + and eax, ~(X86_CR0_TS | X86_CR0_EM) + mov cr0, eax +.skip_cr0_write: + %endif + + %ifndef VBOX_WITH_KERNEL_USING_XMM + CPUMR0_SAVE_GUEST + %else + ; + ; The XMM0..XMM15 registers have been saved already. We exploit the + ; host state here to temporarly save the non-volatile XMM registers, + ; so we can load the guest ones while saving. This is safe. + ; + + ; Save caller's XMM registers. + mov pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0] + movdqa [pXState + X86FXSTATE.xmm6 ], xmm6 + movdqa [pXState + X86FXSTATE.xmm7 ], xmm7 + movdqa [pXState + X86FXSTATE.xmm8 ], xmm8 + movdqa [pXState + X86FXSTATE.xmm9 ], xmm9 + movdqa [pXState + X86FXSTATE.xmm10], xmm10 + movdqa [pXState + X86FXSTATE.xmm11], xmm11 + movdqa [pXState + X86FXSTATE.xmm12], xmm12 + movdqa [pXState + X86FXSTATE.xmm13], xmm13 + movdqa [pXState + X86FXSTATE.xmm14], xmm14 + movdqa [pXState + X86FXSTATE.xmm15], xmm15 + stmxcsr [pXState + X86FXSTATE.MXCSR] + + ; Load the guest XMM register values we already saved in HMR0VMXStartVMWrapXMM. + mov pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0] + movdqa xmm0, [pXState + X86FXSTATE.xmm0] + movdqa xmm1, [pXState + X86FXSTATE.xmm1] + movdqa xmm2, [pXState + X86FXSTATE.xmm2] + movdqa xmm3, [pXState + X86FXSTATE.xmm3] + movdqa xmm4, [pXState + X86FXSTATE.xmm4] + movdqa xmm5, [pXState + X86FXSTATE.xmm5] + movdqa xmm6, [pXState + X86FXSTATE.xmm6] + movdqa xmm7, [pXState + X86FXSTATE.xmm7] + movdqa xmm8, [pXState + X86FXSTATE.xmm8] + movdqa xmm9, [pXState + X86FXSTATE.xmm9] + movdqa xmm10, [pXState + X86FXSTATE.xmm10] + movdqa xmm11, [pXState + X86FXSTATE.xmm11] + movdqa xmm12, [pXState + X86FXSTATE.xmm12] + movdqa xmm13, [pXState + X86FXSTATE.xmm13] + movdqa xmm14, [pXState + X86FXSTATE.xmm14] + movdqa xmm15, [pXState + X86FXSTATE.xmm15] + ldmxcsr [pXState + X86FXSTATE.MXCSR] + + CPUMR0_SAVE_GUEST + + ; Restore caller's XMM registers. 
+ mov pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0] + movdqa xmm6, [pXState + X86FXSTATE.xmm6 ] + movdqa xmm7, [pXState + X86FXSTATE.xmm7 ] + movdqa xmm8, [pXState + X86FXSTATE.xmm8 ] + movdqa xmm9, [pXState + X86FXSTATE.xmm9 ] + movdqa xmm10, [pXState + X86FXSTATE.xmm10] + movdqa xmm11, [pXState + X86FXSTATE.xmm11] + movdqa xmm12, [pXState + X86FXSTATE.xmm12] + movdqa xmm13, [pXState + X86FXSTATE.xmm13] + movdqa xmm14, [pXState + X86FXSTATE.xmm14] + movdqa xmm15, [pXState + X86FXSTATE.xmm15] + ldmxcsr [pXState + X86FXSTATE.MXCSR] + + %endif + + and dword [pCpumCpu + CPUMCPU.fUseFlags], ~CPUM_USED_FPU_GUEST + %ifdef IN_RC + test byte [ebp + 0ch], 1 ; fLeaveFpuAccessible + jz .no_cr0_restore + CPUMRZ_RESTORE_CR0_IF_TS_OR_EM_SET ecx +.no_cr0_restore: + %endif + popf +%ifdef RT_ARCH_X86 + pop esi + pop ebx +%endif + leave + ret +%undef pCpumCpu +%undef pXState +ENDPROC cpumRZSaveGuestFpuState + + +;; +; Saves the guest XMM0..15 registers and MXCSR. +; +; The purpose is to actualize the register state for read-only use, so CR0 is +; restored in raw-mode context (so, the FPU/SSE/AVX CPU features can be +; inaccessible upon return). +; +; @param pCpumCpu x86:[ebp+8] gcc:rdi msc:rcx CPUMCPU pointer +; +align 16 +BEGINPROC cpumRZSaveGuestSseRegisters + push xBP + SEH64_PUSH_xBP + mov xBP, xSP + SEH64_SET_FRAME_xBP 0 +SEH64_END_PROLOGUE + +%ifndef VBOX_WITH_KERNEL_USING_XMM + ; + ; Load xCX with the guest pXStateR0. + ; + %ifdef ASM_CALL64_GCC + mov xCX, rdi + %elifdef RT_ARCH_X86 + mov xCX, dword [ebp + 8] + %endif + %ifdef IN_RING0 + mov xCX, [xCX + CPUMCPU.Guest.pXStateR0] + %elifdef IN_RC + mov xCX, [xCX + CPUMCPU.Guest.pXStateRC] + %else + %error "Invalid context!" + %endif + + %ifdef IN_RC + ; Temporarily grant access to the SSE state. xDX must be preserved until CR0 is restored! + mov edx, cr0 + test edx, X86_CR0_TS | X86_CR0_EM + jz .skip_cr0_write + mov eax, edx + and eax, ~(X86_CR0_TS | X86_CR0_EM) + mov cr0, eax +.skip_cr0_write: + %endif + + ; + ; Do the job. + ; + stmxcsr [xCX + X86FXSTATE.MXCSR] + movdqa [xCX + X86FXSTATE.xmm0 ], xmm0 + movdqa [xCX + X86FXSTATE.xmm1 ], xmm1 + movdqa [xCX + X86FXSTATE.xmm2 ], xmm2 + movdqa [xCX + X86FXSTATE.xmm3 ], xmm3 + movdqa [xCX + X86FXSTATE.xmm4 ], xmm4 + movdqa [xCX + X86FXSTATE.xmm5 ], xmm5 + movdqa [xCX + X86FXSTATE.xmm6 ], xmm6 + movdqa [xCX + X86FXSTATE.xmm7 ], xmm7 + %if ARCH_BITS == 64 + movdqa [xCX + X86FXSTATE.xmm8 ], xmm8 + movdqa [xCX + X86FXSTATE.xmm9 ], xmm9 + movdqa [xCX + X86FXSTATE.xmm10], xmm10 + movdqa [xCX + X86FXSTATE.xmm11], xmm11 + movdqa [xCX + X86FXSTATE.xmm12], xmm12 + movdqa [xCX + X86FXSTATE.xmm13], xmm13 + movdqa [xCX + X86FXSTATE.xmm14], xmm14 + movdqa [xCX + X86FXSTATE.xmm15], xmm15 + %endif + + %ifdef IN_RC + CPUMRZ_RESTORE_CR0_IF_TS_OR_EM_SET edx ; Restore CR0 if we changed it above. + %endif + +%endif ; !VBOX_WITH_KERNEL_USING_XMM + + leave + ret +ENDPROC cpumRZSaveGuestSseRegisters + +;; +; Saves the guest YMM0..15 registers. +; +; The purpose is to actualize the register state for read-only use, so CR0 is +; restored in raw-mode context (so, the FPU/SSE/AVX CPU features can be +; inaccessible upon return). +; +; @param pCpumCpu x86:[ebp+8] gcc:rdi msc:rcx CPUMCPU pointer +; +align 16 +BEGINPROC cpumRZSaveGuestAvxRegisters + push xBP + SEH64_PUSH_xBP + mov xBP, xSP + SEH64_SET_FRAME_xBP 0 +%ifdef IN_RC + push xBX +%endif +SEH64_END_PROLOGUE + + ; + ; Load xCX with the guest pXStateR0. 
+ ; +%ifdef ASM_CALL64_GCC + mov xCX, rdi +%elifdef RT_ARCH_X86 + mov xCX, dword [ebp + 8] +%endif +%ifdef IN_RING0 + mov xCX, [xCX + CPUMCPU.Guest.pXStateR0] +%elifdef IN_RC + mov xCX, [xCX + CPUMCPU.Guest.pXStateRC] +%else + %error "Invalid context!" +%endif + +%ifdef IN_RC + ; Temporarily grant access to the SSE state. xBX must be preserved until CR0 is restored! + mov ebx, cr0 + test ebx, X86_CR0_TS | X86_CR0_EM + jz .skip_cr0_write + mov eax, ebx + and eax, ~(X86_CR0_TS | X86_CR0_EM) + mov cr0, eax +.skip_cr0_write: +%endif + + ; + ; Use XSAVE to do the job. + ; + ; Drivers shouldn't use AVX registers without saving+loading: + ; https://msdn.microsoft.com/en-us/library/windows/hardware/ff545910%28v=vs.85%29.aspx?f=255&MSPPError=-2147217396 + ; However the compiler docs have different idea: + ; https://msdn.microsoft.com/en-us/library/9z1stfyw.aspx + ; We'll go with the former for now. + ; +%ifdef VBOX_WITH_KERNEL_USING_XMM + mov eax, XSAVE_C_YMM +%else + mov eax, XSAVE_C_YMM | XSAVE_C_SSE ; The SSE component includes MXCSR. +%endif + xor edx, edx +%if ARCH_BITS == 64 + o64 xsave [xCX] +%else + xsave [xCX] +%endif + +%ifdef IN_RC + CPUMRZ_RESTORE_CR0_IF_TS_OR_EM_SET ebx ; Restore CR0 if we changed it above. + pop xBX +%endif + leave + ret +ENDPROC cpumRZSaveGuestAvxRegisters + diff --git a/src/VBox/VMM/VMMRZ/DBGFRZ.cpp b/src/VBox/VMM/VMMRZ/DBGFRZ.cpp new file mode 100644 index 00000000..44066fd5 --- /dev/null +++ b/src/VBox/VMM/VMMRZ/DBGFRZ.cpp @@ -0,0 +1,240 @@ +/* $Id: DBGFRZ.cpp $ */ +/** @file + * DBGF - Debugger Facility, RZ part. + */ + +/* + * Copyright (C) 2006-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_DBGF +#include <VBox/vmm/dbgf.h> +#include <VBox/vmm/selm.h> +#ifdef IN_RC +# include <VBox/vmm/trpm.h> +#endif +#include <VBox/log.h> +#include "DBGFInternal.h" +#include <VBox/vmm/vmcc.h> +#include <VBox/err.h> +#include <iprt/assert.h> + +#ifdef IN_RC +DECLASM(void) TRPMRCHandlerAsmTrap03(void); +#endif + + +/** + * \#DB (Debug event) handler. + * + * @returns VBox status code. + * VINF_SUCCESS means we completely handled this trap, + * other codes are passed execution to host context. + * + * @param pVM The cross context VM structure. + * @param pVCpu The cross context virtual CPU structure. + * @param pRegFrame Pointer to the register frame for the trap. + * @param uDr6 The DR6 hypervisor register value. + * @param fAltStepping Alternative stepping indicator. 
+ */ +VMMRZ_INT_DECL(int) DBGFRZTrap01Handler(PVM pVM, PVMCPU pVCpu, PCPUMCTXCORE pRegFrame, RTGCUINTREG uDr6, bool fAltStepping) +{ +#ifdef IN_RC + const bool fInHyper = !(pRegFrame->ss.Sel & X86_SEL_RPL) && !pRegFrame->eflags.Bits.u1VM; +#else + NOREF(pRegFrame); + const bool fInHyper = false; +#endif + + /** @todo Intel docs say that X86_DR6_BS has the highest priority... */ + /* + * A breakpoint? + */ + AssertCompile(X86_DR6_B0 == 1 && X86_DR6_B1 == 2 && X86_DR6_B2 == 4 && X86_DR6_B3 == 8); + if ( (uDr6 & (X86_DR6_B0 | X86_DR6_B1 | X86_DR6_B2 | X86_DR6_B3)) + && pVM->dbgf.s.cEnabledHwBreakpoints > 0) + { + for (unsigned iBp = 0; iBp < RT_ELEMENTS(pVM->dbgf.s.aHwBreakpoints); iBp++) + { + if ( ((uint32_t)uDr6 & RT_BIT_32(iBp)) + && pVM->dbgf.s.aHwBreakpoints[iBp].enmType == DBGFBPTYPE_REG) + { + pVCpu->dbgf.s.iActiveBp = pVM->dbgf.s.aHwBreakpoints[iBp].iBp; + pVCpu->dbgf.s.fSingleSteppingRaw = false; + LogFlow(("DBGFRZTrap03Handler: hit hw breakpoint %d at %04x:%RGv\n", + pVM->dbgf.s.aHwBreakpoints[iBp].iBp, pRegFrame->cs.Sel, pRegFrame->rip)); + + return fInHyper ? VINF_EM_DBG_HYPER_BREAKPOINT : VINF_EM_DBG_BREAKPOINT; + } + } + } + + /* + * Single step? + * Are we single stepping or is it the guest? + */ + if ( (uDr6 & X86_DR6_BS) + && (fInHyper || pVCpu->dbgf.s.fSingleSteppingRaw || fAltStepping)) + { + pVCpu->dbgf.s.fSingleSteppingRaw = false; + LogFlow(("DBGFRZTrap01Handler: single step at %04x:%RGv\n", pRegFrame->cs.Sel, pRegFrame->rip)); + return fInHyper ? VINF_EM_DBG_HYPER_STEPPED : VINF_EM_DBG_STEPPED; + } + +#ifdef IN_RC + /* + * Either an ICEBP in hypervisor code or a guest related debug exception + * of sorts. + */ + if (RT_UNLIKELY(fInHyper)) + { + /* + * Is this a guest debug event that was delayed past a ring transition? + * + * Since we do no allow sysenter/syscall in raw-mode, the only + * non-trap/fault type transitions that can occur are thru interrupt gates. + * Of those, only INT3 (#BP) has a DPL other than 0 with a CS.RPL of 0. + * See bugref:9171 and bs3-cpu-weird-1 for more details. 
+ * + * We need to reconstruct the guest register state from the hypervisor one + * here, so here is the layout of the IRET frame on the stack: + * 20:[8] GS (V86 only) + * 1C:[7] FS (V86 only) + * 18:[6] DS (V86 only) + * 14:[5] ES (V86 only) + * 10:[4] SS + * 0c:[3] ESP + * 08:[2] EFLAGS + * 04:[1] CS + * 00:[0] EIP + */ + if (pRegFrame->rip == (uintptr_t)TRPMRCHandlerAsmTrap03) + { + uint32_t const *pu32Stack = (uint32_t const *)pRegFrame->esp; + if ( (pu32Stack[2] & X86_EFL_VM) + || (pu32Stack[1] & X86_SEL_RPL)) + { + LogFlow(("DBGFRZTrap01Handler: Detected guest #DB delayed past ring transition %04x:%RX32 %#x\n", + pu32Stack[1] & 0xffff, pu32Stack[0], pu32Stack[2])); + PCPUMCTX pGstCtx = CPUMQueryGuestCtxPtr(pVCpu); + pGstCtx->rip = pu32Stack[0]; + pGstCtx->cs.Sel = pu32Stack[1]; + pGstCtx->eflags.u = pu32Stack[2]; + pGstCtx->rsp = pu32Stack[3]; + pGstCtx->ss.Sel = pu32Stack[4]; + if (pu32Stack[2] & X86_EFL_VM) + { + pGstCtx->es.Sel = pu32Stack[5]; + pGstCtx->ds.Sel = pu32Stack[6]; + pGstCtx->fs.Sel = pu32Stack[7]; + pGstCtx->gs.Sel = pu32Stack[8]; + } + else + { + pGstCtx->es.Sel = pRegFrame->es.Sel; + pGstCtx->ds.Sel = pRegFrame->ds.Sel; + pGstCtx->fs.Sel = pRegFrame->fs.Sel; + pGstCtx->gs.Sel = pRegFrame->gs.Sel; + } + pGstCtx->rax = pRegFrame->rax; + pGstCtx->rcx = pRegFrame->rcx; + pGstCtx->rdx = pRegFrame->rdx; + pGstCtx->rbx = pRegFrame->rbx; + pGstCtx->rsi = pRegFrame->rsi; + pGstCtx->rdi = pRegFrame->rdi; + pGstCtx->rbp = pRegFrame->rbp; + + /* + * We should assert a #BP followed by a #DB here, but TRPM cannot + * do that. So, we'll just assert the #BP and ignore the #DB, even + * if that isn't strictly correct. + */ + TRPMResetTrap(pVCpu); + TRPMAssertTrap(pVCpu, X86_XCPT_BP, TRPM_SOFTWARE_INT); + return VINF_EM_RAW_GUEST_TRAP; + } + } + + LogFlow(("DBGFRZTrap01Handler: Unknown bp at %04x:%RGv\n", pRegFrame->cs.Sel, pRegFrame->rip)); + return VERR_DBGF_HYPER_DB_XCPT; + } +#endif + + LogFlow(("DBGFRZTrap01Handler: guest debug event %#x at %04x:%RGv!\n", (uint32_t)uDr6, pRegFrame->cs.Sel, pRegFrame->rip)); + return VINF_EM_RAW_GUEST_TRAP; +} + + +/** + * \#BP (Breakpoint) handler. + * + * @returns VBox status code. + * VINF_SUCCESS means we completely handled this trap, + * other codes are passed execution to host context. + * + * @param pVM The cross context VM structure. + * @param pVCpu The cross context virtual CPU structure. + * @param pRegFrame Pointer to the register frame for the trap. + */ +VMMRZ_INT_DECL(int) DBGFRZTrap03Handler(PVM pVM, PVMCPU pVCpu, PCPUMCTXCORE pRegFrame) +{ +#ifdef IN_RC + const bool fInHyper = !(pRegFrame->ss.Sel & X86_SEL_RPL) && !pRegFrame->eflags.Bits.u1VM; +#else + const bool fInHyper = false; +#endif + + /* + * Get the trap address and look it up in the breakpoint table. + * Don't bother if we don't have any breakpoints. 
+ */ + unsigned cToSearch = pVM->dbgf.s.Int3.cToSearch; + if (cToSearch > 0) + { + RTGCPTR pPc; + int rc = SELMValidateAndConvertCSAddr(pVCpu, pRegFrame->eflags, pRegFrame->ss.Sel, pRegFrame->cs.Sel, &pRegFrame->cs, +#ifdef IN_RC + pRegFrame->eip - 1, +#else + pRegFrame->rip /* no -1 in R0 */, +#endif + &pPc); + AssertRCReturn(rc, rc); + + unsigned iBp = pVM->dbgf.s.Int3.iStartSearch; + while (cToSearch-- > 0) + { + if ( pVM->dbgf.s.aBreakpoints[iBp].u.GCPtr == (RTGCUINTPTR)pPc + && pVM->dbgf.s.aBreakpoints[iBp].enmType == DBGFBPTYPE_INT3) + { + pVM->dbgf.s.aBreakpoints[iBp].cHits++; + pVCpu->dbgf.s.iActiveBp = pVM->dbgf.s.aBreakpoints[iBp].iBp; + + LogFlow(("DBGFRZTrap03Handler: hit breakpoint %d at %RGv (%04x:%RGv) cHits=0x%RX64\n", + pVM->dbgf.s.aBreakpoints[iBp].iBp, pPc, pRegFrame->cs.Sel, pRegFrame->rip, + pVM->dbgf.s.aBreakpoints[iBp].cHits)); + return fInHyper + ? VINF_EM_DBG_HYPER_BREAKPOINT + : VINF_EM_DBG_BREAKPOINT; + } + iBp++; + } + } + + return fInHyper + ? VINF_EM_DBG_HYPER_ASSERTION + : VINF_EM_RAW_GUEST_TRAP; +} + diff --git a/src/VBox/VMM/VMMRZ/Makefile.kup b/src/VBox/VMM/VMMRZ/Makefile.kup new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/VBox/VMM/VMMRZ/Makefile.kup diff --git a/src/VBox/VMM/VMMRZ/PGMRZDynMap.cpp b/src/VBox/VMM/VMMRZ/PGMRZDynMap.cpp new file mode 100644 index 00000000..0f4313bb --- /dev/null +++ b/src/VBox/VMM/VMMRZ/PGMRZDynMap.cpp @@ -0,0 +1,2692 @@ +/* $Id: PGMRZDynMap.cpp $ */ +/** @file + * PGM - Page Manager and Monitor, dynamic mapping cache. + */ + +/* + * Copyright (C) 2008-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_PGM_DYNMAP +#include <VBox/vmm/pgm.h> +#include "PGMInternal.h" +#include <VBox/vmm/vm.h> +#include "PGMInline.h" +#include <VBox/err.h> +#include <VBox/param.h> +#include <VBox/sup.h> +#include <iprt/asm.h> +#include <iprt/asm-amd64-x86.h> +#include <iprt/assert.h> +#ifndef IN_RC +# include <iprt/cpuset.h> +# include <iprt/mem.h> +# include <iprt/memobj.h> +# include <iprt/mp.h> +# include <iprt/semaphore.h> +# include <iprt/spinlock.h> +#endif +#include <iprt/string.h> + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ +#ifdef IN_RING0 +/** The max size of the mapping cache (in pages). */ +# define PGMR0DYNMAP_MAX_PAGES ((16*_1M) >> PAGE_SHIFT) +/** The small segment size that is adopted on out-of-memory conditions with a + * single big segment. */ +# define PGMR0DYNMAP_SMALL_SEG_PAGES 128 +/** The number of pages we reserve per CPU. 
*/ +# define PGMR0DYNMAP_PAGES_PER_CPU 256 +/** The minimum number of pages we reserve per CPU. + * This must be equal or larger than the autoset size. */ +# define PGMR0DYNMAP_PAGES_PER_CPU_MIN 64 +/** Calcs the overload threshold (safety margin). Current set at 50%. */ +# define PGMR0DYNMAP_CALC_OVERLOAD(cPages) ((cPages) / 2) +/** The number of guard pages. + * @remarks Never do tuning of the hashing or whatnot with a strict build! */ +# if defined(VBOX_STRICT) +# define PGMR0DYNMAP_GUARD_PAGES 1 +# else +# define PGMR0DYNMAP_GUARD_PAGES 0 +# endif +#endif /* IN_RING0 */ +/** The dummy physical address of guard pages. */ +#define PGMR0DYNMAP_GUARD_PAGE_HCPHYS UINT32_C(0x7777feed) +/** The dummy reference count of guard pages. (Must be non-zero.) */ +#define PGMR0DYNMAP_GUARD_PAGE_REF_COUNT INT32_C(0x7777feed) +#if 0 +/** Define this to just clear the present bit on guard pages. + * The alternative is to replace the entire PTE with an bad not-present + * PTE. Either way, XNU will screw us. :-/ */ +# define PGMR0DYNMAP_GUARD_NP +#endif +/** The dummy PTE value for a page. */ +#define PGMR0DYNMAP_GUARD_PAGE_LEGACY_PTE X86_PTE_PG_MASK +/** The dummy PTE value for a page. */ +#define PGMR0DYNMAP_GUARD_PAGE_PAE_PTE UINT64_MAX /*X86_PTE_PAE_PG_MASK*/ + +#ifdef IN_RING0 /* Note! Assertions causes panics if preemption is disabled, + * disable this to work around that. */ +/** + * Acquire the spinlock. + * This will declare a temporary variable and expands to two statements! + */ +# define PGMRZDYNMAP_SPINLOCK_ACQUIRE(pThis) \ + RTSpinlockAcquire((pThis)->hSpinlock) + +/** + * Releases the spinlock. + */ +# define PGMRZDYNMAP_SPINLOCK_RELEASE(pThis) \ + RTSpinlockRelease((pThis)->hSpinlock) + +/** + * Re-acquires the spinlock. + */ +# define PGMRZDYNMAP_SPINLOCK_REACQUIRE(pThis) \ + RTSpinlockAcquire((pThis)->hSpinlock) +#else +# define PGMRZDYNMAP_SPINLOCK_ACQUIRE(pThis) do { } while (0) +# define PGMRZDYNMAP_SPINLOCK_RELEASE(pThis) do { } while (0) +# define PGMRZDYNMAP_SPINLOCK_REACQUIRE(pThis) do { } while (0) +#endif + + +/** Converts a PGMCPUM::AutoSet pointer into a PVMCPU. */ +#define PGMRZDYNMAP_SET_2_VMCPU(pSet) (RT_FROM_MEMBER(pSet, VMCPU, pgm.s.AutoSet)) + +/** Converts a PGMCPUM::AutoSet pointer into a PVM. */ +#define PGMRZDYNMAP_SET_2_VM(pSet) (PGMRZDYNMAP_SET_2_VMCPU(pSet)->CTX_SUFF(pVM)) + +/** Converts a PGMCPUM::AutoSet pointer into a PVM. */ +#ifdef IN_RC +# define PGMRZDYNMAP_SET_2_DYNMAP(pSet) (PGMRZDYNMAP_SET_2_VM(pSet)->pgm.s.pRCDynMap) +#else +# define PGMRZDYNMAP_SET_2_DYNMAP(pSet) (g_pPGMR0DynMap) +#endif + +/** + * Gets the set index of the current CPU. + * + * This always returns 0 when in raw-mode context because there is only ever + * one EMT in that context (at least presently). + */ +#ifdef IN_RC +# define PGMRZDYNMAP_CUR_CPU() (0) +#else +# define PGMRZDYNMAP_CUR_CPU() RTMpCurSetIndex() +#endif + +/** PGMRZDYNMAP::u32Magic. (Jens Christian Bugge Wesseltoft) */ +#define PGMRZDYNMAP_MAGIC UINT32_C(0x19640201) + + +/** Zaps an set entry. */ +#define PGMRZDYNMAP_ZAP_ENTRY(pEntry) \ + do \ + { \ + (pEntry)->iPage = UINT16_MAX; \ + (pEntry)->cRefs = 0; \ + (pEntry)->cInlinedRefs = 0; \ + (pEntry)->cUnrefs = 0; \ + } while (0) + + +/** @def PGMRZDYNMAP_STRICT_RELEASE + * Define this to force pages to be released and make non-present ASAP after + * use. This should not normally be enabled as it is a bit expensive. 
*/ +#if 0 || defined(DOXYGEN_RUNNING) +# define PGMRZDYNMAP_STRICT_RELEASE +#endif + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +#ifdef IN_RING0 +/** + * Ring-0 dynamic mapping cache segment. + * + * The dynamic mapping cache can be extended with additional segments if the + * load is found to be too high. This done the next time a VM is created, under + * the protection of the init mutex. The arrays is reallocated and the new + * segment is added to the end of these. Nothing is rehashed of course, as the + * indexes / addresses must remain unchanged. + * + * This structure is only modified while owning the init mutex or during module + * init / term. + */ +typedef struct PGMR0DYNMAPSEG +{ + /** Pointer to the next segment. */ + struct PGMR0DYNMAPSEG *pNext; + /** The memory object for the virtual address range that we're abusing. */ + RTR0MEMOBJ hMemObj; + /** The start page in the cache. (I.e. index into the arrays.) */ + uint16_t iPage; + /** The number of pages this segment contributes. */ + uint16_t cPages; + /** The number of page tables. */ + uint16_t cPTs; + /** The memory objects for the page tables. */ + RTR0MEMOBJ ahMemObjPTs[1]; +} PGMR0DYNMAPSEG; +/** Pointer to a ring-0 dynamic mapping cache segment. */ +typedef PGMR0DYNMAPSEG *PPGMR0DYNMAPSEG; + + +/** + * Ring-0 dynamic mapping cache entry. + * + * @sa PGMRZDYNMAPENTRY, PGMRCDYNMAPENTRY. + */ +typedef struct PGMR0DYNMAPENTRY +{ + /** The physical address of the currently mapped page. + * This is duplicate for three reasons: cache locality, cache policy of the PT + * mappings and sanity checks. */ + RTHCPHYS HCPhys; + /** Pointer to the page. */ + void *pvPage; + /** The number of references. */ + int32_t volatile cRefs; + /** PTE pointer union. */ + union PGMR0DYNMAPENTRY_PPTE + { + /** PTE pointer, 32-bit legacy version. */ + PX86PTE pLegacy; + /** PTE pointer, PAE version. */ + PX86PTEPAE pPae; + /** PTE pointer, the void version. */ + void *pv; + } uPte; + /** CPUs that haven't invalidated this entry after it's last update. */ + RTCPUSET PendingSet; +} PGMR0DYNMAPENTRY; +/** Pointer a mapping cache entry for the ring-0. + * @sa PPGMRZDYNMAPENTRY, PPGMRCDYNMAPENTRY, */ +typedef PGMR0DYNMAPENTRY *PPGMR0DYNMAPENTRY; + + +/** + * Dynamic mapping cache for ring-0. + * + * This is initialized during VMMR0 module init but no segments are allocated + * at that time. Segments will be added when the first VM is started and + * removed again when the last VM shuts down, thus avoid consuming memory while + * dormant. At module termination, the remaining bits will be freed up. + * + * @sa PPGMRZDYNMAP, PGMRCDYNMAP. + */ +typedef struct PGMR0DYNMAP +{ + /** The usual magic number / eye catcher (PGMRZDYNMAP_MAGIC). */ + uint32_t u32Magic; + /** Spinlock serializing the normal operation of the cache. */ + RTSPINLOCK hSpinlock; + /** Array for tracking and managing the pages. */ + PPGMR0DYNMAPENTRY paPages; + /** The cache size given as a number of pages. */ + uint32_t cPages; + /** Whether it's 32-bit legacy or PAE/AMD64 paging mode. */ + bool fLegacyMode; + /** The current load. + * This does not include guard pages. */ + uint32_t cLoad; + /** The max load ever. + * This is maintained to trigger the adding of more mapping space. 
*/ + uint32_t cMaxLoad; + /** Initialization / termination lock. */ + RTSEMFASTMUTEX hInitLock; + /** The number of guard pages. */ + uint32_t cGuardPages; + /** The number of users (protected by hInitLock). */ + uint32_t cUsers; + /** Array containing a copy of the original page tables. + * The entries are either X86PTE or X86PTEPAE according to fLegacyMode. */ + void *pvSavedPTEs; + /** List of segments. */ + PPGMR0DYNMAPSEG pSegHead; + /** The paging mode. */ + SUPPAGINGMODE enmPgMode; +} PGMR0DYNMAP; + + +/** + * Paging level data. + */ +typedef struct PGMR0DYNMAPPGLVL +{ + uint32_t cLevels; /**< The number of levels. */ + struct + { + RTHCPHYS HCPhys; /**< The address of the page for the current level, + * i.e. what hMemObj/hMapObj is currently mapping. */ + RTHCPHYS fPhysMask; /**< Mask for extracting HCPhys from uEntry. */ + RTR0MEMOBJ hMemObj; /**< Memory object for HCPhys, PAGE_SIZE. */ + RTR0MEMOBJ hMapObj; /**< Mapping object for hMemObj. */ + uint32_t fPtrShift; /**< The pointer shift count. */ + uint64_t fPtrMask; /**< The mask to apply to the shifted pointer to get the table index. */ + uint64_t fAndMask; /**< And mask to check entry flags. */ + uint64_t fResMask; /**< The result from applying fAndMask. */ + union + { + void *pv; /**< hMapObj address. */ + PX86PGUINT paLegacy; /**< Legacy table view. */ + PX86PGPAEUINT paPae; /**< PAE/AMD64 table view. */ + } u; + } a[4]; +} PGMR0DYNMAPPGLVL; +/** Pointer to paging level data. */ +typedef PGMR0DYNMAPPGLVL *PPGMR0DYNMAPPGLVL; +#endif + +/** Mapping cache entry for the current context. + * @sa PGMR0DYNMAPENTRY, PGMRCDYNMAPENTRY */ +typedef CTX_MID(PGM,DYNMAPENTRY) PGMRZDYNMAPENTRY; +/** Pointer a mapping cache entry for the current context. + * @sa PGMR0DYNMAPENTRY, PGMRCDYNMAPENTRY */ +typedef PGMRZDYNMAPENTRY *PPGMRZDYNMAPENTRY; + +/** Pointer to the mapping cache instance for the current context. + * @sa PGMR0DYNMAP, PGMRCDYNMAP */ +typedef CTX_MID(PGM,DYNMAP) *PPGMRZDYNMAP; + + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +#ifdef IN_RING0 +/** Pointer to the ring-0 dynamic mapping cache. */ +static PGMR0DYNMAP *g_pPGMR0DynMap; +#endif +/** For overflow testing. */ +static bool g_fPGMR0DynMapTestRunning = false; + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +static void pgmRZDynMapReleasePage(PPGMRZDYNMAP pThis, uint32_t iPage, uint32_t cRefs); +#ifdef IN_RING0 +static int pgmR0DynMapSetup(PPGMRZDYNMAP pThis); +static int pgmR0DynMapExpand(PPGMRZDYNMAP pThis); +static void pgmR0DynMapTearDown(PPGMRZDYNMAP pThis); +#endif +#if 0 /*def DEBUG*/ +static int pgmR0DynMapTest(PVM pVM); +#endif + + +/** + * Initializes the auto mapping sets for a VM. + * + * @returns VINF_SUCCESS on success, VERR_PGM_DYNMAP_IPE on failure. + * @param pVM The cross context VM structure. 
+ */ +static int pgmRZDynMapInitAutoSetsForVM(PVM pVM) +{ + VMCPUID idCpu = pVM->cCpus; + AssertReturn(idCpu > 0 && idCpu <= VMM_MAX_CPU_COUNT, VERR_PGM_DYNMAP_IPE); + while (idCpu-- > 0) + { + PPGMMAPSET pSet = &pVM->aCpus[idCpu].pgm.s.AutoSet; + uint32_t j = RT_ELEMENTS(pSet->aEntries); + while (j-- > 0) + { + pSet->aEntries[j].pvPage = NULL; + pSet->aEntries[j].HCPhys = NIL_RTHCPHYS; + PGMRZDYNMAP_ZAP_ENTRY(&pSet->aEntries[j]); + } + pSet->cEntries = PGMMAPSET_CLOSED; + pSet->iSubset = UINT32_MAX; + pSet->iCpu = -1; + memset(&pSet->aiHashTable[0], 0xff, sizeof(pSet->aiHashTable)); + } + + return VINF_SUCCESS; +} + + +#ifdef IN_RING0 + +/** + * Initializes the ring-0 dynamic mapping cache. + * + * @returns VBox status code. + */ +VMMR0DECL(int) PGMR0DynMapInit(void) +{ + Assert(!g_pPGMR0DynMap); + + /* + * Create and initialize the cache instance. + */ + PPGMRZDYNMAP pThis = (PPGMRZDYNMAP)RTMemAllocZ(sizeof(*pThis)); + AssertLogRelReturn(pThis, VERR_NO_MEMORY); + int rc = VINF_SUCCESS; + pThis->enmPgMode = SUPR0GetPagingMode(); + switch (pThis->enmPgMode) + { + case SUPPAGINGMODE_32_BIT: + case SUPPAGINGMODE_32_BIT_GLOBAL: + pThis->fLegacyMode = false; + break; + case SUPPAGINGMODE_PAE: + case SUPPAGINGMODE_PAE_GLOBAL: + case SUPPAGINGMODE_PAE_NX: + case SUPPAGINGMODE_PAE_GLOBAL_NX: + case SUPPAGINGMODE_AMD64: + case SUPPAGINGMODE_AMD64_GLOBAL: + case SUPPAGINGMODE_AMD64_NX: + case SUPPAGINGMODE_AMD64_GLOBAL_NX: + pThis->fLegacyMode = false; + break; + default: + rc = VERR_PGM_DYNMAP_IPE; + break; + } + if (RT_SUCCESS(rc)) + { + rc = RTSemFastMutexCreate(&pThis->hInitLock); + if (RT_SUCCESS(rc)) + { + rc = RTSpinlockCreate(&pThis->hSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "PGMR0DynMap"); + if (RT_SUCCESS(rc)) + { + pThis->u32Magic = PGMRZDYNMAP_MAGIC; + g_pPGMR0DynMap = pThis; + return VINF_SUCCESS; + } + RTSemFastMutexDestroy(pThis->hInitLock); + } + } + RTMemFree(pThis); + return rc; +} + + +/** + * Terminates the ring-0 dynamic mapping cache. + */ +VMMR0DECL(void) PGMR0DynMapTerm(void) +{ + /* + * Destroy the cache. + * + * There is not supposed to be any races here, the loader should + * make sure about that. So, don't bother locking anything. + * + * The VM objects should all be destroyed by now, so there is no + * dangling users or anything like that to clean up. This routine + * is just a mirror image of PGMR0DynMapInit. + */ + PPGMRZDYNMAP pThis = g_pPGMR0DynMap; + if (pThis) + { + AssertPtr(pThis); + g_pPGMR0DynMap = NULL; + + /* This should *never* happen, but in case it does try not to leak memory. */ + AssertLogRelMsg(!pThis->cUsers && !pThis->paPages && !pThis->pvSavedPTEs && !pThis->cPages, + ("cUsers=%d paPages=%p pvSavedPTEs=%p cPages=%#x\n", + pThis->cUsers, pThis->paPages, pThis->pvSavedPTEs, pThis->cPages)); + if (pThis->paPages) + pgmR0DynMapTearDown(pThis); + + /* Free the associated resources. */ + RTSemFastMutexDestroy(pThis->hInitLock); + pThis->hInitLock = NIL_RTSEMFASTMUTEX; + RTSpinlockDestroy(pThis->hSpinlock); + pThis->hSpinlock = NIL_RTSPINLOCK; + pThis->u32Magic = UINT32_MAX; + RTMemFree(pThis); + } +} + + +/** + * Initializes the dynamic mapping cache for a new VM. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMR0DECL(int) PGMR0DynMapInitVM(PVM pVM) +{ + AssertMsgReturn(!pVM->pgm.s.pvR0DynMapUsed, ("%p (pThis=%p)\n", pVM->pgm.s.pvR0DynMapUsed, g_pPGMR0DynMap), VERR_WRONG_ORDER); + + /* + * Initialize the auto sets. 
+ */ + int rc = pgmRZDynMapInitAutoSetsForVM(pVM); + if (RT_FAILURE(rc)) + return rc; + + /* + * Do we need the cache? Skip the last bit if we don't. + */ + if (VM_IS_RAW_MODE_ENABLED(pVM)) + return VINF_SUCCESS; + + /* + * Reference and if necessary setup or expand the cache. + */ + PPGMRZDYNMAP pThis = g_pPGMR0DynMap; + AssertPtrReturn(pThis, VERR_PGM_DYNMAP_IPE); + rc = RTSemFastMutexRequest(pThis->hInitLock); + AssertLogRelRCReturn(rc, rc); + + pThis->cUsers++; + if (pThis->cUsers == 1) + { + rc = pgmR0DynMapSetup(pThis); +#if 0 /*def DEBUG*/ + if (RT_SUCCESS(rc)) + { + rc = pgmR0DynMapTest(pVM); + if (RT_FAILURE(rc)) + pgmR0DynMapTearDown(pThis); + } +#endif + } + else if (pThis->cMaxLoad > PGMR0DYNMAP_CALC_OVERLOAD(pThis->cPages - pThis->cGuardPages)) + rc = pgmR0DynMapExpand(pThis); + if (RT_SUCCESS(rc)) + pVM->pgm.s.pvR0DynMapUsed = pThis; + else + pThis->cUsers--; + + RTSemFastMutexRelease(pThis->hInitLock); + return rc; +} + + +/** + * Terminates the dynamic mapping cache usage for a VM. + * + * @param pVM The cross context VM structure. + */ +VMMR0DECL(void) PGMR0DynMapTermVM(PVM pVM) +{ + /* + * Return immediately if we're not using the cache. + */ + if (!pVM->pgm.s.pvR0DynMapUsed) + return; + + PPGMRZDYNMAP pThis = g_pPGMR0DynMap; + AssertPtrReturnVoid(pThis); + + int rc = RTSemFastMutexRequest(pThis->hInitLock); + AssertLogRelRCReturnVoid(rc); + + if (pVM->pgm.s.pvR0DynMapUsed == pThis) + { + pVM->pgm.s.pvR0DynMapUsed = NULL; + +#ifdef VBOX_STRICT + PGMR0DynMapAssertIntegrity(); +#endif + + /* + * Clean up and check the auto sets. + */ + VMCPUID idCpu = pVM->cCpus; + while (idCpu-- > 0) + { + PPGMMAPSET pSet = &pVM->aCpus[idCpu].pgm.s.AutoSet; + uint32_t j = pSet->cEntries; + if (j <= RT_ELEMENTS(pSet->aEntries)) + { + /* + * The set is open, close it. + */ + while (j-- > 0) + { + int32_t cRefs = pSet->aEntries[j].cRefs; + uint32_t iPage = pSet->aEntries[j].iPage; + LogRel(("PGMR0DynMapTermVM: %d dangling refs to %#x\n", cRefs, iPage)); + if (iPage < pThis->cPages && cRefs > 0) + pgmRZDynMapReleasePage(pThis, iPage, cRefs); + else + AssertLogRelMsgFailed(("cRefs=%d iPage=%#x cPages=%u\n", cRefs, iPage, pThis->cPages)); + + PGMRZDYNMAP_ZAP_ENTRY(&pSet->aEntries[j]); + } + pSet->cEntries = PGMMAPSET_CLOSED; + pSet->iSubset = UINT32_MAX; + pSet->iCpu = -1; + } + else + AssertMsg(j == PGMMAPSET_CLOSED, ("cEntries=%#x\n", j)); + + j = RT_ELEMENTS(pSet->aEntries); + while (j-- > 0) + { + Assert(pSet->aEntries[j].iPage == UINT16_MAX); + Assert(!pSet->aEntries[j].cRefs); + } + } + + /* + * Release our reference to the mapping cache. + */ + Assert(pThis->cUsers > 0); + pThis->cUsers--; + if (!pThis->cUsers) + pgmR0DynMapTearDown(pThis); + } + else + AssertLogRelMsgFailed(("pvR0DynMapUsed=%p pThis=%p\n", pVM->pgm.s.pvR0DynMapUsed, pThis)); + + RTSemFastMutexRelease(pThis->hInitLock); +} + + +/** + * Shoots down the TLBs for all the cache pages, pgmR0DynMapTearDown helper. + * + * @param idCpu The current CPU. + * @param pvUser1 The dynamic mapping cache instance. + * @param pvUser2 Unused, NULL. + */ +static DECLCALLBACK(void) pgmR0DynMapShootDownTlbs(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + Assert(!pvUser2); + PPGMRZDYNMAP pThis = (PPGMRZDYNMAP)pvUser1; + Assert(pThis == g_pPGMR0DynMap); + PPGMRZDYNMAPENTRY paPages = pThis->paPages; + uint32_t iPage = pThis->cPages; + while (iPage-- > 0) + ASMInvalidatePage((uintptr_t)paPages[iPage].pvPage); +} + + +/** + * Shoot down the TLBs for every single cache entry on all CPUs. + * + * @returns IPRT status code (RTMpOnAll). 
+ * @param pThis The dynamic mapping cache instance. + */ +static int pgmR0DynMapTlbShootDown(PPGMRZDYNMAP pThis) +{ + int rc = RTMpOnAll(pgmR0DynMapShootDownTlbs, pThis, NULL); + AssertRC(rc); + if (RT_FAILURE(rc)) + { + uint32_t iPage = pThis->cPages; + while (iPage-- > 0) + ASMInvalidatePage((uintptr_t)pThis->paPages[iPage].pvPage); + } + return rc; +} + + +/** + * Calculate the new cache size based on cMaxLoad statistics. + * + * @returns Number of pages. + * @param pThis The dynamic mapping cache instance. + * @param pcMinPages The minimal size in pages. + */ +static uint32_t pgmR0DynMapCalcNewSize(PPGMRZDYNMAP pThis, uint32_t *pcMinPages) +{ + Assert(pThis->cPages <= PGMR0DYNMAP_MAX_PAGES); + + /* cCpus * PGMR0DYNMAP_PAGES_PER_CPU(_MIN). */ + RTCPUID cCpus = RTMpGetCount(); + AssertReturn(cCpus > 0 && cCpus <= RTCPUSET_MAX_CPUS, 0); + uint32_t cPages = cCpus * PGMR0DYNMAP_PAGES_PER_CPU; + uint32_t cMinPages = cCpus * PGMR0DYNMAP_PAGES_PER_CPU_MIN; + + /* adjust against cMaxLoad. */ + AssertMsg(pThis->cMaxLoad <= PGMR0DYNMAP_MAX_PAGES, ("%#x\n", pThis->cMaxLoad)); + if (pThis->cMaxLoad > PGMR0DYNMAP_MAX_PAGES) + pThis->cMaxLoad = 0; + + while (pThis->cMaxLoad > PGMR0DYNMAP_CALC_OVERLOAD(cPages)) + cPages += PGMR0DYNMAP_PAGES_PER_CPU; + + if (pThis->cMaxLoad > cMinPages) + cMinPages = pThis->cMaxLoad; + + /* adjust against max and current size. */ + if (cPages < pThis->cPages) + cPages = pThis->cPages; + cPages *= PGMR0DYNMAP_GUARD_PAGES + 1; + if (cPages > PGMR0DYNMAP_MAX_PAGES) + cPages = PGMR0DYNMAP_MAX_PAGES; + + if (cMinPages < pThis->cPages) + cMinPages = pThis->cPages; + cMinPages *= PGMR0DYNMAP_GUARD_PAGES + 1; + if (cMinPages > PGMR0DYNMAP_MAX_PAGES) + cMinPages = PGMR0DYNMAP_MAX_PAGES; + + Assert(cMinPages); + *pcMinPages = cMinPages; + return cPages; +} + + +/** + * Initializes the paging level data. + * + * @param pThis The dynamic mapping cache instance. + * @param pPgLvl The paging level data. + */ +void pgmR0DynMapPagingArrayInit(PPGMRZDYNMAP pThis, PPGMR0DYNMAPPGLVL pPgLvl) +{ + RTCCUINTREG cr4 = ASMGetCR4(); + switch (pThis->enmPgMode) + { + case SUPPAGINGMODE_32_BIT: + case SUPPAGINGMODE_32_BIT_GLOBAL: + pPgLvl->cLevels = 2; + pPgLvl->a[0].fPhysMask = X86_CR3_PAGE_MASK; + pPgLvl->a[0].fAndMask = X86_PDE_P | X86_PDE_RW | (cr4 & X86_CR4_PSE ? X86_PDE_PS : 0); + pPgLvl->a[0].fResMask = X86_PDE_P | X86_PDE_RW; + pPgLvl->a[0].fPtrMask = X86_PD_MASK; + pPgLvl->a[0].fPtrShift = X86_PD_SHIFT; + + pPgLvl->a[1].fPhysMask = X86_PDE_PG_MASK; + pPgLvl->a[1].fAndMask = X86_PTE_P | X86_PTE_RW; + pPgLvl->a[1].fResMask = X86_PTE_P | X86_PTE_RW; + pPgLvl->a[1].fPtrMask = X86_PT_MASK; + pPgLvl->a[1].fPtrShift = X86_PT_SHIFT; + break; + + case SUPPAGINGMODE_PAE: + case SUPPAGINGMODE_PAE_GLOBAL: + case SUPPAGINGMODE_PAE_NX: + case SUPPAGINGMODE_PAE_GLOBAL_NX: + pPgLvl->cLevels = 3; + pPgLvl->a[0].fPhysMask = X86_CR3_PAE_PAGE_MASK; + pPgLvl->a[0].fPtrMask = X86_PDPT_MASK_PAE; + pPgLvl->a[0].fPtrShift = X86_PDPT_SHIFT; + pPgLvl->a[0].fAndMask = X86_PDPE_P; + pPgLvl->a[0].fResMask = X86_PDPE_P; + + pPgLvl->a[1].fPhysMask = X86_PDPE_PG_MASK; + pPgLvl->a[1].fPtrMask = X86_PD_PAE_MASK; + pPgLvl->a[1].fPtrShift = X86_PD_PAE_SHIFT; + pPgLvl->a[1].fAndMask = X86_PDE_P | X86_PDE_RW | (cr4 & X86_CR4_PSE ? 
X86_PDE_PS : 0); + pPgLvl->a[1].fResMask = X86_PDE_P | X86_PDE_RW; + + pPgLvl->a[2].fPhysMask = X86_PDE_PAE_PG_MASK; + pPgLvl->a[2].fPtrMask = X86_PT_PAE_MASK; + pPgLvl->a[2].fPtrShift = X86_PT_PAE_SHIFT; + pPgLvl->a[2].fAndMask = X86_PTE_P | X86_PTE_RW; + pPgLvl->a[2].fResMask = X86_PTE_P | X86_PTE_RW; + break; + + case SUPPAGINGMODE_AMD64: + case SUPPAGINGMODE_AMD64_GLOBAL: + case SUPPAGINGMODE_AMD64_NX: + case SUPPAGINGMODE_AMD64_GLOBAL_NX: + pPgLvl->cLevels = 4; + pPgLvl->a[0].fPhysMask = X86_CR3_AMD64_PAGE_MASK; + pPgLvl->a[0].fPtrShift = X86_PML4_SHIFT; + pPgLvl->a[0].fPtrMask = X86_PML4_MASK; + pPgLvl->a[0].fAndMask = X86_PML4E_P | X86_PML4E_RW; + pPgLvl->a[0].fResMask = X86_PML4E_P | X86_PML4E_RW; + + pPgLvl->a[1].fPhysMask = X86_PML4E_PG_MASK; + pPgLvl->a[1].fPtrShift = X86_PDPT_SHIFT; + pPgLvl->a[1].fPtrMask = X86_PDPT_MASK_AMD64; + pPgLvl->a[1].fAndMask = X86_PDPE_P | X86_PDPE_RW /** @todo check for X86_PDPT_PS support. */; + pPgLvl->a[1].fResMask = X86_PDPE_P | X86_PDPE_RW; + + pPgLvl->a[2].fPhysMask = X86_PDPE_PG_MASK; + pPgLvl->a[2].fPtrShift = X86_PD_PAE_SHIFT; + pPgLvl->a[2].fPtrMask = X86_PD_PAE_MASK; + pPgLvl->a[2].fAndMask = X86_PDE_P | X86_PDE_RW | (cr4 & X86_CR4_PSE ? X86_PDE_PS : 0); + pPgLvl->a[2].fResMask = X86_PDE_P | X86_PDE_RW; + + pPgLvl->a[3].fPhysMask = X86_PDE_PAE_PG_MASK; + pPgLvl->a[3].fPtrShift = X86_PT_PAE_SHIFT; + pPgLvl->a[3].fPtrMask = X86_PT_PAE_MASK; + pPgLvl->a[3].fAndMask = X86_PTE_P | X86_PTE_RW; + pPgLvl->a[3].fResMask = X86_PTE_P | X86_PTE_RW; + break; + + default: + AssertFailed(); + pPgLvl->cLevels = 0; + break; + } + + for (uint32_t i = 0; i < 4; i++) /* ASSUMING array size. */ + { + pPgLvl->a[i].HCPhys = NIL_RTHCPHYS; + pPgLvl->a[i].hMapObj = NIL_RTR0MEMOBJ; + pPgLvl->a[i].hMemObj = NIL_RTR0MEMOBJ; + pPgLvl->a[i].u.pv = NULL; + } +} + + +/** + * Maps a PTE. + * + * This will update the segment structure when new PTs are mapped. + * + * It also assumes that we (for paranoid reasons) wish to establish a mapping + * chain from CR3 to the PT that all corresponds to the processor we're + * currently running on, and go about this by running with interrupts disabled + * and restarting from CR3 for every change. + * + * @returns VBox status code, VINF_TRY_AGAIN if we changed any mappings and had + * to re-enable interrupts. + * @param pThis The dynamic mapping cache instance. + * @param pPgLvl The paging level structure. + * @param pvPage The page. + * @param pSeg The segment. + * @param cMaxPTs The max number of PTs expected in the segment. + * @param ppvPTE Where to store the PTE address. + */ +static int pgmR0DynMapPagingArrayMapPte(PPGMRZDYNMAP pThis, PPGMR0DYNMAPPGLVL pPgLvl, void *pvPage, + PPGMR0DYNMAPSEG pSeg, uint32_t cMaxPTs, void **ppvPTE) +{ + Assert(!(ASMGetFlags() & X86_EFL_IF)); + void *pvEntry = NULL; + X86PGPAEUINT uEntry = ASMGetCR3(); + for (uint32_t i = 0; i < pPgLvl->cLevels; i++) + { + RTHCPHYS HCPhys = uEntry & pPgLvl->a[i].fPhysMask; + if (pPgLvl->a[i].HCPhys != HCPhys) + { + /* + * Need to remap this level. + * The final level, the PT, will not be freed since that is what it's all about. 
+ */ + ASMIntEnable(); + if (i + 1 == pPgLvl->cLevels) + AssertReturn(pSeg->cPTs < cMaxPTs, VERR_PGM_DYNMAP_IPE); + else + { + int rc2 = RTR0MemObjFree(pPgLvl->a[i].hMemObj, true /* fFreeMappings */); AssertRC(rc2); + pPgLvl->a[i].hMemObj = pPgLvl->a[i].hMapObj = NIL_RTR0MEMOBJ; + } + + int rc = RTR0MemObjEnterPhys(&pPgLvl->a[i].hMemObj, HCPhys, PAGE_SIZE, RTMEM_CACHE_POLICY_DONT_CARE); + if (RT_SUCCESS(rc)) + { + rc = RTR0MemObjMapKernel(&pPgLvl->a[i].hMapObj, pPgLvl->a[i].hMemObj, + (void *)-1 /* pvFixed */, 0 /* cbAlignment */, + RTMEM_PROT_WRITE | RTMEM_PROT_READ); + if (RT_SUCCESS(rc)) + { + pPgLvl->a[i].u.pv = RTR0MemObjAddress(pPgLvl->a[i].hMapObj); + AssertMsg(((uintptr_t)pPgLvl->a[i].u.pv & ~(uintptr_t)PAGE_OFFSET_MASK), ("%p\n", pPgLvl->a[i].u.pv)); + pPgLvl->a[i].HCPhys = HCPhys; + if (i + 1 == pPgLvl->cLevels) + pSeg->ahMemObjPTs[pSeg->cPTs++] = pPgLvl->a[i].hMemObj; + ASMIntDisable(); + return VINF_TRY_AGAIN; + } + + pPgLvl->a[i].hMapObj = NIL_RTR0MEMOBJ; + } + else + pPgLvl->a[i].hMemObj = NIL_RTR0MEMOBJ; + pPgLvl->a[i].HCPhys = NIL_RTHCPHYS; + return rc; + } + + /* + * The next level. + */ + uint32_t iEntry = ((uint64_t)(uintptr_t)pvPage >> pPgLvl->a[i].fPtrShift) & pPgLvl->a[i].fPtrMask; + if (pThis->fLegacyMode) + { + pvEntry = &pPgLvl->a[i].u.paLegacy[iEntry]; + uEntry = pPgLvl->a[i].u.paLegacy[iEntry]; + } + else + { + pvEntry = &pPgLvl->a[i].u.paPae[iEntry]; + uEntry = pPgLvl->a[i].u.paPae[iEntry]; + } + + if ((uEntry & pPgLvl->a[i].fAndMask) != pPgLvl->a[i].fResMask) + { + LogRel(("PGMR0DynMap: internal error - iPgLvl=%u cLevels=%u uEntry=%#llx fAnd=%#llx fRes=%#llx got=%#llx\n" + "PGMR0DynMap: pv=%p pvPage=%p iEntry=%#x fLegacyMode=%RTbool\n", + i, pPgLvl->cLevels, uEntry, pPgLvl->a[i].fAndMask, pPgLvl->a[i].fResMask, uEntry & pPgLvl->a[i].fAndMask, + pPgLvl->a[i].u.pv, pvPage, iEntry, pThis->fLegacyMode)); + return VERR_PGM_DYNMAP_IPE; + } + /*Log(("#%d: iEntry=%4d uEntry=%#llx pvEntry=%p HCPhys=%RHp \n", i, iEntry, uEntry, pvEntry, pPgLvl->a[i].HCPhys));*/ + } + + /* made it thru without needing to remap anything. */ + *ppvPTE = pvEntry; + return VINF_SUCCESS; +} + + +/** + * Sets up a guard page. + * + * @param pThis The dynamic mapping cache instance. + * @param pPage The page. + */ +DECLINLINE(void) pgmR0DynMapSetupGuardPage(PPGMRZDYNMAP pThis, PPGMRZDYNMAPENTRY pPage) +{ + memset(pPage->pvPage, 0xfd, PAGE_SIZE); + pPage->cRefs = PGMR0DYNMAP_GUARD_PAGE_REF_COUNT; + pPage->HCPhys = PGMR0DYNMAP_GUARD_PAGE_HCPHYS; +#ifdef PGMR0DYNMAP_GUARD_NP + ASMAtomicBitClear(pPage->uPte.pv, X86_PTE_BIT_P); +#else + if (pThis->fLegacyMode) + ASMAtomicWriteU32(&pPage->uPte.pLegacy->u, PGMR0DYNMAP_GUARD_PAGE_LEGACY_PTE); + else + ASMAtomicWriteU64(&pPage->uPte.pPae->u, PGMR0DYNMAP_GUARD_PAGE_PAE_PTE); +#endif + pThis->cGuardPages++; +} + + +/** + * Adds a new segment of the specified size. + * + * @returns VBox status code. + * @param pThis The dynamic mapping cache instance. + * @param cPages The size of the new segment, give as a page count. + */ +static int pgmR0DynMapAddSeg(PPGMRZDYNMAP pThis, uint32_t cPages) +{ + int rc2; + AssertReturn(ASMGetFlags() & X86_EFL_IF, VERR_PREEMPT_DISABLED); + + /* + * Do the array reallocations first. + * (The pages array has to be replaced behind the spinlock of course.) + */ + void *pvSavedPTEs = RTMemRealloc(pThis->pvSavedPTEs, (pThis->fLegacyMode ? 
sizeof(X86PGUINT) : sizeof(X86PGPAEUINT)) * (pThis->cPages + cPages)); + if (!pvSavedPTEs) + return VERR_NO_MEMORY; + pThis->pvSavedPTEs = pvSavedPTEs; + + void *pvPages = RTMemAllocZ(sizeof(pThis->paPages[0]) * (pThis->cPages + cPages)); + if (!pvPages) + { + pvSavedPTEs = RTMemRealloc(pThis->pvSavedPTEs, (pThis->fLegacyMode ? sizeof(X86PGUINT) : sizeof(X86PGPAEUINT)) * pThis->cPages); + if (pvSavedPTEs) + pThis->pvSavedPTEs = pvSavedPTEs; + return VERR_NO_MEMORY; + } + + PGMRZDYNMAP_SPINLOCK_ACQUIRE(pThis); + + memcpy(pvPages, pThis->paPages, sizeof(pThis->paPages[0]) * pThis->cPages); + void *pvToFree = pThis->paPages; + pThis->paPages = (PPGMRZDYNMAPENTRY)pvPages; + + PGMRZDYNMAP_SPINLOCK_RELEASE(pThis); + RTMemFree(pvToFree); + + /* + * Allocate the segment structure and pages of memory, then touch all the pages (paranoia). + */ + uint32_t cMaxPTs = cPages / (pThis->fLegacyMode ? X86_PG_ENTRIES : X86_PG_PAE_ENTRIES) + 2; + PPGMR0DYNMAPSEG pSeg = (PPGMR0DYNMAPSEG)RTMemAllocZ(RT_UOFFSETOF_DYN(PGMR0DYNMAPSEG, ahMemObjPTs[cMaxPTs])); + if (!pSeg) + return VERR_NO_MEMORY; + pSeg->pNext = NULL; + pSeg->cPages = cPages; + pSeg->iPage = pThis->cPages; + pSeg->cPTs = 0; + int rc = RTR0MemObjAllocPage(&pSeg->hMemObj, cPages << PAGE_SHIFT, false); + if (RT_SUCCESS(rc)) + { + uint8_t *pbPage = (uint8_t *)RTR0MemObjAddress(pSeg->hMemObj); + AssertMsg(VALID_PTR(pbPage) && !((uintptr_t)pbPage & PAGE_OFFSET_MASK), ("%p\n", pbPage)); + memset(pbPage, 0xfe, cPages << PAGE_SHIFT); + + /* + * Walk thru the pages and set them up with a mapping of their PTE and everything. + */ + ASMIntDisable(); + PGMR0DYNMAPPGLVL PgLvl; + pgmR0DynMapPagingArrayInit(pThis, &PgLvl); + uint32_t const iEndPage = pSeg->iPage + cPages; + for (uint32_t iPage = pSeg->iPage; + iPage < iEndPage; + iPage++, pbPage += PAGE_SIZE) + { + /* Initialize the page data. */ + pThis->paPages[iPage].HCPhys = NIL_RTHCPHYS; + pThis->paPages[iPage].pvPage = pbPage; + pThis->paPages[iPage].cRefs = 0; + pThis->paPages[iPage].uPte.pPae = 0; +#ifndef IN_RC + RTCpuSetFill(&pThis->paPages[iPage].PendingSet); +#endif + + /* Map its page table, retry until we've got a clean run (paranoia). */ + do + rc = pgmR0DynMapPagingArrayMapPte(pThis, &PgLvl, pbPage, pSeg, cMaxPTs, + &pThis->paPages[iPage].uPte.pv); + while (rc == VINF_TRY_AGAIN); + if (RT_FAILURE(rc)) + break; + + /* Save the PTE. */ + if (pThis->fLegacyMode) + ((PX86PGUINT)pThis->pvSavedPTEs)[iPage] = pThis->paPages[iPage].uPte.pLegacy->u; + else + ((PX86PGPAEUINT)pThis->pvSavedPTEs)[iPage] = pThis->paPages[iPage].uPte.pPae->u; + +#ifdef VBOX_STRICT + /* Check that we've got the right entry. */ + RTHCPHYS HCPhysPage = RTR0MemObjGetPagePhysAddr(pSeg->hMemObj, iPage - pSeg->iPage); + RTHCPHYS HCPhysPte = pThis->fLegacyMode + ? pThis->paPages[iPage].uPte.pLegacy->u & X86_PTE_PG_MASK + : pThis->paPages[iPage].uPte.pPae->u & X86_PTE_PAE_PG_MASK; + if (HCPhysPage != HCPhysPte) + { + LogRel(("pgmR0DynMapAddSeg: internal error - page #%u HCPhysPage=%RHp HCPhysPte=%RHp pbPage=%p pvPte=%p\n", + iPage - pSeg->iPage, HCPhysPage, HCPhysPte, pbPage, pThis->paPages[iPage].uPte.pv)); + rc = VERR_PGM_DYNMAP_IPE; + break; + } +#endif + } /* for each page */ + ASMIntEnable(); + + /* cleanup non-PT mappings */ + for (uint32_t i = 0; i < PgLvl.cLevels - 1; i++) + RTR0MemObjFree(PgLvl.a[i].hMemObj, true /* fFreeMappings */); + + if (RT_SUCCESS(rc)) + { +#if PGMR0DYNMAP_GUARD_PAGES > 0 + /* + * Setup guard pages. + * (Note: TLBs will be shot down later on.) 
+ */ + uint32_t iPage = pSeg->iPage; + while (iPage < iEndPage) + { + for (uint32_t iGPg = 0; iGPg < PGMR0DYNMAP_GUARD_PAGES && iPage < iEndPage; iGPg++, iPage++) + pgmR0DynMapSetupGuardPage(pThis, &pThis->paPages[iPage]); + iPage++; /* the guarded page */ + } + + /* Make sure the very last page is a guard page too. */ + iPage = iEndPage - 1; + if (pThis->paPages[iPage].cRefs != PGMR0DYNMAP_GUARD_PAGE_REF_COUNT) + pgmR0DynMapSetupGuardPage(pThis, &pThis->paPages[iPage]); +#endif /* PGMR0DYNMAP_GUARD_PAGES > 0 */ + + /* + * Commit it by adding the segment to the list and updating the page count. + */ + pSeg->pNext = pThis->pSegHead; + pThis->pSegHead = pSeg; + pThis->cPages += cPages; + return VINF_SUCCESS; + } + + /* + * Bail out. + */ + while (pSeg->cPTs-- > 0) + { + rc2 = RTR0MemObjFree(pSeg->ahMemObjPTs[pSeg->cPTs], true /* fFreeMappings */); + AssertRC(rc2); + pSeg->ahMemObjPTs[pSeg->cPTs] = NIL_RTR0MEMOBJ; + } + + rc2 = RTR0MemObjFree(pSeg->hMemObj, true /* fFreeMappings */); + AssertRC(rc2); + pSeg->hMemObj = NIL_RTR0MEMOBJ; + } + else if (rc == VERR_NO_PAGE_MEMORY || rc == VERR_NO_PHYS_MEMORY) + rc = VERR_NO_MEMORY; + RTMemFree(pSeg); + + /* Don't bother resizing the arrays, but free them if we're the only user. */ + if (!pThis->cPages) + { + RTMemFree(pThis->paPages); + pThis->paPages = NULL; + RTMemFree(pThis->pvSavedPTEs); + pThis->pvSavedPTEs = NULL; + } + return rc; +} + + +/** + * Called by PGMR0DynMapInitVM under the init lock. + * + * @returns VBox status code. + * @param pThis The dynamic mapping cache instance. + */ +static int pgmR0DynMapSetup(PPGMRZDYNMAP pThis) +{ + /* + * Calc the size and add a segment of that size. + */ + uint32_t cMinPages; + uint32_t cPages = pgmR0DynMapCalcNewSize(pThis, &cMinPages); + AssertReturn(cPages, VERR_PGM_DYNMAP_IPE); + int rc = pgmR0DynMapAddSeg(pThis, cPages); + if (rc == VERR_NO_MEMORY) + { + /* + * Try adding smaller segments. + */ + do + rc = pgmR0DynMapAddSeg(pThis, PGMR0DYNMAP_SMALL_SEG_PAGES); + while (RT_SUCCESS(rc) && pThis->cPages < cPages); + if (rc == VERR_NO_MEMORY && pThis->cPages >= cMinPages) + rc = VINF_SUCCESS; + if (rc == VERR_NO_MEMORY) + { + if (pThis->cPages) + pgmR0DynMapTearDown(pThis); + rc = VERR_PGM_DYNMAP_SETUP_ERROR; + } + } + Assert(ASMGetFlags() & X86_EFL_IF); + +#if PGMR0DYNMAP_GUARD_PAGES > 0 + /* paranoia */ + if (RT_SUCCESS(rc)) + pgmR0DynMapTlbShootDown(pThis); +#endif + return rc; +} + + +/** + * Called by PGMR0DynMapInitVM under the init lock. + * + * @returns VBox status code. + * @param pThis The dynamic mapping cache instance. + */ +static int pgmR0DynMapExpand(PPGMRZDYNMAP pThis) +{ + /* + * Calc the new target size and add a segment of the appropriate size. + */ + uint32_t cMinPages; + uint32_t cPages = pgmR0DynMapCalcNewSize(pThis, &cMinPages); + AssertReturn(cPages, VERR_PGM_DYNMAP_IPE); + if (pThis->cPages >= cPages) + return VINF_SUCCESS; + + uint32_t cAdd = cPages - pThis->cPages; + int rc = pgmR0DynMapAddSeg(pThis, cAdd); + if (rc == VERR_NO_MEMORY) + { + /* + * Try adding smaller segments. + */ + do + rc = pgmR0DynMapAddSeg(pThis, PGMR0DYNMAP_SMALL_SEG_PAGES); + while (RT_SUCCESS(rc) && pThis->cPages < cPages); + if (rc == VERR_NO_MEMORY && pThis->cPages >= cMinPages) + rc = VINF_SUCCESS; + if (rc == VERR_NO_MEMORY) + rc = VERR_PGM_DYNMAP_EXPAND_ERROR; + } + Assert(ASMGetFlags() & X86_EFL_IF); + +#if PGMR0DYNMAP_GUARD_PAGES > 0 + /* paranoia */ + if (RT_SUCCESS(rc)) + pgmR0DynMapTlbShootDown(pThis); +#endif + return rc; +} + + +/** + * Called by PGMR0DynMapTermVM under the init lock. 
+ * + * @returns VBox status code. + * @param pThis The dynamic mapping cache instance. + */ +static void pgmR0DynMapTearDown(PPGMRZDYNMAP pThis) +{ + /* + * Restore the original page table entries + */ + PPGMRZDYNMAPENTRY paPages = pThis->paPages; + uint32_t iPage = pThis->cPages; + if (pThis->fLegacyMode) + { + X86PGUINT const *paSavedPTEs = (X86PGUINT const *)pThis->pvSavedPTEs; + while (iPage-- > 0) + { + X86PGUINT uOld = paPages[iPage].uPte.pLegacy->u; + X86PGUINT uOld2 = uOld; NOREF(uOld2); + X86PGUINT uNew = paSavedPTEs[iPage]; + while (!ASMAtomicCmpXchgExU32(&paPages[iPage].uPte.pLegacy->u, uNew, uOld, &uOld)) + AssertMsgFailed(("uOld=%#x uOld2=%#x uNew=%#x\n", uOld, uOld2, uNew)); + Assert(paPages[iPage].uPte.pLegacy->u == paSavedPTEs[iPage]); + } + } + else + { + X86PGPAEUINT const *paSavedPTEs = (X86PGPAEUINT const *)pThis->pvSavedPTEs; + while (iPage-- > 0) + { + X86PGPAEUINT uOld = paPages[iPage].uPte.pPae->u; + X86PGPAEUINT uOld2 = uOld; NOREF(uOld2); + X86PGPAEUINT uNew = paSavedPTEs[iPage]; + while (!ASMAtomicCmpXchgExU64(&paPages[iPage].uPte.pPae->u, uNew, uOld, &uOld)) + AssertMsgFailed(("uOld=%#llx uOld2=%#llx uNew=%#llx\n", uOld, uOld2, uNew)); + Assert(paPages[iPage].uPte.pPae->u == paSavedPTEs[iPage]); + } + } + + /* + * Shoot down the TLBs on all CPUs before freeing them. + */ + pgmR0DynMapTlbShootDown(pThis); + + /* + * Free the segments. + */ + while (pThis->pSegHead) + { + int rc; + PPGMR0DYNMAPSEG pSeg = pThis->pSegHead; + pThis->pSegHead = pSeg->pNext; + + uint32_t iPT = pSeg->cPTs; + while (iPT-- > 0) + { + rc = RTR0MemObjFree(pSeg->ahMemObjPTs[iPT], true /* fFreeMappings */); AssertRC(rc); + pSeg->ahMemObjPTs[iPT] = NIL_RTR0MEMOBJ; + } + rc = RTR0MemObjFree(pSeg->hMemObj, true /* fFreeMappings */); AssertRC(rc); + pSeg->hMemObj = NIL_RTR0MEMOBJ; + pSeg->pNext = NULL; + pSeg->iPage = UINT16_MAX; + pSeg->cPages = 0; + pSeg->cPTs = 0; + RTMemFree(pSeg); + } + + /* + * Free the arrays and restore the initial state. + * The cLoadMax value is left behind for the next setup. + */ + RTMemFree(pThis->paPages); + pThis->paPages = NULL; + RTMemFree(pThis->pvSavedPTEs); + pThis->pvSavedPTEs = NULL; + pThis->cPages = 0; + pThis->cLoad = 0; + pThis->cGuardPages = 0; +} + +#endif /* IN_RING0 */ +#ifdef IN_RC + +/** + * Initializes the dynamic mapping cache in raw-mode context. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMRCDECL(int) PGMRCDynMapInit(PVM pVM) +{ + /* + * Allocate and initialize the instance data and page array. + */ + PPGMRZDYNMAP pThis; + size_t const cPages = MM_HYPER_DYNAMIC_SIZE / PAGE_SIZE; + size_t const cb = RT_ALIGN_Z(sizeof(*pThis), 32) + + sizeof(PGMRZDYNMAPENTRY) * cPages; + int rc = MMHyperAlloc(pVM, cb, 32, MM_TAG_PGM, (void **)&pThis); + if (RT_FAILURE(rc)) + return rc; + + pThis->u32Magic = PGMRZDYNMAP_MAGIC; + pThis->paPages = RT_ALIGN_PT(pThis + 1, 32, PPGMRZDYNMAPENTRY); + pThis->cPages = cPages; + pThis->cLoad = 0; + pThis->cMaxLoad = 0; + pThis->cGuardPages = 0; + pThis->cUsers = 1; + + for (size_t iPage = 0; iPage < cPages; iPage++) + { + pThis->paPages[iPage].HCPhys = NIL_RTHCPHYS; + pThis->paPages[iPage].pvPage = pVM->pgm.s.pbDynPageMapBaseGC + iPage * PAGE_SIZE; + pThis->paPages[iPage].cRefs = 0; + pThis->paPages[iPage].uPte.pLegacy = &pVM->pgm.s.paDynPageMap32BitPTEsGC[iPage]; + pThis->paPages[iPage].uPte.pPae = (PX86PTEPAE)&pVM->pgm.s.paDynPageMapPaePTEsGC[iPage]; + } + + pVM->pgm.s.pRCDynMap = pThis; + + /* + * Initialize the autosets the VM. 
+ */ + rc = pgmRZDynMapInitAutoSetsForVM(pVM); + if (RT_FAILURE(rc)) + return rc; + + return VINF_SUCCESS; +} + +#endif /* IN_RC */ + +/** + * Release references to a page, caller owns the spin lock. + * + * @param pThis The dynamic mapping cache instance. + * @param iPage The page. + * @param cRefs The number of references to release. + */ +DECLINLINE(void) pgmRZDynMapReleasePageLocked(PPGMRZDYNMAP pThis, uint32_t iPage, int32_t cRefs) +{ + cRefs = ASMAtomicSubS32(&pThis->paPages[iPage].cRefs, cRefs) - cRefs; + AssertMsg(cRefs >= 0, ("%d\n", cRefs)); + if (!cRefs) + { + pThis->cLoad--; +#ifdef PGMRZDYNMAP_STRICT_RELEASE + pThis->paPages[iPage].HCPhys = NIL_RTHCPHYS; + ASMAtomicBitClear(pThis->paPages[iPage].uPte.pv, X86_PTE_BIT_P); + ASMInvalidatePage((uintptr_t)pThis->paPages[iPage].pvPage); +#endif + } +} + + +/** + * Release references to a page, caller does not own the spin lock. + * + * @param pThis The dynamic mapping cache instance. + * @param iPage The page. + * @param cRefs The number of references to release. + */ +static void pgmRZDynMapReleasePage(PPGMRZDYNMAP pThis, uint32_t iPage, uint32_t cRefs) +{ + PGMRZDYNMAP_SPINLOCK_ACQUIRE(pThis); + pgmRZDynMapReleasePageLocked(pThis, iPage, cRefs); + PGMRZDYNMAP_SPINLOCK_RELEASE(pThis); +} + + +/** + * pgmR0DynMapPage worker that deals with the tedious bits. + * + * @returns The page index on success, UINT32_MAX on failure. + * @param pThis The dynamic mapping cache instance. + * @param HCPhys The address of the page to be mapped. + * @param iPage The page index pgmR0DynMapPage hashed HCPhys to. + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + * For statistics. + * @param pfNew Set to @c true if a new entry was made and @c false if + * an old entry was found and reused. + */ +static uint32_t pgmR0DynMapPageSlow(PPGMRZDYNMAP pThis, RTHCPHYS HCPhys, uint32_t iPage, PVMCPU pVCpu, bool *pfNew) +{ + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapPageSlow); RT_NOREF_PV(pVCpu); + + /* + * Check if any of the first 3 pages are unreferenced since the caller + * already has made sure they aren't matching. + */ +#ifdef VBOX_WITH_STATISTICS + bool fLooped = false; +#endif + uint32_t const cPages = pThis->cPages; + PPGMRZDYNMAPENTRY paPages = pThis->paPages; + uint32_t iFreePage; + if (!paPages[iPage].cRefs) + iFreePage = iPage; + else if (!paPages[(iPage + 1) % cPages].cRefs) + iFreePage = (iPage + 1) % cPages; + else if (!paPages[(iPage + 2) % cPages].cRefs) + iFreePage = (iPage + 2) % cPages; + else + { + /* + * Search for an unused or matching entry. + */ + iFreePage = (iPage + 3) % cPages; + for (;;) + { + if (paPages[iFreePage].HCPhys == HCPhys) + { + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapPageSlowLoopHits); + *pfNew = false; + return iFreePage; + } + if (!paPages[iFreePage].cRefs) + break; + + /* advance */ + iFreePage = (iFreePage + 1) % cPages; + if (RT_UNLIKELY(iFreePage == iPage)) + return UINT32_MAX; + } + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapPageSlowLoopMisses); +#ifdef VBOX_WITH_STATISTICS + fLooped = true; +#endif + } + Assert(iFreePage < cPages); + +#if 0 //def VBOX_WITH_STATISTICS + /* Check for lost hits. */ + if (!fLooped) + for (uint32_t iPage2 = (iPage + 3) % cPages; iPage2 != iPage; iPage2 = (iPage2 + 1) % cPages) + if (paPages[iPage2].HCPhys == HCPhys) + STAM_COUNTER_INC(&pVCpu->pgm.s.StatRZDynMapPageSlowLostHits); +#endif + + /* + * Setup the new entry. 
+ */ + *pfNew = true; + /*Log6(("pgmR0DynMapPageSlow: old - %RHp %#x %#llx\n", paPages[iFreePage].HCPhys, paPages[iFreePage].cRefs, paPages[iFreePage].uPte.pPae->u));*/ + paPages[iFreePage].HCPhys = HCPhys; +#ifndef IN_RC + RTCpuSetFill(&paPages[iFreePage].PendingSet); + + if (pThis->fLegacyMode) +#endif + { + X86PGUINT uOld = paPages[iFreePage].uPte.pLegacy->u; + X86PGUINT uOld2 = uOld; NOREF(uOld2); + X86PGUINT uNew = (uOld & (X86_PTE_G | X86_PTE_PAT | X86_PTE_PCD | X86_PTE_PWT)) + | X86_PTE_P | X86_PTE_RW | X86_PTE_A | X86_PTE_D + | (HCPhys & X86_PTE_PG_MASK); + while (!ASMAtomicCmpXchgExU32(&paPages[iFreePage].uPte.pLegacy->u, uNew, uOld, &uOld)) + AssertMsgFailed(("uOld=%#x uOld2=%#x uNew=%#x\n", uOld, uOld2, uNew)); + Assert(paPages[iFreePage].uPte.pLegacy->u == uNew); + } +#ifndef IN_RC + else +#endif + { + X86PGPAEUINT uOld = paPages[iFreePage].uPte.pPae->u; + X86PGPAEUINT uOld2 = uOld; NOREF(uOld2); + X86PGPAEUINT uNew = (uOld & (X86_PTE_G | X86_PTE_PAT | X86_PTE_PCD | X86_PTE_PWT)) + | X86_PTE_P | X86_PTE_RW | X86_PTE_A | X86_PTE_D + | (HCPhys & X86_PTE_PAE_PG_MASK); + while (!ASMAtomicCmpXchgExU64(&paPages[iFreePage].uPte.pPae->u, uNew, uOld, &uOld)) + AssertMsgFailed(("uOld=%#llx uOld2=%#llx uNew=%#llx\n", uOld, uOld2, uNew)); + Assert(paPages[iFreePage].uPte.pPae->u == uNew); + /*Log6(("pgmR0DynMapPageSlow: #%x - %RHp %p %#llx\n", iFreePage, HCPhys, paPages[iFreePage].pvPage, uNew));*/ + } + return iFreePage; +} + + +/** + * Maps a page into the pool. + * + * @returns Page index on success, UINT32_MAX on failure. + * @param pThis The dynamic mapping cache instance. + * @param HCPhys The address of the page to be mapped. + * @param iRealCpu The real cpu set index. (optimization) + * @param pVCpu The cross context virtual CPU structure of the calling + * EMT. For statistics. + * @param ppvPage Where to the page address. + */ +DECLINLINE(uint32_t) pgmR0DynMapPage(PPGMRZDYNMAP pThis, RTHCPHYS HCPhys, int32_t iRealCpu, PVMCPU pVCpu, void **ppvPage) +{ + PGMRZDYNMAP_SPINLOCK_ACQUIRE(pThis); + AssertMsg(!(HCPhys & PAGE_OFFSET_MASK), ("HCPhys=%RHp\n", HCPhys)); + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapPage); + + /* + * Find an entry, if possible a matching one. The HCPhys address is hashed + * down to a page index, collisions are handled by linear searching. + * Optimized for a hit in the first 3 pages. + * + * Field easy hits here and defer the tedious searching and inserting + * to pgmR0DynMapPageSlow(). + */ + bool fNew = false; + uint32_t const cPages = pThis->cPages; + uint32_t iPage = (HCPhys >> PAGE_SHIFT) % cPages; + PPGMRZDYNMAPENTRY paPages = pThis->paPages; + if (RT_LIKELY(paPages[iPage].HCPhys == HCPhys)) + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapPageHits0); + else + { + uint32_t iPage2 = (iPage + 1) % cPages; + if (RT_LIKELY(paPages[iPage2].HCPhys == HCPhys)) + { + iPage = iPage2; + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapPageHits1); + } + else + { + iPage2 = (iPage + 2) % cPages; + if (paPages[iPage2].HCPhys == HCPhys) + { + iPage = iPage2; + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapPageHits2); + } + else + { + iPage = pgmR0DynMapPageSlow(pThis, HCPhys, iPage, pVCpu, &fNew); + if (RT_UNLIKELY(iPage == UINT32_MAX)) + { + PGMRZDYNMAP_SPINLOCK_RELEASE(pThis); + *ppvPage = NULL; + return iPage; + } + } + } + } + + /* + * Reference it, update statistics and get the return address. 
+ */ + int32_t cRefs = ASMAtomicIncS32(&paPages[iPage].cRefs); + if (cRefs == 1) + { + pThis->cLoad++; + if (pThis->cLoad > pThis->cMaxLoad) + pThis->cMaxLoad = pThis->cLoad; + AssertMsg(pThis->cLoad <= pThis->cPages - pThis->cGuardPages, ("%d/%d\n", pThis->cLoad, pThis->cPages - pThis->cGuardPages)); + } + else if (RT_UNLIKELY(cRefs <= 0)) + { + ASMAtomicDecS32(&paPages[iPage].cRefs); + PGMRZDYNMAP_SPINLOCK_RELEASE(pThis); + *ppvPage = NULL; + AssertLogRelMsgFailedReturn(("cRefs=%d iPage=%u HCPhys=%RHp\n", cRefs, iPage, HCPhys), UINT32_MAX); + } + void *pvPage = paPages[iPage].pvPage; + +#ifndef IN_RC + /* + * Invalidate the entry? + */ + bool fInvalidateIt = RTCpuSetIsMemberByIndex(&paPages[iPage].PendingSet, iRealCpu); + if (RT_UNLIKELY(fInvalidateIt)) + RTCpuSetDelByIndex(&paPages[iPage].PendingSet, iRealCpu); +#else + NOREF(iRealCpu); +#endif + + PGMRZDYNMAP_SPINLOCK_RELEASE(pThis); + + /* + * Do the actual invalidation outside the spinlock. + */ +#ifdef IN_RC + if (RT_UNLIKELY(fNew)) +#else + if (RT_UNLIKELY(fInvalidateIt)) +#endif + { + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapPageInvlPg); + ASMInvalidatePage((uintptr_t)pvPage); + } + + *ppvPage = pvPage; + return iPage; +} + + +/** + * Assert the integrity of the pool. + * + * @returns VBox status code. + */ +static int pgmRZDynMapAssertIntegrity(PPGMRZDYNMAP pThis) +{ + /* + * Basic pool stuff that doesn't require any lock, just assumes we're a user. + */ + if (!pThis) + return VINF_SUCCESS; + AssertPtrReturn(pThis, VERR_INVALID_POINTER); + AssertReturn(pThis->u32Magic == PGMRZDYNMAP_MAGIC, VERR_INVALID_MAGIC); + if (!pThis->cUsers) + return VERR_INVALID_PARAMETER; + + PGMRZDYNMAP_SPINLOCK_ACQUIRE(pThis); + +#define CHECK_RET(expr, a) \ + do { \ + if (RT_UNLIKELY(!(expr))) \ + { \ + PGMRZDYNMAP_SPINLOCK_RELEASE(pThis); \ + RTAssertMsg1Weak(#expr, __LINE__, __FILE__, __PRETTY_FUNCTION__); \ + RTAssertMsg2Weak a; \ + return VERR_PGM_DYNMAP_IPE; \ + } \ + } while (0) + + /* + * Check that the PTEs are correct. 
+ */ + uint32_t cGuard = 0; + uint32_t cLoad = 0; + PPGMRZDYNMAPENTRY paPages = pThis->paPages; + +#ifndef IN_RC + if (pThis->fLegacyMode) +#endif + { +#ifdef IN_RING0 + PCX86PGUINT paSavedPTEs = (PCX86PGUINT)pThis->pvSavedPTEs; NOREF(paSavedPTEs); +#endif + uint32_t iPage = pThis->cPages; + while (iPage-- > 0) + { + CHECK_RET(!((uintptr_t)paPages[iPage].pvPage & PAGE_OFFSET_MASK), ("#%u: %p\n", iPage, paPages[iPage].pvPage)); + if ( paPages[iPage].cRefs == PGMR0DYNMAP_GUARD_PAGE_REF_COUNT + && paPages[iPage].HCPhys == PGMR0DYNMAP_GUARD_PAGE_HCPHYS) + { +#ifdef PGMR0DYNMAP_GUARD_NP + CHECK_RET(paPages[iPage].uPte.pLegacy->u == (paSavedPTEs[iPage] & ~(X86PGUINT)X86_PTE_P), + ("#%u: %#x %#x", iPage, paPages[iPage].uPte.pLegacy->u, paSavedPTEs[iPage])); +#else + CHECK_RET(paPages[iPage].uPte.pLegacy->u == PGMR0DYNMAP_GUARD_PAGE_LEGACY_PTE, + ("#%u: %#x", iPage, paPages[iPage].uPte.pLegacy->u)); +#endif + cGuard++; + } + else if (paPages[iPage].HCPhys != NIL_RTHCPHYS) + { + CHECK_RET(!(paPages[iPage].HCPhys & PAGE_OFFSET_MASK), ("#%u: %RHp\n", iPage, paPages[iPage].HCPhys)); + X86PGUINT uPte = X86_PTE_P | X86_PTE_RW | X86_PTE_A | X86_PTE_D +#ifdef IN_RING0 + | (paSavedPTEs[iPage] & (X86_PTE_G | X86_PTE_PAT | X86_PTE_PCD | X86_PTE_PWT)) +#endif + | (paPages[iPage].HCPhys & X86_PTE_PAE_PG_MASK); + CHECK_RET(paPages[iPage].uPte.pLegacy->u == uPte, + ("#%u: %#x %#x", iPage, paPages[iPage].uPte.pLegacy->u, uPte)); + if (paPages[iPage].cRefs) + cLoad++; + } +#if defined(IN_RING0) && !defined(PGMRZDYNMAP_STRICT_RELEASE) + else + CHECK_RET(paPages[iPage].uPte.pLegacy->u == paSavedPTEs[iPage], + ("#%u: %#x %#x", iPage, paPages[iPage].uPte.pLegacy->u, paSavedPTEs[iPage])); +#endif + } + } +#ifndef IN_RC + else +#endif + { +#ifdef IN_RING0 + PCX86PGPAEUINT paSavedPTEs = (PCX86PGPAEUINT)pThis->pvSavedPTEs; NOREF(paSavedPTEs); +#endif + uint32_t iPage = pThis->cPages; + while (iPage-- > 0) + { + CHECK_RET(!((uintptr_t)paPages[iPage].pvPage & PAGE_OFFSET_MASK), ("#%u: %p\n", iPage, paPages[iPage].pvPage)); + if ( paPages[iPage].cRefs == PGMR0DYNMAP_GUARD_PAGE_REF_COUNT + && paPages[iPage].HCPhys == PGMR0DYNMAP_GUARD_PAGE_HCPHYS) + { +#ifdef PGMR0DYNMAP_GUARD_NP + CHECK_RET(paPages[iPage].uPte.pPae->u == (paSavedPTEs[iPage] & ~(X86PGPAEUINT)X86_PTE_P), + ("#%u: %#llx %#llx", iPage, paPages[iPage].uPte.pPae->u, paSavedPTEs[iPage])); +#else + CHECK_RET(paPages[iPage].uPte.pPae->u == PGMR0DYNMAP_GUARD_PAGE_PAE_PTE, + ("#%u: %#llx", iPage, paPages[iPage].uPte.pPae->u)); +#endif + cGuard++; + } + else if (paPages[iPage].HCPhys != NIL_RTHCPHYS) + { + CHECK_RET(!(paPages[iPage].HCPhys & PAGE_OFFSET_MASK), ("#%u: %RHp\n", iPage, paPages[iPage].HCPhys)); + X86PGPAEUINT uPte = X86_PTE_P | X86_PTE_RW | X86_PTE_A | X86_PTE_D +#ifdef IN_RING0 + | (paSavedPTEs[iPage] & (X86_PTE_G | X86_PTE_PAT | X86_PTE_PCD | X86_PTE_PWT)) +#endif + | (paPages[iPage].HCPhys & X86_PTE_PAE_PG_MASK); + CHECK_RET(paPages[iPage].uPte.pPae->u == uPte, + ("#%u: %#llx %#llx", iPage, paPages[iPage].uPte.pLegacy->u, uPte)); + if (paPages[iPage].cRefs) + cLoad++; + } +#ifdef IN_RING0 + else + CHECK_RET(paPages[iPage].uPte.pPae->u == paSavedPTEs[iPage], + ("#%u: %#llx %#llx", iPage, paPages[iPage].uPte.pPae->u, paSavedPTEs[iPage])); +#endif + } + } + + CHECK_RET(cLoad == pThis->cLoad, ("%u %u\n", cLoad, pThis->cLoad)); + CHECK_RET(cGuard == pThis->cGuardPages, ("%u %u\n", cGuard, pThis->cGuardPages)); + +#undef CHECK_RET + PGMRZDYNMAP_SPINLOCK_RELEASE(pThis); + return VINF_SUCCESS; +} + +#ifdef IN_RING0 +/** + * Assert the integrity of the pool. 
+ * + * @returns VBox status code. + */ +VMMR0DECL(int) PGMR0DynMapAssertIntegrity(void) +{ + return pgmRZDynMapAssertIntegrity(g_pPGMR0DynMap); +} +#endif /* IN_RING0 */ + +#ifdef IN_RC +/** + * Assert the integrity of the pool. + * + * @returns VBox status code. + */ +VMMRCDECL(int) PGMRCDynMapAssertIntegrity(PVM pVM) +{ + return pgmRZDynMapAssertIntegrity((PPGMRZDYNMAP)pVM->pgm.s.pRCDynMap); +} +#endif /* IN_RC */ + + +/** + * As a final resort for a (somewhat) full auto set or full cache, try merge + * duplicate entries and flush the ones we can. + * + * @param pSet The set. + */ +static void pgmDynMapOptimizeAutoSet(PPGMMAPSET pSet) +{ + LogFlow(("pgmDynMapOptimizeAutoSet\n")); + + for (uint32_t i = 0 ; i < pSet->cEntries; i++) + { + /* + * Try merge entries. + */ + uint16_t const iPage = pSet->aEntries[i].iPage; + uint32_t j = i + 1; + while ( j < pSet->cEntries + && ( pSet->iSubset == UINT32_MAX + || pSet->iSubset < pSet->cEntries) ) + { + if (pSet->aEntries[j].iPage != iPage) + j++; + else + { + uint32_t const cHardRefs = (uint32_t)pSet->aEntries[i].cRefs + + (uint32_t)pSet->aEntries[j].cRefs; + uint32_t cInlinedRefs = (uint32_t)pSet->aEntries[i].cInlinedRefs + + (uint32_t)pSet->aEntries[j].cInlinedRefs; + uint32_t cUnrefs = (uint32_t)pSet->aEntries[i].cUnrefs + + (uint32_t)pSet->aEntries[j].cUnrefs; + uint32_t cSub = RT_MIN(cUnrefs, cInlinedRefs); + cInlinedRefs -= cSub; + cUnrefs -= cSub; + + if ( cHardRefs < UINT16_MAX + && cInlinedRefs < UINT16_MAX + && cUnrefs < UINT16_MAX) + { + /* merge j into i removing j. */ + Log2(("pgmDynMapOptimizeAutoSet: Merging #%u into #%u\n", j, i)); + pSet->aEntries[i].cRefs = cHardRefs; + pSet->aEntries[i].cInlinedRefs = cInlinedRefs; + pSet->aEntries[i].cUnrefs = cUnrefs; + pSet->cEntries--; + if (j < pSet->cEntries) + { + pSet->aEntries[j] = pSet->aEntries[pSet->cEntries]; + PGMRZDYNMAP_ZAP_ENTRY(&pSet->aEntries[pSet->cEntries]); + } + else + PGMRZDYNMAP_ZAP_ENTRY(&pSet->aEntries[j]); + } +#if 0 /* too complicated, skip it. */ + else + { + /* migrate the max number of refs from j into i and quit the inner loop. */ + uint32_t cMigrate = UINT16_MAX - 1 - pSet->aEntries[i].cRefs; + Assert(pSet->aEntries[j].cRefs > cMigrate); + pSet->aEntries[j].cRefs -= cMigrate; + pSet->aEntries[i].cRefs = UINT16_MAX - 1; + break; + } +#endif + } + } + + /* + * Try make use of the unused hinting (cUnrefs) to evict entries + * from both the set as well as the mapping cache. 
+ */ + + uint32_t const cTotalRefs = (uint32_t)pSet->aEntries[i].cRefs + pSet->aEntries[i].cInlinedRefs; + Log2(("pgmDynMapOptimizeAutoSet: #%u/%u/%u pvPage=%p iPage=%u cRefs=%u cInlinedRefs=%u cUnrefs=%u cTotalRefs=%u\n", + i, + pSet->iSubset, + pSet->cEntries, + pSet->aEntries[i].pvPage, + pSet->aEntries[i].iPage, + pSet->aEntries[i].cRefs, + pSet->aEntries[i].cInlinedRefs, + pSet->aEntries[i].cUnrefs, + cTotalRefs)); + Assert(cTotalRefs >= pSet->aEntries[i].cUnrefs); + + if ( cTotalRefs == pSet->aEntries[i].cUnrefs + && ( pSet->iSubset == UINT32_MAX + || pSet->iSubset < pSet->cEntries) + ) + { + Log2(("pgmDynMapOptimizeAutoSet: Releasing iPage=%d/%p\n", pSet->aEntries[i].iPage, pSet->aEntries[i].pvPage)); + //LogFlow(("pgmDynMapOptimizeAutoSet: Releasing iPage=%d/%p\n", pSet->aEntries[i].iPage, pSet->aEntries[i].pvPage)); + pgmRZDynMapReleasePage(PGMRZDYNMAP_SET_2_DYNMAP(pSet), + pSet->aEntries[i].iPage, + pSet->aEntries[i].cRefs); + pSet->cEntries--; + if (i < pSet->cEntries) + { + pSet->aEntries[i] = pSet->aEntries[pSet->cEntries]; + PGMRZDYNMAP_ZAP_ENTRY(&pSet->aEntries[pSet->cEntries]); + } + + i--; + } + } +} + + + + +/** + * Signals the start of a new set of mappings. + * + * Mostly for strictness. PGMDynMapHCPage won't work unless this + * API is called. + * + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + */ +VMMDECL(void) PGMRZDynMapStartAutoSet(PVMCPU pVCpu) +{ + LogFlow(("PGMRZDynMapStartAutoSet:\n")); + Assert(pVCpu->pgm.s.AutoSet.cEntries == PGMMAPSET_CLOSED); + Assert(pVCpu->pgm.s.AutoSet.iSubset == UINT32_MAX); + pVCpu->pgm.s.AutoSet.cEntries = 0; + pVCpu->pgm.s.AutoSet.iCpu = PGMRZDYNMAP_CUR_CPU(); +} + + +#ifdef IN_RING0 +/** + * Starts or migrates the autoset of a virtual CPU. + * + * This is used by HMR0Enter. When we've longjumped out of the HM + * execution loop with the set open, we'll migrate it when re-entering. While + * under normal circumstances, we'll start it so VMXR0LoadGuestState can access + * guest memory. + * + * @returns @c true if started, @c false if migrated. + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + * @thread EMT + */ +VMMR0DECL(bool) PGMR0DynMapStartOrMigrateAutoSet(PVMCPU pVCpu) +{ + bool fStartIt = pVCpu->pgm.s.AutoSet.cEntries == PGMMAPSET_CLOSED; + if (fStartIt) + PGMRZDynMapStartAutoSet(pVCpu); + else + PGMR0DynMapMigrateAutoSet(pVCpu); + return fStartIt; +} +#endif /* IN_RING0 */ + + +/** + * Checks if the set has high load. + * + * @returns true on high load, otherwise false. + * @param pSet The set. + */ +DECLINLINE(bool) pgmRZDynMapHasHighLoad(PPGMMAPSET pSet) +{ +#ifdef IN_RC + if (pSet->cEntries < MM_HYPER_DYNAMIC_SIZE / PAGE_SIZE / 2) + return false; +#endif + + PPGMRZDYNMAP pThis = PGMRZDYNMAP_SET_2_DYNMAP(pSet); + uint32_t cUnusedPages = pThis->cPages - pThis->cLoad; +#ifdef IN_RC + return cUnusedPages <= MM_HYPER_DYNAMIC_SIZE / PAGE_SIZE * 36 / 100; +#else + return cUnusedPages <= PGMR0DYNMAP_PAGES_PER_CPU_MIN; +#endif +} + + +/** + * Worker that performs the actual flushing of the set. + * + * @param pSet The set to flush. + * @param cEntries The number of entries. + */ +DECLINLINE(void) pgmDynMapFlushAutoSetWorker(PPGMMAPSET pSet, uint32_t cEntries) +{ + /* + * Release any pages it's referencing. 
+ */ + if ( cEntries != 0 + && RT_LIKELY(cEntries <= RT_ELEMENTS(pSet->aEntries))) + { + PPGMRZDYNMAP pThis = PGMRZDYNMAP_SET_2_DYNMAP(pSet); + PGMRZDYNMAP_SPINLOCK_ACQUIRE(pThis); + + uint32_t i = cEntries; + while (i-- > 0) + { + uint32_t iPage = pSet->aEntries[i].iPage; + Assert(iPage < pThis->cPages); + int32_t cRefs = pSet->aEntries[i].cRefs; + Assert(cRefs > 0); + pgmRZDynMapReleasePageLocked(pThis, iPage, cRefs); + + PGMRZDYNMAP_ZAP_ENTRY(&pSet->aEntries[i]); + } + + Assert(pThis->cLoad <= pThis->cPages - pThis->cGuardPages); + PGMRZDYNMAP_SPINLOCK_RELEASE(pThis); + } +} + + +/** + * Releases the dynamic memory mappings made by PGMDynMapHCPage and associates + * since the PGMDynMapStartAutoSet call. + * + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + */ +VMMDECL(void) PGMRZDynMapReleaseAutoSet(PVMCPU pVCpu) +{ + PPGMMAPSET pSet = &pVCpu->pgm.s.AutoSet; + + /* + * Close and flush the set. + */ + uint32_t cEntries = pSet->cEntries; + AssertReturnVoid(cEntries != PGMMAPSET_CLOSED); + pSet->cEntries = PGMMAPSET_CLOSED; + pSet->iSubset = UINT32_MAX; + pSet->iCpu = -1; + +#ifdef IN_RC + if (RT_ELEMENTS(pSet->aEntries) > MM_HYPER_DYNAMIC_SIZE / PAGE_SIZE) + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->aStatRZDynMapSetFilledPct[(cEntries * 10 / (MM_HYPER_DYNAMIC_SIZE / PAGE_SIZE)) % 11]); + else +#endif + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->aStatRZDynMapSetFilledPct[(cEntries * 10 / RT_ELEMENTS(pSet->aEntries)) % 11]); + if (cEntries > RT_ELEMENTS(pSet->aEntries) * 50 / 100) + Log(("PGMRZDynMapReleaseAutoSet: cEntries=%d\n", cEntries)); + else + LogFlow(("PGMRZDynMapReleaseAutoSet: cEntries=%d\n", cEntries)); + + pgmDynMapFlushAutoSetWorker(pSet, cEntries); +} + + +/** + * Flushes the set if it's above a certain threshold. + * + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + */ +VMMDECL(void) PGMRZDynMapFlushAutoSet(PVMCPU pVCpu) +{ + PPGMMAPSET pSet = &pVCpu->pgm.s.AutoSet; + AssertMsg(pSet->iCpu == PGMRZDYNMAP_CUR_CPU(), ("%d %d efl=%#x\n", pSet->iCpu, PGMRZDYNMAP_CUR_CPU(), ASMGetFlags())); + + /* + * Only flush it if it's 45% full. + */ + uint32_t cEntries = pSet->cEntries; + AssertReturnVoid(cEntries != PGMMAPSET_CLOSED); + Assert(pSet->iSubset == UINT32_MAX); +#ifdef IN_RC + if (RT_ELEMENTS(pSet->aEntries) > MM_HYPER_DYNAMIC_SIZE / PAGE_SIZE) + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->aStatRZDynMapSetFilledPct[(cEntries * 10 / (MM_HYPER_DYNAMIC_SIZE / PAGE_SIZE)) % 11]); + else +#endif + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->aStatRZDynMapSetFilledPct[(cEntries * 10 / RT_ELEMENTS(pSet->aEntries)) % 11]); + if ( cEntries >= RT_ELEMENTS(pSet->aEntries) * 45 / 100 + || pgmRZDynMapHasHighLoad(pSet)) + { + pSet->cEntries = 0; + Log(("PGMDynMapFlushAutoSet: cEntries=%d\n", pSet->cEntries)); + + pgmDynMapFlushAutoSetWorker(pSet, cEntries); + AssertMsg(pSet->iCpu == PGMRZDYNMAP_CUR_CPU(), ("%d %d efl=%#x\n", pSet->iCpu, PGMRZDYNMAP_CUR_CPU(), ASMGetFlags())); + } +} + + +#ifndef IN_RC +/** + * Migrates the automatic mapping set of the current vCPU if it's active and + * necessary. + * + * This is called when re-entering the hardware assisted execution mode after a + * nip down to ring-3. We run the risk that the CPU might have change and we + * will therefore make sure all the cache entries currently in the auto set will + * be valid on the new CPU. If the cpu didn't change nothing will happen as all + * the entries will have been flagged as invalidated. 
+ * + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + * @thread EMT + */ +VMMR0DECL(void) PGMR0DynMapMigrateAutoSet(PVMCPU pVCpu) +{ + LogFlow(("PGMR0DynMapMigrateAutoSet\n")); + PPGMMAPSET pSet = &pVCpu->pgm.s.AutoSet; + int32_t iRealCpu = PGMRZDYNMAP_CUR_CPU(); + if (pSet->iCpu != iRealCpu) + { + uint32_t i = pSet->cEntries; + if (i != PGMMAPSET_CLOSED) + { + AssertMsg(i <= RT_ELEMENTS(pSet->aEntries), ("%#x (%u)\n", i, i)); + if (i != 0 && RT_LIKELY(i <= RT_ELEMENTS(pSet->aEntries))) + { + PPGMRZDYNMAP pThis = PGMRZDYNMAP_SET_2_DYNMAP(pSet); + PGMRZDYNMAP_SPINLOCK_ACQUIRE(pThis); + + while (i-- > 0) + { + Assert(pSet->aEntries[i].cRefs > 0); + uint32_t iPage = pSet->aEntries[i].iPage; + Assert(iPage < pThis->cPages); + if (RTCpuSetIsMemberByIndex(&pThis->paPages[iPage].PendingSet, iRealCpu)) + { + RTCpuSetDelByIndex(&pThis->paPages[iPage].PendingSet, iRealCpu); + PGMRZDYNMAP_SPINLOCK_RELEASE(pThis); + + ASMInvalidatePage((uintptr_t)pThis->paPages[iPage].pvPage); + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapMigrateInvlPg); + + PGMRZDYNMAP_SPINLOCK_REACQUIRE(pThis); + } + } + + PGMRZDYNMAP_SPINLOCK_RELEASE(pThis); + } + } + pSet->iCpu = iRealCpu; + } +} +#endif /* !IN_RC */ + + +/** + * Worker function that flushes the current subset. + * + * This is called when the set is popped or when the set + * hash a too high load. As also pointed out elsewhere, the + * whole subset thing is a hack for working around code that + * accesses too many pages. Like PGMPool. + * + * @param pSet The set which subset to flush. + */ +static void pgmDynMapFlushSubset(PPGMMAPSET pSet) +{ + uint32_t iSubset = pSet->iSubset; + uint32_t i = pSet->cEntries; + Assert(i <= RT_ELEMENTS(pSet->aEntries)); + if ( i > iSubset + && i <= RT_ELEMENTS(pSet->aEntries)) + { + Log(("pgmDynMapFlushSubset: cEntries=%d iSubset=%d\n", pSet->cEntries, iSubset)); + pSet->cEntries = iSubset; + + PPGMRZDYNMAP pThis = PGMRZDYNMAP_SET_2_DYNMAP(pSet); + PGMRZDYNMAP_SPINLOCK_ACQUIRE(pThis); + + while (i-- > iSubset) + { + uint32_t iPage = pSet->aEntries[i].iPage; + Assert(iPage < pThis->cPages); + int32_t cRefs = pSet->aEntries[i].cRefs; + Assert(cRefs > 0); + pgmRZDynMapReleasePageLocked(pThis, iPage, cRefs); + + PGMRZDYNMAP_ZAP_ENTRY(&pSet->aEntries[i]); + } + + PGMRZDYNMAP_SPINLOCK_RELEASE(pThis); + } +} + + +/** + * Creates a subset. + * + * A subset is a hack to avoid having to rewrite code that touches a lot of + * pages. It prevents the mapping set from being overflowed by automatically + * flushing previous mappings when a certain threshold is reached. + * + * Pages mapped after calling this function are only valid until the next page + * is mapped. + * + * @returns The index of the previous subset. Pass this to + * PGMDynMapPopAutoSubset when popping it. + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + */ +VMMDECL(uint32_t) PGMRZDynMapPushAutoSubset(PVMCPU pVCpu) +{ + PPGMMAPSET pSet = &pVCpu->pgm.s.AutoSet; + AssertReturn(pSet->cEntries != PGMMAPSET_CLOSED, UINT32_MAX); + uint32_t iPrevSubset = pSet->iSubset; + LogFlow(("PGMRZDynMapPushAutoSubset: pVCpu=%p iPrevSubset=%u\n", pVCpu, iPrevSubset)); + + /* + * If it looks like we're approaching the max set size or mapping space + * optimize the set to drop off unused pages. 
+ */ + if ( pSet->cEntries > RT_ELEMENTS(pSet->aEntries) * 60 / 100 + || pgmRZDynMapHasHighLoad(pSet)) + { + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapSetOptimize); + pgmDynMapOptimizeAutoSet(pSet); + } + + pSet->iSubset = pSet->cEntries; + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapSubsets); + + AssertMsg(iPrevSubset <= pSet->iSubset || iPrevSubset == UINT32_MAX, ("iPrevSubset=%#x iSubset=%#x\n", iPrevSubset, pSet->iSubset)); + return iPrevSubset; +} + + +/** + * Pops a subset created by a previous call to PGMDynMapPushAutoSubset. + * + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + * @param iPrevSubset What PGMDynMapPushAutoSubset returned. + */ +VMMDECL(void) PGMRZDynMapPopAutoSubset(PVMCPU pVCpu, uint32_t iPrevSubset) +{ + PPGMMAPSET pSet = &pVCpu->pgm.s.AutoSet; + uint32_t cEntries = pSet->cEntries; + LogFlow(("PGMRZDynMapPopAutoSubset: pVCpu=%p iPrevSubset=%u iSubset=%u cEntries=%u\n", pVCpu, iPrevSubset, pSet->iSubset, cEntries)); + AssertReturnVoid(cEntries != PGMMAPSET_CLOSED); + AssertMsgReturnVoid(pSet->iSubset >= iPrevSubset || iPrevSubset == UINT32_MAX, ("iPrevSubset=%u iSubset=%u cEntries=%u\n", iPrevSubset, pSet->iSubset, cEntries)); +#ifdef IN_RC + if (RT_ELEMENTS(pSet->aEntries) > MM_HYPER_DYNAMIC_SIZE / PAGE_SIZE) + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->aStatRZDynMapSetFilledPct[(cEntries * 10 / (MM_HYPER_DYNAMIC_SIZE / PAGE_SIZE)) % 11]); + else +#endif + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->aStatRZDynMapSetFilledPct[(cEntries * 10 / RT_ELEMENTS(pSet->aEntries)) % 11]); + if ( cEntries >= RT_ELEMENTS(pSet->aEntries) * 40 / 100 + && cEntries != pSet->iSubset) + { + pgmDynMapFlushSubset(pSet); + Assert(pSet->cEntries >= iPrevSubset || iPrevSubset == UINT32_MAX); + } + pSet->iSubset = iPrevSubset; +} + + +/** + * Indicates that the given page is unused and its mapping can be re-used. + * + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + * @param pvHint The page that is now unused. This does not have to + * point at the start of the page. NULL is ignored. + */ +#ifdef LOG_ENABLED +void pgmRZDynMapUnusedHint(PVMCPU pVCpu, void *pvHint, RT_SRC_POS_DECL) +#else +void pgmRZDynMapUnusedHint(PVMCPU pVCpu, void *pvHint) +#endif +{ + /* + * Ignore NULL pointers and mask off the page offset bits. + */ + if (pvHint == NULL) + return; + pvHint = (void *)((uintptr_t)pvHint & ~(uintptr_t)PAGE_OFFSET_MASK); + + PPGMMAPSET pSet = &pVCpu->pgm.s.AutoSet; + uint32_t iEntry = pSet->cEntries; + AssertReturnVoid(iEntry > 0); + + /* + * Find the entry in the usual unrolled fashion. + */ + /** @todo add a hint to the set which entry was used last since it's not + * always the last entry? 
*/ +#define IS_MATCHING_ENTRY(pSet, iEntry, pvHint) \ + ( (pSet)->aEntries[(iEntry)].pvPage == (pvHint) \ + && (uint32_t)(pSet)->aEntries[(iEntry)].cRefs + (pSet)->aEntries[(iEntry)].cInlinedRefs \ + > (pSet)->aEntries[(iEntry)].cUnrefs ) + if ( iEntry >= 1 && IS_MATCHING_ENTRY(pSet, iEntry - 1, pvHint)) + iEntry = iEntry - 1; + else if (iEntry >= 2 && IS_MATCHING_ENTRY(pSet, iEntry - 2, pvHint)) + iEntry = iEntry - 2; + else if (iEntry >= 3 && IS_MATCHING_ENTRY(pSet, iEntry - 3, pvHint)) + iEntry = iEntry - 3; + else if (iEntry >= 4 && IS_MATCHING_ENTRY(pSet, iEntry - 4, pvHint)) + iEntry = iEntry - 4; + else if (iEntry >= 5 && IS_MATCHING_ENTRY(pSet, iEntry - 5, pvHint)) + iEntry = iEntry - 5; + else if (iEntry >= 6 && IS_MATCHING_ENTRY(pSet, iEntry - 6, pvHint)) + iEntry = iEntry - 6; + else if (iEntry >= 7 && IS_MATCHING_ENTRY(pSet, iEntry - 7, pvHint)) + iEntry = iEntry - 7; + else + { + /* + * Loop till we find it. + */ + bool fFound = false; + if (iEntry > 7) + { + iEntry -= 7; + while (iEntry-- > 0) + if (IS_MATCHING_ENTRY(pSet, iEntry, pvHint)) + { + fFound = true; + break; + } + } + AssertMsgReturnVoid(fFound, + ("pvHint=%p cEntries=%#x iSubset=%#x\n" + "aEntries[0] = {%#x, %#x, %#x, %#x, %p}\n" + "aEntries[1] = {%#x, %#x, %#x, %#x, %p}\n" + "aEntries[2] = {%#x, %#x, %#x, %#x, %p}\n" + "aEntries[3] = {%#x, %#x, %#x, %#x, %p}\n" + "aEntries[4] = {%#x, %#x, %#x, %#x, %p}\n" + "aEntries[5] = {%#x, %#x, %#x, %#x, %p}\n" + , + pvHint, pSet->cEntries, pSet->iSubset, + pSet->aEntries[0].iPage, pSet->aEntries[0].cRefs, pSet->aEntries[0].cInlinedRefs, pSet->aEntries[0].cUnrefs, pSet->aEntries[0].pvPage, + pSet->aEntries[1].iPage, pSet->aEntries[1].cRefs, pSet->aEntries[1].cInlinedRefs, pSet->aEntries[1].cUnrefs, pSet->aEntries[1].pvPage, + pSet->aEntries[2].iPage, pSet->aEntries[2].cRefs, pSet->aEntries[2].cInlinedRefs, pSet->aEntries[2].cUnrefs, pSet->aEntries[2].pvPage, + pSet->aEntries[3].iPage, pSet->aEntries[3].cRefs, pSet->aEntries[3].cInlinedRefs, pSet->aEntries[3].cUnrefs, pSet->aEntries[3].pvPage, + pSet->aEntries[4].iPage, pSet->aEntries[4].cRefs, pSet->aEntries[4].cInlinedRefs, pSet->aEntries[4].cUnrefs, pSet->aEntries[4].pvPage, + pSet->aEntries[5].iPage, pSet->aEntries[5].cRefs, pSet->aEntries[5].cInlinedRefs, pSet->aEntries[5].cUnrefs, pSet->aEntries[5].pvPage)); + } +#undef IS_MATCHING_ENTRY + + /* + * Update it. + */ + uint32_t const cTotalRefs = (uint32_t)pSet->aEntries[iEntry].cRefs + pSet->aEntries[iEntry].cInlinedRefs; + uint32_t const cUnrefs = pSet->aEntries[iEntry].cUnrefs; + LogFlow(("pgmRZDynMapUnusedHint: pvHint=%p #%u cRefs=%d cInlinedRefs=%d cUnrefs=%d (+1) cTotalRefs=%d %s(%d) %s\n", + pvHint, iEntry, pSet->aEntries[iEntry].cRefs, pSet->aEntries[iEntry].cInlinedRefs, cUnrefs, cTotalRefs, pszFile, iLine, pszFunction)); + AssertReturnVoid(cTotalRefs > cUnrefs); + + if (RT_LIKELY(cUnrefs < UINT16_MAX - 1)) + pSet->aEntries[iEntry].cUnrefs++; + else if (pSet->aEntries[iEntry].cInlinedRefs) + { + uint32_t cSub = RT_MIN(pSet->aEntries[iEntry].cInlinedRefs, pSet->aEntries[iEntry].cUnrefs); + pSet->aEntries[iEntry].cInlinedRefs -= cSub; + pSet->aEntries[iEntry].cUnrefs -= cSub; + pSet->aEntries[iEntry].cUnrefs++; + } + else + Log(("pgmRZDynMapUnusedHint: pvHint=%p ignored because of overflow! %s(%d) %s\n", pvHint, pszFile, iLine, pszFunction)); + +#ifdef PGMRZDYNMAP_STRICT_RELEASE + /* + * Optimize the set to trigger the unmapping and invalidation of the page. 
+ */ + if (cUnrefs + 1 == cTotalRefs) + pgmDynMapOptimizeAutoSet(pSet); +#endif +} + + +/** + * Common worker code for pgmRZDynMapHCPageInlined, pgmRZDynMapHCPageV2Inlined + * and pgmR0DynMapGCPageOffInlined. + * + * @returns VINF_SUCCESS, bails out to ring-3 on failure. + * @param pSet The set. + * @param HCPhys The physical address of the page. + * @param ppv Where to store the address of the mapping on success. + * + * @remarks This is a very hot path. + */ +int pgmRZDynMapHCPageCommon(PPGMMAPSET pSet, RTHCPHYS HCPhys, void **ppv RTLOG_COMMA_SRC_POS_DECL) +{ + AssertMsg(pSet->iCpu == PGMRZDYNMAP_CUR_CPU(), ("%d %d efl=%#x\n", pSet->iCpu, PGMRZDYNMAP_CUR_CPU(), ASMGetFlags())); + PVMCPU pVCpu = PGMRZDYNMAP_SET_2_VMCPU(pSet); + STAM_PROFILE_START(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapHCPage, a); + + /* + * Map it. + */ + void *pvPage; + PPGMRZDYNMAP pThis = PGMRZDYNMAP_SET_2_DYNMAP(pSet); + uint32_t iPage = pgmR0DynMapPage(pThis, HCPhys, pSet->iCpu, pVCpu, &pvPage); + if (RT_UNLIKELY(iPage == UINT32_MAX)) + { + /* + * We're out of mapping space, optimize our set to try remedy the + * situation. (Only works if there are unreference hints.) + */ + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapSetOptimize); + pgmDynMapOptimizeAutoSet(pSet); + + iPage = pgmR0DynMapPage(pThis, HCPhys, pSet->iCpu, pVCpu, &pvPage); + if (RT_UNLIKELY(iPage == UINT32_MAX)) + { + RTAssertMsg2Weak("pgmRZDynMapHCPageCommon: cLoad=%u/%u cPages=%u cGuardPages=%u\n", + pThis->cLoad, pThis->cMaxLoad, pThis->cPages, pThis->cGuardPages); + if (!g_fPGMR0DynMapTestRunning) + VMMRZCallRing3NoCpu(PGMRZDYNMAP_SET_2_VM(pSet), VMMCALLRING3_VM_R0_ASSERTION, 0); + *ppv = NULL; + STAM_PROFILE_STOP(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapHCPage, a); + return VERR_PGM_DYNMAP_FAILED; + } + } + + /* + * Add the page to the auto reference set. + * + * The typical usage pattern means that the same pages will be mapped + * several times in the same set. We can catch most of these + * remappings by looking a few pages back into the set. (The searching + * and set optimizing path will hardly ever be used when doing this.) + */ + AssertCompile(RT_ELEMENTS(pSet->aEntries) >= 8); + int32_t i = pSet->cEntries; + if (i-- < 5) + { + unsigned iEntry = pSet->cEntries++; + pSet->aEntries[iEntry].cRefs = 1; + pSet->aEntries[iEntry].cUnrefs = 0; + pSet->aEntries[iEntry].cInlinedRefs = 0; + pSet->aEntries[iEntry].iPage = iPage; + pSet->aEntries[iEntry].pvPage = pvPage; + pSet->aEntries[iEntry].HCPhys = HCPhys; + pSet->aiHashTable[PGMMAPSET_HASH(HCPhys)] = iEntry; + LogFlow(("pgmRZDynMapHCPageCommon: pSet=%p HCPhys=%RHp #%u/%u/%p cRefs=%u/0/0 iPage=%#x [a] %s(%d) %s\n", + pSet, HCPhys, iEntry, iEntry + 1, pvPage, 1, iPage, pszFile, iLine, pszFunction)); + } + /* Any of the last 5 pages? 
*/ + else if ( pSet->aEntries[i - 0].iPage == iPage + && pSet->aEntries[i - 0].cRefs < UINT16_MAX - 1) + { + pSet->aEntries[i - 0].cRefs++; + LogFlow(("pgmRZDynMapHCPageCommon: pSet=%p HCPhys=%RHp #%u/%u/%p cRefs=%u/%u/%u iPage=%#x [0] %s(%d) %s\n", pSet, HCPhys, i - 0, pSet->cEntries, pvPage, pSet->aEntries[i - 0].cRefs, pSet->aEntries[i - 0].cInlinedRefs, pSet->aEntries[i - 0].cUnrefs, iPage, pszFile, iLine, pszFunction)); + } + else if ( pSet->aEntries[i - 1].iPage == iPage + && pSet->aEntries[i - 1].cRefs < UINT16_MAX - 1) + { + pSet->aEntries[i - 1].cRefs++; + LogFlow(("pgmRZDynMapHCPageCommon: pSet=%p HCPhys=%RHp #%u/%u/%p cRefs=%u/%u/%u iPage=%#x [1] %s(%d) %s\n", pSet, HCPhys, i - 1, pSet->cEntries, pvPage, pSet->aEntries[i - 1].cRefs, pSet->aEntries[i - 1].cInlinedRefs, pSet->aEntries[i - 1].cUnrefs, iPage, pszFile, iLine, pszFunction)); + } + else if ( pSet->aEntries[i - 2].iPage == iPage + && pSet->aEntries[i - 2].cRefs < UINT16_MAX - 1) + { + pSet->aEntries[i - 2].cRefs++; + LogFlow(("pgmRZDynMapHCPageCommon: pSet=%p HCPhys=%RHp #%u/%u/%p cRefs=%u/%u/%u iPage=%#x [2] %s(%d) %s\n", pSet, HCPhys, i - 2, pSet->cEntries, pvPage, pSet->aEntries[i - 2].cRefs, pSet->aEntries[i - 2].cInlinedRefs, pSet->aEntries[i - 2].cUnrefs, iPage, pszFile, iLine, pszFunction)); + } + else if ( pSet->aEntries[i - 3].iPage == iPage + && pSet->aEntries[i - 3].cRefs < UINT16_MAX - 1) + { + pSet->aEntries[i - 3].cRefs++; + LogFlow(("pgmRZDynMapHCPageCommon: pSet=%p HCPhys=%RHp #%u/%u/%p cRefs=%u/%u/%u iPage=%#x [4] %s(%d) %s\n", pSet, HCPhys, i - 3, pSet->cEntries, pvPage, pSet->aEntries[i - 3].cRefs, pSet->aEntries[i - 3].cInlinedRefs, pSet->aEntries[i - 3].cUnrefs, iPage, pszFile, iLine, pszFunction)); + } + else if ( pSet->aEntries[i - 4].iPage == iPage + && pSet->aEntries[i - 4].cRefs < UINT16_MAX - 1) + { + pSet->aEntries[i - 4].cRefs++; + LogFlow(("pgmRZDynMapHCPageCommon: pSet=%p HCPhys=%RHp #%u/%u/%p cRefs=%u/%u/%u iPage=%#x [4] %s(%d) %s\n", pSet, HCPhys, i - 4, pSet->cEntries, pvPage, pSet->aEntries[i - 4].cRefs, pSet->aEntries[i - 4].cInlinedRefs, pSet->aEntries[i - 4].cUnrefs, iPage, pszFile, iLine, pszFunction)); + } + /* Don't bother searching unless we're above a 60% load. */ + else if (RT_LIKELY(i <= (int32_t)RT_ELEMENTS(pSet->aEntries) * 60 / 100)) + { + unsigned iEntry = pSet->cEntries++; + pSet->aEntries[iEntry].cRefs = 1; + pSet->aEntries[iEntry].cUnrefs = 0; + pSet->aEntries[iEntry].cInlinedRefs = 0; + pSet->aEntries[iEntry].iPage = iPage; + pSet->aEntries[iEntry].pvPage = pvPage; + pSet->aEntries[iEntry].HCPhys = HCPhys; + pSet->aiHashTable[PGMMAPSET_HASH(HCPhys)] = iEntry; + LogFlow(("pgmRZDynMapHCPageCommon: pSet=%p HCPhys=%RHp #%u/%u/%p cRefs=1/0/0 iPage=%#x [b] %s(%d) %s\n", pSet, HCPhys, iEntry, pSet->cEntries, pvPage, iPage, pszFile, iLine, pszFunction)); + } + else + { + /* Search the rest of the set. 
*/ + Assert(pSet->cEntries <= RT_ELEMENTS(pSet->aEntries)); + i -= 4; + while (i-- > 0) + if ( pSet->aEntries[i].iPage == iPage + && pSet->aEntries[i].cRefs < UINT16_MAX - 1) + { + pSet->aEntries[i].cRefs++; + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapSetSearchHits); + LogFlow(("pgmRZDynMapHCPageCommon: pSet=%p HCPhys=%RHp #%u/%u/%p cRefs=%u/%u/%u iPage=%#x [c] %s(%d) %s\n", pSet, HCPhys, i, pSet->cEntries, pvPage, pSet->aEntries[i].cRefs, pSet->aEntries[i].cInlinedRefs, pSet->aEntries[i].cUnrefs, iPage, pszFile, iLine, pszFunction)); + break; + } + if (i < 0) + { + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapSetSearchMisses); +#if 0 /* this is very bogus */ + if (pSet->iSubset < pSet->cEntries) + { + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapSetSearchFlushes); + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->aStatRZDynMapSetFilledPct[(pSet->cEntries * 10 / RT_ELEMENTS(pSet->aEntries)) % 11]); + pgmDynMapFlushSubset(pSet); + } +#endif + + if (RT_UNLIKELY(pSet->cEntries >= RT_ELEMENTS(pSet->aEntries))) + { + STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapSetOptimize); + pgmDynMapOptimizeAutoSet(pSet); + } + + if (RT_LIKELY(pSet->cEntries < RT_ELEMENTS(pSet->aEntries))) + { + unsigned iEntry = pSet->cEntries++; + pSet->aEntries[iEntry].cRefs = 1; + pSet->aEntries[iEntry].cUnrefs = 0; + pSet->aEntries[iEntry].cInlinedRefs = 0; + pSet->aEntries[iEntry].iPage = iPage; + pSet->aEntries[iEntry].pvPage = pvPage; + pSet->aEntries[iEntry].HCPhys = HCPhys; + pSet->aiHashTable[PGMMAPSET_HASH(HCPhys)] = iEntry; + LogFlow(("pgmRZDynMapHCPageCommon: pSet=%p HCPhys=%RHp #%u/%u/%p cRefs=1/0/0 iPage=%#x [d] %s(%d) %s\n", pSet, HCPhys, iEntry, pSet->cEntries, pvPage, iPage, pszFile, iLine, pszFunction)); + } + else + { + /* We're screwed. */ + pgmRZDynMapReleasePage(pThis, iPage, 1); + + RTAssertMsg2Weak("pgmRZDynMapHCPageCommon: set is full!\n"); + if (!g_fPGMR0DynMapTestRunning) + VMMRZCallRing3NoCpu(PGMRZDYNMAP_SET_2_VM(pSet), VMMCALLRING3_VM_R0_ASSERTION, 0); + *ppv = NULL; + STAM_PROFILE_STOP(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapHCPage, a); + return VERR_PGM_DYNMAP_FULL_SET; + } + } + } + + *ppv = pvPage; + STAM_PROFILE_STOP(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZDynMapHCPage, a); + return VINF_SUCCESS; +} + + +#if 0 /*def DEBUG*/ +/** For pgmR0DynMapTest3PerCpu. */ +typedef struct PGMR0DYNMAPTEST +{ + uint32_t u32Expect; + uint32_t *pu32; + uint32_t volatile cFailures; +} PGMR0DYNMAPTEST; +typedef PGMR0DYNMAPTEST *PPGMR0DYNMAPTEST; + +/** + * Checks that the content of the page is the same on all CPUs, i.e. that there + * are no CPU specific PTs or similar nasty stuff involved. + * + * @param idCpu The current CPU. + * @param pvUser1 Pointer a PGMR0DYNMAPTEST structure. + * @param pvUser2 Unused, ignored. + */ +static DECLCALLBACK(void) pgmR0DynMapTest3PerCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + PPGMR0DYNMAPTEST pTest = (PPGMR0DYNMAPTEST)pvUser1; + ASMInvalidatePage(pTest->pu32); + if (*pTest->pu32 != pTest->u32Expect) + ASMAtomicIncU32(&pTest->cFailures); + NOREF(pvUser2); NOREF(idCpu); +} + + +/** + * Performs some basic tests in debug builds. + */ +static int pgmR0DynMapTest(PVM pVM) +{ + LogRel(("pgmR0DynMapTest: ****** START ******\n")); + PPGMMAPSET pSet = &pVM->aCpus[0].pgm.s.AutoSet; + PPGMRZDYNMAP pThis = PGMRZDYNMAP_SET_2_DYNMAP(pSet); + uint32_t i; + + /* + * Assert internal integrity first. 
+ */ + LogRel(("Test #0\n")); + int rc = PGMR0DynMapAssertIntegrity(); + if (RT_FAILURE(rc)) + return rc; + + void *pvR0DynMapUsedSaved = pVM->pgm.s.pvR0DynMapUsed; + pVM->pgm.s.pvR0DynMapUsed = pThis; + g_fPGMR0DynMapTestRunning = true; + + /* + * Simple test, map CR3 twice and check that we're getting the + * same mapping address back. + */ + LogRel(("Test #1\n")); + ASMIntDisable(); + PGMRZDynMapStartAutoSet(&pVM->aCpus[0]); + + uint64_t cr3 = ASMGetCR3() & ~(uint64_t)PAGE_OFFSET_MASK; + void *pv = (void *)(intptr_t)-1; + void *pv2 = (void *)(intptr_t)-2; + rc = pgmRZDynMapHCPageCommon(pVM, cr3, &pv RTLOG_COMMA_SRC_POS); + int rc2 = pgmRZDynMapHCPageCommon(pVM, cr3, &pv2 RTLOG_COMMA_SRC_POS); + ASMIntEnable(); + if ( RT_SUCCESS(rc2) + && RT_SUCCESS(rc) + && pv == pv2) + { + LogRel(("Load=%u/%u/%u Set=%u/%u\n", pThis->cLoad, pThis->cMaxLoad, pThis->cPages - pThis->cPages, pSet->cEntries, RT_ELEMENTS(pSet->aEntries))); + rc = PGMR0DynMapAssertIntegrity(); + + /* + * Check that the simple set overflow code works by filling it + * with more CR3 mappings. + */ + LogRel(("Test #2\n")); + ASMIntDisable(); + PGMR0DynMapMigrateAutoSet(&pVM->aCpus[0]); + for (i = 0 ; i < UINT16_MAX*2 - 1 && RT_SUCCESS(rc) && pv2 == pv; i++) + { + pv2 = (void *)(intptr_t)-4; + rc = pgmRZDynMapHCPageCommon(pVM, cr3, &pv2 RTLOG_COMMA_SRC_POS); + } + ASMIntEnable(); + if (RT_FAILURE(rc) || pv != pv2) + { + LogRel(("failed(%d): rc=%Rrc; pv=%p pv2=%p i=%p\n", __LINE__, rc, pv, pv2, i)); + if (RT_SUCCESS(rc)) rc = VERR_PGM_DYNMAP_IPE; + } + else if (pSet->cEntries != 5) + { + LogRel(("failed(%d): cEntries=%d expected %d\n", __LINE__, pSet->cEntries, RT_ELEMENTS(pSet->aEntries) / 2)); + rc = VERR_PGM_DYNMAP_IPE; + } + else if ( pSet->aEntries[4].cRefs != UINT16_MAX - 1 + || pSet->aEntries[3].cRefs != UINT16_MAX - 1 + || pSet->aEntries[2].cRefs != 1 + || pSet->aEntries[1].cRefs != 1 + || pSet->aEntries[0].cRefs != 1) + { + LogRel(("failed(%d): bad set dist: ", __LINE__)); + for (i = 0; i < pSet->cEntries; i++) + LogRel(("[%d]=%d, ", i, pSet->aEntries[i].cRefs)); + LogRel(("\n")); + rc = VERR_PGM_DYNMAP_IPE; + } + if (RT_SUCCESS(rc)) + rc = PGMR0DynMapAssertIntegrity(); + if (RT_SUCCESS(rc)) + { + /* + * Trigger an set optimization run (exactly). + */ + LogRel(("Test #3\n")); + ASMIntDisable(); + PGMR0DynMapMigrateAutoSet(&pVM->aCpus[0]); + pv2 = NULL; + for (i = 0 ; i < RT_ELEMENTS(pSet->aEntries) - 5 && RT_SUCCESS(rc) && pv2 != pv; i++) + { + pv2 = (void *)(intptr_t)(-5 - i); + rc = pgmRZDynMapHCPageCommon(pVM, cr3 + PAGE_SIZE * (i + 5), &pv2 RTLOG_COMMA_SRC_POS); + } + ASMIntEnable(); + if (RT_FAILURE(rc) || pv == pv2) + { + LogRel(("failed(%d): rc=%Rrc; pv=%p pv2=%p i=%d\n", __LINE__, rc, pv, pv2, i)); + if (RT_SUCCESS(rc)) rc = VERR_PGM_DYNMAP_IPE; + } + else if (pSet->cEntries != RT_ELEMENTS(pSet->aEntries)) + { + LogRel(("failed(%d): cEntries=%d expected %d\n", __LINE__, pSet->cEntries, RT_ELEMENTS(pSet->aEntries))); + rc = VERR_PGM_DYNMAP_IPE; + } + LogRel(("Load=%u/%u/%u Set=%u/%u\n", pThis->cLoad, pThis->cMaxLoad, pThis->cPages - pThis->cPages, pSet->cEntries, RT_ELEMENTS(pSet->aEntries))); + if (RT_SUCCESS(rc)) + rc = PGMR0DynMapAssertIntegrity(); + if (RT_SUCCESS(rc)) + { + /* + * Trigger an overflow error. 
+ */ + LogRel(("Test #4\n")); + ASMIntDisable(); + PGMR0DynMapMigrateAutoSet(&pVM->aCpus[0]); + for (i = 0 ; i < RT_ELEMENTS(pSet->aEntries) + 2; i++) + { + rc = pgmRZDynMapHCPageCommon(pVM, cr3 - PAGE_SIZE * (i + 5), &pv2 RTLOG_COMMA_SRC_POS); + if (RT_SUCCESS(rc)) + rc = PGMR0DynMapAssertIntegrity(); + if (RT_FAILURE(rc)) + break; + } + ASMIntEnable(); + if (rc == VERR_PGM_DYNMAP_FULL_SET) + { + /* flush the set. */ + LogRel(("Test #5\n")); + ASMIntDisable(); + PGMR0DynMapMigrateAutoSet(&pVM->aCpus[0]); + PGMRZDynMapReleaseAutoSet(&pVM->aCpus[0]); + PGMRZDynMapStartAutoSet(&pVM->aCpus[0]); + ASMIntEnable(); + + rc = PGMR0DynMapAssertIntegrity(); + } + else + { + LogRel(("failed(%d): rc=%Rrc, wanted %d ; pv2=%p Set=%u/%u; i=%d\n", __LINE__, + rc, VERR_PGM_DYNMAP_FULL_SET, pv2, pSet->cEntries, RT_ELEMENTS(pSet->aEntries), i)); + if (RT_SUCCESS(rc)) rc = VERR_PGM_DYNMAP_IPE; + } + } + } + } + else + { + LogRel(("failed(%d): rc=%Rrc rc2=%Rrc; pv=%p pv2=%p\n", __LINE__, rc, rc2, pv, pv2)); + if (RT_SUCCESS(rc)) + rc = rc2; + } + + /* + * Check that everyone sees the same stuff. + */ + if (RT_SUCCESS(rc)) + { + LogRel(("Test #5\n")); + ASMIntDisable(); + PGMR0DynMapMigrateAutoSet(&pVM->aCpus[0]); + RTHCPHYS HCPhysPT = RTR0MemObjGetPagePhysAddr(pThis->pSegHead->ahMemObjPTs[0], 0); + rc = pgmRZDynMapHCPageCommon(pVM, HCPhysPT, &pv RTLOG_COMMA_SRC_POS); + if (RT_SUCCESS(rc)) + { + PGMR0DYNMAPTEST Test; + uint32_t *pu32Real = &pThis->paPages[pThis->pSegHead->iPage].uPte.pLegacy->u; + Test.pu32 = (uint32_t *)((uintptr_t)pv | ((uintptr_t)pu32Real & PAGE_OFFSET_MASK)); + Test.u32Expect = *pu32Real; + ASMAtomicWriteU32(&Test.cFailures, 0); + ASMIntEnable(); + + rc = RTMpOnAll(pgmR0DynMapTest3PerCpu, &Test, NULL); + if (RT_FAILURE(rc)) + LogRel(("failed(%d): RTMpOnAll rc=%Rrc\n", __LINE__, rc)); + else if (Test.cFailures) + { + LogRel(("failed(%d): cFailures=%d pu32Real=%p pu32=%p u32Expect=%#x *pu32=%#x\n", __LINE__, + Test.cFailures, pu32Real, Test.pu32, Test.u32Expect, *Test.pu32)); + rc = VERR_PGM_DYNMAP_IPE; + } + else + LogRel(("pu32Real=%p pu32=%p u32Expect=%#x *pu32=%#x\n", + pu32Real, Test.pu32, Test.u32Expect, *Test.pu32)); + } + else + { + ASMIntEnable(); + LogRel(("failed(%d): rc=%Rrc\n", rc)); + } + } + + /* + * Clean up. + */ + LogRel(("Cleanup.\n")); + ASMIntDisable(); + PGMR0DynMapMigrateAutoSet(&pVM->aCpus[0]); + PGMRZDynMapFlushAutoSet(&pVM->aCpus[0]); + PGMRZDynMapReleaseAutoSet(&pVM->aCpus[0]); + ASMIntEnable(); + + if (RT_SUCCESS(rc)) + rc = PGMR0DynMapAssertIntegrity(); + else + PGMR0DynMapAssertIntegrity(); + + g_fPGMR0DynMapTestRunning = false; + LogRel(("Result: rc=%Rrc Load=%u/%u/%u Set=%#x/%u\n", rc, + pThis->cLoad, pThis->cMaxLoad, pThis->cPages - pThis->cPages, pSet->cEntries, RT_ELEMENTS(pSet->aEntries))); + pVM->pgm.s.pvR0DynMapUsed = pvR0DynMapUsedSaved; + LogRel(("pgmR0DynMapTest: ****** END ******\n")); + return rc; +} +#endif /* DEBUG */ + diff --git a/src/VBox/VMM/VMMRZ/VMMRZ.cpp b/src/VBox/VMM/VMMRZ/VMMRZ.cpp new file mode 100644 index 00000000..fd8ccd5c --- /dev/null +++ b/src/VBox/VMM/VMMRZ/VMMRZ.cpp @@ -0,0 +1,253 @@ +/* $Id: VMMRZ.cpp $ */ +/** @file + * VMM - Virtual Machine Monitor, Raw-mode and ring-0 context code. + */ + +/* + * Copyright (C) 2009-2020 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. 
This file is free software;
+ * you can redistribute it and/or modify it under the terms of the GNU
+ * General Public License (GPL) as published by the Free Software
+ * Foundation, in version 2 as it comes in the "COPYING" file of the
+ * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+ * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ */
+
+
+/*********************************************************************************************************************************
+*   Header Files                                                                                                                 *
+*********************************************************************************************************************************/
+#define LOG_GROUP LOG_GROUP_VMM
+#include <VBox/vmm/vmm.h>
+#include "VMMInternal.h"
+#include <VBox/vmm/vmcc.h>
+#include <VBox/err.h>
+
+#include <iprt/assert.h>
+#include <iprt/asm-amd64-x86.h>
+#include <iprt/string.h>
+
+
+/**
+ * Calls the ring-3 host code.
+ *
+ * @returns VBox status code of the ring-3 call.
+ * @retval  VERR_VMM_RING3_CALL_DISABLED if called at the wrong time. This must
+ *          be passed up the stack, or if that isn't possible then VMMRZCallRing3
+ *          needs to change it into an assertion.
+ *
+ * @param   pVM             The cross context VM structure.
+ * @param   pVCpu           The cross context virtual CPU structure of the calling EMT.
+ * @param   enmOperation    The operation.
+ * @param   uArg            The argument to the operation.
+ */
+VMMRZDECL(int) VMMRZCallRing3(PVMCC pVM, PVMCPUCC pVCpu, VMMCALLRING3 enmOperation, uint64_t uArg)
+{
+    VMCPU_ASSERT_EMT(pVCpu);
+
+    /*
+     * Check if calling ring-3 has been disabled and only let fatal calls through.
+     */
+    if (RT_UNLIKELY(   pVCpu->vmm.s.cCallRing3Disabled != 0
+                    && enmOperation != VMMCALLRING3_VM_R0_ASSERTION))
+    {
+#ifndef IN_RING0
+        /*
+         * In most cases, it's sufficient to return a status code which
+         * will then be propagated up the code, usually encountering several
+         * AssertRC invocations along the way. Hitting one of those is more
+         * helpful than stopping here.
+         *
+         * However, some callers don't check the status code because they are
+         * called from void functions, and for these we'll turn this into a
+         * ring-0 assertion host call.
+         */
+        if (enmOperation != VMMCALLRING3_REM_REPLAY_HANDLER_NOTIFICATIONS)
+            return VERR_VMM_RING3_CALL_DISABLED;
+#endif
+#ifdef IN_RC
+        RTStrPrintf(g_szRTAssertMsg1, sizeof(pVM->vmm.s.szRing0AssertMsg1),
+                    "VMMRZCallRing3: enmOperation=%d uArg=%#llx idCpu=%#x cCallRing3Disabled=%#x\n",
+                    enmOperation, uArg, pVCpu->idCpu, pVCpu->vmm.s.cCallRing3Disabled);
+#endif
+        RTStrPrintf(pVM->vmm.s.szRing0AssertMsg1, sizeof(pVM->vmm.s.szRing0AssertMsg1),
+                    "VMMRZCallRing3: enmOperation=%d uArg=%#llx idCpu=%#x cCallRing3Disabled=%#x\n",
+                    enmOperation, uArg, pVCpu->idCpu, pVCpu->vmm.s.cCallRing3Disabled);
+        enmOperation = VMMCALLRING3_VM_R0_ASSERTION;
+    }
+
+    /*
+     * The normal path.
+     */
+/** @todo profile this! */
+    pVCpu->vmm.s.enmCallRing3Operation = enmOperation;
+    pVCpu->vmm.s.u64CallRing3Arg       = uArg;
+    pVCpu->vmm.s.rcCallRing3           = VERR_VMM_RING3_CALL_NO_RC;
+#ifdef IN_RC
+    pVM->vmm.s.pfnRCToHost(VINF_VMM_CALL_HOST);
+#else
+    int rc;
+    if (pVCpu->vmm.s.pfnCallRing3CallbackR0)
+    {
+        rc = pVCpu->vmm.s.pfnCallRing3CallbackR0(pVCpu, enmOperation, pVCpu->vmm.s.pvCallRing3CallbackUserR0);
+        if (RT_FAILURE(rc))
+            return rc;
+    }
+    rc = vmmR0CallRing3LongJmp(&pVCpu->vmm.s.CallRing3JmpBufR0, VINF_VMM_CALL_HOST);
+    if (RT_FAILURE(rc))
+        return rc;
+#endif
+    return pVCpu->vmm.s.rcCallRing3;
+}
+
+
+/**
+ * Simple wrapper that adds the pVCpu argument.
+ *
+ * @returns VBox status code of the ring-3 call.
+ * @retval  VERR_VMM_RING3_CALL_DISABLED if called at the wrong time. This must
+ *          be passed up the stack, or if that isn't possible then VMMRZCallRing3
+ *          needs to change it into an assertion.
+ *
+ * @param   pVM             The cross context VM structure.
+ * @param   enmOperation    The operation.
+ * @param   uArg            The argument to the operation.
+ */
+VMMRZDECL(int) VMMRZCallRing3NoCpu(PVMCC pVM, VMMCALLRING3 enmOperation, uint64_t uArg)
+{
+    return VMMRZCallRing3(pVM, VMMGetCpu(pVM), enmOperation, uArg);
+}
+
+
+/**
+ * Disables all host calls, except certain fatal ones.
+ *
+ * @param   pVCpu   The cross context virtual CPU structure of the calling EMT.
+ * @thread  EMT.
+ */
+VMMRZDECL(void) VMMRZCallRing3Disable(PVMCPUCC pVCpu)
+{
+    VMCPU_ASSERT_EMT(pVCpu);
+#if defined(LOG_ENABLED) && defined(IN_RING0)
+    RTCCUINTREG fFlags = ASMIntDisableFlags(); /* preemption consistency. */
+#endif
+
+    Assert(pVCpu->vmm.s.cCallRing3Disabled < 16);
+    if (ASMAtomicUoIncU32(&pVCpu->vmm.s.cCallRing3Disabled) == 1)
+    {
+        /** @todo it might make more sense to just disable logging here, then we
+         * won't flush away important bits... but that goes both ways really. */
+#ifdef IN_RC
+        pVCpu->pVMRC->vmm.s.fRCLoggerFlushingDisabled = true;
+#else
+# ifdef LOG_ENABLED
+        if (pVCpu->vmm.s.pR0LoggerR0)
+            pVCpu->vmm.s.pR0LoggerR0->fFlushingDisabled = true;
+# endif
+        if (pVCpu->vmm.s.pR0RelLoggerR0)
+            pVCpu->vmm.s.pR0RelLoggerR0->fFlushingDisabled = true;
+#endif
+    }
+
+#if defined(LOG_ENABLED) && defined(IN_RING0)
+    ASMSetFlags(fFlags);
+#endif
+}
+
+
+/**
+ * Counters VMMRZCallRing3Disable() and re-enables host calls.
+ *
+ * @param   pVCpu   The cross context virtual CPU structure of the calling EMT.
+ * @thread  EMT.
+ */
+VMMRZDECL(void) VMMRZCallRing3Enable(PVMCPUCC pVCpu)
+{
+    VMCPU_ASSERT_EMT(pVCpu);
+#if defined(LOG_ENABLED) && defined(IN_RING0)
+    RTCCUINTREG fFlags = ASMIntDisableFlags(); /* preemption consistency. */
+#endif
+
+    Assert(pVCpu->vmm.s.cCallRing3Disabled > 0);
+    if (ASMAtomicUoDecU32(&pVCpu->vmm.s.cCallRing3Disabled) == 0)
+    {
+#ifdef IN_RC
+        pVCpu->pVMRC->vmm.s.fRCLoggerFlushingDisabled = false;
+#else
+# ifdef LOG_ENABLED
+        if (pVCpu->vmm.s.pR0LoggerR0)
+            pVCpu->vmm.s.pR0LoggerR0->fFlushingDisabled = false;
+# endif
+        if (pVCpu->vmm.s.pR0RelLoggerR0)
+            pVCpu->vmm.s.pR0RelLoggerR0->fFlushingDisabled = false;
+#endif
+    }
+
+#if defined(LOG_ENABLED) && defined(IN_RING0)
+    ASMSetFlags(fFlags);
+#endif
+}
+
+
+/**
+ * Checks whether it's possible to call host context or not.
+ *
+ * @returns true if it's safe, false if it isn't.
+ * @param   pVCpu   The cross context virtual CPU structure of the calling EMT.
+ */
+VMMRZDECL(bool) VMMRZCallRing3IsEnabled(PVMCPUCC pVCpu)
+{
+    VMCPU_ASSERT_EMT(pVCpu);
+    Assert(pVCpu->vmm.s.cCallRing3Disabled <= 16);
+    return pVCpu->vmm.s.cCallRing3Disabled == 0;
+}
+
+
+/**
+ * Sets the ring-0 callback before doing the ring-3 call.
+ *
+ * @param   pVCpu       The cross context virtual CPU structure.
+ * @param   pfnCallback Pointer to the callback.
+ * @param   pvUser      The user argument.
+ *
+ * @return VBox status code.
+ */
+VMMRZDECL(int) VMMRZCallRing3SetNotification(PVMCPUCC pVCpu, R0PTRTYPE(PFNVMMR0CALLRING3NOTIFICATION) pfnCallback, RTR0PTR pvUser)
+{
+    AssertPtrReturn(pVCpu, VERR_INVALID_POINTER);
+    AssertPtrReturn(pfnCallback, VERR_INVALID_POINTER);
+
+    if (pVCpu->vmm.s.pfnCallRing3CallbackR0)
+        return VERR_ALREADY_EXISTS;
+
+    pVCpu->vmm.s.pfnCallRing3CallbackR0    = pfnCallback;
+    pVCpu->vmm.s.pvCallRing3CallbackUserR0 = pvUser;
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Removes the ring-0 callback.
+ *
+ * @param   pVCpu   The cross context virtual CPU structure.
+ */
+VMMRZDECL(void) VMMRZCallRing3RemoveNotification(PVMCPUCC pVCpu)
+{
+    pVCpu->vmm.s.pfnCallRing3CallbackR0 = NULL;
+}
+
+
+/**
+ * Checks whether there is a ring-0 callback notification active.
+ *
+ * @param   pVCpu   The cross context virtual CPU structure.
+ * @returns true if the notification is active, false otherwise.
+ */
+VMMRZDECL(bool) VMMRZCallRing3IsNotificationSet(PVMCPUCC pVCpu)
+{
+    return pVCpu->vmm.s.pfnCallRing3CallbackR0 != NULL;
+}
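The ring-0 notification API shown above (VMMRZCallRing3SetNotification / VMMRZCallRing3RemoveNotification) is intended to be used in matched pairs around ring-0 code that can end up long-jumping to ring-3 via VMMRZCallRing3. The sketch below is illustrative only and not part of the change set: the callback parameter list is inferred from the pfnCallRing3CallbackR0 invocation inside VMMRZCallRing3, and the hmR0Example* names are hypothetical.

/* Minimal usage sketch, assuming the callback signature matches the
 * pfnCallRing3CallbackR0 invocation in VMMRZCallRing3 above; the
 * hmR0Example* identifiers are hypothetical. */
#include <VBox/vmm/vmm.h>
#include <VBox/err.h>

/** Hypothetical notification callback: VMMRZCallRing3 invokes it right before
 *  long-jumping to ring-3, giving the owner a chance to save or undo per-CPU
 *  state that must not survive the trip. */
static DECLCALLBACK(int) hmR0ExampleCallRing3Notification(PVMCPUCC pVCpu, VMMCALLRING3 enmOperation, void *pvUser)
{
    RT_NOREF(pVCpu, enmOperation, pvUser);
    /* ... undo per-CPU setup here before control reaches ring-3 ... */
    return VINF_SUCCESS;
}

/** Hypothetical worker showing the register/unregister pairing. */
static int hmR0ExampleDoWork(PVMCPUCC pVCpu)
{
    /* Only one notification may be active at a time; a second registration
       fails with VERR_ALREADY_EXISTS. */
    int rc = VMMRZCallRing3SetNotification(pVCpu, hmR0ExampleCallRing3Notification, NIL_RTR0PTR);
    if (RT_SUCCESS(rc))
    {
        /* ... work that may call VMMRZCallRing3 and thus reach ring-3 ... */
        VMMRZCallRing3RemoveNotification(pVCpu);
    }
    return rc;
}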