diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 03:01:46 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 03:01:46 +0000 |
commit | f8fe689a81f906d1b91bb3220acde2a4ecb14c5b (patch) | |
tree | 26484e9d7e2c67806c2d1760196ff01aaa858e8c /src/VBox/VMM/VMMR0 | |
parent | Initial commit. (diff) | |
download | virtualbox-f8fe689a81f906d1b91bb3220acde2a4ecb14c5b.tar.xz virtualbox-f8fe689a81f906d1b91bb3220acde2a4ecb14c5b.zip |
Adding upstream version 6.0.4-dfsg. (refs: upstream/6.0.4-dfsg, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
31 files changed, 45952 insertions, 0 deletions
diff --git a/src/VBox/VMM/VMMR0/CPUMR0.cpp b/src/VBox/VMM/VMMR0/CPUMR0.cpp new file mode 100644 index 00000000..e7afcbac --- /dev/null +++ b/src/VBox/VMM/VMMR0/CPUMR0.cpp @@ -0,0 +1,1009 @@ +/* $Id: CPUMR0.cpp $ */ +/** @file + * CPUM - Host Context Ring 0. + */ + +/* + * Copyright (C) 2006-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_CPUM +#include <VBox/vmm/cpum.h> +#include "CPUMInternal.h" +#include <VBox/vmm/vm.h> +#include <VBox/err.h> +#include <VBox/log.h> +#include <VBox/vmm/hm.h> +#include <iprt/assert.h> +#include <iprt/asm-amd64-x86.h> +#ifdef VBOX_WITH_VMMR0_DISABLE_LAPIC_NMI +# include <iprt/mem.h> +# include <iprt/memobj.h> +# include <VBox/apic.h> +#endif +#include <iprt/x86.h> + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +#ifdef VBOX_WITH_VMMR0_DISABLE_LAPIC_NMI +/** + * Local APIC mappings. + */ +typedef struct CPUMHOSTLAPIC +{ + /** Indicates that the entry is in use and have valid data. 
*/ + bool fEnabled; + /** Whether it's operating in X2APIC mode (EXTD). */ + bool fX2Apic; + /** The APIC version number. */ + uint32_t uVersion; + /** The physical address of the APIC registers. */ + RTHCPHYS PhysBase; + /** The memory object entering the physical address. */ + RTR0MEMOBJ hMemObj; + /** The mapping object for hMemObj. */ + RTR0MEMOBJ hMapObj; + /** The mapping address APIC registers. + * @remarks Different CPUs may use the same physical address to map their + * APICs, so this pointer is only valid when on the CPU owning the + * APIC. */ + void *pv; +} CPUMHOSTLAPIC; +#endif + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +#ifdef VBOX_WITH_VMMR0_DISABLE_LAPIC_NMI +static CPUMHOSTLAPIC g_aLApics[RTCPUSET_MAX_CPUS]; +#endif + +/** + * CPUID bits to unify among all cores. + */ +static struct +{ + uint32_t uLeaf; /**< Leaf to check. */ + uint32_t uEcx; /**< which bits in ecx to unify between CPUs. */ + uint32_t uEdx; /**< which bits in edx to unify between CPUs. */ +} +const g_aCpuidUnifyBits[] = +{ + { + 0x00000001, + X86_CPUID_FEATURE_ECX_CX16 | X86_CPUID_FEATURE_ECX_MONITOR, + X86_CPUID_FEATURE_EDX_CX8 + } +}; + + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +#ifdef VBOX_WITH_VMMR0_DISABLE_LAPIC_NMI +static int cpumR0MapLocalApics(void); +static void cpumR0UnmapLocalApics(void); +#endif +static int cpumR0SaveHostDebugState(PVMCPU pVCpu); + + +/** + * Does the Ring-0 CPU initialization once during module load. + * XXX Host-CPU hot-plugging? 
+ */ +VMMR0_INT_DECL(int) CPUMR0ModuleInit(void) +{ + int rc = VINF_SUCCESS; +#ifdef VBOX_WITH_VMMR0_DISABLE_LAPIC_NMI + rc = cpumR0MapLocalApics(); +#endif + return rc; +} + + +/** + * Terminate the module. + */ +VMMR0_INT_DECL(int) CPUMR0ModuleTerm(void) +{ +#ifdef VBOX_WITH_VMMR0_DISABLE_LAPIC_NMI + cpumR0UnmapLocalApics(); +#endif + return VINF_SUCCESS; +} + + +/** + * Check the CPUID features of this particular CPU and disable relevant features + * for the guest which do not exist on this CPU. We have seen systems where the + * X86_CPUID_FEATURE_ECX_MONITOR feature flag is only set on some host CPUs, see + * @bugref{5436}. + * + * @note This function might be called simultaneously on more than one CPU! + * + * @param idCpu The identifier for the CPU the function is called on. + * @param pvUser1 Pointer to the VM structure. + * @param pvUser2 Ignored. + */ +static DECLCALLBACK(void) cpumR0CheckCpuid(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + PVM pVM = (PVM)pvUser1; + + NOREF(idCpu); NOREF(pvUser2); + for (uint32_t i = 0; i < RT_ELEMENTS(g_aCpuidUnifyBits); i++) + { + /* Note! Cannot use cpumCpuIdGetLeaf from here because we're not + necessarily in the VM process context. So, we using the + legacy arrays as temporary storage. 
*/ + + uint32_t uLeaf = g_aCpuidUnifyBits[i].uLeaf; + PCPUMCPUID pLegacyLeaf; + if (uLeaf < RT_ELEMENTS(pVM->cpum.s.aGuestCpuIdPatmStd)) + pLegacyLeaf = &pVM->cpum.s.aGuestCpuIdPatmStd[uLeaf]; + else if (uLeaf - UINT32_C(0x80000000) < RT_ELEMENTS(pVM->cpum.s.aGuestCpuIdPatmExt)) + pLegacyLeaf = &pVM->cpum.s.aGuestCpuIdPatmExt[uLeaf - UINT32_C(0x80000000)]; + else if (uLeaf - UINT32_C(0xc0000000) < RT_ELEMENTS(pVM->cpum.s.aGuestCpuIdPatmCentaur)) + pLegacyLeaf = &pVM->cpum.s.aGuestCpuIdPatmCentaur[uLeaf - UINT32_C(0xc0000000)]; + else + continue; + + uint32_t eax, ebx, ecx, edx; + ASMCpuIdExSlow(uLeaf, 0, 0, 0, &eax, &ebx, &ecx, &edx); + + ASMAtomicAndU32(&pLegacyLeaf->uEcx, ecx | ~g_aCpuidUnifyBits[i].uEcx); + ASMAtomicAndU32(&pLegacyLeaf->uEdx, edx | ~g_aCpuidUnifyBits[i].uEdx); + } +} + + +/** + * Does Ring-0 CPUM initialization. + * + * This is mainly to check that the Host CPU mode is compatible + * with VBox. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMR0_INT_DECL(int) CPUMR0InitVM(PVM pVM) +{ + LogFlow(("CPUMR0Init: %p\n", pVM)); + + /* + * Check CR0 & CR4 flags. + */ + uint32_t u32CR0 = ASMGetCR0(); + if ((u32CR0 & (X86_CR0_PE | X86_CR0_PG)) != (X86_CR0_PE | X86_CR0_PG)) /* a bit paranoid perhaps.. */ + { + Log(("CPUMR0Init: PE or PG not set. cr0=%#x\n", u32CR0)); + return VERR_UNSUPPORTED_CPU_MODE; + } + + /* + * Check for sysenter and syscall usage. + */ + if (ASMHasCpuId()) + { + /* + * SYSENTER/SYSEXIT + * + * Intel docs claim you should test both the flag and family, model & + * stepping because some Pentium Pro CPUs have the SEP cpuid flag set, + * but don't support it. AMD CPUs may support this feature in legacy + * mode, they've banned it from long mode. Since we switch to 32-bit + * mode when entering raw-mode context the feature would become + * accessible again on AMD CPUs, so we have to check regardless of + * host bitness. 
+ */ + uint32_t u32CpuVersion; + uint32_t u32Dummy; + uint32_t fFeatures; /* (Used further down to check for MSRs, so don't clobber.) */ + ASMCpuId(1, &u32CpuVersion, &u32Dummy, &u32Dummy, &fFeatures); + uint32_t const u32Family = u32CpuVersion >> 8; + uint32_t const u32Model = (u32CpuVersion >> 4) & 0xF; + uint32_t const u32Stepping = u32CpuVersion & 0xF; + if ( (fFeatures & X86_CPUID_FEATURE_EDX_SEP) + && ( u32Family != 6 /* (> pentium pro) */ + || u32Model >= 3 + || u32Stepping >= 3 + || !ASMIsIntelCpu()) + ) + { + /* + * Read the MSR and see if it's in use or not. + */ + uint32_t u32 = ASMRdMsr_Low(MSR_IA32_SYSENTER_CS); + if (u32) + { + pVM->cpum.s.fHostUseFlags |= CPUM_USE_SYSENTER; + Log(("CPUMR0Init: host uses sysenter cs=%08x%08x\n", ASMRdMsr_High(MSR_IA32_SYSENTER_CS), u32)); + } + } + + /* + * SYSCALL/SYSRET + * + * This feature is indicated by the SEP bit returned in EDX by CPUID + * function 0x80000001. Intel CPUs only supports this feature in + * long mode. Since we're not running 64-bit guests in raw-mode there + * are no issues with 32-bit intel hosts. + */ + uint32_t cExt = 0; + ASMCpuId(0x80000000, &cExt, &u32Dummy, &u32Dummy, &u32Dummy); + if (ASMIsValidExtRange(cExt)) + { + uint32_t fExtFeaturesEDX = ASMCpuId_EDX(0x80000001); + if (fExtFeaturesEDX & X86_CPUID_EXT_FEATURE_EDX_SYSCALL) + { +#ifdef RT_ARCH_X86 + if (!ASMIsIntelCpu()) +#endif + { + uint64_t fEfer = ASMRdMsr(MSR_K6_EFER); + if (fEfer & MSR_K6_EFER_SCE) + { + pVM->cpum.s.fHostUseFlags |= CPUM_USE_SYSCALL; + Log(("CPUMR0Init: host uses syscall\n")); + } + } + } + } + + /* + * Copy MSR_IA32_ARCH_CAPABILITIES bits over into the host feature structure. 
+ */ + pVM->cpum.s.HostFeatures.fArchRdclNo = 0; + pVM->cpum.s.HostFeatures.fArchIbrsAll = 0; + pVM->cpum.s.HostFeatures.fArchRsbOverride = 0; + pVM->cpum.s.HostFeatures.fArchVmmNeedNotFlushL1d = 0; + uint32_t const cStdRange = ASMCpuId_EAX(0); + if ( ASMIsValidStdRange(cStdRange) + && cStdRange >= 7) + { + uint32_t fEdxFeatures = ASMCpuId_EDX(7); + if ( (fEdxFeatures & X86_CPUID_STEXT_FEATURE_EDX_ARCHCAP) + && (fFeatures & X86_CPUID_FEATURE_EDX_MSR)) + { + uint64_t const fArchVal = ASMRdMsr(MSR_IA32_ARCH_CAPABILITIES); + pVM->cpum.s.HostFeatures.fArchRdclNo = RT_BOOL(fArchVal & MSR_IA32_ARCH_CAP_F_RDCL_NO); + pVM->cpum.s.HostFeatures.fArchIbrsAll = RT_BOOL(fArchVal & MSR_IA32_ARCH_CAP_F_IBRS_ALL); + pVM->cpum.s.HostFeatures.fArchRsbOverride = RT_BOOL(fArchVal & MSR_IA32_ARCH_CAP_F_RSBO); + pVM->cpum.s.HostFeatures.fArchVmmNeedNotFlushL1d = RT_BOOL(fArchVal & MSR_IA32_ARCH_CAP_F_VMM_NEED_NOT_FLUSH_L1D); + } + else + pVM->cpum.s.HostFeatures.fArchCap = 0; + } + + /* + * Unify/cross check some CPUID feature bits on all available CPU cores + * and threads. We've seen CPUs where the monitor support differed. + * + * Because the hyper heap isn't always mapped into ring-0, we cannot + * access it from a RTMpOnAll callback. We use the legacy CPUID arrays + * as temp ring-0 accessible memory instead, ASSUMING that they're all + * up to date when we get here. 
+ */ + RTMpOnAll(cpumR0CheckCpuid, pVM, NULL); + + for (uint32_t i = 0; i < RT_ELEMENTS(g_aCpuidUnifyBits); i++) + { + bool fIgnored; + uint32_t uLeaf = g_aCpuidUnifyBits[i].uLeaf; + PCPUMCPUIDLEAF pLeaf = cpumCpuIdGetLeafEx(pVM, uLeaf, 0, &fIgnored); + if (pLeaf) + { + PCPUMCPUID pLegacyLeaf; + if (uLeaf < RT_ELEMENTS(pVM->cpum.s.aGuestCpuIdPatmStd)) + pLegacyLeaf = &pVM->cpum.s.aGuestCpuIdPatmStd[uLeaf]; + else if (uLeaf - UINT32_C(0x80000000) < RT_ELEMENTS(pVM->cpum.s.aGuestCpuIdPatmExt)) + pLegacyLeaf = &pVM->cpum.s.aGuestCpuIdPatmExt[uLeaf - UINT32_C(0x80000000)]; + else if (uLeaf - UINT32_C(0xc0000000) < RT_ELEMENTS(pVM->cpum.s.aGuestCpuIdPatmCentaur)) + pLegacyLeaf = &pVM->cpum.s.aGuestCpuIdPatmCentaur[uLeaf - UINT32_C(0xc0000000)]; + else + continue; + + pLeaf->uEcx = pLegacyLeaf->uEcx; + pLeaf->uEdx = pLegacyLeaf->uEdx; + } + } + + } + + + /* + * Check if debug registers are armed. + * This ASSUMES that DR7.GD is not set, or that it's handled transparently! + */ + uint32_t u32DR7 = ASMGetDR7(); + if (u32DR7 & X86_DR7_ENABLED_MASK) + { + for (VMCPUID i = 0; i < pVM->cCpus; i++) + pVM->aCpus[i].cpum.s.fUseFlags |= CPUM_USE_DEBUG_REGS_HOST; + Log(("CPUMR0Init: host uses debug registers (dr7=%x)\n", u32DR7)); + } + + return VINF_SUCCESS; +} + + +/** + * Trap handler for device-not-available fault (\#NM). + * Device not available, FP or (F)WAIT instruction. + * + * @returns VBox status code. + * @retval VINF_SUCCESS if the guest FPU state is loaded. + * @retval VINF_EM_RAW_GUEST_TRAP if it is a guest trap. + * @retval VINF_CPUM_HOST_CR0_MODIFIED if we modified the host CR0. + * + * @param pVM The cross context VM structure. + * @param pVCpu The cross context virtual CPU structure. + */ +VMMR0_INT_DECL(int) CPUMR0Trap07Handler(PVM pVM, PVMCPU pVCpu) +{ + Assert(pVM->cpum.s.HostFeatures.fFxSaveRstor); + Assert(ASMGetCR4() & X86_CR4_OSFXSR); + + /* If the FPU state has already been loaded, then it's a guest trap. 
*/ + if (CPUMIsGuestFPUStateActive(pVCpu)) + { + Assert( ((pVCpu->cpum.s.Guest.cr0 & (X86_CR0_MP | X86_CR0_EM | X86_CR0_TS)) == (X86_CR0_MP | X86_CR0_TS)) + || ((pVCpu->cpum.s.Guest.cr0 & (X86_CR0_MP | X86_CR0_EM | X86_CR0_TS)) == (X86_CR0_MP | X86_CR0_TS | X86_CR0_EM))); + return VINF_EM_RAW_GUEST_TRAP; + } + + /* + * There are two basic actions: + * 1. Save host fpu and restore guest fpu. + * 2. Generate guest trap. + * + * When entering the hypervisor we'll always enable MP (for proper wait + * trapping) and TS (for intercepting all fpu/mmx/sse stuff). The EM flag + * is taken from the guest OS in order to get proper SSE handling. + * + * + * Actions taken depending on the guest CR0 flags: + * + * 3 2 1 + * TS | EM | MP | FPUInstr | WAIT :: VMM Action + * ------------------------------------------------------------------------ + * 0 | 0 | 0 | Exec | Exec :: Clear TS & MP, Save HC, Load GC. + * 0 | 0 | 1 | Exec | Exec :: Clear TS, Save HC, Load GC. + * 0 | 1 | 0 | #NM | Exec :: Clear TS & MP, Save HC, Load GC. + * 0 | 1 | 1 | #NM | Exec :: Clear TS, Save HC, Load GC. + * 1 | 0 | 0 | #NM | Exec :: Clear MP, Save HC, Load GC. (EM is already cleared.) + * 1 | 0 | 1 | #NM | #NM :: Go to guest taking trap there. + * 1 | 1 | 0 | #NM | Exec :: Clear MP, Save HC, Load GC. (EM is already set.) + * 1 | 1 | 1 | #NM | #NM :: Go to guest taking trap there. + */ + + switch (pVCpu->cpum.s.Guest.cr0 & (X86_CR0_MP | X86_CR0_EM | X86_CR0_TS)) + { + case X86_CR0_MP | X86_CR0_TS: + case X86_CR0_MP | X86_CR0_TS | X86_CR0_EM: + return VINF_EM_RAW_GUEST_TRAP; + default: + break; + } + + return CPUMR0LoadGuestFPU(pVM, pVCpu); +} + + +/** + * Saves the host-FPU/XMM state (if necessary) and (always) loads the guest-FPU + * state into the CPU. + * + * @returns VINF_SUCCESS on success, host CR0 unmodified. + * @returns VINF_CPUM_HOST_CR0_MODIFIED on success when the host CR0 was + * modified and VT-x needs to update the value in the VMCS. + * + * @param pVM The cross context VM structure. 
+ * @param pVCpu The cross context virtual CPU structure. + */ +VMMR0_INT_DECL(int) CPUMR0LoadGuestFPU(PVM pVM, PVMCPU pVCpu) +{ + int rc = VINF_SUCCESS; + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(!(pVCpu->cpum.s.fUseFlags & CPUM_USED_FPU_GUEST)); + Assert(!(pVCpu->cpum.s.fUseFlags & CPUM_SYNC_FPU_STATE)); + +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + if (CPUMIsGuestInLongModeEx(&pVCpu->cpum.s.Guest)) + { + Assert(!(pVCpu->cpum.s.fUseFlags & CPUM_USED_MANUAL_XMM_RESTORE)); + + /* Save the host state if necessary. */ + if (!(pVCpu->cpum.s.fUseFlags & CPUM_USED_FPU_HOST)) + rc = cpumRZSaveHostFPUState(&pVCpu->cpum.s); + + /* Restore the state on entry as we need to be in 64-bit mode to access the full state. */ + pVCpu->cpum.s.fUseFlags |= CPUM_SYNC_FPU_STATE; + + Assert( (pVCpu->cpum.s.fUseFlags & (CPUM_USED_FPU_HOST | CPUM_USED_FPU_SINCE_REM)) + == (CPUM_USED_FPU_HOST | CPUM_USED_FPU_SINCE_REM)); + } + else +#endif + { + if (!pVM->cpum.s.HostFeatures.fLeakyFxSR) + { + Assert(!(pVCpu->cpum.s.fUseFlags & CPUM_USED_MANUAL_XMM_RESTORE)); + rc = cpumR0SaveHostRestoreGuestFPUState(&pVCpu->cpum.s); + } + else + { + Assert(!(pVCpu->cpum.s.fUseFlags & CPUM_USED_MANUAL_XMM_RESTORE) || (pVCpu->cpum.s.fUseFlags & CPUM_USED_FPU_HOST)); + /** @todo r=ramshankar: Can't we used a cached value here + * instead of reading the MSR? host EFER doesn't usually + * change. 
*/ + uint64_t uHostEfer = ASMRdMsr(MSR_K6_EFER); + if (!(uHostEfer & MSR_K6_EFER_FFXSR)) + rc = cpumR0SaveHostRestoreGuestFPUState(&pVCpu->cpum.s); + else + { + RTCCUINTREG const uSavedFlags = ASMIntDisableFlags(); + pVCpu->cpum.s.fUseFlags |= CPUM_USED_MANUAL_XMM_RESTORE; + ASMWrMsr(MSR_K6_EFER, uHostEfer & ~MSR_K6_EFER_FFXSR); + rc = cpumR0SaveHostRestoreGuestFPUState(&pVCpu->cpum.s); + ASMWrMsr(MSR_K6_EFER, uHostEfer | MSR_K6_EFER_FFXSR); + ASMSetFlags(uSavedFlags); + } + } + Assert( (pVCpu->cpum.s.fUseFlags & (CPUM_USED_FPU_GUEST | CPUM_USED_FPU_HOST | CPUM_USED_FPU_SINCE_REM)) + == (CPUM_USED_FPU_GUEST | CPUM_USED_FPU_HOST | CPUM_USED_FPU_SINCE_REM)); + } + return rc; +} + + +/** + * Saves the guest FPU/XMM state if needed, restores the host FPU/XMM state as + * needed. + * + * @returns true if we saved the guest state. + * @param pVCpu The cross context virtual CPU structure. + */ +VMMR0_INT_DECL(bool) CPUMR0FpuStateMaybeSaveGuestAndRestoreHost(PVMCPU pVCpu) +{ + bool fSavedGuest; + Assert(pVCpu->CTX_SUFF(pVM)->cpum.s.HostFeatures.fFxSaveRstor); + Assert(ASMGetCR4() & X86_CR4_OSFXSR); + if (pVCpu->cpum.s.fUseFlags & (CPUM_USED_FPU_GUEST | CPUM_USED_FPU_HOST)) + { + fSavedGuest = RT_BOOL(pVCpu->cpum.s.fUseFlags & CPUM_USED_FPU_GUEST); +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + if (CPUMIsGuestInLongModeEx(&pVCpu->cpum.s.Guest)) + { + if (pVCpu->cpum.s.fUseFlags & CPUM_USED_FPU_GUEST) + { + Assert(!(pVCpu->cpum.s.fUseFlags & CPUM_SYNC_FPU_STATE)); + HMR0SaveFPUState(pVCpu->CTX_SUFF(pVM), pVCpu, &pVCpu->cpum.s.Guest); + } + else + pVCpu->cpum.s.fUseFlags &= ~CPUM_SYNC_FPU_STATE; + cpumR0RestoreHostFPUState(&pVCpu->cpum.s); + } + else +#endif + { + if (!(pVCpu->cpum.s.fUseFlags & CPUM_USED_MANUAL_XMM_RESTORE)) + cpumR0SaveGuestRestoreHostFPUState(&pVCpu->cpum.s); + else + { + /* Temporarily clear MSR_K6_EFER_FFXSR or else we'll be unable to + save/restore the XMM state with fxsave/fxrstor. 
*/ + uint64_t uHostEfer = ASMRdMsr(MSR_K6_EFER); + if (uHostEfer & MSR_K6_EFER_FFXSR) + { + RTCCUINTREG const uSavedFlags = ASMIntDisableFlags(); + ASMWrMsr(MSR_K6_EFER, uHostEfer & ~MSR_K6_EFER_FFXSR); + cpumR0SaveGuestRestoreHostFPUState(&pVCpu->cpum.s); + ASMWrMsr(MSR_K6_EFER, uHostEfer | MSR_K6_EFER_FFXSR); + ASMSetFlags(uSavedFlags); + } + else + cpumR0SaveGuestRestoreHostFPUState(&pVCpu->cpum.s); + pVCpu->cpum.s.fUseFlags &= ~CPUM_USED_MANUAL_XMM_RESTORE; + } + } + } + else + fSavedGuest = false; + Assert(!( pVCpu->cpum.s.fUseFlags + & (CPUM_USED_FPU_GUEST | CPUM_USED_FPU_HOST | CPUM_SYNC_FPU_STATE | CPUM_USED_MANUAL_XMM_RESTORE))); + return fSavedGuest; +} + + +/** + * Saves the host debug state, setting CPUM_USED_HOST_DEBUG_STATE and loading + * DR7 with safe values. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + */ +static int cpumR0SaveHostDebugState(PVMCPU pVCpu) +{ + /* + * Save the host state. + */ + pVCpu->cpum.s.Host.dr0 = ASMGetDR0(); + pVCpu->cpum.s.Host.dr1 = ASMGetDR1(); + pVCpu->cpum.s.Host.dr2 = ASMGetDR2(); + pVCpu->cpum.s.Host.dr3 = ASMGetDR3(); + pVCpu->cpum.s.Host.dr6 = ASMGetDR6(); + /** @todo dr7 might already have been changed to 0x400; don't care right now as it's harmless. */ + pVCpu->cpum.s.Host.dr7 = ASMGetDR7(); + + /* Preemption paranoia. */ + ASMAtomicOrU32(&pVCpu->cpum.s.fUseFlags, CPUM_USED_DEBUG_REGS_HOST); + + /* + * Make sure DR7 is harmless or else we could trigger breakpoints when + * load guest or hypervisor DRx values later. + */ + if (pVCpu->cpum.s.Host.dr7 != X86_DR7_INIT_VAL) + ASMSetDR7(X86_DR7_INIT_VAL); + + return VINF_SUCCESS; +} + + +/** + * Saves the guest DRx state residing in host registers and restore the host + * register values. 
+ * + * The guest DRx state is only saved if CPUMR0LoadGuestDebugState was called, + * since it's assumed that we're shadowing the guest DRx register values + * accurately when using the combined hypervisor debug register values + * (CPUMR0LoadHyperDebugState). + * + * @returns true if either guest or hypervisor debug registers were loaded. + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + * @param fDr6 Whether to include DR6 or not. + * @thread EMT(pVCpu) + */ +VMMR0_INT_DECL(bool) CPUMR0DebugStateMaybeSaveGuestAndRestoreHost(PVMCPU pVCpu, bool fDr6) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + bool const fDrXLoaded = RT_BOOL(pVCpu->cpum.s.fUseFlags & (CPUM_USED_DEBUG_REGS_GUEST | CPUM_USED_DEBUG_REGS_HYPER)); + + /* + * Do we need to save the guest DRx registered loaded into host registers? + * (DR7 and DR6 (if fDr6 is true) are left to the caller.) + */ + if (pVCpu->cpum.s.fUseFlags & CPUM_USED_DEBUG_REGS_GUEST) + { +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + if (CPUMIsGuestInLongModeEx(&pVCpu->cpum.s.Guest)) + { + uint64_t uDr6 = pVCpu->cpum.s.Guest.dr[6]; + HMR0SaveDebugState(pVCpu->CTX_SUFF(pVM), pVCpu, &pVCpu->cpum.s.Guest); + if (!fDr6) + pVCpu->cpum.s.Guest.dr[6] = uDr6; + } + else +#endif + { + pVCpu->cpum.s.Guest.dr[0] = ASMGetDR0(); + pVCpu->cpum.s.Guest.dr[1] = ASMGetDR1(); + pVCpu->cpum.s.Guest.dr[2] = ASMGetDR2(); + pVCpu->cpum.s.Guest.dr[3] = ASMGetDR3(); + if (fDr6) + pVCpu->cpum.s.Guest.dr[6] = ASMGetDR6(); + } + } + ASMAtomicAndU32(&pVCpu->cpum.s.fUseFlags, ~( CPUM_USED_DEBUG_REGS_GUEST | CPUM_USED_DEBUG_REGS_HYPER + | CPUM_SYNC_DEBUG_REGS_GUEST | CPUM_SYNC_DEBUG_REGS_HYPER)); + + /* + * Restore the host's debug state. DR0-3, DR6 and only then DR7! + */ + if (pVCpu->cpum.s.fUseFlags & CPUM_USED_DEBUG_REGS_HOST) + { + /* A bit of paranoia first... 
*/ + uint64_t uCurDR7 = ASMGetDR7(); + if (uCurDR7 != X86_DR7_INIT_VAL) + ASMSetDR7(X86_DR7_INIT_VAL); + + ASMSetDR0(pVCpu->cpum.s.Host.dr0); + ASMSetDR1(pVCpu->cpum.s.Host.dr1); + ASMSetDR2(pVCpu->cpum.s.Host.dr2); + ASMSetDR3(pVCpu->cpum.s.Host.dr3); + /** @todo consider only updating if they differ, esp. DR6. Need to figure how + * expensive DRx reads are over DRx writes. */ + ASMSetDR6(pVCpu->cpum.s.Host.dr6); + ASMSetDR7(pVCpu->cpum.s.Host.dr7); + + ASMAtomicAndU32(&pVCpu->cpum.s.fUseFlags, ~CPUM_USED_DEBUG_REGS_HOST); + } + + return fDrXLoaded; +} + + +/** + * Saves the guest DRx state if it resides host registers. + * + * This does NOT clear any use flags, so the host registers remains loaded with + * the guest DRx state upon return. The purpose is only to make sure the values + * in the CPU context structure is up to date. + * + * @returns true if the host registers contains guest values, false if not. + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + * @param fDr6 Whether to include DR6 or not. + * @thread EMT(pVCpu) + */ +VMMR0_INT_DECL(bool) CPUMR0DebugStateMaybeSaveGuest(PVMCPU pVCpu, bool fDr6) +{ + /* + * Do we need to save the guest DRx registered loaded into host registers? + * (DR7 and DR6 (if fDr6 is true) are left to the caller.) + */ + if (pVCpu->cpum.s.fUseFlags & CPUM_USED_DEBUG_REGS_GUEST) + { +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + if (CPUMIsGuestInLongModeEx(&pVCpu->cpum.s.Guest)) + { + uint64_t uDr6 = pVCpu->cpum.s.Guest.dr[6]; + HMR0SaveDebugState(pVCpu->CTX_SUFF(pVM), pVCpu, &pVCpu->cpum.s.Guest); + if (!fDr6) + pVCpu->cpum.s.Guest.dr[6] = uDr6; + } + else +#endif + { + pVCpu->cpum.s.Guest.dr[0] = ASMGetDR0(); + pVCpu->cpum.s.Guest.dr[1] = ASMGetDR1(); + pVCpu->cpum.s.Guest.dr[2] = ASMGetDR2(); + pVCpu->cpum.s.Guest.dr[3] = ASMGetDR3(); + if (fDr6) + pVCpu->cpum.s.Guest.dr[6] = ASMGetDR6(); + } + return true; + } + return false; +} + + +/** + * Lazily sync in the debug state. 
+ * + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + * @param fDr6 Whether to include DR6 or not. + * @thread EMT(pVCpu) + */ +VMMR0_INT_DECL(void) CPUMR0LoadGuestDebugState(PVMCPU pVCpu, bool fDr6) +{ + /* + * Save the host state and disarm all host BPs. + */ + cpumR0SaveHostDebugState(pVCpu); + Assert(ASMGetDR7() == X86_DR7_INIT_VAL); + + /* + * Activate the guest state DR0-3. + * DR7 is not loaded here; DR6 is loaded only when fDr6 is true. + */ +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + if (CPUMIsGuestInLongModeEx(&pVCpu->cpum.s.Guest)) + ASMAtomicOrU32(&pVCpu->cpum.s.fUseFlags, CPUM_SYNC_DEBUG_REGS_GUEST); /* Postpone it to the world switch. */ + else +#endif + { + ASMSetDR0(pVCpu->cpum.s.Guest.dr[0]); + ASMSetDR1(pVCpu->cpum.s.Guest.dr[1]); + ASMSetDR2(pVCpu->cpum.s.Guest.dr[2]); + ASMSetDR3(pVCpu->cpum.s.Guest.dr[3]); + if (fDr6) + ASMSetDR6(pVCpu->cpum.s.Guest.dr[6]); + + ASMAtomicOrU32(&pVCpu->cpum.s.fUseFlags, CPUM_USED_DEBUG_REGS_GUEST); + } +} + + +/** + * Lazily sync in the hypervisor debug state. + * + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + * @param fDr6 Whether to include DR6 or not. + * @thread EMT(pVCpu) + */ +VMMR0_INT_DECL(void) CPUMR0LoadHyperDebugState(PVMCPU pVCpu, bool fDr6) +{ + /* + * Save the host state and disarm all host BPs. + */ + cpumR0SaveHostDebugState(pVCpu); + Assert(ASMGetDR7() == X86_DR7_INIT_VAL); + + /* + * Make sure the hypervisor values are up to date. + */ + CPUMRecalcHyperDRx(pVCpu, UINT8_MAX /* no loading, please */, true); + + /* + * Activate the hypervisor state DR0-3. + * DR7 is not loaded here; DR6 is only set (to X86_DR6_INIT_VAL) when fDr6 is true. + */ +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + if (CPUMIsGuestInLongModeEx(&pVCpu->cpum.s.Guest)) + ASMAtomicOrU32(&pVCpu->cpum.s.fUseFlags, CPUM_SYNC_DEBUG_REGS_HYPER); /* Postpone it. 
*/ + else +#endif + { + ASMSetDR0(pVCpu->cpum.s.Hyper.dr[0]); + ASMSetDR1(pVCpu->cpum.s.Hyper.dr[1]); + ASMSetDR2(pVCpu->cpum.s.Hyper.dr[2]); + ASMSetDR3(pVCpu->cpum.s.Hyper.dr[3]); + if (fDr6) + ASMSetDR6(X86_DR6_INIT_VAL); + + ASMAtomicOrU32(&pVCpu->cpum.s.fUseFlags, CPUM_USED_DEBUG_REGS_HYPER); + } +} + +#ifdef VBOX_WITH_VMMR0_DISABLE_LAPIC_NMI + +/** + * Per-CPU callback that probes the CPU for APIC support. + * + * @param idCpu The identifier for the CPU the function is called on. + * @param pvUser1 Ignored. + * @param pvUser2 Ignored. + */ +static DECLCALLBACK(void) cpumR0MapLocalApicCpuProber(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + NOREF(pvUser1); NOREF(pvUser2); + int iCpu = RTMpCpuIdToSetIndex(idCpu); + AssertReturnVoid(iCpu >= 0 && (unsigned)iCpu < RT_ELEMENTS(g_aLApics)); + + /* + * Check for APIC support. + */ + uint32_t uMaxLeaf, u32EBX, u32ECX, u32EDX; + ASMCpuId(0, &uMaxLeaf, &u32EBX, &u32ECX, &u32EDX); + if ( ( ASMIsIntelCpuEx(u32EBX, u32ECX, u32EDX) + || ASMIsAmdCpuEx(u32EBX, u32ECX, u32EDX) + || ASMIsViaCentaurCpuEx(u32EBX, u32ECX, u32EDX) + || ASMIsShanghaiCpuEx(u32EBX, u32ECX, u32EDX)) + && ASMIsValidStdRange(uMaxLeaf)) + { + uint32_t uDummy; + ASMCpuId(1, &uDummy, &u32EBX, &u32ECX, &u32EDX); + if ( (u32EDX & X86_CPUID_FEATURE_EDX_APIC) + && (u32EDX & X86_CPUID_FEATURE_EDX_MSR)) + { + /* + * Safe to access the MSR. Read it and calc the BASE (a little complicated). 
+ */ + uint64_t u64ApicBase = ASMRdMsr(MSR_IA32_APICBASE); + uint64_t u64Mask = MSR_IA32_APICBASE_BASE_MIN; + + /* see Intel Manual: Local APIC Status and Location: MAXPHYADDR default is bit 36 */ + uint32_t uMaxExtLeaf; + ASMCpuId(0x80000000, &uMaxExtLeaf, &u32EBX, &u32ECX, &u32EDX); + if ( uMaxExtLeaf >= UINT32_C(0x80000008) + && ASMIsValidExtRange(uMaxExtLeaf)) + { + uint32_t u32PhysBits; + ASMCpuId(0x80000008, &u32PhysBits, &u32EBX, &u32ECX, &u32EDX); + u32PhysBits &= 0xff; + u64Mask = ((UINT64_C(1) << u32PhysBits) - 1) & UINT64_C(0xfffffffffffff000); + } + + AssertCompile(sizeof(g_aLApics[iCpu].PhysBase) == sizeof(u64ApicBase)); + g_aLApics[iCpu].PhysBase = u64ApicBase & u64Mask; + g_aLApics[iCpu].fEnabled = RT_BOOL(u64ApicBase & MSR_IA32_APICBASE_EN); + g_aLApics[iCpu].fX2Apic = (u64ApicBase & (MSR_IA32_APICBASE_EXTD | MSR_IA32_APICBASE_EN)) + == (MSR_IA32_APICBASE_EXTD | MSR_IA32_APICBASE_EN); + } + } +} + + + +/** + * Per-CPU callback that verifies our APIC expectations. + * + * @param idCpu The identifier for the CPU the function is called on. + * @param pvUser1 Ignored. + * @param pvUser2 Ignored. + */ +static DECLCALLBACK(void) cpumR0MapLocalApicCpuChecker(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + NOREF(pvUser1); NOREF(pvUser2); + + int iCpu = RTMpCpuIdToSetIndex(idCpu); + AssertReturnVoid(iCpu >= 0 && (unsigned)iCpu < RT_ELEMENTS(g_aLApics)); + if (!g_aLApics[iCpu].fEnabled) + return; + + /* + * 0x0X 82489 external APIC + * 0x1X Local APIC + * 0x2X..0xFF reserved + */ + uint32_t uApicVersion; + if (g_aLApics[iCpu].fX2Apic) + uApicVersion = ApicX2RegRead32(APIC_REG_VERSION); + else + uApicVersion = ApicRegRead(g_aLApics[iCpu].pv, APIC_REG_VERSION); + if ((APIC_REG_VERSION_GET_VER(uApicVersion) & 0xF0) == 0x10) + { + g_aLApics[iCpu].uVersion = uApicVersion; + +#if 0 /* enable if you need it. 
*/ + if (g_aLApics[iCpu].fX2Apic) + SUPR0Printf("CPUM: X2APIC %02u - ver %#010x, lint0=%#07x lint1=%#07x pc=%#07x thmr=%#07x cmci=%#07x\n", + iCpu, uApicVersion, + ApicX2RegRead32(APIC_REG_LVT_LINT0), ApicX2RegRead32(APIC_REG_LVT_LINT1), + ApicX2RegRead32(APIC_REG_LVT_PC), ApicX2RegRead32(APIC_REG_LVT_THMR), + ApicX2RegRead32(APIC_REG_LVT_CMCI)); + else + { + SUPR0Printf("CPUM: APIC %02u at %RGp (mapped at %p) - ver %#010x, lint0=%#07x lint1=%#07x pc=%#07x thmr=%#07x cmci=%#07x\n", + iCpu, g_aLApics[iCpu].PhysBase, g_aLApics[iCpu].pv, uApicVersion, + ApicRegRead(g_aLApics[iCpu].pv, APIC_REG_LVT_LINT0), ApicRegRead(g_aLApics[iCpu].pv, APIC_REG_LVT_LINT1), + ApicRegRead(g_aLApics[iCpu].pv, APIC_REG_LVT_PC), ApicRegRead(g_aLApics[iCpu].pv, APIC_REG_LVT_THMR), + ApicRegRead(g_aLApics[iCpu].pv, APIC_REG_LVT_CMCI)); + if (uApicVersion & 0x80000000) + { + uint32_t uExtFeatures = ApicRegRead(g_aLApics[iCpu].pv, 0x400); + uint32_t cEiLvt = (uExtFeatures >> 16) & 0xff; + SUPR0Printf("CPUM: APIC %02u: ExtSpace available. extfeat=%08x eilvt[0..3]=%08x %08x %08x %08x\n", + iCpu, + ApicRegRead(g_aLApics[iCpu].pv, 0x400), + cEiLvt >= 1 ? ApicRegRead(g_aLApics[iCpu].pv, 0x500) : 0, + cEiLvt >= 2 ? ApicRegRead(g_aLApics[iCpu].pv, 0x510) : 0, + cEiLvt >= 3 ? ApicRegRead(g_aLApics[iCpu].pv, 0x520) : 0, + cEiLvt >= 4 ? ApicRegRead(g_aLApics[iCpu].pv, 0x530) : 0); + } + } +#endif + } + else + { + g_aLApics[iCpu].fEnabled = false; + g_aLApics[iCpu].fX2Apic = false; + SUPR0Printf("VBox/CPUM: Unsupported APIC version %#x (iCpu=%d)\n", uApicVersion, iCpu); + } +} + + +/** + * Map the MMIO page of each local APIC in the system. + */ +static int cpumR0MapLocalApics(void) +{ + /* + * Check that we'll always stay within the array bounds. 
+ */ + if (RTMpGetArraySize() > RT_ELEMENTS(g_aLApics)) + { + LogRel(("CPUM: Too many real CPUs/cores/threads - %u, max %u\n", RTMpGetArraySize(), RT_ELEMENTS(g_aLApics))); + return VERR_TOO_MANY_CPUS; + } + + /* + * Create mappings for all online CPUs we think have legacy APICs. + */ + int rc = RTMpOnAll(cpumR0MapLocalApicCpuProber, NULL, NULL); + + for (unsigned iCpu = 0; RT_SUCCESS(rc) && iCpu < RT_ELEMENTS(g_aLApics); iCpu++) + { + if (g_aLApics[iCpu].fEnabled && !g_aLApics[iCpu].fX2Apic) + { + rc = RTR0MemObjEnterPhys(&g_aLApics[iCpu].hMemObj, g_aLApics[iCpu].PhysBase, + PAGE_SIZE, RTMEM_CACHE_POLICY_MMIO); + if (RT_SUCCESS(rc)) + { + rc = RTR0MemObjMapKernel(&g_aLApics[iCpu].hMapObj, g_aLApics[iCpu].hMemObj, (void *)-1, + PAGE_SIZE, RTMEM_PROT_READ | RTMEM_PROT_WRITE); + if (RT_SUCCESS(rc)) + { + g_aLApics[iCpu].pv = RTR0MemObjAddress(g_aLApics[iCpu].hMapObj); + continue; + } + RTR0MemObjFree(g_aLApics[iCpu].hMemObj, true /* fFreeMappings */); + } + g_aLApics[iCpu].fEnabled = false; + } + g_aLApics[iCpu].pv = NULL; + } + + /* + * Check the APICs. + */ + if (RT_SUCCESS(rc)) + rc = RTMpOnAll(cpumR0MapLocalApicCpuChecker, NULL, NULL); + + if (RT_FAILURE(rc)) + { + cpumR0UnmapLocalApics(); + return rc; + } + +#ifdef LOG_ENABLED + /* + * Log the result (pretty useless, requires enabling CPUM in VBoxDrv + * and !VBOX_WITH_R0_LOGGING). + */ + if (LogIsEnabled()) + { + uint32_t cEnabled = 0; + uint32_t cX2Apics = 0; + for (unsigned iCpu = 0; iCpu < RT_ELEMENTS(g_aLApics); iCpu++) + if (g_aLApics[iCpu].fEnabled) + { + cEnabled++; + cX2Apics += g_aLApics[iCpu].fX2Apic; + } + Log(("CPUM: %u APICs, %u X2APICs\n", cEnabled, cX2Apics)); + } +#endif + + return VINF_SUCCESS; +} + + +/** + * Unmap the Local APIC of all host CPUs. 
+ */ +static void cpumR0UnmapLocalApics(void) +{ + for (unsigned iCpu = RT_ELEMENTS(g_aLApics); iCpu-- > 0;) + { + if (g_aLApics[iCpu].pv) + { + RTR0MemObjFree(g_aLApics[iCpu].hMapObj, true /* fFreeMappings */); + RTR0MemObjFree(g_aLApics[iCpu].hMemObj, true /* fFreeMappings */); + g_aLApics[iCpu].hMapObj = NIL_RTR0MEMOBJ; + g_aLApics[iCpu].hMemObj = NIL_RTR0MEMOBJ; + g_aLApics[iCpu].fEnabled = false; + g_aLApics[iCpu].fX2Apic = false; + g_aLApics[iCpu].pv = NULL; + } + } +} + + +/** + * Updates CPUMCPU::pvApicBase and CPUMCPU::fX2Apic prior to world switch. + * + * Writes the Local APIC mapping address of the current host CPU to CPUMCPU so + * the world switchers can access the APIC registers for the purpose of + * disabling and re-enabling the NMIs. Must be called with disabled preemption + * or disabled interrupts! + * + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + * @param iHostCpuSet The CPU set index of the current host CPU. + */ +VMMR0_INT_DECL(void) CPUMR0SetLApic(PVMCPU pVCpu, uint32_t iHostCpuSet) +{ + Assert(iHostCpuSet < RT_ELEMENTS(g_aLApics)); + pVCpu->cpum.s.pvApicBase = g_aLApics[iHostCpuSet].pv; + pVCpu->cpum.s.fX2Apic = g_aLApics[iHostCpuSet].fX2Apic; +// Log6(("CPUMR0SetLApic: pvApicBase=%p fX2Apic=%d\n", g_aLApics[iHostCpuSet].pv, g_aLApics[iHostCpuSet].fX2Apic)); +} + +#endif /* VBOX_WITH_VMMR0_DISABLE_LAPIC_NMI */ + diff --git a/src/VBox/VMM/VMMR0/CPUMR0A.asm b/src/VBox/VMM/VMMR0/CPUMR0A.asm new file mode 100644 index 00000000..b0d1eef3 --- /dev/null +++ b/src/VBox/VMM/VMMR0/CPUMR0A.asm @@ -0,0 +1,358 @@ + ; $Id: CPUMR0A.asm $ +;; @file +; CPUM - Ring-0 Assembly Routines (supporting HM and IEM). +; + +; +; Copyright (C) 2006-2019 Oracle Corporation +; +; This file is part of VirtualBox Open Source Edition (OSE), as +; available from http://www.virtualbox.org.
This file is free software; +; you can redistribute it and/or modify it under the terms of the GNU +; General Public License (GPL) as published by the Free Software +; Foundation, in version 2 as it comes in the "COPYING" file of the +; VirtualBox OSE distribution. VirtualBox OSE is distributed in the +; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. +; + + +;******************************************************************************* +;* Header Files * +;******************************************************************************* +%define RT_ASM_WITH_SEH64 +%include "iprt/asmdefs.mac" +%include "VBox/asmdefs.mac" +%include "VBox/vmm/vm.mac" +%include "VBox/err.mac" +%include "VBox/vmm/stam.mac" +%include "CPUMInternal.mac" +%include "iprt/x86.mac" +%include "VBox/vmm/cpum.mac" + + +BEGINCODE + +;; +; Makes sure the EMTs have a FPU state associated with them on hosts where we're +; allowed to use it in ring-0 too. +; +; This ensure that we don't have to allocate the state lazily while trying to execute +; guest code with preemption disabled or worse. +; +; @cproto VMMR0_INT_DECL(void) CPUMR0RegisterVCpuThread(PVMCPU pVCpu); +; +BEGINPROC CPUMR0RegisterVCpuThread + push xBP + SEH64_PUSH_xBP + mov xBP, xSP + SEH64_SET_FRAME_xBP 0 +SEH64_END_PROLOGUE + +%ifdef VMM_R0_TOUCH_FPU + movdqa xmm0, xmm0 ; hope this is harmless. +%endif + +.return: + xor eax, eax ; paranoia + leave + ret +ENDPROC CPUMR0RegisterVCpuThread + + +%ifdef VMM_R0_TOUCH_FPU +;; +; Touches the host FPU state. +; +; @uses nothing (well, maybe cr0) +; + %ifndef RT_ASM_WITH_SEH64 ; workaround for yasm 1.3.0 bug (error: prologue -1 bytes, must be <256) +ALIGNCODE(16) + %endif +BEGINPROC CPUMR0TouchHostFpu + push xBP + SEH64_PUSH_xBP + mov xBP, xSP + SEH64_SET_FRAME_xBP 0 +SEH64_END_PROLOGUE + + movdqa xmm0, xmm0 ; Hope this is harmless. 
+ + leave + ret +ENDPROC CPUMR0TouchHostFpu +%endif ; VMM_R0_TOUCH_FPU + + +;; +; Saves the host FPU/SSE/AVX state and restores the guest FPU/SSE/AVX state. +; +; @returns VINF_SUCCESS (0) or VINF_CPUM_HOST_CR0_MODIFIED. (EAX) +; @param pCpumCpu x86:[ebp+8] gcc:rdi msc:rcx CPUMCPU pointer +; +; @remarks 64-bit Windows drivers shouldn't use AVX registers without saving+loading: +; https://msdn.microsoft.com/en-us/library/windows/hardware/ff545910%28v=vs.85%29.aspx?f=255&MSPPError=-2147217396 +; However the compiler docs have different idea: +; https://msdn.microsoft.com/en-us/library/9z1stfyw.aspx +; We'll go with the former for now. +; +%ifndef RT_ASM_WITH_SEH64 ; workaround for yasm 1.3.0 bug (error: prologue -1 bytes, must be <256) +ALIGNCODE(16) +%endif +BEGINPROC cpumR0SaveHostRestoreGuestFPUState + push xBP + SEH64_PUSH_xBP + mov xBP, xSP + SEH64_SET_FRAME_xBP 0 +SEH64_END_PROLOGUE + + ; + ; Prologue - xAX+xDX must be free for XSAVE/XRSTOR input. + ; +%ifdef RT_ARCH_AMD64 + %ifdef RT_OS_WINDOWS + mov r11, rcx + %else + mov r11, rdi + %endif + %define pCpumCpu r11 + %define pXState r10 +%else + push ebx + push esi + mov ebx, dword [ebp + 8] + %define pCpumCpu ebx + %define pXState esi +%endif + + pushf ; The darwin kernel can get upset or upset things if an + cli ; interrupt occurs while we're doing fxsave/fxrstor/cr0. + + ; + ; Save the host state. + ; + test dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USED_FPU_HOST + jnz .already_saved_host + + CPUMRZ_TOUCH_FPU_CLEAR_CR0_FPU_TRAPS_SET_RC xCX, xAX, pCpumCpu ; xCX is the return value for VT-x; xAX is scratch. + + CPUMR0_SAVE_HOST + +%ifdef VBOX_WITH_KERNEL_USING_XMM + jmp .load_guest +%endif +.already_saved_host: +%ifdef VBOX_WITH_KERNEL_USING_XMM + ; If we didn't save the host state, we must save the non-volatile XMM registers. 
+ mov pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0] + stmxcsr [pXState + X86FXSTATE.MXCSR] + movdqa [pXState + X86FXSTATE.xmm6 ], xmm6 + movdqa [pXState + X86FXSTATE.xmm7 ], xmm7 + movdqa [pXState + X86FXSTATE.xmm8 ], xmm8 + movdqa [pXState + X86FXSTATE.xmm9 ], xmm9 + movdqa [pXState + X86FXSTATE.xmm10], xmm10 + movdqa [pXState + X86FXSTATE.xmm11], xmm11 + movdqa [pXState + X86FXSTATE.xmm12], xmm12 + movdqa [pXState + X86FXSTATE.xmm13], xmm13 + movdqa [pXState + X86FXSTATE.xmm14], xmm14 + movdqa [pXState + X86FXSTATE.xmm15], xmm15 + + ; + ; Load the guest state. + ; +.load_guest: +%endif + CPUMR0_LOAD_GUEST + +%ifdef VBOX_WITH_KERNEL_USING_XMM + ; Restore the non-volatile xmm registers. ASSUMING 64-bit host. + mov pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0] + movdqa xmm6, [pXState + X86FXSTATE.xmm6] + movdqa xmm7, [pXState + X86FXSTATE.xmm7] + movdqa xmm8, [pXState + X86FXSTATE.xmm8] + movdqa xmm9, [pXState + X86FXSTATE.xmm9] + movdqa xmm10, [pXState + X86FXSTATE.xmm10] + movdqa xmm11, [pXState + X86FXSTATE.xmm11] + movdqa xmm12, [pXState + X86FXSTATE.xmm12] + movdqa xmm13, [pXState + X86FXSTATE.xmm13] + movdqa xmm14, [pXState + X86FXSTATE.xmm14] + movdqa xmm15, [pXState + X86FXSTATE.xmm15] + ldmxcsr [pXState + X86FXSTATE.MXCSR] +%endif + + or dword [pCpumCpu + CPUMCPU.fUseFlags], (CPUM_USED_FPU_GUEST | CPUM_USED_FPU_SINCE_REM | CPUM_USED_FPU_HOST) + popf + + mov eax, ecx +.return: +%ifdef RT_ARCH_X86 + pop esi + pop ebx +%endif + leave + ret +ENDPROC cpumR0SaveHostRestoreGuestFPUState + + +;; +; Saves the guest FPU/SSE/AVX state and restores the host FPU/SSE/AVX state. 
+; +; @param pCpumCpu x86:[ebp+8] gcc:rdi msc:rcx CPUMCPU pointer +; +; @remarks 64-bit Windows drivers shouldn't use AVX registers without saving+loading: +; https://msdn.microsoft.com/en-us/library/windows/hardware/ff545910%28v=vs.85%29.aspx?f=255&MSPPError=-2147217396 +; However the compiler docs have different idea: +; https://msdn.microsoft.com/en-us/library/9z1stfyw.aspx +; We'll go with the former for now. +; +%ifndef RT_ASM_WITH_SEH64 ; workaround for yasm 1.3.0 bug (error: prologue -1 bytes, must be <256) +ALIGNCODE(16) +%endif +BEGINPROC cpumR0SaveGuestRestoreHostFPUState + push xBP + SEH64_PUSH_xBP + mov xBP, xSP + SEH64_SET_FRAME_xBP 0 +SEH64_END_PROLOGUE + + ; + ; Prologue - xAX+xDX must be free for XSAVE/XRSTOR input. + ; +%ifdef RT_ARCH_AMD64 + %ifdef RT_OS_WINDOWS + mov r11, rcx + %else + mov r11, rdi + %endif + %define pCpumCpu r11 + %define pXState r10 +%else + push ebx + push esi + mov ebx, dword [ebp + 8] + %define pCpumCpu ebx + %define pXState esi +%endif + pushf ; The darwin kernel can get upset or upset things if an + cli ; interrupt occurs while we're doing fxsave/fxrstor/cr0. + + %ifdef VBOX_WITH_KERNEL_USING_XMM + ; + ; Copy non-volatile XMM registers to the host state so we can use + ; them while saving the guest state (we've gotta do this anyway). + ; + mov pXState, [pCpumCpu + CPUMCPU.Host.pXStateR0] + stmxcsr [pXState + X86FXSTATE.MXCSR] + movdqa [pXState + X86FXSTATE.xmm6], xmm6 + movdqa [pXState + X86FXSTATE.xmm7], xmm7 + movdqa [pXState + X86FXSTATE.xmm8], xmm8 + movdqa [pXState + X86FXSTATE.xmm9], xmm9 + movdqa [pXState + X86FXSTATE.xmm10], xmm10 + movdqa [pXState + X86FXSTATE.xmm11], xmm11 + movdqa [pXState + X86FXSTATE.xmm12], xmm12 + movdqa [pXState + X86FXSTATE.xmm13], xmm13 + movdqa [pXState + X86FXSTATE.xmm14], xmm14 + movdqa [pXState + X86FXSTATE.xmm15], xmm15 + %endif + + ; + ; Save the guest state if necessary. 
+ ; + test dword [pCpumCpu + CPUMCPU.fUseFlags], CPUM_USED_FPU_GUEST + jz .load_only_host + + %ifdef VBOX_WITH_KERNEL_USING_XMM + ; Load the guest XMM register values we already saved in HMR0VMXStartVMWrapXMM. + mov pXState, [pCpumCpu + CPUMCPU.Guest.pXStateR0] + movdqa xmm0, [pXState + X86FXSTATE.xmm0] + movdqa xmm1, [pXState + X86FXSTATE.xmm1] + movdqa xmm2, [pXState + X86FXSTATE.xmm2] + movdqa xmm3, [pXState + X86FXSTATE.xmm3] + movdqa xmm4, [pXState + X86FXSTATE.xmm4] + movdqa xmm5, [pXState + X86FXSTATE.xmm5] + movdqa xmm6, [pXState + X86FXSTATE.xmm6] + movdqa xmm7, [pXState + X86FXSTATE.xmm7] + movdqa xmm8, [pXState + X86FXSTATE.xmm8] + movdqa xmm9, [pXState + X86FXSTATE.xmm9] + movdqa xmm10, [pXState + X86FXSTATE.xmm10] + movdqa xmm11, [pXState + X86FXSTATE.xmm11] + movdqa xmm12, [pXState + X86FXSTATE.xmm12] + movdqa xmm13, [pXState + X86FXSTATE.xmm13] + movdqa xmm14, [pXState + X86FXSTATE.xmm14] + movdqa xmm15, [pXState + X86FXSTATE.xmm15] + ldmxcsr [pXState + X86FXSTATE.MXCSR] + %endif + CPUMR0_SAVE_GUEST + + ; + ; Load the host state. + ; +.load_only_host: + CPUMR0_LOAD_HOST + + ; Restore the CR0 value we saved in cpumR0SaveHostRestoreGuestFPUState or + ; in cpumRZSaveHostFPUState. + mov xCX, [pCpumCpu + CPUMCPU.Host.cr0Fpu] + CPUMRZ_RESTORE_CR0_IF_TS_OR_EM_SET xCX + and dword [pCpumCpu + CPUMCPU.fUseFlags], ~(CPUM_USED_FPU_GUEST | CPUM_USED_FPU_HOST) + + popf +%ifdef RT_ARCH_X86 + pop esi + pop ebx +%endif + leave + ret +%undef pCpumCpu +%undef pXState +ENDPROC cpumR0SaveGuestRestoreHostFPUState + + +%if ARCH_BITS == 32 + %ifdef VBOX_WITH_64_BITS_GUESTS +;; +; Restores the host's FPU/SSE/AVX state from pCpumCpu->Host. +; +; @param pCpumCpu x86:[ebp+8] gcc:rdi msc:rcx CPUMCPU pointer +; + %ifndef RT_ASM_WITH_SEH64 ; workaround for yasm 1.3.0 bug (error: prologue -1 bytes, must be <256) +ALIGNCODE(16) + %endif +BEGINPROC cpumR0RestoreHostFPUState + ; + ; Prologue - xAX+xDX must be free for XSAVE/XRSTOR input. 
+ ; + push ebp + mov ebp, esp + push ebx + push esi + mov ebx, dword [ebp + 8] + %define pCpumCpu ebx + %define pXState esi + + ; + ; Restore host CPU state. + ; + pushf ; The darwin kernel can get upset or upset things if an + cli ; interrupt occurs while we're doing fxsave/fxrstor/cr0. + + CPUMR0_LOAD_HOST + + ; Restore the CR0 value we saved in cpumR0SaveHostRestoreGuestFPUState or + ; in cpumRZSaveHostFPUState. + ;; @todo What about XCR0? + mov xCX, [pCpumCpu + CPUMCPU.Host.cr0Fpu] + CPUMRZ_RESTORE_CR0_IF_TS_OR_EM_SET xCX + + and dword [pCpumCpu + CPUMCPU.fUseFlags], ~CPUM_USED_FPU_HOST + popf + + pop esi + pop ebx + leave + ret + %undef pCpumCpu + %undef pXState +ENDPROC cpumR0RestoreHostFPUState + %endif ; VBOX_WITH_64_BITS_GUESTS +%endif ; ARCH_BITS == 32 + diff --git a/src/VBox/VMM/VMMR0/EMR0.cpp b/src/VBox/VMM/VMMR0/EMR0.cpp new file mode 100644 index 00000000..68efbd88 --- /dev/null +++ b/src/VBox/VMM/VMMR0/EMR0.cpp @@ -0,0 +1,60 @@ +/* $Id: EMR0.cpp $ */ +/** @file + * EM - Host Context Ring 0. + */ + +/* + * Copyright (C) 2006-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_EM +#include <VBox/vmm/em.h> +#include "EMInternal.h" +#include <VBox/vmm/vm.h> +#include <VBox/vmm/gvm.h> +#include <iprt/errcore.h> +#include <VBox/log.h> +#include <iprt/assert.h> +#include <iprt/thread.h> + + + +/** + * Adjusts EM configuration options. + * + * @returns VBox status code. + * @param pGVM The ring-0 VM structure. + * @param pVM The cross context VM structure. + */ +VMMR0_INT_DECL(int) EMR0InitVM(PGVM pGVM, PVM pVM) +{ + /* + * Override ring-0 exit optimizations settings. + */ + bool fEnabledR0 = pVM->aCpus[0].em.s.fExitOptimizationEnabled + && pVM->aCpus[0].em.s.fExitOptimizationEnabledR0 + && (RTThreadPreemptIsPossible() || RTThreadPreemptIsPendingTrusty()); + bool fEnabledR0PreemptDisabled = fEnabledR0 + && pVM->aCpus[0].em.s.fExitOptimizationEnabledR0PreemptDisabled + && RTThreadPreemptIsPendingTrusty(); + for (VMCPUID i = 0; i < pGVM->cCpus; i++) + { + pVM->aCpus[i].em.s.fExitOptimizationEnabledR0 = fEnabledR0; + pVM->aCpus[i].em.s.fExitOptimizationEnabledR0PreemptDisabled = fEnabledR0PreemptDisabled; + } + + return VINF_SUCCESS; +} + diff --git a/src/VBox/VMM/VMMR0/GIMR0.cpp b/src/VBox/VMM/VMMR0/GIMR0.cpp new file mode 100644 index 00000000..e4750911 --- /dev/null +++ b/src/VBox/VMM/VMMR0/GIMR0.cpp @@ -0,0 +1,117 @@ +/* $Id: GIMR0.cpp $ */ +/** @file + * Guest Interface Manager (GIM) - Host Context Ring-0. + */ + +/* + * Copyright (C) 2014-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. 
This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_GIM +#include <VBox/vmm/gim.h> +#include "GIMInternal.h" +#include "GIMHvInternal.h" +#include <VBox/vmm/vm.h> + +#include <VBox/err.h> + + +/** + * Does ring-0 per-VM GIM initialization. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMR0_INT_DECL(int) GIMR0InitVM(PVM pVM) +{ + if (!GIMIsEnabled(pVM)) + return VINF_SUCCESS; + + switch (pVM->gim.s.enmProviderId) + { + case GIMPROVIDERID_HYPERV: + return gimR0HvInitVM(pVM); + + case GIMPROVIDERID_KVM: + return gimR0KvmInitVM(pVM); + + default: + break; + } + return VINF_SUCCESS; +} + + +/** + * Does ring-0 per-VM GIM termination. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMR0_INT_DECL(int) GIMR0TermVM(PVM pVM) +{ + if (!GIMIsEnabled(pVM)) + return VINF_SUCCESS; + + switch (pVM->gim.s.enmProviderId) + { + case GIMPROVIDERID_HYPERV: + return gimR0HvTermVM(pVM); + + case GIMPROVIDERID_KVM: + return gimR0KvmTermVM(pVM); + + default: + break; + } + return VINF_SUCCESS; +} + + +/** + * Updates the paravirtualized TSC supported by the GIM provider. + * + * @returns VBox status code. + * @retval VINF_SUCCESS if the paravirt. TSC is setup and in use. + * @retval VERR_GIM_NOT_ENABLED if no GIM provider is configured for this VM. 
+ * @retval VERR_GIM_PVTSC_NOT_AVAILABLE if the GIM provider does not support any + * paravirt. TSC. + * @retval VERR_GIM_PVTSC_NOT_IN_USE if the GIM provider supports paravirt. TSC + * but the guest isn't currently using it. + * + * @param pVM The cross context VM structure. + * @param u64Offset The computed TSC offset. + * + * @thread EMT(pVCpu) + */ +VMMR0_INT_DECL(int) GIMR0UpdateParavirtTsc(PVM pVM, uint64_t u64Offset) +{ + switch (pVM->gim.s.enmProviderId) + { + case GIMPROVIDERID_HYPERV: + return gimR0HvUpdateParavirtTsc(pVM, u64Offset); + + case GIMPROVIDERID_KVM: + return VINF_SUCCESS; + + case GIMPROVIDERID_NONE: + return VERR_GIM_NOT_ENABLED; + + default: + break; + } + return VERR_GIM_PVTSC_NOT_AVAILABLE; +} + diff --git a/src/VBox/VMM/VMMR0/GIMR0Hv.cpp b/src/VBox/VMM/VMMR0/GIMR0Hv.cpp new file mode 100644 index 00000000..cbf23de1 --- /dev/null +++ b/src/VBox/VMM/VMMR0/GIMR0Hv.cpp @@ -0,0 +1,182 @@ +/* $Id: GIMR0Hv.cpp $ */ +/** @file + * Guest Interface Manager (GIM), Hyper-V - Host Context Ring-0. + */ + +/* + * Copyright (C) 2014-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_GIM +#include <VBox/vmm/gim.h> +#include <VBox/vmm/tm.h> +#include "GIMInternal.h" +#include "GIMHvInternal.h" +#include <VBox/vmm/vm.h> + +#include <VBox/err.h> + +#include <iprt/spinlock.h> + + +#if 0 +/** + * Allocates and maps one physically contiguous page. The allocated page is + * zero'd out. + * + * @returns IPRT status code. + * @param pMemObj Pointer to the ring-0 memory object. + * @param ppVirt Where to store the virtual address of the + * allocation. + * @param pPhys Where to store the physical address of the + * allocation. + */ +static int gimR0HvPageAllocZ(PRTR0MEMOBJ pMemObj, PRTR0PTR ppVirt, PRTHCPHYS pHCPhys) +{ + AssertPtr(pMemObj); + AssertPtr(ppVirt); + AssertPtr(pHCPhys); + + int rc = RTR0MemObjAllocCont(pMemObj, PAGE_SIZE, false /* fExecutable */); + if (RT_FAILURE(rc)) + return rc; + *ppVirt = RTR0MemObjAddress(*pMemObj); + *pHCPhys = RTR0MemObjGetPagePhysAddr(*pMemObj, 0 /* iPage */); + ASMMemZero32(*ppVirt, PAGE_SIZE); + return VINF_SUCCESS; +} + + +/** + * Frees and unmaps an allocated physical page. + * + * @param pMemObj Pointer to the ring-0 memory object. + * @param ppVirt Where to re-initialize the virtual address of + * allocation as 0. + * @param pHCPhys Where to re-initialize the physical address of the + * allocation as 0. + */ +static void gimR0HvPageFree(PRTR0MEMOBJ pMemObj, PRTR0PTR ppVirt, PRTHCPHYS pHCPhys) +{ + AssertPtr(pMemObj); + AssertPtr(ppVirt); + AssertPtr(pHCPhys); + if (*pMemObj != NIL_RTR0MEMOBJ) + { + int rc = RTR0MemObjFree(*pMemObj, true /* fFreeMappings */); + AssertRC(rc); + *pMemObj = NIL_RTR0MEMOBJ; + *ppVirt = 0; + *pHCPhys = 0; + } +} +#endif + +/** + * Updates Hyper-V's reference TSC page. 
+ * + * @returns VBox status code. + * @param pVM The cross context VM structure. + * @param u64Offset The computed TSC offset. + * @thread EMT. + */ +VMM_INT_DECL(int) gimR0HvUpdateParavirtTsc(PVM pVM, uint64_t u64Offset) +{ + Assert(GIMIsEnabled(pVM)); + bool fHvTscEnabled = MSR_GIM_HV_REF_TSC_IS_ENABLED(pVM->gim.s.u.Hv.u64TscPageMsr); + if (RT_UNLIKELY(!fHvTscEnabled)) + return VERR_GIM_PVTSC_NOT_ENABLED; + + /** @todo this is buggy when large pages are used due to a PGM limitation, see + * @bugref{7532}. + * + * In any case, we do not ever update this page while the guest is + * running after setting it up (in ring-3, see gimR3HvEnableTscPage()) as + * the TSC offset is handled in the VMCS/VMCB (HM) or by trapping RDTSC + * (raw-mode). */ +#if 0 + PCGIMHV pcHv = &pVM->gim.s.u.Hv; + PCGIMMMIO2REGION pcRegion = &pcHv->aMmio2Regions[GIM_HV_REF_TSC_PAGE_REGION_IDX]; + PGIMHVREFTSC pRefTsc = (PGIMHVREFTSC)pcRegion->CTX_SUFF(pvPage); + Assert(pRefTsc); + + /* + * Hyper-V reports the reference time in 100 nanosecond units. + */ + uint64_t u64Tsc100Ns = pcHv->cTscTicksPerSecond / RT_NS_10MS; + int64_t i64TscOffset = (int64_t)u64Offset / u64Tsc100Ns; + + /* + * The TSC page can be simulatenously read by other VCPUs in the guest. The + * spinlock is only for protecting simultaneous hypervisor writes from other + * EMTs. + */ + RTSpinlockAcquire(pcHv->hSpinlockR0); + if (pRefTsc->i64TscOffset != i64TscOffset) + { + if (pRefTsc->u32TscSequence < UINT32_C(0xfffffffe)) + ASMAtomicIncU32(&pRefTsc->u32TscSequence); + else + ASMAtomicWriteU32(&pRefTsc->u32TscSequence, 1); + ASMAtomicWriteS64(&pRefTsc->i64TscOffset, i64TscOffset); + } + RTSpinlockRelease(pcHv->hSpinlockR0); + + Assert(pRefTsc->u32TscSequence != 0); + Assert(pRefTsc->u32TscSequence != UINT32_C(0xffffffff)); +#else + NOREF(u64Offset); +#endif + return VINF_SUCCESS; +} + + +/** + * Does ring-0 per-VM GIM Hyper-V initialization. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. 
+ */ +VMMR0_INT_DECL(int) gimR0HvInitVM(PVM pVM) +{ + AssertPtr(pVM); + Assert(GIMIsEnabled(pVM)); + + PGIMHV pHv = &pVM->gim.s.u.Hv; + Assert(pHv->hSpinlockR0 == NIL_RTSPINLOCK); + + int rc = RTSpinlockCreate(&pHv->hSpinlockR0, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "Hyper-V"); + return rc; +} + + +/** + * Does ring-0 per-VM GIM Hyper-V termination. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMR0_INT_DECL(int) gimR0HvTermVM(PVM pVM) +{ + AssertPtr(pVM); + Assert(GIMIsEnabled(pVM)); + + PGIMHV pHv = &pVM->gim.s.u.Hv; + RTSpinlockDestroy(pHv->hSpinlockR0); + pHv->hSpinlockR0 = NIL_RTSPINLOCK; + + return VINF_SUCCESS; +} + diff --git a/src/VBox/VMM/VMMR0/GIMR0Kvm.cpp b/src/VBox/VMM/VMMR0/GIMR0Kvm.cpp new file mode 100644 index 00000000..bcc849db --- /dev/null +++ b/src/VBox/VMM/VMMR0/GIMR0Kvm.cpp @@ -0,0 +1,119 @@ +/* $Id: GIMR0Kvm.cpp $ */ +/** @file + * Guest Interface Manager (GIM), KVM - Host Context Ring-0. + */ + +/* + * Copyright (C) 2015-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_GIM +#include <VBox/vmm/gim.h> +#include <VBox/vmm/tm.h> +#include "GIMInternal.h" +#include "GIMKvmInternal.h" +#include <VBox/vmm/vm.h> + +#include <VBox/err.h> + +#include <iprt/spinlock.h> + + +/** + * Updates KVM's system time information globally for all VCPUs. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + * @param pVCpu The cross context virtual CPU structure. + * @thread EMT. + * @remarks Can be called with preemption disabled! + */ +VMM_INT_DECL(int) gimR0KvmUpdateSystemTime(PVM pVM, PVMCPU pVCpu) +{ + /* + * Validate. + */ + Assert(GIMIsEnabled(pVM)); + PGIMKVM pKvm = &pVM->gim.s.u.Kvm; + AssertReturn(pKvm->hSpinlockR0 != NIL_RTSPINLOCK, VERR_GIM_IPE_3); + + /* + * Record the TSC and virtual NanoTS pairs. + */ + uint64_t uTsc; + uint64_t uVirtNanoTS; + RTCCUINTREG fEFlags = ASMIntDisableFlags(); + uTsc = TMCpuTickGetNoCheck(pVCpu) | UINT64_C(1); + uVirtNanoTS = TMVirtualGetNoCheck(pVM) | UINT64_C(1); + ASMSetFlags(fEFlags); + + /* + * Update VCPUs with this information. The first VCPU's values + * will be applied to the remaining. + */ + RTSpinlockAcquire(pKvm->hSpinlockR0); + for (uint32_t i = 0; i < pVM->cCpus; i++) + { + PGIMKVMCPU pKvmCpu = &pVM->aCpus[i].gim.s.u.KvmCpu; + if ( !pKvmCpu->uTsc + && !pKvmCpu->uVirtNanoTS) + { + pKvmCpu->uTsc = uTsc; + pKvmCpu->uVirtNanoTS = uVirtNanoTS; + } + } + RTSpinlockRelease(pKvm->hSpinlockR0); + + return VINF_SUCCESS; +} + + +/** + * Does ring-0 per-VM GIM KVM initialization. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. 
+ */ +VMMR0_INT_DECL(int) gimR0KvmInitVM(PVM pVM) +{ + AssertPtr(pVM); + Assert(GIMIsEnabled(pVM)); + + PGIMKVM pKvm = &pVM->gim.s.u.Kvm; + Assert(pKvm->hSpinlockR0 == NIL_RTSPINLOCK); + + int rc = RTSpinlockCreate(&pKvm->hSpinlockR0, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "KVM"); + return rc; +} + + +/** + * Does ring-0 per-VM GIM KVM termination. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMR0_INT_DECL(int) gimR0KvmTermVM(PVM pVM) +{ + AssertPtr(pVM); + Assert(GIMIsEnabled(pVM)); + + PGIMKVM pKvm = &pVM->gim.s.u.Kvm; + RTSpinlockDestroy(pKvm->hSpinlockR0); + pKvm->hSpinlockR0 = NIL_RTSPINLOCK; + + return VINF_SUCCESS; +} + diff --git a/src/VBox/VMM/VMMR0/GMMR0.cpp b/src/VBox/VMM/VMMR0/GMMR0.cpp new file mode 100644 index 00000000..cf90eb66 --- /dev/null +++ b/src/VBox/VMM/VMMR0/GMMR0.cpp @@ -0,0 +1,5445 @@ +/* $Id: GMMR0.cpp $ */ +/** @file + * GMM - Global Memory Manager. + */ + +/* + * Copyright (C) 2007-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/** @page pg_gmm GMM - The Global Memory Manager + * + * As the name indicates, this component is responsible for global memory + * management. Currently only guest RAM is allocated from the GMM, but this + * may change to include shadow page tables and other bits later. + * + * Guest RAM is managed as individual pages, but allocated from the host OS + * in chunks for reasons of portability / efficiency. 
To minimize the memory + * footprint all tracking structures must be as small as possible without + * unnecessary performance penalties. + * + * The allocation chunks have a fixed size, defined at compile time + * by the #GMM_CHUNK_SIZE \#define. + * + * Each chunk is given a unique ID. Each page also has a unique ID. The + * relationship between the two IDs is: + * @code + * GMM_CHUNK_SHIFT = log2(GMM_CHUNK_SIZE / PAGE_SIZE); + * idPage = (idChunk << GMM_CHUNK_SHIFT) | iPage; + * @endcode + * Where iPage is the index of the page within the chunk. This ID scheme + * permits efficient chunk and page lookup, but it relies on the chunk size + * to be set at compile time. The chunks are organized in an AVL tree with their + * IDs being the keys. + * + * The physical address of each page in an allocation chunk is maintained by + * the #RTR0MEMOBJ and obtained using #RTR0MemObjGetPagePhysAddr. There is no + * need to duplicate this information (it'll cost 8-bytes per page if we did). + * + * So what do we need to track per page? Most importantly we need to know + * which state the page is in: + * - Private - Allocated for (eventually) backing one particular VM page. + * - Shared - Readonly page that is used by one or more VMs and treated + * as COW by PGM. + * - Free - Not used by anyone. + * + * For the page replacement operations (sharing, defragmenting and freeing) + * to be somewhat efficient, private pages need to be associated with a + * particular page in a particular VM. + * + * Tracking the usage of shared pages is impractical and expensive, so we'll + * settle for a reference counting system instead. + * + * Free pages will be chained on LIFOs. + * + * On 64-bit systems we will use a 64-bit bitfield per page, while on 32-bit + * systems a 32-bit bitfield will have to suffice because of address space + * limitations. The #GMMPAGE structure shows the details.
+ * + * + * @section sec_gmm_alloc_strat Page Allocation Strategy + * + * The strategy for allocating pages has to take fragmentation and shared + * pages into account, or we may end up with 2000 chunks with only + * a few pages in each. Shared pages cannot easily be reallocated because + * of the inaccurate usage accounting (see above). Private pages can be + * reallocated by a defragmentation thread in the same manner that sharing + * is done. + * + * The first approach is to manage the free pages in two sets depending on + * whether they are mainly for the allocation of shared or private pages. + * In the initial implementation there will be almost no possibility for + * mixing shared and private pages in the same chunk (only if we're really + * stressed on memory), but when we implement forking of VMs and have to + * deal with lots of COW pages it'll start getting kind of interesting. + * + * The sets are lists of chunks with approximately the same number of + * free pages. Say the chunk size is 1MB, meaning 256 pages, and a set + * consists of 16 lists. So, the first list will contain the chunks with + * 1-7 free pages, the second covers 8-15, and so on. The chunks will be + * moved between the lists as pages are freed up or allocated. + * + * + * @section sec_gmm_costs Costs + * + * The per page cost in kernel space is 32-bit plus whatever RTR0MEMOBJ + * entails. In addition there is the chunk cost of approximately + * (sizeof(RTR0MEMOBJ) + sizeof(CHUNK)) / 2^CHUNK_SHIFT bytes per page. + * + * On Windows the per page #RTR0MEMOBJ cost is 32-bit on 32-bit windows + * and 64-bit on 64-bit windows (a PFN_NUMBER in the MDL). So, 64-bit per page. + * The cost on Linux is identical, but here it's because of sizeof(struct page *).
+ * + * + * @section sec_gmm_legacy Legacy Mode for Non-Tier-1 Platforms + * + * In legacy mode the page source is locked user pages and not + * #RTR0MemObjAllocPhysNC, this means that a page can only be allocated + * by the VM that locked it. We will make no attempt at implementing + * page sharing on these systems, just do enough to make it all work. + * + * + * @subsection sub_gmm_locking Serializing + * + * One simple fast mutex will be employed in the initial implementation, not + * two as mentioned in @ref sec_pgmPhys_Serializing. + * + * @see @ref sec_pgmPhys_Serializing + * + * + * @section sec_gmm_overcommit Memory Over-Commitment Management + * + * The GVM will have to do the system wide memory over-commitment + * management. My current ideas are: + * - Per VM oc policy that indicates how much to initially commit + * to it and what to do in an out-of-memory situation. + * - Prevent overtaxing the host. + * + * There are some challenges here, the main ones are configurability and + * security. Should we for instance permit anyone to request 100% memory + * commitment? Who should be allowed to do runtime adjustments of the + * config? And how to prevent these settings from being lost when the last + * VM process exits? The solution is probably to have an optional root + * daemon that will keep VMMR0.r0 in memory and enable the security measures. + * + * + * + * @section sec_gmm_numa NUMA + * + * NUMA considerations will be designed and implemented a bit later. + * + * The preliminary guess is that we will have to try to allocate memory as + * close as possible to the CPUs the VM is executed on (EMT and additional CPU + * threads). Which means it's mostly about allocation and sharing policies. + * Both the scheduler and allocator interfaces will have to supply some NUMA info + * and we'll need to have a way to calc access costs.
+ * + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_GMM +#include <VBox/rawpci.h> +#include <VBox/vmm/vm.h> +#include <VBox/vmm/gmm.h> +#include "GMMR0Internal.h" +#include <VBox/vmm/gvm.h> +#include <VBox/vmm/pgm.h> +#include <VBox/log.h> +#include <VBox/param.h> +#include <VBox/err.h> +#include <VBox/VMMDev.h> +#include <iprt/asm.h> +#include <iprt/avl.h> +#ifdef VBOX_STRICT +# include <iprt/crc.h> +#endif +#include <iprt/critsect.h> +#include <iprt/list.h> +#include <iprt/mem.h> +#include <iprt/memobj.h> +#include <iprt/mp.h> +#include <iprt/semaphore.h> +#include <iprt/string.h> +#include <iprt/time.h> + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ +/** @def VBOX_USE_CRIT_SECT_FOR_GIANT + * Use a critical section instead of a fast mutex for the giant GMM lock. + * + * @remarks This is primarily a way of avoiding the deadlock checks in the + * Windows Driver Verifier. */ +#if defined(RT_OS_WINDOWS) || defined(DOXYGEN_RUNNING) +# define VBOX_USE_CRIT_SECT_FOR_GIANT +#endif + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** Pointer to set of free chunks. */ +typedef struct GMMCHUNKFREESET *PGMMCHUNKFREESET; + +/** + * The per-page tracking structure employed by the GMM. 
+ * + * On 32-bit hosts some trickery is necessary to compress all + * the information into 32-bits. When the fSharedFree member is set, + * the 30th bit decides whether it's a free page or not. + * + * Because of the different layout on 32-bit and 64-bit hosts, macros + * are used to get and set some of the data. + */ +typedef union GMMPAGE +{ +#if HC_ARCH_BITS == 64 + /** Unsigned integer view. */ + uint64_t u; + + /** The common view. */ + struct GMMPAGECOMMON + { + uint32_t uStuff1 : 32; + uint32_t uStuff2 : 30; + /** The page state. */ + uint32_t u2State : 2; + } Common; + + /** The view of a private page. */ + struct GMMPAGEPRIVATE + { + /** The guest page frame number. (Max addressable: 2 ^ 44 - 16) */ + uint32_t pfn; + /** The GVM handle. (64K VMs) */ + uint32_t hGVM : 16; + /** Reserved. */ + uint32_t u16Reserved : 14; + /** The page state. */ + uint32_t u2State : 2; + } Private; + + /** The view of a shared page. */ + struct GMMPAGESHARED + { + /** The host page frame number. (Max addressable: 2 ^ 44 - 16) */ + uint32_t pfn; + /** The reference count (64K VMs). */ + uint32_t cRefs : 16; + /** Used for debug checksumming. */ + uint32_t u14Checksum : 14; + /** The page state. */ + uint32_t u2State : 2; + } Shared; + + /** The view of a free page. */ + struct GMMPAGEFREE + { + /** The index of the next page in the free list. UINT16_MAX is NIL. */ + uint16_t iNext; + /** Reserved. Checksum or something? */ + uint16_t u16Reserved0; + /** Reserved. Checksum or something? */ + uint32_t u30Reserved1 : 30; + /** The page state. */ + uint32_t u2State : 2; + } Free; + +#else /* 32-bit */ + /** Unsigned integer view. */ + uint32_t u; + + /** The common view. */ + struct GMMPAGECOMMON + { + uint32_t uStuff : 30; + /** The page state. */ + uint32_t u2State : 2; + } Common; + + /** The view of a private page. */ + struct GMMPAGEPRIVATE + { + /** The guest page frame number. (Max addressable: 2 ^ 36) */ + uint32_t pfn : 24; + /** The GVM handle. 
(127 VMs) */ + uint32_t hGVM : 7; + /** The top page state bit, MBZ. */ + uint32_t fZero : 1; + } Private; + + /** The view of a shared page. */ + struct GMMPAGESHARED + { + /** The reference count. */ + uint32_t cRefs : 30; + /** The page state. */ + uint32_t u2State : 2; + } Shared; + + /** The view of a free page. */ + struct GMMPAGEFREE + { + /** The index of the next page in the free list. UINT16_MAX is NIL. */ + uint32_t iNext : 16; + /** Reserved. Checksum or something? */ + uint32_t u14Reserved : 14; + /** The page state. */ + uint32_t u2State : 2; + } Free; +#endif +} GMMPAGE; +AssertCompileSize(GMMPAGE, sizeof(RTHCUINTPTR)); +/** Pointer to a GMMPAGE. */ +typedef GMMPAGE *PGMMPAGE; + + +/** @name The Page States. + * @{ */ +/** A private page. */ +#define GMM_PAGE_STATE_PRIVATE 0 +/** A private page - alternative value used on the 32-bit implementation. + * This will never be used on 64-bit hosts. */ +#define GMM_PAGE_STATE_PRIVATE_32 1 +/** A shared page. */ +#define GMM_PAGE_STATE_SHARED 2 +/** A free page. */ +#define GMM_PAGE_STATE_FREE 3 +/** @} */ + + +/** @def GMM_PAGE_IS_PRIVATE + * + * On 64-bit hosts the full two-bit state is compared against + * GMM_PAGE_STATE_PRIVATE; on 32-bit hosts only Private.fZero is tested + * (presumably so that both GMM_PAGE_STATE_PRIVATE and + * GMM_PAGE_STATE_PRIVATE_32 match - confirm against the bit-field layout). + * + * @returns true if private, false if not. + * @param pPage The GMM page. + */ +#if HC_ARCH_BITS == 64 +# define GMM_PAGE_IS_PRIVATE(pPage) ( (pPage)->Common.u2State == GMM_PAGE_STATE_PRIVATE ) +#else +# define GMM_PAGE_IS_PRIVATE(pPage) ( (pPage)->Private.fZero == 0 ) +#endif + +/** @def GMM_PAGE_IS_SHARED + * + * @returns true if shared, false if not. + * @param pPage The GMM page. + */ +#define GMM_PAGE_IS_SHARED(pPage) ( (pPage)->Common.u2State == GMM_PAGE_STATE_SHARED ) + +/** @def GMM_PAGE_IS_FREE + * + * @returns true if free, false if not. + * @param pPage The GMM page. 
+ */ +#define GMM_PAGE_IS_FREE(pPage) ( (pPage)->Common.u2State == GMM_PAGE_STATE_FREE ) + +/** @def GMM_PAGE_PFN_LAST + * The last valid guest pfn range. + * @remark Some of the values outside the range has special meaning, + * see GMM_PAGE_PFN_UNSHAREABLE. 
+ */ +#if HC_ARCH_BITS == 64 +# define GMM_PAGE_PFN_LAST UINT32_C(0xfffffff0) +#else +# define GMM_PAGE_PFN_LAST UINT32_C(0x00fffff0) +#endif +AssertCompile(GMM_PAGE_PFN_LAST == (GMM_GCPHYS_LAST >> PAGE_SHIFT)); + +/** @def GMM_PAGE_PFN_UNSHAREABLE + * Indicates that this page isn't used for normal guest memory and thus isn't shareable. + */ +#if HC_ARCH_BITS == 64 +# define GMM_PAGE_PFN_UNSHAREABLE UINT32_C(0xfffffff1) +#else +# define GMM_PAGE_PFN_UNSHAREABLE UINT32_C(0x00fffff1) +#endif +AssertCompile(GMM_PAGE_PFN_UNSHAREABLE == (GMM_GCPHYS_UNSHAREABLE >> PAGE_SHIFT)); + + +/** + * A GMM allocation chunk ring-3 mapping record. + * + * This should really be associated with a session and not a VM, but + * it's simpler to associate it with a VM and clean up when the VM object + * is destroyed. + */ +typedef struct GMMCHUNKMAP +{ + /** The mapping object. */ + RTR0MEMOBJ hMapObj; + /** The VM owning the mapping. */ + PGVM pGVM; +} GMMCHUNKMAP; +/** Pointer to a GMM allocation chunk mapping. */ +typedef struct GMMCHUNKMAP *PGMMCHUNKMAP; + + +/** + * A GMM allocation chunk. + */ +typedef struct GMMCHUNK +{ + /** The AVL node core. + * The Key is the chunk ID. (Giant mtx.) */ + AVLU32NODECORE Core; + /** The memory object. + * Either from RTR0MemObjAllocPhysNC or RTR0MemObjLockUser depending on + * what the host can dish up with. (Chunk mtx protects mapping accesses + * and related frees.) */ + RTR0MEMOBJ hMemObj; + /** Pointer to the next chunk in the free list. (Giant mtx.) */ + PGMMCHUNK pFreeNext; + /** Pointer to the previous chunk in the free list. (Giant mtx.) */ + PGMMCHUNK pFreePrev; + /** Pointer to the free set this chunk belongs to. NULL for + * chunks with no free pages. (Giant mtx.) */ + PGMMCHUNKFREESET pSet; + /** List node in the chunk list (GMM::ChunkList). (Giant mtx.) */ + RTLISTNODE ListNode; + /** Pointer to an array of mappings. (Chunk mtx.) */ + PGMMCHUNKMAP paMappingsX; + /** The number of mappings. (Chunk mtx.) 
*/ + uint16_t cMappingsX; + /** The index of the chunk mutex (GMM::aChunkMtx) this chunk is using. + * UINT8_MAX if nobody is mapping or freeing anything. (Giant mtx.) */ + uint8_t volatile iChunkMtx; + /** Flags field reserved for future use (like eliminating enmType). + * (Giant mtx.) */ + uint8_t fFlags; + /** The head of the list of free pages. UINT16_MAX is the NIL value. + * (Giant mtx.) */ + uint16_t iFreeHead; + /** The number of free pages. (Giant mtx.) */ + uint16_t cFree; + /** The GVM handle of the VM that first allocated pages from this chunk, this + * is used as a preference when there are several chunks to choose from. + * When in bound memory mode this isn't a preference any longer. (Giant + * mtx.) */ + uint16_t hGVM; + /** The ID of the NUMA node the memory mostly resides on. (Reserved for + * future use.) (Giant mtx.) */ + uint16_t idNumaNode; + /** The number of private pages. (Giant mtx.) */ + uint16_t cPrivate; + /** The number of shared pages. (Giant mtx.) */ + uint16_t cShared; + /** The pages. (Giant mtx.) */ + GMMPAGE aPages[GMM_CHUNK_SIZE >> PAGE_SHIFT]; +} GMMCHUNK; + +/** Indicates that the NUMA properties of the memory are unknown. */ +#define GMM_CHUNK_NUMA_ID_UNKNOWN UINT16_C(0xfffe) + +/** @name GMM_CHUNK_FLAGS_XXX - chunk flags. + * @{ */ +/** Indicates that the chunk is a large page (2MB). */ +#define GMM_CHUNK_FLAGS_LARGE_PAGE UINT16_C(0x0001) +/** @} */ + + +/** + * An allocation chunk TLB entry. + */ +typedef struct GMMCHUNKTLBE +{ + /** The chunk id. */ + uint32_t idChunk; + /** Pointer to the chunk. */ + PGMMCHUNK pChunk; +} GMMCHUNKTLBE; +/** Pointer to an allocation chunk TLB entry. */ +typedef GMMCHUNKTLBE *PGMMCHUNKTLBE; + + +/** The number of entries in the allocation chunk TLB. */ +#define GMM_CHUNKTLB_ENTRIES 32 +/** Gets the TLB entry index for the given Chunk ID. */ +#define GMM_CHUNKTLB_IDX(idChunk) ( (idChunk) & (GMM_CHUNKTLB_ENTRIES - 1) ) + +/** + * An allocation chunk TLB. + */ +typedef struct GMMCHUNKTLB +{ + /** The TLB entries. 
*/ + GMMCHUNKTLBE aEntries[GMM_CHUNKTLB_ENTRIES]; +} GMMCHUNKTLB; +/** Pointer to an allocation chunk TLB. */ +typedef GMMCHUNKTLB *PGMMCHUNKTLB; + + +/** + * The GMM instance data. + */ +typedef struct GMM +{ + /** Magic / eye catcher. GMM_MAGIC */ + uint32_t u32Magic; + /** The number of threads waiting on the mutex. */ + uint32_t cMtxContenders; +#ifdef VBOX_USE_CRIT_SECT_FOR_GIANT + /** The critical section protecting the GMM. + * More fine grained locking can be implemented later if necessary. */ + RTCRITSECT GiantCritSect; +#else + /** The fast mutex protecting the GMM. + * More fine grained locking can be implemented later if necessary. */ + RTSEMFASTMUTEX hMtx; +#endif +#ifdef VBOX_STRICT + /** The current mutex owner. */ + RTNATIVETHREAD hMtxOwner; +#endif + /** The chunk tree. */ + PAVLU32NODECORE pChunks; + /** The chunk TLB. */ + GMMCHUNKTLB ChunkTLB; + /** The private free set. */ + GMMCHUNKFREESET PrivateX; + /** The shared free set. */ + GMMCHUNKFREESET Shared; + + /** Shared module tree (global). + * @todo separate trees for distinctly different guest OSes. */ + PAVLLU32NODECORE pGlobalSharedModuleTree; + /** Sharable modules (count of nodes in pGlobalSharedModuleTree). */ + uint32_t cShareableModules; + + /** The chunk list. For simplifying the cleanup process. */ + RTLISTANCHOR ChunkList; + + /** The maximum number of pages we're allowed to allocate. + * @gcfgm{GMM/MaxPages,64-bit, Direct.} + * @gcfgm{GMM/PctPages,32-bit, Relative to the number of host pages.} */ + uint64_t cMaxPages; + /** The number of pages that have been reserved. + * The deal is that cReservedPages - cOverCommittedPages <= cMaxPages. */ + uint64_t cReservedPages; + /** The number of pages that we have over-committed in reservations. */ + uint64_t cOverCommittedPages; + /** The number of actually allocated (committed if you like) pages. */ + uint64_t cAllocatedPages; + /** The number of pages that are shared. A subset of cAllocatedPages. 
*/ + uint64_t cSharedPages; + /** The number of pages that are actually shared between VMs. */ + uint64_t cDuplicatePages; + /** The number of shared pages that have been left behind by + * VMs not doing proper cleanups. */ + uint64_t cLeftBehindSharedPages; + /** The number of allocation chunks. + * (The number of pages we've allocated from the host can be derived from this.) */ + uint32_t cChunks; + /** The number of currently ballooned pages. */ + uint64_t cBalloonedPages; + + /** The legacy allocation mode indicator. + * This is determined at initialization time. */ + bool fLegacyAllocationMode; + /** The bound memory mode indicator. + * When set, the memory will be bound to a specific VM and never + * shared. This is always set if fLegacyAllocationMode is set. + * (Also determined at initialization time.) */ + bool fBoundMemoryMode; + /** The number of registered VMs. */ + uint16_t cRegisteredVMs; + + /** The number of freed chunks ever. This is used as a list generation to + * avoid restarting the cleanup scanning when the list wasn't modified. */ + uint32_t cFreedChunks; + /** The previous allocated Chunk ID. + * Used as a hint to avoid scanning the whole bitmap. */ + uint32_t idChunkPrev; + /** Chunk ID allocation bitmap. + * Bits of allocated IDs are set, free ones are clear. + * The NIL id (0) is marked allocated. */ + uint32_t bmChunkId[(GMM_CHUNKID_LAST + 1 + 31) / 32]; + + /** The index of the next mutex to use. */ + uint32_t iNextChunkMtx; + /** Chunk locks for reducing lock contention without having to allocate + * one lock per chunk. */ + struct + { + /** The mutex */ + RTSEMFASTMUTEX hMtx; + /** The number of threads currently using this mutex. */ + uint32_t volatile cUsers; + } aChunkMtx[64]; +} GMM; +/** Pointer to the GMM instance. */ +typedef GMM *PGMM; + +/** The value of GMM::u32Magic (Katsuhiro Otomo). */ +#define GMM_MAGIC UINT32_C(0x19540414) + + +/** + * GMM chunk mutex state. 
+ * + * This is returned by gmmR0ChunkMutexAcquire and is used by the other + * gmmR0ChunkMutex* methods. + */ +typedef struct GMMR0CHUNKMTXSTATE +{ + PGMM pGMM; /**< The GMM instance; set to NULL again by gmmR0ChunkMutexRelease. */ + /** The index of the chunk mutex. */ + uint8_t iChunkMtx; + /** The relevant flags (GMMR0CHUNK_MTX_XXX). */ + uint8_t fFlags; +} GMMR0CHUNKMTXSTATE; +/** Pointer to a chunk mutex state. */ +typedef GMMR0CHUNKMTXSTATE *PGMMR0CHUNKMTXSTATE; + +/** @name GMMR0CHUNK_MTX_XXX + * How gmmR0ChunkMutexAcquire / gmmR0ChunkMutexRelease treat the giant lock: + * KEEP_GIANT leaves it held, RETAKE_GIANT releases it on acquire and + * re-acquires it on release, DROP_GIANT releases it on acquire without + * re-acquiring it on release. INVALID and END only delimit the valid range. + * @{ */ +#define GMMR0CHUNK_MTX_INVALID UINT32_C(0) +#define GMMR0CHUNK_MTX_KEEP_GIANT UINT32_C(1) +#define GMMR0CHUNK_MTX_RETAKE_GIANT UINT32_C(2) +#define GMMR0CHUNK_MTX_DROP_GIANT UINT32_C(3) +#define GMMR0CHUNK_MTX_END UINT32_C(4) +/** @} */ + + +/** The maximum number of shared modules per-vm. */ +#define GMM_MAX_SHARED_PER_VM_MODULES 2048 +/** The maximum number of shared modules GMM is allowed to track. + * NOTE(review): 16834 looks like a transposition of 16384 (2^14) - confirm. */ +#define GMM_MAX_SHARED_GLOBAL_MODULES 16834 + + +/** + * Argument packet for gmmR0SharedModuleCleanup. + */ +typedef struct GMMR0SHMODPERVMDTORARGS +{ + PGVM pGVM; + PGMM pGMM; +} GMMR0SHMODPERVMDTORARGS; + +/** + * Argument packet for gmmR0CheckSharedModule. + */ +typedef struct GMMCHECKSHAREDMODULEINFO +{ + PGVM pGVM; + VMCPUID idCpu; +} GMMCHECKSHAREDMODULEINFO; + +/** + * Argument packet for gmmR0FindDupPageInChunk by GMMR0FindDuplicatePage. 
+ */ +typedef struct GMMFINDDUPPAGEINFO +{ + PGVM pGVM; + PGMM pGMM; + uint8_t *pSourcePage; + bool fFoundDuplicate; +} GMMFINDDUPPAGEINFO; + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +/** Pointer to the GMM instance data. */ +static PGMM g_pGMM = NULL; + +/** Macro for obtaining and validating the g_pGMM pointer. + * + * On failure it will return from the invoking function with the specified + * return value. 
+ * + * @param pGMM The name of the pGMM variable. + * @param rc The return value on failure. Use VERR_GMM_INSTANCE for VBox + * status codes. + */ +#define GMM_GET_VALID_INSTANCE(pGMM, rc) \ + do { \ + (pGMM) = g_pGMM; \ + AssertPtrReturn((pGMM), (rc)); \ + AssertMsgReturn((pGMM)->u32Magic == GMM_MAGIC, ("%p - %#x\n", (pGMM), (pGMM)->u32Magic), (rc)); \ + } while (0) + +/** Macro for obtaining and validating the g_pGMM pointer, void function + * variant. + * + * On failure it will return from the invoking function. + * + * @param pGMM The name of the pGMM variable. + */ +#define GMM_GET_VALID_INSTANCE_VOID(pGMM) \ + do { \ + (pGMM) = g_pGMM; \ + AssertPtrReturnVoid((pGMM)); \ + AssertMsgReturnVoid((pGMM)->u32Magic == GMM_MAGIC, ("%p - %#x\n", (pGMM), (pGMM)->u32Magic)); \ + } while (0) + + +/** @def GMM_CHECK_SANITY_UPON_ENTERING + * Checks the sanity of the GMM instance data before making changes. + * + * This macro is a stub by default (note the trailing "&& 0" in the condition + * below) and must be enabled manually in the code. + * + * @returns true if sane, false if not. + * @param pGMM The name of the pGMM variable. + */ +#if defined(VBOX_STRICT) && defined(GMMR0_WITH_SANITY_CHECK) && 0 +# define GMM_CHECK_SANITY_UPON_ENTERING(pGMM) (gmmR0SanityCheck((pGMM), __PRETTY_FUNCTION__, __LINE__) == 0) +#else +# define GMM_CHECK_SANITY_UPON_ENTERING(pGMM) (true) +#endif + +/** @def GMM_CHECK_SANITY_UPON_LEAVING + * Checks the sanity of the GMM instance data after making changes. + * + * This macro is a stub by default (note the trailing "&& 0" in the condition + * below) and must be enabled manually in the code. + * + * @returns true if sane, false if not. + * @param pGMM The name of the pGMM variable. 
+ */ +#if defined(VBOX_STRICT) && defined(GMMR0_WITH_SANITY_CHECK) && 0 +# define GMM_CHECK_SANITY_UPON_LEAVING(pGMM) (gmmR0SanityCheck((pGMM), __PRETTY_FUNCTION__, __LINE__) == 0) +#else +# define GMM_CHECK_SANITY_UPON_LEAVING(pGMM) (true) +#endif + +/** @def GMM_CHECK_SANITY_IN_LOOPS + * Checks the sanity of the GMM instance in the allocation loops. 
+ * + * This macro is a stub by default (note the trailing "&& 0" in the condition + * below) and must be enabled manually in the code. + * + * @returns true if sane, false if not. + * @param pGMM The name of the pGMM variable. + */ +#if defined(VBOX_STRICT) && defined(GMMR0_WITH_SANITY_CHECK) && 0 +# define GMM_CHECK_SANITY_IN_LOOPS(pGMM) (gmmR0SanityCheck((pGMM), __PRETTY_FUNCTION__, __LINE__) == 0) +#else +# define GMM_CHECK_SANITY_IN_LOOPS(pGMM) (true) +#endif + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +static DECLCALLBACK(int) gmmR0TermDestroyChunk(PAVLU32NODECORE pNode, void *pvGMM); +static bool gmmR0CleanupVMScanChunk(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk); +DECLINLINE(void) gmmR0UnlinkChunk(PGMMCHUNK pChunk); +DECLINLINE(void) gmmR0LinkChunk(PGMMCHUNK pChunk, PGMMCHUNKFREESET pSet); +DECLINLINE(void) gmmR0SelectSetAndLinkChunk(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk); +#ifdef GMMR0_WITH_SANITY_CHECK +static uint32_t gmmR0SanityCheck(PGMM pGMM, const char *pszFunction, unsigned uLineNo); +#endif +static bool gmmR0FreeChunk(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk, bool fRelaxedSem); +DECLINLINE(void) gmmR0FreePrivatePage(PGMM pGMM, PGVM pGVM, uint32_t idPage, PGMMPAGE pPage); +DECLINLINE(void) gmmR0FreeSharedPage(PGMM pGMM, PGVM pGVM, uint32_t idPage, PGMMPAGE pPage); +static int gmmR0UnmapChunkLocked(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk); +#ifdef VBOX_WITH_PAGE_SHARING +static void gmmR0SharedModuleCleanup(PGMM pGMM, PGVM pGVM); +# ifdef VBOX_STRICT +static uint32_t gmmR0StrictPageChecksum(PGMM pGMM, PGVM pGVM, uint32_t idPage); +# endif +#endif + + + +/** + * Initializes the GMM component. + * + * This is called when the VMMR0.r0 module is loaded and protected by the + * loader semaphore. + * + * @returns VBox status code. 
+ */ +GMMR0DECL(int) GMMR0Init(void) +{ + LogFlow(("GMMInit:\n")); + + /* + * Allocate the instance data and the locks. + */ + PGMM pGMM = (PGMM)RTMemAllocZ(sizeof(*pGMM)); + if (!pGMM) + return VERR_NO_MEMORY; + + pGMM->u32Magic = GMM_MAGIC; + for (unsigned i = 0; i < RT_ELEMENTS(pGMM->ChunkTLB.aEntries); i++) + pGMM->ChunkTLB.aEntries[i].idChunk = NIL_GMM_CHUNKID; + RTListInit(&pGMM->ChunkList); + ASMBitSet(&pGMM->bmChunkId[0], NIL_GMM_CHUNKID); /* Mark the NIL chunk ID as allocated in the ID bitmap. */ + +#ifdef VBOX_USE_CRIT_SECT_FOR_GIANT + int rc = RTCritSectInit(&pGMM->GiantCritSect); +#else + int rc = RTSemFastMutexCreate(&pGMM->hMtx); +#endif + if (RT_SUCCESS(rc)) + { + unsigned iMtx; + for (iMtx = 0; iMtx < RT_ELEMENTS(pGMM->aChunkMtx); iMtx++) + { + rc = RTSemFastMutexCreate(&pGMM->aChunkMtx[iMtx].hMtx); + if (RT_FAILURE(rc)) + break; + } + if (RT_SUCCESS(rc)) + { + /* + * Check and see if RTR0MemObjAllocPhysNC works. + * The runtime probe is compiled out; the modes are chosen per + * host OS / architecture in the #else branch instead. + */ +#if 0 /* later, see @bugref{3170}. */ + RTR0MEMOBJ MemObj; + rc = RTR0MemObjAllocPhysNC(&MemObj, _64K, NIL_RTHCPHYS); + if (RT_SUCCESS(rc)) + { + rc = RTR0MemObjFree(MemObj, true); + AssertRC(rc); + } + else if (rc == VERR_NOT_SUPPORTED) + pGMM->fLegacyAllocationMode = pGMM->fBoundMemoryMode = true; + else + SUPR0Printf("GMMR0Init: RTR0MemObjAllocPhysNC(,64K,Any) -> %d!\n", rc); +#else +# if defined(RT_OS_WINDOWS) || (defined(RT_OS_SOLARIS) && ARCH_BITS == 64) || defined(RT_OS_LINUX) || defined(RT_OS_FREEBSD) + pGMM->fLegacyAllocationMode = false; +# if ARCH_BITS == 32 + /* Don't reuse possibly partial chunks because of the virtual + address space limitation. 
*/ + pGMM->fBoundMemoryMode = true; +# else + pGMM->fBoundMemoryMode = false; +# endif +# else + pGMM->fLegacyAllocationMode = true; + pGMM->fBoundMemoryMode = true; +# endif +#endif + + /* + * Query system page count and guess a reasonable cMaxPages value. + */ + pGMM->cMaxPages = UINT32_MAX; /** @todo IPRT function for query ram size and such. 
*/ + + g_pGMM = pGMM; + LogFlow(("GMMInit: pGMM=%p fLegacyAllocationMode=%RTbool fBoundMemoryMode=%RTbool\n", pGMM, pGMM->fLegacyAllocationMode, pGMM->fBoundMemoryMode)); + return VINF_SUCCESS; + } + + /* + * Bail out: destroy the chunk mutexes created so far (indices below + * iMtx) and then the giant lock. + */ + while (iMtx-- > 0) + RTSemFastMutexDestroy(pGMM->aChunkMtx[iMtx].hMtx); +#ifdef VBOX_USE_CRIT_SECT_FOR_GIANT + RTCritSectDelete(&pGMM->GiantCritSect); +#else + RTSemFastMutexDestroy(pGMM->hMtx); +#endif + } + + pGMM->u32Magic = 0; + RTMemFree(pGMM); + SUPR0Printf("GMMR0Init: failed! rc=%d\n", rc); + return rc; +} + + +/** + * Terminates the GMM component. + * + * Returns without doing anything if g_pGMM is not a valid pointer or the + * instance magic does not match GMM_MAGIC. + */ +GMMR0DECL(void) GMMR0Term(void) +{ + LogFlow(("GMMTerm:\n")); + + /* + * Take care / be paranoid... + */ + PGMM pGMM = g_pGMM; + if (!VALID_PTR(pGMM)) + return; + if (pGMM->u32Magic != GMM_MAGIC) + { + SUPR0Printf("GMMR0Term: u32Magic=%#x\n", pGMM->u32Magic); + return; + } + + /* + * Undo what init did and free all the resources we've acquired. + */ + /* Destroy the fundamentals. */ + g_pGMM = NULL; + pGMM->u32Magic = ~GMM_MAGIC; +#ifdef VBOX_USE_CRIT_SECT_FOR_GIANT + RTCritSectDelete(&pGMM->GiantCritSect); +#else + RTSemFastMutexDestroy(pGMM->hMtx); + pGMM->hMtx = NIL_RTSEMFASTMUTEX; +#endif + + /* Free any chunks still hanging around. */ + RTAvlU32Destroy(&pGMM->pChunks, gmmR0TermDestroyChunk, pGMM); + + /* Destroy the chunk locks. */ + for (unsigned iMtx = 0; iMtx < RT_ELEMENTS(pGMM->aChunkMtx); iMtx++) + { + Assert(pGMM->aChunkMtx[iMtx].cUsers == 0); + RTSemFastMutexDestroy(pGMM->aChunkMtx[iMtx].hMtx); + pGMM->aChunkMtx[iMtx].hMtx = NIL_RTSEMFASTMUTEX; + } + + /* Finally the instance data itself. */ + RTMemFree(pGMM); + LogFlow(("GMMTerm: done\n")); +} + + +/** + * RTAvlU32Destroy callback. + * + * @returns 0 + * @param pNode The node to destroy. + * @param pvGMM The GMM handle. 
+ */ +static DECLCALLBACK(int) gmmR0TermDestroyChunk(PAVLU32NODECORE pNode, void *pvGMM) +{ + PGMMCHUNK pChunk = (PGMMCHUNK)pNode; /* Core (the AVL node) is the first member of GMMCHUNK. */ + + if (pChunk->cFree != (GMM_CHUNK_SIZE >> PAGE_SHIFT)) + SUPR0Printf("GMMR0Term: %RKv/%#x: cFree=%d cPrivate=%d cShared=%d cMappings=%d\n", pChunk, + pChunk->Core.Key, pChunk->cFree, pChunk->cPrivate, pChunk->cShared, pChunk->cMappingsX); + + int rc = RTR0MemObjFree(pChunk->hMemObj, true /* fFreeMappings */); + if (RT_FAILURE(rc)) /* NOTE(review): the message below spells the call "RTRMemObjFree"; the call made is RTR0MemObjFree. */ + { + SUPR0Printf("GMMR0Term: %RKv/%#x: RTRMemObjFree(%RKv,true) -> %d (cMappings=%d)\n", pChunk, + pChunk->Core.Key, pChunk->hMemObj, rc, pChunk->cMappingsX); + AssertRC(rc); + } + pChunk->hMemObj = NIL_RTR0MEMOBJ; + + RTMemFree(pChunk->paMappingsX); + pChunk->paMappingsX = NULL; + + RTMemFree(pChunk); + NOREF(pvGMM); + return 0; +} + + +/** + * Initializes the per-VM data for the GMM. + * + * This is called from within the GVMM lock (from GVMMR0CreateVM) + * and should only initialize the data members so GMMR0CleanupVM + * can deal with them. We reserve no memory or anything here, + * that's done later in GMMR0InitVM. + * + * The policy and priority are set to their INVALID values and allocation + * is disallowed (fMayAllocate = false). + * + * @param pGVM Pointer to the Global VM structure. + */ +GMMR0DECL(void) GMMR0InitPerVMData(PGVM pGVM) +{ + AssertCompile(RT_SIZEOFMEMB(GVM,gmm.s) <= RT_SIZEOFMEMB(GVM,gmm.padding)); + + pGVM->gmm.s.Stats.enmPolicy = GMMOCPOLICY_INVALID; + pGVM->gmm.s.Stats.enmPriority = GMMPRIORITY_INVALID; + pGVM->gmm.s.Stats.fMayAllocate = false; +} + + +/** + * Acquires the GMM giant lock. + * + * @returns Assert status code from RTSemFastMutexRequest. + * @param pGMM Pointer to the GMM instance. 
+ */ +static int gmmR0MutexAcquire(PGMM pGMM) +{ + ASMAtomicIncU32(&pGMM->cMtxContenders); +#ifdef VBOX_USE_CRIT_SECT_FOR_GIANT + int rc = RTCritSectEnter(&pGMM->GiantCritSect); +#else + int rc = RTSemFastMutexRequest(pGMM->hMtx); +#endif + ASMAtomicDecU32(&pGMM->cMtxContenders); + AssertRC(rc); +#ifdef VBOX_STRICT + pGMM->hMtxOwner = RTThreadNativeSelf(); +#endif + return rc; +} + + +/** + * Releases the GMM giant lock. + * + * @returns Status code from RTSemFastMutexRelease, or from RTCritSectLeave + * when VBOX_USE_CRIT_SECT_FOR_GIANT is defined (only the fast + * mutex variant asserts on it). + * @param pGMM Pointer to the GMM instance. + */ +static int gmmR0MutexRelease(PGMM pGMM) +{ +#ifdef VBOX_STRICT + pGMM->hMtxOwner = NIL_RTNATIVETHREAD; +#endif +#ifdef VBOX_USE_CRIT_SECT_FOR_GIANT + int rc = RTCritSectLeave(&pGMM->GiantCritSect); +#else + int rc = RTSemFastMutexRelease(pGMM->hMtx); + AssertRC(rc); +#endif + return rc; +} + + +/** + * Yields the GMM giant lock if there is contention and a certain minimum time + * has elapsed since we took it. + * + * @returns @c true if the mutex was yielded, @c false if not. + * @param pGMM Pointer to the GMM instance. + * @param puLockNanoTS Where the lock acquisition time stamp is kept + * (in/out). Updated to the re-acquisition time when + * the lock was yielded. + */ +static bool gmmR0MutexYield(PGMM pGMM, uint64_t *puLockNanoTS) +{ + /* + * If nobody is contending the mutex, don't bother checking the time. + */ + if (ASMAtomicReadU32(&pGMM->cMtxContenders) == 0) + return false; + + /* + * Don't yield if we haven't executed for at least 2 milliseconds. + */ + uint64_t uNanoNow = RTTimeSystemNanoTS(); + if (uNanoNow - *puLockNanoTS < UINT32_C(2000000)) + return false; + + /* + * Yield the mutex. We count ourselves as a contender while we are + * outside the lock. 
+ */ +#ifdef VBOX_STRICT + pGMM->hMtxOwner = NIL_RTNATIVETHREAD; +#endif + ASMAtomicIncU32(&pGMM->cMtxContenders); +#ifdef VBOX_USE_CRIT_SECT_FOR_GIANT + int rc1 = RTCritSectLeave(&pGMM->GiantCritSect); AssertRC(rc1); +#else + int rc1 = RTSemFastMutexRelease(pGMM->hMtx); AssertRC(rc1); +#endif + + RTThreadYield(); + +#ifdef VBOX_USE_CRIT_SECT_FOR_GIANT + int rc2 = RTCritSectEnter(&pGMM->GiantCritSect); AssertRC(rc2); +#else + int rc2 = RTSemFastMutexRequest(pGMM->hMtx); AssertRC(rc2); +#endif + *puLockNanoTS = RTTimeSystemNanoTS(); + ASMAtomicDecU32(&pGMM->cMtxContenders); +#ifdef VBOX_STRICT + pGMM->hMtxOwner = RTThreadNativeSelf(); +#endif + + return true; +} + + +/** + * Acquires a chunk lock. + * + * The caller must own the giant lock. Unless @a fFlags is + * GMMR0CHUNK_MTX_KEEP_GIANT, the giant lock is released before the chunk + * mutex is requested. + * + * @returns Assert status code from RTSemFastMutexRequest. + * @param pMtxState The chunk mutex state info. (Avoids + * passing the same flags and stuff around + * for subsequent release and drop-giant + * calls.) + * @param pGMM Pointer to the GMM instance. + * @param pChunk Pointer to the chunk. + * @param fFlags Flags regarding the giant lock, GMMR0CHUNK_MTX_XXX. + */ +static int gmmR0ChunkMutexAcquire(PGMMR0CHUNKMTXSTATE pMtxState, PGMM pGMM, PGMMCHUNK pChunk, uint32_t fFlags) +{ + Assert(fFlags > GMMR0CHUNK_MTX_INVALID && fFlags < GMMR0CHUNK_MTX_END); + Assert(pGMM->hMtxOwner == RTThreadNativeSelf()); + + pMtxState->pGMM = pGMM; + pMtxState->fFlags = (uint8_t)fFlags; + + /* + * Get the lock index and reference the lock. + */ + Assert(pGMM->hMtxOwner == RTThreadNativeSelf()); + uint32_t iChunkMtx = pChunk->iChunkMtx; + if (iChunkMtx == UINT8_MAX) + { + iChunkMtx = pGMM->iNextChunkMtx++; + iChunkMtx %= RT_ELEMENTS(pGMM->aChunkMtx); + + /* Try to get an unused one; after three more attempts the last + candidate is used even if it already has users. 
*/ + if (pGMM->aChunkMtx[iChunkMtx].cUsers) + { + iChunkMtx = pGMM->iNextChunkMtx++; + iChunkMtx %= RT_ELEMENTS(pGMM->aChunkMtx); + if (pGMM->aChunkMtx[iChunkMtx].cUsers) + { + iChunkMtx = pGMM->iNextChunkMtx++; + iChunkMtx %= RT_ELEMENTS(pGMM->aChunkMtx); + if (pGMM->aChunkMtx[iChunkMtx].cUsers) + { + iChunkMtx = pGMM->iNextChunkMtx++; + iChunkMtx %= RT_ELEMENTS(pGMM->aChunkMtx); + } + } + } + + pChunk->iChunkMtx = iChunkMtx; + } + AssertCompile(RT_ELEMENTS(pGMM->aChunkMtx) < UINT8_MAX); + pMtxState->iChunkMtx = (uint8_t)iChunkMtx; + ASMAtomicIncU32(&pGMM->aChunkMtx[iChunkMtx].cUsers); + + /* + * Drop the giant? + */ + if (fFlags != GMMR0CHUNK_MTX_KEEP_GIANT) + { + /** @todo GMM life cycle cleanup (we may race someone + * destroying and cleaning up GMM)? */ + gmmR0MutexRelease(pGMM); + } + + /* + * Take the chunk mutex. + */ + int rc = RTSemFastMutexRequest(pGMM->aChunkMtx[iChunkMtx].hMtx); + AssertRC(rc); + return rc; +} + + +/** + * Releases a chunk lock taken by gmmR0ChunkMutexAcquire. + * + * With GMMR0CHUNK_MTX_RETAKE_GIANT the giant lock is re-acquired before + * returning. + * + * @returns Status code from RTSemFastMutexRelease, or from the giant lock + * acquire/release calls made afterwards. + * @param pMtxState Pointer to the chunk mutex state. Its pGMM member is + * set to NULL on return. + * @param pChunk Pointer to the chunk if it's still + * alive, NULL if it isn't. This is used to disassociate + * the chunk from the mutex on the way out so a new one + * can be selected next time, thus avoiding contended + * mutexes. + */ +static int gmmR0ChunkMutexRelease(PGMMR0CHUNKMTXSTATE pMtxState, PGMMCHUNK pChunk) +{ + PGMM pGMM = pMtxState->pGMM; + + /* + * Release the chunk mutex and reacquire the giant if requested. 
+ */ + int rc = RTSemFastMutexRelease(pGMM->aChunkMtx[pMtxState->iChunkMtx].hMtx); + AssertRC(rc); + if (pMtxState->fFlags == GMMR0CHUNK_MTX_RETAKE_GIANT) + rc = gmmR0MutexAcquire(pGMM); + else + Assert((pMtxState->fFlags != GMMR0CHUNK_MTX_DROP_GIANT) == (pGMM->hMtxOwner == RTThreadNativeSelf())); + + /* + * Drop the chunk mutex user reference and deassociate it from the chunk + * when possible. 
+ */ + if ( ASMAtomicDecU32(&pGMM->aChunkMtx[pMtxState->iChunkMtx].cUsers) == 0 + && pChunk + && RT_SUCCESS(rc) ) + { + if (pMtxState->fFlags != GMMR0CHUNK_MTX_DROP_GIANT) + pChunk->iChunkMtx = UINT8_MAX; + else + { + rc = gmmR0MutexAcquire(pGMM); + if (RT_SUCCESS(rc)) + { + if (pGMM->aChunkMtx[pMtxState->iChunkMtx].cUsers == 0) + pChunk->iChunkMtx = UINT8_MAX; + rc = gmmR0MutexRelease(pGMM); + } + } + } + + pMtxState->pGMM = NULL; + return rc; +} + + +/** + * Drops the giant GMM lock we kept in gmmR0ChunkMutexAcquire while keeping the + * chunk locked. + * + * This only works if gmmR0ChunkMutexAcquire was called with + * GMMR0CHUNK_MTX_KEEP_GIANT. gmmR0ChunkMutexRelease will retake the giant + * mutex, i.e. behave as if GMMR0CHUNK_MTX_RETAKE_GIANT was used. + * + * @returns VBox status code (assuming success is ok). VERR_GMM_MTX_FLAGS if + * the state was not acquired with GMMR0CHUNK_MTX_KEEP_GIANT. + * @param pMtxState Pointer to the chunk mutex state. + */ +static int gmmR0ChunkMutexDropGiant(PGMMR0CHUNKMTXSTATE pMtxState) +{ + AssertReturn(pMtxState->fFlags == GMMR0CHUNK_MTX_KEEP_GIANT, VERR_GMM_MTX_FLAGS); + Assert(pMtxState->pGMM->hMtxOwner == RTThreadNativeSelf()); + pMtxState->fFlags = GMMR0CHUNK_MTX_RETAKE_GIANT; + /** @todo GMM life cycle cleanup (we may race someone + * destroying and cleaning up GMM)? */ + return gmmR0MutexRelease(pMtxState->pGMM); +} + + +/** + * For experimenting with NUMA affinity and such. + * + * @returns The current NUMA Node ID. As compiled this is always + * GMM_CHUNK_NUMA_ID_UNKNOWN; the RTMpCpuId based variant is + * disabled. + */ +static uint16_t gmmR0GetCurrentNumaNodeId(void) +{ +#if 1 + return GMM_CHUNK_NUMA_ID_UNKNOWN; +#else + return RTMpCpuId() / 16; +#endif +} + + + +/** + * Cleans up when a VM is terminating. + * + * @param pGVM Pointer to the Global VM structure. 
+ */ +GMMR0DECL(void) GMMR0CleanupVM(PGVM pGVM) +{ + LogFlow(("GMMR0CleanupVM: pGVM=%p:{.pVM=%p, .hSelf=%#x}\n", pGVM, pGVM->pVM, pGVM->hSelf)); + + PGMM pGMM; + GMM_GET_VALID_INSTANCE_VOID(pGMM); + +#ifdef VBOX_WITH_PAGE_SHARING + /* + * Clean up all registered shared modules first. 
+ */ + gmmR0SharedModuleCleanup(pGMM, pGVM); +#endif + + gmmR0MutexAcquire(pGMM); + uint64_t uLockNanoTS = RTTimeSystemNanoTS(); + GMM_CHECK_SANITY_UPON_ENTERING(pGMM); + + /* + * The policy is 'INVALID' until the initial reservation + * request has been serviced. + */ + if ( pGVM->gmm.s.Stats.enmPolicy > GMMOCPOLICY_INVALID + && pGVM->gmm.s.Stats.enmPolicy < GMMOCPOLICY_END) + { + /* + * If it's the last VM around, we can skip walking all the chunk looking + * for the pages owned by this VM and instead flush the whole shebang. + * + * This takes care of the eventuality that a VM has left shared page + * references behind (shouldn't happen of course, but you never know). + */ + Assert(pGMM->cRegisteredVMs); + pGMM->cRegisteredVMs--; + + /* + * Walk the entire pool looking for pages that belong to this VM + * and leftover mappings. (This'll only catch private pages, + * shared pages will be 'left behind'.) + */ + /** @todo r=bird: This scanning+freeing could be optimized in bound mode! */ + uint64_t cPrivatePages = pGVM->gmm.s.Stats.cPrivatePages; /* save */ + + unsigned iCountDown = 64; + bool fRedoFromStart; + PGMMCHUNK pChunk; + do + { + fRedoFromStart = false; + RTListForEachReverse(&pGMM->ChunkList, pChunk, GMMCHUNK, ListNode) + { + uint32_t const cFreeChunksOld = pGMM->cFreedChunks; + if ( ( !pGMM->fBoundMemoryMode + || pChunk->hGVM == pGVM->hSelf) + && gmmR0CleanupVMScanChunk(pGMM, pGVM, pChunk)) + { + /* We left the giant mutex, so reset the yield counters. */ + uLockNanoTS = RTTimeSystemNanoTS(); + iCountDown = 64; + } + else + { + /* Didn't leave it, so do normal yielding. 
*/ + if (!iCountDown) + gmmR0MutexYield(pGMM, &uLockNanoTS); + else + iCountDown--; + } + if (pGMM->cFreedChunks != cFreeChunksOld) + { + fRedoFromStart = true; + break; + } + } + } while (fRedoFromStart); + + if (pGVM->gmm.s.Stats.cPrivatePages) + SUPR0Printf("GMMR0CleanupVM: hGVM=%#x has %#x private pages that cannot be found!\n", pGVM->hSelf, pGVM->gmm.s.Stats.cPrivatePages); + + pGMM->cAllocatedPages -= cPrivatePages; + + /* + * Free empty chunks. + */ + PGMMCHUNKFREESET pPrivateSet = pGMM->fBoundMemoryMode ? &pGVM->gmm.s.Private : &pGMM->PrivateX; + do + { + fRedoFromStart = false; + iCountDown = 10240; + pChunk = pPrivateSet->apLists[GMM_CHUNK_FREE_SET_UNUSED_LIST]; + while (pChunk) + { + PGMMCHUNK pNext = pChunk->pFreeNext; + Assert(pChunk->cFree == GMM_CHUNK_NUM_PAGES); + if ( !pGMM->fBoundMemoryMode + || pChunk->hGVM == pGVM->hSelf) + { + uint64_t const idGenerationOld = pPrivateSet->idGeneration; + if (gmmR0FreeChunk(pGMM, pGVM, pChunk, true /*fRelaxedSem*/)) + { + /* We've left the giant mutex, restart? (+1 for our unlink) */ + fRedoFromStart = pPrivateSet->idGeneration != idGenerationOld + 1; + if (fRedoFromStart) + break; + uLockNanoTS = RTTimeSystemNanoTS(); + iCountDown = 10240; + } + } + + /* Advance and maybe yield the lock. */ + pChunk = pNext; + if (--iCountDown == 0) + { + uint64_t const idGenerationOld = pPrivateSet->idGeneration; + fRedoFromStart = gmmR0MutexYield(pGMM, &uLockNanoTS) + && pPrivateSet->idGeneration != idGenerationOld; + if (fRedoFromStart) + break; + iCountDown = 10240; + } + } + } while (fRedoFromStart); + + /* + * Account for shared pages that weren't freed. + */ + if (pGVM->gmm.s.Stats.cSharedPages) + { + Assert(pGMM->cSharedPages >= pGVM->gmm.s.Stats.cSharedPages); + SUPR0Printf("GMMR0CleanupVM: hGVM=%#x left %#x shared pages behind!\n", pGVM->hSelf, pGVM->gmm.s.Stats.cSharedPages); + pGMM->cLeftBehindSharedPages += pGVM->gmm.s.Stats.cSharedPages; + } + + /* + * Clean up balloon statistics in case the VM process crashed. 
+ */ + Assert(pGMM->cBalloonedPages >= pGVM->gmm.s.Stats.cBalloonedPages); + pGMM->cBalloonedPages -= pGVM->gmm.s.Stats.cBalloonedPages; + + /* + * Update the over-commitment management statistics. + */ + pGMM->cReservedPages -= pGVM->gmm.s.Stats.Reserved.cBasePages + + pGVM->gmm.s.Stats.Reserved.cFixedPages + + pGVM->gmm.s.Stats.Reserved.cShadowPages; + switch (pGVM->gmm.s.Stats.enmPolicy) + { + case GMMOCPOLICY_NO_OC: + break; + default: + /** @todo Update GMM->cOverCommittedPages */ + break; + } + } + + /* zap the GVM data. */ + pGVM->gmm.s.Stats.enmPolicy = GMMOCPOLICY_INVALID; + pGVM->gmm.s.Stats.enmPriority = GMMPRIORITY_INVALID; + pGVM->gmm.s.Stats.fMayAllocate = false; + + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + gmmR0MutexRelease(pGMM); + + LogFlow(("GMMR0CleanupVM: returns\n")); +} + + +/** + * Scan one chunk for private pages belonging to the specified VM. + * + * @note This function may drop the giant mutex! + * + * @returns @c true if we've temporarily dropped the giant mutex, @c false if + * we didn't. + * @param pGMM Pointer to the GMM instance. + * @param pGVM The global VM handle. + * @param pChunk The chunk to scan. + */ +static bool gmmR0CleanupVMScanChunk(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk) +{ + Assert(!pGMM->fBoundMemoryMode || pChunk->hGVM == pGVM->hSelf); + + /* + * Look for pages belonging to the VM. + * (Perform some internal checks while we're scanning.) + */ +#ifndef VBOX_STRICT + if (pChunk->cFree != (GMM_CHUNK_SIZE >> PAGE_SHIFT)) +#endif + { + unsigned cPrivate = 0; + unsigned cShared = 0; + unsigned cFree = 0; + + gmmR0UnlinkChunk(pChunk); /* avoiding cFreePages updates. */ + + uint16_t hGVM = pGVM->hSelf; + unsigned iPage = (GMM_CHUNK_SIZE >> PAGE_SHIFT); + while (iPage-- > 0) + if (GMM_PAGE_IS_PRIVATE(&pChunk->aPages[iPage])) + { + if (pChunk->aPages[iPage].Private.hGVM == hGVM) + { + /* + * Free the page. 
+ * + * The reason for not using gmmR0FreePrivatePage here is that we + * must *not* cause the chunk to be freed from under us - we're in + * an AVL tree walk here. + */ + pChunk->aPages[iPage].u = 0; + pChunk->aPages[iPage].Free.iNext = pChunk->iFreeHead; + pChunk->aPages[iPage].Free.u2State = GMM_PAGE_STATE_FREE; + pChunk->iFreeHead = iPage; + pChunk->cPrivate--; + pChunk->cFree++; + pGVM->gmm.s.Stats.cPrivatePages--; + cFree++; + } + else + cPrivate++; + } + else if (GMM_PAGE_IS_FREE(&pChunk->aPages[iPage])) + cFree++; + else + cShared++; + + gmmR0SelectSetAndLinkChunk(pGMM, pGVM, pChunk); + + /* + * Did it add up? + */ + if (RT_UNLIKELY( pChunk->cFree != cFree + || pChunk->cPrivate != cPrivate + || pChunk->cShared != cShared)) + { + SUPR0Printf("gmmR0CleanupVMScanChunk: Chunk %RKv/%#x has bogus stats - free=%d/%d private=%d/%d shared=%d/%d\n", + pChunk, pChunk->Core.Key, pChunk->cFree, cFree, pChunk->cPrivate, cPrivate, pChunk->cShared, cShared); + pChunk->cFree = cFree; + pChunk->cPrivate = cPrivate; + pChunk->cShared = cShared; + } + } + + /* + * If not in bound memory mode, we should reset the hGVM field + * if it has our handle in it. + */ + if (pChunk->hGVM == pGVM->hSelf) + { + if (!g_pGMM->fBoundMemoryMode) + pChunk->hGVM = NIL_GVM_HANDLE; + else if (pChunk->cFree != GMM_CHUNK_NUM_PAGES) + { + SUPR0Printf("gmmR0CleanupVMScanChunk: %RKv/%#x: cFree=%#x - it should be 0 in bound mode!\n", + pChunk, pChunk->Core.Key, pChunk->cFree); + AssertMsgFailed(("%p/%#x: cFree=%#x - it should be 0 in bound mode!\n", pChunk, pChunk->Core.Key, pChunk->cFree)); + + gmmR0UnlinkChunk(pChunk); + pChunk->cFree = GMM_CHUNK_NUM_PAGES; + gmmR0SelectSetAndLinkChunk(pGMM, pGVM, pChunk); + } + } + + /* + * Look for a mapping belonging to the terminating VM. 
+ */ + GMMR0CHUNKMTXSTATE MtxState; + gmmR0ChunkMutexAcquire(&MtxState, pGMM, pChunk, GMMR0CHUNK_MTX_KEEP_GIANT); + unsigned cMappings = pChunk->cMappingsX; + for (unsigned i = 0; i < cMappings; i++) + if (pChunk->paMappingsX[i].pGVM == pGVM) + { + gmmR0ChunkMutexDropGiant(&MtxState); + + RTR0MEMOBJ hMemObj = pChunk->paMappingsX[i].hMapObj; + + cMappings--; + if (i < cMappings) + pChunk->paMappingsX[i] = pChunk->paMappingsX[cMappings]; + pChunk->paMappingsX[cMappings].pGVM = NULL; + pChunk->paMappingsX[cMappings].hMapObj = NIL_RTR0MEMOBJ; + Assert(pChunk->cMappingsX - 1U == cMappings); + pChunk->cMappingsX = cMappings; + + int rc = RTR0MemObjFree(hMemObj, false /* fFreeMappings (NA) */); + if (RT_FAILURE(rc)) + { + SUPR0Printf("gmmR0CleanupVMScanChunk: %RKv/%#x: mapping #%x: RTRMemObjFree(%RKv,false) -> %d \n", + pChunk, pChunk->Core.Key, i, hMemObj, rc); + AssertRC(rc); + } + + gmmR0ChunkMutexRelease(&MtxState, pChunk); + return true; + } + + gmmR0ChunkMutexRelease(&MtxState, pChunk); + return false; +} + + +/** + * The initial resource reservations. + * + * This will make memory reservations according to policy and priority. If there aren't + * sufficient resources available to sustain the VM this function will fail and all + * future allocations requests will fail as well. + * + * These are just the initial reservations made very very early during the VM creation + * process and will be adjusted later in the GMMR0UpdateReservation call after the + * ring-3 init has completed. + * + * @returns VBox status code. + * @retval VERR_GMM_MEMORY_RESERVATION_DECLINED + * @retval VERR_GMM_ + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id - must be zero. + * @param cBasePages The number of pages that may be allocated for the base RAM and ROMs. + * This does not include MMIO2 and similar. + * @param cShadowPages The number of pages that may be allocated for shadow paging structures. 
+ * @param cFixedPages The number of pages that may be allocated for fixed objects like the + * hyper heap, MMIO2 and similar. + * @param enmPolicy The OC policy to use on this VM. + * @param enmPriority The priority in an out-of-memory situation. + * + * @thread The creator thread / EMT(0). + */ +GMMR0DECL(int) GMMR0InitialReservation(PGVM pGVM, PVM pVM, VMCPUID idCpu, uint64_t cBasePages, uint32_t cShadowPages, + uint32_t cFixedPages, GMMOCPOLICY enmPolicy, GMMPRIORITY enmPriority) +{ + LogFlow(("GMMR0InitialReservation: pGVM=%p pVM=%p cBasePages=%#llx cShadowPages=%#x cFixedPages=%#x enmPolicy=%d enmPriority=%d\n", + pGVM, pVM, cBasePages, cShadowPages, cFixedPages, enmPolicy, enmPriority)); + + /* + * Validate, get basics and take the semaphore. + */ + AssertReturn(idCpu == 0, VERR_INVALID_CPU_ID); + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + + AssertReturn(cBasePages, VERR_INVALID_PARAMETER); + AssertReturn(cShadowPages, VERR_INVALID_PARAMETER); + AssertReturn(cFixedPages, VERR_INVALID_PARAMETER); + AssertReturn(enmPolicy > GMMOCPOLICY_INVALID && enmPolicy < GMMOCPOLICY_END, VERR_INVALID_PARAMETER); + AssertReturn(enmPriority > GMMPRIORITY_INVALID && enmPriority < GMMPRIORITY_END, VERR_INVALID_PARAMETER); + + gmmR0MutexAcquire(pGMM); + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + if ( !pGVM->gmm.s.Stats.Reserved.cBasePages + && !pGVM->gmm.s.Stats.Reserved.cFixedPages + && !pGVM->gmm.s.Stats.Reserved.cShadowPages) + { + /* + * Check if we can accommodate this. + */ + /* ... later ... */ + if (RT_SUCCESS(rc)) + { + /* + * Update the records. 
+ */ + pGVM->gmm.s.Stats.Reserved.cBasePages = cBasePages; + pGVM->gmm.s.Stats.Reserved.cFixedPages = cFixedPages; + pGVM->gmm.s.Stats.Reserved.cShadowPages = cShadowPages; + pGVM->gmm.s.Stats.enmPolicy = enmPolicy; + pGVM->gmm.s.Stats.enmPriority = enmPriority; + pGVM->gmm.s.Stats.fMayAllocate = true; + + pGMM->cReservedPages += cBasePages + cFixedPages + cShadowPages; + pGMM->cRegisteredVMs++; + } + } + else + rc = VERR_WRONG_ORDER; + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + } + else + rc = VERR_GMM_IS_NOT_SANE; + gmmR0MutexRelease(pGMM); + LogFlow(("GMMR0InitialReservation: returns %Rrc\n", rc)); + return rc; +} + + +/** + * VMMR0 request wrapper for GMMR0InitialReservation. + * + * @returns see GMMR0InitialReservation. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param pReq Pointer to the request packet. + */ +GMMR0DECL(int) GMMR0InitialReservationReq(PGVM pGVM, PVM pVM, VMCPUID idCpu, PGMMINITIALRESERVATIONREQ pReq) +{ + /* + * Validate input and pass it on. + */ + AssertPtrReturn(pVM, VERR_INVALID_POINTER); + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER); + + return GMMR0InitialReservation(pGVM, pVM, idCpu, pReq->cBasePages, pReq->cShadowPages, + pReq->cFixedPages, pReq->enmPolicy, pReq->enmPriority); +} + + +/** + * This updates the memory reservation with the additional MMIO2 and ROM pages. + * + * @returns VBox status code. + * @retval VERR_GMM_MEMORY_RESERVATION_DECLINED + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param cBasePages The number of pages that may be allocated for the base RAM and ROMs. + * This does not include MMIO2 and similar. + * @param cShadowPages The number of pages that may be allocated for shadow paging structures. 
+ * @param cFixedPages The number of pages that may be allocated for fixed objects like the + * hyper heap, MMIO2 and similar. + * + * @thread EMT(idCpu) + */ +GMMR0DECL(int) GMMR0UpdateReservation(PGVM pGVM, PVM pVM, VMCPUID idCpu, uint64_t cBasePages, + uint32_t cShadowPages, uint32_t cFixedPages) +{ + LogFlow(("GMMR0UpdateReservation: pGVM=%p pVM=%p cBasePages=%#llx cShadowPages=%#x cFixedPages=%#x\n", + pGVM, pVM, cBasePages, cShadowPages, cFixedPages)); + + /* + * Validate, get basics and take the semaphore. + */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + + AssertReturn(cBasePages, VERR_INVALID_PARAMETER); + AssertReturn(cShadowPages, VERR_INVALID_PARAMETER); + AssertReturn(cFixedPages, VERR_INVALID_PARAMETER); + + gmmR0MutexAcquire(pGMM); + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + if ( pGVM->gmm.s.Stats.Reserved.cBasePages + && pGVM->gmm.s.Stats.Reserved.cFixedPages + && pGVM->gmm.s.Stats.Reserved.cShadowPages) + { + /* + * Check if we can accommodate this. + */ + /* ... later ... */ + if (RT_SUCCESS(rc)) + { + /* + * Update the records. + */ + pGMM->cReservedPages -= pGVM->gmm.s.Stats.Reserved.cBasePages + + pGVM->gmm.s.Stats.Reserved.cFixedPages + + pGVM->gmm.s.Stats.Reserved.cShadowPages; + pGMM->cReservedPages += cBasePages + cFixedPages + cShadowPages; + + pGVM->gmm.s.Stats.Reserved.cBasePages = cBasePages; + pGVM->gmm.s.Stats.Reserved.cFixedPages = cFixedPages; + pGVM->gmm.s.Stats.Reserved.cShadowPages = cShadowPages; + } + } + else + rc = VERR_WRONG_ORDER; + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + } + else + rc = VERR_GMM_IS_NOT_SANE; + gmmR0MutexRelease(pGMM); + LogFlow(("GMMR0UpdateReservation: returns %Rrc\n", rc)); + return rc; +} + + +/** + * VMMR0 request wrapper for GMMR0UpdateReservation. + * + * @returns see GMMR0UpdateReservation. + * @param pGVM The global (ring-0) VM structure. 
+ * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param pReq Pointer to the request packet. + */ +GMMR0DECL(int) GMMR0UpdateReservationReq(PGVM pGVM, PVM pVM, VMCPUID idCpu, PGMMUPDATERESERVATIONREQ pReq) +{ + /* + * Validate input and pass it on. + */ + AssertPtrReturn(pVM, VERR_INVALID_POINTER); + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER); + + return GMMR0UpdateReservation(pGVM, pVM, idCpu, pReq->cBasePages, pReq->cShadowPages, pReq->cFixedPages); +} + +#ifdef GMMR0_WITH_SANITY_CHECK + +/** + * Performs sanity checks on a free set. + * + * @returns Error count. + * + * @param pGMM Pointer to the GMM instance. + * @param pSet Pointer to the set. + * @param pszSetName The set name. + * @param pszFunction The function from which it was called. + * @param uLineNo The line number. + */ +static uint32_t gmmR0SanityCheckSet(PGMM pGMM, PGMMCHUNKFREESET pSet, const char *pszSetName, + const char *pszFunction, unsigned uLineNo) +{ + uint32_t cErrors = 0; + + /* + * Count the free pages in all the chunks and match it against pSet->cFreePages. + */ + uint32_t cPages = 0; + for (unsigned i = 0; i < RT_ELEMENTS(pSet->apLists); i++) + { + for (PGMMCHUNK pCur = pSet->apLists[i]; pCur; pCur = pCur->pFreeNext) + { + /** @todo check that the chunk is hashed into the right set. */ + cPages += pCur->cFree; + } + } + if (RT_UNLIKELY(cPages != pSet->cFreePages)) + { + SUPR0Printf("GMM insanity: found %#x pages in the %s set, expected %#x. (%s, line %u)\n", + cPages, pszSetName, pSet->cFreePages, pszFunction, uLineNo); + cErrors++; + } + + return cErrors; +} + + +/** + * Performs some sanity checks on the GMM while owning the lock. + * + * @returns Error count. + * + * @param pGMM Pointer to the GMM instance. + * @param pszFunction The function from which it is called. + * @param uLineNo The line number.
+ */ +static uint32_t gmmR0SanityCheck(PGMM pGMM, const char *pszFunction, unsigned uLineNo) +{ + uint32_t cErrors = 0; + + cErrors += gmmR0SanityCheckSet(pGMM, &pGMM->PrivateX, "private", pszFunction, uLineNo); + cErrors += gmmR0SanityCheckSet(pGMM, &pGMM->Shared, "shared", pszFunction, uLineNo); + /** @todo add more sanity checks. */ + + return cErrors; +} + +#endif /* GMMR0_WITH_SANITY_CHECK */ + +/** + * Looks up a chunk in the tree and fill in the TLB entry for it. + * + * This is not expected to fail and will bitch if it does. + * + * @returns Pointer to the allocation chunk, NULL if not found. + * @param pGMM Pointer to the GMM instance. + * @param idChunk The ID of the chunk to find. + * @param pTlbe Pointer to the TLB entry. + */ +static PGMMCHUNK gmmR0GetChunkSlow(PGMM pGMM, uint32_t idChunk, PGMMCHUNKTLBE pTlbe) +{ + PGMMCHUNK pChunk = (PGMMCHUNK)RTAvlU32Get(&pGMM->pChunks, idChunk); + AssertMsgReturn(pChunk, ("Chunk %#x not found!\n", idChunk), NULL); + pTlbe->idChunk = idChunk; + pTlbe->pChunk = pChunk; + return pChunk; +} + + +/** + * Finds a allocation chunk. + * + * This is not expected to fail and will bitch if it does. + * + * @returns Pointer to the allocation chunk, NULL if not found. + * @param pGMM Pointer to the GMM instance. + * @param idChunk The ID of the chunk to find. + */ +DECLINLINE(PGMMCHUNK) gmmR0GetChunk(PGMM pGMM, uint32_t idChunk) +{ + /* + * Do a TLB lookup, branch if not in the TLB. + */ + PGMMCHUNKTLBE pTlbe = &pGMM->ChunkTLB.aEntries[GMM_CHUNKTLB_IDX(idChunk)]; + if ( pTlbe->idChunk != idChunk + || !pTlbe->pChunk) + return gmmR0GetChunkSlow(pGMM, idChunk, pTlbe); + return pTlbe->pChunk; +} + + +/** + * Finds a page. + * + * This is not expected to fail and will bitch if it does. + * + * @returns Pointer to the page, NULL if not found. + * @param pGMM Pointer to the GMM instance. + * @param idPage The ID of the page to find. 
+ */ +DECLINLINE(PGMMPAGE) gmmR0GetPage(PGMM pGMM, uint32_t idPage) +{ + PGMMCHUNK pChunk = gmmR0GetChunk(pGMM, idPage >> GMM_CHUNKID_SHIFT); + if (RT_LIKELY(pChunk)) + return &pChunk->aPages[idPage & GMM_PAGEID_IDX_MASK]; + return NULL; +} + + +#if 0 /* unused */ +/** + * Gets the host physical address for a page given by it's ID. + * + * @returns The host physical address or NIL_RTHCPHYS. + * @param pGMM Pointer to the GMM instance. + * @param idPage The ID of the page to find. + */ +DECLINLINE(RTHCPHYS) gmmR0GetPageHCPhys(PGMM pGMM, uint32_t idPage) +{ + PGMMCHUNK pChunk = gmmR0GetChunk(pGMM, idPage >> GMM_CHUNKID_SHIFT); + if (RT_LIKELY(pChunk)) + return RTR0MemObjGetPagePhysAddr(pChunk->hMemObj, idPage & GMM_PAGEID_IDX_MASK); + return NIL_RTHCPHYS; +} +#endif /* unused */ + + +/** + * Selects the appropriate free list given the number of free pages. + * + * @returns Free list index. + * @param cFree The number of free pages in the chunk. + */ +DECLINLINE(unsigned) gmmR0SelectFreeSetList(unsigned cFree) +{ + unsigned iList = cFree >> GMM_CHUNK_FREE_SET_SHIFT; + AssertMsg(iList < RT_SIZEOFMEMB(GMMCHUNKFREESET, apLists) / RT_SIZEOFMEMB(GMMCHUNKFREESET, apLists[0]), + ("%d (%u)\n", iList, cFree)); + return iList; +} + + +/** + * Unlinks the chunk from the free list it's currently on (if any). + * + * @param pChunk The allocation chunk. 
+ */ +DECLINLINE(void) gmmR0UnlinkChunk(PGMMCHUNK pChunk) +{ + PGMMCHUNKFREESET pSet = pChunk->pSet; + if (RT_LIKELY(pSet)) + { + pSet->cFreePages -= pChunk->cFree; + pSet->idGeneration++; + + PGMMCHUNK pPrev = pChunk->pFreePrev; + PGMMCHUNK pNext = pChunk->pFreeNext; + if (pPrev) + pPrev->pFreeNext = pNext; + else + pSet->apLists[gmmR0SelectFreeSetList(pChunk->cFree)] = pNext; + if (pNext) + pNext->pFreePrev = pPrev; + + pChunk->pSet = NULL; + pChunk->pFreeNext = NULL; + pChunk->pFreePrev = NULL; + } + else + { + Assert(!pChunk->pFreeNext); + Assert(!pChunk->pFreePrev); + Assert(!pChunk->cFree); + } +} + + +/** + * Links the chunk onto the appropriate free list in the specified free set. + * + * If no free entries, it's not linked into any list. + * + * @param pChunk The allocation chunk. + * @param pSet The free set. + */ +DECLINLINE(void) gmmR0LinkChunk(PGMMCHUNK pChunk, PGMMCHUNKFREESET pSet) +{ + Assert(!pChunk->pSet); + Assert(!pChunk->pFreeNext); + Assert(!pChunk->pFreePrev); + + if (pChunk->cFree > 0) + { + pChunk->pSet = pSet; + pChunk->pFreePrev = NULL; + unsigned const iList = gmmR0SelectFreeSetList(pChunk->cFree); + pChunk->pFreeNext = pSet->apLists[iList]; + if (pChunk->pFreeNext) + pChunk->pFreeNext->pFreePrev = pChunk; + pSet->apLists[iList] = pChunk; + + pSet->cFreePages += pChunk->cFree; + pSet->idGeneration++; + } +} + + +/** + * Links the chunk onto the appropriate free list in the specified free set. + * + * If no free entries, it's not linked into any list. + * + * @param pGMM Pointer to the GMM instance. + * @param pGVM Pointer to the kernel-only VM instace data. + * @param pChunk The allocation chunk. + */ +DECLINLINE(void) gmmR0SelectSetAndLinkChunk(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk) +{ + PGMMCHUNKFREESET pSet; + if (pGMM->fBoundMemoryMode) + pSet = &pGVM->gmm.s.Private; + else if (pChunk->cShared) + pSet = &pGMM->Shared; + else + pSet = &pGMM->PrivateX; + gmmR0LinkChunk(pChunk, pSet); +} + + +/** + * Frees a Chunk ID. 
+ * + * @param pGMM Pointer to the GMM instance. + * @param idChunk The Chunk ID to free. + */ +static void gmmR0FreeChunkId(PGMM pGMM, uint32_t idChunk) +{ + AssertReturnVoid(idChunk != NIL_GMM_CHUNKID); + AssertMsg(ASMBitTest(&pGMM->bmChunkId[0], idChunk), ("%#x\n", idChunk)); + ASMAtomicBitClear(&pGMM->bmChunkId[0], idChunk); +} + + +/** + * Allocates a new Chunk ID. + * + * @returns The Chunk ID, NIL_GMM_CHUNKID if no free ID could be claimed. + * @param pGMM Pointer to the GMM instance. + */ +static uint32_t gmmR0AllocateChunkId(PGMM pGMM) +{ + AssertCompile(!((GMM_CHUNKID_LAST + 1) & 31)); /* must be a multiple of 32 */ + AssertCompile(NIL_GMM_CHUNKID == 0); + + /* + * Try the next sequential one. + */ + int32_t idChunk = ++pGMM->idChunkPrev; +#if 0 /** @todo enable this code */ + if ( idChunk <= GMM_CHUNKID_LAST + && idChunk > NIL_GMM_CHUNKID + && !ASMAtomicBitTestAndSet(&pGMM->bmChunkId[0], idChunk)) + return idChunk; +#endif + + /* + * Scan sequentially from the last one. + */ + if ( (uint32_t)idChunk < GMM_CHUNKID_LAST + && idChunk > NIL_GMM_CHUNKID) + { + idChunk = ASMBitNextClear(&pGMM->bmChunkId[0], GMM_CHUNKID_LAST + 1, idChunk - 1); + if (idChunk > NIL_GMM_CHUNKID) + { + AssertMsgReturn(!ASMAtomicBitTestAndSet(&pGMM->bmChunkId[0], idChunk), ("%#x\n", idChunk), NIL_GMM_CHUNKID); + return pGMM->idChunkPrev = idChunk; + } + } + + /* + * Ok, scan from the start. + * We're not racing anyone, so there is no need to expect failures or have restart loops. + * All failure paths return NIL_GMM_CHUNKID, which is what gmmR0RegisterChunk tests for. + */ + idChunk = ASMBitFirstClear(&pGMM->bmChunkId[0], GMM_CHUNKID_LAST + 1); + AssertMsgReturn(idChunk > NIL_GMM_CHUNKID, ("%#x\n", idChunk), NIL_GMM_CHUNKID); + AssertMsgReturn(!ASMAtomicBitTestAndSet(&pGMM->bmChunkId[0], idChunk), ("%#x\n", idChunk), NIL_GMM_CHUNKID); + + return pGMM->idChunkPrev = idChunk; +} + + +/** + * Allocates one private page. + * + * Worker for gmmR0AllocatePages. + * + * @param pChunk The chunk to allocate it from. + * @param hGVM The GVM handle of the VM requesting memory. + * @param pPageDesc The page descriptor.
+ */ +static void gmmR0AllocatePage(PGMMCHUNK pChunk, uint32_t hGVM, PGMMPAGEDESC pPageDesc) +{ + /* update the chunk stats. */ + if (pChunk->hGVM == NIL_GVM_HANDLE) + pChunk->hGVM = hGVM; + Assert(pChunk->cFree); + pChunk->cFree--; + pChunk->cPrivate++; + + /* unlink the first free page. */ + const uint32_t iPage = pChunk->iFreeHead; + AssertReleaseMsg(iPage < RT_ELEMENTS(pChunk->aPages), ("%d\n", iPage)); + PGMMPAGE pPage = &pChunk->aPages[iPage]; + Assert(GMM_PAGE_IS_FREE(pPage)); + pChunk->iFreeHead = pPage->Free.iNext; + Log3(("A pPage=%p iPage=%#x/%#x u2State=%d iFreeHead=%#x iNext=%#x\n", + pPage, iPage, (pChunk->Core.Key << GMM_CHUNKID_SHIFT) | iPage, + pPage->Common.u2State, pChunk->iFreeHead, pPage->Free.iNext)); + + /* make the page private. */ + pPage->u = 0; + AssertCompile(GMM_PAGE_STATE_PRIVATE == 0); + pPage->Private.hGVM = hGVM; + AssertCompile(NIL_RTHCPHYS >= GMM_GCPHYS_LAST); + AssertCompile(GMM_GCPHYS_UNSHAREABLE >= GMM_GCPHYS_LAST); + if (pPageDesc->HCPhysGCPhys <= GMM_GCPHYS_LAST) + pPage->Private.pfn = pPageDesc->HCPhysGCPhys >> PAGE_SHIFT; + else + pPage->Private.pfn = GMM_PAGE_PFN_UNSHAREABLE; /* unshareable / unassigned - same thing. */ + + /* update the page descriptor. */ + pPageDesc->HCPhysGCPhys = RTR0MemObjGetPagePhysAddr(pChunk->hMemObj, iPage); + Assert(pPageDesc->HCPhysGCPhys != NIL_RTHCPHYS); + pPageDesc->idPage = (pChunk->Core.Key << GMM_CHUNKID_SHIFT) | iPage; + pPageDesc->idSharedPage = NIL_GMM_PAGEID; +} + + +/** + * Picks the free pages from a chunk. + * + * @returns The new page descriptor table index. + * @param pChunk The chunk. + * @param hGVM The affinity of the chunk. NIL_GVM_HANDLE for no + * affinity. + * @param iPage The current page descriptor table index. + * @param cPages The total number of pages to allocate. + * @param paPages The page descriptor table (input + ouput). 
+ */ +static uint32_t gmmR0AllocatePagesFromChunk(PGMMCHUNK pChunk, uint16_t const hGVM, uint32_t iPage, uint32_t cPages, + PGMMPAGEDESC paPages) +{ + PGMMCHUNKFREESET pSet = pChunk->pSet; Assert(pSet); + gmmR0UnlinkChunk(pChunk); + + for (; pChunk->cFree && iPage < cPages; iPage++) + gmmR0AllocatePage(pChunk, hGVM, &paPages[iPage]); + + gmmR0LinkChunk(pChunk, pSet); + return iPage; +} + + +/** + * Registers a new chunk of memory. + * + * This is called by both gmmR0AllocateOneChunk and GMMR0SeedChunk. + * + * @returns VBox status code. On success, the giant GMM lock will be held, the + * caller must release it (ugly). + * @param pGMM Pointer to the GMM instance. + * @param pSet Pointer to the set. + * @param MemObj The memory object for the chunk. + * @param hGVM The affinity of the chunk. NIL_GVM_HANDLE for no + * affinity. + * @param fChunkFlags The chunk flags, GMM_CHUNK_FLAGS_XXX. + * @param ppChunk Chunk address (out). Optional. + * + * @remarks The caller must not own the giant GMM mutex. + * The giant GMM mutex will be acquired and returned acquired in + * the success path. On failure, no locks will be held. + */ +static int gmmR0RegisterChunk(PGMM pGMM, PGMMCHUNKFREESET pSet, RTR0MEMOBJ MemObj, uint16_t hGVM, uint16_t fChunkFlags, + PGMMCHUNK *ppChunk) +{ + Assert(pGMM->hMtxOwner != RTThreadNativeSelf()); + Assert(hGVM != NIL_GVM_HANDLE || pGMM->fBoundMemoryMode); + Assert(fChunkFlags == 0 || fChunkFlags == GMM_CHUNK_FLAGS_LARGE_PAGE); + + int rc; + PGMMCHUNK pChunk = (PGMMCHUNK)RTMemAllocZ(sizeof(*pChunk)); + if (pChunk) + { + /* + * Initialize it. 
+ */ + pChunk->hMemObj = MemObj; + pChunk->cFree = GMM_CHUNK_NUM_PAGES; + pChunk->hGVM = hGVM; + /*pChunk->iFreeHead = 0;*/ + pChunk->idNumaNode = gmmR0GetCurrentNumaNodeId(); + pChunk->iChunkMtx = UINT8_MAX; + pChunk->fFlags = fChunkFlags; + for (unsigned iPage = 0; iPage < RT_ELEMENTS(pChunk->aPages) - 1; iPage++) + { + pChunk->aPages[iPage].Free.u2State = GMM_PAGE_STATE_FREE; + pChunk->aPages[iPage].Free.iNext = iPage + 1; + } + pChunk->aPages[RT_ELEMENTS(pChunk->aPages) - 1].Free.u2State = GMM_PAGE_STATE_FREE; + pChunk->aPages[RT_ELEMENTS(pChunk->aPages) - 1].Free.iNext = UINT16_MAX; + + /* + * Allocate a Chunk ID and insert it into the tree. + * This has to be done behind the mutex of course. + */ + rc = gmmR0MutexAcquire(pGMM); + if (RT_SUCCESS(rc)) + { + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + pChunk->Core.Key = gmmR0AllocateChunkId(pGMM); + if ( pChunk->Core.Key != NIL_GMM_CHUNKID + && pChunk->Core.Key <= GMM_CHUNKID_LAST + && RTAvlU32Insert(&pGMM->pChunks, &pChunk->Core)) + { + pGMM->cChunks++; + RTListAppend(&pGMM->ChunkList, &pChunk->ListNode); + gmmR0LinkChunk(pChunk, pSet); + LogFlow(("gmmR0RegisterChunk: pChunk=%p id=%#x cChunks=%d\n", pChunk, pChunk->Core.Key, pGMM->cChunks)); + + if (ppChunk) + *ppChunk = pChunk; + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + return VINF_SUCCESS; + } + + /* bail out */ + rc = VERR_GMM_CHUNK_INSERT; + } + else + rc = VERR_GMM_IS_NOT_SANE; + gmmR0MutexRelease(pGMM); + } + + RTMemFree(pChunk); + } + else + rc = VERR_NO_MEMORY; + return rc; +} + + +/** + * Allocate a new chunk, immediately pick the requested pages from it, and adds + * what's remaining to the specified free set. + * + * @note This will leave the giant mutex while allocating the new chunk! + * + * @returns VBox status code. + * @param pGMM Pointer to the GMM instance data. + * @param pGVM Pointer to the kernel-only VM instace data. + * @param pSet Pointer to the free set. + * @param cPages The number of pages requested. 
+ * @param paPages The page descriptor table (input + output). + * @param piPage The pointer to the page descriptor table index variable. + * This will be updated. + */ +static int gmmR0AllocateChunkNew(PGMM pGMM, PGVM pGVM, PGMMCHUNKFREESET pSet, uint32_t cPages, + PGMMPAGEDESC paPages, uint32_t *piPage) +{ + gmmR0MutexRelease(pGMM); + + RTR0MEMOBJ hMemObj; + int rc = RTR0MemObjAllocPhysNC(&hMemObj, GMM_CHUNK_SIZE, NIL_RTHCPHYS); + if (RT_SUCCESS(rc)) + { +/** @todo Duplicate gmmR0RegisterChunk here so we can avoid chaining up the + * free pages first and then unchaining them right afterwards. Instead + * do as much work as possible without holding the giant lock. */ + PGMMCHUNK pChunk; + rc = gmmR0RegisterChunk(pGMM, pSet, hMemObj, pGVM->hSelf, 0 /*fChunkFlags*/, &pChunk); + if (RT_SUCCESS(rc)) + { + *piPage = gmmR0AllocatePagesFromChunk(pChunk, pGVM->hSelf, *piPage, cPages, paPages); + return VINF_SUCCESS; + } + + /* bail out: registration failed, so free the memory object allocated above */ + RTR0MemObjFree(hMemObj, false /* fFreeMappings */); + } + + int rc2 = gmmR0MutexAcquire(pGMM); + AssertRCReturn(rc2, RT_FAILURE(rc) ? rc : rc2); + return rc; + +} + + +/** + * As a last resort we'll pick any page we can get. + * + * @returns The new page descriptor table index. + * @param pSet The set to pick from. + * @param pGVM Pointer to the global VM structure. + * @param iPage The current page descriptor table index. + * @param cPages The total number of pages to allocate. + * @param paPages The page descriptor table (input + output).
+ */ +static uint32_t gmmR0AllocatePagesIndiscriminately(PGMMCHUNKFREESET pSet, PGVM pGVM, + uint32_t iPage, uint32_t cPages, PGMMPAGEDESC paPages) +{ + unsigned iList = RT_ELEMENTS(pSet->apLists); + while (iList-- > 0) + { + PGMMCHUNK pChunk = pSet->apLists[iList]; + while (pChunk) + { + PGMMCHUNK pNext = pChunk->pFreeNext; + + iPage = gmmR0AllocatePagesFromChunk(pChunk, pGVM->hSelf, iPage, cPages, paPages); + if (iPage >= cPages) + return iPage; + + pChunk = pNext; + } + } + return iPage; +} + + +/** + * Pick pages from empty chunks on the same NUMA node. + * + * @returns The new page descriptor table index. + * @param pSet The set to pick from. + * @param pGVM Pointer to the global VM structure. + * @param iPage The current page descriptor table index. + * @param cPages The total number of pages to allocate. + * @param paPages The page descriptor table (input + ouput). + */ +static uint32_t gmmR0AllocatePagesFromEmptyChunksOnSameNode(PGMMCHUNKFREESET pSet, PGVM pGVM, + uint32_t iPage, uint32_t cPages, PGMMPAGEDESC paPages) +{ + PGMMCHUNK pChunk = pSet->apLists[GMM_CHUNK_FREE_SET_UNUSED_LIST]; + if (pChunk) + { + uint16_t const idNumaNode = gmmR0GetCurrentNumaNodeId(); + while (pChunk) + { + PGMMCHUNK pNext = pChunk->pFreeNext; + + if (pChunk->idNumaNode == idNumaNode) + { + pChunk->hGVM = pGVM->hSelf; + iPage = gmmR0AllocatePagesFromChunk(pChunk, pGVM->hSelf, iPage, cPages, paPages); + if (iPage >= cPages) + { + pGVM->gmm.s.idLastChunkHint = pChunk->cFree ? pChunk->Core.Key : NIL_GMM_CHUNKID; + return iPage; + } + } + + pChunk = pNext; + } + } + return iPage; +} + + +/** + * Pick pages from non-empty chunks on the same NUMA node. + * + * @returns The new page descriptor table index. + * @param pSet The set to pick from. + * @param pGVM Pointer to the global VM structure. + * @param iPage The current page descriptor table index. + * @param cPages The total number of pages to allocate. + * @param paPages The page descriptor table (input + ouput). 
+ */
+static uint32_t gmmR0AllocatePagesFromSameNode(PGMMCHUNKFREESET pSet, PGVM pGVM,
+                                               uint32_t iPage, uint32_t cPages, PGMMPAGEDESC paPages)
+{
+    /** @todo start by picking from chunks with about the right size first? */
+    uint16_t const idThisNode = gmmR0GetCurrentNumaNodeId();
+
+    /* Visit every list below GMM_CHUNK_FREE_SET_UNUSED_LIST, highest index first. */
+    for (unsigned iList = GMM_CHUNK_FREE_SET_UNUSED_LIST; iList-- > 0; )
+    {
+        PGMMCHUNK pNext;
+        for (PGMMCHUNK pCur = pSet->apLists[iList]; pCur; pCur = pNext)
+        {
+            pNext = pCur->pFreeNext;
+            if (pCur->idNumaNode != idThisNode)
+                continue;
+
+            iPage = gmmR0AllocatePagesFromChunk(pCur, pGVM->hSelf, iPage, cPages, paPages);
+            if (iPage >= cPages)
+            {
+                /* Remember the chunk as a hint only if it still has free pages. */
+                pGVM->gmm.s.idLastChunkHint = pCur->cFree ? pCur->Core.Key : NIL_GMM_CHUNKID;
+                return iPage;
+            }
+        }
+    }
+    return iPage;
+}
+
+
+/**
+ * Pick pages that are in chunks already associated with the VM.
+ *
+ * The chunk remembered in idLastChunkHint is tried first, then every list in
+ * the set is scanned for chunks whose hGVM matches the VM.
+ *
+ * @returns The new page descriptor table index.
+ * @param   pGMM    Pointer to the GMM instance data.
+ * @param   pGVM    Pointer to the global VM structure.
+ * @param   pSet    The set to pick from.
+ * @param   iPage   The current page descriptor table index.
+ * @param   cPages  The total number of pages to allocate.
+ * @param   paPages The page descriptor table (input + output).
+ */
+static uint32_t gmmR0AllocatePagesAssociatedWithVM(PGMM pGMM, PGVM pGVM, PGMMCHUNKFREESET pSet,
+                                                   uint32_t iPage, uint32_t cPages, PGMMPAGEDESC paPages)
+{
+    uint16_t const hGVM = pGVM->hSelf;
+
+    /* Try the hinted chunk first, if there is a hint and it has free pages. */
+    PGMMCHUNK pHinted = pGVM->gmm.s.idLastChunkHint != NIL_GMM_CHUNKID
+                      ? gmmR0GetChunk(pGMM, pGVM->gmm.s.idLastChunkHint)
+                      : NULL;
+    if (pHinted && pHinted->cFree)
+    {
+        iPage = gmmR0AllocatePagesFromChunk(pHinted, hGVM, iPage, cPages, paPages);
+        if (iPage >= cPages)
+            return iPage;
+    }
+
+    /* Scan all lists, lowest index first, for chunks tagged with this VM. */
+    for (unsigned iList = 0; iList < RT_ELEMENTS(pSet->apLists); iList++)
+    {
+        PGMMCHUNK pNext;
+        for (PGMMCHUNK pCur = pSet->apLists[iList]; pCur; pCur = pNext)
+        {
+            pNext = pCur->pFreeNext;
+            if (pCur->hGVM != hGVM)
+                continue;
+
+            iPage = gmmR0AllocatePagesFromChunk(pCur, hGVM, iPage, cPages, paPages);
+            if (iPage >= cPages)
+            {
+                pGVM->gmm.s.idLastChunkHint = pCur->cFree ? pCur->Core.Key : NIL_GMM_CHUNKID;
+                return iPage;
+            }
+        }
+    }
+    return iPage;
+}
+
+
+
+/**
+ * Pick pages in bound memory mode.
+ *
+ * Only the VM's own private free set (pGVM->gmm.s.Private) is consulted.
+ *
+ * @returns The new page descriptor table index.
+ * @param   pGVM    Pointer to the global VM structure.
+ * @param   iPage   The current page descriptor table index.
+ * @param   cPages  The total number of pages to allocate.
+ * @param   paPages The page descriptor table (input + output).
+ */
+static uint32_t gmmR0AllocatePagesInBoundMode(PGVM pGVM, uint32_t iPage, uint32_t cPages, PGMMPAGEDESC paPages)
+{
+    for (unsigned iList = 0; iList < RT_ELEMENTS(pGVM->gmm.s.Private.apLists); iList++)
+    {
+        PGMMCHUNK pNext;
+        for (PGMMCHUNK pCur = pGVM->gmm.s.Private.apLists[iList]; pCur; pCur = pNext)
+        {
+            Assert(pCur->hGVM == pGVM->hSelf);
+            pNext = pCur->pFreeNext;
+
+            iPage = gmmR0AllocatePagesFromChunk(pCur, pGVM->hSelf, iPage, cPages, paPages);
+            if (iPage >= cPages)
+                return iPage;
+        }
+    }
+    return iPage;
+}
+
+
+/**
+ * Checks if we should start picking pages from chunks of other VMs because
+ * we're getting close to the system memory or reserved limit.
+ *
+ * @returns @c true if we should, @c false if we should first try allocate more
+ *          chunks.
+ * @param   pGVM    Pointer to the global VM structure.
+ */
+static bool gmmR0ShouldAllocatePagesInOtherChunksBecauseOfLimits(PGVM pGVM)
+{
+    /*
+     * Don't allocate a new chunk if we're within four chunks' worth of pages
+     * of what the VM has reserved (base + fixed, minus ballooned).
+     */
+    uint64_t const cPgReserved  = pGVM->gmm.s.Stats.Reserved.cBasePages
+                                + pGVM->gmm.s.Stats.Reserved.cFixedPages
+                                - pGVM->gmm.s.Stats.cBalloonedPages
+                                /** @todo what about shared pages? */;
+    uint64_t const cPgAllocated = pGVM->gmm.s.Stats.Allocated.cBasePages
+                                + pGVM->gmm.s.Stats.Allocated.cFixedPages;
+    uint64_t const cPgLeft      = cPgReserved - cPgAllocated;
+    /** @todo make the threshold configurable, also test the code to see if
+     *        this ever kicks in (we might be reserving too much or smth). */
+
+    /*
+     * Check how close we're to the max memory limit and how many fragments
+     * there are?...
+     */
+    /** @todo. */
+
+    return cPgLeft < GMM_CHUNK_NUM_PAGES * 4;
+}
+
+
+/**
+ * Checks if we should start picking pages from chunks of other VMs because
+ * there is a lot of free pages around.
+ *
+ * @returns @c true if we should, @c false if we should first try allocate more
+ *          chunks.
+ * @param   pGMM    Pointer to the GMM instance data.
+ */
+static bool gmmR0ShouldAllocatePagesInOtherChunksBecauseOfLotsFree(PGMM pGMM)
+{
+    /*
+     * Setting the limit at 16 chunks (32 MB) at the moment.
+     */
+    return pGMM->PrivateX.cFreePages >= GMM_CHUNK_NUM_PAGES * 16;
+}
+
+
+/**
+ * Common worker for GMMR0AllocateHandyPages and GMMR0AllocatePages.
+ *
+ * @returns VBox status code:
+ * @retval  VINF_SUCCESS on success.
+ * @retval  VERR_GMM_SEED_ME if seeding via GMMR0SeedChunk or
+ *          gmmR0AllocateMoreChunks is necessary.
+ * @retval  VERR_GMM_HIT_GLOBAL_LIMIT if we've exhausted the available pages.
+ * @retval  VERR_GMM_HIT_VM_ACCOUNT_LIMIT if we've hit the VM account limit,
+ *          that is we're trying to allocate more than we've reserved.
+ *
+ * @param   pGMM                Pointer to the GMM instance data.
+ * @param   pGVM                Pointer to the VM.
+ * @param   cPages              The number of pages to allocate.
+ * @param   paPages             Pointer to the page descriptors. See GMMPAGEDESC for
+ *                              details on what is expected on input.
+ * @param   enmAccount          The account to charge.
+ *
+ * @remarks Call takes the giant GMM lock.
+ */
+static int gmmR0AllocatePagesNew(PGMM pGMM, PGVM pGVM, uint32_t cPages, PGMMPAGEDESC paPages, GMMACCOUNT enmAccount)
+{
+    /* The caller must already own the giant GMM mutex. */
+    Assert(pGMM->hMtxOwner == RTThreadNativeSelf());
+
+    /*
+     * Check allocation limits.
+     */
+    if (RT_UNLIKELY(pGMM->cAllocatedPages + cPages > pGMM->cMaxPages))
+        return VERR_GMM_HIT_GLOBAL_LIMIT;
+
+    switch (enmAccount)
+    {
+        case GMMACCOUNT_BASE:
+            /* Ballooned pages count against the base reservation as well. */
+            if (RT_UNLIKELY(  pGVM->gmm.s.Stats.Allocated.cBasePages + pGVM->gmm.s.Stats.cBalloonedPages + cPages
+                            > pGVM->gmm.s.Stats.Reserved.cBasePages))
+            {
+                Log(("gmmR0AllocatePages:Base: Reserved=%#llx Allocated+Ballooned+Requested=%#llx+%#llx+%#x!\n",
+                     pGVM->gmm.s.Stats.Reserved.cBasePages, pGVM->gmm.s.Stats.Allocated.cBasePages,
+                     pGVM->gmm.s.Stats.cBalloonedPages, cPages));
+                return VERR_GMM_HIT_VM_ACCOUNT_LIMIT;
+            }
+            break;
+        case GMMACCOUNT_SHADOW:
+            if (RT_UNLIKELY(pGVM->gmm.s.Stats.Allocated.cShadowPages + cPages > pGVM->gmm.s.Stats.Reserved.cShadowPages))
+            {
+                Log(("gmmR0AllocatePages:Shadow: Reserved=%#x Allocated+Requested=%#x+%#x!\n",
+                     pGVM->gmm.s.Stats.Reserved.cShadowPages, pGVM->gmm.s.Stats.Allocated.cShadowPages, cPages));
+                return VERR_GMM_HIT_VM_ACCOUNT_LIMIT;
+            }
+            break;
+        case GMMACCOUNT_FIXED:
+            if (RT_UNLIKELY(pGVM->gmm.s.Stats.Allocated.cFixedPages + cPages > pGVM->gmm.s.Stats.Reserved.cFixedPages))
+            {
+                Log(("gmmR0AllocatePages:Fixed: Reserved=%#x Allocated+Requested=%#x+%#x!\n",
+                     pGVM->gmm.s.Stats.Reserved.cFixedPages, pGVM->gmm.s.Stats.Allocated.cFixedPages, cPages));
+                return VERR_GMM_HIT_VM_ACCOUNT_LIMIT;
+            }
+            break;
+        default:
+            AssertMsgFailedReturn(("enmAccount=%d\n", enmAccount), VERR_IPE_NOT_REACHED_DEFAULT_CASE);
+    }
+
+    /*
+     * If we're in legacy memory mode, it's easy to figure if we have
+     * sufficient number of pages up-front.
+     */
+    if (   pGMM->fLegacyAllocationMode
+        && pGVM->gmm.s.Private.cFreePages < cPages)
+    {
+        Assert(pGMM->fBoundMemoryMode);
+        return VERR_GMM_SEED_ME;
+    }
+
+    /*
+     * Update the accounts before we proceed because we might be leaving the
+     * protection of the global mutex and thus run the risk of permitting
+     * too much memory to be allocated.
+     */
+    switch (enmAccount)
+    {
+        case GMMACCOUNT_BASE:   pGVM->gmm.s.Stats.Allocated.cBasePages   += cPages; break;
+        case GMMACCOUNT_SHADOW: pGVM->gmm.s.Stats.Allocated.cShadowPages += cPages; break;
+        case GMMACCOUNT_FIXED:  pGVM->gmm.s.Stats.Allocated.cFixedPages  += cPages; break;
+        default:                AssertMsgFailedReturn(("enmAccount=%d\n", enmAccount), VERR_IPE_NOT_REACHED_DEFAULT_CASE);
+    }
+    pGVM->gmm.s.Stats.cPrivatePages += cPages;
+    pGMM->cAllocatedPages           += cPages;
+
+    /*
+     * Part two of it's-easy-in-legacy-memory-mode.
+     */
+    uint32_t iPage = 0;
+    if (pGMM->fLegacyAllocationMode)
+    {
+        /* The free-page count was checked above, so this must satisfy the whole request. */
+        iPage = gmmR0AllocatePagesInBoundMode(pGVM, iPage, cPages, paPages);
+        AssertReleaseReturn(iPage == cPages, VERR_GMM_ALLOC_PAGES_IPE);
+        return VINF_SUCCESS;
+    }
+
+    /*
+     * Bound mode is also relatively straightforward.
+     */
+    int rc = VINF_SUCCESS;
+    if (pGMM->fBoundMemoryMode)
+    {
+        /* Use the VM's own chunks first, then keep adding new chunks until done or failing. */
+        iPage = gmmR0AllocatePagesInBoundMode(pGVM, iPage, cPages, paPages);
+        if (iPage < cPages)
+            do
+                rc = gmmR0AllocateChunkNew(pGMM, pGVM, &pGVM->gmm.s.Private, cPages, paPages, &iPage);
+            while (iPage < cPages && RT_SUCCESS(rc));
+    }
+    /*
+     * Shared mode is trickier as we should try achieve the same locality as
+     * in bound mode, but smartly make use of non-full chunks allocated by
+     * other VMs if we're low on memory.
+     */
+    else
+    {
+        /* Pick the most optimal pages first. */
+        iPage = gmmR0AllocatePagesAssociatedWithVM(pGMM, pGVM, &pGMM->PrivateX, iPage, cPages, paPages);
+        if (iPage < cPages)
+        {
+            /* Maybe we should try getting pages from chunks "belonging" to
+               other VMs before allocating more chunks? */
+            bool fTriedOnSameAlready = false;
+            if (gmmR0ShouldAllocatePagesInOtherChunksBecauseOfLimits(pGVM))
+            {
+                iPage = gmmR0AllocatePagesFromSameNode(&pGMM->PrivateX, pGVM, iPage, cPages, paPages);
+                fTriedOnSameAlready = true;
+            }
+
+            /* Allocate memory from empty chunks. */
+            if (iPage < cPages)
+                iPage = gmmR0AllocatePagesFromEmptyChunksOnSameNode(&pGMM->PrivateX, pGVM, iPage, cPages, paPages);
+
+            /* Grab empty shared chunks. */
+            if (iPage < cPages)
+                iPage = gmmR0AllocatePagesFromEmptyChunksOnSameNode(&pGMM->Shared, pGVM, iPage, cPages, paPages);
+
+            /* If there is a lot of free pages spread around, try not waste
+               system memory on more chunks. (Should trigger defragmentation.) */
+            if (   !fTriedOnSameAlready
+                && gmmR0ShouldAllocatePagesInOtherChunksBecauseOfLotsFree(pGMM))
+            {
+                iPage = gmmR0AllocatePagesFromSameNode(&pGMM->PrivateX, pGVM, iPage, cPages, paPages);
+                if (iPage < cPages)
+                    iPage = gmmR0AllocatePagesIndiscriminately(&pGMM->PrivateX, pGVM, iPage, cPages, paPages);
+            }
+
+            /*
+             * Ok, try allocate new chunks.
+             */
+            if (iPage < cPages)
+            {
+                do
+                    rc = gmmR0AllocateChunkNew(pGMM, pGVM, &pGMM->PrivateX, cPages, paPages, &iPage);
+                while (iPage < cPages && RT_SUCCESS(rc));
+
+                /* If the host is out of memory, take whatever we can get. */
+                if (   (rc == VERR_NO_MEMORY || rc == VERR_NO_PHYS_MEMORY)
+                    && pGMM->PrivateX.cFreePages + pGMM->Shared.cFreePages >= cPages - iPage)
+                {
+                    /* The combined free counts cover the remainder, so this is expected to complete. */
+                    iPage = gmmR0AllocatePagesIndiscriminately(&pGMM->PrivateX, pGVM, iPage, cPages, paPages);
+                    if (iPage < cPages)
+                        iPage = gmmR0AllocatePagesIndiscriminately(&pGMM->Shared, pGVM, iPage, cPages, paPages);
+                    AssertRelease(iPage == cPages);
+                    rc = VINF_SUCCESS;
+                }
+            }
+        }
+    }
+
+    /*
+     * Clean up on failure.  Since this is bound to be a low-memory condition
+     * we will give back any empty chunks that might be hanging around.
+     */
+    if (RT_FAILURE(rc))
+    {
+        /* Update the statistics. */
+        /* Undo the up-front accounting.  The global counter is only reduced by
+           the pages never handed out (cPages - iPage).
+           NOTE(review): presumably gmmR0FreePrivatePage decrements
+           pGMM->cAllocatedPages for each page released below - confirm. */
+        pGVM->gmm.s.Stats.cPrivatePages -= cPages;
+        pGMM->cAllocatedPages           -= cPages - iPage;
+        switch (enmAccount)
+        {
+            case GMMACCOUNT_BASE:   pGVM->gmm.s.Stats.Allocated.cBasePages   -= cPages; break;
+            case GMMACCOUNT_SHADOW: pGVM->gmm.s.Stats.Allocated.cShadowPages -= cPages; break;
+            case GMMACCOUNT_FIXED:  pGVM->gmm.s.Stats.Allocated.cFixedPages  -= cPages; break;
+            default:                AssertMsgFailedReturn(("enmAccount=%d\n", enmAccount), VERR_IPE_NOT_REACHED_DEFAULT_CASE);
+        }
+
+        /* Release the pages. */
+        /* Walks back over the descriptors filled in so far and resets each one to NIL. */
+        while (iPage-- > 0)
+        {
+            uint32_t idPage = paPages[iPage].idPage;
+            PGMMPAGE pPage = gmmR0GetPage(pGMM, idPage);
+            if (RT_LIKELY(pPage))
+            {
+                Assert(GMM_PAGE_IS_PRIVATE(pPage));
+                Assert(pPage->Private.hGVM == pGVM->hSelf);
+                gmmR0FreePrivatePage(pGMM, pGVM, idPage, pPage);
+            }
+            else
+                AssertMsgFailed(("idPage=%#x\n", idPage));
+
+            paPages[iPage].idPage       = NIL_GMM_PAGEID;
+            paPages[iPage].idSharedPage = NIL_GMM_PAGEID;
+            paPages[iPage].HCPhysGCPhys = NIL_RTHCPHYS;
+        }
+
+        /* Free empty chunks. */
+        /** @todo */
+
+        /* return the fail status on failure */
+        return rc;
+    }
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Updates the previous allocations and allocates more pages.
+ *
+ * The handy pages are always taken from the 'base' memory account.
+ * The allocated pages are not cleared and will contains random garbage.
+ *
+ * @returns VBox status code:
+ * @retval  VINF_SUCCESS on success.
+ * @retval  VERR_NOT_OWNER if the caller is not an EMT.
+ * @retval  VERR_GMM_PAGE_NOT_FOUND if one of the pages to update wasn't found.
+ * @retval  VERR_GMM_PAGE_NOT_PRIVATE if one of the pages to update wasn't a
+ *          private page.
+ * @retval  VERR_GMM_PAGE_NOT_SHARED if one of the pages to update wasn't a
+ *          shared page.
+ * @retval  VERR_GMM_NOT_PAGE_OWNER if one of the pages to be updated wasn't
+ *          owned by the VM.
+ * @retval  VERR_GMM_SEED_ME if seeding via GMMR0SeedChunk is necessary.
+ * @retval  VERR_GMM_HIT_GLOBAL_LIMIT if we've exhausted the available pages.
+ * @retval VERR_GMM_HIT_VM_ACCOUNT_LIMIT if we've hit the VM account limit, + * that is we're trying to allocate more than we've reserved. + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param cPagesToUpdate The number of pages to update (starting from the head). + * @param cPagesToAlloc The number of pages to allocate (starting from the head). + * @param paPages The array of page descriptors. + * See GMMPAGEDESC for details on what is expected on input. + * @thread EMT(idCpu) + */ +GMMR0DECL(int) GMMR0AllocateHandyPages(PGVM pGVM, PVM pVM, VMCPUID idCpu, uint32_t cPagesToUpdate, + uint32_t cPagesToAlloc, PGMMPAGEDESC paPages) +{ + LogFlow(("GMMR0AllocateHandyPages: pGVM=%p pVM=%p cPagesToUpdate=%#x cPagesToAlloc=%#x paPages=%p\n", + pGVM, pVM, cPagesToUpdate, cPagesToAlloc, paPages)); + + /* + * Validate, get basics and take the semaphore. + * (This is a relatively busy path, so make predictions where possible.) 
+ */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + + AssertPtrReturn(paPages, VERR_INVALID_PARAMETER); + AssertMsgReturn( (cPagesToUpdate && cPagesToUpdate < 1024) + || (cPagesToAlloc && cPagesToAlloc < 1024), + ("cPagesToUpdate=%#x cPagesToAlloc=%#x\n", cPagesToUpdate, cPagesToAlloc), + VERR_INVALID_PARAMETER); + + unsigned iPage = 0; + for (; iPage < cPagesToUpdate; iPage++) + { + AssertMsgReturn( ( paPages[iPage].HCPhysGCPhys <= GMM_GCPHYS_LAST + && !(paPages[iPage].HCPhysGCPhys & PAGE_OFFSET_MASK)) + || paPages[iPage].HCPhysGCPhys == NIL_RTHCPHYS + || paPages[iPage].HCPhysGCPhys == GMM_GCPHYS_UNSHAREABLE, + ("#%#x: %RHp\n", iPage, paPages[iPage].HCPhysGCPhys), + VERR_INVALID_PARAMETER); + AssertMsgReturn( paPages[iPage].idPage <= GMM_PAGEID_LAST + /*|| paPages[iPage].idPage == NIL_GMM_PAGEID*/, + ("#%#x: %#x\n", iPage, paPages[iPage].idPage), VERR_INVALID_PARAMETER); + AssertMsgReturn( paPages[iPage].idPage <= GMM_PAGEID_LAST + /*|| paPages[iPage].idSharedPage == NIL_GMM_PAGEID*/, + ("#%#x: %#x\n", iPage, paPages[iPage].idSharedPage), VERR_INVALID_PARAMETER); + } + + for (; iPage < cPagesToAlloc; iPage++) + { + AssertMsgReturn(paPages[iPage].HCPhysGCPhys == NIL_RTHCPHYS, ("#%#x: %RHp\n", iPage, paPages[iPage].HCPhysGCPhys), VERR_INVALID_PARAMETER); + AssertMsgReturn(paPages[iPage].idPage == NIL_GMM_PAGEID, ("#%#x: %#x\n", iPage, paPages[iPage].idPage), VERR_INVALID_PARAMETER); + AssertMsgReturn(paPages[iPage].idSharedPage == NIL_GMM_PAGEID, ("#%#x: %#x\n", iPage, paPages[iPage].idSharedPage), VERR_INVALID_PARAMETER); + } + + gmmR0MutexAcquire(pGMM); + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + /* No allocations before the initial reservation has been made! */ + if (RT_LIKELY( pGVM->gmm.s.Stats.Reserved.cBasePages + && pGVM->gmm.s.Stats.Reserved.cFixedPages + && pGVM->gmm.s.Stats.Reserved.cShadowPages)) + { + /* + * Perform the updates. 
+ * Stop on the first error. + */ + for (iPage = 0; iPage < cPagesToUpdate; iPage++) + { + if (paPages[iPage].idPage != NIL_GMM_PAGEID) + { + PGMMPAGE pPage = gmmR0GetPage(pGMM, paPages[iPage].idPage); + if (RT_LIKELY(pPage)) + { + if (RT_LIKELY(GMM_PAGE_IS_PRIVATE(pPage))) + { + if (RT_LIKELY(pPage->Private.hGVM == pGVM->hSelf)) + { + AssertCompile(NIL_RTHCPHYS > GMM_GCPHYS_LAST && GMM_GCPHYS_UNSHAREABLE > GMM_GCPHYS_LAST); + if (RT_LIKELY(paPages[iPage].HCPhysGCPhys <= GMM_GCPHYS_LAST)) + pPage->Private.pfn = paPages[iPage].HCPhysGCPhys >> PAGE_SHIFT; + else if (paPages[iPage].HCPhysGCPhys == GMM_GCPHYS_UNSHAREABLE) + pPage->Private.pfn = GMM_PAGE_PFN_UNSHAREABLE; + /* else: NIL_RTHCPHYS nothing */ + + paPages[iPage].idPage = NIL_GMM_PAGEID; + paPages[iPage].HCPhysGCPhys = NIL_RTHCPHYS; + } + else + { + Log(("GMMR0AllocateHandyPages: #%#x/%#x: Not owner! hGVM=%#x hSelf=%#x\n", + iPage, paPages[iPage].idPage, pPage->Private.hGVM, pGVM->hSelf)); + rc = VERR_GMM_NOT_PAGE_OWNER; + break; + } + } + else + { + Log(("GMMR0AllocateHandyPages: #%#x/%#x: Not private! %.*Rhxs (type %d)\n", iPage, paPages[iPage].idPage, sizeof(*pPage), pPage, pPage->Common.u2State)); + rc = VERR_GMM_PAGE_NOT_PRIVATE; + break; + } + } + else + { + Log(("GMMR0AllocateHandyPages: #%#x/%#x: Not found! 
(private)\n", iPage, paPages[iPage].idPage)); + rc = VERR_GMM_PAGE_NOT_FOUND; + break; + } + } + + if (paPages[iPage].idSharedPage != NIL_GMM_PAGEID) + { + PGMMPAGE pPage = gmmR0GetPage(pGMM, paPages[iPage].idSharedPage); + if (RT_LIKELY(pPage)) + { + if (RT_LIKELY(GMM_PAGE_IS_SHARED(pPage))) + { + AssertCompile(NIL_RTHCPHYS > GMM_GCPHYS_LAST && GMM_GCPHYS_UNSHAREABLE > GMM_GCPHYS_LAST); + Assert(pPage->Shared.cRefs); + Assert(pGVM->gmm.s.Stats.cSharedPages); + Assert(pGVM->gmm.s.Stats.Allocated.cBasePages); + + Log(("GMMR0AllocateHandyPages: free shared page %x cRefs=%d\n", paPages[iPage].idSharedPage, pPage->Shared.cRefs)); + pGVM->gmm.s.Stats.cSharedPages--; + pGVM->gmm.s.Stats.Allocated.cBasePages--; + if (!--pPage->Shared.cRefs) + gmmR0FreeSharedPage(pGMM, pGVM, paPages[iPage].idSharedPage, pPage); + else + { + Assert(pGMM->cDuplicatePages); + pGMM->cDuplicatePages--; + } + + paPages[iPage].idSharedPage = NIL_GMM_PAGEID; + } + else + { + Log(("GMMR0AllocateHandyPages: #%#x/%#x: Not shared!\n", iPage, paPages[iPage].idSharedPage)); + rc = VERR_GMM_PAGE_NOT_SHARED; + break; + } + } + else + { + Log(("GMMR0AllocateHandyPages: #%#x/%#x: Not found! (shared)\n", iPage, paPages[iPage].idSharedPage)); + rc = VERR_GMM_PAGE_NOT_FOUND; + break; + } + } + } /* for each page to update */ + + if (RT_SUCCESS(rc) && cPagesToAlloc > 0) + { +#if defined(VBOX_STRICT) && 0 /** @todo re-test this later. Appeared to be a PGM init bug. */ + for (iPage = 0; iPage < cPagesToAlloc; iPage++) + { + Assert(paPages[iPage].HCPhysGCPhys == NIL_RTHCPHYS); + Assert(paPages[iPage].idPage == NIL_GMM_PAGEID); + Assert(paPages[iPage].idSharedPage == NIL_GMM_PAGEID); + } +#endif + + /* + * Join paths with GMMR0AllocatePages for the allocation. + * Note! gmmR0AllocateMoreChunks may leave the protection of the mutex! 
+ */ + rc = gmmR0AllocatePagesNew(pGMM, pGVM, cPagesToAlloc, paPages, GMMACCOUNT_BASE); + } + } + else + rc = VERR_WRONG_ORDER; + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + } + else + rc = VERR_GMM_IS_NOT_SANE; + gmmR0MutexRelease(pGMM); + LogFlow(("GMMR0AllocateHandyPages: returns %Rrc\n", rc)); + return rc; +} + + +/** + * Allocate one or more pages. + * + * This is typically used for ROMs and MMIO2 (VRAM) during VM creation. + * The allocated pages are not cleared and will contain random garbage. + * + * @returns VBox status code: + * @retval VINF_SUCCESS on success. + * @retval VERR_NOT_OWNER if the caller is not an EMT. + * @retval VERR_GMM_SEED_ME if seeding via GMMR0SeedChunk is necessary. + * @retval VERR_GMM_HIT_GLOBAL_LIMIT if we've exhausted the available pages. + * @retval VERR_GMM_HIT_VM_ACCOUNT_LIMIT if we've hit the VM account limit, + * that is we're trying to allocate more than we've reserved. + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param cPages The number of pages to allocate. + * @param paPages Pointer to the page descriptors. + * See GMMPAGEDESC for details on what is expected on + * input. + * @param enmAccount The account to charge. + * + * @thread EMT. + */ +GMMR0DECL(int) GMMR0AllocatePages(PGVM pGVM, PVM pVM, VMCPUID idCpu, uint32_t cPages, PGMMPAGEDESC paPages, GMMACCOUNT enmAccount) +{ + LogFlow(("GMMR0AllocatePages: pGVM=%p pVM=%p cPages=%#x paPages=%p enmAccount=%d\n", pGVM, pVM, cPages, paPages, enmAccount)); + + /* + * Validate, get basics and take the semaphore. 
+ */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + + AssertPtrReturn(paPages, VERR_INVALID_PARAMETER); + AssertMsgReturn(enmAccount > GMMACCOUNT_INVALID && enmAccount < GMMACCOUNT_END, ("%d\n", enmAccount), VERR_INVALID_PARAMETER); + AssertMsgReturn(cPages > 0 && cPages < RT_BIT(32 - PAGE_SHIFT), ("%#x\n", cPages), VERR_INVALID_PARAMETER); + + for (unsigned iPage = 0; iPage < cPages; iPage++) + { + AssertMsgReturn( paPages[iPage].HCPhysGCPhys == NIL_RTHCPHYS + || paPages[iPage].HCPhysGCPhys == GMM_GCPHYS_UNSHAREABLE + || ( enmAccount == GMMACCOUNT_BASE + && paPages[iPage].HCPhysGCPhys <= GMM_GCPHYS_LAST + && !(paPages[iPage].HCPhysGCPhys & PAGE_OFFSET_MASK)), + ("#%#x: %RHp enmAccount=%d\n", iPage, paPages[iPage].HCPhysGCPhys, enmAccount), + VERR_INVALID_PARAMETER); + AssertMsgReturn(paPages[iPage].idPage == NIL_GMM_PAGEID, ("#%#x: %#x\n", iPage, paPages[iPage].idPage), VERR_INVALID_PARAMETER); + AssertMsgReturn(paPages[iPage].idSharedPage == NIL_GMM_PAGEID, ("#%#x: %#x\n", iPage, paPages[iPage].idSharedPage), VERR_INVALID_PARAMETER); + } + + gmmR0MutexAcquire(pGMM); + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + + /* No allocations before the initial reservation has been made! */ + if (RT_LIKELY( pGVM->gmm.s.Stats.Reserved.cBasePages + && pGVM->gmm.s.Stats.Reserved.cFixedPages + && pGVM->gmm.s.Stats.Reserved.cShadowPages)) + rc = gmmR0AllocatePagesNew(pGMM, pGVM, cPages, paPages, enmAccount); + else + rc = VERR_WRONG_ORDER; + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + } + else + rc = VERR_GMM_IS_NOT_SANE; + gmmR0MutexRelease(pGMM); + LogFlow(("GMMR0AllocatePages: returns %Rrc\n", rc)); + return rc; +} + + +/** + * VMMR0 request wrapper for GMMR0AllocatePages. + * + * @returns see GMMR0AllocatePages. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. 
+ * @param   pReq        Pointer to the request packet.
+ */
+GMMR0DECL(int) GMMR0AllocatePagesReq(PGVM pGVM, PVM pVM, VMCPUID idCpu, PGMMALLOCATEPAGESREQ pReq)
+{
+    /*
+     * Validate input and pass it on.
+     */
+    AssertPtrReturn(pReq, VERR_INVALID_POINTER);
+    /* First make sure the packet reaches up to aPages[0], i.e. that the
+       fixed part (including cPages) is covered by the stated size. */
+    AssertMsgReturn(pReq->Hdr.cbReq >= RT_UOFFSETOF(GMMALLOCATEPAGESREQ, aPages[0]),
+                    ("%#x < %#x\n", pReq->Hdr.cbReq, RT_UOFFSETOF(GMMALLOCATEPAGESREQ, aPages[0])),
+                    VERR_INVALID_PARAMETER);
+    /* Then require the size to match exactly what cPages descriptors need. */
+    AssertMsgReturn(pReq->Hdr.cbReq == RT_UOFFSETOF_DYN(GMMALLOCATEPAGESREQ, aPages[pReq->cPages]),
+                    ("%#x != %#x\n", pReq->Hdr.cbReq, RT_UOFFSETOF_DYN(GMMALLOCATEPAGESREQ, aPages[pReq->cPages])),
+                    VERR_INVALID_PARAMETER);
+
+    return GMMR0AllocatePages(pGVM, pVM, idCpu, pReq->cPages, &pReq->aPages[0], pReq->enmAccount);
+}
+
+
+/**
+ * Allocate a large page to represent guest RAM
+ *
+ * The allocated pages are not cleared and will contains random garbage.
+ *
+ * @returns VBox status code:
+ * @retval  VINF_SUCCESS on success.
+ * @retval  VERR_NOT_OWNER if the caller is not an EMT.
+ * @retval  VERR_GMM_SEED_ME if seeding via GMMR0SeedChunk is necessary.
+ * @retval  VERR_GMM_HIT_GLOBAL_LIMIT if we've exhausted the available pages.
+ * @retval  VERR_GMM_HIT_VM_ACCOUNT_LIMIT if we've hit the VM account limit,
+ *          that is we're trying to allocate more than we've reserved.
+ * @returns see GMMR0AllocatePages.
+ *
+ * @param   pGVM        The global (ring-0) VM structure.
+ * @param   pVM         The cross context VM structure.
+ * @param   idCpu       The VCPU id.
+ * @param   cbPage      Large page size.
+ * @param   pIdPage     Where to return the GMM page ID of the page.
+ * @param   pHCPhys     Where to return the host physical address of the page.
+ */
+GMMR0DECL(int)  GMMR0AllocateLargePage(PGVM pGVM, PVM pVM, VMCPUID idCpu, uint32_t cbPage, uint32_t *pIdPage, RTHCPHYS *pHCPhys)
+{
+    LogFlow(("GMMR0AllocateLargePage: pGVM=%p pVM=%p cbPage=%x\n", pGVM, pVM, cbPage));
+
+    /* Only a large page of exactly one chunk is supported. */
+    AssertReturn(cbPage == GMM_CHUNK_SIZE, VERR_INVALID_PARAMETER);
+    AssertPtrReturn(pIdPage, VERR_INVALID_PARAMETER);
+    AssertPtrReturn(pHCPhys, VERR_INVALID_PARAMETER);
+
+    /*
+     * Validate, get basics and take the semaphore.
+     */
+    PGMM pGMM;
+    GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE);
+    int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    /* Not supported in legacy mode where we allocate the memory in ring 3 and lock it in ring 0. */
+    if (pGMM->fLegacyAllocationMode)
+        return VERR_NOT_SUPPORTED;
+
+    *pHCPhys = NIL_RTHCPHYS;
+    *pIdPage = NIL_GMM_PAGEID;
+
+    gmmR0MutexAcquire(pGMM);
+    if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM))
+    {
+        const unsigned cPages = (GMM_CHUNK_SIZE >> PAGE_SHIFT);
+        if (RT_UNLIKELY(  pGVM->gmm.s.Stats.Allocated.cBasePages + pGVM->gmm.s.Stats.cBalloonedPages + cPages
+                        > pGVM->gmm.s.Stats.Reserved.cBasePages))
+        {
+            Log(("GMMR0AllocateLargePage: Reserved=%#llx Allocated+Requested=%#llx+%#x!\n",
+                 pGVM->gmm.s.Stats.Reserved.cBasePages, pGVM->gmm.s.Stats.Allocated.cBasePages, cPages));
+            gmmR0MutexRelease(pGMM);
+            return VERR_GMM_HIT_VM_ACCOUNT_LIMIT;
+        }
+
+        /*
+         * Allocate a new large page chunk.
+         *
+         * Note! We leave the giant GMM lock temporarily as the allocation might
+         *       take a long time.  gmmR0RegisterChunk will retake it (ugly).
+         */
+        AssertCompile(GMM_CHUNK_SIZE == _2M);
+        gmmR0MutexRelease(pGMM);
+
+        RTR0MEMOBJ hMemObj;
+        rc = RTR0MemObjAllocPhysEx(&hMemObj, GMM_CHUNK_SIZE, NIL_RTHCPHYS, GMM_CHUNK_SIZE);
+        if (RT_SUCCESS(rc))
+        {
+            PGMMCHUNKFREESET pSet = pGMM->fBoundMemoryMode ? &pGVM->gmm.s.Private : &pGMM->PrivateX;
+            PGMMCHUNK pChunk;
+            rc = gmmR0RegisterChunk(pGMM, pSet, hMemObj, pGVM->hSelf, GMM_CHUNK_FLAGS_LARGE_PAGE, &pChunk);
+            if (RT_SUCCESS(rc))
+            {
+                /*
+                 * Allocate all the pages in the chunk.
+                 */
+                /* Unlink the new chunk from the free list. */
+                gmmR0UnlinkChunk(pChunk);
+
+                /** @todo rewrite this to skip the looping. */
+                /* Allocate all pages. */
+                GMMPAGEDESC PageDesc;
+                gmmR0AllocatePage(pChunk, pGVM->hSelf, &PageDesc);
+
+                /* Return the first page as we'll use the whole chunk as one big page. */
+                *pIdPage = PageDesc.idPage;
+                *pHCPhys = PageDesc.HCPhysGCPhys;
+
+                for (unsigned i = 1; i < cPages; i++)
+                    gmmR0AllocatePage(pChunk, pGVM->hSelf, &PageDesc);
+
+                /* Update accounting. */
+                pGVM->gmm.s.Stats.Allocated.cBasePages += cPages;
+                pGVM->gmm.s.Stats.cPrivatePages        += cPages;
+                pGMM->cAllocatedPages                  += cPages;
+
+                gmmR0LinkChunk(pChunk, pSet);
+                gmmR0MutexRelease(pGMM);
+            }
+            else
+                RTR0MemObjFree(hMemObj, false /* fFreeMappings */);
+        }
+    }
+    else
+    {
+        gmmR0MutexRelease(pGMM);
+        rc = VERR_GMM_IS_NOT_SANE;
+    }
+
+    LogFlow(("GMMR0AllocateLargePage: returns %Rrc\n", rc));
+    return rc;
+}
+
+
+/**
+ * Free a large page.
+ *
+ * The whole chunk backing the large page is handed to gmmR0FreeChunk and the
+ * base/private/global page counters are reduced by one chunk's worth of pages.
+ *
+ * @returns VBox status code:
+ * @retval  VERR_NOT_SUPPORTED in legacy allocation mode.
+ * @retval  VERR_GMM_ATTEMPT_TO_FREE_TOO_MUCH if the VM has fewer base pages
+ *          allocated than a large page holds.
+ * @retval  VERR_GMM_PAGE_NOT_FOUND if idPage is unknown or not a private page.
+ * @retval  VERR_GMM_IS_NOT_SANE if the GMM sanity check fails on entry.
+ *
+ * @param   pGVM        The global (ring-0) VM structure.
+ * @param   pVM         The cross context VM structure.
+ * @param   idCpu       The VCPU id.
+ * @param   idPage      The large page id.
+ */
+GMMR0DECL(int)  GMMR0FreeLargePage(PGVM pGVM, PVM pVM, VMCPUID idCpu, uint32_t idPage)
+{
+    LogFlow(("GMMR0FreeLargePage: pGVM=%p pVM=%p idPage=%x\n", pGVM, pVM, idPage));
+
+    /*
+     * Validate, get basics and take the semaphore.
+     */
+    PGMM pGMM;
+    GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE);
+    int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu);
+    if (RT_FAILURE(rc))
+        return rc;
+
+    /* Not supported in legacy mode where we allocate the memory in ring 3 and lock it in ring 0. */
+    if (pGMM->fLegacyAllocationMode)
+        return VERR_NOT_SUPPORTED;
+
+    gmmR0MutexAcquire(pGMM);
+    if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM))
+    {
+        const unsigned cPages = (GMM_CHUNK_SIZE >> PAGE_SHIFT);
+
+        if (RT_UNLIKELY(pGVM->gmm.s.Stats.Allocated.cBasePages < cPages))
+        {
+            Log(("GMMR0FreeLargePage: allocated=%#llx cPages=%#x!\n", pGVM->gmm.s.Stats.Allocated.cBasePages, cPages));
+            gmmR0MutexRelease(pGMM);
+            return VERR_GMM_ATTEMPT_TO_FREE_TOO_MUCH;
+        }
+
+        /* NOTE(review): unlike GMMR0AllocateHandyPages, the page's owner
+           (pPage->Private.hGVM) is not compared with pGVM->hSelf here - confirm
+           whether that is intentional. */
+        PGMMPAGE pPage = gmmR0GetPage(pGMM, idPage);
+        if (RT_LIKELY(   pPage
+                      && GMM_PAGE_IS_PRIVATE(pPage)))
+        {
+            PGMMCHUNK pChunk = gmmR0GetChunk(pGMM, idPage >> GMM_CHUNKID_SHIFT);
+            Assert(pChunk);
+            Assert(pChunk->cFree < GMM_CHUNK_NUM_PAGES);
+            Assert(pChunk->cPrivate > 0);
+
+            /* Release the memory immediately. */
+            gmmR0FreeChunk(pGMM, NULL, pChunk, false /*fRelaxedSem*/); /** @todo this can be relaxed too! */
+
+            /* Update accounting. */
+            pGVM->gmm.s.Stats.Allocated.cBasePages -= cPages;
+            pGVM->gmm.s.Stats.cPrivatePages        -= cPages;
+            pGMM->cAllocatedPages                  -= cPages;
+        }
+        else
+            rc = VERR_GMM_PAGE_NOT_FOUND;
+    }
+    else
+        rc = VERR_GMM_IS_NOT_SANE;
+
+    gmmR0MutexRelease(pGMM);
+    LogFlow(("GMMR0FreeLargePage: returns %Rrc\n", rc));
+    return rc;
+}
+
+
+/**
+ * VMMR0 request wrapper for GMMR0FreeLargePage.
+ *
+ * @returns see GMMR0FreeLargePage.
+ * @retval  VERR_INVALID_POINTER if pReq is not a valid pointer.
+ * @retval  VERR_INVALID_PARAMETER if the packet size is not that of the
+ *          free-large-page request structure.
+ * @param   pGVM        The global (ring-0) VM structure.
+ * @param   pVM         The cross context VM structure.
+ * @param   idCpu       The VCPU id.
+ * @param   pReq        Pointer to the request packet.
+ */
+GMMR0DECL(int) GMMR0FreeLargePageReq(PGVM pGVM, PVM pVM, VMCPUID idCpu, PGMMFREELARGEPAGEREQ pReq)
+{
+    /*
+     * Validate input and pass it on.
+     */
+    AssertPtrReturn(pReq, VERR_INVALID_POINTER);
+    /* Compare against the size of the structure pReq actually points to; this
+       used to be checked against the unrelated GMMFREEPAGESREQ structure. */
+    AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq),
+                    ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)),
+                    VERR_INVALID_PARAMETER);
+
+    return GMMR0FreeLargePage(pGVM, pVM, idCpu, pReq->idPage);
+}
+
+
+/**
+ * Frees a chunk, giving it back to the host OS.
+ *
+ * @returns @c true if the chunk was freed and the giant GMM mutex was released
+ *          and re-acquired while doing so (only when @a fRelaxedSem is set),
+ *          @c false otherwise.  @c false is also returned when the chunk is
+ *          still mapped and was therefore left alone.
+ * + * @param pGMM Pointer to the GMM instance. + * @param pGVM This is set when called from GMMR0CleanupVM so we can + * unmap and free the chunk in one go. + * @param pChunk The chunk to free. + * @param fRelaxedSem Whether we can release the semaphore while doing the + * freeing (@c true) or not. + */ +static bool gmmR0FreeChunk(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk, bool fRelaxedSem) +{ + Assert(pChunk->Core.Key != NIL_GMM_CHUNKID); + + GMMR0CHUNKMTXSTATE MtxState; + gmmR0ChunkMutexAcquire(&MtxState, pGMM, pChunk, GMMR0CHUNK_MTX_KEEP_GIANT); + + /* + * Cleanup hack! Unmap the chunk from the callers address space. + * This shouldn't happen, so screw lock contention... + */ + if ( pChunk->cMappingsX + && !pGMM->fLegacyAllocationMode + && pGVM) + gmmR0UnmapChunkLocked(pGMM, pGVM, pChunk); + + /* + * If there are current mappings of the chunk, then request the + * VMs to unmap them. Reposition the chunk in the free list so + * it won't be a likely candidate for allocations. + */ + if (pChunk->cMappingsX) + { + /** @todo R0 -> VM request */ + /* The chunk can be mapped by more than one VM if fBoundMemoryMode is false! */ + Log(("gmmR0FreeChunk: chunk still has %d mappings; don't free!\n", pChunk->cMappingsX)); + gmmR0ChunkMutexRelease(&MtxState, pChunk); + return false; + } + + + /* + * Save and trash the handle. + */ + RTR0MEMOBJ const hMemObj = pChunk->hMemObj; + pChunk->hMemObj = NIL_RTR0MEMOBJ; + + /* + * Unlink it from everywhere. + */ + gmmR0UnlinkChunk(pChunk); + + RTListNodeRemove(&pChunk->ListNode); + + PAVLU32NODECORE pCore = RTAvlU32Remove(&pGMM->pChunks, pChunk->Core.Key); + Assert(pCore == &pChunk->Core); NOREF(pCore); + + PGMMCHUNKTLBE pTlbe = &pGMM->ChunkTLB.aEntries[GMM_CHUNKTLB_IDX(pChunk->Core.Key)]; + if (pTlbe->pChunk == pChunk) + { + pTlbe->idChunk = NIL_GMM_CHUNKID; + pTlbe->pChunk = NULL; + } + + Assert(pGMM->cChunks > 0); + pGMM->cChunks--; + + /* + * Free the Chunk ID before dropping the locks and freeing the rest. 
+ */ + gmmR0FreeChunkId(pGMM, pChunk->Core.Key); + pChunk->Core.Key = NIL_GMM_CHUNKID; + + pGMM->cFreedChunks++; + + gmmR0ChunkMutexRelease(&MtxState, NULL); + if (fRelaxedSem) /* optionally drop the giant GMM mutex while freeing; it is retaken below */ + gmmR0MutexRelease(pGMM); + + RTMemFree(pChunk->paMappingsX); + pChunk->paMappingsX = NULL; + + RTMemFree(pChunk); /* pChunk must not be touched after this point */ + + int rc = RTR0MemObjFree(hMemObj, false /* fFreeMappings */); + AssertLogRelRC(rc); + + if (fRelaxedSem) + gmmR0MutexAcquire(pGMM); + return fRelaxedSem; /* i.e. whether the giant mutex was dropped and retaken above */ +} + + +/** + * Free page worker. + * + * Marks the page as free, pushes it onto the head of the chunk's free page + * list and bumps the free counters, relinking the chunk when needed. + * + * The caller does all the statistic decrementing, we do all the incrementing. + * + * @param pGMM Pointer to the GMM instance data. + * @param pGVM Pointer to the GVM instance. + * @param pChunk Pointer to the chunk this page belongs to. + * @param idPage The Page ID (only used for logging here). + * @param pPage Pointer to the page. + */ +static void gmmR0FreePageWorker(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk, uint32_t idPage, PGMMPAGE pPage) +{ + Log3(("F pPage=%p iPage=%#x/%#x u2State=%d iFreeHead=%#x\n", + pPage, pPage - &pChunk->aPages[0], idPage, pPage->Common.u2State, pChunk->iFreeHead)); NOREF(idPage); + + /* + * Put the page on the free list. + * + * The list is threaded through the page entries by index: the page's + * iNext takes the old head and the head becomes this page's index. + */ + pPage->u = 0; + pPage->Free.u2State = GMM_PAGE_STATE_FREE; + Assert(pChunk->iFreeHead < RT_ELEMENTS(pChunk->aPages) || pChunk->iFreeHead == UINT16_MAX); + pPage->Free.iNext = pChunk->iFreeHead; + pChunk->iFreeHead = pPage - &pChunk->aPages[0]; + + /* + * Update statistics (the cShared/cPrivate stats are up to date already), + * and relink the chunk if necessary. + * + * Relinking is done when the chunk had no free pages before, or when + * the new free count selects a different free set list. Otherwise the + * chunk and set counters are simply incremented in place. + */ + unsigned const cFree = pChunk->cFree; + if ( !cFree + || gmmR0SelectFreeSetList(cFree) != gmmR0SelectFreeSetList(cFree + 1)) + { + gmmR0UnlinkChunk(pChunk); + pChunk->cFree++; + gmmR0SelectSetAndLinkChunk(pGMM, pGVM, pChunk); + } + else + { + pChunk->cFree = cFree + 1; + pChunk->pSet->cFreePages++; + } + + /* + * If the chunk becomes empty, consider giving memory back to the host OS. 
+ * + * The current strategy is to try give it back if there are other chunks + * in this free list, meaning if there are at least 240 free pages in this + * category. Note that since there are probably mappings of the chunk, + * it won't be freed up instantly, which probably screws up this logic + * a bit... + */ + /** @todo Do this on the way out. */ + if (RT_UNLIKELY( pChunk->cFree == GMM_CHUNK_NUM_PAGES + && pChunk->pFreeNext + && pChunk->pFreePrev /** @todo this is probably misfiring, see reset... */ + && !pGMM->fLegacyAllocationMode)) + gmmR0FreeChunk(pGMM, NULL, pChunk, false); + +} + + +/** + * Frees a shared page, the page is known to exist and be valid and such. + * + * The caller must already have dropped the last reference (the reference + * count is asserted to be zero). This decrements the chunk's shared page + * count and the global allocated and shared page counts, then hands the page + * to gmmR0FreePageWorker. + * + * @param pGMM Pointer to the GMM instance. + * @param pGVM Pointer to the GVM instance. + * @param idPage The page id. + * @param pPage The page structure. + */ +DECLINLINE(void) gmmR0FreeSharedPage(PGMM pGMM, PGVM pGVM, uint32_t idPage, PGMMPAGE pPage) +{ + PGMMCHUNK pChunk = gmmR0GetChunk(pGMM, idPage >> GMM_CHUNKID_SHIFT); /* the chunk ID is the upper part of the page ID */ + Assert(pChunk); + Assert(pChunk->cFree < GMM_CHUNK_NUM_PAGES); + Assert(pChunk->cShared > 0); + Assert(pGMM->cSharedPages > 0); + Assert(pGMM->cAllocatedPages > 0); + Assert(!pPage->Shared.cRefs); + + pChunk->cShared--; + pGMM->cAllocatedPages--; + pGMM->cSharedPages--; + gmmR0FreePageWorker(pGMM, pGVM, pChunk, idPage, pPage); +} + + +/** + * Frees a private page, the page is known to exist and be valid and such. + * + * @param pGMM Pointer to the GMM instance. + * @param pGVM Pointer to the GVM instance. + * @param idPage The page id. + * @param pPage The page structure. 
+ */ +DECLINLINE(void) gmmR0FreePrivatePage(PGMM pGMM, PGVM pGVM, uint32_t idPage, PGMMPAGE pPage) +{ + PGMMCHUNK pChunk = gmmR0GetChunk(pGMM, idPage >> GMM_CHUNKID_SHIFT); /* the chunk ID is the upper part of the page ID */ + Assert(pChunk); + Assert(pChunk->cFree < GMM_CHUNK_NUM_PAGES); + Assert(pChunk->cPrivate > 0); + Assert(pGMM->cAllocatedPages > 0); + + pChunk->cPrivate--; /* only the chunk and global counters are touched here, no per-VM statistics */ + pGMM->cAllocatedPages--; + gmmR0FreePageWorker(pGMM, pGVM, pChunk, idPage, pPage); +} + + +/** + * Common worker for GMMR0FreePages and GMMR0BalloonedPages. + * + * @returns VBox status code: + * @retval VINF_SUCCESS if all the pages were freed. + * @retval VERR_GMM_ATTEMPT_TO_FREE_TOO_MUCH if cPages exceeds what is + * currently allocated on the given account. + * @retval VERR_GMM_NOT_PAGE_OWNER if a private page belongs to another VM. + * @retval VERR_GMM_PAGE_ALREADY_FREE if a page is neither private nor shared. + * @retval VERR_GMM_PAGE_NOT_FOUND if a page ID cannot be resolved. + * @retval VERR_IPE_NOT_REACHED_DEFAULT_CASE if enmAccount is not one of + * the base, shadow or fixed accounts. + * + * @param pGMM Pointer to the GMM instance data. + * @param pGVM Pointer to the VM. + * @param cPages The number of pages to free. + * @param paPages Pointer to the page descriptors. + * @param enmAccount The account this relates to. + */ +static int gmmR0FreePages(PGMM pGMM, PGVM pGVM, uint32_t cPages, PGMMFREEPAGEDESC paPages, GMMACCOUNT enmAccount) +{ + /* + * Check that the request isn't impossible with respect to the account status. + */ + switch (enmAccount) + { + case GMMACCOUNT_BASE: + if (RT_UNLIKELY(pGVM->gmm.s.Stats.Allocated.cBasePages < cPages)) + { + Log(("gmmR0FreePages: allocated=%#llx cPages=%#x!\n", pGVM->gmm.s.Stats.Allocated.cBasePages, cPages)); + return VERR_GMM_ATTEMPT_TO_FREE_TOO_MUCH; + } + break; + case GMMACCOUNT_SHADOW: + if (RT_UNLIKELY(pGVM->gmm.s.Stats.Allocated.cShadowPages < cPages)) + { + Log(("gmmR0FreePages: allocated=%#llx cPages=%#x!\n", pGVM->gmm.s.Stats.Allocated.cShadowPages, cPages)); + return VERR_GMM_ATTEMPT_TO_FREE_TOO_MUCH; + } + break; + case GMMACCOUNT_FIXED: + if (RT_UNLIKELY(pGVM->gmm.s.Stats.Allocated.cFixedPages < cPages)) + { + Log(("gmmR0FreePages: allocated=%#llx cPages=%#x!\n", pGVM->gmm.s.Stats.Allocated.cFixedPages, cPages)); + return VERR_GMM_ATTEMPT_TO_FREE_TOO_MUCH; + } + break; + default: + AssertMsgFailedReturn(("enmAccount=%d\n", enmAccount), VERR_IPE_NOT_REACHED_DEFAULT_CASE); + } + + /* + * Walk the descriptors and free the pages. 
+ * + * Statistics (except the account) are being updated as we go along, + * unlike the alloc code. Also, stop on the first error. + */ + int rc = VINF_SUCCESS; + uint32_t iPage; + for (iPage = 0; iPage < cPages; iPage++) + { + uint32_t idPage = paPages[iPage].idPage; + PGMMPAGE pPage = gmmR0GetPage(pGMM, idPage); + if (RT_LIKELY(pPage)) + { + if (RT_LIKELY(GMM_PAGE_IS_PRIVATE(pPage))) + { + if (RT_LIKELY(pPage->Private.hGVM == pGVM->hSelf)) + { + Assert(pGVM->gmm.s.Stats.cPrivatePages); + pGVM->gmm.s.Stats.cPrivatePages--; + gmmR0FreePrivatePage(pGMM, pGVM, idPage, pPage); + } + else + { + Log(("gmmR0AllocatePages: #%#x/%#x: not owner! hGVM=%#x hSelf=%#x\n", iPage, idPage, + pPage->Private.hGVM, pGVM->hSelf)); + rc = VERR_GMM_NOT_PAGE_OWNER; + break; + } + } + else if (RT_LIKELY(GMM_PAGE_IS_SHARED(pPage))) + { + Assert(pGVM->gmm.s.Stats.cSharedPages); + Assert(pPage->Shared.cRefs); +#if defined(VBOX_WITH_PAGE_SHARING) && defined(VBOX_STRICT) && HC_ARCH_BITS == 64 + if (pPage->Shared.u14Checksum) + { + uint32_t uChecksum = gmmR0StrictPageChecksum(pGMM, pGVM, idPage); + uChecksum &= UINT32_C(0x00003fff); + AssertMsg(!uChecksum || uChecksum == pPage->Shared.u14Checksum, + ("%#x vs %#x - idPage=%#x\n", uChecksum, pPage->Shared.u14Checksum, idPage)); + } +#endif + pGVM->gmm.s.Stats.cSharedPages--; + if (!--pPage->Shared.cRefs) + gmmR0FreeSharedPage(pGMM, pGVM, idPage, pPage); + else + { + Assert(pGMM->cDuplicatePages); + pGMM->cDuplicatePages--; + } + } + else + { + Log(("gmmR0AllocatePages: #%#x/%#x: already free!\n", iPage, idPage)); + rc = VERR_GMM_PAGE_ALREADY_FREE; + break; + } + } + else + { + Log(("gmmR0AllocatePages: #%#x/%#x: not found!\n", iPage, idPage)); + rc = VERR_GMM_PAGE_NOT_FOUND; + break; + } + paPages[iPage].idPage = NIL_GMM_PAGEID; + } + + /* + * Update the account. 
+ */ + switch (enmAccount) /* iPage is the number of pages actually freed; less than cPages if the loop stopped on an error */ + { + case GMMACCOUNT_BASE: pGVM->gmm.s.Stats.Allocated.cBasePages -= iPage; break; + case GMMACCOUNT_SHADOW: pGVM->gmm.s.Stats.Allocated.cShadowPages -= iPage; break; + case GMMACCOUNT_FIXED: pGVM->gmm.s.Stats.Allocated.cFixedPages -= iPage; break; + default: + AssertMsgFailedReturn(("enmAccount=%d\n", enmAccount), VERR_IPE_NOT_REACHED_DEFAULT_CASE); + } + + /* + * Any threshold stuff to be done here? + */ + + return rc; +} + + +/** + * Free one or more pages. + * + * This is typically used at reset time or power off. + * + * @returns VBox status code: + * @retval VERR_INVALID_PARAMETER if the descriptor array pointer, the + * account, the page count or any of the page IDs is invalid. + * @retval VERR_GMM_IS_NOT_SANE if the GMM sanity check fails on entry. + * @retval Other status codes as returned by the gmmR0FreePages worker. + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param cPages The number of pages to free. + * @param paPages Pointer to the page descriptors containing the page IDs + * for each page. + * @param enmAccount The account this relates to. + * @thread EMT. + */ +GMMR0DECL(int) GMMR0FreePages(PGVM pGVM, PVM pVM, VMCPUID idCpu, uint32_t cPages, PGMMFREEPAGEDESC paPages, GMMACCOUNT enmAccount) +{ + LogFlow(("GMMR0FreePages: pGVM=%p pVM=%p cPages=%#x paPages=%p enmAccount=%d\n", pGVM, pVM, cPages, paPages, enmAccount)); + + /* + * Validate input and get the basics. 
+ */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + + AssertPtrReturn(paPages, VERR_INVALID_PARAMETER); + AssertMsgReturn(enmAccount > GMMACCOUNT_INVALID && enmAccount < GMMACCOUNT_END, ("%d\n", enmAccount), VERR_INVALID_PARAMETER); + AssertMsgReturn(cPages > 0 && cPages < RT_BIT(32 - PAGE_SHIFT), ("%#x\n", cPages), VERR_INVALID_PARAMETER); /* non-zero, and the byte count (cPages << PAGE_SHIFT) must stay below 2^32 */ + + for (unsigned iPage = 0; iPage < cPages; iPage++) /* check every page ID up front, before taking the mutex */ + AssertMsgReturn( paPages[iPage].idPage <= GMM_PAGEID_LAST + /*|| paPages[iPage].idPage == NIL_GMM_PAGEID*/, + ("#%#x: %#x\n", iPage, paPages[iPage].idPage), VERR_INVALID_PARAMETER); + + /* + * Take the semaphore and call the worker function. + * + * The worker is only called when the sanity check on entry passes; the + * mutex is released again on both paths. + */ + gmmR0MutexAcquire(pGMM); + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + rc = gmmR0FreePages(pGMM, pGVM, cPages, paPages, enmAccount); + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + } + else + rc = VERR_GMM_IS_NOT_SANE; + gmmR0MutexRelease(pGMM); + LogFlow(("GMMR0FreePages: returns %Rrc\n", rc)); + return rc; +} + + +/** + * VMMR0 request wrapper for GMMR0FreePages. + * + * Validates the size of the variable length request packet and passes its + * page count, descriptor array and account on to GMMR0FreePages. + * + * @returns see GMMR0FreePages. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param pReq Pointer to the request packet. + */ +GMMR0DECL(int) GMMR0FreePagesReq(PGVM pGVM, PVM pVM, VMCPUID idCpu, PGMMFREEPAGESREQ pReq) +{ + /* + * Validate input and pass it on. 
+ */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq >= RT_UOFFSETOF(GMMFREEPAGESREQ, aPages[0]), /* the fixed part (up to aPages) must be present ... */ + ("%#x < %#x\n", pReq->Hdr.cbReq, RT_UOFFSETOF(GMMFREEPAGESREQ, aPages[0])), + VERR_INVALID_PARAMETER); + AssertMsgReturn(pReq->Hdr.cbReq == RT_UOFFSETOF_DYN(GMMFREEPAGESREQ, aPages[pReq->cPages]), /* ... and the total size must match exactly cPages descriptors */ + ("%#x != %#x\n", pReq->Hdr.cbReq, RT_UOFFSETOF_DYN(GMMFREEPAGESREQ, aPages[pReq->cPages])), + VERR_INVALID_PARAMETER); + + return GMMR0FreePages(pGVM, pVM, idCpu, pReq->cPages, &pReq->aPages[0], pReq->enmAccount); +} + + +/** + * Report back on a memory ballooning request. + * + * The request may or may not have been initiated by the GMM. If it was initiated + * by the GMM it is important that this function is called even if no pages were + * ballooned. + * + * @returns VBox status code: + * @retval VERR_INVALID_PARAMETER if cBalloonedPages is too large or + * enmAction is not inflate, deflate or reset. + * @retval VERR_GMM_IS_NOT_SANE if the GMM sanity check fails on entry. + * @retval VERR_GMM_ATTEMPT_TO_FREE_TOO_MUCH + * @retval VERR_GMM_ATTEMPT_TO_DEFLATE_TOO_MUCH + * @retval VERR_GMM_OVERCOMMITTED_TRY_AGAIN_IN_A_BIT - reset condition + * indicating that we won't necessarily have sufficient RAM to boot + * the VM again and that it should pause until this changes (we'll try + * balloon some other VM). (For standard deflate we have little choice + * but to hope the VM won't use the memory that was returned to it.) + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param enmAction Inflate/deflate/reset. + * @param cBalloonedPages The number of pages that was ballooned. + * + * @thread EMT(idCpu) + */ +GMMR0DECL(int) GMMR0BalloonedPages(PGVM pGVM, PVM pVM, VMCPUID idCpu, GMMBALLOONACTION enmAction, uint32_t cBalloonedPages) +{ + LogFlow(("GMMR0BalloonedPages: pGVM=%p pVM=%p enmAction=%d cBalloonedPages=%#x\n", + pGVM, pVM, enmAction, cBalloonedPages)); + + AssertMsgReturn(cBalloonedPages < RT_BIT(32 - PAGE_SHIFT), ("%#x\n", cBalloonedPages), VERR_INVALID_PARAMETER); /* zero is accepted here, unlike in GMMR0FreePages */ + + /* + * Validate input and get the basics. 
+ */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + + /* + * Take the semaphore and do some more validations. + */ + gmmR0MutexAcquire(pGMM); + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + switch (enmAction) + { + case GMMBALLOONACTION_INFLATE: + { + /* Allocated + already ballooned + newly ballooned pages must stay within the base page reservation. */ + if (RT_LIKELY(pGVM->gmm.s.Stats.Allocated.cBasePages + pGVM->gmm.s.Stats.cBalloonedPages + cBalloonedPages + <= pGVM->gmm.s.Stats.Reserved.cBasePages)) + { + /* + * Record the ballooned memory. + */ + pGMM->cBalloonedPages += cBalloonedPages; + if (pGVM->gmm.s.Stats.cReqBalloonedPages) + { + /* Codepath never taken. Might be interesting in the future to request ballooned memory from guests in low memory conditions.. */ + AssertFailed(); + + pGVM->gmm.s.Stats.cBalloonedPages += cBalloonedPages; + pGVM->gmm.s.Stats.cReqActuallyBalloonedPages += cBalloonedPages; + Log(("GMMR0BalloonedPages: +%#x - Global=%#llx / VM: Total=%#llx Req=%#llx Actual=%#llx (pending)\n", + cBalloonedPages, pGMM->cBalloonedPages, pGVM->gmm.s.Stats.cBalloonedPages, + pGVM->gmm.s.Stats.cReqBalloonedPages, pGVM->gmm.s.Stats.cReqActuallyBalloonedPages)); + } + else + { + pGVM->gmm.s.Stats.cBalloonedPages += cBalloonedPages; + Log(("GMMR0BalloonedPages: +%#x - Global=%#llx / VM: Total=%#llx (user)\n", + cBalloonedPages, pGMM->cBalloonedPages, pGVM->gmm.s.Stats.cBalloonedPages)); + } + } + else + { + /* Note: the cBalloonedPages parameter is 32-bit, so it is formatted with %#x like in the messages above. */ + Log(("GMMR0BalloonedPages: cBasePages=%#llx Total=%#llx cBalloonedPages=%#x Reserved=%#llx\n", + pGVM->gmm.s.Stats.Allocated.cBasePages, pGVM->gmm.s.Stats.cBalloonedPages, cBalloonedPages, + pGVM->gmm.s.Stats.Reserved.cBasePages)); + rc = VERR_GMM_ATTEMPT_TO_FREE_TOO_MUCH; + } + break; + } + + case GMMBALLOONACTION_DEFLATE: + { + /* Deflate. The VM cannot give back more than it has ballooned. */ + if (pGVM->gmm.s.Stats.cBalloonedPages >= cBalloonedPages) + { + /* + * Record the ballooned memory. 
+ */ + Assert(pGMM->cBalloonedPages >= cBalloonedPages); + pGMM->cBalloonedPages -= cBalloonedPages; + pGVM->gmm.s.Stats.cBalloonedPages -= cBalloonedPages; + if (pGVM->gmm.s.Stats.cReqDeflatePages) + { + AssertFailed(); /* This path is for later. */ + Log(("GMMR0BalloonedPages: -%#x - Global=%#llx / VM: Total=%#llx Req=%#llx\n", + cBalloonedPages, pGMM->cBalloonedPages, pGVM->gmm.s.Stats.cBalloonedPages, pGVM->gmm.s.Stats.cReqDeflatePages)); + + /* + * Anything we need to do here now when the request has been completed? + */ + pGVM->gmm.s.Stats.cReqDeflatePages = 0; + } + else + Log(("GMMR0BalloonedPages: -%#x - Global=%#llx / VM: Total=%#llx (user)\n", + cBalloonedPages, pGMM->cBalloonedPages, pGVM->gmm.s.Stats.cBalloonedPages)); + } + else + { + /* Note: the cBalloonedPages parameter is 32-bit, so it is formatted with %#x like in the messages above. */ + Log(("GMMR0BalloonedPages: Total=%#llx cBalloonedPages=%#x\n", pGVM->gmm.s.Stats.cBalloonedPages, cBalloonedPages)); + rc = VERR_GMM_ATTEMPT_TO_DEFLATE_TOO_MUCH; + } + break; + } + + case GMMBALLOONACTION_RESET: + { + /* Reset to an empty balloon: take this VM's share out of the global count, then zero it. */ + Assert(pGMM->cBalloonedPages >= pGVM->gmm.s.Stats.cBalloonedPages); + + pGMM->cBalloonedPages -= pGVM->gmm.s.Stats.cBalloonedPages; + pGVM->gmm.s.Stats.cBalloonedPages = 0; + break; + } + + default: + rc = VERR_INVALID_PARAMETER; + break; + } + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + } + else + rc = VERR_GMM_IS_NOT_SANE; + + gmmR0MutexRelease(pGMM); + LogFlow(("GMMR0BalloonedPages: returns %Rrc\n", rc)); + return rc; +} + + +/** + * VMMR0 request wrapper for GMMR0BalloonedPages. + * + * @returns see GMMR0BalloonedPages. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param pReq Pointer to the request packet. + */ +GMMR0DECL(int) GMMR0BalloonedPagesReq(PGVM pGVM, PVM pVM, VMCPUID idCpu, PGMMBALLOONEDPAGESREQ pReq) +{ + /* + * Validate input and pass it on. 
+ */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(GMMBALLOONEDPAGESREQ), + ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(GMMBALLOONEDPAGESREQ)), + VERR_INVALID_PARAMETER); + + return GMMR0BalloonedPages(pGVM, pVM, idCpu, pReq->enmAction, pReq->cBalloonedPages); +} + + +/** + * Return memory statistics for the hypervisor + * + * Fills the request packet with the global GMM page counters. + * + * @returns VBox status code. + * @retval VERR_INVALID_POINTER if pReq is not a valid pointer. + * @retval VERR_INVALID_PARAMETER if the packet size is wrong. + * @param pReq Pointer to the request packet. + */ +GMMR0DECL(int) GMMR0QueryHypervisorMemoryStatsReq(PGMMMEMSTATSREQ pReq) +{ + /* + * Validate input and pass it on. + */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(GMMMEMSTATSREQ), + ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(GMMMEMSTATSREQ)), + VERR_INVALID_PARAMETER); + + /* + * Validate input and get the basics. + * + * NOTE(review): the counters below are read without taking the GMM + * mutex - confirm that a possibly inconsistent snapshot is acceptable. + */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + pReq->cAllocPages = pGMM->cAllocatedPages; + pReq->cFreePages = (pGMM->cChunks << (GMM_CHUNK_SHIFT- PAGE_SHIFT)) - pGMM->cAllocatedPages; /* pages in all chunks minus the allocated ones */ + pReq->cBalloonedPages = pGMM->cBalloonedPages; + pReq->cMaxPages = pGMM->cMaxPages; + pReq->cSharedPages = pGMM->cDuplicatePages; + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + + return VINF_SUCCESS; +} + + +/** + * Return memory statistics for the VM + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu Cpu id. + * @param pReq Pointer to the request packet. + * + * @thread EMT(idCpu) + */ +GMMR0DECL(int) GMMR0QueryMemoryStatsReq(PGVM pGVM, PVM pVM, VMCPUID idCpu, PGMMMEMSTATSREQ pReq) +{ + /* + * Validate input and pass it on. + */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(GMMMEMSTATSREQ), + ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(GMMMEMSTATSREQ)), + VERR_INVALID_PARAMETER); + + /* + * Validate input and get the basics. 
+ */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + + /* + * Take the semaphore and do some more validations. + */ + gmmR0MutexAcquire(pGMM); + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + pReq->cAllocPages = pGVM->gmm.s.Stats.Allocated.cBasePages; + pReq->cBalloonedPages = pGVM->gmm.s.Stats.cBalloonedPages; + pReq->cMaxPages = pGVM->gmm.s.Stats.Reserved.cBasePages; + pReq->cFreePages = pReq->cMaxPages - pReq->cAllocPages; /* what is left of the base page reservation */ + } + else + rc = VERR_GMM_IS_NOT_SANE; + + gmmR0MutexRelease(pGMM); + LogFlow(("GMMR0QueryMemoryStatsReq: returns %Rrc\n", rc)); + return rc; +} + + +/** + * Worker for gmmR0UnmapChunk and gmmR0FreeChunk. + * + * Looks up the mapping record the given VM has for the chunk, frees the + * mapping object and removes the record from the chunk's mapping array. + * + * Don't call this in legacy allocation mode! + * + * @returns VBox status code. + * @retval VERR_GMM_CHUNK_NOT_MAPPED if the VM has no mapping of the chunk. + * @param pGMM Pointer to the GMM instance data. + * @param pGVM Pointer to the Global VM structure. + * @param pChunk Pointer to the chunk to be unmapped. + */ +static int gmmR0UnmapChunkLocked(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk) +{ + Assert(!pGMM->fLegacyAllocationMode); NOREF(pGMM); + + /* + * Find the mapping and try unmapping it. + */ + uint32_t cMappings = pChunk->cMappingsX; + for (uint32_t i = 0; i < cMappings; i++) + { + Assert(pChunk->paMappingsX[i].pGVM && pChunk->paMappingsX[i].hMapObj != NIL_RTR0MEMOBJ); + if (pChunk->paMappingsX[i].pGVM == pGVM) + { + /* unmap */ + int rc = RTR0MemObjFree(pChunk->paMappingsX[i].hMapObj, false /* fFreeMappings (NA) */); + if (RT_SUCCESS(rc)) + { + /* update the record. 
*/ + cMappings--; /* the hole is filled with the last entry (order is not kept) and the freed last slot is cleared */ + if (i < cMappings) + pChunk->paMappingsX[i] = pChunk->paMappingsX[cMappings]; + pChunk->paMappingsX[cMappings].hMapObj = NIL_RTR0MEMOBJ; + pChunk->paMappingsX[cMappings].pGVM = NULL; + Assert(pChunk->cMappingsX - 1U == cMappings); + pChunk->cMappingsX = cMappings; + } + + return rc; /* on failure the mapping record is left in place */ + } + } + + Log(("gmmR0UnmapChunk: Chunk %#x is not mapped into pGVM=%p/%#x\n", pChunk->Core.Key, pGVM, pGVM->hSelf)); + return VERR_GMM_CHUNK_NOT_MAPPED; +} + + +/** + * Unmaps a chunk previously mapped into the address space of the current process. + * + * Outside legacy allocation mode this takes the chunk mutex and lets + * gmmR0UnmapChunkLocked do the work. In legacy mode nothing is unmapped; the + * call merely checks that the chunk belongs to the calling VM. + * + * @returns VBox status code. + * @retval VERR_GMM_CHUNK_NOT_MAPPED in legacy mode if the chunk belongs to + * a different VM. + * @param pGMM Pointer to the GMM instance data. + * @param pGVM Pointer to the Global VM structure. + * @param pChunk Pointer to the chunk to be unmapped. + * @param fRelaxedSem Whether we can release the semaphore while doing the + * unmapping (@c true) or not. + */ +static int gmmR0UnmapChunk(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk, bool fRelaxedSem) +{ + if (!pGMM->fLegacyAllocationMode) + { + /* + * Lock the chunk and if possible leave the giant GMM lock. + */ + GMMR0CHUNKMTXSTATE MtxState; + int rc = gmmR0ChunkMutexAcquire(&MtxState, pGMM, pChunk, + fRelaxedSem ? GMMR0CHUNK_MTX_RETAKE_GIANT : GMMR0CHUNK_MTX_KEEP_GIANT); + if (RT_SUCCESS(rc)) + { + rc = gmmR0UnmapChunkLocked(pGMM, pGVM, pChunk); + gmmR0ChunkMutexRelease(&MtxState, pChunk); + } + return rc; + } + + if (pChunk->hGVM == pGVM->hSelf) /* legacy mode: only an ownership check, nothing to undo */ + return VINF_SUCCESS; + + Log(("gmmR0UnmapChunk: Chunk %#x is not mapped into pGVM=%p/%#x (legacy)\n", pChunk->Core.Key, pGVM, pGVM->hSelf)); + return VERR_GMM_CHUNK_NOT_MAPPED; +} + + +/** + * Worker for gmmR0MapChunk. + * + * @returns VBox status code. + * @param pGMM Pointer to the GMM instance data. + * @param pGVM Pointer to the Global VM structure. + * @param pChunk Pointer to the chunk to be mapped. + * @param ppvR3 Where to store the ring-3 address of the mapping. + * In the VERR_GMM_CHUNK_ALREADY_MAPPED case, this will be + * contain the address of the existing mapping. 
+ */ +static int gmmR0MapChunkLocked(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk, PRTR3PTR ppvR3) +{ + /* + * If we're in legacy mode this is simple. + * + * The chunk is either owned by the calling VM, in which case the ring-3 + * address of the chunk memory object is returned, or it is treated as + * not found. *ppvR3 is not touched in the failure case. + */ + if (pGMM->fLegacyAllocationMode) + { + if (pChunk->hGVM != pGVM->hSelf) + { + Log(("gmmR0MapChunk: chunk %#x is not owned by this VM (legacy)! hGVM=%#x hSelf=%#x\n", + pChunk->Core.Key, pChunk->hGVM, pGVM->hSelf)); + return VERR_GMM_CHUNK_NOT_FOUND; + } + + *ppvR3 = RTR0MemObjAddressR3(pChunk->hMemObj); + return VINF_SUCCESS; + } + + /* + * Check to see if the chunk is already mapped. + */ + for (uint32_t i = 0; i < pChunk->cMappingsX; i++) + { + Assert(pChunk->paMappingsX[i].pGVM && pChunk->paMappingsX[i].hMapObj != NIL_RTR0MEMOBJ); + if (pChunk->paMappingsX[i].pGVM == pGVM) + { + *ppvR3 = RTR0MemObjAddressR3(pChunk->paMappingsX[i].hMapObj); + Log(("gmmR0MapChunk: chunk %#x is already mapped at %p!\n", pChunk->Core.Key, *ppvR3)); +#ifdef VBOX_WITH_PAGE_SHARING + /* The ring-3 chunk cache can be out of sync; don't fail. */ + return VINF_SUCCESS; +#else + return VERR_GMM_CHUNK_ALREADY_MAPPED; +#endif + } + } + + /* + * Do the mapping. + */ + RTR0MEMOBJ hMapObj; + int rc = RTR0MemObjMapUser(&hMapObj, pChunk->hMemObj, (RTR3PTR)-1, 0, RTMEM_PROT_READ | RTMEM_PROT_WRITE, NIL_RTR0PROCESS); + if (RT_SUCCESS(rc)) + { + /* reallocate the array? assumes few users per chunk (usually one). */ + /* The array grows one entry at a time up to four entries and in steps of four after that. */ + unsigned iMapping = pChunk->cMappingsX; + if ( iMapping <= 3 + || (iMapping & 3) == 0) + { + unsigned cNewSize = iMapping <= 3 + ? 
iMapping + 1 + : iMapping + 4; + Assert(cNewSize < 4 || RT_ALIGN_32(cNewSize, 4) == cNewSize); + if (RT_UNLIKELY(cNewSize > UINT16_MAX)) + { + rc = RTR0MemObjFree(hMapObj, false /* fFreeMappings (NA) */); AssertRC(rc); /* undo the mapping made above before bailing out */ + return VERR_GMM_TOO_MANY_CHUNK_MAPPINGS; + } + + void *pvMappings = RTMemRealloc(pChunk->paMappingsX, cNewSize * sizeof(pChunk->paMappingsX[0])); /* result goes to a temporary so the old array pointer stays intact on failure */ + if (RT_UNLIKELY(!pvMappings)) + { + rc = RTR0MemObjFree(hMapObj, false /* fFreeMappings (NA) */); AssertRC(rc); /* undo the mapping made above before bailing out */ + return VERR_NO_MEMORY; + } + pChunk->paMappingsX = (PGMMCHUNKMAP)pvMappings; + } + + /* insert new entry */ + pChunk->paMappingsX[iMapping].hMapObj = hMapObj; + pChunk->paMappingsX[iMapping].pGVM = pGVM; + Assert(pChunk->cMappingsX == iMapping); + pChunk->cMappingsX = iMapping + 1; + + *ppvR3 = RTR0MemObjAddressR3(hMapObj); + } + + return rc; +} + + +/** + * Maps a chunk into the user address space of the current process. + * + * This only deals with the locking; the actual work is done by + * gmmR0MapChunkLocked while holding the chunk mutex. + * + * @returns VBox status code. + * @param pGMM Pointer to the GMM instance data. + * @param pGVM Pointer to the Global VM structure. + * @param pChunk Pointer to the chunk to be mapped. + * @param fRelaxedSem Whether we can release the semaphore while doing the + * mapping (@c true) or not. + * @param ppvR3 Where to store the ring-3 address of the mapping. + * In the VERR_GMM_CHUNK_ALREADY_MAPPED case, this will + * contain the address of the existing mapping. + */ +static int gmmR0MapChunk(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk, bool fRelaxedSem, PRTR3PTR ppvR3) +{ + /* + * Take the chunk lock and leave the giant GMM lock when possible, then + * call the worker function. + */ + GMMR0CHUNKMTXSTATE MtxState; + int rc = gmmR0ChunkMutexAcquire(&MtxState, pGMM, pChunk, + fRelaxedSem ? 
GMMR0CHUNK_MTX_RETAKE_GIANT : GMMR0CHUNK_MTX_KEEP_GIANT); + if (RT_SUCCESS(rc)) + { + rc = gmmR0MapChunkLocked(pGMM, pGVM, pChunk, ppvR3); + gmmR0ChunkMutexRelease(&MtxState, pChunk); + } + + return rc; +} + + + +#if defined(VBOX_WITH_PAGE_SHARING) || (defined(VBOX_STRICT) && HC_ARCH_BITS == 64) +/** + * Check if a chunk is mapped into the specified VM + * + * Walks the chunk's mapping array under the chunk mutex, looking for an + * entry belonging to the given VM. + * + * @returns mapped yes/no + * @param pGMM Pointer to the GMM instance. + * @param pGVM Pointer to the Global VM structure. + * @param pChunk Pointer to the chunk to check. + * @param ppvR3 Where to store the ring-3 address of the mapping. + * Set to NULL if the chunk is not mapped into the VM. + */ +static bool gmmR0IsChunkMapped(PGMM pGMM, PGVM pGVM, PGMMCHUNK pChunk, PRTR3PTR ppvR3) +{ + GMMR0CHUNKMTXSTATE MtxState; + gmmR0ChunkMutexAcquire(&MtxState, pGMM, pChunk, GMMR0CHUNK_MTX_KEEP_GIANT); + for (uint32_t i = 0; i < pChunk->cMappingsX; i++) + { + Assert(pChunk->paMappingsX[i].pGVM && pChunk->paMappingsX[i].hMapObj != NIL_RTR0MEMOBJ); + if (pChunk->paMappingsX[i].pGVM == pGVM) + { + *ppvR3 = RTR0MemObjAddressR3(pChunk->paMappingsX[i].hMapObj); + gmmR0ChunkMutexRelease(&MtxState, pChunk); + return true; + } + } + *ppvR3 = NULL; /* no mapping record for this VM */ + gmmR0ChunkMutexRelease(&MtxState, pChunk); + return false; +} +#endif /* VBOX_WITH_PAGE_SHARING || (VBOX_STRICT && 64-BIT) */ + + +/** + * Map a chunk and/or unmap another chunk. + * + * The mapping and unmapping applies to the current process. + * + * This API does two things because it saves a kernel call per mapping when + * when the ring-3 mapping cache is full. + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idChunkMap The chunk to map. NIL_GMM_CHUNKID if nothing to map. + * @param idChunkUnmap The chunk to unmap. NIL_GMM_CHUNKID if nothing to unmap. + * @param ppvR3 Where to store the address of the mapped chunk. NULL is ok if nothing to map. + * @thread EMT ??? 
+ */ +GMMR0DECL(int) GMMR0MapUnmapChunk(PGVM pGVM, PVM pVM, uint32_t idChunkMap, uint32_t idChunkUnmap, PRTR3PTR ppvR3) +{ + LogFlow(("GMMR0MapUnmapChunk: pGVM=%p pVM=%p idChunkMap=%#x idChunkUnmap=%#x ppvR3=%p\n", + pGVM, pVM, idChunkMap, idChunkUnmap, ppvR3)); + + /* + * Validate input and get the basics. + */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVM(pGVM, pVM); + if (RT_FAILURE(rc)) + return rc; + + AssertCompile(NIL_GMM_CHUNKID == 0); + AssertMsgReturn(idChunkMap <= GMM_CHUNKID_LAST, ("%#x\n", idChunkMap), VERR_INVALID_PARAMETER); + AssertMsgReturn(idChunkUnmap <= GMM_CHUNKID_LAST, ("%#x\n", idChunkUnmap), VERR_INVALID_PARAMETER); + + /* At least one of the two operations must be requested. */ + if ( idChunkMap == NIL_GMM_CHUNKID + && idChunkUnmap == NIL_GMM_CHUNKID) + return VERR_INVALID_PARAMETER; + + /* The return address is only required (and initialized) when there is something to map. */ + if (idChunkMap != NIL_GMM_CHUNKID) + { + AssertPtrReturn(ppvR3, VERR_INVALID_POINTER); + *ppvR3 = NIL_RTR3PTR; + } + + /* + * Take the semaphore and do the work. + * + * The unmapping is done last since it's easier to undo a mapping than + * undoing an unmapping. The ring-3 mapping cache cannot be so big + * that it pushes the user virtual address space to within a chunk of + * its limits, so, no problem here. + */ + gmmR0MutexAcquire(pGMM); + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + PGMMCHUNK pMap = NULL; + if (idChunkMap != NIL_GMM_CHUNKID) + { + pMap = gmmR0GetChunk(pGMM, idChunkMap); + if (RT_LIKELY(pMap)) + rc = gmmR0MapChunk(pGMM, pGVM, pMap, true /*fRelaxedSem*/, ppvR3); + else + { + Log(("GMMR0MapUnmapChunk: idChunkMap=%#x\n", idChunkMap)); + rc = VERR_GMM_CHUNK_NOT_FOUND; + } + } +/** @todo split this operation, the bail out might (theoretically) not be + * entirely safe. 
*/ + + if ( idChunkUnmap != NIL_GMM_CHUNKID + && RT_SUCCESS(rc)) + { + PGMMCHUNK pUnmap = gmmR0GetChunk(pGMM, idChunkUnmap); + if (RT_LIKELY(pUnmap)) + rc = gmmR0UnmapChunk(pGMM, pGVM, pUnmap, true /*fRelaxedSem*/); + else + { + Log(("GMMR0MapUnmapChunk: idChunkUnmap=%#x\n", idChunkUnmap)); + rc = VERR_GMM_CHUNK_NOT_FOUND; + } + + if (RT_FAILURE(rc) && pMap) /* the unmap part failed: undo the mapping made above (its status is ignored) */ + gmmR0UnmapChunk(pGMM, pGVM, pMap, false /*fRelaxedSem*/); + } + + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + } + else + rc = VERR_GMM_IS_NOT_SANE; + gmmR0MutexRelease(pGMM); + + LogFlow(("GMMR0MapUnmapChunk: returns %Rrc\n", rc)); + return rc; +} + + +/** + * VMMR0 request wrapper for GMMR0MapUnmapChunk. + * + * Checks the request packet (pointer and exact size) and passes the two + * chunk IDs on; the ring-3 address of the mapping is returned in the packet. + * + * @returns see GMMR0MapUnmapChunk. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param pReq Pointer to the request packet. + */ +GMMR0DECL(int) GMMR0MapUnmapChunkReq(PGVM pGVM, PVM pVM, PGMMMAPUNMAPCHUNKREQ pReq) +{ + /* + * Validate input and pass it on. + */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER); + + return GMMR0MapUnmapChunk(pGVM, pVM, pReq->idChunkMap, pReq->idChunkUnmap, &pReq->pvR3); +} + + +/** + * Legacy mode API for supplying pages. + * + * The specified user address points to an allocation chunk sized block that + * will be locked down and used by the GMM when the GM asks for pages. + * + * @returns VBox status code. + * @retval VERR_INVALID_POINTER if pvR3 is invalid or not page aligned. + * @retval VERR_NOT_SUPPORTED if the GMM is not in legacy allocation mode. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param pvR3 Pointer to the chunk size memory block to lock down. + */ +GMMR0DECL(int) GMMR0SeedChunk(PGVM pGVM, PVM pVM, VMCPUID idCpu, RTR3PTR pvR3) +{ + /* + * Validate input and get the basics. 
+ */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + + AssertPtrReturn(pvR3, VERR_INVALID_POINTER); + AssertReturn(!(PAGE_OFFSET_MASK & pvR3), VERR_INVALID_POINTER); /* the block must start on a page boundary */ + + if (!pGMM->fLegacyAllocationMode) + { + Log(("GMMR0SeedChunk: not in legacy allocation mode!\n")); + return VERR_NOT_SUPPORTED; + } + + /* + * Lock the memory and add it as new chunk with our hGVM. + * (The GMM locking is done inside gmmR0RegisterChunk.) + * + * NOTE(review): the mutex is released here only when the registration + * succeeds, so gmmR0RegisterChunk presumably returns with the GMM mutex + * held on success and not held on failure - confirm against its + * definition. On failure the locked memory object is freed again. + */ + RTR0MEMOBJ MemObj; + rc = RTR0MemObjLockUser(&MemObj, pvR3, GMM_CHUNK_SIZE, RTMEM_PROT_READ | RTMEM_PROT_WRITE, NIL_RTR0PROCESS); + if (RT_SUCCESS(rc)) + { + rc = gmmR0RegisterChunk(pGMM, &pGVM->gmm.s.Private, MemObj, pGVM->hSelf, 0 /*fChunkFlags*/, NULL); + if (RT_SUCCESS(rc)) + gmmR0MutexRelease(pGMM); + else + RTR0MemObjFree(MemObj, false /* fFreeMappings */); + } + + LogFlow(("GMMR0SeedChunk: rc=%d (pvR3=%p)\n", rc, pvR3)); + return rc; +} + +#ifdef VBOX_WITH_PAGE_SHARING + +# ifdef VBOX_STRICT +/** + * For checksumming shared pages in strict builds. + * + * The purpose is making sure that a page doesn't change. + * + * The page is read through the mapping the given VM has of the chunk, so + * this only works if the chunk is mapped into that VM. + * + * @returns Checksum (CRC-32 of the page), 0 on failure, i.e. when the chunk + * cannot be found or is not mapped into the VM. + * @param pGMM The GMM instance data. + * @param pGVM Pointer to the kernel-only VM instance data. + * @param idPage The page ID. + */ +static uint32_t gmmR0StrictPageChecksum(PGMM pGMM, PGVM pGVM, uint32_t idPage) +{ + PGMMCHUNK pChunk = gmmR0GetChunk(pGMM, idPage >> GMM_CHUNKID_SHIFT); + AssertMsgReturn(pChunk, ("idPage=%#x\n", idPage), 0); + + uint8_t *pbChunk; + if (!gmmR0IsChunkMapped(pGMM, pGVM, pChunk, (PRTR3PTR)&pbChunk)) + return 0; + uint8_t const *pbPage = pbChunk + ((idPage & GMM_PAGEID_IDX_MASK) << PAGE_SHIFT); /* byte offset of the page within the chunk mapping */ + + return RTCrc32(pbPage, PAGE_SIZE); +} +# endif /* VBOX_STRICT */ + + +/** + * Calculates the module hash value. + * + * @returns Hash value. + * @param pszModuleName The module name. + * @param pszVersion The module version string. 
+ */ +static uint32_t gmmR0ShModCalcHash(const char *pszModuleName, const char *pszVersion) +{ + return RTStrHash1ExN(3, pszModuleName, RTSTR_MAX, "::", (size_t)2, pszVersion, RTSTR_MAX); /* three pieces are hashed: name, a "::" separator and version */ +} + + +/** + * Finds a global module. + * + * The hash only selects a list of candidates. A candidate matches when its + * size, guest OS type, region count, name and version are all equal and each + * of its regions has the same page offset and page aligned size as the + * corresponding region description. + * + * @returns Pointer to the global module on success, NULL if not found. + * @param pGMM The GMM instance data. + * @param uHash The hash as calculated by gmmR0ShModCalcHash. + * @param cbModule The module size. + * @param enmGuestOS The guest OS type. + * @param cRegions The number of regions. + * @param pszModuleName The module name. + * @param pszVersion The module version. + * @param paRegions The region descriptions. + */ +static PGMMSHAREDMODULE gmmR0ShModFindGlobal(PGMM pGMM, uint32_t uHash, uint32_t cbModule, VBOXOSFAMILY enmGuestOS, + uint32_t cRegions, const char *pszModuleName, const char *pszVersion, + struct VMMDEVSHAREDREGIONDESC const *paRegions) +{ + for (PGMMSHAREDMODULE pGblMod = (PGMMSHAREDMODULE)RTAvllU32Get(&pGMM->pGlobalSharedModuleTree, uHash); + pGblMod; + pGblMod = (PGMMSHAREDMODULE)pGblMod->Core.pList) + { + if (pGblMod->cbModule != cbModule) + continue; + if (pGblMod->enmGuestOS != enmGuestOS) + continue; + if (pGblMod->cRegions != cRegions) + continue; + if (strcmp(pGblMod->szName, pszModuleName)) + continue; + if (strcmp(pGblMod->szVersion, pszVersion)) + continue; + + uint32_t i; + for (i = 0; i < cRegions; i++) /* leaves the loop early on the first region that differs */ + { + uint32_t off = paRegions[i].GCRegionAddr & PAGE_OFFSET_MASK; + if (pGblMod->aRegions[i].off != off) + break; + + uint32_t cb = RT_ALIGN_32(paRegions[i].cbRegion + off, PAGE_SIZE); + if (pGblMod->aRegions[i].cb != cb) + break; + } + + if (i == cRegions) /* all regions matched */ + return pGblMod; + } + + return NULL; +} + + +/** + * Creates a new global module. + * + * @returns VBox status code. + * @param pGMM The GMM instance data. + * @param uHash The hash as calculated by gmmR0ShModCalcHash. + * @param cbModule The module size. + * @param enmGuestOS The guest OS type. + * @param cRegions The number of regions. 
+ * @param pszModuleName The module name. + * @param pszVersion The module version. + * @param paRegions The region descriptions. + * @param ppGblMod Where to return the new module on success. + */ +static int gmmR0ShModNewGlobal(PGMM pGMM, uint32_t uHash, uint32_t cbModule, VBOXOSFAMILY enmGuestOS, + uint32_t cRegions, const char *pszModuleName, const char *pszVersion, + struct VMMDEVSHAREDREGIONDESC const *paRegions, PGMMSHAREDMODULE *ppGblMod) +{ + Log(("gmmR0ShModNewGlobal: %s %s size %#x os %u rgn %u\n", pszModuleName, pszVersion, cbModule, enmGuestOS, cRegions)); + if (pGMM->cShareableModules >= GMM_MAX_SHARED_GLOBAL_MODULES) + { + Log(("gmmR0ShModNewGlobal: Too many modules\n")); + return VERR_GMM_TOO_MANY_GLOBAL_MODULES; + } + + PGMMSHAREDMODULE pGblMod = (PGMMSHAREDMODULE)RTMemAllocZ(RT_UOFFSETOF_DYN(GMMSHAREDMODULE, aRegions[cRegions])); + if (!pGblMod) + { + Log(("gmmR0ShModNewGlobal: No memory\n")); + return VERR_NO_MEMORY; + } + + pGblMod->Core.Key = uHash; + pGblMod->cbModule = cbModule; + pGblMod->cRegions = cRegions; + pGblMod->cUsers = 1; + pGblMod->enmGuestOS = enmGuestOS; + strcpy(pGblMod->szName, pszModuleName); + strcpy(pGblMod->szVersion, pszVersion); + + for (uint32_t i = 0; i < cRegions; i++) + { + Log(("gmmR0ShModNewGlobal: rgn[%u]=%RGvLB%#x\n", i, paRegions[i].GCRegionAddr, paRegions[i].cbRegion)); + pGblMod->aRegions[i].off = paRegions[i].GCRegionAddr & PAGE_OFFSET_MASK; + pGblMod->aRegions[i].cb = paRegions[i].cbRegion + pGblMod->aRegions[i].off; + pGblMod->aRegions[i].cb = RT_ALIGN_32(pGblMod->aRegions[i].cb, PAGE_SIZE); + pGblMod->aRegions[i].paidPages = NULL; /* allocated when needed. */ + } + + bool fInsert = RTAvllU32Insert(&pGMM->pGlobalSharedModuleTree, &pGblMod->Core); + Assert(fInsert); NOREF(fInsert); + pGMM->cShareableModules++; + + *ppGblMod = pGblMod; + return VINF_SUCCESS; +} + + +/** + * Deletes a global module which is no longer referenced by anyone. + * + * @param pGMM The GMM instance data. 
+ * @param pGblMod The module to delete. + */ +static void gmmR0ShModDeleteGlobal(PGMM pGMM, PGMMSHAREDMODULE pGblMod) +{ + Assert(pGblMod->cUsers == 0); + Assert(pGMM->cShareableModules > 0 && pGMM->cShareableModules <= GMM_MAX_SHARED_GLOBAL_MODULES); + + void *pvTest = RTAvllU32RemoveNode(&pGMM->pGlobalSharedModuleTree, &pGblMod->Core); + Assert(pvTest == pGblMod); NOREF(pvTest); + pGMM->cShareableModules--; + + uint32_t i = pGblMod->cRegions; + while (i-- > 0) + { + if (pGblMod->aRegions[i].paidPages) + { + /* We don't doing anything to the pages as they are handled by the + copy-on-write mechanism in PGM. */ + RTMemFree(pGblMod->aRegions[i].paidPages); + pGblMod->aRegions[i].paidPages = NULL; + } + } + RTMemFree(pGblMod); +} + + +static int gmmR0ShModNewPerVM(PGVM pGVM, RTGCPTR GCBaseAddr, uint32_t cRegions, const VMMDEVSHAREDREGIONDESC *paRegions, + PGMMSHAREDMODULEPERVM *ppRecVM) +{ + if (pGVM->gmm.s.Stats.cShareableModules >= GMM_MAX_SHARED_PER_VM_MODULES) + return VERR_GMM_TOO_MANY_PER_VM_MODULES; + + PGMMSHAREDMODULEPERVM pRecVM; + pRecVM = (PGMMSHAREDMODULEPERVM)RTMemAllocZ(RT_UOFFSETOF_DYN(GMMSHAREDMODULEPERVM, aRegionsGCPtrs[cRegions])); + if (!pRecVM) + return VERR_NO_MEMORY; + + pRecVM->Core.Key = GCBaseAddr; + for (uint32_t i = 0; i < cRegions; i++) + pRecVM->aRegionsGCPtrs[i] = paRegions[i].GCRegionAddr; + + bool fInsert = RTAvlGCPtrInsert(&pGVM->gmm.s.pSharedModuleTree, &pRecVM->Core); + Assert(fInsert); NOREF(fInsert); + pGVM->gmm.s.Stats.cShareableModules++; + + *ppRecVM = pRecVM; + return VINF_SUCCESS; +} + + +static void gmmR0ShModDeletePerVM(PGMM pGMM, PGVM pGVM, PGMMSHAREDMODULEPERVM pRecVM, bool fRemove) +{ + /* + * Free the per-VM module. 
+ */ + PGMMSHAREDMODULE pGblMod = pRecVM->pGlobalModule; + pRecVM->pGlobalModule = NULL; + + if (fRemove) + { + void *pvTest = RTAvlGCPtrRemove(&pGVM->gmm.s.pSharedModuleTree, pRecVM->Core.Key); + Assert(pvTest == &pRecVM->Core); NOREF(pvTest); + } + + RTMemFree(pRecVM); + + /* + * Release the global module. + * (In the registration bailout case, it might not be.) + */ + if (pGblMod) + { + Assert(pGblMod->cUsers > 0); + pGblMod->cUsers--; + if (pGblMod->cUsers == 0) + gmmR0ShModDeleteGlobal(pGMM, pGblMod); + } +} + +#endif /* VBOX_WITH_PAGE_SHARING */ + +/** + * Registers a new shared module for the VM. + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param enmGuestOS The guest OS type. + * @param pszModuleName The module name. + * @param pszVersion The module version. + * @param GCPtrModBase The module base address. + * @param cbModule The module size. + * @param cRegions The mumber of shared region descriptors. + * @param paRegions Pointer to an array of shared region(s). + * @thread EMT(idCpu) + */ +GMMR0DECL(int) GMMR0RegisterSharedModule(PGVM pGVM, PVM pVM, VMCPUID idCpu, VBOXOSFAMILY enmGuestOS, char *pszModuleName, + char *pszVersion, RTGCPTR GCPtrModBase, uint32_t cbModule, + uint32_t cRegions, struct VMMDEVSHAREDREGIONDESC const *paRegions) +{ +#ifdef VBOX_WITH_PAGE_SHARING + /* + * Validate input and get the basics. + * + * Note! Turns out the module size does necessarily match the size of the + * regions. 
(iTunes on XP) + */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + + if (RT_UNLIKELY(cRegions > VMMDEVSHAREDREGIONDESC_MAX)) + return VERR_GMM_TOO_MANY_REGIONS; + + if (RT_UNLIKELY(cbModule == 0 || cbModule > _1G)) + return VERR_GMM_BAD_SHARED_MODULE_SIZE; + + uint32_t cbTotal = 0; + for (uint32_t i = 0; i < cRegions; i++) + { + if (RT_UNLIKELY(paRegions[i].cbRegion == 0 || paRegions[i].cbRegion > _1G)) + return VERR_GMM_SHARED_MODULE_BAD_REGIONS_SIZE; + + cbTotal += paRegions[i].cbRegion; + if (RT_UNLIKELY(cbTotal > _1G)) + return VERR_GMM_SHARED_MODULE_BAD_REGIONS_SIZE; + } + + AssertPtrReturn(pszModuleName, VERR_INVALID_POINTER); + if (RT_UNLIKELY(!memchr(pszModuleName, '\0', GMM_SHARED_MODULE_MAX_NAME_STRING))) + return VERR_GMM_MODULE_NAME_TOO_LONG; + + AssertPtrReturn(pszVersion, VERR_INVALID_POINTER); + if (RT_UNLIKELY(!memchr(pszVersion, '\0', GMM_SHARED_MODULE_MAX_VERSION_STRING))) + return VERR_GMM_MODULE_NAME_TOO_LONG; + + uint32_t const uHash = gmmR0ShModCalcHash(pszModuleName, pszVersion); + Log(("GMMR0RegisterSharedModule %s %s base %RGv size %x hash %x\n", pszModuleName, pszVersion, GCPtrModBase, cbModule, uHash)); + + /* + * Take the semaphore and do some more validations. + */ + gmmR0MutexAcquire(pGMM); + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + /* + * Check if this module is already locally registered and register + * it if it isn't. The base address is a unique module identifier + * locally. + */ + PGMMSHAREDMODULEPERVM pRecVM = (PGMMSHAREDMODULEPERVM)RTAvlGCPtrGet(&pGVM->gmm.s.pSharedModuleTree, GCPtrModBase); + bool fNewModule = pRecVM == NULL; + if (fNewModule) + { + rc = gmmR0ShModNewPerVM(pGVM, GCPtrModBase, cRegions, paRegions, &pRecVM); + if (RT_SUCCESS(rc)) + { + /* + * Find a matching global module, register a new one if needed. 
+ */ + PGMMSHAREDMODULE pGblMod = gmmR0ShModFindGlobal(pGMM, uHash, cbModule, enmGuestOS, cRegions, + pszModuleName, pszVersion, paRegions); + if (!pGblMod) + { + Assert(fNewModule); + rc = gmmR0ShModNewGlobal(pGMM, uHash, cbModule, enmGuestOS, cRegions, + pszModuleName, pszVersion, paRegions, &pGblMod); + if (RT_SUCCESS(rc)) + { + pRecVM->pGlobalModule = pGblMod; /* (One referenced returned by gmmR0ShModNewGlobal.) */ + Log(("GMMR0RegisterSharedModule: new module %s %s\n", pszModuleName, pszVersion)); + } + else + gmmR0ShModDeletePerVM(pGMM, pGVM, pRecVM, true /*fRemove*/); + } + else + { + Assert(pGblMod->cUsers > 0 && pGblMod->cUsers < UINT32_MAX / 2); + pGblMod->cUsers++; + pRecVM->pGlobalModule = pGblMod; + + Log(("GMMR0RegisterSharedModule: new per vm module %s %s, gbl users %d\n", pszModuleName, pszVersion, pGblMod->cUsers)); + } + } + } + else + { + /* + * Attempt to re-register an existing module. + */ + PGMMSHAREDMODULE pGblMod = gmmR0ShModFindGlobal(pGMM, uHash, cbModule, enmGuestOS, cRegions, + pszModuleName, pszVersion, paRegions); + if (pRecVM->pGlobalModule == pGblMod) + { + Log(("GMMR0RegisterSharedModule: already registered %s %s, gbl users %d\n", pszModuleName, pszVersion, pGblMod->cUsers)); + rc = VINF_GMM_SHARED_MODULE_ALREADY_REGISTERED; + } + else + { + /** @todo may have to unregister+register when this happens in case it's caused + * by VBoxService crashing and being restarted... 
*/ + Log(("GMMR0RegisterSharedModule: Address clash!\n" + " incoming at %RGvLB%#x %s %s rgns %u\n" + " existing at %RGvLB%#x %s %s rgns %u\n", + GCPtrModBase, cbModule, pszModuleName, pszVersion, cRegions, + pRecVM->Core.Key, pRecVM->pGlobalModule->cbModule, pRecVM->pGlobalModule->szName, + pRecVM->pGlobalModule->szVersion, pRecVM->pGlobalModule->cRegions)); + rc = VERR_GMM_SHARED_MODULE_ADDRESS_CLASH; + } + } + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + } + else + rc = VERR_GMM_IS_NOT_SANE; + + gmmR0MutexRelease(pGMM); + return rc; +#else + + NOREF(pGVM); NOREF(pVM); NOREF(idCpu); NOREF(enmGuestOS); NOREF(pszModuleName); NOREF(pszVersion); + NOREF(GCPtrModBase); NOREF(cbModule); NOREF(cRegions); NOREF(paRegions); + return VERR_NOT_IMPLEMENTED; +#endif +} + + +/** + * VMMR0 request wrapper for GMMR0RegisterSharedModule. + * + * @returns see GMMR0RegisterSharedModule. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param pReq Pointer to the request packet. + */ +GMMR0DECL(int) GMMR0RegisterSharedModuleReq(PGVM pGVM, PVM pVM, VMCPUID idCpu, PGMMREGISTERSHAREDMODULEREQ pReq) +{ + /* + * Validate input and pass it on. + */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn( pReq->Hdr.cbReq >= sizeof(*pReq) + && pReq->Hdr.cbReq == RT_UOFFSETOF_DYN(GMMREGISTERSHAREDMODULEREQ, aRegions[pReq->cRegions]), + ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER); + + /* Pass back return code in the request packet to preserve informational codes. (VMMR3CallR0 chokes on them) */ + pReq->rc = GMMR0RegisterSharedModule(pGVM, pVM, idCpu, pReq->enmGuestOS, pReq->szName, pReq->szVersion, + pReq->GCBaseAddr, pReq->cbModule, pReq->cRegions, pReq->aRegions); + return VINF_SUCCESS; +} + + +/** + * Unregisters a shared module for the VM + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. 
+ * @param idCpu The VCPU id. + * @param pszModuleName The module name. + * @param pszVersion The module version. + * @param GCPtrModBase The module base address. + * @param cbModule The module size. + */ +GMMR0DECL(int) GMMR0UnregisterSharedModule(PGVM pGVM, PVM pVM, VMCPUID idCpu, char *pszModuleName, char *pszVersion, + RTGCPTR GCPtrModBase, uint32_t cbModule) +{ +#ifdef VBOX_WITH_PAGE_SHARING + /* + * Validate input and get the basics. + */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + + AssertPtrReturn(pszModuleName, VERR_INVALID_POINTER); + AssertPtrReturn(pszVersion, VERR_INVALID_POINTER); + if (RT_UNLIKELY(!memchr(pszModuleName, '\0', GMM_SHARED_MODULE_MAX_NAME_STRING))) + return VERR_GMM_MODULE_NAME_TOO_LONG; + if (RT_UNLIKELY(!memchr(pszVersion, '\0', GMM_SHARED_MODULE_MAX_VERSION_STRING))) + return VERR_GMM_MODULE_NAME_TOO_LONG; + + Log(("GMMR0UnregisterSharedModule %s %s base=%RGv size %x\n", pszModuleName, pszVersion, GCPtrModBase, cbModule)); + + /* + * Take the semaphore and do some more validations. + */ + gmmR0MutexAcquire(pGMM); + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + /* + * Locate and remove the specified module. + */ + PGMMSHAREDMODULEPERVM pRecVM = (PGMMSHAREDMODULEPERVM)RTAvlGCPtrGet(&pGVM->gmm.s.pSharedModuleTree, GCPtrModBase); + if (pRecVM) + { + /** @todo Do we need to do more validations here, like that the + * name + version + cbModule matches? 
*/ + NOREF(cbModule); + Assert(pRecVM->pGlobalModule); + gmmR0ShModDeletePerVM(pGMM, pGVM, pRecVM, true /*fRemove*/); + } + else + rc = VERR_GMM_SHARED_MODULE_NOT_FOUND; + + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + } + else + rc = VERR_GMM_IS_NOT_SANE; + + gmmR0MutexRelease(pGMM); + return rc; +#else + + NOREF(pGVM); NOREF(pVM); NOREF(idCpu); NOREF(pszModuleName); NOREF(pszVersion); NOREF(GCPtrModBase); NOREF(cbModule); + return VERR_NOT_IMPLEMENTED; +#endif +} + + +/** + * VMMR0 request wrapper for GMMR0UnregisterSharedModule. + * + * @returns see GMMR0UnregisterSharedModule. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. + * @param pReq Pointer to the request packet. + */ +GMMR0DECL(int) GMMR0UnregisterSharedModuleReq(PGVM pGVM, PVM pVM, VMCPUID idCpu, PGMMUNREGISTERSHAREDMODULEREQ pReq) +{ + /* + * Validate input and pass it on. + */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER); + + return GMMR0UnregisterSharedModule(pGVM, pVM, idCpu, pReq->szName, pReq->szVersion, pReq->GCBaseAddr, pReq->cbModule); +} + +#ifdef VBOX_WITH_PAGE_SHARING + +/** + * Increase the use count of a shared page, the page is known to exist and be valid and such. + * + * @param pGMM Pointer to the GMM instance. + * @param pGVM Pointer to the GVM instance. + * @param pPage The page structure. + */ +DECLINLINE(void) gmmR0UseSharedPage(PGMM pGMM, PGVM pGVM, PGMMPAGE pPage) +{ + Assert(pGMM->cSharedPages > 0); + Assert(pGMM->cAllocatedPages > 0); + + pGMM->cDuplicatePages++; + + pPage->Shared.cRefs++; + pGVM->gmm.s.Stats.cSharedPages++; + pGVM->gmm.s.Stats.Allocated.cBasePages++; +} + + +/** + * Converts a private page to a shared page, the page is known to exist and be valid and such. + * + * @param pGMM Pointer to the GMM instance. + * @param pGVM Pointer to the GVM instance. 
+ * @param HCPhys Host physical address + * @param idPage The Page ID + * @param pPage The page structure. + * @param pPageDesc Shared page descriptor + */ +DECLINLINE(void) gmmR0ConvertToSharedPage(PGMM pGMM, PGVM pGVM, RTHCPHYS HCPhys, uint32_t idPage, PGMMPAGE pPage, + PGMMSHAREDPAGEDESC pPageDesc) +{ + PGMMCHUNK pChunk = gmmR0GetChunk(pGMM, idPage >> GMM_CHUNKID_SHIFT); + Assert(pChunk); + Assert(pChunk->cFree < GMM_CHUNK_NUM_PAGES); + Assert(GMM_PAGE_IS_PRIVATE(pPage)); + + pChunk->cPrivate--; + pChunk->cShared++; + + pGMM->cSharedPages++; + + pGVM->gmm.s.Stats.cSharedPages++; + pGVM->gmm.s.Stats.cPrivatePages--; + + /* Modify the page structure. */ + pPage->Shared.pfn = (uint32_t)(uint64_t)(HCPhys >> PAGE_SHIFT); + pPage->Shared.cRefs = 1; +#ifdef VBOX_STRICT + pPageDesc->u32StrictChecksum = gmmR0StrictPageChecksum(pGMM, pGVM, idPage); + pPage->Shared.u14Checksum = pPageDesc->u32StrictChecksum; +#else + NOREF(pPageDesc); + pPage->Shared.u14Checksum = 0; +#endif + pPage->Shared.u2State = GMM_PAGE_STATE_SHARED; +} + + +static int gmmR0SharedModuleCheckPageFirstTime(PGMM pGMM, PGVM pGVM, PGMMSHAREDMODULE pModule, + unsigned idxRegion, unsigned idxPage, + PGMMSHAREDPAGEDESC pPageDesc, PGMMSHAREDREGIONDESC pGlobalRegion) +{ + NOREF(pModule); + + /* Easy case: just change the internal page type. */ + PGMMPAGE pPage = gmmR0GetPage(pGMM, pPageDesc->idPage); + AssertMsgReturn(pPage, ("idPage=%#x (GCPhys=%RGp HCPhys=%RHp idxRegion=%#x idxPage=%#x) #1\n", + pPageDesc->idPage, pPageDesc->GCPhys, pPageDesc->HCPhys, idxRegion, idxPage), + VERR_PGM_PHYS_INVALID_PAGE_ID); + NOREF(idxRegion); + + AssertMsg(pPageDesc->GCPhys == (pPage->Private.pfn << 12), ("desc %RGp gmm %RGp\n", pPageDesc->HCPhys, (pPage->Private.pfn << 12))); + + gmmR0ConvertToSharedPage(pGMM, pGVM, pPageDesc->HCPhys, pPageDesc->idPage, pPage, pPageDesc); + + /* Keep track of these references. 
*/ + pGlobalRegion->paidPages[idxPage] = pPageDesc->idPage; + + return VINF_SUCCESS; +} + +/** + * Checks specified shared module range for changes + * + * Performs the following tasks: + * - If a shared page is new, then it changes the GMM page type to shared and + * returns it in the pPageDesc descriptor. + * - If a shared page already exists, then it checks if the VM page is + * identical and if so frees the VM page and returns the shared page in + * pPageDesc descriptor. + * + * @remarks ASSUMES the caller has acquired the GMM semaphore!! + * + * @returns VBox status code. + * @param pGVM Pointer to the GVM instance data. + * @param pModule Module description + * @param idxRegion Region index + * @param idxPage Page index + * @param pPageDesc Page descriptor + */ +GMMR0DECL(int) GMMR0SharedModuleCheckPage(PGVM pGVM, PGMMSHAREDMODULE pModule, uint32_t idxRegion, uint32_t idxPage, + PGMMSHAREDPAGEDESC pPageDesc) +{ + int rc; + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + pPageDesc->u32StrictChecksum = 0; + + AssertMsgReturn(idxRegion < pModule->cRegions, + ("idxRegion=%#x cRegions=%#x %s %s\n", idxRegion, pModule->cRegions, pModule->szName, pModule->szVersion), + VERR_INVALID_PARAMETER); + + uint32_t const cPages = pModule->aRegions[idxRegion].cb >> PAGE_SHIFT; + AssertMsgReturn(idxPage < cPages, + ("idxRegion=%#x cRegions=%#x %s %s\n", idxRegion, pModule->cRegions, pModule->szName, pModule->szVersion), + VERR_INVALID_PARAMETER); + + LogFlow(("GMMR0SharedModuleCheckRange %s base %RGv region %d idxPage %d\n", pModule->szName, pModule->Core.Key, idxRegion, idxPage)); + + /* + * First time; create a page descriptor array. 
+ */ + PGMMSHAREDREGIONDESC pGlobalRegion = &pModule->aRegions[idxRegion]; + if (!pGlobalRegion->paidPages) + { + Log(("Allocate page descriptor array for %d pages\n", cPages)); + pGlobalRegion->paidPages = (uint32_t *)RTMemAlloc(cPages * sizeof(pGlobalRegion->paidPages[0])); + AssertReturn(pGlobalRegion->paidPages, VERR_NO_MEMORY); + + /* Invalidate all descriptors. */ + uint32_t i = cPages; + while (i-- > 0) + pGlobalRegion->paidPages[i] = NIL_GMM_PAGEID; + } + + /* + * We've seen this shared page for the first time? + */ + if (pGlobalRegion->paidPages[idxPage] == NIL_GMM_PAGEID) + { + Log(("New shared page guest %RGp host %RHp\n", pPageDesc->GCPhys, pPageDesc->HCPhys)); + return gmmR0SharedModuleCheckPageFirstTime(pGMM, pGVM, pModule, idxRegion, idxPage, pPageDesc, pGlobalRegion); + } + + /* + * We've seen it before... + */ + Log(("Replace existing page guest %RGp host %RHp id %#x -> id %#x\n", + pPageDesc->GCPhys, pPageDesc->HCPhys, pPageDesc->idPage, pGlobalRegion->paidPages[idxPage])); + Assert(pPageDesc->idPage != pGlobalRegion->paidPages[idxPage]); + + /* + * Get the shared page source. + */ + PGMMPAGE pPage = gmmR0GetPage(pGMM, pGlobalRegion->paidPages[idxPage]); + AssertMsgReturn(pPage, ("idPage=%#x (idxRegion=%#x idxPage=%#x) #2\n", pPageDesc->idPage, idxRegion, idxPage), + VERR_PGM_PHYS_INVALID_PAGE_ID); + + if (pPage->Common.u2State != GMM_PAGE_STATE_SHARED) + { + /* + * Page was freed at some point; invalidate this entry. + */ + /** @todo this isn't really bullet proof. */ + Log(("Old shared page was freed -> create a new one\n")); + pGlobalRegion->paidPages[idxPage] = NIL_GMM_PAGEID; + return gmmR0SharedModuleCheckPageFirstTime(pGMM, pGVM, pModule, idxRegion, idxPage, pPageDesc, pGlobalRegion); + } + + Log(("Replace existing page guest host %RHp -> %RHp\n", pPageDesc->HCPhys, ((uint64_t)pPage->Shared.pfn) << PAGE_SHIFT)); + + /* + * Calculate the virtual address of the local page. 
+ */ + PGMMCHUNK pChunk = gmmR0GetChunk(pGMM, pPageDesc->idPage >> GMM_CHUNKID_SHIFT); + AssertMsgReturn(pChunk, ("idPage=%#x (idxRegion=%#x idxPage=%#x) #4\n", pPageDesc->idPage, idxRegion, idxPage), + VERR_PGM_PHYS_INVALID_PAGE_ID); + + uint8_t *pbChunk; + AssertMsgReturn(gmmR0IsChunkMapped(pGMM, pGVM, pChunk, (PRTR3PTR)&pbChunk), + ("idPage=%#x (idxRegion=%#x idxPage=%#x) #3\n", pPageDesc->idPage, idxRegion, idxPage), + VERR_PGM_PHYS_INVALID_PAGE_ID); + uint8_t *pbLocalPage = pbChunk + ((pPageDesc->idPage & GMM_PAGEID_IDX_MASK) << PAGE_SHIFT); + + /* + * Calculate the virtual address of the shared page. + */ + pChunk = gmmR0GetChunk(pGMM, pGlobalRegion->paidPages[idxPage] >> GMM_CHUNKID_SHIFT); + Assert(pChunk); /* can't fail as gmmR0GetPage succeeded. */ + + /* + * Get the virtual address of the physical page; map the chunk into the VM + * process if not already done. + */ + if (!gmmR0IsChunkMapped(pGMM, pGVM, pChunk, (PRTR3PTR)&pbChunk)) + { + Log(("Map chunk into process!\n")); + rc = gmmR0MapChunk(pGMM, pGVM, pChunk, false /*fRelaxedSem*/, (PRTR3PTR)&pbChunk); + AssertRCReturn(rc, rc); + } + uint8_t *pbSharedPage = pbChunk + ((pGlobalRegion->paidPages[idxPage] & GMM_PAGEID_IDX_MASK) << PAGE_SHIFT); + +#ifdef VBOX_STRICT + pPageDesc->u32StrictChecksum = RTCrc32(pbSharedPage, PAGE_SIZE); + uint32_t uChecksum = pPageDesc->u32StrictChecksum & UINT32_C(0x00003fff); + AssertMsg(!uChecksum || uChecksum == pPage->Shared.u14Checksum || !pPage->Shared.u14Checksum, + ("%#x vs %#x - idPage=%#x - %s %s\n", uChecksum, pPage->Shared.u14Checksum, + pGlobalRegion->paidPages[idxPage], pModule->szName, pModule->szVersion)); +#endif + + /** @todo write ASMMemComparePage. */ + if (memcmp(pbSharedPage, pbLocalPage, PAGE_SIZE)) + { + Log(("Unexpected differences found between local and shared page; skip\n")); + /* Signal to the caller that this one hasn't changed. */ + pPageDesc->idPage = NIL_GMM_PAGEID; + return VINF_SUCCESS; + } + + /* + * Free the old local page. 
+ */ + GMMFREEPAGEDESC PageDesc; + PageDesc.idPage = pPageDesc->idPage; + rc = gmmR0FreePages(pGMM, pGVM, 1, &PageDesc, GMMACCOUNT_BASE); + AssertRCReturn(rc, rc); + + gmmR0UseSharedPage(pGMM, pGVM, pPage); + + /* + * Pass along the new physical address & page id. + */ + pPageDesc->HCPhys = ((uint64_t)pPage->Shared.pfn) << PAGE_SHIFT; + pPageDesc->idPage = pGlobalRegion->paidPages[idxPage]; + + return VINF_SUCCESS; +} + + +/** + * RTAvlGCPtrDestroy callback. + * + * @returns 0 or VERR_GMM_INSTANCE. + * @param pNode The node to destroy. + * @param pvArgs Pointer to an argument packet. + */ +static DECLCALLBACK(int) gmmR0CleanupSharedModule(PAVLGCPTRNODECORE pNode, void *pvArgs) +{ + gmmR0ShModDeletePerVM(((GMMR0SHMODPERVMDTORARGS *)pvArgs)->pGMM, + ((GMMR0SHMODPERVMDTORARGS *)pvArgs)->pGVM, + (PGMMSHAREDMODULEPERVM)pNode, + false /*fRemove*/); + return VINF_SUCCESS; +} + + +/** + * Used by GMMR0CleanupVM to clean up shared modules. + * + * This is called without taking the GMM lock so that it can be yielded as + * needed here. + * + * @param pGMM The GMM handle. + * @param pGVM The global VM handle. + */ +static void gmmR0SharedModuleCleanup(PGMM pGMM, PGVM pGVM) +{ + gmmR0MutexAcquire(pGMM); + GMM_CHECK_SANITY_UPON_ENTERING(pGMM); + + GMMR0SHMODPERVMDTORARGS Args; + Args.pGVM = pGVM; + Args.pGMM = pGMM; + RTAvlGCPtrDestroy(&pGVM->gmm.s.pSharedModuleTree, gmmR0CleanupSharedModule, &Args); + + AssertMsg(pGVM->gmm.s.Stats.cShareableModules == 0, ("%d\n", pGVM->gmm.s.Stats.cShareableModules)); + pGVM->gmm.s.Stats.cShareableModules = 0; + + gmmR0MutexRelease(pGMM); +} + +#endif /* VBOX_WITH_PAGE_SHARING */ + +/** + * Removes all shared modules for the specified VM + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The VCPU id. 
+ */ +GMMR0DECL(int) GMMR0ResetSharedModules(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ +#ifdef VBOX_WITH_PAGE_SHARING + /* + * Validate input and get the basics. + */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + + /* + * Take the semaphore and do some more validations. + */ + gmmR0MutexAcquire(pGMM); + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + Log(("GMMR0ResetSharedModules\n")); + GMMR0SHMODPERVMDTORARGS Args; + Args.pGVM = pGVM; + Args.pGMM = pGMM; + RTAvlGCPtrDestroy(&pGVM->gmm.s.pSharedModuleTree, gmmR0CleanupSharedModule, &Args); + pGVM->gmm.s.Stats.cShareableModules = 0; + + rc = VINF_SUCCESS; + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + } + else + rc = VERR_GMM_IS_NOT_SANE; + + gmmR0MutexRelease(pGMM); + return rc; +#else + RT_NOREF(pGVM, pVM, idCpu); + return VERR_NOT_IMPLEMENTED; +#endif +} + +#ifdef VBOX_WITH_PAGE_SHARING + +/** + * Tree enumeration callback for checking a shared module. + */ +static DECLCALLBACK(int) gmmR0CheckSharedModule(PAVLGCPTRNODECORE pNode, void *pvUser) +{ + GMMCHECKSHAREDMODULEINFO *pArgs = (GMMCHECKSHAREDMODULEINFO*)pvUser; + PGMMSHAREDMODULEPERVM pRecVM = (PGMMSHAREDMODULEPERVM)pNode; + PGMMSHAREDMODULE pGblMod = pRecVM->pGlobalModule; + + Log(("gmmR0CheckSharedModule: check %s %s base=%RGv size=%x\n", + pGblMod->szName, pGblMod->szVersion, pGblMod->Core.Key, pGblMod->cbModule)); + + int rc = PGMR0SharedModuleCheck(pArgs->pGVM->pVM, pArgs->pGVM, pArgs->idCpu, pGblMod, pRecVM->aRegionsGCPtrs); + if (RT_FAILURE(rc)) + return rc; + return VINF_SUCCESS; +} + +#endif /* VBOX_WITH_PAGE_SHARING */ + +/** + * Check all shared modules for the specified VM. + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The calling EMT number. 
+ * @thread EMT(idCpu) + */ +GMMR0DECL(int) GMMR0CheckSharedModules(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ +#ifdef VBOX_WITH_PAGE_SHARING + /* + * Validate input and get the basics. + */ + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + +# ifndef DEBUG_sandervl + /* + * Take the semaphore and do some more validations. + */ + gmmR0MutexAcquire(pGMM); +# endif + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + /* + * Walk the tree, checking each module. + */ + Log(("GMMR0CheckSharedModules\n")); + + GMMCHECKSHAREDMODULEINFO Args; + Args.pGVM = pGVM; + Args.idCpu = idCpu; + rc = RTAvlGCPtrDoWithAll(&pGVM->gmm.s.pSharedModuleTree, true /* fFromLeft */, gmmR0CheckSharedModule, &Args); + + Log(("GMMR0CheckSharedModules done (rc=%Rrc)!\n", rc)); + GMM_CHECK_SANITY_UPON_LEAVING(pGMM); + } + else + rc = VERR_GMM_IS_NOT_SANE; + +# ifndef DEBUG_sandervl + gmmR0MutexRelease(pGMM); +# endif + return rc; +#else + RT_NOREF(pGVM, pVM, idCpu); + return VERR_NOT_IMPLEMENTED; +#endif +} + +#if defined(VBOX_STRICT) && HC_ARCH_BITS == 64 + +/** + * RTAvlU32DoWithAll callback. + * + * @returns 0 + * @param pNode The node to search. + * @param pvUser Pointer to the input argument packet. + */ +static DECLCALLBACK(int) gmmR0FindDupPageInChunk(PAVLU32NODECORE pNode, void *pvUser) +{ + PGMMCHUNK pChunk = (PGMMCHUNK)pNode; + GMMFINDDUPPAGEINFO *pArgs = (GMMFINDDUPPAGEINFO *)pvUser; + PGVM pGVM = pArgs->pGVM; + PGMM pGMM = pArgs->pGMM; + uint8_t *pbChunk; + + /* Only take chunks not mapped into this VM process; not entirely correct. 
*/ + if (!gmmR0IsChunkMapped(pGMM, pGVM, pChunk, (PRTR3PTR)&pbChunk)) + { + int rc = gmmR0MapChunk(pGMM, pGVM, pChunk, false /*fRelaxedSem*/, (PRTR3PTR)&pbChunk); + if (RT_SUCCESS(rc)) + { + /* + * Look for duplicate pages + */ + unsigned iPage = (GMM_CHUNK_SIZE >> PAGE_SHIFT); + while (iPage-- > 0) + { + if (GMM_PAGE_IS_PRIVATE(&pChunk->aPages[iPage])) + { + uint8_t *pbDestPage = pbChunk + (iPage << PAGE_SHIFT); + + if (!memcmp(pArgs->pSourcePage, pbDestPage, PAGE_SIZE)) + { + pArgs->fFoundDuplicate = true; + break; + } + } + } + gmmR0UnmapChunk(pGMM, pGVM, pChunk, false /*fRelaxedSem*/); + } + } + return pArgs->fFoundDuplicate; /* (stops search if true) */ +} + + +/** + * Find a duplicate of the specified page in other active VMs + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param pReq Pointer to the request packet. + */ +GMMR0DECL(int) GMMR0FindDuplicatePageReq(PGVM pGVM, PVM pVM, PGMMFINDDUPLICATEPAGEREQ pReq) +{ + /* + * Validate input and pass it on. + */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER); + + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + + int rc = GVMMR0ValidateGVMandVM(pGVM, pVM); + if (RT_FAILURE(rc)) + return rc; + + /* + * Take the semaphore and do some more validations. 
+ */ + rc = gmmR0MutexAcquire(pGMM); + if (GMM_CHECK_SANITY_UPON_ENTERING(pGMM)) + { + uint8_t *pbChunk; + PGMMCHUNK pChunk = gmmR0GetChunk(pGMM, pReq->idPage >> GMM_CHUNKID_SHIFT); + if (pChunk) + { + if (gmmR0IsChunkMapped(pGMM, pGVM, pChunk, (PRTR3PTR)&pbChunk)) + { + uint8_t *pbSourcePage = pbChunk + ((pReq->idPage & GMM_PAGEID_IDX_MASK) << PAGE_SHIFT); + PGMMPAGE pPage = gmmR0GetPage(pGMM, pReq->idPage); + if (pPage) + { + GMMFINDDUPPAGEINFO Args; + Args.pGVM = pGVM; + Args.pGMM = pGMM; + Args.pSourcePage = pbSourcePage; + Args.fFoundDuplicate = false; + RTAvlU32DoWithAll(&pGMM->pChunks, true /* fFromLeft */, gmmR0FindDupPageInChunk, &Args); + + pReq->fDuplicate = Args.fFoundDuplicate; + } + else + { + AssertFailed(); + rc = VERR_PGM_PHYS_INVALID_PAGE_ID; + } + } + else + AssertFailed(); + } + else + AssertFailed(); + } + else + rc = VERR_GMM_IS_NOT_SANE; + + gmmR0MutexRelease(pGMM); + return rc; +} + +#endif /* VBOX_STRICT && HC_ARCH_BITS == 64 */ + + +/** + * Retrieves the GMM statistics visible to the caller. + * + * @returns VBox status code. + * + * @param pStats Where to put the statistics. + * @param pSession The current session. + * @param pGVM The GVM to obtain statistics for. Optional. + * @param pVM The VM structure corresponding to @a pGVM. + */ +GMMR0DECL(int) GMMR0QueryStatistics(PGMMSTATS pStats, PSUPDRVSESSION pSession, PGVM pGVM, PVM pVM) +{ + LogFlow(("GVMMR0QueryStatistics: pStats=%p pSession=%p pGVM=%p pVM=%p\n", pStats, pSession, pGVM, pVM)); + + /* + * Validate input. + */ + AssertPtrReturn(pSession, VERR_INVALID_POINTER); + AssertPtrReturn(pStats, VERR_INVALID_POINTER); + pStats->cMaxPages = 0; /* (crash before taking the mutex...) */ + + PGMM pGMM; + GMM_GET_VALID_INSTANCE(pGMM, VERR_GMM_INSTANCE); + + /* + * Validate the VM handle, if not NULL, and lock the GMM. 
+ */ + int rc; + if (pGVM) + { + rc = GVMMR0ValidateGVMandVM(pGVM, pVM); + if (RT_FAILURE(rc)) + return rc; + } + + rc = gmmR0MutexAcquire(pGMM); + if (RT_FAILURE(rc)) + return rc; + + /* + * Copy out the GMM statistics. + */ + pStats->cMaxPages = pGMM->cMaxPages; + pStats->cReservedPages = pGMM->cReservedPages; + pStats->cOverCommittedPages = pGMM->cOverCommittedPages; + pStats->cAllocatedPages = pGMM->cAllocatedPages; + pStats->cSharedPages = pGMM->cSharedPages; + pStats->cDuplicatePages = pGMM->cDuplicatePages; + pStats->cLeftBehindSharedPages = pGMM->cLeftBehindSharedPages; + pStats->cBalloonedPages = pGMM->cBalloonedPages; + pStats->cChunks = pGMM->cChunks; + pStats->cFreedChunks = pGMM->cFreedChunks; + pStats->cShareableModules = pGMM->cShareableModules; + RT_ZERO(pStats->au64Reserved); + + /* + * Copy out the VM statistics. + */ + if (pGVM) + pStats->VMStats = pGVM->gmm.s.Stats; + else + RT_ZERO(pStats->VMStats); + + gmmR0MutexRelease(pGMM); + return rc; +} + + +/** + * VMMR0 request wrapper for GMMR0QueryStatistics. + * + * @returns see GMMR0QueryStatistics. + * @param pGVM The global (ring-0) VM structure. Optional. + * @param pVM The cross context VM structure. Optional. + * @param pReq Pointer to the request packet. + */ +GMMR0DECL(int) GMMR0QueryStatisticsReq(PGVM pGVM, PVM pVM, PGMMQUERYSTATISTICSSREQ pReq) +{ + /* + * Validate input and pass it on. + */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER); + + return GMMR0QueryStatistics(&pReq->Stats, pReq->pSession, pGVM, pVM); +} + + +/** + * Resets the specified GMM statistics. + * + * @returns VBox status code. + * + * @param pStats Which statistics to reset, that is, non-zero fields + * indicates which to reset. + * @param pSession The current session. + * @param pGVM The GVM to reset statistics for. Optional. + * @param pVM The VM structure corresponding to @a pGVM. 
+ */ +GMMR0DECL(int) GMMR0ResetStatistics(PCGMMSTATS pStats, PSUPDRVSESSION pSession, PGVM pGVM, PVM pVM) +{ + NOREF(pStats); NOREF(pSession); NOREF(pVM); NOREF(pGVM); + /* There is currently nothing to reset: every argument is ignored and success is returned. */ + return VINF_SUCCESS; +} + + +/** + * VMMR0 request wrapper for GMMR0ResetStatistics. + * + * @returns see GMMR0ResetStatistics. + * @param pGVM The global (ring-0) VM structure. Optional. + * @param pVM The cross context VM structure. Optional. + * @param pReq Pointer to the request packet. + */ +GMMR0DECL(int) GMMR0ResetStatisticsReq(PGVM pGVM, PVM pVM, PGMMRESETSTATISTICSSREQ pReq) +{ + /* + * Validate input and pass it on. + */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER); + + return GMMR0ResetStatistics(&pReq->Stats, pReq->pSession, pGVM, pVM); +} + diff --git a/src/VBox/VMM/VMMR0/GMMR0Internal.h b/src/VBox/VMM/VMMR0/GMMR0Internal.h new file mode 100644 index 00000000..51de8549 --- /dev/null +++ b/src/VBox/VMM/VMMR0/GMMR0Internal.h @@ -0,0 +1,92 @@ +/* $Id: GMMR0Internal.h $ */ +/** @file + * GMM - The Global Memory Manager, Internal Header. + */ + +/* + * Copyright (C) 2007-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ */ + +#ifndef VMM_INCLUDED_SRC_VMMR0_GMMR0Internal_h +#define VMM_INCLUDED_SRC_VMMR0_GMMR0Internal_h +#ifndef RT_WITHOUT_PRAGMA_ONCE +# pragma once +#endif + +#include <VBox/vmm/gmm.h> +#include <iprt/avl.h> + + +/** + * Shared module registration info (per VM). + */ +typedef struct GMMSHAREDMODULEPERVM +{ + /** Tree node. */ + AVLGCPTRNODECORE Core; + /** Pointer to global shared module info. */ + PGMMSHAREDMODULE pGlobalModule; + /** The region addresses (array declared with a single element). + * + * They can differ between VMs because of address space scrambling or + * simply different loading order. */ + RTGCPTR64 aRegionsGCPtrs[1]; +} GMMSHAREDMODULEPERVM; +/** Pointer to a GMMSHAREDMODULEPERVM. */ +typedef GMMSHAREDMODULEPERVM *PGMMSHAREDMODULEPERVM; + + +/** Pointer to a GMM allocation chunk. */ +typedef struct GMMCHUNK *PGMMCHUNK; + + +/** The GMMCHUNK::cFree shift count employed by gmmR0SelectFreeSetList. */ +#define GMM_CHUNK_FREE_SET_SHIFT 4 +/** Index of the list containing completely unused chunks. + * The code ASSUMES this is the last list. */ +#define GMM_CHUNK_FREE_SET_UNUSED_LIST (GMM_CHUNK_NUM_PAGES >> GMM_CHUNK_FREE_SET_SHIFT) + +/** + * A set of free chunks. + */ +typedef struct GMMCHUNKFREESET +{ + /** The number of free pages in the set. */ + uint64_t cFreePages; + /** The generation ID for the set. This is incremented whenever + * something is linked or unlinked from this set. */ + uint64_t idGeneration; + /** Chunks ordered by increasing number of free pages. + * In the final list the chunks are completely unused. */ + PGMMCHUNK apLists[GMM_CHUNK_FREE_SET_UNUSED_LIST + 1]; +} GMMCHUNKFREESET; + + + +/** + * The per-VM GMM data. + */ +typedef struct GMMPERVM +{ + /** Free set for use in bound mode. */ + GMMCHUNKFREESET Private; + /** The VM statistics. */ + GMMVMSTATS Stats; + /** Shared module tree (per-vm). */ + PAVLGCPTRNODECORE pSharedModuleTree; + /** Hints at the last chunk we allocated some memory from.
*/ + uint32_t idLastChunkHint; +} GMMPERVM; +/** Pointer to the per-VM GMM data. */ +typedef GMMPERVM *PGMMPERVM; + +#endif /* !VMM_INCLUDED_SRC_VMMR0_GMMR0Internal_h */ + diff --git a/src/VBox/VMM/VMMR0/GVMMR0.cpp b/src/VBox/VMM/VMMR0/GVMMR0.cpp new file mode 100644 index 00000000..13aef810 --- /dev/null +++ b/src/VBox/VMM/VMMR0/GVMMR0.cpp @@ -0,0 +1,3106 @@ +/* $Id: GVMMR0.cpp $ */ +/** @file + * GVMM - Global VM Manager. + */ + +/* + * Copyright (C) 2007-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/** @page pg_gvmm GVMM - The Global VM Manager + * + * The Global VM Manager lives in ring-0. Its main function at the moment is + * to manage a list of all running VMs, keep a ring-0 only structure (GVM) for + * each of them, and assign them unique identifiers (so GMM can track page + * owners). The GVMM also manages some of the host CPU resources, like the + * periodic preemption timer. + * + * The GVMM will create a ring-0 object for each VM when it is registered; this + * is both for session cleanup purposes and for having a point where it is + * possible to implement usage policies later (in SUPR0ObjRegister). + * + * + * @section sec_gvmm_ppt Periodic Preemption Timer (PPT) + * + * On systems that sport a high resolution kernel timer API, we use per-cpu + * timers to generate interrupts that preempt VT-x, AMD-V and raw-mode guest + * execution.
The timer frequency is calculated by taking the max + * TMCalcHostTimerFrequency for all VMs running on a CPU for the last ~160 ms + * (RT_ELEMENTS((PGVMMHOSTCPU)0, Ppt.aHzHistory) * + * GVMMHOSTCPU_PPT_HIST_INTERVAL_NS). + * + * The TMCalcHostTimerFrequency() part of things takes the max + * TMTimerSetFrequencyHint() value and adjusts by the current catch-up percent, + * warp drive percent and some fudge factors. VMMR0.cpp reports the result via + * GVMMR0SchedUpdatePeriodicPreemptionTimer() before switching to the VT-x, + * AMD-V and raw-mode execution environments. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_GVMM +#include <VBox/vmm/gvmm.h> +#include <VBox/vmm/gmm.h> +#include "GVMMR0Internal.h" +#include <VBox/vmm/gvm.h> +#include <VBox/vmm/vm.h> +#include <VBox/vmm/vmcpuset.h> +#include <VBox/vmm/vmm.h> +#ifdef VBOX_WITH_NEM_R0 +# include <VBox/vmm/nem.h> +#endif +#include <VBox/param.h> +#include <VBox/err.h> + +#include <iprt/asm.h> +#include <iprt/asm-amd64-x86.h> +#include <iprt/critsect.h> +#include <iprt/mem.h> +#include <iprt/semaphore.h> +#include <iprt/time.h> +#include <VBox/log.h> +#include <iprt/thread.h> +#include <iprt/process.h> +#include <iprt/param.h> +#include <iprt/string.h> +#include <iprt/assert.h> +#include <iprt/mem.h> +#include <iprt/memobj.h> +#include <iprt/mp.h> +#include <iprt/cpuset.h> +#include <iprt/spinlock.h> +#include <iprt/timer.h> + +#include "dtrace/VBoxVMM.h" + + +/********************************************************************************************************************************* +* Defined Constants And Macros *
+*********************************************************************************************************************************/ +#if defined(RT_OS_LINUX) || defined(RT_OS_SOLARIS) || defined(DOXYGEN_RUNNING) +/** Define this to enable the periodic preemption timer. */ +# define GVMM_SCHED_WITH_PPT +#endif + + +/** @def GVMM_CHECK_SMAP_SETUP + * SMAP check setup. */ +/** @def GVMM_CHECK_SMAP_CHECK + * Checks that the AC flag is set if SMAP is enabled. If AC is not set, + * it will be logged and @a a_BadExpr is executed. */ +/** @def GVMM_CHECK_SMAP_CHECK2 + * Checks that the AC flag is set if SMAP is enabled. If AC is not set, it will + * be logged, written to the VMs assertion text buffer, and @a a_BadExpr is + * executed. */ +#if defined(VBOX_STRICT) || 1 +# define GVMM_CHECK_SMAP_SETUP() uint32_t const fKernelFeatures = SUPR0GetKernelFeatures() +# define GVMM_CHECK_SMAP_CHECK(a_BadExpr) \ + do { \ + if (fKernelFeatures & SUPKERNELFEATURES_SMAP) \ + { \ + RTCCUINTREG fEflCheck = ASMGetFlags(); \ + if (RT_LIKELY(fEflCheck & X86_EFL_AC)) \ + { /* likely */ } \ + else \ + { \ + SUPR0Printf("%s, line %d: EFLAGS.AC is clear! (%#x)\n", __FUNCTION__, __LINE__, (uint32_t)fEflCheck); \ + a_BadExpr; \ + } \ + } \ + } while (0) +# define GVMM_CHECK_SMAP_CHECK2(a_pVM, a_BadExpr) \ + do { \ + if (fKernelFeatures & SUPKERNELFEATURES_SMAP) \ + { \ + RTCCUINTREG fEflCheck = ASMGetFlags(); \ + if (RT_LIKELY(fEflCheck & X86_EFL_AC)) \ + { /* likely */ } \ + else \ + { \ + SUPR0BadContext((a_pVM) ? 
(a_pVM)->pSession : NULL, __FILE__, __LINE__, "EFLAGS.AC is zero!"); \ + a_BadExpr; \ + } \ + } \ + } while (0) +#else +# define GVMM_CHECK_SMAP_SETUP() uint32_t const fKernelFeatures = 0 +# define GVMM_CHECK_SMAP_CHECK(a_BadExpr) NOREF(fKernelFeatures) +# define GVMM_CHECK_SMAP_CHECK2(a_pVM, a_BadExpr) NOREF(fKernelFeatures) +#endif + + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ + +/** + * Global VM handle. + */ +typedef struct GVMHANDLE +{ + /** The index of the next handle in the list (free or used). (0 is nil.) */ + uint16_t volatile iNext; + /** Our own index / handle value. */ + uint16_t iSelf; + /** The process ID of the handle owner. + * This is used for access checks. */ + RTPROCESS ProcId; + /** The pointer to the ring-0 only (aka global) VM structure. */ + PGVM pGVM; + /** The ring-0 mapping of the shared VM instance data. */ + PVM pVM; + /** The virtual machine object. */ + void *pvObj; + /** The session this VM is associated with. */ + PSUPDRVSESSION pSession; + /** The ring-0 handle of the EMT0 thread. + * This is used for ownership checks as well as looking up a VM handle by thread + * at times like assertions. */ + RTNATIVETHREAD hEMT0; +} GVMHANDLE; +/** Pointer to a global VM handle. */ +typedef GVMHANDLE *PGVMHANDLE; + +/** Number of GVM handles (including the NIL handle). */ +#if HC_ARCH_BITS == 64 +# define GVMM_MAX_HANDLES 8192 +#else +# define GVMM_MAX_HANDLES 128 +#endif + +/** + * Per host CPU GVMM data. + */ +typedef struct GVMMHOSTCPU +{ + /** Magic number (GVMMHOSTCPU_MAGIC). */ + uint32_t volatile u32Magic; + /** The CPU ID. */ + RTCPUID idCpu; + /** The CPU set index. */ + uint32_t idxCpuSet; + +#ifdef GVMM_SCHED_WITH_PPT + /** Periodic preemption timer data. 
*/ + struct + { + /** The handle to the periodic preemption timer. */ + PRTTIMER pTimer; + /** Spinlock protecting the data below. */ + RTSPINLOCK hSpinlock; + /** The smallest Hz that we need to care about. (static) */ + uint32_t uMinHz; + /** The number of ticks between each historization. */ + uint32_t cTicksHistoriziationInterval; + /** The current historization tick (counting up to + * cTicksHistoriziationInterval and then resetting). */ + uint32_t iTickHistorization; + /** The current timer interval. This is set to 0 when inactive. */ + uint32_t cNsInterval; + /** The current timer frequency. This is set to 0 when inactive. */ + uint32_t uTimerHz; + /** The current max frequency reported by the EMTs. + * This gets historicized and reset by the timer callback. This is + * read without holding the spinlock, so needs atomic updating. */ + uint32_t volatile uDesiredHz; + /** Whether the timer was started or not. */ + bool volatile fStarted; + /** Set if we're starting the timer. */ + bool volatile fStarting; + /** The index of the next history entry (mod it). */ + uint32_t iHzHistory; + /** Historicized uDesiredHz values. The array wraps around, new entries + * are added at iHzHistory. This is updated approximately every + * GVMMHOSTCPU_PPT_HIST_INTERVAL_NS by the timer callback. */ + uint32_t aHzHistory[8]; + /** Statistics counter for recording the number of interval changes. */ + uint32_t cChanges; + /** Statistics counter for recording the number of timer starts. */ + uint32_t cStarts; + } Ppt; +#endif /* GVMM_SCHED_WITH_PPT */ + +} GVMMHOSTCPU; +/** Pointer to the per host CPU GVMM data. */ +typedef GVMMHOSTCPU *PGVMMHOSTCPU; +/** The GVMMHOSTCPU::u32Magic value (Petra, Tanya & Rachel Haden). */ +#define GVMMHOSTCPU_MAGIC UINT32_C(0x19711011) +/** The interval one history entry should cover (approximately), given in + * nanoseconds. */ +#define GVMMHOSTCPU_PPT_HIST_INTERVAL_NS UINT32_C(20000000) + + +/** + * The GVMM instance data.
+ */ +typedef struct GVMM +{ + /** Eyecatcher / magic. */ + uint32_t u32Magic; + /** The index of the head of the free handle chain. (0 is nil.) */ + uint16_t volatile iFreeHead; + /** The index of the head of the active handle chain. (0 is nil.) */ + uint16_t volatile iUsedHead; + /** The number of VMs. */ + uint16_t volatile cVMs; + /** Alignment padding. */ + uint16_t u16Reserved; + /** The number of EMTs. */ + uint32_t volatile cEMTs; + /** The number of EMTs that have halted in GVMMR0SchedHalt. */ + uint32_t volatile cHaltedEMTs; + /** Mini lock for restricting early wake-ups to one thread. */ + bool volatile fDoingEarlyWakeUps; + bool afPadding[3]; /**< explicit alignment padding. */ + /** When the next halted or sleeping EMT will wake up. + * This is set to 0 when it needs recalculating and to UINT64_MAX when + * there are no halted or sleeping EMTs in the GVMM. */ + uint64_t uNsNextEmtWakeup; + /** The lock used to serialize VM creation, destruction and associated events that + * aren't performance critical. Owners may acquire the list lock. */ + RTCRITSECT CreateDestroyLock; + /** The lock used to serialize used list updates and accesses. + * This indirectly includes scheduling since the scheduler will have to walk the + * used list to examine running VMs. Owners may not acquire any other locks. */ + RTCRITSECTRW UsedLock; + /** The handle array. + * The size of this array defines the maximum number of currently running VMs. + * The first entry is unused as it represents the NIL handle. */ + GVMHANDLE aHandles[GVMM_MAX_HANDLES]; + + /** @gcfgm{/GVMM/cEMTsMeansCompany, 32-bit, 0, UINT32_MAX, 1} + * The number of EMTs that means we no longer consider ourselves alone on a + * CPU/Core. + */ + uint32_t cEMTsMeansCompany; + /** @gcfgm{/GVMM/MinSleepAlone,32-bit, 0, 100000000, 750000, ns} + * The minimum sleep time for when we're alone, in nano seconds.
+ */ + uint32_t nsMinSleepAlone; + /** @gcfgm{/GVMM/MinSleepCompany,32-bit,0, 100000000, 15000, ns} + * The minimum sleep time for when we've got company, in nano seconds. + */ + uint32_t nsMinSleepCompany; + /** @gcfgm{/GVMM/EarlyWakeUp1, 32-bit, 0, 100000000, 25000, ns} + * The limit for the first round of early wake-ups, given in nano seconds. + */ + uint32_t nsEarlyWakeUp1; + /** @gcfgm{/GVMM/EarlyWakeUp2, 32-bit, 0, 100000000, 50000, ns} + * The limit for the second round of early wake-ups, given in nano seconds. + */ + uint32_t nsEarlyWakeUp2; + + /** Set if we're doing early wake-ups. + * This reflects nsEarlyWakeUp1 and nsEarlyWakeUp2. */ + bool volatile fDoEarlyWakeUps; + + /** The number of entries in the host CPU array (aHostCpus). */ + uint32_t cHostCpus; + /** Per host CPU data (variable length). */ + GVMMHOSTCPU aHostCpus[1]; +} GVMM; +AssertCompileMemberAlignment(GVMM, CreateDestroyLock, 8); +AssertCompileMemberAlignment(GVMM, UsedLock, 8); +AssertCompileMemberAlignment(GVMM, uNsNextEmtWakeup, 8); +/** Pointer to the GVMM instance data. */ +typedef GVMM *PGVMM; + +/** The GVMM::u32Magic value (Charlie Haden). */ +#define GVMM_MAGIC UINT32_C(0x19370806) + + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +/** Pointer to the GVMM instance data. + * (Just my general dislike for global variables.) */ +static PGVMM g_pGVMM = NULL; + +/** Macro for obtaining and validating the g_pGVMM pointer. + * On failure it will return from the invoking function with the specified return value. + * + * @param pGVMM The name of the pGVMM variable. + * @param rc The return value on failure. Use VERR_GVMM_INSTANCE for VBox + * status codes. 
+ */ +#define GVMM_GET_VALID_INSTANCE(pGVMM, rc) \ + do { \ + (pGVMM) = g_pGVMM;\ + AssertPtrReturn((pGVMM), (rc)); \ + AssertMsgReturn((pGVMM)->u32Magic == GVMM_MAGIC, ("%p - %#x\n", (pGVMM), (pGVMM)->u32Magic), (rc)); \ + } while (0) + +/** Macro for obtaining and validating the g_pGVMM pointer, void function variant. + * On failure it will return from the invoking function. + * + * @param pGVMM The name of the pGVMM variable. + */ +#define GVMM_GET_VALID_INSTANCE_VOID(pGVMM) \ + do { \ + (pGVMM) = g_pGVMM;\ + AssertPtrReturnVoid((pGVMM)); \ + AssertMsgReturnVoid((pGVMM)->u32Magic == GVMM_MAGIC, ("%p - %#x\n", (pGVMM), (pGVMM)->u32Magic)); \ + } while (0) + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +static void gvmmR0InitPerVMData(PGVM pGVM); +static DECLCALLBACK(void) gvmmR0HandleObjDestructor(void *pvObj, void *pvGVMM, void *pvHandle); +static int gvmmR0ByGVMandVM(PGVM pGVM, PVM pVM, PGVMM *ppGVMM, bool fTakeUsedLock); +static int gvmmR0ByGVMandVMandEMT(PGVM pGVM, PVM pVM, VMCPUID idCpu, PGVMM *ppGVMM); + +#ifdef GVMM_SCHED_WITH_PPT +static DECLCALLBACK(void) gvmmR0SchedPeriodicPreemptionTimerCallback(PRTTIMER pTimer, void *pvUser, uint64_t iTick); +#endif + + +/** + * Initializes the GVMM. + * + * This is called while owning the loader semaphore (see supdrvIOCtl_LdrLoad()). + * + * @returns VBox status code. + */ +GVMMR0DECL(int) GVMMR0Init(void) +{ + LogFlow(("GVMMR0Init:\n")); + + /* + * Allocate and initialize the instance data. 
+ */ + uint32_t cHostCpus = RTMpGetArraySize(); + AssertMsgReturn(cHostCpus > 0 && cHostCpus < _64K, ("%d", (int)cHostCpus), VERR_GVMM_HOST_CPU_RANGE); + + PGVMM pGVMM = (PGVMM)RTMemAllocZ(RT_UOFFSETOF_DYN(GVMM, aHostCpus[cHostCpus])); + if (!pGVMM) + return VERR_NO_MEMORY; + int rc = RTCritSectInitEx(&pGVMM->CreateDestroyLock, 0, NIL_RTLOCKVALCLASS, RTLOCKVAL_SUB_CLASS_NONE, + "GVMM-CreateDestroyLock"); + if (RT_SUCCESS(rc)) + { + rc = RTCritSectRwInitEx(&pGVMM->UsedLock, 0, NIL_RTLOCKVALCLASS, RTLOCKVAL_SUB_CLASS_NONE, "GVMM-UsedLock"); + if (RT_SUCCESS(rc)) + { + pGVMM->u32Magic = GVMM_MAGIC; + pGVMM->iUsedHead = 0; + pGVMM->iFreeHead = 1; + + /* the nil handle */ + pGVMM->aHandles[0].iSelf = 0; + pGVMM->aHandles[0].iNext = 0; + + /* the tail */ + unsigned i = RT_ELEMENTS(pGVMM->aHandles) - 1; + pGVMM->aHandles[i].iSelf = i; + pGVMM->aHandles[i].iNext = 0; /* nil */ + + /* the rest */ + while (i-- > 1) + { + pGVMM->aHandles[i].iSelf = i; + pGVMM->aHandles[i].iNext = i + 1; + } + + /* The default configuration values. */ + uint32_t cNsResolution = RTSemEventMultiGetResolution(); + pGVMM->cEMTsMeansCompany = 1; /** @todo should be adjusted to relative to the cpu count or something... */ + if (cNsResolution >= 5*RT_NS_100US) + { + pGVMM->nsMinSleepAlone = 750000 /* ns (0.750 ms) */; /** @todo this should be adjusted to be 75% (or something) of the scheduler granularity... 
*/ + pGVMM->nsMinSleepCompany = 15000 /* ns (0.015 ms) */; + pGVMM->nsEarlyWakeUp1 = 25000 /* ns (0.025 ms) */; + pGVMM->nsEarlyWakeUp2 = 50000 /* ns (0.050 ms) */; + } + else if (cNsResolution > RT_NS_100US) + { + pGVMM->nsMinSleepAlone = cNsResolution / 2; + pGVMM->nsMinSleepCompany = cNsResolution / 4; + pGVMM->nsEarlyWakeUp1 = 0; + pGVMM->nsEarlyWakeUp2 = 0; + } + else + { + pGVMM->nsMinSleepAlone = 2000; + pGVMM->nsMinSleepCompany = 2000; + pGVMM->nsEarlyWakeUp1 = 0; + pGVMM->nsEarlyWakeUp2 = 0; + } + pGVMM->fDoEarlyWakeUps = pGVMM->nsEarlyWakeUp1 > 0 && pGVMM->nsEarlyWakeUp2 > 0; + + /* The host CPU data. */ + pGVMM->cHostCpus = cHostCpus; + uint32_t iCpu = cHostCpus; + RTCPUSET PossibleSet; + RTMpGetSet(&PossibleSet); + while (iCpu-- > 0) + { + pGVMM->aHostCpus[iCpu].idxCpuSet = iCpu; +#ifdef GVMM_SCHED_WITH_PPT + pGVMM->aHostCpus[iCpu].Ppt.pTimer = NULL; + pGVMM->aHostCpus[iCpu].Ppt.hSpinlock = NIL_RTSPINLOCK; + pGVMM->aHostCpus[iCpu].Ppt.uMinHz = 5; /** @todo Add some API which figures this one out. 
(not *that* important) */ + pGVMM->aHostCpus[iCpu].Ppt.cTicksHistoriziationInterval = 1; + //pGVMM->aHostCpus[iCpu].Ppt.iTickHistorization = 0; + //pGVMM->aHostCpus[iCpu].Ppt.cNsInterval = 0; + //pGVMM->aHostCpus[iCpu].Ppt.uTimerHz = 0; + //pGVMM->aHostCpus[iCpu].Ppt.uDesiredHz = 0; + //pGVMM->aHostCpus[iCpu].Ppt.fStarted = false; + //pGVMM->aHostCpus[iCpu].Ppt.fStarting = false; + //pGVMM->aHostCpus[iCpu].Ppt.iHzHistory = 0; + //pGVMM->aHostCpus[iCpu].Ppt.aHzHistory = {0}; +#endif + + if (RTCpuSetIsMember(&PossibleSet, iCpu)) + { + pGVMM->aHostCpus[iCpu].idCpu = RTMpCpuIdFromSetIndex(iCpu); + pGVMM->aHostCpus[iCpu].u32Magic = GVMMHOSTCPU_MAGIC; + +#ifdef GVMM_SCHED_WITH_PPT + rc = RTTimerCreateEx(&pGVMM->aHostCpus[iCpu].Ppt.pTimer, + 50*1000*1000 /* whatever */, + RTTIMER_FLAGS_CPU(iCpu) | RTTIMER_FLAGS_HIGH_RES, + gvmmR0SchedPeriodicPreemptionTimerCallback, + &pGVMM->aHostCpus[iCpu]); + if (RT_SUCCESS(rc)) + rc = RTSpinlockCreate(&pGVMM->aHostCpus[iCpu].Ppt.hSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_SAFE, "GVMM/CPU"); + if (RT_FAILURE(rc)) + { + while (iCpu < cHostCpus) + { + RTTimerDestroy(pGVMM->aHostCpus[iCpu].Ppt.pTimer); + RTSpinlockDestroy(pGVMM->aHostCpus[iCpu].Ppt.hSpinlock); + pGVMM->aHostCpus[iCpu].Ppt.hSpinlock = NIL_RTSPINLOCK; + iCpu++; + } + break; + } +#endif + } + else + { + pGVMM->aHostCpus[iCpu].idCpu = NIL_RTCPUID; + pGVMM->aHostCpus[iCpu].u32Magic = 0; + } + } + if (RT_SUCCESS(rc)) + { + g_pGVMM = pGVMM; + LogFlow(("GVMMR0Init: pGVMM=%p cHostCpus=%u\n", pGVMM, cHostCpus)); + return VINF_SUCCESS; + } + + /* bail out. */ + RTCritSectRwDelete(&pGVMM->UsedLock); + } + RTCritSectDelete(&pGVMM->CreateDestroyLock); + } + + RTMemFree(pGVMM); + return rc; +} + + +/** + * Terminates the GVM. + * + * This is called while owning the loader semaphore (see supdrvLdrFree()). + * And unless something is wrong, there should be absolutely no VMs + * registered at this point. 
+ */ +GVMMR0DECL(void) GVMMR0Term(void) +{ + LogFlow(("GVMMR0Term:\n")); + + PGVMM pGVMM = g_pGVMM; + g_pGVMM = NULL; + if (RT_UNLIKELY(!VALID_PTR(pGVMM))) + { + SUPR0Printf("GVMMR0Term: pGVMM=%RKv\n", pGVMM); + return; + } + + /* + * First of all, stop all active timers. + */ + uint32_t cActiveTimers = 0; + uint32_t iCpu = pGVMM->cHostCpus; + while (iCpu-- > 0) + { + ASMAtomicWriteU32(&pGVMM->aHostCpus[iCpu].u32Magic, ~GVMMHOSTCPU_MAGIC); +#ifdef GVMM_SCHED_WITH_PPT + if ( pGVMM->aHostCpus[iCpu].Ppt.pTimer != NULL + && RT_SUCCESS(RTTimerStop(pGVMM->aHostCpus[iCpu].Ppt.pTimer))) + cActiveTimers++; +#endif + } + if (cActiveTimers) + RTThreadSleep(1); /* fudge */ + + /* + * Invalidate the instance and free resources. + */ + pGVMM->u32Magic = ~GVMM_MAGIC; + RTCritSectRwDelete(&pGVMM->UsedLock); + RTCritSectDelete(&pGVMM->CreateDestroyLock); + + pGVMM->iFreeHead = 0; + if (pGVMM->iUsedHead) + { + SUPR0Printf("GVMMR0Term: iUsedHead=%#x! (cVMs=%#x cEMTs=%#x)\n", pGVMM->iUsedHead, pGVMM->cVMs, pGVMM->cEMTs); + pGVMM->iUsedHead = 0; + } + +#ifdef GVMM_SCHED_WITH_PPT + iCpu = pGVMM->cHostCpus; + while (iCpu-- > 0) + { + RTTimerDestroy(pGVMM->aHostCpus[iCpu].Ppt.pTimer); + pGVMM->aHostCpus[iCpu].Ppt.pTimer = NULL; + RTSpinlockDestroy(pGVMM->aHostCpus[iCpu].Ppt.hSpinlock); + pGVMM->aHostCpus[iCpu].Ppt.hSpinlock = NIL_RTSPINLOCK; + } +#endif + + RTMemFree(pGVMM); +} + + +/** + * A quick hack for setting global config values. + * + * @returns VBox status code. + * + * @param pSession The session handle. This function only checks that it is a valid pointer. + * @param pszName The variable name. + * @param u64Value The new value. + */ +GVMMR0DECL(int) GVMMR0SetConfig(PSUPDRVSESSION pSession, const char *pszName, uint64_t u64Value) +{ + /* + * Validate input. + */ + PGVMM pGVMM; + GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE); + AssertPtrReturn(pSession, VERR_INVALID_HANDLE); + AssertPtrReturn(pszName, VERR_INVALID_POINTER); + + /* + * String switch time!
+ */ + if (strncmp(pszName, RT_STR_TUPLE("/GVMM/"))) + return VERR_CFGM_VALUE_NOT_FOUND; /* borrow status codes from CFGM... */ + int rc = VINF_SUCCESS; + pszName += sizeof("/GVMM/") - 1; + if (!strcmp(pszName, "cEMTsMeansCompany")) + { + if (u64Value <= UINT32_MAX) + pGVMM->cEMTsMeansCompany = u64Value; + else + rc = VERR_OUT_OF_RANGE; + } + else if (!strcmp(pszName, "MinSleepAlone")) + { + if (u64Value <= RT_NS_100MS) + pGVMM->nsMinSleepAlone = u64Value; + else + rc = VERR_OUT_OF_RANGE; + } + else if (!strcmp(pszName, "MinSleepCompany")) + { + if (u64Value <= RT_NS_100MS) + pGVMM->nsMinSleepCompany = u64Value; + else + rc = VERR_OUT_OF_RANGE; + } + else if (!strcmp(pszName, "EarlyWakeUp1")) + { + if (u64Value <= RT_NS_100MS) + { + pGVMM->nsEarlyWakeUp1 = u64Value; + pGVMM->fDoEarlyWakeUps = pGVMM->nsEarlyWakeUp1 > 0 && pGVMM->nsEarlyWakeUp2 > 0; + } + else + rc = VERR_OUT_OF_RANGE; + } + else if (!strcmp(pszName, "EarlyWakeUp2")) + { + if (u64Value <= RT_NS_100MS) + { + pGVMM->nsEarlyWakeUp2 = u64Value; + pGVMM->fDoEarlyWakeUps = pGVMM->nsEarlyWakeUp1 > 0 && pGVMM->nsEarlyWakeUp2 > 0; + } + else + rc = VERR_OUT_OF_RANGE; + } + else + rc = VERR_CFGM_VALUE_NOT_FOUND; + return rc; +} + + +/** + * A quick hack for getting global config values. + * + * @returns VBox status code. + * + * @param pSession The session handle. Used for authentication. + * @param pszName The variable name. + * @param pu64Value Where to return the value. + */ +GVMMR0DECL(int) GVMMR0QueryConfig(PSUPDRVSESSION pSession, const char *pszName, uint64_t *pu64Value) +{ + /* + * Validate input. + */ + PGVMM pGVMM; + GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE); + AssertPtrReturn(pSession, VERR_INVALID_HANDLE); + AssertPtrReturn(pszName, VERR_INVALID_POINTER); + AssertPtrReturn(pu64Value, VERR_INVALID_POINTER); + + /* + * String switch time! + */ + if (strncmp(pszName, RT_STR_TUPLE("/GVMM/"))) + return VERR_CFGM_VALUE_NOT_FOUND; /* borrow status codes from CFGM... 
*/ + int rc = VINF_SUCCESS; + pszName += sizeof("/GVMM/") - 1; + if (!strcmp(pszName, "cEMTsMeansCompany")) + *pu64Value = pGVMM->cEMTsMeansCompany; + else if (!strcmp(pszName, "MinSleepAlone")) + *pu64Value = pGVMM->nsMinSleepAlone; + else if (!strcmp(pszName, "MinSleepCompany")) + *pu64Value = pGVMM->nsMinSleepCompany; + else if (!strcmp(pszName, "EarlyWakeUp1")) + *pu64Value = pGVMM->nsEarlyWakeUp1; + else if (!strcmp(pszName, "EarlyWakeUp2")) + *pu64Value = pGVMM->nsEarlyWakeUp2; + else + rc = VERR_CFGM_VALUE_NOT_FOUND; + return rc; +} + + +/** + * Acquire the 'used' lock in shared mode. + * + * This prevents destruction of the VM while we're in ring-0. + * + * @returns IPRT status code, see RTCritSectRwEnterShared. + * @param a_pGVMM The GVMM instance data. + * @sa GVMMR0_USED_SHARED_UNLOCK, GVMMR0_USED_EXCLUSIVE_LOCK + */ +#define GVMMR0_USED_SHARED_LOCK(a_pGVMM) RTCritSectRwEnterShared(&(a_pGVMM)->UsedLock) + +/** + * Release the 'used' lock when owning it in shared mode. + * + * @returns IPRT status code, see RTCritSectRwLeaveShared. + * @param a_pGVMM The GVMM instance data. + * @sa GVMMR0_USED_SHARED_LOCK + */ +#define GVMMR0_USED_SHARED_UNLOCK(a_pGVMM) RTCritSectRwLeaveShared(&(a_pGVMM)->UsedLock) + +/** + * Acquire the 'used' lock in exclusive mode. + * + * Only use this macro when making changes to the used list. + * + * @returns IPRT status code, see RTCritSectRwEnterExcl. + * @param a_pGVMM The GVMM instance data. + * @sa GVMMR0_USED_EXCLUSIVE_UNLOCK + */ +#define GVMMR0_USED_EXCLUSIVE_LOCK(a_pGVMM) RTCritSectRwEnterExcl(&(a_pGVMM)->UsedLock) + +/** + * Release the 'used' lock when owning it in exclusive mode. + * + * @returns IPRT status code, see RTCritSectRwLeaveExcl. + * @param a_pGVMM The GVMM instance data. + * @sa GVMMR0_USED_EXCLUSIVE_LOCK, GVMMR0_USED_SHARED_UNLOCK + */ +#define GVMMR0_USED_EXCLUSIVE_UNLOCK(a_pGVMM) RTCritSectRwLeaveExcl(&(a_pGVMM)->UsedLock) + + +/** + * Try acquire the 'create & destroy' lock.
+ * + * @returns IPRT status code, see RTCritSectEnter. + * @param pGVMM The GVMM instance data. + */ +DECLINLINE(int) gvmmR0CreateDestroyLock(PGVMM pGVMM) +{ + LogFlow(("++gvmmR0CreateDestroyLock(%p)\n", pGVMM)); + int rc = RTCritSectEnter(&pGVMM->CreateDestroyLock); + LogFlow(("gvmmR0CreateDestroyLock(%p)->%Rrc\n", pGVMM, rc)); + return rc; +} + + +/** + * Release the 'create & destroy' lock. + * + * @returns IPRT status code, see RTCritSectLeave. + * @param pGVMM The GVMM instance data. + */ +DECLINLINE(int) gvmmR0CreateDestroyUnlock(PGVMM pGVMM) +{ + LogFlow(("--gvmmR0CreateDestroyUnlock(%p)\n", pGVMM)); + int rc = RTCritSectLeave(&pGVMM->CreateDestroyLock); + AssertRC(rc); + return rc; +} + + +/** + * Request wrapper for the GVMMR0CreateVM API. + * + * @returns VBox status code. + * @param pReq The request buffer. + * @param pSession The session handle. The VM will be associated with this. + */ +GVMMR0DECL(int) GVMMR0CreateVMReq(PGVMMCREATEVMREQ pReq, PSUPDRVSESSION pSession) +{ + /* + * Validate the request. + */ + if (!VALID_PTR(pReq)) + return VERR_INVALID_POINTER; + if (pReq->Hdr.cbReq != sizeof(*pReq)) + return VERR_INVALID_PARAMETER; + if (pReq->pSession != pSession) + return VERR_INVALID_POINTER; + + /* + * Execute it. + */ + PVM pVM; + pReq->pVMR0 = NULL; + pReq->pVMR3 = NIL_RTR3PTR; + int rc = GVMMR0CreateVM(pSession, pReq->cCpus, &pVM); + if (RT_SUCCESS(rc)) + { + pReq->pVMR0 = pVM; + pReq->pVMR3 = pVM->pVMR3; + } + return rc; +} + + +/** + * Allocates the VM structure and registers it with GVM. + * + * The caller will become the VM owner and thereby the EMT. + * + * @returns VBox status code. + * @param pSession The support driver session. + * @param cCpus Number of virtual CPUs for the new VM. + * @param ppVM Where to store the pointer to the VM structure. + * + * @thread EMT.
+ */ +GVMMR0DECL(int) GVMMR0CreateVM(PSUPDRVSESSION pSession, uint32_t cCpus, PVM *ppVM) +{ + LogFlow(("GVMMR0CreateVM: pSession=%p\n", pSession)); + PGVMM pGVMM; + GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE); + + AssertPtrReturn(ppVM, VERR_INVALID_POINTER); + *ppVM = NULL; + + if ( cCpus == 0 + || cCpus > VMM_MAX_CPU_COUNT) + return VERR_INVALID_PARAMETER; + + RTNATIVETHREAD hEMT0 = RTThreadNativeSelf(); + AssertReturn(hEMT0 != NIL_RTNATIVETHREAD, VERR_GVMM_BROKEN_IPRT); + RTPROCESS ProcId = RTProcSelf(); + AssertReturn(ProcId != NIL_RTPROCESS, VERR_GVMM_BROKEN_IPRT); + + /* + * The whole allocation process is protected by the lock. + */ + int rc = gvmmR0CreateDestroyLock(pGVMM); + AssertRCReturn(rc, rc); + + /* + * Only one VM per session. + */ + if (SUPR0GetSessionVM(pSession) != NULL) + { + gvmmR0CreateDestroyUnlock(pGVMM); + SUPR0Printf("GVMMR0CreateVM: The session %p already got a VM: %p\n", pSession, SUPR0GetSessionVM(pSession)); + return VERR_ALREADY_EXISTS; + } + + /* + * Allocate a handle first so we don't waste resources unnecessarily. + */ + uint16_t iHandle = pGVMM->iFreeHead; + if (iHandle) + { + PGVMHANDLE pHandle = &pGVMM->aHandles[iHandle]; + + /* consistency checks, a bit paranoid as always. */ + if ( !pHandle->pVM + && !pHandle->pGVM + && !pHandle->pvObj + && pHandle->iSelf == iHandle) + { + pHandle->pvObj = SUPR0ObjRegister(pSession, SUPDRVOBJTYPE_VM, gvmmR0HandleObjDestructor, pGVMM, pHandle); + if (pHandle->pvObj) + { + /* + * Move the handle from the free to used list and perform permission checks. 
+ */ + rc = GVMMR0_USED_EXCLUSIVE_LOCK(pGVMM); + AssertRC(rc); + + pGVMM->iFreeHead = pHandle->iNext; + pHandle->iNext = pGVMM->iUsedHead; + pGVMM->iUsedHead = iHandle; + pGVMM->cVMs++; + + pHandle->pVM = NULL; + pHandle->pGVM = NULL; + pHandle->pSession = pSession; + pHandle->hEMT0 = NIL_RTNATIVETHREAD; + pHandle->ProcId = NIL_RTPROCESS; + + GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM); + + rc = SUPR0ObjVerifyAccess(pHandle->pvObj, pSession, NULL); + if (RT_SUCCESS(rc)) + { + /* + * Allocate the global VM structure (GVM) and initialize it. + */ + PGVM pGVM = (PGVM)RTMemAllocZ(RT_UOFFSETOF_DYN(GVM, aCpus[cCpus])); + if (pGVM) + { + pGVM->u32Magic = GVM_MAGIC; + pGVM->hSelf = iHandle; + pGVM->pVM = NULL; + pGVM->cCpus = cCpus; + pGVM->pSession = pSession; + + gvmmR0InitPerVMData(pGVM); + GMMR0InitPerVMData(pGVM); + + /* + * Allocate the shared VM structure and associated page array. + */ + const uint32_t cbVM = RT_UOFFSETOF_DYN(VM, aCpus[cCpus]); + const uint32_t cPages = RT_ALIGN_32(cbVM, PAGE_SIZE) >> PAGE_SHIFT; + rc = RTR0MemObjAllocLow(&pGVM->gvmm.s.VMMemObj, cPages << PAGE_SHIFT, false /* fExecutable */); + if (RT_SUCCESS(rc)) + { + PVM pVM = (PVM)RTR0MemObjAddress(pGVM->gvmm.s.VMMemObj); AssertPtr(pVM); + memset(pVM, 0, cPages << PAGE_SHIFT); + pVM->enmVMState = VMSTATE_CREATING; + pVM->pVMR0 = pVM; + pVM->pSession = pSession; + pVM->hSelf = iHandle; + pVM->cbSelf = cbVM; + pVM->cCpus = cCpus; + pVM->uCpuExecutionCap = 100; /* default is no cap. 
*/ + pVM->offVMCPU = RT_UOFFSETOF_DYN(VM, aCpus); + AssertCompileMemberAlignment(VM, cpum, 64); + AssertCompileMemberAlignment(VM, tm, 64); + AssertCompileMemberAlignment(VM, aCpus, PAGE_SIZE); + + rc = RTR0MemObjAllocPage(&pGVM->gvmm.s.VMPagesMemObj, cPages * sizeof(SUPPAGE), false /* fExecutable */); + if (RT_SUCCESS(rc)) + { + PSUPPAGE paPages = (PSUPPAGE)RTR0MemObjAddress(pGVM->gvmm.s.VMPagesMemObj); AssertPtr(paPages); + for (uint32_t iPage = 0; iPage < cPages; iPage++) + { + paPages[iPage].uReserved = 0; + paPages[iPage].Phys = RTR0MemObjGetPagePhysAddr(pGVM->gvmm.s.VMMemObj, iPage); + Assert(paPages[iPage].Phys != NIL_RTHCPHYS); + } + + /* + * Map them into ring-3. + */ + rc = RTR0MemObjMapUser(&pGVM->gvmm.s.VMMapObj, pGVM->gvmm.s.VMMemObj, (RTR3PTR)-1, 0, + RTMEM_PROT_READ | RTMEM_PROT_WRITE, NIL_RTR0PROCESS); + if (RT_SUCCESS(rc)) + { + PVMR3 pVMR3 = RTR0MemObjAddressR3(pGVM->gvmm.s.VMMapObj); + pVM->pVMR3 = pVMR3; + AssertPtr((void *)pVMR3); + + /* Initialize all the VM pointers. */ + for (VMCPUID i = 0; i < cCpus; i++) + { + pVM->aCpus[i].idCpu = i; + pVM->aCpus[i].pVMR0 = pVM; + pVM->aCpus[i].pVMR3 = pVMR3; + pVM->aCpus[i].idHostCpu = NIL_RTCPUID; + pVM->aCpus[i].hNativeThreadR0 = NIL_RTNATIVETHREAD; + } + + rc = RTR0MemObjMapUser(&pGVM->gvmm.s.VMPagesMapObj, pGVM->gvmm.s.VMPagesMemObj, (RTR3PTR)-1, + 0 /* uAlignment */, RTMEM_PROT_READ | RTMEM_PROT_WRITE, + NIL_RTR0PROCESS); + if (RT_SUCCESS(rc)) + { + pVM->paVMPagesR3 = RTR0MemObjAddressR3(pGVM->gvmm.s.VMPagesMapObj); + AssertPtr((void *)pVM->paVMPagesR3); + + /* complete the handle - take the UsedLock sem just to be careful. 
*/ + rc = GVMMR0_USED_EXCLUSIVE_LOCK(pGVMM); + AssertRC(rc); + + pHandle->pVM = pVM; + pHandle->pGVM = pGVM; + pHandle->hEMT0 = hEMT0; + pHandle->ProcId = ProcId; + pGVM->pVM = pVM; + pGVM->pVMR3 = pVMR3; + pGVM->aCpus[0].hEMT = hEMT0; + pVM->aCpus[0].hNativeThreadR0 = hEMT0; + pGVMM->cEMTs += cCpus; + + for (VMCPUID i = 0; i < cCpus; i++) + { + pGVM->aCpus[i].pVCpu = &pVM->aCpus[i]; + pGVM->aCpus[i].pVM = pVM; + } + + /* Associate it with the session and create the context hook for EMT0. */ + rc = SUPR0SetSessionVM(pSession, pGVM, pVM); + if (RT_SUCCESS(rc)) + { + rc = VMMR0ThreadCtxHookCreateForEmt(&pVM->aCpus[0]); + if (RT_SUCCESS(rc)) + { + /* + * Done! + */ + VBOXVMM_R0_GVMM_VM_CREATED(pGVM, pVM, ProcId, (void *)hEMT0, cCpus); + + GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM); + gvmmR0CreateDestroyUnlock(pGVMM); + + CPUMR0RegisterVCpuThread(&pVM->aCpus[0]); + + *ppVM = pVM; + Log(("GVMMR0CreateVM: pVM=%p pVMR3=%p pGVM=%p hGVM=%d\n", pVM, pVMR3, pGVM, iHandle)); + return VINF_SUCCESS; + } + + SUPR0SetSessionVM(pSession, NULL, NULL); + } + GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM); + } + + RTR0MemObjFree(pGVM->gvmm.s.VMMapObj, false /* fFreeMappings */); + pGVM->gvmm.s.VMMapObj = NIL_RTR0MEMOBJ; + } + RTR0MemObjFree(pGVM->gvmm.s.VMPagesMemObj, false /* fFreeMappings */); + pGVM->gvmm.s.VMPagesMemObj = NIL_RTR0MEMOBJ; + } + RTR0MemObjFree(pGVM->gvmm.s.VMMemObj, false /* fFreeMappings */); + pGVM->gvmm.s.VMMemObj = NIL_RTR0MEMOBJ; + } + } + } + /* else: The user wasn't permitted to create this VM. */ + + /* + * The handle will be freed by gvmmR0HandleObjDestructor as we release the + * object reference here. A little extra mess because of non-recursive lock. 
+ */ + void *pvObj = pHandle->pvObj; + pHandle->pvObj = NULL; + gvmmR0CreateDestroyUnlock(pGVMM); + + SUPR0ObjRelease(pvObj, pSession); + + SUPR0Printf("GVMMR0CreateVM: failed, rc=%d\n", rc); + return rc; + } + + rc = VERR_NO_MEMORY; + } + else + rc = VERR_GVMM_IPE_1; + } + else + rc = VERR_GVM_TOO_MANY_VMS; + + gvmmR0CreateDestroyUnlock(pGVMM); + return rc; +} + + +/** + * Initializes the per VM data belonging to GVMM. + * + * @param pGVM Pointer to the global VM structure. + */ +static void gvmmR0InitPerVMData(PGVM pGVM) +{ + AssertCompile(RT_SIZEOFMEMB(GVM,gvmm.s) <= RT_SIZEOFMEMB(GVM,gvmm.padding)); + AssertCompile(RT_SIZEOFMEMB(GVMCPU,gvmm.s) <= RT_SIZEOFMEMB(GVMCPU,gvmm.padding)); + pGVM->gvmm.s.VMMemObj = NIL_RTR0MEMOBJ; + pGVM->gvmm.s.VMMapObj = NIL_RTR0MEMOBJ; + pGVM->gvmm.s.VMPagesMemObj = NIL_RTR0MEMOBJ; + pGVM->gvmm.s.VMPagesMapObj = NIL_RTR0MEMOBJ; + pGVM->gvmm.s.fDoneVMMR0Init = false; + pGVM->gvmm.s.fDoneVMMR0Term = false; + + for (VMCPUID i = 0; i < pGVM->cCpus; i++) + { + pGVM->aCpus[i].idCpu = i; + pGVM->aCpus[i].gvmm.s.HaltEventMulti = NIL_RTSEMEVENTMULTI; + pGVM->aCpus[i].hEMT = NIL_RTNATIVETHREAD; + pGVM->aCpus[i].pGVM = pGVM; + pGVM->aCpus[i].pVCpu = NULL; + pGVM->aCpus[i].pVM = NULL; + } +} + + +/** + * Does the VM initialization. + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. 
+ */ +GVMMR0DECL(int) GVMMR0InitVM(PGVM pGVM) +{ + LogFlow(("GVMMR0InitVM: pGVM=%p\n", pGVM)); + + int rc = VERR_INTERNAL_ERROR_3; + if ( !pGVM->gvmm.s.fDoneVMMR0Init + && pGVM->aCpus[0].gvmm.s.HaltEventMulti == NIL_RTSEMEVENTMULTI) + { + for (VMCPUID i = 0; i < pGVM->cCpus; i++) + { + rc = RTSemEventMultiCreate(&pGVM->aCpus[i].gvmm.s.HaltEventMulti); + if (RT_FAILURE(rc)) + { + pGVM->aCpus[i].gvmm.s.HaltEventMulti = NIL_RTSEMEVENTMULTI; + break; + } + } + } + else + rc = VERR_WRONG_ORDER; + + LogFlow(("GVMMR0InitVM: returns %Rrc\n", rc)); + return rc; +} + + +/** + * Indicates that we're done with the ring-0 initialization + * of the VM. + * + * @param pGVM The global (ring-0) VM structure. + * @thread EMT(0) + */ +GVMMR0DECL(void) GVMMR0DoneInitVM(PGVM pGVM) +{ + /* Set the indicator. */ + pGVM->gvmm.s.fDoneVMMR0Init = true; +} + + +/** + * Indicates that we're doing the ring-0 termination of the VM. + * + * @returns true if termination hasn't been done already, false if it has. + * @param pGVM Pointer to the global VM structure. Optional. + * @thread EMT(0) or session cleanup thread. + */ +GVMMR0DECL(bool) GVMMR0DoingTermVM(PGVM pGVM) +{ + /* Validate the VM structure, state and handle. */ + AssertPtrReturn(pGVM, false); + + /* Set the indicator. */ + if (pGVM->gvmm.s.fDoneVMMR0Term) + return false; + pGVM->gvmm.s.fDoneVMMR0Term = true; + return true; +} + + +/** + * Destroys the VM, freeing all associated resources (the ring-0 ones anyway). + * + * This is call from the vmR3DestroyFinalBit and from a error path in VMR3Create, + * and the caller is not the EMT thread, unfortunately. For security reasons, it + * would've been nice if the caller was actually the EMT thread or that we somehow + * could've associated the calling thread with the VM up front. + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * + * @thread EMT(0) if it's associated with the VM, otherwise any thread. 
+ */ +GVMMR0DECL(int) GVMMR0DestroyVM(PGVM pGVM, PVM pVM) +{ + LogFlow(("GVMMR0DestroyVM: pGVM=%p pVM=%p\n", pGVM, pVM)); + PGVMM pGVMM; + GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE); + + /* + * Validate the VM structure, state and caller. + */ + AssertPtrReturn(pGVM, VERR_INVALID_POINTER); + AssertPtrReturn(pVM, VERR_INVALID_POINTER); + AssertReturn(!((uintptr_t)pVM & PAGE_OFFSET_MASK), VERR_INVALID_POINTER); + AssertReturn(pGVM->pVM == pVM, VERR_INVALID_POINTER); + AssertMsgReturn(pVM->enmVMState >= VMSTATE_CREATING && pVM->enmVMState <= VMSTATE_TERMINATED, ("%d\n", pVM->enmVMState), + VERR_WRONG_ORDER); + + uint32_t hGVM = pGVM->hSelf; + ASMCompilerBarrier(); + AssertReturn(hGVM != NIL_GVM_HANDLE, VERR_INVALID_VM_HANDLE); + AssertReturn(hGVM < RT_ELEMENTS(pGVMM->aHandles), VERR_INVALID_VM_HANDLE); + + PGVMHANDLE pHandle = &pGVMM->aHandles[hGVM]; + AssertReturn(pHandle->pVM == pVM, VERR_NOT_OWNER); + + RTPROCESS ProcId = RTProcSelf(); + RTNATIVETHREAD hSelf = RTThreadNativeSelf(); + AssertReturn( ( pHandle->hEMT0 == hSelf + && pHandle->ProcId == ProcId) + || pHandle->hEMT0 == NIL_RTNATIVETHREAD, VERR_NOT_OWNER); + + /* + * Lookup the handle and destroy the object. + * Since the lock isn't recursive and we'll have to leave it before dereferencing the + * object, we take some precautions against racing callers just in case... + */ + int rc = gvmmR0CreateDestroyLock(pGVMM); + AssertRC(rc); + + /* Be careful here because we might theoretically be racing someone else cleaning up. */ + if ( pHandle->pVM == pVM + && ( ( pHandle->hEMT0 == hSelf + && pHandle->ProcId == ProcId) + || pHandle->hEMT0 == NIL_RTNATIVETHREAD) + && VALID_PTR(pHandle->pvObj) + && VALID_PTR(pHandle->pSession) + && VALID_PTR(pHandle->pGVM) + && pHandle->pGVM->u32Magic == GVM_MAGIC) + { + /* Check that other EMTs have deregistered. 
*/ + uint32_t cNotDeregistered = 0; + for (VMCPUID idCpu = 1; idCpu < pGVM->cCpus; idCpu++) + cNotDeregistered += pGVM->aCpus[idCpu].hEMT != ~(RTNATIVETHREAD)1; /* see GVMMR0DeregisterVCpu for the value */ + if (cNotDeregistered == 0) + { + /* Grab the object pointer. */ + void *pvObj = pHandle->pvObj; + pHandle->pvObj = NULL; + gvmmR0CreateDestroyUnlock(pGVMM); + + SUPR0ObjRelease(pvObj, pHandle->pSession); + } + else + { + gvmmR0CreateDestroyUnlock(pGVMM); + rc = VERR_GVMM_NOT_ALL_EMTS_DEREGISTERED; + } + } + else + { + SUPR0Printf("GVMMR0DestroyVM: pHandle=%RKv:{.pVM=%p, .hEMT0=%p, .ProcId=%u, .pvObj=%p} pVM=%p hSelf=%p\n", + pHandle, pHandle->pVM, pHandle->hEMT0, pHandle->ProcId, pHandle->pvObj, pVM, hSelf); + gvmmR0CreateDestroyUnlock(pGVMM); + rc = VERR_GVMM_IPE_2; + } + + return rc; +} + + +/** + * Performs VM cleanup task as part of object destruction. + * + * @param pGVM The GVM pointer. + */ +static void gvmmR0CleanupVM(PGVM pGVM) +{ + if ( pGVM->gvmm.s.fDoneVMMR0Init + && !pGVM->gvmm.s.fDoneVMMR0Term) + { + if ( pGVM->gvmm.s.VMMemObj != NIL_RTR0MEMOBJ + && RTR0MemObjAddress(pGVM->gvmm.s.VMMemObj) == pGVM->pVM) + { + LogFlow(("gvmmR0CleanupVM: Calling VMMR0TermVM\n")); + VMMR0TermVM(pGVM, pGVM->pVM, NIL_VMCPUID); + } + else + AssertMsgFailed(("gvmmR0CleanupVM: VMMemObj=%p pVM=%p\n", pGVM->gvmm.s.VMMemObj, pGVM->pVM)); + } + + GMMR0CleanupVM(pGVM); +#ifdef VBOX_WITH_NEM_R0 + NEMR0CleanupVM(pGVM); +#endif + + AssertCompile((uintptr_t)NIL_RTTHREADCTXHOOK == 0); /* Depends on zero initialized memory working for NIL at the moment. */ + for (VMCPUID idCpu = 0; idCpu < pGVM->cCpus; idCpu++) + { + /** @todo Can we busy wait here for all thread-context hooks to be + * deregistered before releasing (destroying) it? Only until we find a + * solution for not deregistering hooks everytime we're leaving HMR0 + * context. 
*/ + VMMR0ThreadCtxHookDestroyForEmt(&pGVM->pVM->aCpus[idCpu]); + } +} + + +/** + * @callback_method_impl{FNSUPDRVDESTRUCTOR,VM handle destructor} + * + * pvUser1 is the GVM instance pointer. + * pvUser2 is the handle pointer. + */ +static DECLCALLBACK(void) gvmmR0HandleObjDestructor(void *pvObj, void *pvUser1, void *pvUser2) +{ + LogFlow(("gvmmR0HandleObjDestructor: %p %p %p\n", pvObj, pvUser1, pvUser2)); + + NOREF(pvObj); + + /* + * Some quick, paranoid, input validation. + */ + PGVMHANDLE pHandle = (PGVMHANDLE)pvUser2; + AssertPtr(pHandle); + PGVMM pGVMM = (PGVMM)pvUser1; + Assert(pGVMM == g_pGVMM); + const uint16_t iHandle = pHandle - &pGVMM->aHandles[0]; + if ( !iHandle + || iHandle >= RT_ELEMENTS(pGVMM->aHandles) + || iHandle != pHandle->iSelf) + { + SUPR0Printf("GVM: handle %d is out of range or corrupt (iSelf=%d)!\n", iHandle, pHandle->iSelf); + return; + } + + int rc = gvmmR0CreateDestroyLock(pGVMM); + AssertRC(rc); + rc = GVMMR0_USED_EXCLUSIVE_LOCK(pGVMM); + AssertRC(rc); + + /* + * This is a tad slow but a doubly linked list is too much hassle. 
+ */ + if (RT_UNLIKELY(pHandle->iNext >= RT_ELEMENTS(pGVMM->aHandles))) + { + SUPR0Printf("GVM: used list index %d is out of range!\n", pHandle->iNext); + GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM); + gvmmR0CreateDestroyUnlock(pGVMM); + return; + } + + if (pGVMM->iUsedHead == iHandle) + pGVMM->iUsedHead = pHandle->iNext; + else + { + uint16_t iPrev = pGVMM->iUsedHead; + int c = RT_ELEMENTS(pGVMM->aHandles) + 2; + while (iPrev) + { + if (RT_UNLIKELY(iPrev >= RT_ELEMENTS(pGVMM->aHandles))) + { + SUPR0Printf("GVM: used list index %d is out of range!\n", iPrev); + GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM); + gvmmR0CreateDestroyUnlock(pGVMM); + return; + } + if (RT_UNLIKELY(c-- <= 0)) + { + iPrev = 0; + break; + } + + if (pGVMM->aHandles[iPrev].iNext == iHandle) + break; + iPrev = pGVMM->aHandles[iPrev].iNext; + } + if (!iPrev) + { + SUPR0Printf("GVM: can't find the handle previous previous of %d!\n", pHandle->iSelf); + GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM); + gvmmR0CreateDestroyUnlock(pGVMM); + return; + } + + Assert(pGVMM->aHandles[iPrev].iNext == iHandle); + pGVMM->aHandles[iPrev].iNext = pHandle->iNext; + } + pHandle->iNext = 0; + pGVMM->cVMs--; + + /* + * Do the global cleanup round. + */ + PGVM pGVM = pHandle->pGVM; + if ( VALID_PTR(pGVM) + && pGVM->u32Magic == GVM_MAGIC) + { + pGVMM->cEMTs -= pGVM->cCpus; + + if (pGVM->pSession) + SUPR0SetSessionVM(pGVM->pSession, NULL, NULL); + + GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM); + + gvmmR0CleanupVM(pGVM); + + /* + * Do the GVMM cleanup - must be done last. + */ + /* The VM and VM pages mappings/allocations. 
*/ + if (pGVM->gvmm.s.VMPagesMapObj != NIL_RTR0MEMOBJ) + { + rc = RTR0MemObjFree(pGVM->gvmm.s.VMPagesMapObj, false /* fFreeMappings */); AssertRC(rc); + pGVM->gvmm.s.VMPagesMapObj = NIL_RTR0MEMOBJ; + } + + if (pGVM->gvmm.s.VMMapObj != NIL_RTR0MEMOBJ) + { + rc = RTR0MemObjFree(pGVM->gvmm.s.VMMapObj, false /* fFreeMappings */); AssertRC(rc); + pGVM->gvmm.s.VMMapObj = NIL_RTR0MEMOBJ; + } + + if (pGVM->gvmm.s.VMPagesMemObj != NIL_RTR0MEMOBJ) + { + rc = RTR0MemObjFree(pGVM->gvmm.s.VMPagesMemObj, false /* fFreeMappings */); AssertRC(rc); + pGVM->gvmm.s.VMPagesMemObj = NIL_RTR0MEMOBJ; + } + + if (pGVM->gvmm.s.VMMemObj != NIL_RTR0MEMOBJ) + { + rc = RTR0MemObjFree(pGVM->gvmm.s.VMMemObj, false /* fFreeMappings */); AssertRC(rc); + pGVM->gvmm.s.VMMemObj = NIL_RTR0MEMOBJ; + } + + for (VMCPUID i = 0; i < pGVM->cCpus; i++) + { + if (pGVM->aCpus[i].gvmm.s.HaltEventMulti != NIL_RTSEMEVENTMULTI) + { + rc = RTSemEventMultiDestroy(pGVM->aCpus[i].gvmm.s.HaltEventMulti); AssertRC(rc); + pGVM->aCpus[i].gvmm.s.HaltEventMulti = NIL_RTSEMEVENTMULTI; + } + } + + /* the GVM structure itself. */ + pGVM->u32Magic |= UINT32_C(0x80000000); + RTMemFree(pGVM); + + /* Re-acquire the UsedLock before freeing the handle since we're updating handle fields. */ + rc = GVMMR0_USED_EXCLUSIVE_LOCK(pGVMM); + AssertRC(rc); + } + /* else: GVMMR0CreateVM cleanup. */ + + /* + * Free the handle. + */ + pHandle->iNext = pGVMM->iFreeHead; + pGVMM->iFreeHead = iHandle; + ASMAtomicWriteNullPtr(&pHandle->pGVM); + ASMAtomicWriteNullPtr(&pHandle->pVM); + ASMAtomicWriteNullPtr(&pHandle->pvObj); + ASMAtomicWriteNullPtr(&pHandle->pSession); + ASMAtomicWriteHandle(&pHandle->hEMT0, NIL_RTNATIVETHREAD); + ASMAtomicWriteU32(&pHandle->ProcId, NIL_RTPROCESS); + + GVMMR0_USED_EXCLUSIVE_UNLOCK(pGVMM); + gvmmR0CreateDestroyUnlock(pGVMM); + LogFlow(("gvmmR0HandleObjDestructor: returns\n")); +} + + +/** + * Registers the calling thread as the EMT of a Virtual CPU. 
+ * + * Note that VCPU 0 is automatically registered during VM creation. + * + * @returns VBox status code + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu VCPU id to register the current thread as. + */ +GVMMR0DECL(int) GVMMR0RegisterVCpu(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ + AssertReturn(idCpu != 0, VERR_INVALID_FUNCTION); + + /* + * Validate the VM structure, state and handle. + */ + PGVMM pGVMM; + int rc = gvmmR0ByGVMandVM(pGVM, pVM, &pGVMM, false /* fTakeUsedLock */); /** @todo take lock here. */ + if (RT_SUCCESS(rc)) + { + if (idCpu < pGVM->cCpus) + { + /* Check that the EMT isn't already assigned to a thread. */ + if (pGVM->aCpus[idCpu].hEMT == NIL_RTNATIVETHREAD) + { + Assert(pVM->aCpus[idCpu].hNativeThreadR0 == NIL_RTNATIVETHREAD); + + /* A thread may only be one EMT. */ + RTNATIVETHREAD const hNativeSelf = RTThreadNativeSelf(); + for (VMCPUID iCpu = 0; iCpu < pGVM->cCpus; iCpu++) + AssertBreakStmt(pGVM->aCpus[iCpu].hEMT != hNativeSelf, rc = VERR_INVALID_PARAMETER); + if (RT_SUCCESS(rc)) + { + /* + * Do the assignment, then try setup the hook. Undo if that fails. + */ + pVM->aCpus[idCpu].hNativeThreadR0 = pGVM->aCpus[idCpu].hEMT = RTThreadNativeSelf(); + + rc = VMMR0ThreadCtxHookCreateForEmt(&pVM->aCpus[idCpu]); + if (RT_SUCCESS(rc)) + CPUMR0RegisterVCpuThread(&pVM->aCpus[idCpu]); + else + pVM->aCpus[idCpu].hNativeThreadR0 = pGVM->aCpus[idCpu].hEMT = NIL_RTNATIVETHREAD; + } + } + else + rc = VERR_ACCESS_DENIED; + } + else + rc = VERR_INVALID_CPU_ID; + } + return rc; +} + + +/** + * Deregisters the calling thread as the EMT of a Virtual CPU. + * + * Note that VCPU 0 shall call GVMMR0DestroyVM intead of this API. + * + * @returns VBox status code + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu VCPU id to register the current thread as. 
+ */ +GVMMR0DECL(int) GVMMR0DeregisterVCpu(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ + AssertReturn(idCpu != 0, VERR_INVALID_FUNCTION); + + /* + * Validate the VM structure, state and handle. + */ + PGVMM pGVMM; + int rc = gvmmR0ByGVMandVMandEMT(pGVM, pVM, idCpu, &pGVMM); + if (RT_SUCCESS(rc)) + { + /* + * Take the destruction lock and recheck the handle state to + * prevent racing GVMMR0DestroyVM. + */ + gvmmR0CreateDestroyLock(pGVMM); + uint32_t hSelf = pGVM->hSelf; + ASMCompilerBarrier(); + if ( hSelf < RT_ELEMENTS(pGVMM->aHandles) + && pGVMM->aHandles[hSelf].pvObj != NULL + && pGVMM->aHandles[hSelf].pGVM == pGVM) + { + /* + * Do per-EMT cleanups. + */ + VMMR0ThreadCtxHookDestroyForEmt(&pVM->aCpus[idCpu]); + + /* + * Invalidate hEMT. We don't use NIL here as that would allow + * GVMMR0RegisterVCpu to be called again, and we don't want that. + */ + AssertCompile(~(RTNATIVETHREAD)1 != NIL_RTNATIVETHREAD); + pGVM->aCpus[idCpu].hEMT = ~(RTNATIVETHREAD)1; + pVM->aCpus[idCpu].hNativeThreadR0 = NIL_RTNATIVETHREAD; + } + + gvmmR0CreateDestroyUnlock(pGVMM); + } + return rc; +} + + +/** + * Lookup a GVM structure by its handle. + * + * @returns The GVM pointer on success, NULL on failure. + * @param hGVM The global VM handle. Asserts on bad handle. + */ +GVMMR0DECL(PGVM) GVMMR0ByHandle(uint32_t hGVM) +{ + PGVMM pGVMM; + GVMM_GET_VALID_INSTANCE(pGVMM, NULL); + + /* + * Validate. + */ + AssertReturn(hGVM != NIL_GVM_HANDLE, NULL); + AssertReturn(hGVM < RT_ELEMENTS(pGVMM->aHandles), NULL); + + /* + * Look it up. + */ + PGVMHANDLE pHandle = &pGVMM->aHandles[hGVM]; + AssertPtrReturn(pHandle->pVM, NULL); + AssertPtrReturn(pHandle->pvObj, NULL); + PGVM pGVM = pHandle->pGVM; + AssertPtrReturn(pGVM, NULL); + AssertReturn(pGVM->pVM == pHandle->pVM, NULL); + + return pHandle->pGVM; +} + + +/** + * Lookup a GVM structure by the shared VM structure. + * + * The calling thread must be in the same process as the VM. 
All current lookups + * are by threads inside the same process, so this will not be an issue. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + * @param ppGVM Where to store the GVM pointer. + * @param ppGVMM Where to store the pointer to the GVMM instance data. + * @param fTakeUsedLock Whether to take the used lock or not. We take it in + * shared mode when requested. + * + * Be very careful if not taking the lock as it's + * possible that the VM will disappear then! + * + * @remark This will not assert on an invalid pVM but try return silently. + */ +static int gvmmR0ByVM(PVM pVM, PGVM *ppGVM, PGVMM *ppGVMM, bool fTakeUsedLock) +{ + RTPROCESS ProcId = RTProcSelf(); + PGVMM pGVMM; + GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE); + + /* + * Validate. + */ + if (RT_UNLIKELY( !VALID_PTR(pVM) + || ((uintptr_t)pVM & PAGE_OFFSET_MASK))) + return VERR_INVALID_POINTER; + if (RT_UNLIKELY( pVM->enmVMState < VMSTATE_CREATING + || pVM->enmVMState >= VMSTATE_TERMINATED)) + return VERR_INVALID_POINTER; + + uint16_t hGVM = pVM->hSelf; + ASMCompilerBarrier(); + if (RT_UNLIKELY( hGVM == NIL_GVM_HANDLE + || hGVM >= RT_ELEMENTS(pGVMM->aHandles))) + return VERR_INVALID_HANDLE; + + /* + * Look it up. 
+ */ + PGVMHANDLE pHandle = &pGVMM->aHandles[hGVM]; + PGVM pGVM; + if (fTakeUsedLock) + { + int rc = GVMMR0_USED_SHARED_LOCK(pGVMM); + AssertRCReturn(rc, rc); + + pGVM = pHandle->pGVM; + if (RT_UNLIKELY( pHandle->pVM != pVM + || pHandle->ProcId != ProcId + || !VALID_PTR(pHandle->pvObj) + || !VALID_PTR(pGVM) + || pGVM->pVM != pVM)) + { + GVMMR0_USED_SHARED_UNLOCK(pGVMM); + return VERR_INVALID_HANDLE; + } + } + else + { + if (RT_UNLIKELY(pHandle->pVM != pVM)) + return VERR_INVALID_HANDLE; + if (RT_UNLIKELY(pHandle->ProcId != ProcId)) + return VERR_INVALID_HANDLE; + if (RT_UNLIKELY(!VALID_PTR(pHandle->pvObj))) + return VERR_INVALID_HANDLE; + + pGVM = pHandle->pGVM; + if (RT_UNLIKELY(!VALID_PTR(pGVM))) + return VERR_INVALID_HANDLE; + if (RT_UNLIKELY(pGVM->pVM != pVM)) + return VERR_INVALID_HANDLE; + } + + *ppGVM = pGVM; + *ppGVMM = pGVMM; + return VINF_SUCCESS; +} + + +/** + * Fast look up a GVM structure by the cross context VM structure. + * + * This is mainly used a glue function, so performance is . + * + * @returns GVM on success, NULL on failure. + * @param pVM The cross context VM structure. ASSUMES to be + * reasonably valid, so we can do fewer checks than in + * gvmmR0ByVM. + * + * @note Do not use this on pVM structures from userland! + */ +GVMMR0DECL(PGVM) GVMMR0FastGetGVMByVM(PVM pVM) +{ + AssertPtr(pVM); + Assert(!((uintptr_t)pVM & PAGE_OFFSET_MASK)); + + PGVMM pGVMM; + GVMM_GET_VALID_INSTANCE(pGVMM, NULL); + + /* + * Validate. + */ + uint16_t hGVM = pVM->hSelf; + ASMCompilerBarrier(); + AssertReturn(hGVM != NIL_GVM_HANDLE && hGVM < RT_ELEMENTS(pGVMM->aHandles), NULL); + + /* + * Look it up and check pVM against the value in the handle and GVM structures. + */ + PGVMHANDLE pHandle = &pGVMM->aHandles[hGVM]; + AssertReturn(pHandle->pVM == pVM, NULL); + + PGVM pGVM = pHandle->pGVM; + AssertPtrReturn(pGVM, NULL); + AssertReturn(pGVM->pVM == pVM, NULL); + + return pGVM; +} + + +/** + * Check that the given GVM and VM structures match up. 
+ * + * The calling thread must be in the same process as the VM. All current lookups + * are by threads inside the same process, so this will not be an issue. + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param ppGVMM Where to store the pointer to the GVMM instance data. + * @param fTakeUsedLock Whether to take the used lock or not. We take it in + * shared mode when requested. + * + * Be very careful if not taking the lock as it's + * possible that the VM will disappear then! + * + * @remark This will not assert on an invalid pVM but try return silently. + */ +static int gvmmR0ByGVMandVM(PGVM pGVM, PVM pVM, PGVMM *ppGVMM, bool fTakeUsedLock) +{ + /* + * Check the pointers. + */ + int rc; + if (RT_LIKELY(RT_VALID_PTR(pGVM))) + { + if (RT_LIKELY( RT_VALID_PTR(pVM) + && ((uintptr_t)pVM & PAGE_OFFSET_MASK) == 0)) + { + if (RT_LIKELY(pGVM->pVM == pVM)) + { + /* + * Get the pGVMM instance and check the VM handle. + */ + PGVMM pGVMM; + GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE); + + uint16_t hGVM = pGVM->hSelf; + if (RT_LIKELY( hGVM != NIL_GVM_HANDLE + && hGVM < RT_ELEMENTS(pGVMM->aHandles))) + { + RTPROCESS const pidSelf = RTProcSelf(); + PGVMHANDLE pHandle = &pGVMM->aHandles[hGVM]; + if (fTakeUsedLock) + { + rc = GVMMR0_USED_SHARED_LOCK(pGVMM); + AssertRCReturn(rc, rc); + } + + if (RT_LIKELY( pHandle->pGVM == pGVM + && pHandle->pVM == pVM + && pHandle->ProcId == pidSelf + && RT_VALID_PTR(pHandle->pvObj))) + { + /* + * Some more VM data consistency checks. 
+ */ + if (RT_LIKELY( pVM->cCpus == pGVM->cCpus + && pVM->hSelf == hGVM + && pVM->enmVMState >= VMSTATE_CREATING + && pVM->enmVMState <= VMSTATE_TERMINATED + && pVM->pVMR0 == pVM)) + { + *ppGVMM = pGVMM; + return VINF_SUCCESS; + } + } + + if (fTakeUsedLock) + GVMMR0_USED_SHARED_UNLOCK(pGVMM); + } + } + rc = VERR_INVALID_VM_HANDLE; + } + else + rc = VERR_INVALID_POINTER; + } + else + rc = VERR_INVALID_POINTER; + return rc; +} + + +/** + * Check that the given GVM and VM structures match up. + * + * The calling thread must be in the same process as the VM. All current lookups + * are by threads inside the same process, so this will not be an issue. + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The (alleged) Virtual CPU ID of the calling EMT. + * @param ppGVMM Where to store the pointer to the GVMM instance data. + * @thread EMT + * + * @remarks This will assert in all failure paths. + */ +static int gvmmR0ByGVMandVMandEMT(PGVM pGVM, PVM pVM, VMCPUID idCpu, PGVMM *ppGVMM) +{ + /* + * Check the pointers. + */ + AssertPtrReturn(pGVM, VERR_INVALID_POINTER); + + AssertPtrReturn(pVM, VERR_INVALID_POINTER); + AssertReturn(((uintptr_t)pVM & PAGE_OFFSET_MASK) == 0, VERR_INVALID_POINTER); + AssertReturn(pGVM->pVM == pVM, VERR_INVALID_VM_HANDLE); + + + /* + * Get the pGVMM instance and check the VM handle. + */ + PGVMM pGVMM; + GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE); + + uint16_t hGVM = pGVM->hSelf; + ASMCompilerBarrier(); + AssertReturn( hGVM != NIL_GVM_HANDLE + && hGVM < RT_ELEMENTS(pGVMM->aHandles), VERR_INVALID_VM_HANDLE); + + RTPROCESS const pidSelf = RTProcSelf(); + PGVMHANDLE pHandle = &pGVMM->aHandles[hGVM]; + AssertReturn( pHandle->pGVM == pGVM + && pHandle->pVM == pVM + && pHandle->ProcId == pidSelf + && RT_VALID_PTR(pHandle->pvObj), + VERR_INVALID_HANDLE); + + /* + * Check the EMT claim. 
+ */ + RTNATIVETHREAD const hAllegedEMT = RTThreadNativeSelf(); + AssertReturn(idCpu < pGVM->cCpus, VERR_INVALID_CPU_ID); + AssertReturn(pGVM->aCpus[idCpu].hEMT == hAllegedEMT, VERR_NOT_OWNER); + + /* + * Some more VM data consistency checks. + */ + AssertReturn(pVM->cCpus == pGVM->cCpus, VERR_INCONSISTENT_VM_HANDLE); + AssertReturn(pVM->hSelf == hGVM, VERR_INCONSISTENT_VM_HANDLE); + AssertReturn(pVM->pVMR0 == pVM, VERR_INCONSISTENT_VM_HANDLE); + AssertReturn( pVM->enmVMState >= VMSTATE_CREATING + && pVM->enmVMState <= VMSTATE_TERMINATED, VERR_INCONSISTENT_VM_HANDLE); + + *ppGVMM = pGVMM; + return VINF_SUCCESS; +} + + +/** + * Validates a GVM/VM pair. + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + */ +GVMMR0DECL(int) GVMMR0ValidateGVMandVM(PGVM pGVM, PVM pVM) +{ + PGVMM pGVMM; + return gvmmR0ByGVMandVM(pGVM, pVM, &pGVMM, false /*fTakeUsedLock*/); +} + + + +/** + * Validates a GVM/VM/EMT combo. + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The Virtual CPU ID of the calling EMT. + * @thread EMT(idCpu) + */ +GVMMR0DECL(int) GVMMR0ValidateGVMandVMandEMT(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ + PGVMM pGVMM; + return gvmmR0ByGVMandVMandEMT(pGVM, pVM, idCpu, &pGVMM); +} + + +/** + * Looks up the VM belonging to the specified EMT thread. + * + * This is used by the assertion machinery in VMMR0.cpp to avoid causing + * unnecessary kernel panics when the EMT thread hits an assertion. The + * call may or not be an EMT thread. + * + * @returns Pointer to the VM on success, NULL on failure. + * @param hEMT The native thread handle of the EMT. + * NIL_RTNATIVETHREAD means the current thread + */ +GVMMR0DECL(PVM) GVMMR0GetVMByEMT(RTNATIVETHREAD hEMT) +{ + /* + * No Assertions here as we're usually called in a AssertMsgN or + * RTAssert* context. 
+ */ + PGVMM pGVMM = g_pGVMM; + if ( !VALID_PTR(pGVMM) + || pGVMM->u32Magic != GVMM_MAGIC) + return NULL; + + if (hEMT == NIL_RTNATIVETHREAD) + hEMT = RTThreadNativeSelf(); + RTPROCESS ProcId = RTProcSelf(); + + /* + * Search the handles in a linear fashion as we don't dare to take the lock (assert). + */ +/** @todo introduce some pid hash table here, please. */ + for (unsigned i = 1; i < RT_ELEMENTS(pGVMM->aHandles); i++) + { + if ( pGVMM->aHandles[i].iSelf == i + && pGVMM->aHandles[i].ProcId == ProcId + && VALID_PTR(pGVMM->aHandles[i].pvObj) + && VALID_PTR(pGVMM->aHandles[i].pVM) + && VALID_PTR(pGVMM->aHandles[i].pGVM)) + { + if (pGVMM->aHandles[i].hEMT0 == hEMT) + return pGVMM->aHandles[i].pVM; + + /* This is fearly safe with the current process per VM approach. */ + PGVM pGVM = pGVMM->aHandles[i].pGVM; + VMCPUID const cCpus = pGVM->cCpus; + ASMCompilerBarrier(); + if ( cCpus < 1 + || cCpus > VMM_MAX_CPU_COUNT) + continue; + for (VMCPUID idCpu = 1; idCpu < cCpus; idCpu++) + if (pGVM->aCpus[idCpu].hEMT == hEMT) + return pGVMM->aHandles[i].pVM; + } + } + return NULL; +} + + +/** + * Looks up the GVMCPU belonging to the specified EMT thread. + * + * This is used by the assertion machinery in VMMR0.cpp to avoid causing + * unnecessary kernel panics when the EMT thread hits an assertion. The + * call may or not be an EMT thread. + * + * @returns Pointer to the VM on success, NULL on failure. + * @param hEMT The native thread handle of the EMT. + * NIL_RTNATIVETHREAD means the current thread + */ +GVMMR0DECL(PGVMCPU) GVMMR0GetGVCpuByEMT(RTNATIVETHREAD hEMT) +{ + /* + * No Assertions here as we're usually called in a AssertMsgN, + * RTAssert*, Log and LogRel contexts. + */ + PGVMM pGVMM = g_pGVMM; + if ( !VALID_PTR(pGVMM) + || pGVMM->u32Magic != GVMM_MAGIC) + return NULL; + + if (hEMT == NIL_RTNATIVETHREAD) + hEMT = RTThreadNativeSelf(); + RTPROCESS ProcId = RTProcSelf(); + + /* + * Search the handles in a linear fashion as we don't dare to take the lock (assert). 
+ */ +/** @todo introduce some pid hash table here, please. */ + for (unsigned i = 1; i < RT_ELEMENTS(pGVMM->aHandles); i++) + { + if ( pGVMM->aHandles[i].iSelf == i + && pGVMM->aHandles[i].ProcId == ProcId + && VALID_PTR(pGVMM->aHandles[i].pvObj) + && VALID_PTR(pGVMM->aHandles[i].pVM) + && VALID_PTR(pGVMM->aHandles[i].pGVM)) + { + PGVM pGVM = pGVMM->aHandles[i].pGVM; + if (pGVMM->aHandles[i].hEMT0 == hEMT) + return &pGVM->aCpus[0]; + + /* This is fearly safe with the current process per VM approach. */ + VMCPUID const cCpus = pGVM->cCpus; + ASMCompilerBarrier(); + ASMCompilerBarrier(); + if ( cCpus < 1 + || cCpus > VMM_MAX_CPU_COUNT) + continue; + for (VMCPUID idCpu = 1; idCpu < cCpus; idCpu++) + if (pGVM->aCpus[idCpu].hEMT == hEMT) + return &pGVM->aCpus[idCpu]; + } + } + return NULL; +} + + +/** + * This is will wake up expired and soon-to-be expired VMs. + * + * @returns Number of VMs that has been woken up. + * @param pGVMM Pointer to the GVMM instance data. + * @param u64Now The current time. + */ +static unsigned gvmmR0SchedDoWakeUps(PGVMM pGVMM, uint64_t u64Now) +{ + /* + * Skip this if we've got disabled because of high resolution wakeups or by + * the user. + */ + if (!pGVMM->fDoEarlyWakeUps) + return 0; + +/** @todo Rewrite this algorithm. See performance defect XYZ. */ + + /* + * A cheap optimization to stop wasting so much time here on big setups. + */ + const uint64_t uNsEarlyWakeUp2 = u64Now + pGVMM->nsEarlyWakeUp2; + if ( pGVMM->cHaltedEMTs == 0 + || uNsEarlyWakeUp2 > pGVMM->uNsNextEmtWakeup) + return 0; + + /* + * Only one thread doing this at a time. + */ + if (!ASMAtomicCmpXchgBool(&pGVMM->fDoingEarlyWakeUps, true, false)) + return 0; + + /* + * The first pass will wake up VMs which have actually expired + * and look for VMs that should be woken up in the 2nd and 3rd passes. 
+ */ + const uint64_t uNsEarlyWakeUp1 = u64Now + pGVMM->nsEarlyWakeUp1; + uint64_t u64Min = UINT64_MAX; + unsigned cWoken = 0; + unsigned cHalted = 0; + unsigned cTodo2nd = 0; + unsigned cTodo3rd = 0; + for (unsigned i = pGVMM->iUsedHead, cGuard = 0; + i != NIL_GVM_HANDLE && i < RT_ELEMENTS(pGVMM->aHandles); + i = pGVMM->aHandles[i].iNext) + { + PGVM pCurGVM = pGVMM->aHandles[i].pGVM; + if ( VALID_PTR(pCurGVM) + && pCurGVM->u32Magic == GVM_MAGIC) + { + for (VMCPUID idCpu = 0; idCpu < pCurGVM->cCpus; idCpu++) + { + PGVMCPU pCurGVCpu = &pCurGVM->aCpus[idCpu]; + uint64_t u64 = ASMAtomicUoReadU64(&pCurGVCpu->gvmm.s.u64HaltExpire); + if (u64) + { + if (u64 <= u64Now) + { + if (ASMAtomicXchgU64(&pCurGVCpu->gvmm.s.u64HaltExpire, 0)) + { + int rc = RTSemEventMultiSignal(pCurGVCpu->gvmm.s.HaltEventMulti); + AssertRC(rc); + cWoken++; + } + } + else + { + cHalted++; + if (u64 <= uNsEarlyWakeUp1) + cTodo2nd++; + else if (u64 <= uNsEarlyWakeUp2) + cTodo3rd++; + else if (u64 < u64Min) + u64Min = u64; /* Track the earliest expiry lying beyond both early wake-up windows; it is stored in uNsNextEmtWakeup at the end. (Was 'u64 = u64Min', a dead store that left u64Min at UINT64_MAX.) NOTE(review): confirm the direction of the 'uNsEarlyWakeUp2 > uNsNextEmtWakeup' early-out at the top of this function against this value. 
*/ + } + } + } + } + AssertLogRelBreak(cGuard++ < RT_ELEMENTS(pGVMM->aHandles)); + } + + if (cTodo2nd) + { + for (unsigned i = pGVMM->iUsedHead, cGuard = 0; + i != NIL_GVM_HANDLE && i < RT_ELEMENTS(pGVMM->aHandles); + i = pGVMM->aHandles[i].iNext) + { + PGVM pCurGVM = pGVMM->aHandles[i].pGVM; + if ( VALID_PTR(pCurGVM) + && pCurGVM->u32Magic == GVM_MAGIC) + { + for (VMCPUID idCpu = 0; idCpu < pCurGVM->cCpus; idCpu++) + { + PGVMCPU pCurGVCpu = &pCurGVM->aCpus[idCpu]; + uint64_t u64 = ASMAtomicUoReadU64(&pCurGVCpu->gvmm.s.u64HaltExpire); + if ( u64 + && u64 <= uNsEarlyWakeUp1) + { + if (ASMAtomicXchgU64(&pCurGVCpu->gvmm.s.u64HaltExpire, 0)) + { + int rc = RTSemEventMultiSignal(pCurGVCpu->gvmm.s.HaltEventMulti); + AssertRC(rc); + cWoken++; + } + } + } + } + AssertLogRelBreak(cGuard++ < RT_ELEMENTS(pGVMM->aHandles)); + } + } + + if (cTodo3rd) + { + for (unsigned i = pGVMM->iUsedHead, cGuard = 0; + i != NIL_GVM_HANDLE && i < RT_ELEMENTS(pGVMM->aHandles); + i = pGVMM->aHandles[i].iNext) + { +
PGVM pCurGVM = pGVMM->aHandles[i].pGVM; + if ( VALID_PTR(pCurGVM) + && pCurGVM->u32Magic == GVM_MAGIC) + { + for (VMCPUID idCpu = 0; idCpu < pCurGVM->cCpus; idCpu++) + { + PGVMCPU pCurGVCpu = &pCurGVM->aCpus[idCpu]; + uint64_t u64 = ASMAtomicUoReadU64(&pCurGVCpu->gvmm.s.u64HaltExpire); + if ( u64 + && u64 <= uNsEarlyWakeUp2) + { + if (ASMAtomicXchgU64(&pCurGVCpu->gvmm.s.u64HaltExpire, 0)) + { + int rc = RTSemEventMultiSignal(pCurGVCpu->gvmm.s.HaltEventMulti); + AssertRC(rc); + cWoken++; + } + } + } + } + AssertLogRelBreak(cGuard++ < RT_ELEMENTS(pGVMM->aHandles)); + } + } + + /* + * Set the minimum value. + */ + pGVMM->uNsNextEmtWakeup = u64Min; + + ASMAtomicWriteBool(&pGVMM->fDoingEarlyWakeUps, false); + return cWoken; +} + + +/** + * Halt the EMT thread. + * + * @returns VINF_SUCCESS normal wakeup (timeout or kicked by other thread). + * VERR_INTERRUPTED if a signal was scheduled for the thread. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param pGVCpu The global (ring-0) CPU structure of the calling + * EMT. + * @param u64ExpireGipTime The time for the sleep to expire expressed as GIP time. + * @thread EMT(pGVCpu). + */ +GVMMR0DECL(int) GVMMR0SchedHalt(PGVM pGVM, PVM pVM, PGVMCPU pGVCpu, uint64_t u64ExpireGipTime) +{ + LogFlow(("GVMMR0SchedHalt: pGVM=%p pVM=%p pGVCpu=%p(%d) u64ExpireGipTime=%#RX64\n", + pGVM, pVM, pGVCpu, pGVCpu->idCpu, u64ExpireGipTime)); + GVMM_CHECK_SMAP_SETUP(); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + + PGVMM pGVMM; + GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE); + + pGVM->gvmm.s.StatsSched.cHaltCalls++; + Assert(!pGVCpu->gvmm.s.u64HaltExpire); + + /* + * If we're doing early wake-ups, we must take the UsedList lock before we + * start querying the current time. + * Note! Interrupts must NOT be disabled at this point because we ask for GIP time! 
+ */ + bool const fDoEarlyWakeUps = pGVMM->fDoEarlyWakeUps; + if (fDoEarlyWakeUps) + { + int rc2 = GVMMR0_USED_SHARED_LOCK(pGVMM); AssertRC(rc2); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + } + + pGVCpu->gvmm.s.iCpuEmt = ASMGetApicId(); + + /* GIP hack: We might are frequently sleeping for short intervals where the + difference between GIP and system time matters on systems with high resolution + system time. So, convert the input from GIP to System time in that case. */ + Assert(ASMGetFlags() & X86_EFL_IF); + const uint64_t u64NowSys = RTTimeSystemNanoTS(); + const uint64_t u64NowGip = RTTimeNanoTS(); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + + if (fDoEarlyWakeUps) + { + pGVM->gvmm.s.StatsSched.cHaltWakeUps += gvmmR0SchedDoWakeUps(pGVMM, u64NowGip); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + } + + /* + * Go to sleep if we must... + * Cap the sleep time to 1 second to be on the safe side. + */ + int rc; + uint64_t cNsInterval = u64ExpireGipTime - u64NowGip; + if ( u64NowGip < u64ExpireGipTime + && cNsInterval >= (pGVMM->cEMTs > pGVMM->cEMTsMeansCompany + ? pGVMM->nsMinSleepCompany + : pGVMM->nsMinSleepAlone)) + { + pGVM->gvmm.s.StatsSched.cHaltBlocking++; + if (cNsInterval > RT_NS_1SEC) + u64ExpireGipTime = u64NowGip + RT_NS_1SEC; + ASMAtomicWriteU64(&pGVCpu->gvmm.s.u64HaltExpire, u64ExpireGipTime); + ASMAtomicIncU32(&pGVMM->cHaltedEMTs); + if (fDoEarlyWakeUps) + { + if (u64ExpireGipTime < pGVMM->uNsNextEmtWakeup) + pGVMM->uNsNextEmtWakeup = u64ExpireGipTime; + GVMMR0_USED_SHARED_UNLOCK(pGVMM); + } + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + + rc = RTSemEventMultiWaitEx(pGVCpu->gvmm.s.HaltEventMulti, + RTSEMWAIT_FLAGS_ABSOLUTE | RTSEMWAIT_FLAGS_NANOSECS | RTSEMWAIT_FLAGS_INTERRUPTIBLE, + u64NowGip > u64NowSys ? u64ExpireGipTime : u64NowSys + cNsInterval); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + + ASMAtomicWriteU64(&pGVCpu->gvmm.s.u64HaltExpire, 0); + ASMAtomicDecU32(&pGVMM->cHaltedEMTs); + + /* Reset the semaphore to try prevent a few false wake-ups. 
*/ + if (rc == VINF_SUCCESS) + { + RTSemEventMultiReset(pGVCpu->gvmm.s.HaltEventMulti); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + } + else if (rc == VERR_TIMEOUT) + { + pGVM->gvmm.s.StatsSched.cHaltTimeouts++; + rc = VINF_SUCCESS; + } + } + else + { + pGVM->gvmm.s.StatsSched.cHaltNotBlocking++; + if (fDoEarlyWakeUps) + GVMMR0_USED_SHARED_UNLOCK(pGVMM); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + RTSemEventMultiReset(pGVCpu->gvmm.s.HaltEventMulti); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + rc = VINF_SUCCESS; + } + + return rc; +} + + +/** + * Halt the EMT thread. + * + * @returns VINF_SUCCESS normal wakeup (timeout or kicked by other thread). + * VERR_INTERRUPTED if a signal was scheduled for the thread. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The Virtual CPU ID of the calling EMT. + * @param u64ExpireGipTime The time for the sleep to expire expressed as GIP time. + * @thread EMT(idCpu). + */ +GVMMR0DECL(int) GVMMR0SchedHaltReq(PGVM pGVM, PVM pVM, VMCPUID idCpu, uint64_t u64ExpireGipTime) +{ + GVMM_CHECK_SMAP_SETUP(); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + PGVMM pGVMM; + int rc = gvmmR0ByGVMandVMandEMT(pGVM, pVM, idCpu, &pGVMM); + if (RT_SUCCESS(rc)) + { + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + rc = GVMMR0SchedHalt(pGVM, pVM, &pGVM->aCpus[idCpu], u64ExpireGipTime); + } + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + return rc; +} + + + +/** + * Worker for GVMMR0SchedWakeUp and GVMMR0SchedWakeUpAndPokeCpus that wakes up + * the a sleeping EMT. + * + * @retval VINF_SUCCESS if successfully woken up. + * @retval VINF_GVM_NOT_BLOCKED if the EMT wasn't blocked. + * + * @param pGVM The global (ring-0) VM structure. + * @param pGVCpu The global (ring-0) VCPU structure. + */ +DECLINLINE(int) gvmmR0SchedWakeUpOne(PGVM pGVM, PGVMCPU pGVCpu) +{ + pGVM->gvmm.s.StatsSched.cWakeUpCalls++; + + /* + * Signal the semaphore regardless of whether it's current blocked on it. 
+ * + * The reason for this is that there is absolutely no way we can be 100% + * certain that it isn't *about* go to go to sleep on it and just got + * delayed a bit en route. So, we will always signal the semaphore when + * the it is flagged as halted in the VMM. + */ +/** @todo we can optimize some of that by means of the pVCpu->enmState now. */ + int rc; + if (pGVCpu->gvmm.s.u64HaltExpire) + { + rc = VINF_SUCCESS; + ASMAtomicWriteU64(&pGVCpu->gvmm.s.u64HaltExpire, 0); + } + else + { + rc = VINF_GVM_NOT_BLOCKED; + pGVM->gvmm.s.StatsSched.cWakeUpNotHalted++; + } + + int rc2 = RTSemEventMultiSignal(pGVCpu->gvmm.s.HaltEventMulti); + AssertRC(rc2); + + return rc; +} + + +/** + * Wakes up the halted EMT thread so it can service a pending request. + * + * @returns VBox status code. + * @retval VINF_SUCCESS if successfully woken up. + * @retval VINF_GVM_NOT_BLOCKED if the EMT wasn't blocked. + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The Virtual CPU ID of the EMT to wake up. + * @param fTakeUsedLock Take the used lock or not + * @thread Any but EMT(idCpu). + */ +GVMMR0DECL(int) GVMMR0SchedWakeUpEx(PGVM pGVM, PVM pVM, VMCPUID idCpu, bool fTakeUsedLock) +{ + GVMM_CHECK_SMAP_SETUP(); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + + /* + * Validate input and take the UsedLock. + */ + PGVMM pGVMM; + int rc = gvmmR0ByGVMandVM(pGVM, pVM, &pGVMM, fTakeUsedLock); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + if (RT_SUCCESS(rc)) + { + if (idCpu < pGVM->cCpus) + { + /* + * Do the actual job. + */ + rc = gvmmR0SchedWakeUpOne(pGVM, &pGVM->aCpus[idCpu]); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + + if (fTakeUsedLock && pGVMM->fDoEarlyWakeUps) + { + /* + * While we're here, do a round of scheduling. 
+ */ + Assert(ASMGetFlags() & X86_EFL_IF); + const uint64_t u64Now = RTTimeNanoTS(); /* (GIP time) */ + pGVM->gvmm.s.StatsSched.cWakeUpWakeUps += gvmmR0SchedDoWakeUps(pGVMM, u64Now); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + } + } + else + rc = VERR_INVALID_CPU_ID; + + if (fTakeUsedLock) + { + int rc2 = GVMMR0_USED_SHARED_UNLOCK(pGVMM); + AssertRC(rc2); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + } + } + + LogFlow(("GVMMR0SchedWakeUpEx: returns %Rrc\n", rc)); + return rc; +} + + +/** + * Wakes up the halted EMT thread so it can service a pending request. + * + * @returns VBox status code. + * @retval VINF_SUCCESS if successfully woken up. + * @retval VINF_GVM_NOT_BLOCKED if the EMT wasn't blocked. + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The Virtual CPU ID of the EMT to wake up. + * @thread Any but EMT(idCpu). + */ +GVMMR0DECL(int) GVMMR0SchedWakeUp(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ + return GVMMR0SchedWakeUpEx(pGVM, pVM, idCpu, true /* fTakeUsedLock */); +} + + +/** + * Wakes up the halted EMT thread so it can service a pending request, no GVM + * parameter and no used locking. + * + * @returns VBox status code. + * @retval VINF_SUCCESS if successfully woken up. + * @retval VINF_GVM_NOT_BLOCKED if the EMT wasn't blocked. + * + * @param pVM The cross context VM structure. + * @param idCpu The Virtual CPU ID of the EMT to wake up. + * @thread Any but EMT(idCpu). + * @deprecated Don't use in new code if possible! Use the GVM variant. 
+ */ +GVMMR0DECL(int) GVMMR0SchedWakeUpNoGVMNoLock(PVM pVM, VMCPUID idCpu) +{ + GVMM_CHECK_SMAP_SETUP(); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + PGVM pGVM; + PGVMM pGVMM; + int rc = gvmmR0ByVM(pVM, &pGVM, &pGVMM, false /*fTakeUsedLock*/); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + if (RT_SUCCESS(rc)) + rc = GVMMR0SchedWakeUpEx(pGVM, pVM, idCpu, false /*fTakeUsedLock*/); + return rc; +} + + +/** + * Worker common to GVMMR0SchedPoke and GVMMR0SchedWakeUpAndPokeCpus that pokes + * the Virtual CPU if it's still busy executing guest code. + * + * @returns VBox status code. + * @retval VINF_SUCCESS if poked successfully. + * @retval VINF_GVM_NOT_BUSY_IN_GC if the EMT wasn't busy in GC. + * + * @param pGVM The global (ring-0) VM structure. + * @param pVCpu The cross context virtual CPU structure. + */ +DECLINLINE(int) gvmmR0SchedPokeOne(PGVM pGVM, PVMCPU pVCpu) +{ + pGVM->gvmm.s.StatsSched.cPokeCalls++; + + RTCPUID idHostCpu = pVCpu->idHostCpu; + if ( idHostCpu == NIL_RTCPUID + || VMCPU_GET_STATE(pVCpu) != VMCPUSTATE_STARTED_EXEC) + { + pGVM->gvmm.s.StatsSched.cPokeNotBusy++; + return VINF_GVM_NOT_BUSY_IN_GC; + } + + /* Note: this function is not implemented on Darwin and Linux (kernel < 2.6.19) */ + RTMpPokeCpu(idHostCpu); + return VINF_SUCCESS; +} + + +/** + * Pokes an EMT if it's still busy running guest code. + * + * @returns VBox status code. + * @retval VINF_SUCCESS if poked successfully. + * @retval VINF_GVM_NOT_BUSY_IN_GC if the EMT wasn't busy in GC. + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The ID of the virtual CPU to poke. + * @param fTakeUsedLock Take the used lock or not + */ +GVMMR0DECL(int) GVMMR0SchedPokeEx(PGVM pGVM, PVM pVM, VMCPUID idCpu, bool fTakeUsedLock) +{ + /* + * Validate input and take the UsedLock. 
+ */ + PGVMM pGVMM; + int rc = gvmmR0ByGVMandVM(pGVM, pVM, &pGVMM, fTakeUsedLock); + if (RT_SUCCESS(rc)) + { + if (idCpu < pGVM->cCpus) + rc = gvmmR0SchedPokeOne(pGVM, &pVM->aCpus[idCpu]); + else + rc = VERR_INVALID_CPU_ID; + + if (fTakeUsedLock) + { + int rc2 = GVMMR0_USED_SHARED_UNLOCK(pGVMM); + AssertRC(rc2); + } + } + + LogFlow(("GVMMR0SchedPokeEx: returns %Rrc\n", rc)); /* log under this function's own name */ + return rc; +} + + +/** + * Pokes an EMT if it's still busy running guest code. + * + * @returns VBox status code. + * @retval VINF_SUCCESS if poked successfully. + * @retval VINF_GVM_NOT_BUSY_IN_GC if the EMT wasn't busy in GC. + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The ID of the virtual CPU to poke. + */ +GVMMR0DECL(int) GVMMR0SchedPoke(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ + return GVMMR0SchedPokeEx(pGVM, pVM, idCpu, true /* fTakeUsedLock */); +} + + +/** + * Pokes an EMT if it's still busy running guest code, no GVM parameter and no + * used locking. + * + * @returns VBox status code. + * @retval VINF_SUCCESS if poked successfully. + * @retval VINF_GVM_NOT_BUSY_IN_GC if the EMT wasn't busy in GC. + * + * @param pVM The cross context VM structure. + * @param idCpu The ID of the virtual CPU to poke. + * + * @deprecated Don't use in new code if possible! Use the GVM variant. + */ +GVMMR0DECL(int) GVMMR0SchedPokeNoGVMNoLock(PVM pVM, VMCPUID idCpu) +{ + PGVM pGVM; + PGVMM pGVMM; + int rc = gvmmR0ByVM(pVM, &pGVM, &pGVMM, false /*fTakeUsedLock*/); + if (RT_SUCCESS(rc)) + { + if (idCpu < pGVM->cCpus) + rc = gvmmR0SchedPokeOne(pGVM, &pVM->aCpus[idCpu]); + else + rc = VERR_INVALID_CPU_ID; + } + return rc; +} + + +/** + * Wakes up a set of halted EMT threads so they can service pending request. + * + * @returns VBox status code, no informational stuff. + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param pSleepSet The set of sleepers to wake up. 
+ * @param pPokeSet The set of CPUs to poke. + */ +GVMMR0DECL(int) GVMMR0SchedWakeUpAndPokeCpus(PGVM pGVM, PVM pVM, PCVMCPUSET pSleepSet, PCVMCPUSET pPokeSet) +{ + AssertPtrReturn(pSleepSet, VERR_INVALID_POINTER); + AssertPtrReturn(pPokeSet, VERR_INVALID_POINTER); + GVMM_CHECK_SMAP_SETUP(); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + RTNATIVETHREAD hSelf = RTThreadNativeSelf(); + + /* + * Validate input and take the UsedLock. + */ + PGVMM pGVMM; + int rc = gvmmR0ByGVMandVM(pGVM, pVM, &pGVMM, true /* fTakeUsedLock */); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + if (RT_SUCCESS(rc)) + { + rc = VINF_SUCCESS; + VMCPUID idCpu = pGVM->cCpus; + while (idCpu-- > 0) + { + /* Don't try poke or wake up ourselves. */ + if (pGVM->aCpus[idCpu].hEMT == hSelf) + continue; + + /* just ignore errors for now. */ + if (VMCPUSET_IS_PRESENT(pSleepSet, idCpu)) + { + gvmmR0SchedWakeUpOne(pGVM, &pGVM->aCpus[idCpu]); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + } + else if (VMCPUSET_IS_PRESENT(pPokeSet, idCpu)) + { + gvmmR0SchedPokeOne(pGVM, &pVM->aCpus[idCpu]); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + } + } + + int rc2 = GVMMR0_USED_SHARED_UNLOCK(pGVMM); + AssertRC(rc2); + GVMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + } + + LogFlow(("GVMMR0SchedWakeUpAndPokeCpus: returns %Rrc\n", rc)); + return rc; +} + + +/** + * VMMR0 request wrapper for GVMMR0SchedWakeUpAndPokeCpus. + * + * @returns see GVMMR0SchedWakeUpAndPokeCpus. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param pReq Pointer to the request packet. + */ +GVMMR0DECL(int) GVMMR0SchedWakeUpAndPokeCpusReq(PGVM pGVM, PVM pVM, PGVMMSCHEDWAKEUPANDPOKECPUSREQ pReq) +{ + /* + * Validate input and pass it on. 
+ */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER); + + return GVMMR0SchedWakeUpAndPokeCpus(pGVM, pVM, &pReq->SleepSet, &pReq->PokeSet); +} + + + +/** + * Poll the schedule to see if someone else should get a chance to run. + * + * This is a bit hackish and will not work too well if the machine is + * under heavy load from non-VM processes. + * + * @returns VINF_SUCCESS if not yielded. + * VINF_GVM_YIELDED if an attempt to switch to a different VM task was made. VERR_NOT_IMPLEMENTED if @a fYield is true (yielding is not implemented yet). + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The Virtual CPU ID of the calling EMT. + * @param fYield Whether to yield or not. + * This is for when we're spinning in the halt loop. + * @thread EMT(idCpu). + */ +GVMMR0DECL(int) GVMMR0SchedPoll(PGVM pGVM, PVM pVM, VMCPUID idCpu, bool fYield) +{ + /* + * Validate input. + */ + PGVMM pGVMM; + int rc = gvmmR0ByGVMandVMandEMT(pGVM, pVM, idCpu, &pGVMM); + if (RT_SUCCESS(rc)) + { + /* + * We currently only implement helping doing wakeups (fYield = false), so don't + * bother taking the lock if gvmmR0SchedDoWakeUps is not going to do anything. + */ + if (!fYield && pGVMM->fDoEarlyWakeUps) + { + rc = GVMMR0_USED_SHARED_LOCK(pGVMM); AssertRC(rc); + pGVM->gvmm.s.StatsSched.cPollCalls++; + + Assert(ASMGetFlags() & X86_EFL_IF); + const uint64_t u64Now = RTTimeNanoTS(); /* (GIP time) */ + + pGVM->gvmm.s.StatsSched.cPollWakeUps += gvmmR0SchedDoWakeUps(pGVMM, u64Now); + + GVMMR0_USED_SHARED_UNLOCK(pGVMM); + } + /* + * Not quite sure what we could do here... + */ + else if (fYield) + rc = VERR_NOT_IMPLEMENTED; /** @todo implement this... 
*/ + else + rc = VINF_SUCCESS; + } + + LogFlow(("GVMMR0SchedPoll: returns %Rrc\n", rc)); /* log under this function's own name */ + return rc; +} + + +#ifdef GVMM_SCHED_WITH_PPT +/** + * Timer callback for the periodic preemption timer. + * + * @param pTimer The timer handle. 
+ * @param pvUser Pointer to the per cpu structure. + * @param iTick The current tick. + */ +static DECLCALLBACK(void) gvmmR0SchedPeriodicPreemptionTimerCallback(PRTTIMER pTimer, void *pvUser, uint64_t iTick) +{ + PGVMMHOSTCPU pCpu = (PGVMMHOSTCPU)pvUser; + NOREF(pTimer); NOREF(iTick); + + /* + * Termination check + */ + if (pCpu->u32Magic != GVMMHOSTCPU_MAGIC) + return; + + /* + * Do the house keeping. + */ + RTSpinlockAcquire(pCpu->Ppt.hSpinlock); + + if (++pCpu->Ppt.iTickHistorization >= pCpu->Ppt.cTicksHistoriziationInterval) + { + /* + * Historicize the max frequency. + */ + uint32_t iHzHistory = ++pCpu->Ppt.iHzHistory % RT_ELEMENTS(pCpu->Ppt.aHzHistory); + pCpu->Ppt.aHzHistory[iHzHistory] = pCpu->Ppt.uDesiredHz; + pCpu->Ppt.iTickHistorization = 0; + pCpu->Ppt.uDesiredHz = 0; + + /* + * Check if the current timer frequency. + */ + uint32_t uHistMaxHz = 0; + for (uint32_t i = 0; i < RT_ELEMENTS(pCpu->Ppt.aHzHistory); i++) + if (pCpu->Ppt.aHzHistory[i] > uHistMaxHz) + uHistMaxHz = pCpu->Ppt.aHzHistory[i]; + if (uHistMaxHz == pCpu->Ppt.uTimerHz) + RTSpinlockRelease(pCpu->Ppt.hSpinlock); + else if (uHistMaxHz) + { + /* + * Reprogram it. + */ + pCpu->Ppt.cChanges++; + pCpu->Ppt.iTickHistorization = 0; + pCpu->Ppt.uTimerHz = uHistMaxHz; + uint32_t const cNsInterval = RT_NS_1SEC / uHistMaxHz; + pCpu->Ppt.cNsInterval = cNsInterval; + if (cNsInterval < GVMMHOSTCPU_PPT_HIST_INTERVAL_NS) + pCpu->Ppt.cTicksHistoriziationInterval = ( GVMMHOSTCPU_PPT_HIST_INTERVAL_NS + + GVMMHOSTCPU_PPT_HIST_INTERVAL_NS / 2 - 1) + / cNsInterval; + else + pCpu->Ppt.cTicksHistoriziationInterval = 1; + RTSpinlockRelease(pCpu->Ppt.hSpinlock); + + /*SUPR0Printf("Cpu%u: change to %u Hz / %u ns\n", pCpu->idxCpuSet, uHistMaxHz, cNsInterval);*/ + RTTimerChangeInterval(pTimer, cNsInterval); + } + else + { + /* + * Stop it. 
+ */ + pCpu->Ppt.fStarted = false; + pCpu->Ppt.uTimerHz = 0; + pCpu->Ppt.cNsInterval = 0; + RTSpinlockRelease(pCpu->Ppt.hSpinlock); + + /*SUPR0Printf("Cpu%u: stopping (%u Hz)\n", pCpu->idxCpuSet, uHistMaxHz);*/ + RTTimerStop(pTimer); + } + } + else + RTSpinlockRelease(pCpu->Ppt.hSpinlock); +} +#endif /* GVMM_SCHED_WITH_PPT */ + + +/** + * Updates the periodic preemption timer for the calling CPU. + * + * The caller must have disabled preemption! + * The caller must check that the host can do high resolution timers. + * + * @param pVM The cross context VM structure. + * @param idHostCpu The current host CPU id. + * @param uHz The desired frequency. + */ +GVMMR0DECL(void) GVMMR0SchedUpdatePeriodicPreemptionTimer(PVM pVM, RTCPUID idHostCpu, uint32_t uHz) +{ + NOREF(pVM); +#ifdef GVMM_SCHED_WITH_PPT + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(RTTimerCanDoHighResolution()); + + /* + * Resolve the per CPU data. + */ + uint32_t iCpu = RTMpCpuIdToSetIndex(idHostCpu); + PGVMM pGVMM = g_pGVMM; + if ( !VALID_PTR(pGVMM) + || pGVMM->u32Magic != GVMM_MAGIC) + return; + AssertMsgReturnVoid(iCpu < pGVMM->cHostCpus, ("iCpu=%d cHostCpus=%d\n", iCpu, pGVMM->cHostCpus)); + PGVMMHOSTCPU pCpu = &pGVMM->aHostCpus[iCpu]; + AssertMsgReturnVoid( pCpu->u32Magic == GVMMHOSTCPU_MAGIC + && pCpu->idCpu == idHostCpu, + ("u32Magic=%#x idCpu=%d idHostCpu=%d\n", pCpu->u32Magic, pCpu->idCpu, idHostCpu)); + + /* + * Check whether we need to do anything about the timer. + * We have to be a little bit careful since we might be racing the timer + * callback here. + */ + if (uHz > 16384) + uHz = 16384; /** @todo add a query method for this! 
*/ + if (RT_UNLIKELY( uHz > ASMAtomicReadU32(&pCpu->Ppt.uDesiredHz) + && uHz >= pCpu->Ppt.uMinHz + && !pCpu->Ppt.fStarting /* solaris paranoia */)) + { + RTSpinlockAcquire(pCpu->Ppt.hSpinlock); + + pCpu->Ppt.uDesiredHz = uHz; + uint32_t cNsInterval = 0; + if (!pCpu->Ppt.fStarted) + { + pCpu->Ppt.cStarts++; + pCpu->Ppt.fStarted = true; + pCpu->Ppt.fStarting = true; + pCpu->Ppt.iTickHistorization = 0; + pCpu->Ppt.uTimerHz = uHz; + pCpu->Ppt.cNsInterval = cNsInterval = RT_NS_1SEC / uHz; + if (cNsInterval < GVMMHOSTCPU_PPT_HIST_INTERVAL_NS) + pCpu->Ppt.cTicksHistoriziationInterval = ( GVMMHOSTCPU_PPT_HIST_INTERVAL_NS + + GVMMHOSTCPU_PPT_HIST_INTERVAL_NS / 2 - 1) + / cNsInterval; + else + pCpu->Ppt.cTicksHistoriziationInterval = 1; + } + + RTSpinlockRelease(pCpu->Ppt.hSpinlock); + + if (cNsInterval) + { + RTTimerChangeInterval(pCpu->Ppt.pTimer, cNsInterval); + int rc = RTTimerStart(pCpu->Ppt.pTimer, cNsInterval); + AssertRC(rc); + + RTSpinlockAcquire(pCpu->Ppt.hSpinlock); + if (RT_FAILURE(rc)) + pCpu->Ppt.fStarted = false; + pCpu->Ppt.fStarting = false; + RTSpinlockRelease(pCpu->Ppt.hSpinlock); + } + } +#else /* !GVMM_SCHED_WITH_PPT */ + NOREF(idHostCpu); NOREF(uHz); +#endif /* !GVMM_SCHED_WITH_PPT */ +} + + +/** + * Retrieves the GVMM statistics visible to the caller. + * + * @returns VBox status code. + * + * @param pStats Where to put the statistics. + * @param pSession The current session. + * @param pGVM The GVM to obtain statistics for. Optional. + * @param pVM The VM structure corresponding to @a pGVM. + */ +GVMMR0DECL(int) GVMMR0QueryStatistics(PGVMMSTATS pStats, PSUPDRVSESSION pSession, PGVM pGVM, PVM pVM) +{ + LogFlow(("GVMMR0QueryStatistics: pStats=%p pSession=%p pGVM=%p pVM=%p\n", pStats, pSession, pGVM, pVM)); + + /* + * Validate input. + */ + AssertPtrReturn(pSession, VERR_INVALID_POINTER); + AssertPtrReturn(pStats, VERR_INVALID_POINTER); + pStats->cVMs = 0; /* (crash before taking the sem...) */ + + /* + * Take the lock and get the VM statistics. 
+ */ + PGVMM pGVMM; + if (pGVM) + { + int rc = gvmmR0ByGVMandVM(pGVM, pVM, &pGVMM, true /*fTakeUsedLock*/); + if (RT_FAILURE(rc)) + return rc; + pStats->SchedVM = pGVM->gvmm.s.StatsSched; + } + else + { + GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE); + memset(&pStats->SchedVM, 0, sizeof(pStats->SchedVM)); + + int rc = GVMMR0_USED_SHARED_LOCK(pGVMM); + AssertRCReturn(rc, rc); + } + + /* + * Enumerate the VMs and add the ones visible to the statistics. + */ + pStats->cVMs = 0; + pStats->cEMTs = 0; + memset(&pStats->SchedSum, 0, sizeof(pStats->SchedSum)); + + for (unsigned i = pGVMM->iUsedHead; + i != NIL_GVM_HANDLE && i < RT_ELEMENTS(pGVMM->aHandles); + i = pGVMM->aHandles[i].iNext) + { + PGVM pOtherGVM = pGVMM->aHandles[i].pGVM; + void *pvObj = pGVMM->aHandles[i].pvObj; + if ( VALID_PTR(pvObj) + && VALID_PTR(pOtherGVM) + && pOtherGVM->u32Magic == GVM_MAGIC + && RT_SUCCESS(SUPR0ObjVerifyAccess(pvObj, pSession, NULL))) + { + pStats->cVMs++; + pStats->cEMTs += pOtherGVM->cCpus; + + pStats->SchedSum.cHaltCalls += pOtherGVM->gvmm.s.StatsSched.cHaltCalls; + pStats->SchedSum.cHaltBlocking += pOtherGVM->gvmm.s.StatsSched.cHaltBlocking; + pStats->SchedSum.cHaltTimeouts += pOtherGVM->gvmm.s.StatsSched.cHaltTimeouts; + pStats->SchedSum.cHaltNotBlocking += pOtherGVM->gvmm.s.StatsSched.cHaltNotBlocking; + pStats->SchedSum.cHaltWakeUps += pOtherGVM->gvmm.s.StatsSched.cHaltWakeUps; + + pStats->SchedSum.cWakeUpCalls += pOtherGVM->gvmm.s.StatsSched.cWakeUpCalls; + pStats->SchedSum.cWakeUpNotHalted += pOtherGVM->gvmm.s.StatsSched.cWakeUpNotHalted; + pStats->SchedSum.cWakeUpWakeUps += pOtherGVM->gvmm.s.StatsSched.cWakeUpWakeUps; + + pStats->SchedSum.cPokeCalls += pOtherGVM->gvmm.s.StatsSched.cPokeCalls; + pStats->SchedSum.cPokeNotBusy += pOtherGVM->gvmm.s.StatsSched.cPokeNotBusy; + + pStats->SchedSum.cPollCalls += pOtherGVM->gvmm.s.StatsSched.cPollCalls; + pStats->SchedSum.cPollHalts += pOtherGVM->gvmm.s.StatsSched.cPollHalts; + pStats->SchedSum.cPollWakeUps += 
pOtherGVM->gvmm.s.StatsSched.cPollWakeUps; + } + } + + /* + * Copy out the per host CPU statistics. + */ + uint32_t iDstCpu = 0; + uint32_t cSrcCpus = pGVMM->cHostCpus; + for (uint32_t iSrcCpu = 0; iSrcCpu < cSrcCpus; iSrcCpu++) + { + if (pGVMM->aHostCpus[iSrcCpu].idCpu != NIL_RTCPUID) + { + pStats->aHostCpus[iDstCpu].idCpu = pGVMM->aHostCpus[iSrcCpu].idCpu; + pStats->aHostCpus[iDstCpu].idxCpuSet = pGVMM->aHostCpus[iSrcCpu].idxCpuSet; +#ifdef GVMM_SCHED_WITH_PPT + pStats->aHostCpus[iDstCpu].uDesiredHz = pGVMM->aHostCpus[iSrcCpu].Ppt.uDesiredHz; + pStats->aHostCpus[iDstCpu].uTimerHz = pGVMM->aHostCpus[iSrcCpu].Ppt.uTimerHz; + pStats->aHostCpus[iDstCpu].cChanges = pGVMM->aHostCpus[iSrcCpu].Ppt.cChanges; + pStats->aHostCpus[iDstCpu].cStarts = pGVMM->aHostCpus[iSrcCpu].Ppt.cStarts; +#else + pStats->aHostCpus[iDstCpu].uDesiredHz = 0; + pStats->aHostCpus[iDstCpu].uTimerHz = 0; + pStats->aHostCpus[iDstCpu].cChanges = 0; + pStats->aHostCpus[iDstCpu].cStarts = 0; +#endif + iDstCpu++; + if (iDstCpu >= RT_ELEMENTS(pStats->aHostCpus)) + break; + } + } + pStats->cHostCpus = iDstCpu; + + GVMMR0_USED_SHARED_UNLOCK(pGVMM); + + return VINF_SUCCESS; +} + + +/** + * VMMR0 request wrapper for GVMMR0QueryStatistics. + * + * @returns see GVMMR0QueryStatistics. + * @param pGVM The global (ring-0) VM structure. Optional. + * @param pVM The cross context VM structure. Optional. + * @param pReq Pointer to the request packet. + * @param pSession The current session. + */ +GVMMR0DECL(int) GVMMR0QueryStatisticsReq(PGVM pGVM, PVM pVM, PGVMMQUERYSTATISTICSSREQ pReq, PSUPDRVSESSION pSession) +{ + /* + * Validate input and pass it on. + */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER); + AssertReturn(pReq->pSession == pSession, VERR_INVALID_PARAMETER); + + return GVMMR0QueryStatistics(&pReq->Stats, pSession, pGVM, pVM); +} + + +/** + * Resets the specified GVMM statistics. 
+ * + * @returns VBox status code. + * + * @param pStats Which statistics to reset, that is, non-zero fields indicates which to reset. + * @param pSession The current session. + * @param pGVM The GVM to reset statistics for. Optional. + * @param pVM The VM structure corresponding to @a pGVM. + */ +GVMMR0DECL(int) GVMMR0ResetStatistics(PCGVMMSTATS pStats, PSUPDRVSESSION pSession, PGVM pGVM, PVM pVM) +{ + LogFlow(("GVMMR0ResetStatistics: pStats=%p pSession=%p pGVM=%p pVM=%p\n", pStats, pSession, pGVM, pVM)); + + /* + * Validate input. + */ + AssertPtrReturn(pSession, VERR_INVALID_POINTER); + AssertPtrReturn(pStats, VERR_INVALID_POINTER); + + /* + * Take the lock and get the VM statistics. + */ + PGVMM pGVMM; + if (pGVM) + { + int rc = gvmmR0ByGVMandVM(pGVM, pVM, &pGVMM, true /*fTakeUsedLock*/); + if (RT_FAILURE(rc)) + return rc; +# define MAYBE_RESET_FIELD(field) \ + do { if (pStats->SchedVM. field ) { pGVM->gvmm.s.StatsSched. field = 0; } } while (0) + MAYBE_RESET_FIELD(cHaltCalls); + MAYBE_RESET_FIELD(cHaltBlocking); + MAYBE_RESET_FIELD(cHaltTimeouts); + MAYBE_RESET_FIELD(cHaltNotBlocking); + MAYBE_RESET_FIELD(cHaltWakeUps); + MAYBE_RESET_FIELD(cWakeUpCalls); + MAYBE_RESET_FIELD(cWakeUpNotHalted); + MAYBE_RESET_FIELD(cWakeUpWakeUps); + MAYBE_RESET_FIELD(cPokeCalls); + MAYBE_RESET_FIELD(cPokeNotBusy); + MAYBE_RESET_FIELD(cPollCalls); + MAYBE_RESET_FIELD(cPollHalts); + MAYBE_RESET_FIELD(cPollWakeUps); +# undef MAYBE_RESET_FIELD + } + else + { + GVMM_GET_VALID_INSTANCE(pGVMM, VERR_GVMM_INSTANCE); + + int rc = GVMMR0_USED_SHARED_LOCK(pGVMM); + AssertRCReturn(rc, rc); + } + + /* + * Enumerate the VMs and add the ones visible to the statistics. 
+ */ + if (!ASMMemIsZero(&pStats->SchedSum, sizeof(pStats->SchedSum))) + { + for (unsigned i = pGVMM->iUsedHead; + i != NIL_GVM_HANDLE && i < RT_ELEMENTS(pGVMM->aHandles); + i = pGVMM->aHandles[i].iNext) + { + PGVM pOtherGVM = pGVMM->aHandles[i].pGVM; + void *pvObj = pGVMM->aHandles[i].pvObj; + if ( VALID_PTR(pvObj) + && VALID_PTR(pOtherGVM) + && pOtherGVM->u32Magic == GVM_MAGIC + && RT_SUCCESS(SUPR0ObjVerifyAccess(pvObj, pSession, NULL))) + { +# define MAYBE_RESET_FIELD(field) \ + do { if (pStats->SchedSum. field ) { pOtherGVM->gvmm.s.StatsSched. field = 0; } } while (0) + MAYBE_RESET_FIELD(cHaltCalls); + MAYBE_RESET_FIELD(cHaltBlocking); + MAYBE_RESET_FIELD(cHaltTimeouts); + MAYBE_RESET_FIELD(cHaltNotBlocking); + MAYBE_RESET_FIELD(cHaltWakeUps); + MAYBE_RESET_FIELD(cWakeUpCalls); + MAYBE_RESET_FIELD(cWakeUpNotHalted); + MAYBE_RESET_FIELD(cWakeUpWakeUps); + MAYBE_RESET_FIELD(cPokeCalls); + MAYBE_RESET_FIELD(cPokeNotBusy); + MAYBE_RESET_FIELD(cPollCalls); + MAYBE_RESET_FIELD(cPollHalts); + MAYBE_RESET_FIELD(cPollWakeUps); +# undef MAYBE_RESET_FIELD + } + } + } + + GVMMR0_USED_SHARED_UNLOCK(pGVMM); + + return VINF_SUCCESS; +} + + +/** + * VMMR0 request wrapper for GVMMR0ResetStatistics. + * + * @returns see GVMMR0ResetStatistics. + * @param pGVM The global (ring-0) VM structure. Optional. + * @param pVM The cross context VM structure. Optional. + * @param pReq Pointer to the request packet. + * @param pSession The current session. + */ +GVMMR0DECL(int) GVMMR0ResetStatisticsReq(PGVM pGVM, PVM pVM, PGVMMRESETSTATISTICSSREQ pReq, PSUPDRVSESSION pSession) +{ + /* + * Validate input and pass it on. 
+ */ + AssertPtrReturn(pReq, VERR_INVALID_POINTER); + AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#x\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER); + AssertReturn(pReq->pSession == pSession, VERR_INVALID_PARAMETER); + + return GVMMR0ResetStatistics(&pReq->Stats, pSession, pGVM, pVM); +} + diff --git a/src/VBox/VMM/VMMR0/GVMMR0Internal.h b/src/VBox/VMM/VMMR0/GVMMR0Internal.h new file mode 100644 index 00000000..b343b3f5 --- /dev/null +++ b/src/VBox/VMM/VMMR0/GVMMR0Internal.h @@ -0,0 +1,69 @@ +/* $Id: GVMMR0Internal.h $ */ +/** @file + * GVMM - The Global VM Manager, Internal header. + */ + +/* + * Copyright (C) 2007-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + +#ifndef VMM_INCLUDED_SRC_VMMR0_GVMMR0Internal_h +#define VMM_INCLUDED_SRC_VMMR0_GVMMR0Internal_h +#ifndef RT_WITHOUT_PRAGMA_ONCE +# pragma once +#endif + +#include <iprt/mem.h> + +/** + * The GVMM per VCPU data. + */ +typedef struct GVMMPERVCPU +{ + /** The time the halted EMT thread expires. + * 0 if the EMT thread is not blocked here (the wake-up paths reset it to 0). */ + uint64_t volatile u64HaltExpire; + /** The event semaphore the EMT thread is blocking on. */ + RTSEMEVENTMULTI HaltEventMulti; + /** The APIC ID of the CPU that EMT was scheduled on the last time we checked. */ + uint8_t iCpuEmt; +} GVMMPERVCPU; +/** Pointer to the GVMM per VCPU data. 
*/ +typedef GVMMPERVCPU *PGVMMPERVCPU; + +/** + * The GVMM per VM data. + */ +typedef struct GVMMPERVM +{ + /** The shared VM data structure allocation object (PVMR0). 
*/ + RTR0MEMOBJ VMMemObj; + /** The Ring-3 mapping of the shared VM data structure (PVMR3). */ + RTR0MEMOBJ VMMapObj; + /** The allocation object for the VM pages. */ + RTR0MEMOBJ VMPagesMemObj; + /** The ring-3 mapping of the VM pages. */ + RTR0MEMOBJ VMPagesMapObj; + + /** The scheduler statistics. */ + GVMMSTATSSCHED StatsSched; + + /** Whether the per-VM ring-0 initialization has been performed. */ + bool fDoneVMMR0Init; + /** Whether the per-VM ring-0 termination is being or has been performed. */ + bool fDoneVMMR0Term; +} GVMMPERVM; +/** Pointer to the GVMM per VM data. */ +typedef GVMMPERVM *PGVMMPERVM; + + +#endif /* !VMM_INCLUDED_SRC_VMMR0_GVMMR0Internal_h */ + diff --git a/src/VBox/VMM/VMMR0/HMR0.cpp b/src/VBox/VMM/VMMR0/HMR0.cpp new file mode 100644 index 00000000..3386e7f1 --- /dev/null +++ b/src/VBox/VMM/VMMR0/HMR0.cpp @@ -0,0 +1,2005 @@ +/* $Id: HMR0.cpp $ */ +/** @file + * Hardware Assisted Virtualization Manager (HM) - Host Context Ring-0. + */ + +/* + * Copyright (C) 2006-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_HM +#define VMCPU_INCL_CPUM_GST_CTX +#include <VBox/vmm/hm.h> +#include <VBox/vmm/pgm.h> +#include "HMInternal.h" +#include <VBox/vmm/vm.h> +#include <VBox/vmm/hm_svm.h> +#include <VBox/vmm/hmvmxinline.h> +#include <VBox/err.h> +#include <VBox/log.h> +#include <iprt/assert.h> +#include <iprt/asm.h> +#include <iprt/asm-amd64-x86.h> +#include <iprt/cpuset.h> +#include <iprt/mem.h> +#include <iprt/memobj.h> +#include <iprt/once.h> +#include <iprt/param.h> +#include <iprt/power.h> +#include <iprt/string.h> +#include <iprt/thread.h> +#include <iprt/x86.h> +#include "HMVMXR0.h" +#include "HMSVMR0.h" + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +static DECLCALLBACK(void) hmR0EnableCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2); +static DECLCALLBACK(void) hmR0DisableCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2); +static DECLCALLBACK(void) hmR0InitIntelCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2); +static DECLCALLBACK(void) hmR0InitAmdCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2); +static DECLCALLBACK(void) hmR0PowerCallback(RTPOWEREVENT enmEvent, void *pvUser); +static DECLCALLBACK(void) hmR0MpEventCallback(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvData); + + +/********************************************************************************************************************************* +* Structures and Typedefs * 
+*********************************************************************************************************************************/ +/** + * This is used to manage the status code of a RTMpOnAll in HM. + */ +typedef struct HMR0FIRSTRC +{ + /** The status code. */ + int32_t volatile rc; + /** The ID of the CPU reporting the first failure. */ + RTCPUID volatile idCpu; +} HMR0FIRSTRC; +/** Pointer to a first return code structure. */ +typedef HMR0FIRSTRC *PHMR0FIRSTRC; + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +/** + * Global data. + */ +static struct +{ + /** Per CPU globals. */ + HMPHYSCPU aCpuInfo[RTCPUSET_MAX_CPUS]; + + /** @name Ring-0 method table for AMD-V and VT-x specific operations. + * @{ */ + DECLR0CALLBACKMEMBER(int, pfnEnterSession, (PVMCPU pVCpu)); + DECLR0CALLBACKMEMBER(void, pfnThreadCtxCallback, (RTTHREADCTXEVENT enmEvent, PVMCPU pVCpu, bool fGlobalInit)); + DECLR0CALLBACKMEMBER(int, pfnExportHostState, (PVMCPU pVCpu)); + DECLR0CALLBACKMEMBER(VBOXSTRICTRC, pfnRunGuestCode, (PVMCPU pVCpu)); + DECLR0CALLBACKMEMBER(int, pfnEnableCpu, (PHMPHYSCPU pHostCpu, PVM pVM, void *pvCpuPage, RTHCPHYS HCPhysCpuPage, + bool fEnabledByHost, PCSUPHWVIRTMSRS pHwvirtMsrs)); + DECLR0CALLBACKMEMBER(int, pfnDisableCpu, (void *pvCpuPage, RTHCPHYS HCPhysCpuPage)); + DECLR0CALLBACKMEMBER(int, pfnInitVM, (PVM pVM)); + DECLR0CALLBACKMEMBER(int, pfnTermVM, (PVM pVM)); + DECLR0CALLBACKMEMBER(int, pfnSetupVM, (PVM pVM)); + /** @} */ + + /** Hardware-virtualization data. */ + struct + { + union + { + /** VT-x data.
*/ + struct + { + /** Host CR4 value (set by ring-0 VMX init) */ + uint64_t u64HostCr4; + /** Host EFER value (set by ring-0 VMX init) */ + uint64_t u64HostEfer; + /** Host SMM monitor control (used for logging/diagnostics) */ + uint64_t u64HostSmmMonitorCtl; + /** Last instruction error. */ + uint32_t ulLastInstrError; + /** The shift mask employed by the VMX-Preemption timer. */ + uint8_t cPreemptTimerShift; + /** Padding. */ + uint8_t abPadding[3]; + /** Whether we're using the preemption timer or not. */ + bool fUsePreemptTimer; + /** Whether we're using SUPR0EnableVTx or not. */ + bool fUsingSUPR0EnableVTx; + /** Set if we've called SUPR0EnableVTx(true) and should disable it during + * module termination. */ + bool fCalledSUPR0EnableVTx; + /** Set by us to indicate VMX is supported by the CPU. */ + bool fSupported; + } vmx; + + /** AMD-V data. */ + struct + { + /** SVM revision. */ + uint32_t u32Rev; + /** SVM feature bits from cpuid 0x8000000a */ + uint32_t u32Features; + /** Padding. */ + bool afPadding[3]; + /** Set by us to indicate SVM is supported by the CPU. */ + bool fSupported; + } svm; + } u; + /** ASID/VPID limit. + * NOTE(review): this used to be documented as inclusive, but hmR0InitIntel + * stores 0x10000 for the 16-bit VPID range and marks it exclusive; the + * AMD path fills it from cpuid 0x8000000a. */ + uint32_t uMaxAsid; + /** MSRs. */ + SUPHWVIRTMSRS Msrs; + } hwvirt; + + /** Last recorded error code during HM ring-0 init. */ + int32_t rcInit; + + /** If set, VT-x/AMD-V is enabled globally at init time, otherwise it's + * enabled and disabled each time it's used to execute guest code. */ + bool fGlobalInit; + /** Indicates whether the host is suspending or not. We'll refuse a few + * actions when the host is being suspended to speed up the suspending and + * avoid trouble. */ + bool volatile fSuspended; + + /** Whether we've already initialized all CPUs. 
+ * @remarks We could check the EnableAllCpusOnce state, but this is + * simpler and hopefully easier to understand. */ + bool fEnabled; + /** Serialize initialization in HMR0EnableAllCpus.
*/ + RTONCE EnableAllCpusOnce; +} g_HmR0; + + +/** + * Initializes a first return code structure. + * + * @param pFirstRc The structure to init. + */ +static void hmR0FirstRcInit(PHMR0FIRSTRC pFirstRc) +{ + pFirstRc->rc = VINF_SUCCESS; + pFirstRc->idCpu = NIL_RTCPUID; +} + + +/** + * Tries to set the status code (success is ignored). + * + * Only the first failure sticks: the status is stored with an atomic + * compare-and-exchange against VINF_SUCCESS, and the CPU that wins the + * exchange records its own ID. + * + * @param pFirstRc The first return code structure. + * @param rc The status code. + */ +static void hmR0FirstRcSetStatus(PHMR0FIRSTRC pFirstRc, int rc) +{ + if ( RT_FAILURE(rc) + && ASMAtomicCmpXchgS32(&pFirstRc->rc, rc, VINF_SUCCESS)) + pFirstRc->idCpu = RTMpCpuId(); +} + + +/** + * Get the status code of a first return code structure. + * + * @returns The status code; VINF_SUCCESS or error status, no informational or + * warning errors. + * @param pFirstRc The first return code structure. + */ +static int hmR0FirstRcGetStatus(PHMR0FIRSTRC pFirstRc) +{ + return pFirstRc->rc; +} + + +#ifdef VBOX_STRICT +# ifndef DEBUG_bird +/** + * Get the CPU ID on which the failure status code was reported. + * + * @returns The CPU ID, NIL_RTCPUID if no failure was reported. + * @param pFirstRc The first return code structure. + */ +static RTCPUID hmR0FirstRcGetCpuId(PHMR0FIRSTRC pFirstRc) +{ + return pFirstRc->idCpu; +} +# endif +#endif /* VBOX_STRICT */ + + +/** @name Dummy callback handlers.
+ * @{ */ /* HMR0Init() installs these in the g_HmR0 method table as placeholders. Each one ignores its arguments and, where it has a return value, reports VINF_SUCCESS. */ + +static DECLCALLBACK(int) hmR0DummyEnter(PVMCPU pVCpu) +{ + RT_NOREF1(pVCpu); + return VINF_SUCCESS; +} + +static DECLCALLBACK(void) hmR0DummyThreadCtxCallback(RTTHREADCTXEVENT enmEvent, PVMCPU pVCpu, bool fGlobalInit) +{ + RT_NOREF3(enmEvent, pVCpu, fGlobalInit); +} + +static DECLCALLBACK(int) hmR0DummyEnableCpu(PHMPHYSCPU pHostCpu, PVM pVM, void *pvCpuPage, RTHCPHYS HCPhysCpuPage, + bool fEnabledBySystem, PCSUPHWVIRTMSRS pHwvirtMsrs) +{ + RT_NOREF6(pHostCpu, pVM, pvCpuPage, HCPhysCpuPage, fEnabledBySystem, pHwvirtMsrs); + return VINF_SUCCESS; +} + +static DECLCALLBACK(int) hmR0DummyDisableCpu(void *pvCpuPage, RTHCPHYS HCPhysCpuPage) +{ + RT_NOREF2(pvCpuPage, HCPhysCpuPage); + return VINF_SUCCESS; +} + +static DECLCALLBACK(int) hmR0DummyInitVM(PVM pVM) +{ + RT_NOREF1(pVM); + return VINF_SUCCESS; +} + +static DECLCALLBACK(int) hmR0DummyTermVM(PVM pVM) +{ + RT_NOREF1(pVM); + return VINF_SUCCESS; +} + +static DECLCALLBACK(int) hmR0DummySetupVM(PVM pVM) +{ + RT_NOREF1(pVM); + return VINF_SUCCESS; +} + +static DECLCALLBACK(VBOXSTRICTRC) hmR0DummyRunGuestCode(PVMCPU pVCpu) +{ + RT_NOREF(pVCpu); + return VINF_SUCCESS; +} + +static DECLCALLBACK(int) hmR0DummyExportHostState(PVMCPU pVCpu) +{ + RT_NOREF1(pVCpu); + return VINF_SUCCESS; +} + +/** @} */ + + +/** + * Checks if the CPU is subject to the "VMX-Preemption Timer Does Not Count + * Down at the Rate Specified" erratum. + * + * Errata names and related steppings: + * - BA86 - D0. + * - AAX65 - C2. + * - AAU65 - C2, K0. + * - AAO95 - B1. + * - AAT59 - C2. + * - AAK139 - D0. + * - AAM126 - C0, C1, D0. + * - AAN92 - B1. + * - AAJ124 - C0, D0. + * - AAP86 - B1. + * + * Steppings: B1, C0, C1, C2, D0, K0. + * + * @returns true if subject to it, false if not.
+ */ +static bool hmR0InitIntelIsSubjectToVmxPreemptTimerErratum(void) +{ + uint32_t u = ASMCpuId_EAX(1); /* EAX of CPUID leaf 1; presumably the family/model/stepping signature that the constants below are matched against - TODO confirm. */ + u &= ~(RT_BIT_32(14) | RT_BIT_32(15) | RT_BIT_32(28) | RT_BIT_32(29) | RT_BIT_32(30) | RT_BIT_32(31)); /* Clear bits 14, 15 and 28 thru 31 before comparing (reserved bits in the Intel SDM layout - TODO confirm). */ + if ( u == UINT32_C(0x000206E6) /* 323344.pdf - BA86 - D0 - Intel Xeon Processor 7500 Series */ + || u == UINT32_C(0x00020652) /* 323056.pdf - AAX65 - C2 - Intel Xeon Processor L3406 */ + /* 322814.pdf - AAT59 - C2 - Intel CoreTM i7-600, i5-500, i5-400 and i3-300 Mobile Processor Series */ + /* 322911.pdf - AAU65 - C2 - Intel CoreTM i5-600, i3-500 Desktop Processor Series and Intel Pentium Processor G6950 */ + || u == UINT32_C(0x00020655) /* 322911.pdf - AAU65 - K0 - Intel CoreTM i5-600, i3-500 Desktop Processor Series and Intel Pentium Processor G6950 */ + || u == UINT32_C(0x000106E5) /* 322373.pdf - AAO95 - B1 - Intel Xeon Processor 3400 Series */ + /* 322166.pdf - AAN92 - B1 - Intel CoreTM i7-800 and i5-700 Desktop Processor Series */ + /* 320767.pdf - AAP86 - B1 - Intel Core i7-900 Mobile Processor Extreme Edition Series, Intel Core i7-800 and i7-700 Mobile Processor Series */ + || u == UINT32_C(0x000106A0) /* 321333.pdf - AAM126 - C0 - Intel Xeon Processor 3500 Series Specification */ + || u == UINT32_C(0x000106A1) /* 321333.pdf - AAM126 - C1 - Intel Xeon Processor 3500 Series Specification */ + || u == UINT32_C(0x000106A4) /* 320836.pdf - AAJ124 - C0 - Intel Core i7-900 Desktop Processor Extreme Edition Series and Intel Core i7-900 Desktop Processor Series */ + || u == UINT32_C(0x000106A5) /* 321333.pdf - AAM126 - D0 - Intel Xeon Processor 3500 Series Specification */ + /* 321324.pdf - AAK139 - D0 - Intel Xeon Processor 5500 Series Specification */ + /* 320836.pdf - AAJ124 - D0 - Intel Core i7-900 Desktop Processor Extreme Edition Series and Intel Core i7-900 Desktop Processor Series */ + ) + return true; + return false; +} + + +/** + * Intel specific initialization code. + * + * @returns VBox status code (will only fail if out of memory).
+ */ +static int hmR0InitIntel(void) +{ + /* Read this MSR now as it may be useful for error reporting when initializing VT-x fails. */ + g_HmR0.hwvirt.Msrs.u.vmx.u64FeatCtrl = ASMRdMsr(MSR_IA32_FEATURE_CONTROL); + + /* + * First try to use the native kernel API for controlling VT-x. + * (This is only supported by some Mac OS X kernels atm.) + */ + int rc = g_HmR0.rcInit = SUPR0EnableVTx(true /* fEnable */); + g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx = rc != VERR_NOT_SUPPORTED; + if (g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx) + { + AssertLogRelMsg(rc == VINF_SUCCESS || rc == VERR_VMX_IN_VMX_ROOT_MODE || rc == VERR_VMX_NO_VMX, ("%Rrc\n", rc)); + if (RT_SUCCESS(rc)) + { + g_HmR0.hwvirt.u.vmx.fSupported = true; + rc = SUPR0EnableVTx(false /* fEnable */); + AssertLogRelRC(rc); + } + } + else + { + HMR0FIRSTRC FirstRc; + hmR0FirstRcInit(&FirstRc); + g_HmR0.rcInit = RTMpOnAll(hmR0InitIntelCpu, &FirstRc, NULL); + if (RT_SUCCESS(g_HmR0.rcInit)) + g_HmR0.rcInit = hmR0FirstRcGetStatus(&FirstRc); + } + + if (RT_SUCCESS(g_HmR0.rcInit)) + { + /* Read CR4 and EFER for logging/diagnostic purposes. */ + g_HmR0.hwvirt.u.vmx.u64HostCr4 = ASMGetCR4(); + g_HmR0.hwvirt.u.vmx.u64HostEfer = ASMRdMsr(MSR_K6_EFER); + + /* Get VMX MSRs for determining VMX features we can ultimately use. */ + SUPR0GetHwvirtMsrs(&g_HmR0.hwvirt.Msrs, SUPVTCAPS_VT_X, false /* fForce */); + + /* + * Nested KVM workaround: Intel SDM section 34.15.5 describes that + * MSR_IA32_SMM_MONITOR_CTL depends on bit 49 of MSR_IA32_VMX_BASIC while + * table 35-2 says that this MSR is available if either VMX or SMX is supported. + */ + uint64_t const uVmxBasicMsr = g_HmR0.hwvirt.Msrs.u.vmx.u64Basic; + if (RT_BF_GET(uVmxBasicMsr, VMX_BF_BASIC_DUAL_MON)) + g_HmR0.hwvirt.u.vmx.u64HostSmmMonitorCtl = ASMRdMsr(MSR_IA32_SMM_MONITOR_CTL); + + /* Initialize VPID - 16 bits ASID.
*/ + g_HmR0.hwvirt.uMaxAsid = 0x10000; /* exclusive */ + + /* + * If the host OS has not enabled VT-x for us, try to enter VMX root mode + * to really verify if VT-x is usable. + */ + if (!g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx) + { + /* Allocate a temporary VMXON region. */ + RTR0MEMOBJ hScatchMemObj; + rc = RTR0MemObjAllocCont(&hScatchMemObj, PAGE_SIZE, false /* fExecutable */); + if (RT_FAILURE(rc)) + { + LogRel(("hmR0InitIntel: RTR0MemObjAllocCont(,PAGE_SIZE,false) -> %Rrc\n", rc)); + return rc; + } + void *pvScatchPage = RTR0MemObjAddress(hScatchMemObj); + RTHCPHYS HCPhysScratchPage = RTR0MemObjGetPagePhysAddr(hScatchMemObj, 0); + ASMMemZeroPage(pvScatchPage); + + /* Set revision dword at the beginning of the VMXON structure. */ + *(uint32_t *)pvScatchPage = RT_BF_GET(uVmxBasicMsr, VMX_BF_BASIC_VMCS_ID); + + /* Make sure we don't get rescheduled to another CPU during this probe. */ + RTCCUINTREG const fEFlags = ASMIntDisableFlags(); + + /* Check CR4.VMXE. */ + g_HmR0.hwvirt.u.vmx.u64HostCr4 = ASMGetCR4(); + if (!(g_HmR0.hwvirt.u.vmx.u64HostCr4 & X86_CR4_VMXE)) + { + /* In theory this bit could be cleared behind our back. Which would cause #UD + faults when we try to execute the VMX instructions... */ + ASMSetCR4(g_HmR0.hwvirt.u.vmx.u64HostCr4 | X86_CR4_VMXE); + } + + /* + * The only way of checking if we're in VMX root mode or not is to try and enter it. + * There is no instruction or control bit that tells us if we're in VMX root mode. + * Therefore, try and enter VMX root mode here. + */ + rc = VMXEnable(HCPhysScratchPage); + if (RT_SUCCESS(rc)) + { + g_HmR0.hwvirt.u.vmx.fSupported = true; + VMXDisable(); + } + else + { + /* + * KVM leaves the CPU in VMX root mode.
Not only is this not allowed, + * it will crash the host when we enter raw mode, because: + * + * (a) clearing X86_CR4_VMXE in CR4 causes a #GP (we no longer modify + * this bit), and + * (b) turning off paging causes a #GP (unavoidable when switching + * from long to 32 bits mode or 32 bits to PAE). + * + * They should fix their code, but until they do we simply refuse to run. + */ + g_HmR0.rcInit = VERR_VMX_IN_VMX_ROOT_MODE; + Assert(g_HmR0.hwvirt.u.vmx.fSupported == false); + } + + /* + * Restore CR4 again; don't leave the X86_CR4_VMXE flag set if it was not + * set before (some software could incorrectly think it is in VMX mode). + */ + ASMSetCR4(g_HmR0.hwvirt.u.vmx.u64HostCr4); + ASMSetFlags(fEFlags); + + RTR0MemObjFree(hScatchMemObj, false); + } + + if (g_HmR0.hwvirt.u.vmx.fSupported) + { + rc = VMXR0GlobalInit(); + if (RT_FAILURE(rc)) + g_HmR0.rcInit = rc; + + /* + * Install the VT-x methods. + * Note that this is done even when VMXR0GlobalInit() failed; that + * failure is only recorded in g_HmR0.rcInit above. + */ + g_HmR0.pfnEnterSession = VMXR0Enter; + g_HmR0.pfnThreadCtxCallback = VMXR0ThreadCtxCallback; + g_HmR0.pfnExportHostState = VMXR0ExportHostState; + g_HmR0.pfnRunGuestCode = VMXR0RunGuestCode; + g_HmR0.pfnEnableCpu = VMXR0EnableCpu; + g_HmR0.pfnDisableCpu = VMXR0DisableCpu; + g_HmR0.pfnInitVM = VMXR0InitVM; + g_HmR0.pfnTermVM = VMXR0TermVM; + g_HmR0.pfnSetupVM = VMXR0SetupVM; + + /* + * Check for the VMX-Preemption Timer and adjust for the "VMX-Preemption + * Timer Does Not Count Down at the Rate Specified" CPU erratum. + */ + uint32_t const fPinCtls = RT_HI_U32(g_HmR0.hwvirt.Msrs.u.vmx.u64PinCtls); + if (fPinCtls & VMX_PIN_CTLS_PREEMPT_TIMER) + { + uint64_t const uVmxMiscMsr = g_HmR0.hwvirt.Msrs.u.vmx.u64Misc; + g_HmR0.hwvirt.u.vmx.fUsePreemptTimer = true; + g_HmR0.hwvirt.u.vmx.cPreemptTimerShift = RT_BF_GET(uVmxMiscMsr, VMX_BF_MISC_PREEMPT_TIMER_TSC); + if (hmR0InitIntelIsSubjectToVmxPreemptTimerErratum()) + g_HmR0.hwvirt.u.vmx.cPreemptTimerShift = 0; /* This is about right most of the time here.
*/ + } + } + } +#ifdef LOG_ENABLED + else + SUPR0Printf("hmR0InitIntelCpu failed with rc=%Rrc\n", g_HmR0.rcInit); +#endif + return VINF_SUCCESS; +} + + +/** + * AMD-specific initialization code. + * + * @returns VBox status code. A SVMR0GlobalInit() failure is returned as-is + * (expected only when out of memory). Failures of the per-CPU probe are + * returned too, except for VERR_SVM_DISABLED and VERR_SVM_IN_USE, which + * are only recorded in g_HmR0.rcInit. + */ +static int hmR0InitAmd(void) +{ + /* Call the global AMD-V initialization routine (should only fail in out-of-memory situations). */ + int rc = SVMR0GlobalInit(); + if (RT_FAILURE(rc)) + { + g_HmR0.rcInit = rc; + return rc; + } + + /* + * Install the AMD-V methods. + */ + g_HmR0.pfnEnterSession = SVMR0Enter; + g_HmR0.pfnThreadCtxCallback = SVMR0ThreadCtxCallback; + g_HmR0.pfnExportHostState = SVMR0ExportHostState; + g_HmR0.pfnRunGuestCode = SVMR0RunGuestCode; + g_HmR0.pfnEnableCpu = SVMR0EnableCpu; + g_HmR0.pfnDisableCpu = SVMR0DisableCpu; + g_HmR0.pfnInitVM = SVMR0InitVM; + g_HmR0.pfnTermVM = SVMR0TermVM; + g_HmR0.pfnSetupVM = SVMR0SetupVM; + + /* Query AMD features. */ + uint32_t u32Dummy; + ASMCpuId(0x8000000a, &g_HmR0.hwvirt.u.svm.u32Rev, &g_HmR0.hwvirt.uMaxAsid, &u32Dummy, &g_HmR0.hwvirt.u.svm.u32Features); + + /* + * We need to check if AMD-V has been properly initialized on all CPUs. + * Some BIOSes might do a poor job. + */ + HMR0FIRSTRC FirstRc; + hmR0FirstRcInit(&FirstRc); + rc = RTMpOnAll(hmR0InitAmdCpu, &FirstRc, NULL); + AssertRC(rc); + if (RT_SUCCESS(rc)) + rc = hmR0FirstRcGetStatus(&FirstRc); +#ifndef DEBUG_bird + AssertMsg(rc == VINF_SUCCESS || rc == VERR_SVM_IN_USE, + ("hmR0InitAmdCpu failed for cpu %d with rc=%Rrc\n", hmR0FirstRcGetCpuId(&FirstRc), rc)); +#endif + if (RT_SUCCESS(rc)) + { + SUPR0GetHwvirtMsrs(&g_HmR0.hwvirt.Msrs, SUPVTCAPS_AMD_V, false /* fForce */); + g_HmR0.hwvirt.u.svm.fSupported = true; + } + else + { + g_HmR0.rcInit = rc; + if (rc == VERR_SVM_DISABLED || rc == VERR_SVM_IN_USE) + rc = VINF_SUCCESS; /* Don't fail if AMD-V is disabled or in use. */ + } + return rc; +} + + +/** + * Does global Ring-0 HM initialization (at module init).
+ * + * @returns VBox status code. + */ +VMMR0_INT_DECL(int) HMR0Init(void) +{ + /* + * Initialize the globals. + */ + g_HmR0.fEnabled = false; + static RTONCE s_OnceInit = RTONCE_INITIALIZER; + g_HmR0.EnableAllCpusOnce = s_OnceInit; + for (unsigned i = 0; i < RT_ELEMENTS(g_HmR0.aCpuInfo); i++) + { + g_HmR0.aCpuInfo[i].idCpu = NIL_RTCPUID; + g_HmR0.aCpuInfo[i].hMemObj = NIL_RTR0MEMOBJ; + g_HmR0.aCpuInfo[i].HCPhysMemObj = NIL_RTHCPHYS; + g_HmR0.aCpuInfo[i].pvMemObj = NULL; +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + g_HmR0.aCpuInfo[i].n.svm.hNstGstMsrpm = NIL_RTR0MEMOBJ; + g_HmR0.aCpuInfo[i].n.svm.HCPhysNstGstMsrpm = NIL_RTHCPHYS; + g_HmR0.aCpuInfo[i].n.svm.pvNstGstMsrpm = NULL; +#endif + } + + /* Fill in all callbacks with placeholders. */ + g_HmR0.pfnEnterSession = hmR0DummyEnter; + g_HmR0.pfnThreadCtxCallback = hmR0DummyThreadCtxCallback; + g_HmR0.pfnExportHostState = hmR0DummyExportHostState; + g_HmR0.pfnRunGuestCode = hmR0DummyRunGuestCode; + g_HmR0.pfnEnableCpu = hmR0DummyEnableCpu; + g_HmR0.pfnDisableCpu = hmR0DummyDisableCpu; + g_HmR0.pfnInitVM = hmR0DummyInitVM; + g_HmR0.pfnTermVM = hmR0DummyTermVM; + g_HmR0.pfnSetupVM = hmR0DummySetupVM; + + /* Default is global VT-x/AMD-V init. */ + g_HmR0.fGlobalInit = true; + + /* + * Make sure aCpuInfo is big enough for all the CPUs on this system. + */ + if (RTMpGetArraySize() > RT_ELEMENTS(g_HmR0.aCpuInfo)) + { + LogRel(("HM: Too many real CPUs/cores/threads - %u, max %u\n", RTMpGetArraySize(), RT_ELEMENTS(g_HmR0.aCpuInfo))); + return VERR_TOO_MANY_CPUS; + } + + /* + * Check for VT-x or AMD-V support. + * Return failure only in out-of-memory situations. + * Only a failure status returned by hmR0InitIntel() or hmR0InitAmd() aborts + * the init; a failing SUPR0GetVTSupport() is merely recorded in + * g_HmR0.rcInit as VERR_UNSUPPORTED_CPU.
+ */ + uint32_t fCaps = 0; + int rc = SUPR0GetVTSupport(&fCaps); + if (RT_SUCCESS(rc)) + { + if (fCaps & SUPVTCAPS_VT_X) + { + rc = hmR0InitIntel(); + if (RT_FAILURE(rc)) + return rc; + } + else + { + Assert(fCaps & SUPVTCAPS_AMD_V); + rc = hmR0InitAmd(); + if (RT_FAILURE(rc)) + return rc; + } + } + else + g_HmR0.rcInit = VERR_UNSUPPORTED_CPU; + + /* + * Register notification callbacks that we can use to disable/enable CPUs + * when brought offline/online or suspending/resuming. + */ + if (!g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx) + { + rc = RTMpNotificationRegister(hmR0MpEventCallback, NULL); + AssertRC(rc); + + rc = RTPowerNotificationRegister(hmR0PowerCallback, NULL); + AssertRC(rc); + } + + /* We return success here because module init shall not fail if HM fails to initialize. */ + return VINF_SUCCESS; +} + + +/** + * Does global Ring-0 HM termination (at module termination). + * + * @returns VBox status code. When the host OS manages VT-x this is the + * SUPR0EnableVTx(false) status (VINF_SUCCESS if it was never enabled); + * otherwise it is the status of the power notification deregistration + * or, with global init, of disabling VT-x/AMD-V on all CPUs. + */ +VMMR0_INT_DECL(int) HMR0Term(void) +{ + int rc; + if ( g_HmR0.hwvirt.u.vmx.fSupported + && g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx) + { + /* + * Simple if the host OS manages VT-x. + */ + Assert(g_HmR0.fGlobalInit); + + if (g_HmR0.hwvirt.u.vmx.fCalledSUPR0EnableVTx) + { + rc = SUPR0EnableVTx(false /* fEnable */); + g_HmR0.hwvirt.u.vmx.fCalledSUPR0EnableVTx = false; + } + else + rc = VINF_SUCCESS; + + for (unsigned iCpu = 0; iCpu < RT_ELEMENTS(g_HmR0.aCpuInfo); iCpu++) + { + g_HmR0.aCpuInfo[iCpu].fConfigured = false; + Assert(g_HmR0.aCpuInfo[iCpu].hMemObj == NIL_RTR0MEMOBJ); + } + } + else + { + Assert(!g_HmR0.hwvirt.u.vmx.fSupported || !g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx); + + /* Doesn't really matter if this fails. */ + rc = RTMpNotificationDeregister(hmR0MpEventCallback, NULL); AssertRC(rc); + rc = RTPowerNotificationDeregister(hmR0PowerCallback, NULL); AssertRC(rc); + + /* + * Disable VT-x/AMD-V on all CPUs if we enabled it before.
+ */ + if (g_HmR0.fGlobalInit) + { + HMR0FIRSTRC FirstRc; + hmR0FirstRcInit(&FirstRc); + rc = RTMpOnAll(hmR0DisableCpuCallback, NULL /* pvUser 1 */, &FirstRc); + Assert(RT_SUCCESS(rc) || rc == VERR_NOT_SUPPORTED); + if (RT_SUCCESS(rc)) + rc = hmR0FirstRcGetStatus(&FirstRc); + } + + /* + * Free the per-cpu pages used for VT-x and AMD-V. + */ + for (unsigned i = 0; i < RT_ELEMENTS(g_HmR0.aCpuInfo); i++) + { + if (g_HmR0.aCpuInfo[i].hMemObj != NIL_RTR0MEMOBJ) + { + RTR0MemObjFree(g_HmR0.aCpuInfo[i].hMemObj, false); + g_HmR0.aCpuInfo[i].hMemObj = NIL_RTR0MEMOBJ; + g_HmR0.aCpuInfo[i].HCPhysMemObj = NIL_RTHCPHYS; + g_HmR0.aCpuInfo[i].pvMemObj = NULL; + } +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + if (g_HmR0.aCpuInfo[i].n.svm.hNstGstMsrpm != NIL_RTR0MEMOBJ) + { + RTR0MemObjFree(g_HmR0.aCpuInfo[i].n.svm.hNstGstMsrpm, false); + g_HmR0.aCpuInfo[i].n.svm.hNstGstMsrpm = NIL_RTR0MEMOBJ; + g_HmR0.aCpuInfo[i].n.svm.HCPhysNstGstMsrpm = NIL_RTHCPHYS; + g_HmR0.aCpuInfo[i].n.svm.pvNstGstMsrpm = NULL; + } +#endif + } + } + + /** @todo This needs cleaning up. There's no matching + * hmR0TermIntel()/hmR0TermAmd() and all the VT-x/AMD-V specific bits + * should move into their respective modules. */ + /* Finally, call global VT-x/AMD-V termination. */ + if (g_HmR0.hwvirt.u.vmx.fSupported) + VMXR0GlobalTerm(); + else if (g_HmR0.hwvirt.u.svm.fSupported) + SVMR0GlobalTerm(); + + return rc; +} + + +/** + * Worker function used by hmR0PowerCallback() and HMR0Init() to initialize VT-x + * on a CPU. + * + * @param idCpu The identifier for the CPU the function is called on. + * @param pvUser1 Pointer to the first RC structure. + * @param pvUser2 Ignored.
+ */ +static DECLCALLBACK(void) hmR0InitIntelCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + PHMR0FIRSTRC pFirstRc = (PHMR0FIRSTRC)pvUser1; + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(idCpu == (RTCPUID)RTMpCpuIdToSetIndex(idCpu)); /** @todo fix idCpu == index assumption (rainy day) */ + NOREF(idCpu); NOREF(pvUser2); + + int rc = SUPR0GetVmxUsability(NULL /* pfIsSmxModeAmbiguous */); + hmR0FirstRcSetStatus(pFirstRc, rc); +} + + +/** + * Worker function used by hmR0PowerCallback() and HMR0Init() to initialize AMD-V + * on a CPU. + * + * @param idCpu The identifier for the CPU the function is called on. + * @param pvUser1 Pointer to the first RC structure. + * @param pvUser2 Ignored. + */ +static DECLCALLBACK(void) hmR0InitAmdCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + PHMR0FIRSTRC pFirstRc = (PHMR0FIRSTRC)pvUser1; + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(idCpu == (RTCPUID)RTMpCpuIdToSetIndex(idCpu)); /** @todo fix idCpu == index assumption (rainy day) */ + NOREF(idCpu); NOREF(pvUser2); + + int rc = SUPR0GetSvmUsability(true /* fInitSvm */); + hmR0FirstRcSetStatus(pFirstRc, rc); +} + + +/** + * Enable VT-x or AMD-V on the current CPU + * + * @returns VBox status code. + * @param pVM The cross context VM structure. Can be NULL. + * @param idCpu The identifier for the CPU the function is called on. + * + * @remarks May be called with interrupts disabled! + */ +static int hmR0EnableCpu(PVM pVM, RTCPUID idCpu) +{ + PHMPHYSCPU pHostCpu = &g_HmR0.aCpuInfo[idCpu]; + + Assert(idCpu == (RTCPUID)RTMpCpuIdToSetIndex(idCpu)); /** @todo fix idCpu == index assumption (rainy day) */ + Assert(idCpu < RT_ELEMENTS(g_HmR0.aCpuInfo)); + Assert(!pHostCpu->fConfigured); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + pHostCpu->idCpu = idCpu; + /* Do NOT reset cTlbFlushes here, see @bugref{6255}.
*/ + + int rc; + if ( g_HmR0.hwvirt.u.vmx.fSupported + && g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx) + rc = g_HmR0.pfnEnableCpu(pHostCpu, pVM, NULL /* pvCpuPage */, NIL_RTHCPHYS, true, &g_HmR0.hwvirt.Msrs); + else + { + AssertLogRelMsgReturn(pHostCpu->hMemObj != NIL_RTR0MEMOBJ, ("hmR0EnableCpu failed idCpu=%u.\n", idCpu), VERR_HM_IPE_1); + rc = g_HmR0.pfnEnableCpu(pHostCpu, pVM, pHostCpu->pvMemObj, pHostCpu->HCPhysMemObj, false, &g_HmR0.hwvirt.Msrs); + } + if (RT_SUCCESS(rc)) + pHostCpu->fConfigured = true; + return rc; +} + + +/** + * Worker function passed to RTMpOnAll() that is to be called on all CPUs. + * + * @param idCpu The identifier for the CPU the function is called on. + * @param pvUser1 Opaque pointer to the VM (can be NULL!). + * @param pvUser2 Pointer to the first RC structure that receives the + * hmR0EnableCpu status. + */ +static DECLCALLBACK(void) hmR0EnableCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + PVM pVM = (PVM)pvUser1; /* can be NULL! */ + PHMR0FIRSTRC pFirstRc = (PHMR0FIRSTRC)pvUser2; + AssertReturnVoid(g_HmR0.fGlobalInit); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + hmR0FirstRcSetStatus(pFirstRc, hmR0EnableCpu(pVM, idCpu)); +} + + +/** + * RTOnce callback employed by HMR0EnableAllCpus. + * + * @returns VBox status code. + * @param pvUser Pointer to the VM. + */ +static DECLCALLBACK(int32_t) hmR0EnableAllCpuOnce(void *pvUser) +{ + PVM pVM = (PVM)pvUser; + + /* + * Indicate that we've initialized. + * + * Note! There is a potential race between this function and the suspend + * notification. Kind of unlikely though, so ignored for now. + */ + AssertReturn(!g_HmR0.fEnabled, VERR_HM_ALREADY_ENABLED_IPE); + ASMAtomicWriteBool(&g_HmR0.fEnabled, true); + + /* + * The global init variable is set by the first VM.
+ */ + g_HmR0.fGlobalInit = pVM->hm.s.fGlobalInit; + +#ifdef VBOX_STRICT + for (unsigned i = 0; i < RT_ELEMENTS(g_HmR0.aCpuInfo); i++) + { + Assert(g_HmR0.aCpuInfo[i].hMemObj == NIL_RTR0MEMOBJ); + Assert(g_HmR0.aCpuInfo[i].HCPhysMemObj == NIL_RTHCPHYS); + Assert(g_HmR0.aCpuInfo[i].pvMemObj == NULL); + Assert(!g_HmR0.aCpuInfo[i].fConfigured); + Assert(!g_HmR0.aCpuInfo[i].cTlbFlushes); + Assert(!g_HmR0.aCpuInfo[i].uCurrentAsid); +# ifdef VBOX_WITH_NESTED_HWVIRT_SVM + Assert(g_HmR0.aCpuInfo[i].n.svm.hNstGstMsrpm == NIL_RTR0MEMOBJ); + Assert(g_HmR0.aCpuInfo[i].n.svm.HCPhysNstGstMsrpm == NIL_RTHCPHYS); + Assert(g_HmR0.aCpuInfo[i].n.svm.pvNstGstMsrpm == NULL); +# endif + } +#endif + + int rc; + if ( g_HmR0.hwvirt.u.vmx.fSupported + && g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx) + { + /* + * Global VT-x initialization API (only darwin for now). + */ + rc = SUPR0EnableVTx(true /* fEnable */); + if (RT_SUCCESS(rc)) + { + g_HmR0.hwvirt.u.vmx.fCalledSUPR0EnableVTx = true; + /* If the host provides a VT-x init API, then we'll rely on that for global init. */ + g_HmR0.fGlobalInit = pVM->hm.s.fGlobalInit = true; + } + else + AssertMsgFailed(("hmR0EnableAllCpuOnce/SUPR0EnableVTx: rc=%Rrc\n", rc)); + } + else + { + /* + * We're doing the job ourselves.
+ */ + /* Allocate one page per cpu for the global VT-x and AMD-V pages */ + for (unsigned i = 0; i < RT_ELEMENTS(g_HmR0.aCpuInfo); i++) + { + Assert(g_HmR0.aCpuInfo[i].hMemObj == NIL_RTR0MEMOBJ); +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + Assert(g_HmR0.aCpuInfo[i].n.svm.hNstGstMsrpm == NIL_RTR0MEMOBJ); +#endif + if (RTMpIsCpuPossible(RTMpCpuIdFromSetIndex(i))) + { + /** @todo NUMA */ + rc = RTR0MemObjAllocCont(&g_HmR0.aCpuInfo[i].hMemObj, PAGE_SIZE, false /* executable R0 mapping */); + AssertLogRelRCReturn(rc, rc); + + g_HmR0.aCpuInfo[i].HCPhysMemObj = RTR0MemObjGetPagePhysAddr(g_HmR0.aCpuInfo[i].hMemObj, 0); + Assert(g_HmR0.aCpuInfo[i].HCPhysMemObj != NIL_RTHCPHYS); + Assert(!(g_HmR0.aCpuInfo[i].HCPhysMemObj & PAGE_OFFSET_MASK)); + + g_HmR0.aCpuInfo[i].pvMemObj = RTR0MemObjAddress(g_HmR0.aCpuInfo[i].hMemObj); + AssertPtr(g_HmR0.aCpuInfo[i].pvMemObj); + ASMMemZeroPage(g_HmR0.aCpuInfo[i].pvMemObj); + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + rc = RTR0MemObjAllocCont(&g_HmR0.aCpuInfo[i].n.svm.hNstGstMsrpm, SVM_MSRPM_PAGES << X86_PAGE_4K_SHIFT, + false /* executable R0 mapping */); + AssertLogRelRCReturn(rc, rc); + + g_HmR0.aCpuInfo[i].n.svm.HCPhysNstGstMsrpm = RTR0MemObjGetPagePhysAddr(g_HmR0.aCpuInfo[i].n.svm.hNstGstMsrpm, 0); + Assert(g_HmR0.aCpuInfo[i].n.svm.HCPhysNstGstMsrpm != NIL_RTHCPHYS); + Assert(!(g_HmR0.aCpuInfo[i].n.svm.HCPhysNstGstMsrpm & PAGE_OFFSET_MASK)); + + g_HmR0.aCpuInfo[i].n.svm.pvNstGstMsrpm = RTR0MemObjAddress(g_HmR0.aCpuInfo[i].n.svm.hNstGstMsrpm); + AssertPtr(g_HmR0.aCpuInfo[i].n.svm.pvNstGstMsrpm); + ASMMemFill32(g_HmR0.aCpuInfo[i].n.svm.pvNstGstMsrpm, SVM_MSRPM_PAGES << X86_PAGE_4K_SHIFT, UINT32_C(0xffffffff)); +#endif + } + } + + rc = VINF_SUCCESS; + } + + if ( RT_SUCCESS(rc) + && g_HmR0.fGlobalInit) + { + /* First time, so initialize each cpu/core.
*/ + HMR0FIRSTRC FirstRc; + hmR0FirstRcInit(&FirstRc); + rc = RTMpOnAll(hmR0EnableCpuCallback, (void *)pVM, &FirstRc); + if (RT_SUCCESS(rc)) + rc = hmR0FirstRcGetStatus(&FirstRc); + } + + return rc; +} + + +/** + * Sets up HM on all cpus. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMR0_INT_DECL(int) HMR0EnableAllCpus(PVM pVM) +{ + /* Make sure we don't touch HM after we've disabled HM in preparation of a suspend. */ + if (ASMAtomicReadBool(&g_HmR0.fSuspended)) + return VERR_HM_SUSPEND_PENDING; + + return RTOnce(&g_HmR0.EnableAllCpusOnce, hmR0EnableAllCpuOnce, pVM); +} + + +/** + * Disable VT-x or AMD-V on the current CPU. + * + * @returns VBox status code: VINF_SUCCESS if the CPU has no per-CPU page and is + * not configured, or has a page but is not configured (nothing to do); + * VERR_NO_MEMORY if it is marked configured but has no per-CPU page; + * otherwise the status of the pfnDisableCpu method. + * @param idCpu The identifier for the CPU this function is called on. + * + * @remarks Must be called with preemption disabled. + */ +static int hmR0DisableCpu(RTCPUID idCpu) +{ + PHMPHYSCPU pHostCpu = &g_HmR0.aCpuInfo[idCpu]; + + Assert(!g_HmR0.hwvirt.u.vmx.fSupported || !g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(idCpu == (RTCPUID)RTMpCpuIdToSetIndex(idCpu)); /** @todo fix idCpu == index assumption (rainy day) */ + Assert(idCpu < RT_ELEMENTS(g_HmR0.aCpuInfo)); + Assert(!pHostCpu->fConfigured || pHostCpu->hMemObj != NIL_RTR0MEMOBJ); + AssertRelease(idCpu == RTMpCpuId()); + + if (pHostCpu->hMemObj == NIL_RTR0MEMOBJ) + return pHostCpu->fConfigured ? VERR_NO_MEMORY : VINF_SUCCESS /* not initialized. */; + AssertPtr(pHostCpu->pvMemObj); + Assert(pHostCpu->HCPhysMemObj != NIL_RTHCPHYS); + + int rc; + if (pHostCpu->fConfigured) + { + rc = g_HmR0.pfnDisableCpu(pHostCpu->pvMemObj, pHostCpu->HCPhysMemObj); + AssertRCReturn(rc, rc); + + pHostCpu->fConfigured = false; + pHostCpu->idCpu = NIL_RTCPUID; + } + else + rc = VINF_SUCCESS; /* nothing to do */ + return rc; +} + + +/** + * Worker function passed to RTMpOnAll() that is to be called on the target + * CPUs.
+ * + * @param idCpu The identifier for the CPU the function is called on. + * @param pvUser1 The 1st user argument. + * @param pvUser2 Opaque pointer to the FirstRc. + */ +static DECLCALLBACK(void) hmR0DisableCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + PHMR0FIRSTRC pFirstRc = (PHMR0FIRSTRC)pvUser2; NOREF(pvUser1); + AssertReturnVoid(g_HmR0.fGlobalInit); + hmR0FirstRcSetStatus(pFirstRc, hmR0DisableCpu(idCpu)); +} + + +/** + * Worker function passed to RTMpOnSpecific() that is to be called on the target + * CPU. + * + * @param idCpu The identifier for the CPU the function is called on. + * @param pvUser1 Null, not used. + * @param pvUser2 Null, not used. + */ +static DECLCALLBACK(void) hmR0DisableCpuOnSpecificCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2) +{ + NOREF(pvUser1); + NOREF(pvUser2); + hmR0DisableCpu(idCpu); +} + + +/** + * Callback function invoked when a cpu goes online or offline. + * + * @param enmEvent The Mp event. + * @param idCpu The identifier for the CPU the function is called on. + * @param pvData Opaque data (PVM pointer). + */ +static DECLCALLBACK(void) hmR0MpEventCallback(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvData) +{ + NOREF(pvData); + Assert(!g_HmR0.hwvirt.u.vmx.fSupported || !g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx); + + /* + * We only care about uninitializing a CPU that is going offline. When a + * CPU comes online, the initialization is done lazily in HMR0Enter(). 
+ */ + switch (enmEvent) + { + case RTMPEVENT_OFFLINE: + { + RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER; + RTThreadPreemptDisable(&PreemptState); + if (idCpu == RTMpCpuId()) + { + int rc = hmR0DisableCpu(idCpu); + AssertRC(rc); + RTThreadPreemptRestore(&PreemptState); + } + else + { + RTThreadPreemptRestore(&PreemptState); + RTMpOnSpecific(idCpu, hmR0DisableCpuOnSpecificCallback, NULL /* pvUser1 */, NULL /* pvUser2 */); + } + break; + } + + default: + break; + } +} + + +/** + * Called whenever a system power state change occurs. + * + * @param enmEvent The Power event. + * @param pvUser User argument. + */ +static DECLCALLBACK(void) hmR0PowerCallback(RTPOWEREVENT enmEvent, void *pvUser) +{ + NOREF(pvUser); + Assert(!g_HmR0.hwvirt.u.vmx.fSupported || !g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx); + +#ifdef LOG_ENABLED + if (enmEvent == RTPOWEREVENT_SUSPEND) + SUPR0Printf("hmR0PowerCallback RTPOWEREVENT_SUSPEND\n"); + else + SUPR0Printf("hmR0PowerCallback RTPOWEREVENT_RESUME\n"); +#endif + + if (enmEvent == RTPOWEREVENT_SUSPEND) + ASMAtomicWriteBool(&g_HmR0.fSuspended, true); + + if (g_HmR0.fEnabled) + { + int rc; + HMR0FIRSTRC FirstRc; + hmR0FirstRcInit(&FirstRc); + + if (enmEvent == RTPOWEREVENT_SUSPEND) + { + if (g_HmR0.fGlobalInit) + { + /* Turn off VT-x or AMD-V on all CPUs. */ + rc = RTMpOnAll(hmR0DisableCpuCallback, NULL /* pvUser 1 */, &FirstRc); + Assert(RT_SUCCESS(rc) || rc == VERR_NOT_SUPPORTED); + } + /* else nothing to do here for the local init case */ + } + else + { + /* Reinit the CPUs from scratch as the suspend state might have + messed with the MSRs. 
(lousy BIOSes as usual) */ + if (g_HmR0.hwvirt.u.vmx.fSupported) + rc = RTMpOnAll(hmR0InitIntelCpu, &FirstRc, NULL); + else + rc = RTMpOnAll(hmR0InitAmdCpu, &FirstRc, NULL); + Assert(RT_SUCCESS(rc) || rc == VERR_NOT_SUPPORTED); + if (RT_SUCCESS(rc)) + rc = hmR0FirstRcGetStatus(&FirstRc); +#ifdef LOG_ENABLED + if (RT_FAILURE(rc)) + SUPR0Printf("hmR0PowerCallback hmR0InitXxxCpu failed with %Rc\n", rc); +#endif + if (g_HmR0.fGlobalInit) + { + /* Turn VT-x or AMD-V back on on all CPUs. */ + rc = RTMpOnAll(hmR0EnableCpuCallback, NULL /* pVM */, &FirstRc /* output ignored */); + Assert(RT_SUCCESS(rc) || rc == VERR_NOT_SUPPORTED); + } + /* else nothing to do here for the local init case */ + } + } + + if (enmEvent == RTPOWEREVENT_RESUME) + ASMAtomicWriteBool(&g_HmR0.fSuspended, false); +} + + +/** + * Does ring-0 per-VM HM initialization. + * + * This will call the CPU specific init. routine which may initialize and allocate + * resources for virtual CPUs. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + * + * @remarks This is called after HMR3Init(), see vmR3CreateU() and + * vmR3InitRing3(). + */ +VMMR0_INT_DECL(int) HMR0InitVM(PVM pVM) +{ + AssertReturn(pVM, VERR_INVALID_PARAMETER); + + /* Make sure we don't touch HM after we've disabled HM in preparation of a suspend. */ + if (ASMAtomicReadBool(&g_HmR0.fSuspended)) + return VERR_HM_SUSPEND_PENDING; + + /* + * Copy globals to the VM structure. + */ + Assert(!(pVM->hm.s.vmx.fSupported && pVM->hm.s.svm.fSupported)); + if (pVM->hm.s.vmx.fSupported) + { + pVM->hm.s.vmx.fUsePreemptTimer &= g_HmR0.hwvirt.u.vmx.fUsePreemptTimer; /* Can be overridden by CFGM see HMR3Init(). 
*/ + pVM->hm.s.vmx.cPreemptTimerShift = g_HmR0.hwvirt.u.vmx.cPreemptTimerShift; + pVM->hm.s.vmx.u64HostCr4 = g_HmR0.hwvirt.u.vmx.u64HostCr4; + pVM->hm.s.vmx.u64HostEfer = g_HmR0.hwvirt.u.vmx.u64HostEfer; + pVM->hm.s.vmx.u64HostSmmMonitorCtl = g_HmR0.hwvirt.u.vmx.u64HostSmmMonitorCtl; + HMGetVmxMsrsFromHwvirtMsrs(&g_HmR0.hwvirt.Msrs, &pVM->hm.s.vmx.Msrs); + } + else if (pVM->hm.s.svm.fSupported) + { + pVM->hm.s.svm.u32Rev = g_HmR0.hwvirt.u.svm.u32Rev; + pVM->hm.s.svm.u32Features = g_HmR0.hwvirt.u.svm.u32Features; + pVM->hm.s.svm.u64MsrHwcr = g_HmR0.hwvirt.Msrs.u.svm.u64MsrHwcr; + } + pVM->hm.s.rcInit = g_HmR0.rcInit; + pVM->hm.s.uMaxAsid = g_HmR0.hwvirt.uMaxAsid; + + /* + * Set default maximum inner loops in ring-0 before returning to ring-3. + * Can be overriden using CFGM. + */ + if (!pVM->hm.s.cMaxResumeLoops) + { + pVM->hm.s.cMaxResumeLoops = 1024; + if (RTThreadPreemptIsPendingTrusty()) + pVM->hm.s.cMaxResumeLoops = 8192; + } + + /* + * Initialize some per-VCPU fields. + */ + for (VMCPUID i = 0; i < pVM->cCpus; i++) + { + PVMCPU pVCpu = &pVM->aCpus[i]; + pVCpu->hm.s.idEnteredCpu = NIL_RTCPUID; + pVCpu->hm.s.idLastCpu = NIL_RTCPUID; + + /* We'll aways increment this the first time (host uses ASID 0). */ + AssertReturn(!pVCpu->hm.s.uCurrentAsid, VERR_HM_IPE_3); + } + + /* + * Get host kernel features that HM might need to know in order + * to co-operate and function properly with the host OS (e.g. SMAP). + * + * Technically, we could do this as part of the pre-init VM procedure + * but it shouldn't be done later than this point so we do it here. + */ + pVM->hm.s.fHostKernelFeatures = SUPR0GetKernelFeatures(); + + /* + * Call the hardware specific initialization method. + */ + return g_HmR0.pfnInitVM(pVM); +} + + +/** + * Does ring-0 per VM HM termination. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. 
+ */ +VMMR0_INT_DECL(int) HMR0TermVM(PVM pVM) +{ + Log(("HMR0TermVM: %p\n", pVM)); + AssertReturn(pVM, VERR_INVALID_PARAMETER); + + /* + * Call the hardware specific method. + * + * Note! We might be preparing for a suspend, so the pfnTermVM() functions should probably not + * mess with VT-x/AMD-V features on the CPU, currently all they do is free memory so this is safe. + */ + return g_HmR0.pfnTermVM(pVM); +} + + +/** + * Sets up a VT-x or AMD-V session. + * + * This is mostly about setting up the hardware VM state. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMR0_INT_DECL(int) HMR0SetupVM(PVM pVM) +{ + Log(("HMR0SetupVM: %p\n", pVM)); + AssertReturn(pVM, VERR_INVALID_PARAMETER); + + /* Make sure we don't touch HM after we've disabled HM in preparation of a suspend. */ + AssertReturn(!ASMAtomicReadBool(&g_HmR0.fSuspended), VERR_HM_SUSPEND_PENDING); + + /* On first entry we'll sync everything. */ + for (VMCPUID i = 0; i < pVM->cCpus; i++) + { + PVMCPU pVCpu = &pVM->aCpus[i]; + pVCpu->hm.s.fCtxChanged |= HM_CHANGED_HOST_CONTEXT | HM_CHANGED_ALL_GUEST; + } + + /* + * Call the hardware specific setup VM method. This requires the CPU to be + * enabled for AMD-V/VT-x and preemption to be prevented. + */ + RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER; + RTThreadPreemptDisable(&PreemptState); + RTCPUID const idCpu = RTMpCpuId(); + + /* Enable VT-x or AMD-V if local init is required. */ + int rc; + if (!g_HmR0.fGlobalInit) + { + Assert(!g_HmR0.hwvirt.u.vmx.fSupported || !g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx); + rc = hmR0EnableCpu(pVM, idCpu); + if (RT_FAILURE(rc)) + { + RTThreadPreemptRestore(&PreemptState); + return rc; + } + } + + /* Setup VT-x or AMD-V. */ + rc = g_HmR0.pfnSetupVM(pVM); + + /* Disable VT-x or AMD-V if local init was done before. 
*/ + if (!g_HmR0.fGlobalInit) + { + Assert(!g_HmR0.hwvirt.u.vmx.fSupported || !g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx); + int rc2 = hmR0DisableCpu(idCpu); + AssertRC(rc2); + } + + RTThreadPreemptRestore(&PreemptState); + return rc; +} + + +/** + * Turns on HM on the CPU if necessary and initializes the bare minimum state + * required for entering HM context. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +VMMR0_INT_DECL(int) hmR0EnterCpu(PVMCPU pVCpu) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + int rc = VINF_SUCCESS; + RTCPUID const idCpu = RTMpCpuId(); + PHMPHYSCPU pHostCpu = &g_HmR0.aCpuInfo[idCpu]; + AssertPtr(pHostCpu); + + /* Enable VT-x or AMD-V if local init is required, or enable if it's a freshly onlined CPU. */ + if (!pHostCpu->fConfigured) + rc = hmR0EnableCpu(pVCpu->CTX_SUFF(pVM), idCpu); + + /* Reload host-state (back from ring-3/migrated CPUs) and shared guest/host bits. */ + if (g_HmR0.hwvirt.u.vmx.fSupported) + pVCpu->hm.s.fCtxChanged |= HM_CHANGED_HOST_CONTEXT | HM_CHANGED_VMX_HOST_GUEST_SHARED_STATE; + else + pVCpu->hm.s.fCtxChanged |= HM_CHANGED_HOST_CONTEXT | HM_CHANGED_SVM_HOST_GUEST_SHARED_STATE; + + Assert(pHostCpu->idCpu == idCpu && pHostCpu->idCpu != NIL_RTCPUID); + pVCpu->hm.s.idEnteredCpu = idCpu; + return rc; +} + + +/** + * Enters the VT-x or AMD-V session. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks This is called with preemption disabled. + */ +VMMR0_INT_DECL(int) HMR0Enter(PVMCPU pVCpu) +{ + /* Make sure we can't enter a session after we've disabled HM in preparation of a suspend. */ + AssertReturn(!ASMAtomicReadBool(&g_HmR0.fSuspended), VERR_HM_SUSPEND_PENDING); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + /* Load the bare minimum state required for entering HM. 
*/ + int rc = hmR0EnterCpu(pVCpu); + if (RT_SUCCESS(rc)) + { + if (g_HmR0.hwvirt.u.vmx.fSupported) + { + Assert((pVCpu->hm.s.fCtxChanged & (HM_CHANGED_HOST_CONTEXT | HM_CHANGED_VMX_HOST_GUEST_SHARED_STATE)) + == (HM_CHANGED_HOST_CONTEXT | HM_CHANGED_VMX_HOST_GUEST_SHARED_STATE)); + } + else + { + Assert((pVCpu->hm.s.fCtxChanged & (HM_CHANGED_HOST_CONTEXT | HM_CHANGED_SVM_HOST_GUEST_SHARED_STATE)) + == (HM_CHANGED_HOST_CONTEXT | HM_CHANGED_SVM_HOST_GUEST_SHARED_STATE)); + } + +#ifdef VBOX_WITH_2X_4GB_ADDR_SPACE + AssertReturn(!VMMR0ThreadCtxHookIsEnabled(pVCpu), VERR_HM_IPE_5); + bool const fStartedSet = PGMR0DynMapStartOrMigrateAutoSet(pVCpu); +#endif + + /* Keep track of the CPU owning the VMCS for debugging scheduling weirdness and ring-3 calls. */ + rc = g_HmR0.pfnEnterSession(pVCpu); + AssertMsgRCReturnStmt(rc, ("rc=%Rrc pVCpu=%p\n", rc, pVCpu), pVCpu->hm.s.idEnteredCpu = NIL_RTCPUID, rc); + + /* Exports the host-state as we may be resuming code after a longjmp and quite + possibly now be scheduled on a different CPU. */ + rc = g_HmR0.pfnExportHostState(pVCpu); + AssertMsgRCReturnStmt(rc, ("rc=%Rrc pVCpu=%p\n", rc, pVCpu), pVCpu->hm.s.idEnteredCpu = NIL_RTCPUID, rc); + +#ifdef VBOX_WITH_2X_4GB_ADDR_SPACE + if (fStartedSet) + PGMRZDynMapReleaseAutoSet(pVCpu); +#endif + } + return rc; +} + + +/** + * Deinitializes the bare minimum state used for HM context and if necessary + * disable HM on the CPU. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! 
+ */ +VMMR0_INT_DECL(int) HMR0LeaveCpu(PVMCPU pVCpu) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + VMCPU_ASSERT_EMT_RETURN(pVCpu, VERR_HM_WRONG_CPU); + + RTCPUID const idCpu = RTMpCpuId(); + PCHMPHYSCPU pHostCpu = &g_HmR0.aCpuInfo[idCpu]; + + if ( !g_HmR0.fGlobalInit + && pHostCpu->fConfigured) + { + int rc = hmR0DisableCpu(idCpu); + AssertRCReturn(rc, rc); + Assert(!pHostCpu->fConfigured); + Assert(pHostCpu->idCpu == NIL_RTCPUID); + + /* For obtaining a non-zero ASID/VPID on next re-entry. */ + pVCpu->hm.s.idLastCpu = NIL_RTCPUID; + } + + /* Clear it while leaving HM context, hmPokeCpuForTlbFlush() relies on this. */ + pVCpu->hm.s.idEnteredCpu = NIL_RTCPUID; + + return VINF_SUCCESS; +} + + +/** + * Thread-context hook for HM. + * + * @param enmEvent The thread-context event. + * @param pvUser Opaque pointer to the VMCPU. + */ +VMMR0_INT_DECL(void) HMR0ThreadCtxCallback(RTTHREADCTXEVENT enmEvent, void *pvUser) +{ + PVMCPU pVCpu = (PVMCPU)pvUser; + Assert(pVCpu); + Assert(g_HmR0.pfnThreadCtxCallback); + + g_HmR0.pfnThreadCtxCallback(enmEvent, pVCpu, g_HmR0.fGlobalInit); +} + + +/** + * Runs guest code in a hardware accelerated VM. + * + * @returns Strict VBox status code. (VBOXSTRICTRC isn't used because it's + * called from setjmp assembly.) + * @param pVM The cross context VM structure. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Can be called with preemption enabled if thread-context hooks are + * used!!! + */ +VMMR0_INT_DECL(int) HMR0RunGuestCode(PVM pVM, PVMCPU pVCpu) +{ + RT_NOREF(pVM); + +#ifdef VBOX_STRICT + /* With thread-context hooks we would be running this code with preemption enabled. 
*/ + if (!RTThreadPreemptIsEnabled(NIL_RTTHREAD)) + { + PCHMPHYSCPU pHostCpu = &g_HmR0.aCpuInfo[RTMpCpuId()]; + Assert(!VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3 | VMCPU_FF_PGM_SYNC_CR3_NON_GLOBAL)); + Assert(pHostCpu->fConfigured); + AssertReturn(!ASMAtomicReadBool(&g_HmR0.fSuspended), VERR_HM_SUSPEND_PENDING); + } +#endif + +#ifdef VBOX_WITH_2X_4GB_ADDR_SPACE + AssertReturn(!VMMR0ThreadCtxHookIsEnabled(pVCpu), VERR_HM_IPE_4); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + PGMRZDynMapStartAutoSet(pVCpu); +#endif + + VBOXSTRICTRC rcStrict = g_HmR0.pfnRunGuestCode(pVCpu); + +#ifdef VBOX_WITH_2X_4GB_ADDR_SPACE + PGMRZDynMapReleaseAutoSet(pVCpu); +#endif + return VBOXSTRICTRC_VAL(rcStrict); +} + + +/** + * Notification from CPUM that it has unloaded the guest FPU/SSE/AVX state from + * the host CPU and that guest access to it must be intercepted. + * + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + */ +VMMR0_INT_DECL(void) HMR0NotifyCpumUnloadedGuestFpuState(PVMCPU pVCpu) +{ + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_CR0); +} + + +/** + * Notification from CPUM that it has modified the host CR0 (because of FPU). + * + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + */ +VMMR0_INT_DECL(void) HMR0NotifyCpumModifiedHostCr0(PVMCPU pVCpu) +{ + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_HOST_CONTEXT); +} + + +#if HC_ARCH_BITS == 32 && defined(VBOX_ENABLE_64_BITS_GUESTS) + +/** + * Save guest FPU/XMM state (64 bits guest mode & 32 bits host only) + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + * @param pVCpu The cross context virtual CPU structure. + * @param pCtx Pointer to the guest CPU context. 
+ */ +VMMR0_INT_DECL(int) HMR0SaveFPUState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx) +{ + RT_NOREF(pCtx); + STAM_COUNTER_INC(&pVCpu->hm.s.StatFpu64SwitchBack); + if (pVM->hm.s.vmx.fSupported) + return VMXR0Execute64BitsHandler(pVCpu, HM64ON32OP_HMRCSaveGuestFPU64, 0, NULL); + return SVMR0Execute64BitsHandler(pVCpu, HM64ON32OP_HMRCSaveGuestFPU64, 0, NULL); +} + + +/** + * Save guest debug state (64 bits guest mode & 32 bits host only) + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + * @param pVCpu The cross context virtual CPU structure. + * @param pCtx Pointer to the guest CPU context. + */ +VMMR0_INT_DECL(int) HMR0SaveDebugState(PVM pVM, PVMCPU pVCpu, PCPUMCTX pCtx) +{ + RT_NOREF(pCtx); + STAM_COUNTER_INC(&pVCpu->hm.s.StatDebug64SwitchBack); + if (pVM->hm.s.vmx.fSupported) + return VMXR0Execute64BitsHandler(pVCpu, HM64ON32OP_HMRCSaveGuestDebug64, 0, NULL); + return SVMR0Execute64BitsHandler(pVCpu, HM64ON32OP_HMRCSaveGuestDebug64, 0, NULL); +} + + +/** + * Test the 32->64 bits switcher. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMR0_INT_DECL(int) HMR0TestSwitcher3264(PVM pVM) +{ + PVMCPU pVCpu = &pVM->aCpus[0]; + uint32_t aParam[5] = { 0, 1, 2, 3, 4 }; + int rc; + + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatWorldSwitch3264, z); + if (pVM->hm.s.vmx.fSupported) + rc = VMXR0Execute64BitsHandler(pVCpu, HM64ON32OP_HMRCTestSwitcher64, 5, &aParam[0]); + else + rc = SVMR0Execute64BitsHandler(pVCpu, HM64ON32OP_HMRCTestSwitcher64, 5, &aParam[0]); + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatWorldSwitch3264, z); + + return rc; +} + +#endif /* HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) */ + +/** + * Returns suspend status of the host. + * + * @returns Suspend pending or not. + */ +VMMR0_INT_DECL(bool) HMR0SuspendPending(void) +{ + return ASMAtomicReadBool(&g_HmR0.fSuspended); +} + + +/** + * Invalidates a guest page from the host TLB. 
+ * + * @param pVCpu The cross context virtual CPU structure. + * @param GCVirt Page to invalidate. + */ +VMMR0_INT_DECL(int) HMR0InvalidatePage(PVMCPU pVCpu, RTGCPTR GCVirt) +{ + PVM pVM = pVCpu->CTX_SUFF(pVM); + if (pVM->hm.s.vmx.fSupported) + return VMXR0InvalidatePage(pVCpu, GCVirt); + return SVMR0InvalidatePage(pVCpu, GCVirt); +} + + +/** + * Returns the cpu structure for the current cpu. + * Keep in mind that there is no guarantee it will stay the same (long jumps to ring 3!!!). + * + * @returns The cpu structure pointer. + */ +VMMR0_INT_DECL(PHMPHYSCPU) hmR0GetCurrentCpu(void) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + RTCPUID const idCpu = RTMpCpuId(); + Assert(idCpu < RT_ELEMENTS(g_HmR0.aCpuInfo)); + return &g_HmR0.aCpuInfo[idCpu]; +} + + +/** + * Interface for importing state on demand (used by IEM). + * + * @returns VBox status code. + * @param pVCpu The cross context CPU structure. + * @param fWhat What to import, CPUMCTX_EXTRN_XXX. + */ +VMMR0_INT_DECL(int) HMR0ImportStateOnDemand(PVMCPU pVCpu, uint64_t fWhat) +{ + if (pVCpu->CTX_SUFF(pVM)->hm.s.vmx.fSupported) + return VMXR0ImportStateOnDemand(pVCpu, fWhat); + return SVMR0ImportStateOnDemand(pVCpu, fWhat); +} + + +#ifdef VBOX_WITH_RAW_MODE +/** + * Raw-mode switcher hook - disable VT-x if it's active *and* the current + * switcher turns off paging. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + * @param enmSwitcher The switcher we're about to use. + * @param pfVTxDisabled Where to store whether VT-x was disabled or not. + */ +VMMR0_INT_DECL(int) HMR0EnterSwitcher(PVM pVM, VMMSWITCHER enmSwitcher, bool *pfVTxDisabled) +{ + NOREF(pVM); + + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + *pfVTxDisabled = false; + + /* No such issues with AMD-V */ + if (!g_HmR0.hwvirt.u.vmx.fSupported) + return VINF_SUCCESS; + + /* Check if the switching we're up to is safe. 
*/ + switch (enmSwitcher) + { + case VMMSWITCHER_32_TO_32: + case VMMSWITCHER_PAE_TO_PAE: + return VINF_SUCCESS; /* safe switchers as they don't turn off paging */ + + case VMMSWITCHER_32_TO_PAE: + case VMMSWITCHER_PAE_TO_32: /* is this one actually used?? */ + case VMMSWITCHER_AMD64_TO_32: + case VMMSWITCHER_AMD64_TO_PAE: + break; /* unsafe switchers */ + + default: + AssertFailedReturn(VERR_HM_WRONG_SWITCHER); + } + + /* When using SUPR0EnableVTx we must let the host suspend and resume VT-x, + regardless of whether we're currently using VT-x or not. */ + if (g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx) + { + *pfVTxDisabled = SUPR0SuspendVTxOnCpu(); + return VINF_SUCCESS; + } + + /** @todo Check if this code is presumptive wrt other VT-x users on the + * system... */ + + /* Nothing to do if we haven't enabled VT-x. */ + if (!g_HmR0.fEnabled) + return VINF_SUCCESS; + + /* Local init implies the CPU is currently not in VMX root mode. */ + if (!g_HmR0.fGlobalInit) + return VINF_SUCCESS; + + /* Ok, disable VT-x. */ + PCHMPHYSCPU pHostCpu = hmR0GetCurrentCpu(); + AssertReturn( pHostCpu + && pHostCpu->hMemObj != NIL_RTR0MEMOBJ + && pHostCpu->pvMemObj + && pHostCpu->HCPhysMemObj != NIL_RTHCPHYS, + VERR_HM_IPE_2); + + *pfVTxDisabled = true; + return VMXR0DisableCpu(pHostCpu->pvMemObj, pHostCpu->HCPhysMemObj); +} + + +/** + * Raw-mode switcher hook - re-enable VT-x if was active *and* the current + * switcher turned off paging. + * + * @param pVM The cross context VM structure. + * @param fVTxDisabled Whether VT-x was disabled or not. 
+ */ +VMMR0_INT_DECL(void) HMR0LeaveSwitcher(PVM pVM, bool fVTxDisabled) +{ + Assert(!ASMIntAreEnabled()); + + if (!fVTxDisabled) + return; /* nothing to do */ + + Assert(g_HmR0.hwvirt.u.vmx.fSupported); + if (g_HmR0.hwvirt.u.vmx.fUsingSUPR0EnableVTx) + SUPR0ResumeVTxOnCpu(fVTxDisabled); + else + { + Assert(g_HmR0.fEnabled); + Assert(g_HmR0.fGlobalInit); + + PHMPHYSCPU pHostCpu = hmR0GetCurrentCpu(); + AssertReturnVoid( pHostCpu + && pHostCpu->hMemObj != NIL_RTR0MEMOBJ + && pHostCpu->pvMemObj + && pHostCpu->HCPhysMemObj != NIL_RTHCPHYS); + + VMXR0EnableCpu(pHostCpu, pVM, pHostCpu->pvMemObj, pHostCpu->HCPhysMemObj, false, &g_HmR0.hwvirt.Msrs); + } +} +#endif /* VBOX_WITH_RAW_MODE */ + + +#ifdef VBOX_STRICT +/** + * Dumps a descriptor. + * + * @param pDesc Descriptor to dump. + * @param Sel Selector number. + * @param pszMsg Message to prepend the log entry with. + */ +VMMR0_INT_DECL(void) hmR0DumpDescriptor(PCX86DESCHC pDesc, RTSEL Sel, const char *pszMsg) +{ + /* + * Make variable description string. 
+ */ + static struct + { + unsigned cch; + const char *psz; + } const s_aTypes[32] = + { +# define STRENTRY(str) { sizeof(str) - 1, str } + + /* system */ +# if HC_ARCH_BITS == 64 + STRENTRY("Reserved0 "), /* 0x00 */ + STRENTRY("Reserved1 "), /* 0x01 */ + STRENTRY("LDT "), /* 0x02 */ + STRENTRY("Reserved3 "), /* 0x03 */ + STRENTRY("Reserved4 "), /* 0x04 */ + STRENTRY("Reserved5 "), /* 0x05 */ + STRENTRY("Reserved6 "), /* 0x06 */ + STRENTRY("Reserved7 "), /* 0x07 */ + STRENTRY("Reserved8 "), /* 0x08 */ + STRENTRY("TSS64Avail "), /* 0x09 */ + STRENTRY("ReservedA "), /* 0x0a */ + STRENTRY("TSS64Busy "), /* 0x0b */ + STRENTRY("Call64 "), /* 0x0c */ + STRENTRY("ReservedD "), /* 0x0d */ + STRENTRY("Int64 "), /* 0x0e */ + STRENTRY("Trap64 "), /* 0x0f */ +# else + STRENTRY("Reserved0 "), /* 0x00 */ + STRENTRY("TSS16Avail "), /* 0x01 */ + STRENTRY("LDT "), /* 0x02 */ + STRENTRY("TSS16Busy "), /* 0x03 */ + STRENTRY("Call16 "), /* 0x04 */ + STRENTRY("Task "), /* 0x05 */ + STRENTRY("Int16 "), /* 0x06 */ + STRENTRY("Trap16 "), /* 0x07 */ + STRENTRY("Reserved8 "), /* 0x08 */ + STRENTRY("TSS32Avail "), /* 0x09 */ + STRENTRY("ReservedA "), /* 0x0a */ + STRENTRY("TSS32Busy "), /* 0x0b */ + STRENTRY("Call32 "), /* 0x0c */ + STRENTRY("ReservedD "), /* 0x0d */ + STRENTRY("Int32 "), /* 0x0e */ + STRENTRY("Trap32 "), /* 0x0f */ +# endif + /* non system */ + STRENTRY("DataRO "), /* 0x10 */ + STRENTRY("DataRO Accessed "), /* 0x11 */ + STRENTRY("DataRW "), /* 0x12 */ + STRENTRY("DataRW Accessed "), /* 0x13 */ + STRENTRY("DataDownRO "), /* 0x14 */ + STRENTRY("DataDownRO Accessed "), /* 0x15 */ + STRENTRY("DataDownRW "), /* 0x16 */ + STRENTRY("DataDownRW Accessed "), /* 0x17 */ + STRENTRY("CodeEO "), /* 0x18 */ + STRENTRY("CodeEO Accessed "), /* 0x19 */ + STRENTRY("CodeER "), /* 0x1a */ + STRENTRY("CodeER Accessed "), /* 0x1b */ + STRENTRY("CodeConfEO "), /* 0x1c */ + STRENTRY("CodeConfEO Accessed "), /* 0x1d */ + STRENTRY("CodeConfER "), /* 0x1e */ + STRENTRY("CodeConfER Accessed ") /* 0x1f 
*/ +# undef SYSENTRY + }; +# define ADD_STR(psz, pszAdd) do { strcpy(psz, pszAdd); psz += strlen(pszAdd); } while (0) + char szMsg[128]; + char *psz = &szMsg[0]; + unsigned i = pDesc->Gen.u1DescType << 4 | pDesc->Gen.u4Type; + memcpy(psz, s_aTypes[i].psz, s_aTypes[i].cch); + psz += s_aTypes[i].cch; + + if (pDesc->Gen.u1Present) + ADD_STR(psz, "Present "); + else + ADD_STR(psz, "Not-Present "); +# if HC_ARCH_BITS == 64 + if (pDesc->Gen.u1Long) + ADD_STR(psz, "64-bit "); + else + ADD_STR(psz, "Comp "); +# else + if (pDesc->Gen.u1Granularity) + ADD_STR(psz, "Page "); + if (pDesc->Gen.u1DefBig) + ADD_STR(psz, "32-bit "); + else + ADD_STR(psz, "16-bit "); +# endif +# undef ADD_STR + *psz = '\0'; + + /* + * Limit and Base and format the output. + */ +#ifdef LOG_ENABLED + uint32_t u32Limit = X86DESC_LIMIT_G(pDesc); + +# if HC_ARCH_BITS == 64 + uint64_t u32Base = X86DESC64_BASE(pDesc); + Log(("%s %04x - %RX64 %RX64 - base=%RX64 limit=%08x dpl=%d %s\n", pszMsg, + Sel, pDesc->au64[0], pDesc->au64[1], u32Base, u32Limit, pDesc->Gen.u2Dpl, szMsg)); +# else + uint32_t u32Base = X86DESC_BASE(pDesc); + Log(("%s %04x - %08x %08x - base=%08x limit=%08x dpl=%d %s\n", pszMsg, + Sel, pDesc->au32[0], pDesc->au32[1], u32Base, u32Limit, pDesc->Gen.u2Dpl, szMsg)); +# endif +#else + NOREF(Sel); NOREF(pszMsg); +#endif +} + + +/** + * Formats a full register dump. + * + * @param pVCpu The cross context virtual CPU structure. + */ +VMMR0_INT_DECL(void) hmR0DumpRegs(PVMCPU pVCpu) +{ + /* + * Format the flags. 
+ */ + static struct + { + const char *pszSet; const char *pszClear; uint32_t fFlag; + } const s_aFlags[] = + { + { "vip", NULL, X86_EFL_VIP }, + { "vif", NULL, X86_EFL_VIF }, + { "ac", NULL, X86_EFL_AC }, + { "vm", NULL, X86_EFL_VM }, + { "rf", NULL, X86_EFL_RF }, + { "nt", NULL, X86_EFL_NT }, + { "ov", "nv", X86_EFL_OF }, + { "dn", "up", X86_EFL_DF }, + { "ei", "di", X86_EFL_IF }, + { "tf", NULL, X86_EFL_TF }, + { "nt", "pl", X86_EFL_SF }, + { "nz", "zr", X86_EFL_ZF }, + { "ac", "na", X86_EFL_AF }, + { "po", "pe", X86_EFL_PF }, + { "cy", "nc", X86_EFL_CF }, + }; + char szEFlags[80]; + char *psz = szEFlags; + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + uint32_t uEFlags = pCtx->eflags.u32; + for (unsigned i = 0; i < RT_ELEMENTS(s_aFlags); i++) + { + const char *pszAdd = s_aFlags[i].fFlag & uEFlags ? s_aFlags[i].pszSet : s_aFlags[i].pszClear; + if (pszAdd) + { + strcpy(psz, pszAdd); + psz += strlen(pszAdd); + *psz++ = ' '; + } + } + psz[-1] = '\0'; + + /* + * Format the registers. + */ + if (CPUMIsGuestIn64BitCode(pVCpu)) + { + Log(("rax=%016RX64 rbx=%016RX64 rcx=%016RX64 rdx=%016RX64\n" + "rsi=%016RX64 rdi=%016RX64 r8 =%016RX64 r9 =%016RX64\n" + "r10=%016RX64 r11=%016RX64 r12=%016RX64 r13=%016RX64\n" + "r14=%016RX64 r15=%016RX64\n" + "rip=%016RX64 rsp=%016RX64 rbp=%016RX64 iopl=%d %*s\n" + "cs={%04x base=%016RX64 limit=%08x flags=%08x}\n" + "ds={%04x base=%016RX64 limit=%08x flags=%08x}\n" + "es={%04x base=%016RX64 limit=%08x flags=%08x}\n" + "fs={%04x base=%016RX64 limit=%08x flags=%08x}\n" + "gs={%04x base=%016RX64 limit=%08x flags=%08x}\n" + "ss={%04x base=%016RX64 limit=%08x flags=%08x}\n" + "cr0=%016RX64 cr2=%016RX64 cr3=%016RX64 cr4=%016RX64\n" + "dr0=%016RX64 dr1=%016RX64 dr2=%016RX64 dr3=%016RX64\n" + "dr4=%016RX64 dr5=%016RX64 dr6=%016RX64 dr7=%016RX64\n" + "gdtr=%016RX64:%04x idtr=%016RX64:%04x eflags=%08x\n" + "ldtr={%04x base=%08RX64 limit=%08x flags=%08x}\n" + "tr ={%04x base=%08RX64 limit=%08x flags=%08x}\n" + "SysEnter={cs=%04llx eip=%08llx 
esp=%08llx}\n" + , + pCtx->rax, pCtx->rbx, pCtx->rcx, pCtx->rdx, pCtx->rsi, pCtx->rdi, + pCtx->r8, pCtx->r9, pCtx->r10, pCtx->r11, pCtx->r12, pCtx->r13, + pCtx->r14, pCtx->r15, + pCtx->rip, pCtx->rsp, pCtx->rbp, X86_EFL_GET_IOPL(uEFlags), 31, szEFlags, + pCtx->cs.Sel, pCtx->cs.u64Base, pCtx->cs.u32Limit, pCtx->cs.Attr.u, + pCtx->ds.Sel, pCtx->ds.u64Base, pCtx->ds.u32Limit, pCtx->ds.Attr.u, + pCtx->es.Sel, pCtx->es.u64Base, pCtx->es.u32Limit, pCtx->es.Attr.u, + pCtx->fs.Sel, pCtx->fs.u64Base, pCtx->fs.u32Limit, pCtx->fs.Attr.u, + pCtx->gs.Sel, pCtx->gs.u64Base, pCtx->gs.u32Limit, pCtx->gs.Attr.u, + pCtx->ss.Sel, pCtx->ss.u64Base, pCtx->ss.u32Limit, pCtx->ss.Attr.u, + pCtx->cr0, pCtx->cr2, pCtx->cr3, pCtx->cr4, + pCtx->dr[0], pCtx->dr[1], pCtx->dr[2], pCtx->dr[3], + pCtx->dr[4], pCtx->dr[5], pCtx->dr[6], pCtx->dr[7], + pCtx->gdtr.pGdt, pCtx->gdtr.cbGdt, pCtx->idtr.pIdt, pCtx->idtr.cbIdt, uEFlags, + pCtx->ldtr.Sel, pCtx->ldtr.u64Base, pCtx->ldtr.u32Limit, pCtx->ldtr.Attr.u, + pCtx->tr.Sel, pCtx->tr.u64Base, pCtx->tr.u32Limit, pCtx->tr.Attr.u, + pCtx->SysEnter.cs, pCtx->SysEnter.eip, pCtx->SysEnter.esp)); + } + else + Log(("eax=%08x ebx=%08x ecx=%08x edx=%08x esi=%08x edi=%08x\n" + "eip=%08x esp=%08x ebp=%08x iopl=%d %*s\n" + "cs={%04x base=%016RX64 limit=%08x flags=%08x} dr0=%08RX64 dr1=%08RX64\n" + "ds={%04x base=%016RX64 limit=%08x flags=%08x} dr2=%08RX64 dr3=%08RX64\n" + "es={%04x base=%016RX64 limit=%08x flags=%08x} dr4=%08RX64 dr5=%08RX64\n" + "fs={%04x base=%016RX64 limit=%08x flags=%08x} dr6=%08RX64 dr7=%08RX64\n" + "gs={%04x base=%016RX64 limit=%08x flags=%08x} cr0=%08RX64 cr2=%08RX64\n" + "ss={%04x base=%016RX64 limit=%08x flags=%08x} cr3=%08RX64 cr4=%08RX64\n" + "gdtr=%016RX64:%04x idtr=%016RX64:%04x eflags=%08x\n" + "ldtr={%04x base=%08RX64 limit=%08x flags=%08x}\n" + "tr ={%04x base=%08RX64 limit=%08x flags=%08x}\n" + "SysEnter={cs=%04llx eip=%08llx esp=%08llx}\n" + , + pCtx->eax, pCtx->ebx, pCtx->ecx, pCtx->edx, pCtx->esi, pCtx->edi, + pCtx->eip, 
pCtx->esp, pCtx->ebp, X86_EFL_GET_IOPL(uEFlags), 31, szEFlags, + pCtx->cs.Sel, pCtx->cs.u64Base, pCtx->cs.u32Limit, pCtx->cs.Attr.u, pCtx->dr[0], pCtx->dr[1], + pCtx->ds.Sel, pCtx->ds.u64Base, pCtx->ds.u32Limit, pCtx->ds.Attr.u, pCtx->dr[2], pCtx->dr[3], + pCtx->es.Sel, pCtx->es.u64Base, pCtx->es.u32Limit, pCtx->es.Attr.u, pCtx->dr[4], pCtx->dr[5], + pCtx->fs.Sel, pCtx->fs.u64Base, pCtx->fs.u32Limit, pCtx->fs.Attr.u, pCtx->dr[6], pCtx->dr[7], + pCtx->gs.Sel, pCtx->gs.u64Base, pCtx->gs.u32Limit, pCtx->gs.Attr.u, pCtx->cr0, pCtx->cr2, + pCtx->ss.Sel, pCtx->ss.u64Base, pCtx->ss.u32Limit, pCtx->ss.Attr.u, pCtx->cr3, pCtx->cr4, + pCtx->gdtr.pGdt, pCtx->gdtr.cbGdt, pCtx->idtr.pIdt, pCtx->idtr.cbIdt, uEFlags, + pCtx->ldtr.Sel, pCtx->ldtr.u64Base, pCtx->ldtr.u32Limit, pCtx->ldtr.Attr.u, + pCtx->tr.Sel, pCtx->tr.u64Base, pCtx->tr.u32Limit, pCtx->tr.Attr.u, + pCtx->SysEnter.cs, pCtx->SysEnter.eip, pCtx->SysEnter.esp)); + + PX86FXSTATE pFpuCtx = &pCtx->CTX_SUFF(pXState)->x87; + Log(("FPU:\n" + "FCW=%04x FSW=%04x FTW=%02x\n" + "FOP=%04x FPUIP=%08x CS=%04x Rsrvd1=%04x\n" + "FPUDP=%04x DS=%04x Rsvrd2=%04x MXCSR=%08x MXCSR_MASK=%08x\n" + , + pFpuCtx->FCW, pFpuCtx->FSW, pFpuCtx->FTW, + pFpuCtx->FOP, pFpuCtx->FPUIP, pFpuCtx->CS, pFpuCtx->Rsrvd1, + pFpuCtx->FPUDP, pFpuCtx->DS, pFpuCtx->Rsrvd2, + pFpuCtx->MXCSR, pFpuCtx->MXCSR_MASK)); + + Log(("MSR:\n" + "EFER =%016RX64\n" + "PAT =%016RX64\n" + "STAR =%016RX64\n" + "CSTAR =%016RX64\n" + "LSTAR =%016RX64\n" + "SFMASK =%016RX64\n" + "KERNELGSBASE =%016RX64\n", + pCtx->msrEFER, + pCtx->msrPAT, + pCtx->msrSTAR, + pCtx->msrCSTAR, + pCtx->msrLSTAR, + pCtx->msrSFMASK, + pCtx->msrKERNELGSBASE)); + + NOREF(pFpuCtx); +} +#endif /* VBOX_STRICT */ + diff --git a/src/VBox/VMM/VMMR0/HMR0A.asm b/src/VBox/VMM/VMMR0/HMR0A.asm new file mode 100644 index 00000000..3db49a1e --- /dev/null +++ b/src/VBox/VMM/VMMR0/HMR0A.asm @@ -0,0 +1,2184 @@ +; $Id: HMR0A.asm $ +;; @file +; HM - Ring-0 VMX, SVM world-switch and helper routines +; + +; +; Copyright (C) 
2006-2019 Oracle Corporation +; +; This file is part of VirtualBox Open Source Edition (OSE), as +; available from http://www.virtualbox.org. This file is free software; +; you can redistribute it and/or modify it under the terms of the GNU +; General Public License (GPL) as published by the Free Software +; Foundation, in version 2 as it comes in the "COPYING" file of the +; VirtualBox OSE distribution. VirtualBox OSE is distributed in the +; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. +; + +;********************************************************************************************************************************* +;* Header Files * +;********************************************************************************************************************************* +%include "VBox/asmdefs.mac" +%include "VBox/err.mac" +%include "VBox/vmm/hm_vmx.mac" +%include "VBox/vmm/cpum.mac" +%include "VBox/vmm/vm.mac" +%include "iprt/x86.mac" +%include "HMInternal.mac" + +%ifdef RT_OS_OS2 ;; @todo fix OMF support in yasm and kick nasm out completely. + %macro vmwrite 2, + int3 + %endmacro + %define vmlaunch int3 + %define vmresume int3 + %define vmsave int3 + %define vmload int3 + %define vmrun int3 + %define clgi int3 + %define stgi int3 + %macro invlpga 2, + int3 + %endmacro +%endif + +;********************************************************************************************************************************* +;* Defined Constants And Macros * +;********************************************************************************************************************************* +;; The offset of the XMM registers in X86FXSTATE. +; Use define because I'm too lazy to convert the struct. +%define XMM_OFF_IN_X86FXSTATE 160 + +;; Spectre filler for 32-bit mode. +; Some user space address that points to a 4MB page boundrary in hope that it +; will somehow make it less useful. +%define SPECTRE_FILLER32 0x227fffff +;; Spectre filler for 64-bit mode. 
+; Chosen to be an invalid address (also with 5 level paging). +%define SPECTRE_FILLER64 0x02204204207fffff +;; Spectre filler for the current CPU mode. +%ifdef RT_ARCH_AMD64 + %define SPECTRE_FILLER SPECTRE_FILLER64 +%else + %define SPECTRE_FILLER SPECTRE_FILLER32 +%endif + +;; +; Determine skipping restoring of GDTR, IDTR, TR across VMX non-root operation +; +%ifdef RT_ARCH_AMD64 + %define VMX_SKIP_GDTR + %define VMX_SKIP_TR + %define VBOX_SKIP_RESTORE_SEG + %ifdef RT_OS_DARWIN + ; Load the NULL selector into DS, ES, FS and GS on 64-bit darwin so we don't + ; risk loading a stale LDT value or something invalid. + %define HM_64_BIT_USE_NULL_SEL + ; Darwin (Mavericks) uses IDTR limit to store the CPU Id so we need to restore it always. + ; See @bugref{6875}. + %else + %define VMX_SKIP_IDTR + %endif +%endif + +;; @def MYPUSHAD +; Macro generating an equivalent to pushad + +;; @def MYPOPAD +; Macro generating an equivalent to popad + +;; @def MYPUSHSEGS +; Macro saving all segment registers on the stack. +; @param 1 full width register name +; @param 2 16-bit register name for \a 1. + +;; @def MYPOPSEGS +; Macro restoring all segment registers from the stack +; @param 1 full width register name +; @param 2 16-bit register name for \a 1. 
+ +%ifdef ASM_CALL64_GCC + %macro MYPUSHAD64 0 + push r15 + push r14 + push r13 + push r12 + push rbx + %endmacro + %macro MYPOPAD64 0 + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + %endmacro + +%else ; ASM_CALL64_MSC + %macro MYPUSHAD64 0 + push r15 + push r14 + push r13 + push r12 + push rbx + push rsi + push rdi + %endmacro + %macro MYPOPAD64 0 + pop rdi + pop rsi + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + %endmacro +%endif + +%ifdef VBOX_SKIP_RESTORE_SEG + %macro MYPUSHSEGS64 2 + %endmacro + + %macro MYPOPSEGS64 2 + %endmacro +%else ; !VBOX_SKIP_RESTORE_SEG + ; trashes, rax, rdx & rcx + %macro MYPUSHSEGS64 2 + %ifndef HM_64_BIT_USE_NULL_SEL + mov %2, es + push %1 + mov %2, ds + push %1 + %endif + + ; Special case for FS; Windows and Linux either don't use it or restore it when leaving kernel mode, Solaris OTOH doesn't and we must save it. + mov ecx, MSR_K8_FS_BASE + rdmsr + push rdx + push rax + %ifndef HM_64_BIT_USE_NULL_SEL + push fs + %endif + + ; Special case for GS; OSes typically use swapgs to reset the hidden base register for GS on entry into the kernel. The same happens on exit + mov ecx, MSR_K8_GS_BASE + rdmsr + push rdx + push rax + %ifndef HM_64_BIT_USE_NULL_SEL + push gs + %endif + %endmacro + + ; trashes, rax, rdx & rcx + %macro MYPOPSEGS64 2 + ; Note: do not step through this code with a debugger! 
+ %ifndef HM_64_BIT_USE_NULL_SEL + xor eax, eax + mov ds, ax + mov es, ax + mov fs, ax + mov gs, ax + %endif + + %ifndef HM_64_BIT_USE_NULL_SEL + pop gs + %endif + pop rax + pop rdx + mov ecx, MSR_K8_GS_BASE + wrmsr + + %ifndef HM_64_BIT_USE_NULL_SEL + pop fs + %endif + pop rax + pop rdx + mov ecx, MSR_K8_FS_BASE + wrmsr + ; Now it's safe to step again + + %ifndef HM_64_BIT_USE_NULL_SEL + pop %1 + mov ds, %2 + pop %1 + mov es, %2 + %endif + %endmacro +%endif ; VBOX_SKIP_RESTORE_SEG + +%macro MYPUSHAD32 0 + pushad +%endmacro +%macro MYPOPAD32 0 + popad +%endmacro + +%macro MYPUSHSEGS32 2 + push ds + push es + push fs + push gs +%endmacro +%macro MYPOPSEGS32 2 + pop gs + pop fs + pop es + pop ds +%endmacro + +%ifdef RT_ARCH_AMD64 + %define MYPUSHAD MYPUSHAD64 + %define MYPOPAD MYPOPAD64 + %define MYPUSHSEGS MYPUSHSEGS64 + %define MYPOPSEGS MYPOPSEGS64 +%else + %define MYPUSHAD MYPUSHAD32 + %define MYPOPAD MYPOPAD32 + %define MYPUSHSEGS MYPUSHSEGS32 + %define MYPOPSEGS MYPOPSEGS32 +%endif + +;; +; Creates an indirect branch prediction barrier on CPUs that need and supports that. +; @clobbers eax, edx, ecx +; @param 1 How to address CPUMCTX. +; @param 2 Which flag to test for (CPUMCTX_WSF_IBPB_ENTRY or CPUMCTX_WSF_IBPB_EXIT) +%macro INDIRECT_BRANCH_PREDICTION_BARRIER 2 + test byte [%1 + CPUMCTX.fWorldSwitcher], %2 + jz %%no_indirect_branch_barrier + mov ecx, MSR_IA32_PRED_CMD + mov eax, MSR_IA32_PRED_CMD_F_IBPB + xor edx, edx + wrmsr +%%no_indirect_branch_barrier: +%endmacro + +;; +; Creates an indirect branch prediction and L1D barrier on CPUs that need and supports that. +; @clobbers eax, edx, ecx +; @param 1 How to address CPUMCTX. +; @param 2 Which IBPB flag to test for (CPUMCTX_WSF_IBPB_ENTRY or CPUMCTX_WSF_IBPB_EXIT) +; @param 3 Which FLUSH flag to test for (CPUMCTX_WSF_L1D_ENTRY) +%macro INDIRECT_BRANCH_PREDICTION_AND_L1_CACHE_BARRIER 3 + ; Only one test+jmp when disabled CPUs. 
+ test byte [%1 + CPUMCTX.fWorldSwitcher], (%2 | %3) + jz %%no_barrier_needed + + ; The eax:edx value is the same for both. + AssertCompile(MSR_IA32_PRED_CMD_F_IBPB == MSR_IA32_FLUSH_CMD_F_L1D) + mov eax, MSR_IA32_PRED_CMD_F_IBPB + xor edx, edx + + ; Indirect branch barrier. + test byte [%1 + CPUMCTX.fWorldSwitcher], %2 + jz %%no_indirect_branch_barrier + mov ecx, MSR_IA32_PRED_CMD + wrmsr +%%no_indirect_branch_barrier: + + ; Level 1 data cache flush. + test byte [%1 + CPUMCTX.fWorldSwitcher], %3 + jz %%no_cache_flush_barrier + mov ecx, MSR_IA32_FLUSH_CMD + wrmsr +%%no_cache_flush_barrier: + +%%no_barrier_needed: +%endmacro + + +;********************************************************************************************************************************* +;* External Symbols * +;********************************************************************************************************************************* +%ifdef VBOX_WITH_KERNEL_USING_XMM +extern NAME(CPUMIsGuestFPUStateActive) +%endif + + +BEGINCODE + + +;/** +; * Restores host-state fields. +; * +; * @returns VBox status code +; * @param f32RestoreHost x86: [ebp + 08h] msc: ecx gcc: edi RestoreHost flags. +; * @param pRestoreHost x86: [ebp + 0ch] msc: rdx gcc: rsi Pointer to the RestoreHost struct. +; */ +ALIGNCODE(16) +BEGINPROC VMXRestoreHostState +%ifdef RT_ARCH_AMD64 + %ifndef ASM_CALL64_GCC + ; Use GCC's input registers since we'll be needing both rcx and rdx further + ; down with the wrmsr instruction. Use the R10 and R11 register for saving + ; RDI and RSI since MSC preserve the two latter registers. 
+ mov r10, rdi + mov r11, rsi + mov rdi, rcx + mov rsi, rdx + %endif + + test edi, VMX_RESTORE_HOST_GDTR + jz .test_idtr + lgdt [rsi + VMXRESTOREHOST.HostGdtr] + +.test_idtr: + test edi, VMX_RESTORE_HOST_IDTR + jz .test_ds + lidt [rsi + VMXRESTOREHOST.HostIdtr] + +.test_ds: + test edi, VMX_RESTORE_HOST_SEL_DS + jz .test_es + mov ax, [rsi + VMXRESTOREHOST.uHostSelDS] + mov ds, eax + +.test_es: + test edi, VMX_RESTORE_HOST_SEL_ES + jz .test_tr + mov ax, [rsi + VMXRESTOREHOST.uHostSelES] + mov es, eax + +.test_tr: + test edi, VMX_RESTORE_HOST_SEL_TR + jz .test_fs + ; When restoring the TR, we must first clear the busy flag or we'll end up faulting. + mov dx, [rsi + VMXRESTOREHOST.uHostSelTR] + mov ax, dx + and eax, X86_SEL_MASK_OFF_RPL ; Mask away TI and RPL bits leaving only the descriptor offset. + test edi, VMX_RESTORE_HOST_GDT_READ_ONLY | VMX_RESTORE_HOST_GDT_NEED_WRITABLE + jnz .gdt_readonly + add rax, qword [rsi + VMXRESTOREHOST.HostGdtr + 2] ; xAX <- descriptor offset + GDTR.pGdt. + and dword [rax + 4], ~RT_BIT(9) ; Clear the busy flag in TSS desc (bits 0-7=base, bit 9=busy bit). + ltr dx + jmp short .test_fs +.gdt_readonly: + test edi, VMX_RESTORE_HOST_GDT_NEED_WRITABLE + jnz .gdt_readonly_need_writable + mov rcx, cr0 + mov r9, rcx + add rax, qword [rsi + VMXRESTOREHOST.HostGdtr + 2] ; xAX <- descriptor offset + GDTR.pGdt. + and rcx, ~X86_CR0_WP + mov cr0, rcx + and dword [rax + 4], ~RT_BIT(9) ; Clear the busy flag in TSS desc (bits 0-7=base, bit 9=busy bit). + ltr dx + mov cr0, r9 + jmp short .test_fs +.gdt_readonly_need_writable: + add rax, qword [rsi + VMXRESTOREHOST.HostGdtrRw + 2] ; xAX <- descriptor offset + GDTR.pGdtRw. + and dword [rax + 4], ~RT_BIT(9) ; Clear the busy flag in TSS desc (bits 0-7=base, bit 9=busy bit). 
+ lgdt [rsi + VMXRESTOREHOST.HostGdtrRw] + ltr dx + lgdt [rsi + VMXRESTOREHOST.HostGdtr] ; Load the original GDT + +.test_fs: + ; + ; When restoring the selector values for FS and GS, we'll temporarily trash + ; the base address (at least the high 32-bit bits, but quite possibly the + ; whole base address), the wrmsr will restore it correctly. (VT-x actually + ; restores the base correctly when leaving guest mode, but not the selector + ; value, so there is little problem with interrupts being enabled prior to + ; this restore job.) + ; We'll disable ints once for both FS and GS as that's probably faster. + ; + test edi, VMX_RESTORE_HOST_SEL_FS | VMX_RESTORE_HOST_SEL_GS + jz .restore_success + pushfq + cli ; (see above) + + test edi, VMX_RESTORE_HOST_SEL_FS + jz .test_gs + mov ax, word [rsi + VMXRESTOREHOST.uHostSelFS] + mov fs, eax + mov eax, dword [rsi + VMXRESTOREHOST.uHostFSBase] ; uHostFSBase - Lo + mov edx, dword [rsi + VMXRESTOREHOST.uHostFSBase + 4h] ; uHostFSBase - Hi + mov ecx, MSR_K8_FS_BASE + wrmsr + +.test_gs: + test edi, VMX_RESTORE_HOST_SEL_GS + jz .restore_flags + mov ax, word [rsi + VMXRESTOREHOST.uHostSelGS] + mov gs, eax + mov eax, dword [rsi + VMXRESTOREHOST.uHostGSBase] ; uHostGSBase - Lo + mov edx, dword [rsi + VMXRESTOREHOST.uHostGSBase + 4h] ; uHostGSBase - Hi + mov ecx, MSR_K8_GS_BASE + wrmsr + +.restore_flags: + popfq + +.restore_success: + mov eax, VINF_SUCCESS + %ifndef ASM_CALL64_GCC + ; Restore RDI and RSI on MSC. + mov rdi, r10 + mov rsi, r11 + %endif +%else ; RT_ARCH_X86 + mov eax, VERR_NOT_IMPLEMENTED +%endif + ret +ENDPROC VMXRestoreHostState + + +;/** +; * Dispatches an NMI to the host. +; */ +ALIGNCODE(16) +BEGINPROC VMXDispatchHostNmi + int 2 ; NMI is always vector 2. The IDT[2] IRQ handler cannot be anything else. See Intel spec. 6.3.1 "External Interrupts". + ret +ENDPROC VMXDispatchHostNmi + + +;/** +; * Executes VMWRITE, 64-bit value. +; * +; * @returns VBox status code. 
+; * @param idxField x86: [ebp + 08h] msc: rcx gcc: rdi VMCS index. +; * @param u64Data x86: [ebp + 0ch] msc: rdx gcc: rsi VM field value. +; */ +ALIGNCODE(16) +BEGINPROC VMXWriteVmcs64 +%ifdef RT_ARCH_AMD64 + %ifdef ASM_CALL64_GCC + and edi, 0ffffffffh + xor rax, rax + vmwrite rdi, rsi + %else + and ecx, 0ffffffffh + xor rax, rax + vmwrite rcx, rdx + %endif +%else ; RT_ARCH_X86 + mov ecx, [esp + 4] ; idxField + lea edx, [esp + 8] ; &u64Data + vmwrite ecx, [edx] ; low dword + jz .done + jc .done + inc ecx + xor eax, eax + vmwrite ecx, [edx + 4] ; high dword +.done: +%endif ; RT_ARCH_X86 + jnc .valid_vmcs + mov eax, VERR_VMX_INVALID_VMCS_PTR + ret +.valid_vmcs: + jnz .the_end + mov eax, VERR_VMX_INVALID_VMCS_FIELD +.the_end: + ret +ENDPROC VMXWriteVmcs64 + + +;/** +; * Executes VMREAD, 64-bit value. +; * +; * @returns VBox status code. +; * @param idxField VMCS index. +; * @param pData Where to store VM field value. +; */ +;DECLASM(int) VMXReadVmcs64(uint32_t idxField, uint64_t *pData); +ALIGNCODE(16) +BEGINPROC VMXReadVmcs64 +%ifdef RT_ARCH_AMD64 + %ifdef ASM_CALL64_GCC + and edi, 0ffffffffh + xor rax, rax + vmread [rsi], rdi + %else + and ecx, 0ffffffffh + xor rax, rax + vmread [rdx], rcx + %endif +%else ; RT_ARCH_X86 + mov ecx, [esp + 4] ; idxField + mov edx, [esp + 8] ; pData + vmread [edx], ecx ; low dword + jz .done + jc .done + inc ecx + xor eax, eax + vmread [edx + 4], ecx ; high dword +.done: +%endif ; RT_ARCH_X86 + jnc .valid_vmcs + mov eax, VERR_VMX_INVALID_VMCS_PTR + ret +.valid_vmcs: + jnz .the_end + mov eax, VERR_VMX_INVALID_VMCS_FIELD +.the_end: + ret +ENDPROC VMXReadVmcs64 + + +;/** +; * Executes VMREAD, 32-bit value. +; * +; * @returns VBox status code. +; * @param idxField VMCS index. +; * @param pu32Data Where to store VM field value. 
+; */ +;DECLASM(int) VMXReadVmcs32(uint32_t idxField, uint32_t *pu32Data); +ALIGNCODE(16) +BEGINPROC VMXReadVmcs32 +%ifdef RT_ARCH_AMD64 + %ifdef ASM_CALL64_GCC + and edi, 0ffffffffh + xor rax, rax + vmread r10, rdi + mov [rsi], r10d + %else + and ecx, 0ffffffffh + xor rax, rax + vmread r10, rcx + mov [rdx], r10d + %endif +%else ; RT_ARCH_X86 + mov ecx, [esp + 4] ; idxField + mov edx, [esp + 8] ; pu32Data + xor eax, eax + vmread [edx], ecx +%endif ; RT_ARCH_X86 + jnc .valid_vmcs + mov eax, VERR_VMX_INVALID_VMCS_PTR + ret +.valid_vmcs: + jnz .the_end + mov eax, VERR_VMX_INVALID_VMCS_FIELD +.the_end: + ret +ENDPROC VMXReadVmcs32 + + +;/** +; * Executes VMWRITE, 32-bit value. +; * +; * @returns VBox status code. +; * @param idxField VMCS index. +; * @param u32Data The VM field value to write. +; */ +;DECLASM(int) VMXWriteVmcs32(uint32_t idxField, uint32_t u32Data); +ALIGNCODE(16) +BEGINPROC VMXWriteVmcs32 +%ifdef RT_ARCH_AMD64 + %ifdef ASM_CALL64_GCC + and edi, 0ffffffffh + and esi, 0ffffffffh + xor rax, rax + vmwrite rdi, rsi + %else + and ecx, 0ffffffffh + and edx, 0ffffffffh + xor rax, rax + vmwrite rcx, rdx + %endif +%else ; RT_ARCH_X86 + mov ecx, [esp + 4] ; idxField + mov edx, [esp + 8] ; u32Data + xor eax, eax + vmwrite ecx, edx +%endif ; RT_ARCH_X86 + jnc .valid_vmcs + mov eax, VERR_VMX_INVALID_VMCS_PTR + ret +.valid_vmcs: + jnz .the_end + mov eax, VERR_VMX_INVALID_VMCS_FIELD +.the_end: + ret +ENDPROC VMXWriteVmcs32 + + +;/** +; * Executes VMXON. +; * +; * @returns VBox status code. +; * @param HCPhysVMXOn Physical address of VMXON structure. 
+; */ +;DECLASM(int) VMXEnable(RTHCPHYS HCPhysVMXOn); +BEGINPROC VMXEnable +%ifdef RT_ARCH_AMD64 + xor rax, rax + %ifdef ASM_CALL64_GCC + push rdi + %else + push rcx + %endif + vmxon [rsp] +%else ; RT_ARCH_X86 + xor eax, eax + vmxon [esp + 4] +%endif ; RT_ARCH_X86 + jnc .good + mov eax, VERR_VMX_INVALID_VMXON_PTR + jmp .the_end + +.good: + jnz .the_end + mov eax, VERR_VMX_VMXON_FAILED + +.the_end: +%ifdef RT_ARCH_AMD64 + add rsp, 8 +%endif + ret +ENDPROC VMXEnable + + +;/** +; * Executes VMXOFF. +; */ +;DECLASM(void) VMXDisable(void); +BEGINPROC VMXDisable + vmxoff +.the_end: + ret +ENDPROC VMXDisable + + +;/** +; * Executes VMCLEAR. +; * +; * @returns VBox status code. +; * @param HCPhysVmcs Physical address of VM control structure. +; */ +;DECLASM(int) VMXClearVmcs(RTHCPHYS HCPhysVmcs); +ALIGNCODE(16) +BEGINPROC VMXClearVmcs +%ifdef RT_ARCH_AMD64 + xor rax, rax + %ifdef ASM_CALL64_GCC + push rdi + %else + push rcx + %endif + vmclear [rsp] +%else ; RT_ARCH_X86 + xor eax, eax + vmclear [esp + 4] +%endif ; RT_ARCH_X86 + jnc .the_end + mov eax, VERR_VMX_INVALID_VMCS_PTR +.the_end: +%ifdef RT_ARCH_AMD64 + add rsp, 8 +%endif + ret +ENDPROC VMXClearVmcs + + +;/** +; * Executes VMPTRLD. +; * +; * @returns VBox status code. +; * @param HCPhysVmcs Physical address of VMCS structure. +; */ +;DECLASM(int) VMXActivateVmcs(RTHCPHYS HCPhysVmcs); +ALIGNCODE(16) +BEGINPROC VMXActivateVmcs +%ifdef RT_ARCH_AMD64 + xor rax, rax + %ifdef ASM_CALL64_GCC + push rdi + %else + push rcx + %endif + vmptrld [rsp] +%else + xor eax, eax + vmptrld [esp + 4] +%endif + jnc .the_end + mov eax, VERR_VMX_INVALID_VMCS_PTR +.the_end: +%ifdef RT_ARCH_AMD64 + add rsp, 8 +%endif + ret +ENDPROC VMXActivateVmcs + + +;/** +; * Executes VMPTRST. +; * +; * @returns VBox status code. +; * @param [esp + 04h] gcc:rdi msc:rcx Param 1 - First parameter - Address that will receive the current pointer. 
+; */ +;DECLASM(int) VMXGetActivatedVmcs(RTHCPHYS *pVMCS); +BEGINPROC VMXGetActivatedVmcs +%ifdef RT_OS_OS2 + mov eax, VERR_NOT_SUPPORTED + ret +%else + %ifdef RT_ARCH_AMD64 + %ifdef ASM_CALL64_GCC + vmptrst qword [rdi] + %else + vmptrst qword [rcx] + %endif + %else + vmptrst qword [esp+04h] + %endif + xor eax, eax +.the_end: + ret +%endif +ENDPROC VMXGetActivatedVmcs + +;/** +; * Invalidate a page using INVEPT. +; @param enmTlbFlush msc:ecx gcc:edi x86:[esp+04] Type of flush. +; @param pDescriptor msc:edx gcc:esi x86:[esp+08] Descriptor pointer. +; */ +;DECLASM(int) VMXR0InvEPT(VMXTLBFLUSHEPT enmTlbFlush, uint64_t *pDescriptor); +BEGINPROC VMXR0InvEPT +%ifdef RT_ARCH_AMD64 + %ifdef ASM_CALL64_GCC + and edi, 0ffffffffh + xor rax, rax +; invept rdi, qword [rsi] + DB 0x66, 0x0F, 0x38, 0x80, 0x3E + %else + and ecx, 0ffffffffh + xor rax, rax +; invept rcx, qword [rdx] + DB 0x66, 0x0F, 0x38, 0x80, 0xA + %endif +%else + mov ecx, [esp + 4] + mov edx, [esp + 8] + xor eax, eax +; invept ecx, qword [edx] + DB 0x66, 0x0F, 0x38, 0x80, 0xA +%endif + jnc .valid_vmcs + mov eax, VERR_VMX_INVALID_VMCS_PTR + ret +.valid_vmcs: + jnz .the_end + mov eax, VERR_INVALID_PARAMETER +.the_end: + ret +ENDPROC VMXR0InvEPT + + +;/** +; * Invalidate a page using invvpid +; @param enmTlbFlush msc:ecx gcc:edi x86:[esp+04] Type of flush +; @param pDescriptor msc:edx gcc:esi x86:[esp+08] Descriptor pointer +; */ +;DECLASM(int) VMXR0InvVPID(VMXTLBFLUSHVPID enmTlbFlush, uint64_t *pDescriptor); +BEGINPROC VMXR0InvVPID +%ifdef RT_ARCH_AMD64 + %ifdef ASM_CALL64_GCC + and edi, 0ffffffffh + xor rax, rax +; invvpid rdi, qword [rsi] + DB 0x66, 0x0F, 0x38, 0x81, 0x3E + %else + and ecx, 0ffffffffh + xor rax, rax +; invvpid rcx, qword [rdx] + DB 0x66, 0x0F, 0x38, 0x81, 0xA + %endif +%else + mov ecx, [esp + 4] + mov edx, [esp + 8] + xor eax, eax +; invvpid ecx, qword [edx] + DB 0x66, 0x0F, 0x38, 0x81, 0xA +%endif + jnc .valid_vmcs + mov eax, VERR_VMX_INVALID_VMCS_PTR + ret +.valid_vmcs: + jnz .the_end + mov eax, 
VERR_INVALID_PARAMETER +.the_end: + ret +ENDPROC VMXR0InvVPID + + +%if GC_ARCH_BITS == 64 +;; +; Executes INVLPGA +; +; @param pPageGC msc:rcx gcc:rdi x86:[esp+04] Virtual page to invalidate +; @param uASID msc:rdx gcc:rsi x86:[esp+0C] Tagged TLB id +; +;DECLASM(void) SVMR0InvlpgA(RTGCPTR pPageGC, uint32_t uASID); +BEGINPROC SVMR0InvlpgA +%ifdef RT_ARCH_AMD64 + %ifdef ASM_CALL64_GCC + mov rax, rdi + mov rcx, rsi + %else + mov rax, rcx + mov rcx, rdx + %endif +%else + mov eax, [esp + 4] + mov ecx, [esp + 0Ch] +%endif + invlpga [xAX], ecx + ret +ENDPROC SVMR0InvlpgA + +%else ; GC_ARCH_BITS != 64 +;; +; Executes INVLPGA +; +; @param pPageGC msc:ecx gcc:edi x86:[esp+04] Virtual page to invalidate +; @param uASID msc:edx gcc:esi x86:[esp+08] Tagged TLB id +; +;DECLASM(void) SVMR0InvlpgA(RTGCPTR pPageGC, uint32_t uASID); +BEGINPROC SVMR0InvlpgA +%ifdef RT_ARCH_AMD64 + %ifdef ASM_CALL64_GCC + movzx rax, edi + mov ecx, esi + %else + ; from http://www.cs.cmu.edu/~fp/courses/15213-s06/misc/asm64-handout.pdf: + ; ``Perhaps unexpectedly, instructions that move or generate 32-bit register + ; values also set the upper 32 bits of the register to zero. Consequently + ; there is no need for an instruction movzlq.'' + mov eax, ecx + mov ecx, edx + %endif +%else + mov eax, [esp + 4] + mov ecx, [esp + 8] +%endif + invlpga [xAX], ecx + ret +ENDPROC SVMR0InvlpgA + +%endif ; GC_ARCH_BITS != 64 + + +%ifdef VBOX_WITH_KERNEL_USING_XMM + +;; +; Wrapper around vmx.pfnStartVM that preserves host XMM registers and +; loads the guest ones when necessary. +; +; @cproto DECLASM(int) hmR0VMXStartVMWrapXMM(RTHCUINT fResume, PCPUMCTX pCtx, PVMCSCACHE pCache, PVM pVM, +; PVMCPU pVCpu, PFNHMVMXSTARTVM pfnStartVM); +; +; @returns eax +; +; @param fResumeVM msc:rcx +; @param pCtx msc:rdx +; @param pVMCSCache msc:r8 +; @param pVM msc:r9 +; @param pVCpu msc:[rbp+30h] The cross context virtual CPU structure of the calling EMT. 
+; @param pfnStartVM msc:[rbp+38h] +; +; @remarks This is essentially the same code as hmR0SVMRunWrapXMM, only the parameters differ a little bit. +; +; @remarks Drivers shouldn't use AVX registers without saving+loading: +; https://msdn.microsoft.com/en-us/library/windows/hardware/ff545910%28v=vs.85%29.aspx?f=255&MSPPError=-2147217396 +; However the compiler docs have different idea: +; https://msdn.microsoft.com/en-us/library/9z1stfyw.aspx +; We'll go with the former for now. +; +; ASSUMING 64-bit and windows for now. +; +ALIGNCODE(16) +BEGINPROC hmR0VMXStartVMWrapXMM + push xBP + mov xBP, xSP + sub xSP, 0b0h + 040h ; Don't bother optimizing the frame size. + + ; spill input parameters. + mov [xBP + 010h], rcx ; fResumeVM + mov [xBP + 018h], rdx ; pCtx + mov [xBP + 020h], r8 ; pVMCSCache + mov [xBP + 028h], r9 ; pVM + + ; Ask CPUM whether we've started using the FPU yet. + mov rcx, [xBP + 30h] ; pVCpu + call NAME(CPUMIsGuestFPUStateActive) + test al, al + jnz .guest_fpu_state_active + + ; No need to mess with XMM registers just call the start routine and return. + mov r11, [xBP + 38h] ; pfnStartVM + mov r10, [xBP + 30h] ; pVCpu + mov [xSP + 020h], r10 + mov rcx, [xBP + 010h] ; fResumeVM + mov rdx, [xBP + 018h] ; pCtx + mov r8, [xBP + 020h] ; pVMCSCache + mov r9, [xBP + 028h] ; pVM + call r11 + + leave + ret + +ALIGNCODE(8) +.guest_fpu_state_active: + ; Save the non-volatile host XMM registers. + movdqa [rsp + 040h + 000h], xmm6 + movdqa [rsp + 040h + 010h], xmm7 + movdqa [rsp + 040h + 020h], xmm8 + movdqa [rsp + 040h + 030h], xmm9 + movdqa [rsp + 040h + 040h], xmm10 + movdqa [rsp + 040h + 050h], xmm11 + movdqa [rsp + 040h + 060h], xmm12 + movdqa [rsp + 040h + 070h], xmm13 + movdqa [rsp + 040h + 080h], xmm14 + movdqa [rsp + 040h + 090h], xmm15 + stmxcsr [rsp + 040h + 0a0h] + + mov r10, [xBP + 018h] ; pCtx + mov eax, [r10 + CPUMCTX.fXStateMask] + test eax, eax + jz .guest_fpu_state_manually + + ; + ; Using XSAVE to load the guest XMM, YMM and ZMM registers. 
+ ; + and eax, CPUM_VOLATILE_XSAVE_GUEST_COMPONENTS + xor edx, edx + mov r10, [r10 + CPUMCTX.pXStateR0] + xrstor [r10] + + ; Make the call (same as in the other case ). + mov r11, [xBP + 38h] ; pfnStartVM + mov r10, [xBP + 30h] ; pVCpu + mov [xSP + 020h], r10 + mov rcx, [xBP + 010h] ; fResumeVM + mov rdx, [xBP + 018h] ; pCtx + mov r8, [xBP + 020h] ; pVMCSCache + mov r9, [xBP + 028h] ; pVM + call r11 + + mov r11d, eax ; save return value (xsave below uses eax) + + ; Save the guest XMM registers. + mov r10, [xBP + 018h] ; pCtx + mov eax, [r10 + CPUMCTX.fXStateMask] + and eax, CPUM_VOLATILE_XSAVE_GUEST_COMPONENTS + xor edx, edx + mov r10, [r10 + CPUMCTX.pXStateR0] + xsave [r10] + + mov eax, r11d ; restore return value. + +.restore_non_volatile_host_xmm_regs: + ; Load the non-volatile host XMM registers. + movdqa xmm6, [rsp + 040h + 000h] + movdqa xmm7, [rsp + 040h + 010h] + movdqa xmm8, [rsp + 040h + 020h] + movdqa xmm9, [rsp + 040h + 030h] + movdqa xmm10, [rsp + 040h + 040h] + movdqa xmm11, [rsp + 040h + 050h] + movdqa xmm12, [rsp + 040h + 060h] + movdqa xmm13, [rsp + 040h + 070h] + movdqa xmm14, [rsp + 040h + 080h] + movdqa xmm15, [rsp + 040h + 090h] + ldmxcsr [rsp + 040h + 0a0h] + leave + ret + + ; + ; No XSAVE, load and save the guest XMM registers manually. + ; +.guest_fpu_state_manually: + ; Load the full guest XMM register state. 
+ mov r10, [r10 + CPUMCTX.pXStateR0] + movdqa xmm0, [r10 + XMM_OFF_IN_X86FXSTATE + 000h] + movdqa xmm1, [r10 + XMM_OFF_IN_X86FXSTATE + 010h] + movdqa xmm2, [r10 + XMM_OFF_IN_X86FXSTATE + 020h] + movdqa xmm3, [r10 + XMM_OFF_IN_X86FXSTATE + 030h] + movdqa xmm4, [r10 + XMM_OFF_IN_X86FXSTATE + 040h] + movdqa xmm5, [r10 + XMM_OFF_IN_X86FXSTATE + 050h] + movdqa xmm6, [r10 + XMM_OFF_IN_X86FXSTATE + 060h] + movdqa xmm7, [r10 + XMM_OFF_IN_X86FXSTATE + 070h] + movdqa xmm8, [r10 + XMM_OFF_IN_X86FXSTATE + 080h] + movdqa xmm9, [r10 + XMM_OFF_IN_X86FXSTATE + 090h] + movdqa xmm10, [r10 + XMM_OFF_IN_X86FXSTATE + 0a0h] + movdqa xmm11, [r10 + XMM_OFF_IN_X86FXSTATE + 0b0h] + movdqa xmm12, [r10 + XMM_OFF_IN_X86FXSTATE + 0c0h] + movdqa xmm13, [r10 + XMM_OFF_IN_X86FXSTATE + 0d0h] + movdqa xmm14, [r10 + XMM_OFF_IN_X86FXSTATE + 0e0h] + movdqa xmm15, [r10 + XMM_OFF_IN_X86FXSTATE + 0f0h] + ldmxcsr [r10 + X86FXSTATE.MXCSR] + + ; Make the call (same as in the other case ). + mov r11, [xBP + 38h] ; pfnStartVM + mov r10, [xBP + 30h] ; pVCpu + mov [xSP + 020h], r10 + mov rcx, [xBP + 010h] ; fResumeVM + mov rdx, [xBP + 018h] ; pCtx + mov r8, [xBP + 020h] ; pVMCSCache + mov r9, [xBP + 028h] ; pVM + call r11 + + ; Save the guest XMM registers. 
+ mov r10, [xBP + 018h] ; pCtx + mov r10, [r10 + CPUMCTX.pXStateR0] + stmxcsr [r10 + X86FXSTATE.MXCSR] + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 000h], xmm0 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 010h], xmm1 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 020h], xmm2 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 030h], xmm3 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 040h], xmm4 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 050h], xmm5 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 060h], xmm6 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 070h], xmm7 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 080h], xmm8 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 090h], xmm9 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 0a0h], xmm10 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 0b0h], xmm11 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 0c0h], xmm12 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 0d0h], xmm13 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 0e0h], xmm14 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 0f0h], xmm15 + jmp .restore_non_volatile_host_xmm_regs +ENDPROC hmR0VMXStartVMWrapXMM + +;; +; Wrapper around svm.pfnVMRun that preserves host XMM registers and +; load the guest ones when necessary. +; +; @cproto DECLASM(int) hmR0SVMRunWrapXMM(RTHCPHYS HCPhysVmcbHost, RTHCPHYS HCPhysVmcb, PCPUMCTX pCtx, PVM pVM, PVMCPU pVCpu, +; PFNHMSVMVMRUN pfnVMRun); +; +; @returns eax +; +; @param HCPhysVmcbHost msc:rcx +; @param HCPhysVmcb msc:rdx +; @param pCtx msc:r8 +; @param pVM msc:r9 +; @param pVCpu msc:[rbp+30h] The cross context virtual CPU structure of the calling EMT. +; @param pfnVMRun msc:[rbp+38h] +; +; @remarks This is essentially the same code as hmR0VMXStartVMWrapXMM, only the parameters differ a little bit. +; +; @remarks Drivers shouldn't use AVX registers without saving+loading: +; https://msdn.microsoft.com/en-us/library/windows/hardware/ff545910%28v=vs.85%29.aspx?f=255&MSPPError=-2147217396 +; However the compiler docs have different idea: +; https://msdn.microsoft.com/en-us/library/9z1stfyw.aspx +; We'll go with the former for now. 
+; +; ASSUMING 64-bit and windows for now. +ALIGNCODE(16) +BEGINPROC hmR0SVMRunWrapXMM + push xBP + mov xBP, xSP + sub xSP, 0b0h + 040h ; Don't bother optimizing the frame size. + + ; spill input parameters. + mov [xBP + 010h], rcx ; HCPhysVmcbHost + mov [xBP + 018h], rdx ; HCPhysVmcb + mov [xBP + 020h], r8 ; pCtx + mov [xBP + 028h], r9 ; pVM + + ; Ask CPUM whether we've started using the FPU yet. + mov rcx, [xBP + 30h] ; pVCpu + call NAME(CPUMIsGuestFPUStateActive) + test al, al + jnz .guest_fpu_state_active + + ; No need to mess with XMM registers just call the start routine and return. + mov r11, [xBP + 38h] ; pfnVMRun + mov r10, [xBP + 30h] ; pVCpu + mov [xSP + 020h], r10 + mov rcx, [xBP + 010h] ; HCPhysVmcbHost + mov rdx, [xBP + 018h] ; HCPhysVmcb + mov r8, [xBP + 020h] ; pCtx + mov r9, [xBP + 028h] ; pVM + call r11 + + leave + ret + +ALIGNCODE(8) +.guest_fpu_state_active: + ; Save the non-volatile host XMM registers. + movdqa [rsp + 040h + 000h], xmm6 + movdqa [rsp + 040h + 010h], xmm7 + movdqa [rsp + 040h + 020h], xmm8 + movdqa [rsp + 040h + 030h], xmm9 + movdqa [rsp + 040h + 040h], xmm10 + movdqa [rsp + 040h + 050h], xmm11 + movdqa [rsp + 040h + 060h], xmm12 + movdqa [rsp + 040h + 070h], xmm13 + movdqa [rsp + 040h + 080h], xmm14 + movdqa [rsp + 040h + 090h], xmm15 + stmxcsr [rsp + 040h + 0a0h] + + mov r10, [xBP + 020h] ; pCtx + mov eax, [r10 + CPUMCTX.fXStateMask] + test eax, eax + jz .guest_fpu_state_manually + + ; + ; Using XSAVE. + ; + and eax, CPUM_VOLATILE_XSAVE_GUEST_COMPONENTS + xor edx, edx + mov r10, [r10 + CPUMCTX.pXStateR0] + xrstor [r10] + + ; Make the call (same as in the other case ). + mov r11, [xBP + 38h] ; pfnVMRun + mov r10, [xBP + 30h] ; pVCpu + mov [xSP + 020h], r10 + mov rcx, [xBP + 010h] ; HCPhysVmcbHost + mov rdx, [xBP + 018h] ; HCPhysVmcb + mov r8, [xBP + 020h] ; pCtx + mov r9, [xBP + 028h] ; pVM + call r11 + + mov r11d, eax ; save return value (xsave below uses eax) + + ; Save the guest XMM registers. 
+ mov r10, [xBP + 020h] ; pCtx + mov eax, [r10 + CPUMCTX.fXStateMask] + and eax, CPUM_VOLATILE_XSAVE_GUEST_COMPONENTS + xor edx, edx + mov r10, [r10 + CPUMCTX.pXStateR0] + xsave [r10] + + mov eax, r11d ; restore return value. + +.restore_non_volatile_host_xmm_regs: + ; Load the non-volatile host XMM registers. + movdqa xmm6, [rsp + 040h + 000h] + movdqa xmm7, [rsp + 040h + 010h] + movdqa xmm8, [rsp + 040h + 020h] + movdqa xmm9, [rsp + 040h + 030h] + movdqa xmm10, [rsp + 040h + 040h] + movdqa xmm11, [rsp + 040h + 050h] + movdqa xmm12, [rsp + 040h + 060h] + movdqa xmm13, [rsp + 040h + 070h] + movdqa xmm14, [rsp + 040h + 080h] + movdqa xmm15, [rsp + 040h + 090h] + ldmxcsr [rsp + 040h + 0a0h] + leave + ret + + ; + ; No XSAVE, load and save the guest XMM registers manually. + ; +.guest_fpu_state_manually: + ; Load the full guest XMM register state. + mov r10, [r10 + CPUMCTX.pXStateR0] + movdqa xmm0, [r10 + XMM_OFF_IN_X86FXSTATE + 000h] + movdqa xmm1, [r10 + XMM_OFF_IN_X86FXSTATE + 010h] + movdqa xmm2, [r10 + XMM_OFF_IN_X86FXSTATE + 020h] + movdqa xmm3, [r10 + XMM_OFF_IN_X86FXSTATE + 030h] + movdqa xmm4, [r10 + XMM_OFF_IN_X86FXSTATE + 040h] + movdqa xmm5, [r10 + XMM_OFF_IN_X86FXSTATE + 050h] + movdqa xmm6, [r10 + XMM_OFF_IN_X86FXSTATE + 060h] + movdqa xmm7, [r10 + XMM_OFF_IN_X86FXSTATE + 070h] + movdqa xmm8, [r10 + XMM_OFF_IN_X86FXSTATE + 080h] + movdqa xmm9, [r10 + XMM_OFF_IN_X86FXSTATE + 090h] + movdqa xmm10, [r10 + XMM_OFF_IN_X86FXSTATE + 0a0h] + movdqa xmm11, [r10 + XMM_OFF_IN_X86FXSTATE + 0b0h] + movdqa xmm12, [r10 + XMM_OFF_IN_X86FXSTATE + 0c0h] + movdqa xmm13, [r10 + XMM_OFF_IN_X86FXSTATE + 0d0h] + movdqa xmm14, [r10 + XMM_OFF_IN_X86FXSTATE + 0e0h] + movdqa xmm15, [r10 + XMM_OFF_IN_X86FXSTATE + 0f0h] + ldmxcsr [r10 + X86FXSTATE.MXCSR] + + ; Make the call (same as in the other case ). 
+ mov r11, [xBP + 38h] ; pfnVMRun + mov r10, [xBP + 30h] ; pVCpu + mov [xSP + 020h], r10 + mov rcx, [xBP + 010h] ; HCPhysVmcbHost + mov rdx, [xBP + 018h] ; HCPhysVmcb + mov r8, [xBP + 020h] ; pCtx + mov r9, [xBP + 028h] ; pVM + call r11 + + ; Save the guest XMM registers. + mov r10, [xBP + 020h] ; pCtx + mov r10, [r10 + CPUMCTX.pXStateR0] + stmxcsr [r10 + X86FXSTATE.MXCSR] + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 000h], xmm0 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 010h], xmm1 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 020h], xmm2 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 030h], xmm3 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 040h], xmm4 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 050h], xmm5 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 060h], xmm6 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 070h], xmm7 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 080h], xmm8 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 090h], xmm9 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 0a0h], xmm10 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 0b0h], xmm11 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 0c0h], xmm12 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 0d0h], xmm13 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 0e0h], xmm14 + movdqa [r10 + XMM_OFF_IN_X86FXSTATE + 0f0h], xmm15 + jmp .restore_non_volatile_host_xmm_regs +ENDPROC hmR0SVMRunWrapXMM + +%endif ; VBOX_WITH_KERNEL_USING_XMM + + +;; @def RESTORE_STATE_VM32 +; Macro restoring essential host state and updating guest state +; for common host, 32-bit guest for VT-x. +%macro RESTORE_STATE_VM32 0 + ; Restore base and limit of the IDTR & GDTR. + %ifndef VMX_SKIP_IDTR + lidt [xSP] + add xSP, xCB * 2 + %endif + %ifndef VMX_SKIP_GDTR + lgdt [xSP] + add xSP, xCB * 2 + %endif + + push xDI + %ifndef VMX_SKIP_TR + mov xDI, [xSP + xCB * 3] ; pCtx (*3 to skip the saved xDI, TR, LDTR). + %else + mov xDI, [xSP + xCB * 2] ; pCtx (*2 to skip the saved xDI, LDTR). 
+ %endif + + mov [ss:xDI + CPUMCTX.eax], eax + mov xAX, SPECTRE_FILLER + mov [ss:xDI + CPUMCTX.ebx], ebx + mov xBX, xAX + mov [ss:xDI + CPUMCTX.ecx], ecx + mov xCX, xAX + mov [ss:xDI + CPUMCTX.edx], edx + mov xDX, xAX + mov [ss:xDI + CPUMCTX.esi], esi + mov xSI, xAX + mov [ss:xDI + CPUMCTX.ebp], ebp + mov xBP, xAX + mov xAX, cr2 + mov [ss:xDI + CPUMCTX.cr2], xAX + + %ifdef RT_ARCH_AMD64 + pop xAX ; The guest edi we pushed above. + mov dword [ss:xDI + CPUMCTX.edi], eax + %else + pop dword [ss:xDI + CPUMCTX.edi] ; The guest edi we pushed above. + %endif + + ; Fight spectre. + INDIRECT_BRANCH_PREDICTION_BARRIER ss:xDI, CPUMCTX_WSF_IBPB_EXIT + + %ifndef VMX_SKIP_TR + ; Restore TSS selector; must mark it as not busy before using ltr (!) + ; ASSUME that this is supposed to be 'BUSY'. (saves 20-30 ticks on the T42p) + ; @todo get rid of sgdt + pop xBX ; Saved TR + sub xSP, xCB * 2 + sgdt [xSP] + mov xAX, xBX + and eax, X86_SEL_MASK_OFF_RPL ; Mask away TI and RPL bits leaving only the descriptor offset. + add xAX, [xSP + 2] ; eax <- GDTR.address + descriptor offset. + and dword [ss:xAX + 4], ~RT_BIT(9) ; Clear the busy flag in TSS desc (bits 0-7=base, bit 9=busy bit). + ltr bx + add xSP, xCB * 2 + %endif + + pop xAX ; Saved LDTR + %ifdef RT_ARCH_AMD64 + cmp eax, 0 + je %%skip_ldt_write32 + %endif + lldt ax + +%%skip_ldt_write32: + add xSP, xCB ; pCtx + + %ifdef VMX_USE_CACHED_VMCS_ACCESSES + pop xDX ; Saved pCache + + ; Note! If we get here as a result of invalid VMCS pointer, all the following + ; vmread's will fail (only eflags.cf=1 will be set) but that shouldn't cause any + ; trouble only just less efficient. + mov ecx, [ss:xDX + VMCSCACHE.Read.cValidEntries] + cmp ecx, 0 ; Can't happen + je %%no_cached_read32 + jmp %%cached_read32 + +ALIGN(16) +%%cached_read32: + dec xCX + mov eax, [ss:xDX + VMCSCACHE.Read.aField + xCX * 4] + ; Note! This leaves the high 32 bits of the cache entry unmodified!! 
+ vmread [ss:xDX + VMCSCACHE.Read.aFieldVal + xCX * 8], xAX + cmp xCX, 0 + jnz %%cached_read32 +%%no_cached_read32: + %endif + + ; Restore segment registers. + MYPOPSEGS xAX, ax + + ; Restore the host XCR0 if necessary. + pop xCX + test ecx, ecx + jnz %%xcr0_after_skip + pop xAX + pop xDX + xsetbv ; ecx is already zero. +%%xcr0_after_skip: + + ; Restore general purpose registers. + MYPOPAD +%endmacro + + +;; +; Prepares for and executes VMLAUNCH/VMRESUME (32 bits guest mode) +; +; @returns VBox status code +; @param fResume x86:[ebp+8], msc:rcx,gcc:rdi Whether to use vmlaunch/vmresume. +; @param pCtx x86:[ebp+c], msc:rdx,gcc:rsi Pointer to the guest-CPU context. +; @param pCache x86:[ebp+10],msc:r8, gcc:rdx Pointer to the VMCS cache. +; @param pVM x86:[ebp+14],msc:r9, gcc:rcx The cross context VM structure. +; @param pVCpu x86:[ebp+18],msc:[ebp+30],gcc:r8 The cross context virtual CPU structure of the calling EMT. +; +ALIGNCODE(16) +BEGINPROC VMXR0StartVM32 + push xBP + mov xBP, xSP + + pushf + cli + + ; + ; Save all general purpose host registers. + ; + MYPUSHAD + + ; + ; First we have to write the host RIP field (address of .vmlaunch_done) into the VMCS. + ; + mov eax, VMX_VMCS_HOST_RIP +%ifdef RT_ARCH_AMD64 + lea r10, [.vmlaunch_done wrt rip] + vmwrite rax, r10 +%else + mov ecx, .vmlaunch_done + vmwrite eax, ecx +%endif + ; Note: assumes success! + + ; + ; Unify input parameter registers. + ; +%ifdef RT_ARCH_AMD64 + %ifdef ASM_CALL64_GCC + ; fResume already in rdi + ; pCtx already in rsi + mov rbx, rdx ; pCache + %else + mov rdi, rcx ; fResume + mov rsi, rdx ; pCtx + mov rbx, r8 ; pCache + %endif +%else + mov edi, [ebp + 8] ; fResume + mov esi, [ebp + 12] ; pCtx + mov ebx, [ebp + 16] ; pCache +%endif + + ; + ; Save the host XCR0 and load the guest one if necessary. + ; Note! Trashes rdx and rcx. 
+ ; +%ifdef ASM_CALL64_MSC + mov rax, [xBP + 30h] ; pVCpu +%elifdef ASM_CALL64_GCC + mov rax, r8 ; pVCpu +%else + mov eax, [xBP + 18h] ; pVCpu +%endif + test byte [xAX + VMCPU.hm + HMCPU.fLoadSaveGuestXcr0], 1 + jz .xcr0_before_skip + + xor ecx, ecx + xgetbv ; Save the host one on the stack. + push xDX + push xAX + + mov eax, [xSI + CPUMCTX.aXcr] ; Load the guest one. + mov edx, [xSI + CPUMCTX.aXcr + 4] + xor ecx, ecx ; paranoia + xsetbv + + push 0 ; Indicate that we must restore XCR0 (popped into ecx, thus 0). + jmp .xcr0_before_done + +.xcr0_before_skip: + push 3fh ; indicate that we need not. +.xcr0_before_done: + + ; + ; Save segment registers. + ; Note! Trashes rdx & rcx, so we moved it here (amd64 case). + ; + MYPUSHSEGS xAX, ax + +%ifdef VMX_USE_CACHED_VMCS_ACCESSES + mov ecx, [xBX + VMCSCACHE.Write.cValidEntries] + cmp ecx, 0 + je .no_cached_writes + mov edx, ecx + mov ecx, 0 + jmp .cached_write + +ALIGN(16) +.cached_write: + mov eax, [xBX + VMCSCACHE.Write.aField + xCX * 4] + vmwrite xAX, [xBX + VMCSCACHE.Write.aFieldVal + xCX * 8] + inc xCX + cmp xCX, xDX + jl .cached_write + + mov dword [xBX + VMCSCACHE.Write.cValidEntries], 0 +.no_cached_writes: + + ; Save the pCache pointer. + push xBX +%endif + + ; Save the pCtx pointer. + push xSI + + ; Save host LDTR. + xor eax, eax + sldt ax + push xAX + +%ifndef VMX_SKIP_TR + ; The host TR limit is reset to 0x67; save & restore it manually. + str eax + push xAX +%endif + +%ifndef VMX_SKIP_GDTR + ; VT-x only saves the base of the GDTR & IDTR and resets the limit to 0xffff; we must restore the limit correctly! + sub xSP, xCB * 2 + sgdt [xSP] +%endif +%ifndef VMX_SKIP_IDTR + sub xSP, xCB * 2 + sidt [xSP] +%endif + + ; Load CR2 if necessary (may be expensive as writing CR2 is a synchronizing instruction). + mov xBX, [xSI + CPUMCTX.cr2] + mov xDX, cr2 + cmp xBX, xDX + je .skip_cr2_write32 + mov cr2, xBX + +.skip_cr2_write32: + mov eax, VMX_VMCS_HOST_RSP + vmwrite xAX, xSP + ; Note: assumes success! 
+ ; Don't mess with ESP anymore!!! + + ; Fight spectre and similar. + INDIRECT_BRANCH_PREDICTION_AND_L1_CACHE_BARRIER xSI, CPUMCTX_WSF_IBPB_ENTRY, CPUMCTX_WSF_L1D_ENTRY + + ; Load guest general purpose registers. + mov eax, [xSI + CPUMCTX.eax] + mov ebx, [xSI + CPUMCTX.ebx] + mov ecx, [xSI + CPUMCTX.ecx] + mov edx, [xSI + CPUMCTX.edx] + mov ebp, [xSI + CPUMCTX.ebp] + + ; Resume or start VM? + cmp xDI, 0 ; fResume + + ; Load guest edi & esi. + mov edi, [xSI + CPUMCTX.edi] + mov esi, [xSI + CPUMCTX.esi] + + je .vmlaunch_launch + + vmresume + jc near .vmxstart_invalid_vmcs_ptr + jz near .vmxstart_start_failed + jmp .vmlaunch_done; ; Here if vmresume detected a failure. + +.vmlaunch_launch: + vmlaunch + jc near .vmxstart_invalid_vmcs_ptr + jz near .vmxstart_start_failed + jmp .vmlaunch_done; ; Here if vmlaunch detected a failure. + +ALIGNCODE(16) ;; @todo YASM BUG - this alignment is wrong on darwin, it's 1 byte off. +.vmlaunch_done: + RESTORE_STATE_VM32 + mov eax, VINF_SUCCESS + +.vmstart_end: + popf + pop xBP + ret + +.vmxstart_invalid_vmcs_ptr: + RESTORE_STATE_VM32 + mov eax, VERR_VMX_INVALID_VMCS_PTR_TO_START_VM + jmp .vmstart_end + +.vmxstart_start_failed: + RESTORE_STATE_VM32 + mov eax, VERR_VMX_UNABLE_TO_START_VM + jmp .vmstart_end + +ENDPROC VMXR0StartVM32 + + +%ifdef RT_ARCH_AMD64 +;; @def RESTORE_STATE_VM64 +; Macro restoring essential host state and updating guest state +; for 64-bit host, 64-bit guest for VT-x. 
+; +%macro RESTORE_STATE_VM64 0 + ; Restore base and limit of the IDTR & GDTR + %ifndef VMX_SKIP_IDTR + lidt [xSP] + add xSP, xCB * 2 + %endif + %ifndef VMX_SKIP_GDTR + lgdt [xSP] + add xSP, xCB * 2 + %endif + + push xDI + %ifndef VMX_SKIP_TR + mov xDI, [xSP + xCB * 3] ; pCtx (*3 to skip the saved xDI, TR, LDTR) + %else + mov xDI, [xSP + xCB * 2] ; pCtx (*2 to skip the saved xDI, LDTR) + %endif + + mov qword [xDI + CPUMCTX.eax], rax + mov rax, SPECTRE_FILLER64 + mov qword [xDI + CPUMCTX.ebx], rbx + mov rbx, rax + mov qword [xDI + CPUMCTX.ecx], rcx + mov rcx, rax + mov qword [xDI + CPUMCTX.edx], rdx + mov rdx, rax + mov qword [xDI + CPUMCTX.esi], rsi + mov rsi, rax + mov qword [xDI + CPUMCTX.ebp], rbp + mov rbp, rax + mov qword [xDI + CPUMCTX.r8], r8 + mov r8, rax + mov qword [xDI + CPUMCTX.r9], r9 + mov r9, rax + mov qword [xDI + CPUMCTX.r10], r10 + mov r10, rax + mov qword [xDI + CPUMCTX.r11], r11 + mov r11, rax + mov qword [xDI + CPUMCTX.r12], r12 + mov r12, rax + mov qword [xDI + CPUMCTX.r13], r13 + mov r13, rax + mov qword [xDI + CPUMCTX.r14], r14 + mov r14, rax + mov qword [xDI + CPUMCTX.r15], r15 + mov r15, rax + mov rax, cr2 + mov qword [xDI + CPUMCTX.cr2], rax + + pop xAX ; The guest rdi we pushed above + mov qword [xDI + CPUMCTX.edi], rax + + ; Fight spectre. + INDIRECT_BRANCH_PREDICTION_BARRIER xDI, CPUMCTX_WSF_IBPB_EXIT + + %ifndef VMX_SKIP_TR + ; Restore TSS selector; must mark it as not busy before using ltr (!) + ; ASSUME that this is supposed to be 'BUSY'. (saves 20-30 ticks on the T42p). + ; @todo get rid of sgdt + pop xBX ; Saved TR + sub xSP, xCB * 2 + sgdt [xSP] + mov xAX, xBX + and eax, X86_SEL_MASK_OFF_RPL ; Mask away TI and RPL bits leaving only the descriptor offset. + add xAX, [xSP + 2] ; eax <- GDTR.address + descriptor offset. + and dword [xAX + 4], ~RT_BIT(9) ; Clear the busy flag in TSS desc (bits 0-7=base, bit 9=busy bit). 
+ ltr bx + add xSP, xCB * 2 + %endif + + pop xAX ; Saved LDTR + cmp eax, 0 + je %%skip_ldt_write64 + lldt ax + +%%skip_ldt_write64: + pop xSI ; pCtx (needed in rsi by the macros below) + + %ifdef VMX_USE_CACHED_VMCS_ACCESSES + pop xDX ; Saved pCache + + ; Note! If we get here as a result of invalid VMCS pointer, all the following + ; vmread's will fail (only eflags.cf=1 will be set) but that shouldn't cause any + ; trouble only just less efficient. + mov ecx, [xDX + VMCSCACHE.Read.cValidEntries] + cmp ecx, 0 ; Can't happen + je %%no_cached_read64 + jmp %%cached_read64 + +ALIGN(16) +%%cached_read64: + dec xCX + mov eax, [xDX + VMCSCACHE.Read.aField + xCX * 4] + vmread [xDX + VMCSCACHE.Read.aFieldVal + xCX * 8], xAX + cmp xCX, 0 + jnz %%cached_read64 +%%no_cached_read64: + %endif + + ; Restore segment registers. + MYPOPSEGS xAX, ax + + ; Restore the host XCR0 if necessary. + pop xCX + test ecx, ecx + jnz %%xcr0_after_skip + pop xAX + pop xDX + xsetbv ; ecx is already zero. +%%xcr0_after_skip: + + ; Restore general purpose registers. + MYPOPAD +%endmacro + + +;; +; Prepares for and executes VMLAUNCH/VMRESUME (64 bits guest mode) +; +; @returns VBox status code +; @param fResume msc:rcx, gcc:rdi Whether to use vmlaunch/vmresume. +; @param pCtx msc:rdx, gcc:rsi Pointer to the guest-CPU context. +; @param pCache msc:r8, gcc:rdx Pointer to the VMCS cache. +; @param pVM msc:r9, gcc:rcx The cross context VM structure. +; @param pVCpu msc:[rbp+30h], gcc:r8 The cross context virtual CPU structure of the calling EMT. +; +ALIGNCODE(16) +BEGINPROC VMXR0StartVM64 + push xBP + mov xBP, xSP + + pushf + cli + + ; Save all general purpose host registers. + MYPUSHAD + + ; First we have to save some final CPU context registers. + lea r10, [.vmlaunch64_done wrt rip] + mov rax, VMX_VMCS_HOST_RIP ; Return address (too difficult to continue after VMLAUNCH?). + vmwrite rax, r10 + ; Note: assumes success! + + ; + ; Unify the input parameter registers. 
+ ; +%ifdef ASM_CALL64_GCC + ; fResume already in rdi + ; pCtx already in rsi + mov rbx, rdx ; pCache +%else + mov rdi, rcx ; fResume + mov rsi, rdx ; pCtx + mov rbx, r8 ; pCache +%endif + + ; + ; Save the host XCR0 and load the guest one if necessary. + ; Note! Trashes rdx and rcx. + ; +%ifdef ASM_CALL64_MSC + mov rax, [xBP + 30h] ; pVCpu +%else + mov rax, r8 ; pVCpu +%endif + test byte [xAX + VMCPU.hm + HMCPU.fLoadSaveGuestXcr0], 1 + jz .xcr0_before_skip + + xor ecx, ecx + xgetbv ; Save the host one on the stack. + push xDX + push xAX + + mov eax, [xSI + CPUMCTX.aXcr] ; Load the guest one. + mov edx, [xSI + CPUMCTX.aXcr + 4] + xor ecx, ecx ; paranoia + xsetbv + + push 0 ; Indicate that we must restore XCR0 (popped into ecx, thus 0). + jmp .xcr0_before_done + +.xcr0_before_skip: + push 3fh ; indicate that we need not. +.xcr0_before_done: + + ; + ; Save segment registers. + ; Note! Trashes rdx & rcx, so we moved it here (amd64 case). + ; + MYPUSHSEGS xAX, ax + +%ifdef VMX_USE_CACHED_VMCS_ACCESSES + mov ecx, [xBX + VMCSCACHE.Write.cValidEntries] + cmp ecx, 0 + je .no_cached_writes + mov edx, ecx + mov ecx, 0 + jmp .cached_write + +ALIGN(16) +.cached_write: + mov eax, [xBX + VMCSCACHE.Write.aField + xCX * 4] + vmwrite xAX, [xBX + VMCSCACHE.Write.aFieldVal + xCX * 8] + inc xCX + cmp xCX, xDX + jl .cached_write + + mov dword [xBX + VMCSCACHE.Write.cValidEntries], 0 +.no_cached_writes: + + ; Save the pCache pointer. + push xBX +%endif + + ; Save the pCtx pointer. + push xSI + + ; Save host LDTR. + xor eax, eax + sldt ax + push xAX + +%ifndef VMX_SKIP_TR + ; The host TR limit is reset to 0x67; save & restore it manually. + str eax + push xAX +%endif + +%ifndef VMX_SKIP_GDTR + ; VT-x only saves the base of the GDTR & IDTR and resets the limit to 0xffff; we must restore the limit correctly! 
+ sub xSP, xCB * 2 + sgdt [xSP] +%endif +%ifndef VMX_SKIP_IDTR + sub xSP, xCB * 2 + sidt [xSP] +%endif + + ; Load CR2 if necessary (may be expensive as writing CR2 is a synchronizing instruction). + mov rbx, qword [xSI + CPUMCTX.cr2] + mov rdx, cr2 + cmp rbx, rdx + je .skip_cr2_write + mov cr2, rbx + +.skip_cr2_write: + mov eax, VMX_VMCS_HOST_RSP + vmwrite xAX, xSP + ; Note: assumes success! + ; Don't mess with ESP anymore!!! + + ; Fight spectre and similar. + INDIRECT_BRANCH_PREDICTION_AND_L1_CACHE_BARRIER xSI, CPUMCTX_WSF_IBPB_ENTRY, CPUMCTX_WSF_L1D_ENTRY + + ; Load guest general purpose registers. + mov rax, qword [xSI + CPUMCTX.eax] + mov rbx, qword [xSI + CPUMCTX.ebx] + mov rcx, qword [xSI + CPUMCTX.ecx] + mov rdx, qword [xSI + CPUMCTX.edx] + mov rbp, qword [xSI + CPUMCTX.ebp] + mov r8, qword [xSI + CPUMCTX.r8] + mov r9, qword [xSI + CPUMCTX.r9] + mov r10, qword [xSI + CPUMCTX.r10] + mov r11, qword [xSI + CPUMCTX.r11] + mov r12, qword [xSI + CPUMCTX.r12] + mov r13, qword [xSI + CPUMCTX.r13] + mov r14, qword [xSI + CPUMCTX.r14] + mov r15, qword [xSI + CPUMCTX.r15] + + ; Resume or start VM? + cmp xDI, 0 ; fResume + + ; Load guest rdi & rsi. + mov rdi, qword [xSI + CPUMCTX.edi] + mov rsi, qword [xSI + CPUMCTX.esi] + + je .vmlaunch64_launch + + vmresume + jc near .vmxstart64_invalid_vmcs_ptr + jz near .vmxstart64_start_failed + jmp .vmlaunch64_done; ; Here if vmresume detected a failure. + +.vmlaunch64_launch: + vmlaunch + jc near .vmxstart64_invalid_vmcs_ptr + jz near .vmxstart64_start_failed + jmp .vmlaunch64_done; ; Here if vmlaunch detected a failure. 
+ +ALIGNCODE(16) +.vmlaunch64_done: + RESTORE_STATE_VM64 + mov eax, VINF_SUCCESS + +.vmstart64_end: + popf + pop xBP + ret + +.vmxstart64_invalid_vmcs_ptr: + RESTORE_STATE_VM64 + mov eax, VERR_VMX_INVALID_VMCS_PTR_TO_START_VM + jmp .vmstart64_end + +.vmxstart64_start_failed: + RESTORE_STATE_VM64 + mov eax, VERR_VMX_UNABLE_TO_START_VM + jmp .vmstart64_end +ENDPROC VMXR0StartVM64 +%endif ; RT_ARCH_AMD64 + + +;; +; Prepares for and executes VMRUN (32 bits guests) +; +; @returns VBox status code +; @param HCPhysVmcbHost msc:rcx,gcc:rdi Physical address of host VMCB. +; @param HCPhysVmcb msc:rdx,gcc:rsi Physical address of guest VMCB. +; @param pCtx msc:r8,gcc:rdx Pointer to the guest CPU-context. +; @param pVM msc:r9,gcc:rcx The cross context VM structure. +; @param pVCpu msc:[rsp+28],gcc:r8 The cross context virtual CPU structure of the calling EMT. +; +ALIGNCODE(16) +BEGINPROC SVMR0VMRun +%ifdef RT_ARCH_AMD64 ; fake a cdecl stack frame + %ifdef ASM_CALL64_GCC + push r8 ; pVCpu + push rcx ; pVM + push rdx ; pCtx + push rsi ; HCPhysVmcb + push rdi ; HCPhysVmcbHost + %else + mov rax, [rsp + 28h] + push rax ; pVCpu + push r9 ; pVM + push r8 ; pCtx + push rdx ; HCPhysVmcb + push rcx ; HCPhysVmcbHost + %endif + push 0 +%endif + push xBP + mov xBP, xSP + pushf + + ; Save all general purpose host registers. + MYPUSHAD + + ; Load pCtx into xSI. + mov xSI, [xBP + xCB * 2 + RTHCPHYS_CB * 2] ; pCtx + + ; Save the host XCR0 and load the guest one if necessary. 
+ mov xAX, [xBP + xCB * 2 + RTHCPHYS_CB * 2 + xCB * 2] ; pVCpu + test byte [xAX + VMCPU.hm + HMCPU.fLoadSaveGuestXcr0], 1 + jz .xcr0_before_skip + + xor ecx, ecx + xgetbv ; Save the host XCR0 on the stack + push xDX + push xAX + + mov xSI, [xBP + xCB * 2 + RTHCPHYS_CB * 2] ; pCtx + mov eax, [xSI + CPUMCTX.aXcr] ; load the guest XCR0 + mov edx, [xSI + CPUMCTX.aXcr + 4] + xor ecx, ecx ; paranoia + xsetbv + + push 0 ; indicate that we must restore XCR0 (popped into ecx, thus 0) + jmp .xcr0_before_done + +.xcr0_before_skip: + push 3fh ; indicate that we need not restore XCR0 +.xcr0_before_done: + + ; Save guest CPU-context pointer for simplifying saving of the GPRs afterwards. + push xSI + + ; Save host fs, gs, sysenter msr etc. + mov xAX, [xBP + xCB * 2] ; HCPhysVmcbHost (64 bits physical address; x86: take low dword only) + push xAX ; save for the vmload after vmrun + vmsave + + ; Fight spectre. + INDIRECT_BRANCH_PREDICTION_BARRIER xSI, CPUMCTX_WSF_IBPB_ENTRY + + ; Setup xAX for VMLOAD. + mov xAX, [xBP + xCB * 2 + RTHCPHYS_CB] ; HCPhysVmcb (64 bits physical address; x86: take low dword only) + + ; Load guest general purpose registers. + ; eax is loaded from the VMCB by VMRUN. + mov ebx, [xSI + CPUMCTX.ebx] + mov ecx, [xSI + CPUMCTX.ecx] + mov edx, [xSI + CPUMCTX.edx] + mov edi, [xSI + CPUMCTX.edi] + mov ebp, [xSI + CPUMCTX.ebp] + mov esi, [xSI + CPUMCTX.esi] + + ; Clear the global interrupt flag & execute sti to make sure external interrupts cause a world switch. + clgi + sti + + ; Load guest fs, gs, sysenter msr etc. + vmload + + ; Run the VM. + vmrun + + ; Save guest fs, gs, sysenter msr etc. + vmsave + + ; Load host fs, gs, sysenter msr etc. + pop xAX ; load HCPhysVmcbHost (pushed above) + vmload + + ; Set the global interrupt flag again, but execute cli to make sure IF=0. + cli + stgi + + ; Pop the context pointer (pushed above) and save the guest GPRs (sans RSP and RAX). 
+ pop xAX + + mov [ss:xAX + CPUMCTX.ebx], ebx + mov xBX, SPECTRE_FILLER + mov [ss:xAX + CPUMCTX.ecx], ecx + mov xCX, xBX + mov [ss:xAX + CPUMCTX.edx], edx + mov xDX, xBX + mov [ss:xAX + CPUMCTX.esi], esi + mov xSI, xBX + mov [ss:xAX + CPUMCTX.edi], edi + mov xDI, xBX + mov [ss:xAX + CPUMCTX.ebp], ebp + mov xBP, xBX + + ; Fight spectre. Note! Trashes xAX! + INDIRECT_BRANCH_PREDICTION_BARRIER ss:xAX, CPUMCTX_WSF_IBPB_EXIT + + ; Restore the host xcr0 if necessary. + pop xCX + test ecx, ecx + jnz .xcr0_after_skip + pop xAX + pop xDX + xsetbv ; ecx is already zero +.xcr0_after_skip: + + ; Restore host general purpose registers. + MYPOPAD + + mov eax, VINF_SUCCESS + + popf + pop xBP +%ifdef RT_ARCH_AMD64 + add xSP, 6*xCB +%endif + ret +ENDPROC SVMR0VMRun + + +%ifdef RT_ARCH_AMD64 +;; +; Prepares for and executes VMRUN (64 bits guests) +; +; @returns VBox status code +; @param HCPhysVmcbHost msc:rcx,gcc:rdi Physical address of host VMCB. +; @param HCPhysVmcb msc:rdx,gcc:rsi Physical address of guest VMCB. +; @param pCtx msc:r8,gcc:rdx Pointer to the guest-CPU context. +; @param pVM msc:r9,gcc:rcx The cross context VM structure. +; @param pVCpu msc:[rsp+28],gcc:r8 The cross context virtual CPU structure of the calling EMT. +; +ALIGNCODE(16) +BEGINPROC SVMR0VMRun64 + ; Fake a cdecl stack frame + %ifdef ASM_CALL64_GCC + push r8 ;pVCpu + push rcx ;pVM + push rdx ;pCtx + push rsi ;HCPhysVmcb + push rdi ;HCPhysVmcbHost + %else + mov rax, [rsp + 28h] + push rax ; rbp + 30h pVCpu + push r9 ; rbp + 28h pVM + push r8 ; rbp + 20h pCtx + push rdx ; rbp + 18h HCPhysVmcb + push rcx ; rbp + 10h HCPhysVmcbHost + %endif + push 0 ; rbp + 08h "fake ret addr" + push rbp ; rbp + 00h + mov rbp, rsp + pushf + + ; Manual save and restore: + ; - General purpose registers except RIP, RSP, RAX + ; + ; Trashed: + ; - CR2 (we don't care) + ; - LDTR (reset to 0) + ; - DRx (presumably not changed at all) + ; - DR7 (reset to 0x400) + + ; Save all general purpose host registers. 
+ MYPUSHAD + + ; Load pCtx into xSI. + mov xSI, [rbp + xCB * 2 + RTHCPHYS_CB * 2] + + ; Save the host XCR0 and load the guest one if necessary. + mov rax, [xBP + 30h] ; pVCpu + test byte [xAX + VMCPU.hm + HMCPU.fLoadSaveGuestXcr0], 1 + jz .xcr0_before_skip + + xor ecx, ecx + xgetbv ; save the host XCR0 on the stack. + push xDX + push xAX + + mov xSI, [xBP + xCB * 2 + RTHCPHYS_CB * 2] ; pCtx + mov eax, [xSI + CPUMCTX.aXcr] ; load the guest XCR0 + mov edx, [xSI + CPUMCTX.aXcr + 4] + xor ecx, ecx ; paranoia + xsetbv + + push 0 ; indicate that we must restore XCR0 (popped into ecx, thus 0) + jmp .xcr0_before_done + +.xcr0_before_skip: + push 3fh ; indicate that we need not restore XCR0 +.xcr0_before_done: + + ; Save guest CPU-context pointer for simplifying saving of the GPRs afterwards. + push rsi + + ; Save host fs, gs, sysenter msr etc. + mov rax, [rbp + xCB * 2] ; HCPhysVmcbHost (full 64 bits physical address) + push rax ; save for the vmload after vmrun + vmsave + + ; Fight spectre. + INDIRECT_BRANCH_PREDICTION_BARRIER xSI, CPUMCTX_WSF_IBPB_ENTRY + + ; Setup rax for VMLOAD. + mov rax, [rbp + xCB * 2 + RTHCPHYS_CB] ; HCPhysVmcb (full 64 bits physical address) + + ; Load guest general purpose registers (rax is loaded from the VMCB by VMRUN). + mov rbx, qword [xSI + CPUMCTX.ebx] + mov rcx, qword [xSI + CPUMCTX.ecx] + mov rdx, qword [xSI + CPUMCTX.edx] + mov rdi, qword [xSI + CPUMCTX.edi] + mov rbp, qword [xSI + CPUMCTX.ebp] + mov r8, qword [xSI + CPUMCTX.r8] + mov r9, qword [xSI + CPUMCTX.r9] + mov r10, qword [xSI + CPUMCTX.r10] + mov r11, qword [xSI + CPUMCTX.r11] + mov r12, qword [xSI + CPUMCTX.r12] + mov r13, qword [xSI + CPUMCTX.r13] + mov r14, qword [xSI + CPUMCTX.r14] + mov r15, qword [xSI + CPUMCTX.r15] + mov rsi, qword [xSI + CPUMCTX.esi] + + ; Clear the global interrupt flag & execute sti to make sure external interrupts cause a world switch. + clgi + sti + + ; Load guest FS, GS, Sysenter MSRs etc. 
+ vmload + + ; Run the VM. + vmrun + + ; Save guest fs, gs, sysenter msr etc. + vmsave + + ; Load host fs, gs, sysenter msr etc. + pop rax ; load HCPhysVmcbHost (pushed above) + vmload + + ; Set the global interrupt flag again, but execute cli to make sure IF=0. + cli + stgi + + ; Pop the context pointer (pushed above) and save the guest GPRs (sans RSP and RAX). + pop rax + + mov qword [rax + CPUMCTX.ebx], rbx + mov rbx, SPECTRE_FILLER64 + mov qword [rax + CPUMCTX.ecx], rcx + mov rcx, rbx + mov qword [rax + CPUMCTX.edx], rdx + mov rdx, rbx + mov qword [rax + CPUMCTX.esi], rsi + mov rsi, rbx + mov qword [rax + CPUMCTX.edi], rdi + mov rdi, rbx + mov qword [rax + CPUMCTX.ebp], rbp + mov rbp, rbx + mov qword [rax + CPUMCTX.r8], r8 + mov r8, rbx + mov qword [rax + CPUMCTX.r9], r9 + mov r9, rbx + mov qword [rax + CPUMCTX.r10], r10 + mov r10, rbx + mov qword [rax + CPUMCTX.r11], r11 + mov r11, rbx + mov qword [rax + CPUMCTX.r12], r12 + mov r12, rbx + mov qword [rax + CPUMCTX.r13], r13 + mov r13, rbx + mov qword [rax + CPUMCTX.r14], r14 + mov r14, rbx + mov qword [rax + CPUMCTX.r15], r15 + mov r15, rbx + + ; Fight spectre. Note! Trashes rax! + INDIRECT_BRANCH_PREDICTION_BARRIER rax, CPUMCTX_WSF_IBPB_EXIT + + ; Restore the host xcr0 if necessary. + pop xCX + test ecx, ecx + jnz .xcr0_after_skip + pop xAX + pop xDX + xsetbv ; ecx is already zero +.xcr0_after_skip: + + ; Restore host general purpose registers. + MYPOPAD + + mov eax, VINF_SUCCESS + + popf + pop rbp + add rsp, 6 * xCB + ret +ENDPROC SVMR0VMRun64 +%endif ; RT_ARCH_AMD64 + diff --git a/src/VBox/VMM/VMMR0/HMSVMR0.cpp b/src/VBox/VMM/VMMR0/HMSVMR0.cpp new file mode 100644 index 00000000..50338e0e --- /dev/null +++ b/src/VBox/VMM/VMMR0/HMSVMR0.cpp @@ -0,0 +1,8232 @@ +/* $Id: HMSVMR0.cpp $ */ +/** @file + * HM SVM (AMD-V) - Host Context Ring-0. + */ + +/* + * Copyright (C) 2013-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. 
This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_HM +#define VMCPU_INCL_CPUM_GST_CTX +#include <iprt/asm-amd64-x86.h> +#include <iprt/thread.h> + +#include <VBox/vmm/pdmapi.h> +#include <VBox/vmm/dbgf.h> +#include <VBox/vmm/iem.h> +#include <VBox/vmm/iom.h> +#include <VBox/vmm/tm.h> +#include <VBox/vmm/em.h> +#include <VBox/vmm/gim.h> +#include <VBox/vmm/apic.h> +#include "HMInternal.h" +#include <VBox/vmm/vm.h> +#include <VBox/err.h> +#include "HMSVMR0.h" +#include "dtrace/VBoxVMM.h" + +#ifdef DEBUG_ramshankar +# define HMSVM_SYNC_FULL_GUEST_STATE +# define HMSVM_ALWAYS_TRAP_ALL_XCPTS +# define HMSVM_ALWAYS_TRAP_PF +# define HMSVM_ALWAYS_TRAP_TASK_SWITCH +#endif + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ +#ifdef VBOX_WITH_STATISTICS +# define HMSVM_EXITCODE_STAM_COUNTER_INC(u64ExitCode) do { \ + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitAll); \ + if ((u64ExitCode) == SVM_EXIT_NPF) \ + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitReasonNpf); \ + else \ + STAM_COUNTER_INC(&pVCpu->hm.s.paStatExitReasonR0[(u64ExitCode) & MASK_EXITREASON_STAT]); \ + } while (0) + +# ifdef 
VBOX_WITH_NESTED_HWVIRT_SVM +# define HMSVM_NESTED_EXITCODE_STAM_COUNTER_INC(u64ExitCode) do { \ + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitAll); \ + if ((u64ExitCode) == SVM_EXIT_NPF) \ + STAM_COUNTER_INC(&pVCpu->hm.s.StatNestedExitReasonNpf); \ + else \ + STAM_COUNTER_INC(&pVCpu->hm.s.paStatNestedExitReasonR0[(u64ExitCode) & MASK_EXITREASON_STAT]); \ + } while (0) +# endif +#else +# define HMSVM_EXITCODE_STAM_COUNTER_INC(u64ExitCode) do { } while (0) +# ifdef VBOX_WITH_NESTED_HWVIRT_SVM +# define HMSVM_NESTED_EXITCODE_STAM_COUNTER_INC(u64ExitCode) do { } while (0) +# endif +#endif /* !VBOX_WITH_STATISTICS */ + +/** If we decide to use a function table approach this can be useful to + * switch to a "static DECLCALLBACK(int)". */ +#define HMSVM_EXIT_DECL static int + +/** + * Subset of the guest-CPU state that is kept by SVM R0 code while executing the + * guest using hardware-assisted SVM. + * + * This excludes state like TSC AUX, GPRs (other than RSP, RAX) which are always + * swapped and restored across the world-switch and also registers like + * EFER, PAT MSR etc. which cannot be modified by the guest without causing a + * \#VMEXIT. + */ +#define HMSVM_CPUMCTX_EXTRN_ALL ( CPUMCTX_EXTRN_RIP \ + | CPUMCTX_EXTRN_RFLAGS \ + | CPUMCTX_EXTRN_RAX \ + | CPUMCTX_EXTRN_RSP \ + | CPUMCTX_EXTRN_SREG_MASK \ + | CPUMCTX_EXTRN_CR0 \ + | CPUMCTX_EXTRN_CR2 \ + | CPUMCTX_EXTRN_CR3 \ + | CPUMCTX_EXTRN_TABLE_MASK \ + | CPUMCTX_EXTRN_DR6 \ + | CPUMCTX_EXTRN_DR7 \ + | CPUMCTX_EXTRN_KERNEL_GS_BASE \ + | CPUMCTX_EXTRN_SYSCALL_MSRS \ + | CPUMCTX_EXTRN_SYSENTER_MSRS \ + | CPUMCTX_EXTRN_HWVIRT \ + | CPUMCTX_EXTRN_HM_SVM_MASK) + +/** + * Subset of the guest-CPU state that is shared between the guest and host. + */ +#define HMSVM_CPUMCTX_SHARED_STATE CPUMCTX_EXTRN_DR_MASK + +/** Macro for importing guest state from the VMCB back into CPUMCTX. 
*/ +#define HMSVM_CPUMCTX_IMPORT_STATE(a_pVCpu, a_fWhat) \ + do { \ + if ((a_pVCpu)->cpum.GstCtx.fExtrn & (a_fWhat)) \ + hmR0SvmImportGuestState((a_pVCpu), (a_fWhat)); \ + } while (0) + +/** Assert that the required state bits are fetched. */ +#define HMSVM_CPUMCTX_ASSERT(a_pVCpu, a_fExtrnMbz) AssertMsg(!((a_pVCpu)->cpum.GstCtx.fExtrn & (a_fExtrnMbz)), \ + ("fExtrn=%#RX64 fExtrnMbz=%#RX64\n", \ + (a_pVCpu)->cpum.GstCtx.fExtrn, (a_fExtrnMbz))) + +/** Assert that preemption is disabled or covered by thread-context hooks. */ +#define HMSVM_ASSERT_PREEMPT_SAFE(a_pVCpu) Assert( VMMR0ThreadCtxHookIsEnabled((a_pVCpu)) \ + || !RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + +/** Assert that we haven't migrated CPUs when thread-context hooks are not + * used. */ +#define HMSVM_ASSERT_CPU_SAFE(a_pVCpu) AssertMsg( VMMR0ThreadCtxHookIsEnabled((a_pVCpu)) \ + || (a_pVCpu)->hm.s.idEnteredCpu == RTMpCpuId(), \ + ("Illegal migration! Entered on CPU %u Current %u\n", \ + (a_pVCpu)->hm.s.idEnteredCpu, RTMpCpuId())); + +/** Assert that we're not executing a nested-guest. */ +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +# define HMSVM_ASSERT_NOT_IN_NESTED_GUEST(a_pCtx) Assert(!CPUMIsGuestInSvmNestedHwVirtMode((a_pCtx))) +#else +# define HMSVM_ASSERT_NOT_IN_NESTED_GUEST(a_pCtx) do { NOREF((a_pCtx)); } while (0) +#endif + +/** Assert that we're executing a nested-guest. */ +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +# define HMSVM_ASSERT_IN_NESTED_GUEST(a_pCtx) Assert(CPUMIsGuestInSvmNestedHwVirtMode((a_pCtx))) +#else +# define HMSVM_ASSERT_IN_NESTED_GUEST(a_pCtx) do { NOREF((a_pCtx)); } while (0) +#endif + +/** Macro for checking and returning from the using function for + * \#VMEXIT intercepts that maybe caused during delivering of another + * event in the guest. 
*/ +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +# define HMSVM_CHECK_EXIT_DUE_TO_EVENT_DELIVERY(a_pVCpu, a_pSvmTransient) \ + do \ + { \ + int rc = hmR0SvmCheckExitDueToEventDelivery((a_pVCpu), (a_pSvmTransient)); \ + if (RT_LIKELY(rc == VINF_SUCCESS)) { /* continue #VMEXIT handling */ } \ + else if ( rc == VINF_HM_DOUBLE_FAULT) { return VINF_SUCCESS; } \ + else if ( rc == VINF_EM_RESET \ + && CPUMIsGuestSvmCtrlInterceptSet((a_pVCpu), &(a_pVCpu)->cpum.GstCtx, SVM_CTRL_INTERCEPT_SHUTDOWN)) \ + { \ + HMSVM_CPUMCTX_IMPORT_STATE((a_pVCpu), HMSVM_CPUMCTX_EXTRN_ALL); \ + return VBOXSTRICTRC_TODO(IEMExecSvmVmexit((a_pVCpu), SVM_EXIT_SHUTDOWN, 0, 0)); \ + } \ + else \ + return rc; \ + } while (0) +#else +# define HMSVM_CHECK_EXIT_DUE_TO_EVENT_DELIVERY(a_pVCpu, a_pSvmTransient) \ + do \ + { \ + int rc = hmR0SvmCheckExitDueToEventDelivery((a_pVCpu), (a_pSvmTransient)); \ + if (RT_LIKELY(rc == VINF_SUCCESS)) { /* continue #VMEXIT handling */ } \ + else if ( rc == VINF_HM_DOUBLE_FAULT) { return VINF_SUCCESS; } \ + else \ + return rc; \ + } while (0) +#endif + +/** Macro for upgrading a @a a_rc to VINF_EM_DBG_STEPPED after emulating an + * instruction that exited. */ +#define HMSVM_CHECK_SINGLE_STEP(a_pVCpu, a_rc) \ + do { \ + if ((a_pVCpu)->hm.s.fSingleInstruction && (a_rc) == VINF_SUCCESS) \ + (a_rc) = VINF_EM_DBG_STEPPED; \ + } while (0) + +/** Validate segment descriptor granularity bit. */ +#ifdef VBOX_STRICT +# define HMSVM_ASSERT_SEG_GRANULARITY(a_pCtx, reg) \ + AssertMsg( !(a_pCtx)->reg.Attr.n.u1Present \ + || ( (a_pCtx)->reg.Attr.n.u1Granularity \ + ? ((a_pCtx)->reg.u32Limit & 0xfff) == 0xfff \ + : (a_pCtx)->reg.u32Limit <= UINT32_C(0xfffff)), \ + ("Invalid Segment Attributes Limit=%#RX32 Attr=%#RX32 Base=%#RX64\n", (a_pCtx)->reg.u32Limit, \ + (a_pCtx)->reg.Attr.u, (a_pCtx)->reg.u64Base)) +#else +# define HMSVM_ASSERT_SEG_GRANULARITY(a_pCtx, reg) do { } while (0) +#endif + +/** + * Exception bitmap mask for all contributory exceptions. 
+ * + * Page fault is deliberately excluded here as it's conditional as to whether + * it's contributory or benign. Page faults are handled separately. + */ +#define HMSVM_CONTRIBUTORY_XCPT_MASK ( RT_BIT(X86_XCPT_GP) | RT_BIT(X86_XCPT_NP) | RT_BIT(X86_XCPT_SS) | RT_BIT(X86_XCPT_TS) \ + | RT_BIT(X86_XCPT_DE)) + +/** + * Mandatory/unconditional guest control intercepts. + * + * SMIs can and do happen in normal operation. We need not intercept them + * while executing the guest (or nested-guest). + */ +#define HMSVM_MANDATORY_GUEST_CTRL_INTERCEPTS ( SVM_CTRL_INTERCEPT_INTR \ + | SVM_CTRL_INTERCEPT_NMI \ + | SVM_CTRL_INTERCEPT_INIT \ + | SVM_CTRL_INTERCEPT_RDPMC \ + | SVM_CTRL_INTERCEPT_CPUID \ + | SVM_CTRL_INTERCEPT_RSM \ + | SVM_CTRL_INTERCEPT_HLT \ + | SVM_CTRL_INTERCEPT_IOIO_PROT \ + | SVM_CTRL_INTERCEPT_MSR_PROT \ + | SVM_CTRL_INTERCEPT_INVLPGA \ + | SVM_CTRL_INTERCEPT_SHUTDOWN \ + | SVM_CTRL_INTERCEPT_FERR_FREEZE \ + | SVM_CTRL_INTERCEPT_VMRUN \ + | SVM_CTRL_INTERCEPT_SKINIT \ + | SVM_CTRL_INTERCEPT_WBINVD \ + | SVM_CTRL_INTERCEPT_MONITOR \ + | SVM_CTRL_INTERCEPT_MWAIT \ + | SVM_CTRL_INTERCEPT_CR0_SEL_WRITE \ + | SVM_CTRL_INTERCEPT_XSETBV) + +/** @name VMCB Clean Bits. + * + * These flags are used for VMCB-state caching. A set VMCB Clean bit indicates + * AMD-V doesn't need to reload the corresponding value(s) from the VMCB in + * memory. + * + * @{ */ +/** All intercept vectors, TSC offset, PAUSE filter counter. */ +#define HMSVM_VMCB_CLEAN_INTERCEPTS RT_BIT(0) +/** I/O permission bitmap, MSR permission bitmap. */ +#define HMSVM_VMCB_CLEAN_IOPM_MSRPM RT_BIT(1) +/** ASID. */ +#define HMSVM_VMCB_CLEAN_ASID RT_BIT(2) +/** TPR: V_TPR, V_IRQ, V_INTR_PRIO, V_IGN_TPR, V_INTR_MASKING, +V_INTR_VECTOR. */ +#define HMSVM_VMCB_CLEAN_INT_CTRL RT_BIT(3) +/** Nested Paging: Nested CR3 (nCR3), PAT. */ +#define HMSVM_VMCB_CLEAN_NP RT_BIT(4) +/** Control registers (CR0, CR3, CR4, EFER). */ +#define HMSVM_VMCB_CLEAN_CRX_EFER RT_BIT(5) +/** Debug registers (DR6, DR7). 
*/ +#define HMSVM_VMCB_CLEAN_DRX RT_BIT(6) +/** GDT, IDT limit and base. */ +#define HMSVM_VMCB_CLEAN_DT RT_BIT(7) +/** Segment register: CS, SS, DS, ES limit and base. */ +#define HMSVM_VMCB_CLEAN_SEG RT_BIT(8) +/** CR2.*/ +#define HMSVM_VMCB_CLEAN_CR2 RT_BIT(9) +/** Last-branch record (DbgCtlMsr, br_from, br_to, lastint_from, lastint_to) */ +#define HMSVM_VMCB_CLEAN_LBR RT_BIT(10) +/** AVIC (AVIC APIC_BAR; AVIC APIC_BACKING_PAGE, AVIC +PHYSICAL_TABLE and AVIC LOGICAL_TABLE Pointers). */ +#define HMSVM_VMCB_CLEAN_AVIC RT_BIT(11) +/** Mask of all valid VMCB Clean bits. */ +#define HMSVM_VMCB_CLEAN_ALL ( HMSVM_VMCB_CLEAN_INTERCEPTS \ + | HMSVM_VMCB_CLEAN_IOPM_MSRPM \ + | HMSVM_VMCB_CLEAN_ASID \ + | HMSVM_VMCB_CLEAN_INT_CTRL \ + | HMSVM_VMCB_CLEAN_NP \ + | HMSVM_VMCB_CLEAN_CRX_EFER \ + | HMSVM_VMCB_CLEAN_DRX \ + | HMSVM_VMCB_CLEAN_DT \ + | HMSVM_VMCB_CLEAN_SEG \ + | HMSVM_VMCB_CLEAN_CR2 \ + | HMSVM_VMCB_CLEAN_LBR \ + | HMSVM_VMCB_CLEAN_AVIC) +/** @} */ + +/** @name SVM transient. + * + * A state structure for holding miscellaneous information across AMD-V + * VMRUN/\#VMEXIT operation, restored after the transition. + * + * @{ */ +typedef struct SVMTRANSIENT +{ + /** The host's rflags/eflags. */ + RTCCUINTREG fEFlags; +#if HC_ARCH_BITS == 32 + uint32_t u32Alignment0; +#endif + + /** The \#VMEXIT exit code (the EXITCODE field in the VMCB). */ + uint64_t u64ExitCode; + /** The guest's TPR value used for TPR shadowing. */ + uint8_t u8GuestTpr; + /** Alignment. */ + uint8_t abAlignment0[7]; + + /** Pointer to the currently executing VMCB. */ + PSVMVMCB pVmcb; + /** Whether we are currently executing a nested-guest. */ + bool fIsNestedGuest; + + /** Whether the guest debug state was active at the time of \#VMEXIT. */ + bool fWasGuestDebugStateActive; + /** Whether the hyper debug state was active at the time of \#VMEXIT. */ + bool fWasHyperDebugStateActive; + /** Whether the TSC offset mode needs to be updated. 
*/ + bool fUpdateTscOffsetting; + /** Whether the TSC_AUX MSR needs restoring on \#VMEXIT. */ + bool fRestoreTscAuxMsr; + /** Whether the \#VMEXIT was caused by a page-fault during delivery of a + * contributary exception or a page-fault. */ + bool fVectoringDoublePF; + /** Whether the \#VMEXIT was caused by a page-fault during delivery of an + * external interrupt or NMI. */ + bool fVectoringPF; +} SVMTRANSIENT, *PSVMTRANSIENT; +AssertCompileMemberAlignment(SVMTRANSIENT, u64ExitCode, sizeof(uint64_t)); +AssertCompileMemberAlignment(SVMTRANSIENT, pVmcb, sizeof(uint64_t)); +/** @} */ + +/** + * MSRPM (MSR permission bitmap) read permissions (for guest RDMSR). + */ +typedef enum SVMMSREXITREAD +{ + /** Reading this MSR causes a \#VMEXIT. */ + SVMMSREXIT_INTERCEPT_READ = 0xb, + /** Reading this MSR does not cause a \#VMEXIT. */ + SVMMSREXIT_PASSTHRU_READ +} SVMMSREXITREAD; + +/** + * MSRPM (MSR permission bitmap) write permissions (for guest WRMSR). + */ +typedef enum SVMMSREXITWRITE +{ + /** Writing to this MSR causes a \#VMEXIT. */ + SVMMSREXIT_INTERCEPT_WRITE = 0xd, + /** Writing to this MSR does not cause a \#VMEXIT. */ + SVMMSREXIT_PASSTHRU_WRITE +} SVMMSREXITWRITE; + +/** + * SVM \#VMEXIT handler. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param pSvmTransient Pointer to the SVM-transient structure. + */ +typedef int FNSVMEXITHANDLER(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient); + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +static void hmR0SvmPendingEventToTrpmTrap(PVMCPU pVCpu); +static void hmR0SvmLeave(PVMCPU pVCpu, bool fImportState); + + +/** @name \#VMEXIT handlers. 
+ * @{ + */ +static FNSVMEXITHANDLER hmR0SvmExitIntr; +static FNSVMEXITHANDLER hmR0SvmExitWbinvd; +static FNSVMEXITHANDLER hmR0SvmExitInvd; +static FNSVMEXITHANDLER hmR0SvmExitCpuid; +static FNSVMEXITHANDLER hmR0SvmExitRdtsc; +static FNSVMEXITHANDLER hmR0SvmExitRdtscp; +static FNSVMEXITHANDLER hmR0SvmExitRdpmc; +static FNSVMEXITHANDLER hmR0SvmExitInvlpg; +static FNSVMEXITHANDLER hmR0SvmExitHlt; +static FNSVMEXITHANDLER hmR0SvmExitMonitor; +static FNSVMEXITHANDLER hmR0SvmExitMwait; +static FNSVMEXITHANDLER hmR0SvmExitShutdown; +static FNSVMEXITHANDLER hmR0SvmExitUnexpected; +static FNSVMEXITHANDLER hmR0SvmExitReadCRx; +static FNSVMEXITHANDLER hmR0SvmExitWriteCRx; +static FNSVMEXITHANDLER hmR0SvmExitMsr; +static FNSVMEXITHANDLER hmR0SvmExitReadDRx; +static FNSVMEXITHANDLER hmR0SvmExitWriteDRx; +static FNSVMEXITHANDLER hmR0SvmExitXsetbv; +static FNSVMEXITHANDLER hmR0SvmExitIOInstr; +static FNSVMEXITHANDLER hmR0SvmExitNestedPF; +static FNSVMEXITHANDLER hmR0SvmExitVIntr; +static FNSVMEXITHANDLER hmR0SvmExitTaskSwitch; +static FNSVMEXITHANDLER hmR0SvmExitVmmCall; +static FNSVMEXITHANDLER hmR0SvmExitPause; +static FNSVMEXITHANDLER hmR0SvmExitFerrFreeze; +static FNSVMEXITHANDLER hmR0SvmExitIret; +static FNSVMEXITHANDLER hmR0SvmExitXcptPF; +static FNSVMEXITHANDLER hmR0SvmExitXcptUD; +static FNSVMEXITHANDLER hmR0SvmExitXcptMF; +static FNSVMEXITHANDLER hmR0SvmExitXcptDB; +static FNSVMEXITHANDLER hmR0SvmExitXcptAC; +static FNSVMEXITHANDLER hmR0SvmExitXcptBP; +static FNSVMEXITHANDLER hmR0SvmExitXcptGP; +#if defined(HMSVM_ALWAYS_TRAP_ALL_XCPTS) || defined(VBOX_WITH_NESTED_HWVIRT_SVM) +static FNSVMEXITHANDLER hmR0SvmExitXcptGeneric; +#endif +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +static FNSVMEXITHANDLER hmR0SvmExitClgi; +static FNSVMEXITHANDLER hmR0SvmExitStgi; +static FNSVMEXITHANDLER hmR0SvmExitVmload; +static FNSVMEXITHANDLER hmR0SvmExitVmsave; +static FNSVMEXITHANDLER hmR0SvmExitInvlpga; +static FNSVMEXITHANDLER hmR0SvmExitVmrun; +static FNSVMEXITHANDLER hmR0SvmNestedExitXcptDB; 
+static FNSVMEXITHANDLER hmR0SvmNestedExitXcptBP; +#endif +/** @} */ + +static int hmR0SvmHandleExit(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient); +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +static int hmR0SvmHandleExitNested(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient); +#endif + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +/** Ring-0 memory object for the IO bitmap. */ +static RTR0MEMOBJ g_hMemObjIOBitmap = NIL_RTR0MEMOBJ; +/** Physical address of the IO bitmap. */ +static RTHCPHYS g_HCPhysIOBitmap; +/** Pointer to the IO bitmap. */ +static R0PTRTYPE(void *) g_pvIOBitmap; + +#ifdef VBOX_STRICT +# define HMSVM_LOG_RBP_RSP RT_BIT_32(0) +# define HMSVM_LOG_CR_REGS RT_BIT_32(1) +# define HMSVM_LOG_CS RT_BIT_32(2) +# define HMSVM_LOG_SS RT_BIT_32(3) +# define HMSVM_LOG_FS RT_BIT_32(4) +# define HMSVM_LOG_GS RT_BIT_32(5) +# define HMSVM_LOG_LBR RT_BIT_32(6) +# define HMSVM_LOG_ALL ( HMSVM_LOG_RBP_RSP \ + | HMSVM_LOG_CR_REGS \ + | HMSVM_LOG_CS \ + | HMSVM_LOG_SS \ + | HMSVM_LOG_FS \ + | HMSVM_LOG_GS \ + | HMSVM_LOG_LBR) + +/** + * Dumps virtual CPU state and additional info. to the logger for diagnostics. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * @param pszPrefix Log prefix. + * @param fFlags Log flags, see HMSVM_LOG_XXX. + * @param uVerbose The verbosity level, currently unused. 
+ */ +static void hmR0SvmLogState(PVMCPU pVCpu, PCSVMVMCB pVmcb, const char *pszPrefix, uint32_t fFlags, uint8_t uVerbose) +{ + RT_NOREF2(pVCpu, uVerbose); + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + + HMSVM_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_RIP | CPUMCTX_EXTRN_RFLAGS); + Log4(("%s: cs:rip=%04x:%RX64 efl=%#RX64\n", pszPrefix, pCtx->cs.Sel, pCtx->rip, pCtx->rflags.u)); + + if (fFlags & HMSVM_LOG_RBP_RSP) + { + HMSVM_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_RSP | CPUMCTX_EXTRN_RBP); + Log4(("%s: rsp=%#RX64 rbp=%#RX64\n", pszPrefix, pCtx->rsp, pCtx->rbp)); + } + + if (fFlags & HMSVM_LOG_CR_REGS) + { + HMSVM_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_CR3 | CPUMCTX_EXTRN_CR4); + Log4(("%s: cr0=%#RX64 cr3=%#RX64 cr4=%#RX64\n", pszPrefix, pCtx->cr0, pCtx->cr3, pCtx->cr4)); + } + + if (fFlags & HMSVM_LOG_CS) + { + HMSVM_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CS); + Log4(("%s: cs={%04x base=%016RX64 limit=%08x flags=%08x}\n", pszPrefix, pCtx->cs.Sel, pCtx->cs.u64Base, + pCtx->cs.u32Limit, pCtx->cs.Attr.u)); + } + if (fFlags & HMSVM_LOG_SS) + { + HMSVM_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_SS); + Log4(("%s: ss={%04x base=%016RX64 limit=%08x flags=%08x}\n", pszPrefix, pCtx->ss.Sel, pCtx->ss.u64Base, + pCtx->ss.u32Limit, pCtx->ss.Attr.u)); + } + if (fFlags & HMSVM_LOG_FS) + { + HMSVM_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_FS); + Log4(("%s: fs={%04x base=%016RX64 limit=%08x flags=%08x}\n", pszPrefix, pCtx->fs.Sel, pCtx->fs.u64Base, + pCtx->fs.u32Limit, pCtx->fs.Attr.u)); + } + if (fFlags & HMSVM_LOG_GS) + { + HMSVM_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_GS); + Log4(("%s: gs={%04x base=%016RX64 limit=%08x flags=%08x}\n", pszPrefix, pCtx->gs.Sel, pCtx->gs.u64Base, + pCtx->gs.u32Limit, pCtx->gs.Attr.u)); + } + + PCSVMVMCBSTATESAVE pVmcbGuest = &pVmcb->guest; + if (fFlags & HMSVM_LOG_LBR) + { + Log4(("%s: br_from=%#RX64 br_to=%#RX64 lastxcpt_from=%#RX64 lastxcpt_to=%#RX64\n", pszPrefix, pVmcbGuest->u64BR_FROM, + pVmcbGuest->u64BR_TO, pVmcbGuest->u64LASTEXCPFROM, 
pVmcbGuest->u64LASTEXCPTO)); + } + NOREF(pszPrefix); NOREF(pVmcbGuest); NOREF(pCtx); +} +#endif /* VBOX_STRICT */ + + +/** + * Sets up and activates AMD-V on the current CPU. + * + * @returns VBox status code. + * @param pHostCpu The HM physical-CPU structure. + * @param pVM The cross context VM structure. Can be + * NULL after a resume! + * @param pvCpuPage Pointer to the global CPU page. + * @param HCPhysCpuPage Physical address of the global CPU page. + * @param fEnabledByHost Whether the host OS has already initialized AMD-V. + * @param pHwvirtMsrs Pointer to the hardware-virtualization MSRs (currently + * unused). + */ +VMMR0DECL(int) SVMR0EnableCpu(PHMPHYSCPU pHostCpu, PVM pVM, void *pvCpuPage, RTHCPHYS HCPhysCpuPage, bool fEnabledByHost, + PCSUPHWVIRTMSRS pHwvirtMsrs) +{ + Assert(!fEnabledByHost); + Assert(HCPhysCpuPage && HCPhysCpuPage != NIL_RTHCPHYS); + Assert(RT_ALIGN_T(HCPhysCpuPage, _4K, RTHCPHYS) == HCPhysCpuPage); + Assert(pvCpuPage); NOREF(pvCpuPage); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + RT_NOREF2(fEnabledByHost, pHwvirtMsrs); + + /* Paranoid: Disable interrupt as, in theory, interrupt handlers might mess with EFER. */ + RTCCUINTREG const fEFlags = ASMIntDisableFlags(); + + /* + * We must turn on AMD-V and setup the host state physical address, as those MSRs are per CPU. + */ + uint64_t u64HostEfer = ASMRdMsr(MSR_K6_EFER); + if (u64HostEfer & MSR_K6_EFER_SVME) + { + /* If the VBOX_HWVIRTEX_IGNORE_SVM_IN_USE is active, then we blindly use AMD-V. */ + if ( pVM + && pVM->hm.s.svm.fIgnoreInUseError) + pHostCpu->fIgnoreAMDVInUseError = true; + + if (!pHostCpu->fIgnoreAMDVInUseError) + { + ASMSetFlags(fEFlags); + return VERR_SVM_IN_USE; + } + } + + /* Turn on AMD-V in the EFER MSR. */ + ASMWrMsr(MSR_K6_EFER, u64HostEfer | MSR_K6_EFER_SVME); + + /* Write the physical page address where the CPU will store the host state while executing the VM. */ + ASMWrMsr(MSR_K8_VM_HSAVE_PA, HCPhysCpuPage); + + /* Restore interrupts. 
*/ + ASMSetFlags(fEFlags); + + /* + * Theoretically, other hypervisors may have used ASIDs, ideally we should flush all + * non-zero ASIDs when enabling SVM. AMD doesn't have an SVM instruction to flush all + * ASIDs (flushing is done upon VMRUN). Therefore, flag that we need to flush the TLB + * entirely before executing any guest code. + */ + pHostCpu->fFlushAsidBeforeUse = true; + + /* + * Ensure each VCPU scheduled on this CPU gets a new ASID on resume. See @bugref{6255}. + */ + ++pHostCpu->cTlbFlushes; + + return VINF_SUCCESS; +} + + +/** + * Deactivates AMD-V on the current CPU. + * + * @returns VBox status code. + * @param pvCpuPage Pointer to the global CPU page. + * @param HCPhysCpuPage Physical address of the global CPU page. + */ +VMMR0DECL(int) SVMR0DisableCpu(void *pvCpuPage, RTHCPHYS HCPhysCpuPage) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + AssertReturn( HCPhysCpuPage + && HCPhysCpuPage != NIL_RTHCPHYS, VERR_INVALID_PARAMETER); + AssertReturn(pvCpuPage, VERR_INVALID_PARAMETER); + + /* Paranoid: Disable interrupts as, in theory, interrupt handlers might mess with EFER. */ + RTCCUINTREG const fEFlags = ASMIntDisableFlags(); + + /* Turn off AMD-V in the EFER MSR. */ + uint64_t u64HostEfer = ASMRdMsr(MSR_K6_EFER); + ASMWrMsr(MSR_K6_EFER, u64HostEfer & ~MSR_K6_EFER_SVME); + + /* Invalidate host state physical address. */ + ASMWrMsr(MSR_K8_VM_HSAVE_PA, 0); + + /* Restore interrupts. */ + ASMSetFlags(fEFlags); + + return VINF_SUCCESS; +} + + +/** + * Does global AMD-V initialization (called during module initialization). + * + * @returns VBox status code. + */ +VMMR0DECL(int) SVMR0GlobalInit(void) +{ + /* + * Allocate 12 KB (3 pages) for the IO bitmap. Since this is non-optional and we always + * intercept all IO accesses, it's done once globally here instead of per-VM. 
+ */ + Assert(g_hMemObjIOBitmap == NIL_RTR0MEMOBJ); + int rc = RTR0MemObjAllocCont(&g_hMemObjIOBitmap, SVM_IOPM_PAGES << X86_PAGE_4K_SHIFT, false /* fExecutable */); + if (RT_FAILURE(rc)) + return rc; + + g_pvIOBitmap = RTR0MemObjAddress(g_hMemObjIOBitmap); + g_HCPhysIOBitmap = RTR0MemObjGetPagePhysAddr(g_hMemObjIOBitmap, 0 /* iPage */); + + /* Set all bits to intercept all IO accesses. */ + ASMMemFill32(g_pvIOBitmap, SVM_IOPM_PAGES << X86_PAGE_4K_SHIFT, UINT32_C(0xffffffff)); + + return VINF_SUCCESS; +} + + +/** + * Does global AMD-V termination (called during module termination). + */ +VMMR0DECL(void) SVMR0GlobalTerm(void) +{ + if (g_hMemObjIOBitmap != NIL_RTR0MEMOBJ) + { + RTR0MemObjFree(g_hMemObjIOBitmap, true /* fFreeMappings */); + g_pvIOBitmap = NULL; + g_HCPhysIOBitmap = 0; + g_hMemObjIOBitmap = NIL_RTR0MEMOBJ; + } +} + + +/** + * Frees any allocated per-VCPU structures for a VM. + * + * @param pVM The cross context VM structure. + */ +DECLINLINE(void) hmR0SvmFreeStructs(PVM pVM) +{ + for (uint32_t i = 0; i < pVM->cCpus; i++) + { + PVMCPU pVCpu = &pVM->aCpus[i]; + AssertPtr(pVCpu); + + if (pVCpu->hm.s.svm.hMemObjVmcbHost != NIL_RTR0MEMOBJ) + { + RTR0MemObjFree(pVCpu->hm.s.svm.hMemObjVmcbHost, false); + pVCpu->hm.s.svm.HCPhysVmcbHost = 0; + pVCpu->hm.s.svm.hMemObjVmcbHost = NIL_RTR0MEMOBJ; + } + + if (pVCpu->hm.s.svm.hMemObjVmcb != NIL_RTR0MEMOBJ) + { + RTR0MemObjFree(pVCpu->hm.s.svm.hMemObjVmcb, false); + pVCpu->hm.s.svm.pVmcb = NULL; + pVCpu->hm.s.svm.HCPhysVmcb = 0; + pVCpu->hm.s.svm.hMemObjVmcb = NIL_RTR0MEMOBJ; + } + + if (pVCpu->hm.s.svm.hMemObjMsrBitmap != NIL_RTR0MEMOBJ) + { + RTR0MemObjFree(pVCpu->hm.s.svm.hMemObjMsrBitmap, false); + pVCpu->hm.s.svm.pvMsrBitmap = NULL; + pVCpu->hm.s.svm.HCPhysMsrBitmap = 0; + pVCpu->hm.s.svm.hMemObjMsrBitmap = NIL_RTR0MEMOBJ; + } + } +} + + +/** + * Does per-VM AMD-V initialization. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. 
+ */ +VMMR0DECL(int) SVMR0InitVM(PVM pVM) +{ + int rc = VERR_INTERNAL_ERROR_5; + + /* + * Check for an AMD CPU erratum which requires us to flush the TLB before every world-switch. + */ + uint32_t u32Family; + uint32_t u32Model; + uint32_t u32Stepping; + if (HMIsSubjectToSvmErratum170(&u32Family, &u32Model, &u32Stepping)) + { + Log4Func(("AMD cpu with erratum 170 family %#x model %#x stepping %#x\n", u32Family, u32Model, u32Stepping)); + pVM->hm.s.svm.fAlwaysFlushTLB = true; + } + + /* + * Initialize the R0 memory objects up-front so we can properly cleanup on allocation failures. + */ + for (VMCPUID i = 0; i < pVM->cCpus; i++) + { + PVMCPU pVCpu = &pVM->aCpus[i]; + pVCpu->hm.s.svm.hMemObjVmcbHost = NIL_RTR0MEMOBJ; + pVCpu->hm.s.svm.hMemObjVmcb = NIL_RTR0MEMOBJ; + pVCpu->hm.s.svm.hMemObjMsrBitmap = NIL_RTR0MEMOBJ; + } + + for (VMCPUID i = 0; i < pVM->cCpus; i++) + { + PVMCPU pVCpu = &pVM->aCpus[i]; + + /* + * Allocate one page for the host-context VM control block (VMCB). This is used for additional host-state (such as + * FS, GS, Kernel GS Base, etc.) apart from the host-state save area specified in MSR_K8_VM_HSAVE_PA. + */ + rc = RTR0MemObjAllocCont(&pVCpu->hm.s.svm.hMemObjVmcbHost, SVM_VMCB_PAGES << PAGE_SHIFT, false /* fExecutable */); + if (RT_FAILURE(rc)) + goto failure_cleanup; + + void *pvVmcbHost = RTR0MemObjAddress(pVCpu->hm.s.svm.hMemObjVmcbHost); + pVCpu->hm.s.svm.HCPhysVmcbHost = RTR0MemObjGetPagePhysAddr(pVCpu->hm.s.svm.hMemObjVmcbHost, 0 /* iPage */); + Assert(pVCpu->hm.s.svm.HCPhysVmcbHost < _4G); + ASMMemZeroPage(pvVmcbHost); + + /* + * Allocate one page for the guest-state VMCB. 
+ */ + rc = RTR0MemObjAllocCont(&pVCpu->hm.s.svm.hMemObjVmcb, SVM_VMCB_PAGES << PAGE_SHIFT, false /* fExecutable */); + if (RT_FAILURE(rc)) + goto failure_cleanup; + + pVCpu->hm.s.svm.pVmcb = (PSVMVMCB)RTR0MemObjAddress(pVCpu->hm.s.svm.hMemObjVmcb); + pVCpu->hm.s.svm.HCPhysVmcb = RTR0MemObjGetPagePhysAddr(pVCpu->hm.s.svm.hMemObjVmcb, 0 /* iPage */); + Assert(pVCpu->hm.s.svm.HCPhysVmcb < _4G); + ASMMemZeroPage(pVCpu->hm.s.svm.pVmcb); + + /* + * Allocate two pages (8 KB) for the MSR permission bitmap. There doesn't seem to be a way to convince + * SVM to not require one. + */ + rc = RTR0MemObjAllocCont(&pVCpu->hm.s.svm.hMemObjMsrBitmap, SVM_MSRPM_PAGES << X86_PAGE_4K_SHIFT, + false /* fExecutable */); + if (RT_FAILURE(rc)) + goto failure_cleanup; + + pVCpu->hm.s.svm.pvMsrBitmap = RTR0MemObjAddress(pVCpu->hm.s.svm.hMemObjMsrBitmap); + pVCpu->hm.s.svm.HCPhysMsrBitmap = RTR0MemObjGetPagePhysAddr(pVCpu->hm.s.svm.hMemObjMsrBitmap, 0 /* iPage */); + /* Set all bits to intercept all MSR accesses (changed later on). */ + ASMMemFill32(pVCpu->hm.s.svm.pvMsrBitmap, SVM_MSRPM_PAGES << X86_PAGE_4K_SHIFT, UINT32_C(0xffffffff)); + } + + return VINF_SUCCESS; + +failure_cleanup: + hmR0SvmFreeStructs(pVM); + return rc; +} + + +/** + * Does per-VM AMD-V termination. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMR0DECL(int) SVMR0TermVM(PVM pVM) +{ + hmR0SvmFreeStructs(pVM); + return VINF_SUCCESS; +} + + +/** + * Returns whether the VMCB Clean Bits feature is supported. + * + * @return @c true if supported, @c false otherwise. + * @param pVCpu The cross context virtual CPU structure. 
+ */ +DECLINLINE(bool) hmR0SvmSupportsVmcbCleanBits(PVMCPU pVCpu) +{ + PVM pVM = pVCpu->CTX_SUFF(pVM); +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + if (CPUMIsGuestInSvmNestedHwVirtMode(&pVCpu->cpum.GstCtx)) + { + return (pVM->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_VMCB_CLEAN) + && pVM->cpum.ro.GuestFeatures.fSvmVmcbClean; + } +#endif + return RT_BOOL(pVM->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_VMCB_CLEAN); +} + + +/** + * Returns whether the decode assists feature is supported. + * + * @return @c true if supported, @c false otherwise. + * @param pVCpu The cross context virtual CPU structure. + */ +DECLINLINE(bool) hmR0SvmSupportsDecodeAssists(PVMCPU pVCpu) +{ + PVM pVM = pVCpu->CTX_SUFF(pVM); +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + if (CPUMIsGuestInSvmNestedHwVirtMode(&pVCpu->cpum.GstCtx)) + { + return (pVM->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_DECODE_ASSISTS) + && pVM->cpum.ro.GuestFeatures.fSvmDecodeAssists; + } +#endif + return RT_BOOL(pVM->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_DECODE_ASSISTS); +} + + +/** + * Returns whether the NRIP_SAVE feature is supported. + * + * @return @c true if supported, @c false otherwise. + * @param pVCpu The cross context virtual CPU structure. + */ +DECLINLINE(bool) hmR0SvmSupportsNextRipSave(PVMCPU pVCpu) +{ + PVM pVM = pVCpu->CTX_SUFF(pVM); +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + if (CPUMIsGuestInSvmNestedHwVirtMode(&pVCpu->cpum.GstCtx)) + { + return (pVM->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_NRIP_SAVE) + && pVM->cpum.ro.GuestFeatures.fSvmNextRipSave; + } +#endif + return RT_BOOL(pVM->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_NRIP_SAVE); +} + + +/** + * Sets the permission bits for the specified MSR in the MSRPM bitmap. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pbMsrBitmap Pointer to the MSR bitmap. + * @param idMsr The MSR for which the permissions are being set. + * @param enmRead MSR read permissions. + * @param enmWrite MSR write permissions. 
+ * + * @remarks This function does -not- clear the VMCB clean bits for MSRPM. The + * caller needs to take care of this. + */ +static void hmR0SvmSetMsrPermission(PVMCPU pVCpu, uint8_t *pbMsrBitmap, uint32_t idMsr, SVMMSREXITREAD enmRead, + SVMMSREXITWRITE enmWrite) +{ + bool const fInNestedGuestMode = CPUMIsGuestInSvmNestedHwVirtMode(&pVCpu->cpum.GstCtx); + uint16_t offMsrpm; + uint8_t uMsrpmBit; + int rc = HMGetSvmMsrpmOffsetAndBit(idMsr, &offMsrpm, &uMsrpmBit); + AssertRC(rc); + + Assert(uMsrpmBit == 0 || uMsrpmBit == 2 || uMsrpmBit == 4 || uMsrpmBit == 6); + Assert(offMsrpm < SVM_MSRPM_PAGES << X86_PAGE_4K_SHIFT); + + pbMsrBitmap += offMsrpm; + if (enmRead == SVMMSREXIT_INTERCEPT_READ) + *pbMsrBitmap |= RT_BIT(uMsrpmBit); + else + { + if (!fInNestedGuestMode) + *pbMsrBitmap &= ~RT_BIT(uMsrpmBit); +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + else + { + /* Only clear the bit if the nested-guest is also not intercepting the MSR read.*/ + uint8_t const *pbNstGstMsrBitmap = (uint8_t *)pVCpu->cpum.GstCtx.hwvirt.svm.CTX_SUFF(pvMsrBitmap); + pbNstGstMsrBitmap += offMsrpm; + if (!(*pbNstGstMsrBitmap & RT_BIT(uMsrpmBit))) + *pbMsrBitmap &= ~RT_BIT(uMsrpmBit); + else + Assert(*pbMsrBitmap & RT_BIT(uMsrpmBit)); + } +#endif + } + + if (enmWrite == SVMMSREXIT_INTERCEPT_WRITE) + *pbMsrBitmap |= RT_BIT(uMsrpmBit + 1); + else + { + if (!fInNestedGuestMode) + *pbMsrBitmap &= ~RT_BIT(uMsrpmBit + 1); +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + else + { + /* Only clear the bit if the nested-guest is also not intercepting the MSR write.*/ + uint8_t const *pbNstGstMsrBitmap = (uint8_t *)pVCpu->cpum.GstCtx.hwvirt.svm.CTX_SUFF(pvMsrBitmap); + pbNstGstMsrBitmap += offMsrpm; + if (!(*pbNstGstMsrBitmap & RT_BIT(uMsrpmBit + 1))) + *pbMsrBitmap &= ~RT_BIT(uMsrpmBit + 1); + else + Assert(*pbMsrBitmap & RT_BIT(uMsrpmBit + 1)); + } +#endif + } +} + + +/** + * Sets up AMD-V for the specified VM. + * This function is only called once per-VM during initialization. + * + * @returns VBox status code. 
+ * @param pVM The cross context VM structure. + */ +VMMR0DECL(int) SVMR0SetupVM(PVM pVM) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + AssertReturn(pVM, VERR_INVALID_PARAMETER); + Assert(pVM->hm.s.svm.fSupported); + + bool const fPauseFilter = RT_BOOL(pVM->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_PAUSE_FILTER); + bool const fPauseFilterThreshold = RT_BOOL(pVM->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_PAUSE_FILTER_THRESHOLD); + bool const fUsePauseFilter = fPauseFilter && pVM->hm.s.svm.cPauseFilter; + + bool const fLbrVirt = RT_BOOL(pVM->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_LBR_VIRT); + bool const fUseLbrVirt = fLbrVirt; /** @todo CFGM, IEM implementation etc. */ + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + bool const fVirtVmsaveVmload = RT_BOOL(pVM->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_VIRT_VMSAVE_VMLOAD); + bool const fUseVirtVmsaveVmload = fVirtVmsaveVmload && pVM->hm.s.svm.fVirtVmsaveVmload && pVM->hm.s.fNestedPaging; + + bool const fVGif = RT_BOOL(pVM->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_VGIF); + bool const fUseVGif = fVGif && pVM->hm.s.svm.fVGif; +#endif + + PVMCPU pVCpu = &pVM->aCpus[0]; + PSVMVMCB pVmcb = pVCpu->hm.s.svm.pVmcb; + AssertMsgReturn(pVmcb, ("Invalid pVmcb for vcpu[0]\n"), VERR_SVM_INVALID_PVMCB); + PSVMVMCBCTRL pVmcbCtrl = &pVmcb->ctrl; + + /* Always trap #AC for reasons of security. */ + pVmcbCtrl->u32InterceptXcpt |= RT_BIT_32(X86_XCPT_AC); + + /* Always trap #DB for reasons of security. */ + pVmcbCtrl->u32InterceptXcpt |= RT_BIT_32(X86_XCPT_DB); + + /* Trap exceptions unconditionally (debug purposes). */ +#ifdef HMSVM_ALWAYS_TRAP_PF + pVmcbCtrl->u32InterceptXcpt |= RT_BIT(X86_XCPT_PF); +#endif +#ifdef HMSVM_ALWAYS_TRAP_ALL_XCPTS + /* If you add any exceptions here, make sure to update hmR0SvmHandleExit(). 
*/ + pVmcbCtrl->u32InterceptXcpt |= 0 + | RT_BIT(X86_XCPT_BP) + | RT_BIT(X86_XCPT_DE) + | RT_BIT(X86_XCPT_NM) + | RT_BIT(X86_XCPT_UD) + | RT_BIT(X86_XCPT_NP) + | RT_BIT(X86_XCPT_SS) + | RT_BIT(X86_XCPT_GP) + | RT_BIT(X86_XCPT_PF) + | RT_BIT(X86_XCPT_MF) + ; +#endif + + /* Apply the exceptions intercepts needed by the GIM provider. */ + if (pVCpu->hm.s.fGIMTrapXcptUD) + pVmcbCtrl->u32InterceptXcpt |= RT_BIT(X86_XCPT_UD); + + /* The mesa 3d driver hack needs #GP. */ + if (pVCpu->hm.s.fTrapXcptGpForLovelyMesaDrv) + pVmcbCtrl->u32InterceptXcpt |= RT_BIT(X86_XCPT_GP); + + /* Set up unconditional intercepts and conditions. */ + pVmcbCtrl->u64InterceptCtrl = HMSVM_MANDATORY_GUEST_CTRL_INTERCEPTS + | SVM_CTRL_INTERCEPT_VMMCALL; + +#ifdef HMSVM_ALWAYS_TRAP_TASK_SWITCH + pVmcbCtrl->u64InterceptCtrl |= SVM_CTRL_INTERCEPT_TASK_SWITCH; +#endif + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + /* Virtualized VMSAVE/VMLOAD. */ + pVmcbCtrl->LbrVirt.n.u1VirtVmsaveVmload = fUseVirtVmsaveVmload; + if (!fUseVirtVmsaveVmload) + { + pVmcbCtrl->u64InterceptCtrl |= SVM_CTRL_INTERCEPT_VMSAVE + | SVM_CTRL_INTERCEPT_VMLOAD; + } + + /* Virtual GIF. */ + pVmcbCtrl->IntCtrl.n.u1VGifEnable = fUseVGif; + if (!fUseVGif) + { + pVmcbCtrl->u64InterceptCtrl |= SVM_CTRL_INTERCEPT_CLGI + | SVM_CTRL_INTERCEPT_STGI; + } +#endif + + /* CR4 writes must always be intercepted for tracking PGM mode changes. */ + pVmcbCtrl->u16InterceptWrCRx = RT_BIT(4); + + /* Intercept all DRx reads and writes by default. Changed later on. */ + pVmcbCtrl->u16InterceptRdDRx = 0xffff; + pVmcbCtrl->u16InterceptWrDRx = 0xffff; + + /* Virtualize masking of INTR interrupts. (reads/writes from/to CR8 go to the V_TPR register) */ + pVmcbCtrl->IntCtrl.n.u1VIntrMasking = 1; + + /* Ignore the priority in the virtual TPR. 
This is necessary for delivering PIC style (ExtInt) interrupts + and we currently deliver both PIC and APIC interrupts alike, see hmR0SvmEvaluatePendingEvent() */ + pVmcbCtrl->IntCtrl.n.u1IgnoreTPR = 1; + + /* Set the IO permission bitmap physical addresses. */ + pVmcbCtrl->u64IOPMPhysAddr = g_HCPhysIOBitmap; + + /* LBR virtualization. */ + pVmcbCtrl->LbrVirt.n.u1LbrVirt = fUseLbrVirt; + + /* The host ASID MBZ, for the guest start with 1. */ + pVmcbCtrl->TLBCtrl.n.u32ASID = 1; + + /* Setup Nested Paging. This doesn't change throughout the execution time of the VM. */ + pVmcbCtrl->NestedPagingCtrl.n.u1NestedPaging = pVM->hm.s.fNestedPaging; + + /* Without Nested Paging, we need additional intercepts. */ + if (!pVM->hm.s.fNestedPaging) + { + /* CR3 reads/writes must be intercepted; our shadow values differ from the guest values. */ + pVmcbCtrl->u16InterceptRdCRx |= RT_BIT(3); + pVmcbCtrl->u16InterceptWrCRx |= RT_BIT(3); + + /* Intercept INVLPG and task switches (may change CR3, EFLAGS, LDT). */ + pVmcbCtrl->u64InterceptCtrl |= SVM_CTRL_INTERCEPT_INVLPG + | SVM_CTRL_INTERCEPT_TASK_SWITCH; + + /* Page faults must be intercepted to implement shadow paging. */ + pVmcbCtrl->u32InterceptXcpt |= RT_BIT(X86_XCPT_PF); + } + + /* Setup Pause Filter for guest pause-loop (spinlock) exiting. */ + if (fUsePauseFilter) + { + Assert(pVM->hm.s.svm.cPauseFilter > 0); + pVmcbCtrl->u16PauseFilterCount = pVM->hm.s.svm.cPauseFilter; + if (fPauseFilterThreshold) + pVmcbCtrl->u16PauseFilterThreshold = pVM->hm.s.svm.cPauseFilterThresholdTicks; + pVmcbCtrl->u64InterceptCtrl |= SVM_CTRL_INTERCEPT_PAUSE; + } + + /* + * Setup the MSR permission bitmap. + * The following MSRs are saved/restored automatically during the world-switch. + * Don't intercept guest read/write accesses to these MSRs. 
+ */ + uint8_t *pbMsrBitmap = (uint8_t *)pVCpu->hm.s.svm.pvMsrBitmap; + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_K8_LSTAR, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_PASSTHRU_WRITE); + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_K8_CSTAR, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_PASSTHRU_WRITE); + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_K6_STAR, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_PASSTHRU_WRITE); + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_K8_SF_MASK, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_PASSTHRU_WRITE); + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_K8_FS_BASE, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_PASSTHRU_WRITE); + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_K8_GS_BASE, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_PASSTHRU_WRITE); + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_K8_KERNEL_GS_BASE, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_PASSTHRU_WRITE); + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_IA32_SYSENTER_CS, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_PASSTHRU_WRITE); + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_IA32_SYSENTER_ESP, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_PASSTHRU_WRITE); + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_IA32_SYSENTER_EIP, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_PASSTHRU_WRITE); + pVmcbCtrl->u64MSRPMPhysAddr = pVCpu->hm.s.svm.HCPhysMsrBitmap; + + /* Initially all VMCB clean bits MBZ indicating that everything should be loaded from the VMCB in memory. */ + Assert(pVmcbCtrl->u32VmcbCleanBits == 0); + + for (VMCPUID i = 1; i < pVM->cCpus; i++) + { + PVMCPU pVCpuCur = &pVM->aCpus[i]; + PSVMVMCB pVmcbCur = pVM->aCpus[i].hm.s.svm.pVmcb; + AssertMsgReturn(pVmcbCur, ("Invalid pVmcb for vcpu[%u]\n", i), VERR_SVM_INVALID_PVMCB); + PSVMVMCBCTRL pVmcbCtrlCur = &pVmcbCur->ctrl; + + /* Copy the VMCB control area. */ + memcpy(pVmcbCtrlCur, pVmcbCtrl, sizeof(*pVmcbCtrlCur)); + + /* Copy the MSR bitmap and setup the VCPU-specific host physical address. 
*/ + uint8_t *pbMsrBitmapCur = (uint8_t *)pVCpuCur->hm.s.svm.pvMsrBitmap; + memcpy(pbMsrBitmapCur, pbMsrBitmap, SVM_MSRPM_PAGES << X86_PAGE_4K_SHIFT); + pVmcbCtrlCur->u64MSRPMPhysAddr = pVCpuCur->hm.s.svm.HCPhysMsrBitmap; + + /* Initially all VMCB clean bits MBZ indicating that everything should be loaded from the VMCB in memory. */ + Assert(pVmcbCtrlCur->u32VmcbCleanBits == 0); + + /* Verify our assumption that GIM providers trap #UD uniformly across VCPUs initially. */ + Assert(pVCpuCur->hm.s.fGIMTrapXcptUD == pVCpu->hm.s.fGIMTrapXcptUD); + } + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + LogRel(("HM: fUsePauseFilter=%RTbool fUseLbrVirt=%RTbool fUseVGif=%RTbool fUseVirtVmsaveVmload=%RTbool\n", fUsePauseFilter, + fUseLbrVirt, fUseVGif, fUseVirtVmsaveVmload)); +#else + LogRel(("HM: fUsePauseFilter=%RTbool fUseLbrVirt=%RTbool\n", fUsePauseFilter, fUseLbrVirt)); +#endif + return VINF_SUCCESS; +} + + +/** + * Gets a pointer to the currently active guest (or nested-guest) VMCB. + * + * @returns Pointer to the current context VMCB. + * @param pVCpu The cross context virtual CPU structure. + */ +DECLINLINE(PSVMVMCB) hmR0SvmGetCurrentVmcb(PVMCPU pVCpu) +{ +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + if (CPUMIsGuestInSvmNestedHwVirtMode(&pVCpu->cpum.GstCtx)) + return pVCpu->cpum.GstCtx.hwvirt.svm.CTX_SUFF(pVmcb); +#endif + return pVCpu->hm.s.svm.pVmcb; +} + + +/** + * Gets a pointer to the nested-guest VMCB cache. + * + * @returns Pointer to the nested-guest VMCB cache. + * @param pVCpu The cross context virtual CPU structure. + */ +DECLINLINE(PSVMNESTEDVMCBCACHE) hmR0SvmGetNestedVmcbCache(PVMCPU pVCpu) +{ +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + Assert(pVCpu->hm.s.svm.NstGstVmcbCache.fCacheValid); + return &pVCpu->hm.s.svm.NstGstVmcbCache; +#else + RT_NOREF(pVCpu); + return NULL; +#endif +} + + +/** + * Invalidates a guest page by guest virtual address. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. 
+ * @param GCVirt Guest virtual address of the page to invalidate. + */ +VMMR0DECL(int) SVMR0InvalidatePage(PVMCPU pVCpu, RTGCPTR GCVirt) +{ + Assert(pVCpu->CTX_SUFF(pVM)->hm.s.svm.fSupported); + + bool const fFlushPending = pVCpu->CTX_SUFF(pVM)->hm.s.svm.fAlwaysFlushTLB || VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_TLB_FLUSH); + + /* Skip it if a TLB flush is already pending. */ + if (!fFlushPending) + { + Log4Func(("%#RGv\n", GCVirt)); + + PSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + AssertMsgReturn(pVmcb, ("Invalid pVmcb!\n"), VERR_SVM_INVALID_PVMCB); + +#if HC_ARCH_BITS == 32 + /* If we get a flush in 64-bit guest mode, then force a full TLB flush. INVLPGA takes only 32-bit addresses. */ + if (CPUMIsGuestInLongMode(pVCpu)) + VMCPU_FF_SET(pVCpu, VMCPU_FF_TLB_FLUSH); + else +#endif + { + SVMR0InvlpgA(GCVirt, pVmcb->ctrl.TLBCtrl.n.u32ASID); + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushTlbInvlpgVirt); + } + } + return VINF_SUCCESS; +} + + +/** + * Flushes the appropriate tagged-TLB entries. + * + * @param pHostCpu The HM physical-CPU structure. + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + */ +static void hmR0SvmFlushTaggedTlb(PHMPHYSCPU pHostCpu, PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + /* + * Force a TLB flush for the first world switch if the current CPU differs from the one + * we ran on last. This can happen both for start & resume due to long jumps back to + * ring-3. + * + * We also force a TLB flush every time when executing a nested-guest VCPU as there is no + * correlation between it and the physical CPU. + * + * If the TLB flush count changed, another VM (VCPU rather) has hit the ASID limit while + * flushing the TLB, so we cannot reuse the ASIDs without flushing. 
+ */ + bool fNewAsid = false; + Assert(pHostCpu->idCpu != NIL_RTCPUID); + if ( pVCpu->hm.s.idLastCpu != pHostCpu->idCpu + || pVCpu->hm.s.cTlbFlushes != pHostCpu->cTlbFlushes +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + || CPUMIsGuestInSvmNestedHwVirtMode(&pVCpu->cpum.GstCtx) +#endif + ) + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushTlbWorldSwitch); + pVCpu->hm.s.fForceTLBFlush = true; + fNewAsid = true; + } + + /* Set TLB flush state as checked until we return from the world switch. */ + ASMAtomicWriteBool(&pVCpu->hm.s.fCheckedTLBFlush, true); + + /* Check for explicit TLB flushes. */ + if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_TLB_FLUSH)) + { + pVCpu->hm.s.fForceTLBFlush = true; + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushTlb); + } + + /* + * If the CPU is subject to AMD erratum 170, we need to flush the entire TLB for each world switch. Sad. + * This Host CPU requirement takes precedence. + */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + if (pVM->hm.s.svm.fAlwaysFlushTLB) + { + pHostCpu->uCurrentAsid = 1; + pVCpu->hm.s.uCurrentAsid = 1; + pVCpu->hm.s.cTlbFlushes = pHostCpu->cTlbFlushes; + pVCpu->hm.s.idLastCpu = pHostCpu->idCpu; + pVmcb->ctrl.TLBCtrl.n.u8TLBFlush = SVM_TLB_FLUSH_ENTIRE; + + /* Clear the VMCB Clean Bit for NP while flushing the TLB. See @bugref{7152}. */ + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_NP; + } + else + { + pVmcb->ctrl.TLBCtrl.n.u8TLBFlush = SVM_TLB_FLUSH_NOTHING; + if (pVCpu->hm.s.fForceTLBFlush) + { + /* Clear the VMCB Clean Bit for NP while flushing the TLB. See @bugref{7152}. */ + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_NP; + + if (fNewAsid) + { + ++pHostCpu->uCurrentAsid; + + bool fHitASIDLimit = false; + if (pHostCpu->uCurrentAsid >= pVM->hm.s.uMaxAsid) + { + pHostCpu->uCurrentAsid = 1; /* Wraparound at 1; host uses 0 */ + pHostCpu->cTlbFlushes++; /* All VCPUs that run on this host CPU must use a new ASID. 
*/ + fHitASIDLimit = true; + } + + if ( fHitASIDLimit + || pHostCpu->fFlushAsidBeforeUse) + { + pVmcb->ctrl.TLBCtrl.n.u8TLBFlush = SVM_TLB_FLUSH_ENTIRE; + pHostCpu->fFlushAsidBeforeUse = false; + } + + pVCpu->hm.s.uCurrentAsid = pHostCpu->uCurrentAsid; + pVCpu->hm.s.idLastCpu = pHostCpu->idCpu; + pVCpu->hm.s.cTlbFlushes = pHostCpu->cTlbFlushes; + } + else + { + if (pVM->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_FLUSH_BY_ASID) + pVmcb->ctrl.TLBCtrl.n.u8TLBFlush = SVM_TLB_FLUSH_SINGLE_CONTEXT; + else + pVmcb->ctrl.TLBCtrl.n.u8TLBFlush = SVM_TLB_FLUSH_ENTIRE; + } + + pVCpu->hm.s.fForceTLBFlush = false; + } + } + + /* Update VMCB with the ASID. */ + if (pVmcb->ctrl.TLBCtrl.n.u32ASID != pVCpu->hm.s.uCurrentAsid) + { + pVmcb->ctrl.TLBCtrl.n.u32ASID = pVCpu->hm.s.uCurrentAsid; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_ASID; + } + + AssertMsg(pVCpu->hm.s.idLastCpu == pHostCpu->idCpu, + ("vcpu idLastCpu=%u hostcpu idCpu=%u\n", pVCpu->hm.s.idLastCpu, pHostCpu->idCpu)); + AssertMsg(pVCpu->hm.s.cTlbFlushes == pHostCpu->cTlbFlushes, + ("Flush count mismatch for cpu %u (%u vs %u)\n", pHostCpu->idCpu, pVCpu->hm.s.cTlbFlushes, pHostCpu->cTlbFlushes)); + AssertMsg(pHostCpu->uCurrentAsid >= 1 && pHostCpu->uCurrentAsid < pVM->hm.s.uMaxAsid, + ("cpu%d uCurrentAsid = %x\n", pHostCpu->idCpu, pHostCpu->uCurrentAsid)); + AssertMsg(pVCpu->hm.s.uCurrentAsid >= 1 && pVCpu->hm.s.uCurrentAsid < pVM->hm.s.uMaxAsid, + ("cpu%d VM uCurrentAsid = %x\n", pHostCpu->idCpu, pVCpu->hm.s.uCurrentAsid)); + +#ifdef VBOX_WITH_STATISTICS + if (pVmcb->ctrl.TLBCtrl.n.u8TLBFlush == SVM_TLB_FLUSH_NOTHING) + STAM_COUNTER_INC(&pVCpu->hm.s.StatNoFlushTlbWorldSwitch); + else if ( pVmcb->ctrl.TLBCtrl.n.u8TLBFlush == SVM_TLB_FLUSH_SINGLE_CONTEXT + || pVmcb->ctrl.TLBCtrl.n.u8TLBFlush == SVM_TLB_FLUSH_SINGLE_CONTEXT_RETAIN_GLOBALS) + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushAsid); + } + else + { + Assert(pVmcb->ctrl.TLBCtrl.n.u8TLBFlush == SVM_TLB_FLUSH_ENTIRE); + 
STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushEntire); + } +#endif +} + + +/** @name 64-bit guest on 32-bit host OS helper functions. + * + * The host CPU is still 64-bit capable but the host OS is running in 32-bit + * mode (code segment, paging). These wrappers/helpers perform the necessary + * bits for the 32->64 switcher. + * + * @{ */ +#if HC_ARCH_BITS == 32 && defined(VBOX_ENABLE_64_BITS_GUESTS) +/** + * Prepares for and executes VMRUN (64-bit guests on a 32-bit host). + * + * @returns VBox status code. + * @param HCPhysVmcbHost Physical address of host VMCB. + * @param HCPhysVmcb Physical address of the VMCB. + * @param pCtx Pointer to the guest-CPU context. + * @param pVM The cross context VM structure. + * @param pVCpu The cross context virtual CPU structure. + */ +DECLASM(int) SVMR0VMSwitcherRun64(RTHCPHYS HCPhysVmcbHost, RTHCPHYS HCPhysVmcb, PCPUMCTX pCtx, PVM pVM, PVMCPU pVCpu) +{ + RT_NOREF2(pVM, pCtx); + uint32_t aParam[8]; + aParam[0] = RT_LO_U32(HCPhysVmcbHost); /* Param 1: HCPhysVmcbHost - Lo. */ + aParam[1] = RT_HI_U32(HCPhysVmcbHost); /* Param 1: HCPhysVmcbHost - Hi. */ + aParam[2] = RT_LO_U32(HCPhysVmcb); /* Param 2: HCPhysVmcb - Lo. */ + aParam[3] = RT_HI_U32(HCPhysVmcb); /* Param 2: HCPhysVmcb - Hi. */ + aParam[4] = VM_RC_ADDR(pVM, pVM); + aParam[5] = 0; + aParam[6] = VM_RC_ADDR(pVM, pVCpu); + aParam[7] = 0; + + return SVMR0Execute64BitsHandler(pVCpu, HM64ON32OP_SVMRCVMRun64, RT_ELEMENTS(aParam), &aParam[0]); +} + + +/** + * Executes the specified VMRUN handler in 64-bit mode. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param enmOp The operation to perform. + * @param cParams Number of parameters. + * @param paParam Array of 32-bit parameters. 
+ */ +VMMR0DECL(int) SVMR0Execute64BitsHandler(PVMCPU pVCpu, HM64ON32OP enmOp, uint32_t cParams, uint32_t *paParam) +{ + PVM pVM = pVCpu->CTX_SUFF(pVM); + AssertReturn(pVM->hm.s.pfnHost32ToGuest64R0, VERR_HM_NO_32_TO_64_SWITCHER); + Assert(enmOp > HM64ON32OP_INVALID && enmOp < HM64ON32OP_END); + + /* Disable interrupts. */ + RTHCUINTREG const fEFlags = ASMIntDisableFlags(); + +#ifdef VBOX_WITH_VMMR0_DISABLE_LAPIC_NMI + RTCPUID idHostCpu = RTMpCpuId(); + CPUMR0SetLApic(pVCpu, idHostCpu); +#endif + + CPUMSetHyperESP(pVCpu, VMMGetStackRC(pVCpu)); + CPUMSetHyperEIP(pVCpu, enmOp); + for (int i = (int)cParams - 1; i >= 0; i--) + CPUMPushHyper(pVCpu, paParam[i]); + + /* Call the switcher. */ + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatWorldSwitch3264, z); + int rc = pVM->hm.s.pfnHost32ToGuest64R0(pVM, RT_UOFFSETOF_DYN(VM, aCpus[pVCpu->idCpu].cpum) - RT_UOFFSETOF(VM, cpum)); + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatWorldSwitch3264, z); + + /* Restore interrupts. */ + ASMSetFlags(fEFlags); + return rc; +} + +#endif /* HC_ARCH_BITS == 32 && defined(VBOX_ENABLE_64_BITS_GUESTS) */ +/** @} */ + + +/** + * Sets an exception intercept in the specified VMCB. + * + * @param pVmcb Pointer to the VM control block. + * @param uXcpt The exception (X86_XCPT_*). + */ +DECLINLINE(void) hmR0SvmSetXcptIntercept(PSVMVMCB pVmcb, uint8_t uXcpt) +{ + if (!(pVmcb->ctrl.u32InterceptXcpt & RT_BIT(uXcpt))) + { + pVmcb->ctrl.u32InterceptXcpt |= RT_BIT(uXcpt); + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INTERCEPTS; + } +} + + +/** + * Clears an exception intercept in the specified VMCB. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * @param uXcpt The exception (X86_XCPT_*). + * + * @remarks This takes into account if we're executing a nested-guest and only + * removes the exception intercept if both the guest -and- nested-guest + * are not intercepting it. 
+ */ +DECLINLINE(void) hmR0SvmClearXcptIntercept(PVMCPU pVCpu, PSVMVMCB pVmcb, uint8_t uXcpt) +{ + Assert(uXcpt != X86_XCPT_DB); + Assert(uXcpt != X86_XCPT_AC); + Assert(uXcpt != X86_XCPT_GP); +#ifndef HMSVM_ALWAYS_TRAP_ALL_XCPTS + if (pVmcb->ctrl.u32InterceptXcpt & RT_BIT(uXcpt)) + { + bool fRemove = true; +# ifdef VBOX_WITH_NESTED_HWVIRT_SVM + /* Only remove the intercept if the nested-guest is also not intercepting it! */ + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if (CPUMIsGuestInSvmNestedHwVirtMode(pCtx)) + { + PCSVMNESTEDVMCBCACHE pVmcbNstGstCache = hmR0SvmGetNestedVmcbCache(pVCpu); + fRemove = !(pVmcbNstGstCache->u32InterceptXcpt & RT_BIT(uXcpt)); + } +# else + RT_NOREF(pVCpu); +# endif + if (fRemove) + { + pVmcb->ctrl.u32InterceptXcpt &= ~RT_BIT(uXcpt); + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INTERCEPTS; + } + } +#else + RT_NOREF3(pVCpu, pVmcb, uXcpt); +#endif +} + + +/** + * Sets a control intercept in the specified VMCB. + * + * @param pVmcb Pointer to the VM control block. + * @param fCtrlIntercept The control intercept (SVM_CTRL_INTERCEPT_*). + */ +DECLINLINE(void) hmR0SvmSetCtrlIntercept(PSVMVMCB pVmcb, uint64_t fCtrlIntercept) +{ + if (!(pVmcb->ctrl.u64InterceptCtrl & fCtrlIntercept)) + { + pVmcb->ctrl.u64InterceptCtrl |= fCtrlIntercept; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INTERCEPTS; + } +} + + +/** + * Clears a control intercept in the specified VMCB. + * + * @returns @c true if the intercept is still set, @c false otherwise. + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * @param fCtrlIntercept The control intercept (SVM_CTRL_INTERCEPT_*). + * + * @remarks This takes into account if we're executing a nested-guest and only + * removes the control intercept if both the guest -and- nested-guest + * are not intercepting it. 
+ */ +static bool hmR0SvmClearCtrlIntercept(PVMCPU pVCpu, PSVMVMCB pVmcb, uint64_t fCtrlIntercept) +{ + if (pVmcb->ctrl.u64InterceptCtrl & fCtrlIntercept) + { + bool fRemove = true; +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + /* Only remove the control intercept if the nested-guest is also not intercepting it! */ + if (CPUMIsGuestInSvmNestedHwVirtMode(&pVCpu->cpum.GstCtx)) + { + PCSVMNESTEDVMCBCACHE pVmcbNstGstCache = hmR0SvmGetNestedVmcbCache(pVCpu); + fRemove = !(pVmcbNstGstCache->u64InterceptCtrl & fCtrlIntercept); + } +#else + RT_NOREF(pVCpu); +#endif + if (fRemove) + { + pVmcb->ctrl.u64InterceptCtrl &= ~fCtrlIntercept; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INTERCEPTS; + } + } + + return RT_BOOL(pVmcb->ctrl.u64InterceptCtrl & fCtrlIntercept); +} + + +/** + * Exports the guest (or nested-guest) CR0 into the VMCB. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * + * @remarks This assumes we always pre-load the guest FPU. + * @remarks No-long-jump zone!!! + */ +static void hmR0SvmExportGuestCR0(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + uint64_t const uGuestCr0 = pCtx->cr0; + uint64_t uShadowCr0 = uGuestCr0; + + /* Always enable caching. */ + uShadowCr0 &= ~(X86_CR0_CD | X86_CR0_NW); + + /* When Nested Paging is not available use shadow page tables and intercept #PFs (latter done in SVMR0SetupVM()). */ + if (!pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging) + { + uShadowCr0 |= X86_CR0_PG /* Use shadow page tables. */ + | X86_CR0_WP; /* Guest CPL 0 writes to its read-only pages should cause a #PF #VMEXIT. */ + } + + /* + * Use the #MF style of legacy-FPU error reporting for now. Although AMD-V has MSRs that + * lets us isolate the host from it, IEM/REM still needs work to emulate it properly, + * see @bugref{7243#c103}. 
+ */ + if (!(uGuestCr0 & X86_CR0_NE)) + { + uShadowCr0 |= X86_CR0_NE; + hmR0SvmSetXcptIntercept(pVmcb, X86_XCPT_MF); + } + else + hmR0SvmClearXcptIntercept(pVCpu, pVmcb, X86_XCPT_MF); + + /* + * If the shadow and guest CR0 are identical we can avoid intercepting CR0 reads. + * + * CR0 writes still needs interception as PGM requires tracking paging mode changes, + * see @bugref{6944}. + * + * We also don't ever want to honor weird things like cache disable from the guest. + * However, we can avoid intercepting changes to the TS & MP bits by clearing the CR0 + * write intercept below and keeping SVM_CTRL_INTERCEPT_CR0_SEL_WRITE instead. + */ + if (uShadowCr0 == uGuestCr0) + { + if (!CPUMIsGuestInSvmNestedHwVirtMode(pCtx)) + { + pVmcb->ctrl.u16InterceptRdCRx &= ~RT_BIT(0); + pVmcb->ctrl.u16InterceptWrCRx &= ~RT_BIT(0); + Assert(pVmcb->ctrl.u64InterceptCtrl & SVM_CTRL_INTERCEPT_CR0_SEL_WRITE); + } + else + { + /* If the nested-hypervisor intercepts CR0 reads/writes, we need to continue intercepting them. */ + PCSVMNESTEDVMCBCACHE pVmcbNstGstCache = hmR0SvmGetNestedVmcbCache(pVCpu); + pVmcb->ctrl.u16InterceptRdCRx = (pVmcb->ctrl.u16InterceptRdCRx & ~RT_BIT(0)) + | (pVmcbNstGstCache->u16InterceptRdCRx & RT_BIT(0)); + pVmcb->ctrl.u16InterceptWrCRx = (pVmcb->ctrl.u16InterceptWrCRx & ~RT_BIT(0)) + | (pVmcbNstGstCache->u16InterceptWrCRx & RT_BIT(0)); + } + } + else + { + pVmcb->ctrl.u16InterceptRdCRx |= RT_BIT(0); + pVmcb->ctrl.u16InterceptWrCRx |= RT_BIT(0); + } + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INTERCEPTS; + + Assert(!RT_HI_U32(uShadowCr0)); + if (pVmcb->guest.u64CR0 != uShadowCr0) + { + pVmcb->guest.u64CR0 = uShadowCr0; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_CRX_EFER; + } +} + + +/** + * Exports the guest (or nested-guest) CR3 into the VMCB. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * + * @remarks No-long-jump zone!!! 
+ */ +static void hmR0SvmExportGuestCR3(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + PVM pVM = pVCpu->CTX_SUFF(pVM); + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if (pVM->hm.s.fNestedPaging) + { + PGMMODE enmShwPagingMode; +#if HC_ARCH_BITS == 32 + if (CPUMIsGuestInLongModeEx(pCtx)) + enmShwPagingMode = PGMMODE_AMD64_NX; + else +#endif + enmShwPagingMode = PGMGetHostMode(pVM); + + pVmcb->ctrl.u64NestedPagingCR3 = PGMGetNestedCR3(pVCpu, enmShwPagingMode); + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_NP; + pVmcb->guest.u64CR3 = pCtx->cr3; + Assert(pVmcb->ctrl.u64NestedPagingCR3); + } + else + pVmcb->guest.u64CR3 = PGMGetHyperCR3(pVCpu); + + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_CRX_EFER; +} + + +/** + * Exports the guest (or nested-guest) CR4 into the VMCB. + * + * @returns VBox status code; VERR_PGM_UNSUPPORTED_SHADOW_PAGING_MODE when the + * shadow paging mode cannot be used. + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0SvmExportGuestCR4(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + uint64_t uShadowCr4 = pCtx->cr4; + if (!pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging) + { + switch (pVCpu->hm.s.enmShadowMode) + { + case PGMMODE_REAL: + case PGMMODE_PROTECTED: /* Protected mode, no paging. */ + return VERR_PGM_UNSUPPORTED_SHADOW_PAGING_MODE; + + case PGMMODE_32_BIT: /* 32-bit paging. */ + uShadowCr4 &= ~X86_CR4_PAE; + break; + + case PGMMODE_PAE: /* PAE paging. */ + case PGMMODE_PAE_NX: /* PAE paging with NX enabled. */ + /* Must use PAE paging as we could use physical memory > 4 GB. */ + uShadowCr4 |= X86_CR4_PAE; + break; + + case PGMMODE_AMD64: /* 64-bit AMD paging (long mode). */ + case PGMMODE_AMD64_NX: /* 64-bit AMD paging (long mode) with NX enabled.
*/ +#ifdef VBOX_ENABLE_64_BITS_GUESTS + break; +#else + return VERR_PGM_UNSUPPORTED_SHADOW_PAGING_MODE; +#endif + + default: /* shut up gcc */ + return VERR_PGM_UNSUPPORTED_SHADOW_PAGING_MODE; + } + } + + /* Whether to save/load/restore XCR0 during world switch depends on CR4.OSXSAVE and host+guest XCR0. */ + pVCpu->hm.s.fLoadSaveGuestXcr0 = (pCtx->cr4 & X86_CR4_OSXSAVE) && pCtx->aXcr[0] != ASMGetXcr0(); + + /* Avoid intercepting CR4 reads if the guest and shadow CR4 values are identical. */ + if (uShadowCr4 == pCtx->cr4) + { + if (!CPUMIsGuestInSvmNestedHwVirtMode(pCtx)) + pVmcb->ctrl.u16InterceptRdCRx &= ~RT_BIT(4); + else + { + /* If the nested-hypervisor intercepts CR4 reads, we need to continue intercepting them. */ + PCSVMNESTEDVMCBCACHE pVmcbNstGstCache = hmR0SvmGetNestedVmcbCache(pVCpu); + pVmcb->ctrl.u16InterceptRdCRx = (pVmcb->ctrl.u16InterceptRdCRx & ~RT_BIT(4)) + | (pVmcbNstGstCache->u16InterceptRdCRx & RT_BIT(4)); + } + } + else + pVmcb->ctrl.u16InterceptRdCRx |= RT_BIT(4); + + /* CR4 writes are always intercepted (both guest, nested-guest) for tracking PGM mode changes. */ + Assert(pVmcb->ctrl.u16InterceptWrCRx & RT_BIT(4)); + + /* Update the VMCB with the shadow CR4 and clear the appropriate VMCB clean bits. */ + Assert(!RT_HI_U32(uShadowCr4)); + pVmcb->guest.u64CR4 = uShadowCr4; + pVmcb->ctrl.u32VmcbCleanBits &= ~(HMSVM_VMCB_CLEAN_CRX_EFER | HMSVM_VMCB_CLEAN_INTERCEPTS); + + return VINF_SUCCESS; +} + + +/** + * Exports the guest (or nested-guest) control registers into the VMCB. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * + * @remarks No-long-jump zone!!!
+ */ +static int hmR0SvmExportGuestControlRegs(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_CR_MASK) + { + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_CR0) + hmR0SvmExportGuestCR0(pVCpu, pVmcb); + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_CR2) + { + pVmcb->guest.u64CR2 = pVCpu->cpum.GstCtx.cr2; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_CR2; + } + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_CR3) + hmR0SvmExportGuestCR3(pVCpu, pVmcb); + + /* CR4 re-loading is ASSUMED to be done every time we get in from ring-3! (XCR0) */ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_CR4) + { + int rc = hmR0SvmExportGuestCR4(pVCpu, pVmcb); + if (RT_FAILURE(rc)) + return rc; + } + + pVCpu->hm.s.fCtxChanged &= ~HM_CHANGED_GUEST_CR_MASK; + } + return VINF_SUCCESS; +} + + +/** + * Exports the guest (or nested-guest) segment registers, TR, LDTR, GDTR and + * IDTR into the VMCB. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * + * @remarks No-long-jump zone!!! + */ +static void hmR0SvmExportGuestSegmentRegs(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + + /* Guest segment registers.
*/ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_SREG_MASK) + { + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_CS) + HMSVM_SEG_REG_COPY_TO_VMCB(pCtx, &pVmcb->guest, CS, cs); + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_SS) + { + HMSVM_SEG_REG_COPY_TO_VMCB(pCtx, &pVmcb->guest, SS, ss); + pVmcb->guest.u8CPL = pCtx->ss.Attr.n.u2Dpl; + } + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_DS) + HMSVM_SEG_REG_COPY_TO_VMCB(pCtx, &pVmcb->guest, DS, ds); + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_ES) + HMSVM_SEG_REG_COPY_TO_VMCB(pCtx, &pVmcb->guest, ES, es); + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_FS) + HMSVM_SEG_REG_COPY_TO_VMCB(pCtx, &pVmcb->guest, FS, fs); + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_GS) + HMSVM_SEG_REG_COPY_TO_VMCB(pCtx, &pVmcb->guest, GS, gs); + + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_SEG; + } + + /* Guest TR. */ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_TR) + HMSVM_SEG_REG_COPY_TO_VMCB(pCtx, &pVmcb->guest, TR, tr); + + /* Guest LDTR. */ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_LDTR) + HMSVM_SEG_REG_COPY_TO_VMCB(pCtx, &pVmcb->guest, LDTR, ldtr); + + /* Guest GDTR. */ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_GDTR) + { + pVmcb->guest.GDTR.u32Limit = pCtx->gdtr.cbGdt; + pVmcb->guest.GDTR.u64Base = pCtx->gdtr.pGdt; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_DT; + } + + /* Guest IDTR. */ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_IDTR) + { + pVmcb->guest.IDTR.u32Limit = pCtx->idtr.cbIdt; + pVmcb->guest.IDTR.u64Base = pCtx->idtr.pIdt; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_DT; + } + + pVCpu->hm.s.fCtxChanged &= ~( HM_CHANGED_GUEST_SREG_MASK + | HM_CHANGED_GUEST_TABLE_MASK); +} + + +/** + * Exports the guest (or nested-guest) MSRs into the VMCB. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * + * @remarks No-long-jump zone!!! 
+ */ +static void hmR0SvmExportGuestMsrs(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + + /* Guest Sysenter MSRs. */ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_SYSENTER_MSR_MASK) + { + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_SYSENTER_CS_MSR) + pVmcb->guest.u64SysEnterCS = pCtx->SysEnter.cs; + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_SYSENTER_EIP_MSR) + pVmcb->guest.u64SysEnterEIP = pCtx->SysEnter.eip; + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_SYSENTER_ESP_MSR) + pVmcb->guest.u64SysEnterESP = pCtx->SysEnter.esp; + } + + /* + * Guest EFER MSR. + * AMD-V requires guest EFER.SVME to be set. Weird. + * See AMD spec. 15.5.1 "Basic Operation" | "Canonicalization and Consistency Checks". + */ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_EFER_MSR) + { + pVmcb->guest.u64EFER = pCtx->msrEFER | MSR_K6_EFER_SVME; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_CRX_EFER; + } + + /* If the guest isn't in 64-bit mode, clear MSR_K6_LME bit, otherwise SVM expects amd64 shadow paging. */ + if ( !CPUMIsGuestInLongModeEx(pCtx) + && (pCtx->msrEFER & MSR_K6_EFER_LME)) + { + pVmcb->guest.u64EFER &= ~MSR_K6_EFER_LME; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_CRX_EFER; + } + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_SYSCALL_MSRS) + { + pVmcb->guest.u64STAR = pCtx->msrSTAR; + pVmcb->guest.u64LSTAR = pCtx->msrLSTAR; + pVmcb->guest.u64CSTAR = pCtx->msrCSTAR; + pVmcb->guest.u64SFMASK = pCtx->msrSFMASK; + } + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_KERNEL_GS_BASE) + pVmcb->guest.u64KernelGSBase = pCtx->msrKERNELGSBASE; + + pVCpu->hm.s.fCtxChanged &= ~( HM_CHANGED_GUEST_SYSENTER_MSR_MASK + | HM_CHANGED_GUEST_EFER_MSR + | HM_CHANGED_GUEST_SYSCALL_MSRS + | HM_CHANGED_GUEST_KERNEL_GS_BASE); + + /* + * Setup the PAT MSR (applicable for Nested Paging only). 
+ * + * While guests can modify and see the modified values through the shadow values, + * we shall not honor any guest modifications of this MSR to ensure caching is always + * enabled similar to how we clear CR0.CD and NW bits. + * + * For nested-guests this needs to always be set as well, see @bugref{7243#c109}. + */ + pVmcb->guest.u64PAT = MSR_IA32_CR_PAT_INIT_VAL; + + /* Enable the last branch record bit if LBR virtualization is enabled. */ + if (pVmcb->ctrl.LbrVirt.n.u1LbrVirt) + pVmcb->guest.u64DBGCTL = MSR_IA32_DEBUGCTL_LBR; +} + + +/** + * Exports the guest (or nested-guest) debug state into the VMCB and programs + * the necessary intercepts accordingly. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * + * @remarks No-long-jump zone!!! + * @remarks Requires EFLAGS to be up-to-date in the VMCB! + */ +static void hmR0SvmExportSharedDebugState(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + + /* + * Anyone single stepping on the host side? If so, we'll have to use the + * trap flag in the guest EFLAGS since AMD-V doesn't have a trap flag on + * the VMM level like the VT-x implementations does. + */ + bool fInterceptMovDRx = false; + bool const fStepping = pVCpu->hm.s.fSingleInstruction || DBGFIsStepping(pVCpu); + if (fStepping) + { + pVCpu->hm.s.fClearTrapFlag = true; + pVmcb->guest.u64RFlags |= X86_EFL_TF; + fInterceptMovDRx = true; /* Need clean DR6, no guest mess. */ + } + + if ( fStepping + || (CPUMGetHyperDR7(pVCpu) & X86_DR7_ENABLED_MASK)) + { + /* + * Use the combined guest and host DRx values found in the hypervisor + * register set because the debugger has breakpoints active or someone + * is single stepping on the host side. + * + * Note! DBGF expects a clean DR6 state before executing guest code. 
+ */ +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + if ( CPUMIsGuestInLongModeEx(pCtx) + && !CPUMIsHyperDebugStateActivePending(pVCpu)) + { + CPUMR0LoadHyperDebugState(pVCpu, false /* include DR6 */); + Assert(!CPUMIsGuestDebugStateActivePending(pVCpu)); + Assert(CPUMIsHyperDebugStateActivePending(pVCpu)); + } + else +#endif + if (!CPUMIsHyperDebugStateActive(pVCpu)) + { + CPUMR0LoadHyperDebugState(pVCpu, false /* include DR6 */); + Assert(!CPUMIsGuestDebugStateActive(pVCpu)); + Assert(CPUMIsHyperDebugStateActive(pVCpu)); + } + + /* Update DR6 & DR7. (The other DRx values are handled by CPUM one way or the other.) */ + if ( pVmcb->guest.u64DR6 != X86_DR6_INIT_VAL + || pVmcb->guest.u64DR7 != CPUMGetHyperDR7(pVCpu)) + { + pVmcb->guest.u64DR7 = CPUMGetHyperDR7(pVCpu); + pVmcb->guest.u64DR6 = X86_DR6_INIT_VAL; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_DRX; + } + + /** @todo If we cared, we could optimize to allow the guest to read registers + * with the same values. */ + fInterceptMovDRx = true; + pVCpu->hm.s.fUsingHyperDR7 = true; + Log5(("hmR0SvmExportSharedDebugState: Loaded hyper DRx\n")); + } + else + { + /* + * Update DR6, DR7 with the guest values if necessary. + */ + if ( pVmcb->guest.u64DR7 != pCtx->dr[7] + || pVmcb->guest.u64DR6 != pCtx->dr[6]) + { + pVmcb->guest.u64DR7 = pCtx->dr[7]; + pVmcb->guest.u64DR6 = pCtx->dr[6]; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_DRX; + } + pVCpu->hm.s.fUsingHyperDR7 = false; + + /* + * If the guest has enabled debug registers, we need to load them prior to + * executing guest code so they'll trigger at the right time. + */ + if (pCtx->dr[7] & (X86_DR7_ENABLED_MASK | X86_DR7_GD)) /** @todo Why GD? 
*/ + { +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + if ( CPUMIsGuestInLongModeEx(pCtx) + && !CPUMIsGuestDebugStateActivePending(pVCpu)) + { + CPUMR0LoadGuestDebugState(pVCpu, false /* include DR6 */); + STAM_COUNTER_INC(&pVCpu->hm.s.StatDRxArmed); + Assert(!CPUMIsHyperDebugStateActivePending(pVCpu)); + Assert(CPUMIsGuestDebugStateActivePending(pVCpu)); + } + else +#endif + if (!CPUMIsGuestDebugStateActive(pVCpu)) + { + CPUMR0LoadGuestDebugState(pVCpu, false /* include DR6 */); + STAM_COUNTER_INC(&pVCpu->hm.s.StatDRxArmed); + Assert(!CPUMIsHyperDebugStateActive(pVCpu)); + Assert(CPUMIsGuestDebugStateActive(pVCpu)); + } + Log5(("hmR0SvmExportSharedDebugState: Loaded guest DRx\n")); + } + /* + * If no debugging enabled, we'll lazy load DR0-3. We don't need to + * intercept #DB as DR6 is updated in the VMCB. + * + * Note! If we cared and dared, we could skip intercepting \#DB here. + * However, \#DB shouldn't be performance critical, so we'll play safe + * and keep the code similar to the VT-x code and always intercept it. 
+ */ +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + else if ( !CPUMIsGuestDebugStateActivePending(pVCpu) + && !CPUMIsGuestDebugStateActive(pVCpu)) +#else + else if (!CPUMIsGuestDebugStateActive(pVCpu)) +#endif + { + fInterceptMovDRx = true; + } + } + + Assert(pVmcb->ctrl.u32InterceptXcpt & RT_BIT_32(X86_XCPT_DB)); + if (fInterceptMovDRx) + { + if ( pVmcb->ctrl.u16InterceptRdDRx != 0xffff + || pVmcb->ctrl.u16InterceptWrDRx != 0xffff) + { + pVmcb->ctrl.u16InterceptRdDRx = 0xffff; + pVmcb->ctrl.u16InterceptWrDRx = 0xffff; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INTERCEPTS; + } + } + else + { + if ( pVmcb->ctrl.u16InterceptRdDRx + || pVmcb->ctrl.u16InterceptWrDRx) + { + pVmcb->ctrl.u16InterceptRdDRx = 0; + pVmcb->ctrl.u16InterceptWrDRx = 0; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INTERCEPTS; + } + } + Log4Func(("DR6=%#RX64 DR7=%#RX64\n", pCtx->dr[6], pCtx->dr[7])); +} + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +/** + * Exports the nested-guest hardware virtualization state into the nested-guest + * VMCB. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcbNstGst Pointer to the nested-guest VM control block. + * + * @remarks No-long-jump zone!!! + */ +static void hmR0SvmExportGuestHwvirtStateNested(PVMCPU pVCpu, PSVMVMCB pVmcbNstGst) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_HWVIRT) + { + /* + * Ensure the nested-guest pause-filter counters don't exceed the outer guest values esp. + * since SVM doesn't have a preemption timer. + * + * We do this here rather than in hmR0SvmSetupVmcbNested() as we may have been executing the + * nested-guest in IEM incl. PAUSE instructions which would update the pause-filter counters + * and may continue execution in SVM R0 without a nested-guest #VMEXIT in between. 
+ */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + PSVMVMCBCTRL pVmcbNstGstCtrl = &pVmcbNstGst->ctrl; + uint16_t const uGuestPauseFilterCount = pVM->hm.s.svm.cPauseFilter; + uint16_t const uGuestPauseFilterThreshold = pVM->hm.s.svm.cPauseFilterThresholdTicks; + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_PAUSE)) + { + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + pVmcbNstGstCtrl->u16PauseFilterCount = RT_MIN(pCtx->hwvirt.svm.cPauseFilter, uGuestPauseFilterCount); + pVmcbNstGstCtrl->u16PauseFilterThreshold = RT_MIN(pCtx->hwvirt.svm.cPauseFilterThreshold, uGuestPauseFilterThreshold); + pVmcbNstGstCtrl->u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INTERCEPTS; + } + else + { + pVmcbNstGstCtrl->u16PauseFilterCount = uGuestPauseFilterCount; + pVmcbNstGstCtrl->u16PauseFilterThreshold = uGuestPauseFilterThreshold; + } + + pVCpu->hm.s.fCtxChanged &= ~HM_CHANGED_GUEST_HWVIRT; + } +} +#endif + +/** + * Exports the guest APIC TPR state into the VMCB. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + */ +static int hmR0SvmExportGuestApicTpr(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_APIC_TPR) + { + PVM pVM = pVCpu->CTX_SUFF(pVM); + if ( PDMHasApic(pVM) + && APICIsEnabled(pVCpu)) + { + bool fPendingIntr; + uint8_t u8Tpr; + int rc = APICGetTpr(pVCpu, &u8Tpr, &fPendingIntr, NULL /* pu8PendingIrq */); + AssertRCReturn(rc, rc); + + /* Assume that we need to trap all TPR accesses and thus need not check on + every #VMEXIT if we should update the TPR. */ + Assert(pVmcb->ctrl.IntCtrl.n.u1VIntrMasking); + pVCpu->hm.s.svm.fSyncVTpr = false; + + if (!pVM->hm.s.fTPRPatchingActive) + { + /* Bits 3-0 of the VTPR field correspond to bits 7-4 of the TPR (which is the Task-Priority Class). 
*/ + pVmcb->ctrl.IntCtrl.n.u8VTPR = (u8Tpr >> 4); + + /* If there are interrupts pending, intercept CR8 writes to evaluate ASAP if we + can deliver the interrupt to the guest. */ + if (fPendingIntr) + pVmcb->ctrl.u16InterceptWrCRx |= RT_BIT(8); + else + { + pVmcb->ctrl.u16InterceptWrCRx &= ~RT_BIT(8); + pVCpu->hm.s.svm.fSyncVTpr = true; + } + + pVmcb->ctrl.u32VmcbCleanBits &= ~(HMSVM_VMCB_CLEAN_INTERCEPTS | HMSVM_VMCB_CLEAN_INT_CTRL); + } + else + { + /* 32-bit guests use the LSTAR MSR for patching guest code which touches the TPR. */ + pVmcb->guest.u64LSTAR = u8Tpr; + uint8_t *pbMsrBitmap = (uint8_t *)pVCpu->hm.s.svm.pvMsrBitmap; + + /* If there are interrupts pending, intercept LSTAR writes, otherwise don't intercept reads or writes. */ + if (fPendingIntr) + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_K8_LSTAR, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_INTERCEPT_WRITE); + else + { + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_K8_LSTAR, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_PASSTHRU_WRITE); + pVCpu->hm.s.svm.fSyncVTpr = true; + } + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_IOPM_MSRPM; + } + } + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_APIC_TPR); + } + return VINF_SUCCESS; +} + + +/** + * Sets up the exception intercepts required for guest (or nested-guest) + * execution in the VMCB. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * + * @remarks No-long-jump zone!!! + */ +static void hmR0SvmExportGuestXcptIntercepts(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + /* If we modify intercepts from here, please check & adjust hmR0SvmMergeVmcbCtrlsNested() if required. */ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_SVM_GUEST_XCPT_INTERCEPTS) + { + /* Trap #UD for GIM provider (e.g. for hypercalls).
*/ + if (pVCpu->hm.s.fGIMTrapXcptUD) + hmR0SvmSetXcptIntercept(pVmcb, X86_XCPT_UD); + else + hmR0SvmClearXcptIntercept(pVCpu, pVmcb, X86_XCPT_UD); + + /* Trap #BP for INT3 debug breakpoints set by the VM debugger. */ + if (pVCpu->CTX_SUFF(pVM)->dbgf.ro.cEnabledInt3Breakpoints) + hmR0SvmSetXcptIntercept(pVmcb, X86_XCPT_BP); + else + hmR0SvmClearXcptIntercept(pVCpu, pVmcb, X86_XCPT_BP); + + /* The remaining intercepts are handled elsewhere, e.g. in hmR0SvmExportGuestCR0(). */ + pVCpu->hm.s.fCtxChanged &= ~HM_CHANGED_SVM_GUEST_XCPT_INTERCEPTS; + } +} + + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +/** + * Merges guest and nested-guest intercepts for executing the nested-guest using + * hardware-assisted SVM. + * + * This merges the guest and nested-guest intercepts in a way that if the outer + * guest intercept is set we need to intercept it in the nested-guest as + * well. + * + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks The nested-guest VMCB that gets modified is taken from the guest-CPU + * context of @a pVCpu, the outer guest VMCB from its HM state. + */ +static void hmR0SvmMergeVmcbCtrlsNested(PVMCPU pVCpu) +{ + PVM pVM = pVCpu->CTX_SUFF(pVM); + PCSVMVMCB pVmcb = pVCpu->hm.s.svm.pVmcb; + PSVMVMCB pVmcbNstGst = pVCpu->cpum.GstCtx.hwvirt.svm.CTX_SUFF(pVmcb); + PSVMVMCBCTRL pVmcbNstGstCtrl = &pVmcbNstGst->ctrl; + + /* Merge the guest's CR intercepts into the nested-guest VMCB. */ + pVmcbNstGstCtrl->u16InterceptRdCRx |= pVmcb->ctrl.u16InterceptRdCRx; + pVmcbNstGstCtrl->u16InterceptWrCRx |= pVmcb->ctrl.u16InterceptWrCRx; + + /* Always intercept CR4 writes for tracking PGM mode changes. */ + pVmcbNstGstCtrl->u16InterceptWrCRx |= RT_BIT(4); + + /* Without nested paging, intercept CR3 reads and writes as we load shadow page tables. */ + if (!pVM->hm.s.fNestedPaging) + { + pVmcbNstGstCtrl->u16InterceptRdCRx |= RT_BIT(3); + pVmcbNstGstCtrl->u16InterceptWrCRx |= RT_BIT(3); + } + + /** @todo Figure out debugging with nested-guests, till then just intercept + * all DR[0-15] accesses.
*/ + pVmcbNstGstCtrl->u16InterceptRdDRx |= 0xffff; + pVmcbNstGstCtrl->u16InterceptWrDRx |= 0xffff; + + /* + * Merge the guest's exception intercepts into the nested-guest VMCB. + * + * - #UD: Exclude these as the outer guest's GIM hypercalls are not applicable + * while executing the nested-guest. + * + * - #BP: Exclude breakpoints set by the VM debugger for the outer guest. This can + * be tweaked later depending on how we wish to implement breakpoints. + * + * - #GP: Exclude these as it's the inner VMMs problem to get vmsvga 3d drivers + * loaded into their guests, not ours. + * + * Warning!! This ASSUMES we only intercept \#UD for hypercall purposes and \#BP + * for VM debugger breakpoints, see hmR0SvmExportGuestXcptIntercepts(). + */ +#ifndef HMSVM_ALWAYS_TRAP_ALL_XCPTS + pVmcbNstGstCtrl->u32InterceptXcpt |= pVmcb->ctrl.u32InterceptXcpt + & ~( RT_BIT(X86_XCPT_UD) + | RT_BIT(X86_XCPT_BP) + | (pVCpu->hm.s.fTrapXcptGpForLovelyMesaDrv ? RT_BIT(X86_XCPT_GP) : 0)); +#else + pVmcbNstGstCtrl->u32InterceptXcpt |= pVmcb->ctrl.u32InterceptXcpt; +#endif + + /* + * Adjust intercepts while executing the nested-guest that differ from the + * outer guest intercepts. + * + * - VINTR: Exclude the outer guest intercept as we don't need to cause VINTR #VMEXITs + * that belong to the nested-guest to the outer guest. + * + * - VMMCALL: Exclude the outer guest intercept as when it's also not intercepted by + * the nested-guest, the physical CPU raises a \#UD exception as expected. + */ + pVmcbNstGstCtrl->u64InterceptCtrl |= (pVmcb->ctrl.u64InterceptCtrl & ~( SVM_CTRL_INTERCEPT_VINTR + | SVM_CTRL_INTERCEPT_VMMCALL)) + | HMSVM_MANDATORY_GUEST_CTRL_INTERCEPTS; + + Assert( (pVmcbNstGstCtrl->u64InterceptCtrl & HMSVM_MANDATORY_GUEST_CTRL_INTERCEPTS) + == HMSVM_MANDATORY_GUEST_CTRL_INTERCEPTS); + + /* Finally, update the VMCB clean bits. */ + pVmcbNstGstCtrl->u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INTERCEPTS; +} +#endif + + +/** + * Selects the appropriate function to run guest code. 
+ * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0SvmSelectVMRunHandler(PVMCPU pVCpu) +{ + if (CPUMIsGuestInLongMode(pVCpu)) + { + /* A long-mode guest cannot be run when 64-bit guest support is compiled out. */ +#ifndef VBOX_ENABLE_64_BITS_GUESTS + return VERR_PGM_UNSUPPORTED_SHADOW_PAGING_MODE; +#endif + Assert(pVCpu->CTX_SUFF(pVM)->hm.s.fAllow64BitGuests); /* Guaranteed by hmR3InitFinalizeR0(). */ +#if HC_ARCH_BITS == 32 + /* 32-bit host. We need to switch to 64-bit before running the 64-bit guest. */ + pVCpu->hm.s.svm.pfnVMRun = SVMR0VMSwitcherRun64; +#else + /* 64-bit host or hybrid host. */ + pVCpu->hm.s.svm.pfnVMRun = SVMR0VMRun64; +#endif + } + else + { + /* Guest is not in long mode, use the 32-bit handler. */ + pVCpu->hm.s.svm.pfnVMRun = SVMR0VMRun; + } + return VINF_SUCCESS; +} + + +/** + * Enters the AMD-V session. + * + * Besides sanity checks, this only resets the "leave done" indicator for the + * new session. + * + * @returns VBox status code (currently always VINF_SUCCESS). + * @param pVCpu The cross context virtual CPU structure. + */ +VMMR0DECL(int) SVMR0Enter(PVMCPU pVCpu) +{ + AssertPtr(pVCpu); + Assert(pVCpu->CTX_SUFF(pVM)->hm.s.svm.fSupported); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + LogFlowFunc(("pVCpu=%p\n", pVCpu)); + /* Both the host context and the shared host-guest state are expected to be flagged as changed on entry. */ + Assert((pVCpu->hm.s.fCtxChanged & (HM_CHANGED_HOST_CONTEXT | HM_CHANGED_SVM_HOST_GUEST_SHARED_STATE)) + == (HM_CHANGED_HOST_CONTEXT | HM_CHANGED_SVM_HOST_GUEST_SHARED_STATE)); + + /* New session: the leave work (see hmR0SvmLeaveSession()) has not been done yet. */ + pVCpu->hm.s.fLeaveDone = false; + return VINF_SUCCESS; +} + + +/** + * Thread-context callback for AMD-V. + * + * @param enmEvent The thread-context event. + * @param pVCpu The cross context virtual CPU structure. + * @param fGlobalInit Whether global VT-x/AMD-V init. is used. 
+ * @thread EMT(pVCpu) + */ +VMMR0DECL(void) SVMR0ThreadCtxCallback(RTTHREADCTXEVENT enmEvent, PVMCPU pVCpu, bool fGlobalInit) +{ + NOREF(fGlobalInit); /* Not used by this callback. */ + + switch (enmEvent) + { + case RTTHREADCTXEVENT_OUT: + { + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(VMMR0ThreadCtxHookIsEnabled(pVCpu)); + VMCPU_ASSERT_EMT(pVCpu); + + /* No longjmps (log-flush, locks) in this fragile context. */ + VMMRZCallRing3Disable(pVCpu); + + /* Do the leave work at most once; the guest state is not imported from the VMCB on this path. */ + if (!pVCpu->hm.s.fLeaveDone) + { + hmR0SvmLeave(pVCpu, false /* fImportState */); + pVCpu->hm.s.fLeaveDone = true; + } + + /* Leave HM context, takes care of local init (term). */ + int rc = HMR0LeaveCpu(pVCpu); + AssertRC(rc); NOREF(rc); + + /* Restore longjmp state. */ + VMMRZCallRing3Enable(pVCpu); + STAM_REL_COUNTER_INC(&pVCpu->hm.s.StatSwitchPreempt); + break; + } + + case RTTHREADCTXEVENT_IN: + { + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(VMMR0ThreadCtxHookIsEnabled(pVCpu)); + VMCPU_ASSERT_EMT(pVCpu); + + /* No longjmps (log-flush, locks) in this fragile context. */ + VMMRZCallRing3Disable(pVCpu); + + /* + * Initialize the bare minimum state required for HM. This takes care of + * initializing AMD-V if necessary (onlined CPUs, local init etc.) + */ + int rc = hmR0EnterCpu(pVCpu); + AssertRC(rc); NOREF(rc); + Assert((pVCpu->hm.s.fCtxChanged & (HM_CHANGED_HOST_CONTEXT | HM_CHANGED_SVM_HOST_GUEST_SHARED_STATE)) + == (HM_CHANGED_HOST_CONTEXT | HM_CHANGED_SVM_HOST_GUEST_SHARED_STATE)); + + /* We are back in HM context, so the leave work will have to be done again. */ + pVCpu->hm.s.fLeaveDone = false; + + /* Restore longjmp state. */ + VMMRZCallRing3Enable(pVCpu); + break; + } + + default: + /* No other thread-context events are acted upon here. */ + break; + } +} + + +/** + * Saves the host state. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +VMMR0DECL(int) SVMR0ExportHostState(PVMCPU pVCpu) +{ + NOREF(pVCpu); + + /* Nothing to do here. AMD-V does this for us automatically during the world-switch. 
*/ + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_HOST_CONTEXT); + return VINF_SUCCESS; +} + + +/** + * Exports the guest state from the guest-CPU context into the VMCB. + * + * The CPU state will be loaded from these fields on every successful VM-entry. + * Also sets up the appropriate VMRUN function to execute guest code based on + * the guest CPU mode. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0SvmExportGuestState(PVMCPU pVCpu) +{ + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatExportGuestState, x); + + PSVMVMCB pVmcb = pVCpu->hm.s.svm.pVmcb; + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + + Assert(pVmcb); + HMSVM_ASSERT_NOT_IN_NESTED_GUEST(pCtx); + + /* RIP, RSP, RFLAGS and RAX are exported unconditionally, regardless of the changed flags. */ + pVmcb->guest.u64RIP = pCtx->rip; + pVmcb->guest.u64RSP = pCtx->rsp; + pVmcb->guest.u64RFlags = pCtx->eflags.u32; + pVmcb->guest.u64RAX = pCtx->rax; +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + if (pVmcb->ctrl.IntCtrl.n.u1VGifEnable) + { + Assert(pVCpu->CTX_SUFF(pVM)->hm.s.svm.u32Features & X86_CPUID_SVM_FEATURE_EDX_VGIF); /* Hardware supports it. */ + Assert(HMIsSvmVGifActive(pVCpu->CTX_SUFF(pVM))); /* VM has configured it. */ + pVmcb->ctrl.IntCtrl.n.u1VGif = CPUMGetGuestGif(pCtx); + } +#endif + + /* The exports below run with interrupts disabled; the saved flags are restored on both the + error and the normal path. */ + RTCCUINTREG const fEFlags = ASMIntDisableFlags(); + + int rc = hmR0SvmExportGuestControlRegs(pVCpu, pVmcb); + AssertRCReturnStmt(rc, ASMSetFlags(fEFlags), rc); + + hmR0SvmExportGuestSegmentRegs(pVCpu, pVmcb); + hmR0SvmExportGuestMsrs(pVCpu, pVmcb); + hmR0SvmExportGuestXcptIntercepts(pVCpu, pVmcb); + + ASMSetFlags(fEFlags); + + /* hmR0SvmExportGuestApicTpr() must be called -after- hmR0SvmExportGuestMsrs() as + otherwise we would overwrite the LSTAR MSR that we use for TPR patching. */ + hmR0SvmExportGuestApicTpr(pVCpu, pVmcb); + + rc = hmR0SvmSelectVMRunHandler(pVCpu); + AssertRCReturn(rc, rc); + + /* Clear any bits that may be set but exported unconditionally or unused/reserved bits. 
*/ + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~( HM_CHANGED_GUEST_RIP + | HM_CHANGED_GUEST_RFLAGS + | HM_CHANGED_GUEST_GPRS_MASK + | HM_CHANGED_GUEST_X87 + | HM_CHANGED_GUEST_SSE_AVX + | HM_CHANGED_GUEST_OTHER_XSAVE + | HM_CHANGED_GUEST_XCRx + | HM_CHANGED_GUEST_TSC_AUX + | HM_CHANGED_GUEST_OTHER_MSRS + | HM_CHANGED_GUEST_HWVIRT + | (HM_CHANGED_KEEPER_STATE_MASK & ~HM_CHANGED_SVM_GUEST_XCPT_INTERCEPTS))); + +#ifdef VBOX_STRICT + /* + * All of the guest-CPU state and SVM keeper bits should be exported here by now, + * except for the host-context and/or shared host-guest context bits. + */ + uint64_t const fCtxChanged = ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged); + RT_UNTRUSTED_NONVOLATILE_COPY_FENCE(); + AssertMsg(!(fCtxChanged & (HM_CHANGED_ALL_GUEST & ~HM_CHANGED_SVM_HOST_GUEST_SHARED_STATE)), + ("fCtxChanged=%#RX64\n", fCtxChanged)); + + /* + * If we need to log state that isn't always imported, we'll need to import it here. + * See hmR0SvmPostRunGuest() for which part of the state is imported unconditionally. + */ + hmR0SvmLogState(pVCpu, pVmcb, "hmR0SvmExportGuestState", 0 /* fFlags */, 0 /* uVerbose */); +#endif /* VBOX_STRICT */ + + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExportGuestState, x); + return VINF_SUCCESS; +} + + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +/** + * Merges the guest and nested-guest MSR permission bitmap. + * + * If the guest is intercepting an MSR we need to intercept it regardless of + * whether the nested-guest is intercepting it or not. + * + * @param pHostCpu The HM physical-CPU structure. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jmp zone!!! 
+ */ +DECLINLINE(void) hmR0SvmMergeMsrpmNested(PHMPHYSCPU pHostCpu, PVMCPU pVCpu) +{ + /* The two source bitmaps are only read; the merged result goes to the bitmap hanging off pHostCpu. */ + uint64_t const *pu64GstMsrpm = (uint64_t const *)pVCpu->hm.s.svm.pvMsrBitmap; + uint64_t const *pu64NstGstMsrpm = (uint64_t const *)pVCpu->cpum.GstCtx.hwvirt.svm.CTX_SUFF(pvMsrBitmap); + uint64_t *pu64DstMsrpm = (uint64_t *)pHostCpu->n.svm.pvNstGstMsrpm; + + /* MSRPM bytes from offset 0x1800 are reserved, so we stop merging there. The bitmaps are + processed a quadword at a time, hence the byte offset is converted to a quadword count. */ + uint32_t const offRsvdQwords = 0x1800 >> 3; + /* OR the two bitmaps together, one quadword per iteration. */ + for (uint32_t i = 0; i < offRsvdQwords; i++) + pu64DstMsrpm[i] = pu64NstGstMsrpm[i] | pu64GstMsrpm[i]; +} + + +/** + * Caches the nested-guest VMCB fields before we modify them for execution using + * hardware-assisted SVM. + * + * @returns true if the VMCB was previously already cached, false otherwise. + * @param pVCpu The cross context virtual CPU structure. + * + * @sa HMNotifySvmNstGstVmexit. + */ +static bool hmR0SvmCacheVmcbNested(PVMCPU pVCpu) +{ + /* + * Cache the nested-guest programmed VMCB fields if we have not cached it yet. + * Otherwise we risk re-caching the values we may have modified, see @bugref{7243#c44}. + * + * Nested-paging CR3 is not saved back into the VMCB on #VMEXIT, hence no need to + * cache and restore it, see AMD spec. 15.25.4 "Nested Paging and VMRUN/#VMEXIT". 
+ */ + PSVMNESTEDVMCBCACHE pVmcbNstGstCache = &pVCpu->hm.s.svm.NstGstVmcbCache; + bool const fWasCached = pVmcbNstGstCache->fCacheValid; + if (!fWasCached) + { + /* Snapshot the control-area fields listed below from the nested-guest VMCB into the cache. */ + PCSVMVMCB pVmcbNstGst = pVCpu->cpum.GstCtx.hwvirt.svm.CTX_SUFF(pVmcb); + PCSVMVMCBCTRL pVmcbNstGstCtrl = &pVmcbNstGst->ctrl; + pVmcbNstGstCache->u16InterceptRdCRx = pVmcbNstGstCtrl->u16InterceptRdCRx; + pVmcbNstGstCache->u16InterceptWrCRx = pVmcbNstGstCtrl->u16InterceptWrCRx; + pVmcbNstGstCache->u16InterceptRdDRx = pVmcbNstGstCtrl->u16InterceptRdDRx; + pVmcbNstGstCache->u16InterceptWrDRx = pVmcbNstGstCtrl->u16InterceptWrDRx; + pVmcbNstGstCache->u16PauseFilterThreshold = pVmcbNstGstCtrl->u16PauseFilterThreshold; + pVmcbNstGstCache->u16PauseFilterCount = pVmcbNstGstCtrl->u16PauseFilterCount; + pVmcbNstGstCache->u32InterceptXcpt = pVmcbNstGstCtrl->u32InterceptXcpt; + pVmcbNstGstCache->u64InterceptCtrl = pVmcbNstGstCtrl->u64InterceptCtrl; + pVmcbNstGstCache->u64TSCOffset = pVmcbNstGstCtrl->u64TSCOffset; + pVmcbNstGstCache->fVIntrMasking = pVmcbNstGstCtrl->IntCtrl.n.u1VIntrMasking; + pVmcbNstGstCache->fNestedPaging = pVmcbNstGstCtrl->NestedPagingCtrl.n.u1NestedPaging; + pVmcbNstGstCache->fLbrVirt = pVmcbNstGstCtrl->LbrVirt.n.u1LbrVirt; + /* Mark the cache valid so that later calls leave the cached values untouched. */ + pVmcbNstGstCache->fCacheValid = true; + Log4Func(("Cached VMCB fields\n")); + } + + /* Report the state as it was on entry, i.e. before any caching done by this call. */ + return fWasCached; +} + + +/** + * Sets up the nested-guest VMCB for execution using hardware-assisted SVM. + * + * This is done the first time we enter nested-guest execution using SVM R0 + * until the nested-guest \#VMEXIT (not to be confused with physical CPU + * \#VMEXITs which may or may not cause a corresponding nested-guest \#VMEXIT). + * + * @param pVCpu The cross context virtual CPU structure. + */ +static void hmR0SvmSetupVmcbNested(PVMCPU pVCpu) +{ + PSVMVMCB pVmcbNstGst = pVCpu->cpum.GstCtx.hwvirt.svm.CTX_SUFF(pVmcb); + PSVMVMCBCTRL pVmcbNstGstCtrl = &pVmcbNstGst->ctrl; + + /* + * First cache the nested-guest VMCB fields we may potentially modify. 
+ */ + bool const fVmcbCached = hmR0SvmCacheVmcbNested(pVCpu); + if (!fVmcbCached) + { + /* + * The IOPM of the nested-guest can be ignored because the guest always + * intercepts all IO port accesses. Thus, we'll swap to the guest IOPM rather + * than the nested-guest IOPM and swap the field back on the #VMEXIT. + */ + pVmcbNstGstCtrl->u64IOPMPhysAddr = g_HCPhysIOBitmap; + + /* + * Use the same nested-paging as the outer guest. We can't dynamically switch off + * nested-paging suddenly while executing a VM (see assertion at the end of + * Trap0eHandler() in PGMAllBth.h). + */ + pVmcbNstGstCtrl->NestedPagingCtrl.n.u1NestedPaging = pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging; + + /* Always enable V_INTR_MASKING as we do not want to allow access to the physical APIC TPR. */ + pVmcbNstGstCtrl->IntCtrl.n.u1VIntrMasking = 1; + + /* + * Turn off TPR syncing on #VMEXIT for nested-guests as CR8 intercepts are subject + * to the nested-guest intercepts and we always run with V_INTR_MASKING. + */ + pVCpu->hm.s.svm.fSyncVTpr = false; + +#ifdef DEBUG_ramshankar + /* For debugging purposes - copy the LBR info. from outer guest VMCB. This function has no + local outer guest VMCB pointer, so it is fetched from pVCpu here. */ + pVmcbNstGstCtrl->LbrVirt.n.u1LbrVirt = pVCpu->hm.s.svm.pVmcb->ctrl.LbrVirt.n.u1LbrVirt; +#endif + + /* + * If we don't expose Virtualized-VMSAVE/VMLOAD feature to the outer guest, we + * need to intercept VMSAVE/VMLOAD instructions executed by the nested-guest. + */ + if (!pVCpu->CTX_SUFF(pVM)->cpum.ro.GuestFeatures.fSvmVirtVmsaveVmload) + pVmcbNstGstCtrl->u64InterceptCtrl |= SVM_CTRL_INTERCEPT_VMSAVE + | SVM_CTRL_INTERCEPT_VMLOAD; + + /* + * If we don't expose Virtual GIF feature to the outer guest, we need to intercept + * CLGI/STGI instructions executed by the nested-guest. + */ + if (!pVCpu->CTX_SUFF(pVM)->cpum.ro.GuestFeatures.fSvmVGif) + pVmcbNstGstCtrl->u64InterceptCtrl |= SVM_CTRL_INTERCEPT_CLGI + | SVM_CTRL_INTERCEPT_STGI; + + /* Merge the guest and nested-guest intercepts. */ + hmR0SvmMergeVmcbCtrlsNested(pVCpu); + + /* Update the VMCB clean bits. 
*/ + pVmcbNstGstCtrl->u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INTERCEPTS; + } + else + { + /* Already set up on an earlier call: check that the values programmed back then are still in place. */ + Assert(!pVCpu->hm.s.svm.fSyncVTpr); + Assert(pVmcbNstGstCtrl->u64IOPMPhysAddr == g_HCPhysIOBitmap); + Assert(RT_BOOL(pVmcbNstGstCtrl->NestedPagingCtrl.n.u1NestedPaging) == pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging); + } +} + + +/** + * Exports the nested-guest state into the VMCB. + * + * We need to export the entire state as we could be continuing nested-guest + * execution at any point (not just immediately after VMRUN) and thus the VMCB + * can be out-of-sync with the nested-guest state if it was executed in IEM. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0SvmExportGuestStateNested(PVMCPU pVCpu) +{ + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatExportGuestState, x); + + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + PSVMVMCB pVmcbNstGst = pCtx->hwvirt.svm.CTX_SUFF(pVmcb); + Assert(pVmcbNstGst); + + /* Set up (or sanity check) the nested-guest VMCB controls before exporting state into it. */ + hmR0SvmSetupVmcbNested(pVCpu); + + pVmcbNstGst->guest.u64RIP = pCtx->rip; + pVmcbNstGst->guest.u64RSP = pCtx->rsp; + pVmcbNstGst->guest.u64RFlags = pCtx->eflags.u32; + pVmcbNstGst->guest.u64RAX = pCtx->rax; + + /* The exports below run with interrupts disabled; the saved flags are restored on both the + error and the normal path. */ + RTCCUINTREG const fEFlags = ASMIntDisableFlags(); + + int rc = hmR0SvmExportGuestControlRegs(pVCpu, pVmcbNstGst); + AssertRCReturnStmt(rc, ASMSetFlags(fEFlags), rc); + + hmR0SvmExportGuestSegmentRegs(pVCpu, pVmcbNstGst); + hmR0SvmExportGuestMsrs(pVCpu, pVmcbNstGst); + hmR0SvmExportGuestHwvirtStateNested(pVCpu, pVmcbNstGst); + + ASMSetFlags(fEFlags); + + /* Nested VGIF not supported yet. */ + Assert(!pVmcbNstGst->ctrl.IntCtrl.n.u1VGifEnable); + + rc = hmR0SvmSelectVMRunHandler(pVCpu); + AssertRCReturn(rc, rc); + + /* Clear any bits that may be set but exported unconditionally or unused/reserved bits. 
*/ + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~( HM_CHANGED_GUEST_RIP + | HM_CHANGED_GUEST_RFLAGS + | HM_CHANGED_GUEST_GPRS_MASK + | HM_CHANGED_GUEST_APIC_TPR + | HM_CHANGED_GUEST_X87 + | HM_CHANGED_GUEST_SSE_AVX + | HM_CHANGED_GUEST_OTHER_XSAVE + | HM_CHANGED_GUEST_XCRx + | HM_CHANGED_GUEST_TSC_AUX + | HM_CHANGED_GUEST_OTHER_MSRS + | HM_CHANGED_SVM_GUEST_XCPT_INTERCEPTS + | (HM_CHANGED_KEEPER_STATE_MASK & ~HM_CHANGED_SVM_MASK))); + +#ifdef VBOX_STRICT + /* + * All of the guest-CPU state and SVM keeper bits should be exported here by now, except + * for the host-context and/or shared host-guest context bits. + */ + uint64_t const fCtxChanged = ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged); + RT_UNTRUSTED_NONVOLATILE_COPY_FENCE(); + AssertMsg(!(fCtxChanged & (HM_CHANGED_ALL_GUEST & ~HM_CHANGED_SVM_HOST_GUEST_SHARED_STATE)), + ("fCtxChanged=%#RX64\n", fCtxChanged)); + + /* + * If we need to log state that isn't always imported, we'll need to import it here. + * See hmR0SvmPostRunGuest() for which part of the state is imported unconditionally. + */ + hmR0SvmLogState(pVCpu, pVmcbNstGst, "hmR0SvmExportGuestStateNested", 0 /* fFlags */, 0 /* uVerbose */); +#endif /* VBOX_STRICT */ + + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExportGuestState, x); + return rc; +} +#endif /* VBOX_WITH_NESTED_HWVIRT_SVM */ + + +/** + * Exports the state shared between the host and guest (or nested-guest) into + * the VMCB. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * + * @remarks No-long-jump zone!!! + */ +static void hmR0SvmExportSharedState(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_DR_MASK) + { + /** @todo Figure out stepping with nested-guest. 
*/ + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if (!CPUMIsGuestInSvmNestedHwVirtMode(pCtx)) + hmR0SvmExportSharedDebugState(pVCpu, pVmcb); + else + { + /* Nested-guest: export DR6 and DR7 straight from the guest-CPU context. */ + pVmcb->guest.u64DR6 = pCtx->dr[6]; + pVmcb->guest.u64DR7 = pCtx->dr[7]; + } + } + + pVCpu->hm.s.fCtxChanged &= ~HM_CHANGED_GUEST_DR_MASK; + AssertMsg(!(pVCpu->hm.s.fCtxChanged & HM_CHANGED_SVM_HOST_GUEST_SHARED_STATE), + ("fCtxChanged=%#RX64\n", pVCpu->hm.s.fCtxChanged)); +} + + +/** + * Worker for SVMR0ImportStateOnDemand. + * + * Copies the requested parts of the guest state from the current VMCB into the + * guest-CPU context and clears the corresponding bits in the context's fExtrn + * mask. + * + * @param pVCpu The cross context virtual CPU structure. + * @param fWhat What to import, CPUMCTX_EXTRN_XXX. + */ +static void hmR0SvmImportGuestState(PVMCPU pVCpu, uint64_t fWhat) +{ + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatImportGuestState, x); + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + PCSVMVMCBSTATESAVE pVmcbGuest = &pVmcb->guest; + PCSVMVMCBCTRL pVmcbCtrl = &pVmcb->ctrl; + + Log4Func(("fExtrn=%#RX64 fWhat=%#RX64\n", pCtx->fExtrn, fWhat)); + + /* + * We disable interrupts to make the updating of the state and in particular + * the fExtrn modification atomic with respect to preemption hooks. + */ + RTCCUINTREG const fEFlags = ASMIntDisableFlags(); + + /* Restrict the request to the state that is still flagged in fExtrn. */ + fWhat &= pCtx->fExtrn; + if (fWhat) + { +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + if (fWhat & CPUMCTX_EXTRN_HWVIRT) + { + if (pVmcbCtrl->IntCtrl.n.u1VGifEnable) + { + Assert(!CPUMIsGuestInSvmNestedHwVirtMode(pCtx)); /* We don't yet support passing VGIF feature to the guest. */ + Assert(HMIsSvmVGifActive(pVCpu->CTX_SUFF(pVM))); /* VM has configured it. 
*/ + CPUMSetGuestGif(pCtx, pVmcbCtrl->IntCtrl.n.u1VGif); + } + } + + if (fWhat & CPUMCTX_EXTRN_HM_SVM_HWVIRT_VIRQ) + { + /* The VMCB has no virtual interrupt pending anymore: clear a still-set nested-guest interrupt force-flag. */ + if ( !pVmcbCtrl->IntCtrl.n.u1VIrqPending + && VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INTERRUPT_NESTED_GUEST)) + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INTERRUPT_NESTED_GUEST); + } +#endif /* VBOX_WITH_NESTED_HWVIRT_SVM */ + + if (fWhat & CPUMCTX_EXTRN_HM_SVM_INT_SHADOW) + { + /* Record the interrupt shadow at the VMCB RIP, or clear a stale inhibit-interrupts force-flag. */ + if (pVmcbCtrl->IntShadow.n.u1IntShadow) + EMSetInhibitInterruptsPC(pVCpu, pVmcbGuest->u64RIP); + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS)) + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS); + } + + if (fWhat & CPUMCTX_EXTRN_RIP) + pCtx->rip = pVmcbGuest->u64RIP; + + if (fWhat & CPUMCTX_EXTRN_RFLAGS) + pCtx->eflags.u32 = pVmcbGuest->u64RFlags; + + if (fWhat & CPUMCTX_EXTRN_RSP) + pCtx->rsp = pVmcbGuest->u64RSP; + + if (fWhat & CPUMCTX_EXTRN_RAX) + pCtx->rax = pVmcbGuest->u64RAX; + + if (fWhat & CPUMCTX_EXTRN_SREG_MASK) + { + if (fWhat & CPUMCTX_EXTRN_CS) + { + HMSVM_SEG_REG_COPY_FROM_VMCB(pCtx, pVmcbGuest, CS, cs); + /* Correct the CS granularity bit. Haven't seen it being wrong in any other register (yet). */ + /** @todo SELM might need to be fixed as it too should not care about the + * granularity bit. See @bugref{6785}. */ + if ( !pCtx->cs.Attr.n.u1Granularity + && pCtx->cs.Attr.n.u1Present + && pCtx->cs.u32Limit > UINT32_C(0xfffff)) + { + Assert((pCtx->cs.u32Limit & 0xfff) == 0xfff); + pCtx->cs.Attr.n.u1Granularity = 1; + } + HMSVM_ASSERT_SEG_GRANULARITY(pCtx, cs); + } + if (fWhat & CPUMCTX_EXTRN_SS) + { + HMSVM_SEG_REG_COPY_FROM_VMCB(pCtx, pVmcbGuest, SS, ss); + HMSVM_ASSERT_SEG_GRANULARITY(pCtx, ss); + /* + * Sync the hidden SS DPL field. AMD CPUs have a separate CPL field in the + * VMCB and uses that and thus it's possible that when the CPL changes during + * guest execution that the SS DPL isn't updated by AMD-V. Observed on some + * AMD Fusion CPUs with 64-bit guests. + * + * See AMD spec. 15.5.1 "Basic operation". 
+ */ + /* The VMCB CPL is expected to fit in two bits; copy it into SS.DPL when the two differ. */ + Assert(!(pVmcbGuest->u8CPL & ~0x3)); + uint8_t const uCpl = pVmcbGuest->u8CPL; + if (pCtx->ss.Attr.n.u2Dpl != uCpl) + pCtx->ss.Attr.n.u2Dpl = uCpl & 0x3; + } + /* DS, ES, FS and GS are copied as-is and then run through the granularity assertion macro. */ + if (fWhat & CPUMCTX_EXTRN_DS) + { + HMSVM_SEG_REG_COPY_FROM_VMCB(pCtx, pVmcbGuest, DS, ds); + HMSVM_ASSERT_SEG_GRANULARITY(pCtx, ds); + } + if (fWhat & CPUMCTX_EXTRN_ES) + { + HMSVM_SEG_REG_COPY_FROM_VMCB(pCtx, pVmcbGuest, ES, es); + HMSVM_ASSERT_SEG_GRANULARITY(pCtx, es); + } + if (fWhat & CPUMCTX_EXTRN_FS) + { + HMSVM_SEG_REG_COPY_FROM_VMCB(pCtx, pVmcbGuest, FS, fs); + HMSVM_ASSERT_SEG_GRANULARITY(pCtx, fs); + } + if (fWhat & CPUMCTX_EXTRN_GS) + { + HMSVM_SEG_REG_COPY_FROM_VMCB(pCtx, pVmcbGuest, GS, gs); + HMSVM_ASSERT_SEG_GRANULARITY(pCtx, gs); + } + } + + /* Table registers: TR, LDTR, GDTR and IDTR. */ + if (fWhat & CPUMCTX_EXTRN_TABLE_MASK) + { + if (fWhat & CPUMCTX_EXTRN_TR) + { + /* + * Fixup TR attributes so it's compatible with Intel. Important when saved-states + * are used between Intel and AMD, see @bugref{6208#c39}. + * ASSUME that it's normally correct and that we're in 32-bit or 64-bit mode. 
+ */ + HMSVM_SEG_REG_COPY_FROM_VMCB(pCtx, pVmcbGuest, TR, tr); + /* Force a busy TSS type: the 386 flavour for an available 386 TSS or in long mode, the 286 + flavour for an available 286 TSS. Any other type is left alone. */ + if (pCtx->tr.Attr.n.u4Type != X86_SEL_TYPE_SYS_386_TSS_BUSY) + { + if ( pCtx->tr.Attr.n.u4Type == X86_SEL_TYPE_SYS_386_TSS_AVAIL + || CPUMIsGuestInLongModeEx(pCtx)) + pCtx->tr.Attr.n.u4Type = X86_SEL_TYPE_SYS_386_TSS_BUSY; + else if (pCtx->tr.Attr.n.u4Type == X86_SEL_TYPE_SYS_286_TSS_AVAIL) + pCtx->tr.Attr.n.u4Type = X86_SEL_TYPE_SYS_286_TSS_BUSY; + } + } + + if (fWhat & CPUMCTX_EXTRN_LDTR) + HMSVM_SEG_REG_COPY_FROM_VMCB(pCtx, pVmcbGuest, LDTR, ldtr); + + if (fWhat & CPUMCTX_EXTRN_GDTR) + { + pCtx->gdtr.cbGdt = pVmcbGuest->GDTR.u32Limit; + pCtx->gdtr.pGdt = pVmcbGuest->GDTR.u64Base; + } + + if (fWhat & CPUMCTX_EXTRN_IDTR) + { + pCtx->idtr.cbIdt = pVmcbGuest->IDTR.u32Limit; + pCtx->idtr.pIdt = pVmcbGuest->IDTR.u64Base; + } + } + + if (fWhat & CPUMCTX_EXTRN_SYSCALL_MSRS) + { + pCtx->msrSTAR = pVmcbGuest->u64STAR; + pCtx->msrLSTAR = pVmcbGuest->u64LSTAR; + pCtx->msrCSTAR = pVmcbGuest->u64CSTAR; + pCtx->msrSFMASK = pVmcbGuest->u64SFMASK; + } + + if (fWhat & CPUMCTX_EXTRN_SYSENTER_MSRS) + { + pCtx->SysEnter.cs = pVmcbGuest->u64SysEnterCS; + pCtx->SysEnter.eip = pVmcbGuest->u64SysEnterEIP; + pCtx->SysEnter.esp = pVmcbGuest->u64SysEnterESP; + } + + if (fWhat & CPUMCTX_EXTRN_KERNEL_GS_BASE) + pCtx->msrKERNELGSBASE = pVmcbGuest->u64KernelGSBase; + + if (fWhat & CPUMCTX_EXTRN_DR_MASK) + { + if (fWhat & CPUMCTX_EXTRN_DR6) + { + /* While the hypervisor DR7 is in use, DR6 goes to the hyper DR6 instead of the guest context. */ + if (!pVCpu->hm.s.fUsingHyperDR7) + pCtx->dr[6] = pVmcbGuest->u64DR6; + else + CPUMSetHyperDR6(pVCpu, pVmcbGuest->u64DR6); + } + + if (fWhat & CPUMCTX_EXTRN_DR7) + { + /* While the hypervisor DR7 is in use there is nothing to import; the VMCB must hold the hyper value. */ + if (!pVCpu->hm.s.fUsingHyperDR7) + pCtx->dr[7] = pVmcbGuest->u64DR7; + else + Assert(pVmcbGuest->u64DR7 == CPUMGetHyperDR7(pVCpu)); + } + } + + if (fWhat & CPUMCTX_EXTRN_CR_MASK) + { + if (fWhat & CPUMCTX_EXTRN_CR0) + { + /* We intercept changes to all CR0 bits except maybe TS & MP bits. 
*/ + uint64_t const uCr0 = (pCtx->cr0 & ~(X86_CR0_TS | X86_CR0_MP)) + | (pVmcbGuest->u64CR0 & (X86_CR0_TS | X86_CR0_MP)); + VMMRZCallRing3Disable(pVCpu); /* Calls into PGM which has Log statements. */ + CPUMSetGuestCR0(pVCpu, uCr0); + VMMRZCallRing3Enable(pVCpu); + } + + if (fWhat & CPUMCTX_EXTRN_CR2) + pCtx->cr2 = pVmcbGuest->u64CR2; + + if (fWhat & CPUMCTX_EXTRN_CR3) + { + /* Under nested paging, pick up a CR3 value that differs from ours and flag the update as pending. */ + if ( pVmcbCtrl->NestedPagingCtrl.n.u1NestedPaging + && pCtx->cr3 != pVmcbGuest->u64CR3) + { + CPUMSetGuestCR3(pVCpu, pVmcbGuest->u64CR3); + VMCPU_FF_SET(pVCpu, VMCPU_FF_HM_UPDATE_CR3); + } + } + + /* Changes to CR4 are always intercepted, so there is nothing to import for it here. */ + } + + /* Update fExtrn: the bits handled above no longer need importing. */ + pCtx->fExtrn &= ~fWhat; + + /* If everything has been imported, clear the HM keeper bit. */ + if (!(pCtx->fExtrn & HMSVM_CPUMCTX_EXTRN_ALL)) + { + pCtx->fExtrn &= ~CPUMCTX_EXTRN_KEEPER_HM; + Assert(!pCtx->fExtrn); + } + } + else + Assert(!pCtx->fExtrn || (pCtx->fExtrn & HMSVM_CPUMCTX_EXTRN_ALL)); + + /* Restore the interrupt flag state saved before the import. */ + ASMSetFlags(fEFlags); + + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatImportGuestState, x); + + /* + * Honor any pending CR3 updates. + * + * Consider this scenario: #VMEXIT -> VMMRZCallRing3Enable() -> do stuff that causes a longjmp + * -> hmR0SvmCallRing3Callback() -> VMMRZCallRing3Disable() -> hmR0SvmImportGuestState() + * -> Sets VMCPU_FF_HM_UPDATE_CR3 pending -> return from the longjmp -> continue with #VMEXIT + * handling -> hmR0SvmImportGuestState() and here we are. + * + * The reason for such complicated handling is because VM-exits that call into PGM expect + * CR3 to be up-to-date and thus any CR3-saves -before- the VM-exit (longjmp) would've + * postponed the CR3 update via the force-flag and cleared CR3 from fExtrn. Any SVM R0 + * VM-exit handler that requests CR3 to be saved will end up here and we call PGMUpdateCR3(). + * + * The longjmp exit path can't check these CR3 force-flags and call code that takes a lock again, + * and does not process force-flag like regular exits to ring-3 either, we cover for it here. 
+ */ + if ( VMMRZCallRing3IsEnabled(pVCpu) + && VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_CR3)) + { + Assert(pCtx->cr3 == pVmcbGuest->u64CR3); + PGMUpdateCR3(pVCpu, pCtx->cr3); + } +} + + +/** + * Saves the guest (or nested-guest) state from the VMCB into the guest-CPU + * context. + * + * Currently there is no residual state left in the CPU that is not updated in the + * VMCB. + * + * @returns VBox status code (currently always VINF_SUCCESS). + * @param pVCpu The cross context virtual CPU structure. + * @param fWhat What to import, CPUMCTX_EXTRN_XXX. + */ +VMMR0DECL(int) SVMR0ImportStateOnDemand(PVMCPU pVCpu, uint64_t fWhat) +{ + /* All the work is done by the worker, which has no status to report. */ + hmR0SvmImportGuestState(pVCpu, fWhat); + return VINF_SUCCESS; +} + + +/** + * Does the necessary state syncing before returning to ring-3 for any reason + * (longjmp, preemption, voluntary exits to ring-3) from AMD-V. + * + * @param pVCpu The cross context virtual CPU structure. + * @param fImportState Whether to import the guest state from the VMCB back + * to the guest-CPU context. + * + * @remarks No-long-jmp zone!!! + */ +static void hmR0SvmLeave(PVMCPU pVCpu, bool fImportState) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + Assert(VMMR0IsLogFlushDisabled(pVCpu)); + + /* + * !!! IMPORTANT !!! + * If you modify code here, make sure to check whether hmR0SvmCallRing3Callback() needs to be updated too. + */ + + /* Import the complete guest state from the VMCB if the caller asked for it. */ + if (fImportState) + hmR0SvmImportGuestState(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + + /* Restore host FPU state if necessary and resync on next R0 reentry. */ + CPUMR0FpuStateMaybeSaveGuestAndRestoreHost(pVCpu); + Assert(!CPUMIsGuestFPUStateActive(pVCpu)); + + /* + * Restore host debug registers if necessary and resync on next R0 reentry. + */ +#ifdef VBOX_STRICT + if (CPUMIsHyperDebugStateActive(pVCpu)) + { + PSVMVMCB pVmcb = pVCpu->hm.s.svm.pVmcb; /** @todo nested-guest. 
*/ + Assert(pVmcb->ctrl.u16InterceptRdDRx == 0xffff); + Assert(pVmcb->ctrl.u16InterceptWrDRx == 0xffff); + } +#endif /* VBOX_STRICT */ + CPUMR0DebugStateMaybeSaveGuestAndRestoreHost(pVCpu, false /* save DR6 */); + Assert(!CPUMIsHyperDebugStateActive(pVCpu)); + Assert(!CPUMIsGuestDebugStateActive(pVCpu)); + + STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatEntry); + STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatImportGuestState); + STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatExportGuestState); + STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatPreExit); + STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatExitHandling); + /* Every leave is counted as a longjmp switch here; hmR0SvmExitToRing3() decrements the counter + again after calling hmR0SvmLeaveSession() for voluntary exits. */ + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchLongJmpToR3); + + VMCPU_CMPXCHG_STATE(pVCpu, VMCPUSTATE_STARTED_HM, VMCPUSTATE_STARTED_EXEC); +} + + +/** + * Leaves the AMD-V session. + * + * Only used while returning to ring-3 either due to longjump or exits to + * ring-3. + * + * @returns VBox status code (the status returned by HMR0LeaveCpu()). + * @param pVCpu The cross context virtual CPU structure. + */ +static int hmR0SvmLeaveSession(PVMCPU pVCpu) +{ + HM_DISABLE_PREEMPT(pVCpu); + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + /* When thread-context hooks are used, we can avoid doing the leave again if we had been preempted before + and done this from the SVMR0ThreadCtxCallback(). */ + if (!pVCpu->hm.s.fLeaveDone) + { + hmR0SvmLeave(pVCpu, true /* fImportState */); + pVCpu->hm.s.fLeaveDone = true; + } + + /* + * !!! IMPORTANT !!! + * If you modify code here, make sure to check whether hmR0SvmCallRing3Callback() needs to be updated too. + */ + + /** @todo eliminate the need for calling VMMR0ThreadCtxHookDisable here! */ + /* Deregister hook now that we've left HM context before re-enabling preemption. */ + VMMR0ThreadCtxHookDisable(pVCpu); + + /* Leave HM context. This takes care of local init (term). */ + int rc = HMR0LeaveCpu(pVCpu); + + /* Preemption is restored before returning, whatever the status is. */ + HM_RESTORE_PREEMPT(); + return rc; +} + + +/** + * Does the necessary state syncing before doing a longjmp to ring-3. 
+ * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jmp zone!!! + */ +static int hmR0SvmLongJmpToRing3(PVMCPU pVCpu) +{ + /* Thin wrapper: all the work is done by hmR0SvmLeaveSession(). */ + return hmR0SvmLeaveSession(pVCpu); +} + + +/** + * VMMRZCallRing3() callback wrapper which saves the guest state (or restores + * any remaining host state) before we longjump to ring-3 and possibly get + * preempted. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param enmOperation The operation causing the ring-3 longjump. + * @param pvUser The user argument, NULL (currently unused). + */ +static DECLCALLBACK(int) hmR0SvmCallRing3Callback(PVMCPU pVCpu, VMMCALLRING3 enmOperation, void *pvUser) +{ + RT_NOREF_PV(pvUser); + + if (enmOperation == VMMCALLRING3_VM_R0_ASSERTION) + { + /* + * !!! IMPORTANT !!! + * If you modify code here, make sure to check whether hmR0SvmLeave() and hmR0SvmLeaveSession() need + * to be updated too. This is a stripped down version which gets out ASAP trying to not trigger any assertion. + */ + VMMRZCallRing3RemoveNotification(pVCpu); + VMMRZCallRing3Disable(pVCpu); + HM_DISABLE_PREEMPT(pVCpu); + + /* Import the entire guest state. */ + hmR0SvmImportGuestState(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + + /* Restore host FPU state if necessary and resync on next R0 reentry. */ + CPUMR0FpuStateMaybeSaveGuestAndRestoreHost(pVCpu); + + /* Restore host debug registers if necessary and resync on next R0 reentry. */ + CPUMR0DebugStateMaybeSaveGuestAndRestoreHost(pVCpu, false /* save DR6 */); + + /* Deregister the hook now that we've left HM context before re-enabling preemption. */ + /** @todo eliminate the need for calling VMMR0ThreadCtxHookDisable here! */ + VMMR0ThreadCtxHookDisable(pVCpu); + + /* Leave HM context. This takes care of local init (term). 
*/ + HMR0LeaveCpu(pVCpu); /* The status is not checked on this stripped down assertion path. */ + + HM_RESTORE_PREEMPT(); + return VINF_SUCCESS; + } + + /* Any other operation: do the regular leave-session work with ring-3 calls disabled meanwhile. */ + Assert(pVCpu); + Assert(VMMRZCallRing3IsEnabled(pVCpu)); + HMSVM_ASSERT_PREEMPT_SAFE(pVCpu); + + VMMRZCallRing3Disable(pVCpu); + Assert(VMMR0IsLogFlushDisabled(pVCpu)); + + Log4Func(("Calling hmR0SvmLongJmpToRing3\n")); + int rc = hmR0SvmLongJmpToRing3(pVCpu); + /* NOTE(review): on failure this returns without the matching VMMRZCallRing3Enable() call - confirm that is intended. */ + AssertRCReturn(rc, rc); + + VMMRZCallRing3Enable(pVCpu); + return VINF_SUCCESS; +} + + +/** + * Take necessary actions before going back to ring-3. + * + * An action requires us to go back to ring-3. This function does the necessary + * steps before we can safely return to ring-3. This is not the same as longjmps + * to ring-3, this is voluntary. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param rcExit The reason for exiting to ring-3. Can be + * VINF_VMM_UNKNOWN_RING3_CALL. + */ +static int hmR0SvmExitToRing3(PVMCPU pVCpu, int rcExit) +{ + Assert(pVCpu); + HMSVM_ASSERT_PREEMPT_SAFE(pVCpu); + + /* Please, no longjumps here (any logging shouldn't flush jump back to ring-3). NO LOGGING BEFORE THIS POINT! */ + VMMRZCallRing3Disable(pVCpu); + Log4Func(("rcExit=%d LocalFF=%#RX64 GlobalFF=%#RX32\n", rcExit, (uint64_t)pVCpu->fLocalForcedActions, + pVCpu->CTX_SUFF(pVM)->fGlobalForcedActions)); + + /* We need to do this only while truly exiting the "inner loop" back to ring-3 and -not- for any longjmp to ring3. */ + if (pVCpu->hm.s.Event.fPending) + { + hmR0SvmPendingEventToTrpmTrap(pVCpu); + Assert(!pVCpu->hm.s.Event.fPending); + } + + /* Sync. the necessary state for going back to ring-3. 
*/ + hmR0SvmLeaveSession(pVCpu); + STAM_COUNTER_DEC(&pVCpu->hm.s.StatSwitchLongJmpToR3); + + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_TO_R3); + CPUMSetChangedFlags(pVCpu, CPUM_CHANGED_SYSENTER_MSR + | CPUM_CHANGED_LDTR + | CPUM_CHANGED_GDTR + | CPUM_CHANGED_IDTR + | CPUM_CHANGED_TR + | CPUM_CHANGED_HIDDEN_SEL_REGS); + if ( pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging + && CPUMIsGuestPagingEnabledEx(&pVCpu->cpum.GstCtx)) + { + CPUMSetChangedFlags(pVCpu, CPUM_CHANGED_GLOBAL_TLB_FLUSH); + } + + /* Update the exit-to-ring 3 reason. */ + pVCpu->hm.s.rcLastExitToR3 = rcExit; + + /* On our way back from ring-3, reload the guest-CPU state if it may change while in ring-3. */ + if ( rcExit != VINF_EM_RAW_INTERRUPT + || CPUMIsGuestInSvmNestedHwVirtMode(&pVCpu->cpum.GstCtx)) + { + Assert(!(pVCpu->cpum.GstCtx.fExtrn & HMSVM_CPUMCTX_EXTRN_ALL)); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); + } + + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchExitToR3); + + /* We do -not- want any longjmp notifications after this! We must return to ring-3 ASAP. */ + VMMRZCallRing3RemoveNotification(pVCpu); + VMMRZCallRing3Enable(pVCpu); + + /* + * If we're emulating an instruction, we shouldn't have any TRPM traps pending + * and if we're injecting an event we should have a TRPM trap pending. + */ + AssertReturnStmt(rcExit != VINF_EM_RAW_INJECT_TRPM_EVENT || TRPMHasTrap(pVCpu), + pVCpu->hm.s.u32HMError = rcExit, + VERR_SVM_IPE_5); + AssertReturnStmt(rcExit != VINF_EM_RAW_EMULATE_INSTR || !TRPMHasTrap(pVCpu), + pVCpu->hm.s.u32HMError = rcExit, + VERR_SVM_IPE_4); + + return rcExit; +} + + +/** + * Updates the use of TSC offsetting mode for the CPU and adjusts the necessary + * intercepts. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * + * @remarks No-long-jump zone!!! 
+ */ +static void hmR0SvmUpdateTscOffsetting(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + /* + * Avoid intercepting RDTSC/RDTSCP if we determined the host TSC (++) is stable + * and in case of a nested-guest, if the nested-VMCB specifies it is not intercepting + * RDTSC/RDTSCP as well. + */ + bool fParavirtTsc; + uint64_t uTscOffset; + bool const fCanUseRealTsc = TMCpuTickCanUseRealTSC(pVCpu->CTX_SUFF(pVM), pVCpu, &uTscOffset, &fParavirtTsc); + + bool fIntercept; + if (fCanUseRealTsc) + fIntercept = hmR0SvmClearCtrlIntercept(pVCpu, pVmcb, SVM_CTRL_INTERCEPT_RDTSC | SVM_CTRL_INTERCEPT_RDTSCP); + else + { + hmR0SvmSetCtrlIntercept(pVmcb, SVM_CTRL_INTERCEPT_RDTSC | SVM_CTRL_INTERCEPT_RDTSCP); + fIntercept = true; + } + + if (!fIntercept) + { +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + /* Apply the nested-guest VMCB's TSC offset over the guest TSC offset. */ + if (CPUMIsGuestInSvmNestedHwVirtMode(&pVCpu->cpum.GstCtx)) + uTscOffset = HMApplySvmNstGstTscOffset(pVCpu, uTscOffset); +#endif + + /* Update the TSC offset in the VMCB and the relevant clean bits. */ + pVmcb->ctrl.u64TSCOffset = uTscOffset; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INTERCEPTS; + + STAM_COUNTER_INC(&pVCpu->hm.s.StatTscOffset); + } + else + STAM_COUNTER_INC(&pVCpu->hm.s.StatTscIntercept); + + /* Currently neither Hyper-V nor KVM need to update their paravirt. TSC + information before every VM-entry, hence we have nothing to do here at the moment. */ + if (fParavirtTsc) + STAM_COUNTER_INC(&pVCpu->hm.s.StatTscParavirt); +} + + +/** + * Sets an event as a pending event to be injected into the guest. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pEvent Pointer to the SVM event. + * @param GCPtrFaultAddress The fault-address (CR2) in case it's a + * page-fault. + * + * @remarks Statistics counter assumes this is a guest event being reflected to + * the guest i.e. 'StatInjectPendingReflect' is incremented always. 
+ */ +DECLINLINE(void) hmR0SvmSetPendingEvent(PVMCPU pVCpu, PSVMEVENT pEvent, RTGCUINTPTR GCPtrFaultAddress) +{ + Assert(!pVCpu->hm.s.Event.fPending); + Assert(pEvent->n.u1Valid); + + pVCpu->hm.s.Event.u64IntInfo = pEvent->u; + pVCpu->hm.s.Event.fPending = true; + pVCpu->hm.s.Event.GCPtrFaultAddress = GCPtrFaultAddress; + + Log4Func(("u=%#RX64 u8Vector=%#x Type=%#x ErrorCodeValid=%RTbool ErrorCode=%#RX32\n", pEvent->u, pEvent->n.u8Vector, + (uint8_t)pEvent->n.u3Type, !!pEvent->n.u1ErrorCodeValid, pEvent->n.u32ErrorCode)); +} + + +/** + * Sets an invalid-opcode (\#UD) exception as pending-for-injection into the VM. + * + * @param pVCpu The cross context virtual CPU structure. + */ +DECLINLINE(void) hmR0SvmSetPendingXcptUD(PVMCPU pVCpu) +{ + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u3Type = SVM_EVENT_EXCEPTION; + Event.n.u8Vector = X86_XCPT_UD; + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); +} + + +/** + * Sets a debug (\#DB) exception as pending-for-injection into the VM. + * + * @param pVCpu The cross context virtual CPU structure. + */ +DECLINLINE(void) hmR0SvmSetPendingXcptDB(PVMCPU pVCpu) +{ + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u3Type = SVM_EVENT_EXCEPTION; + Event.n.u8Vector = X86_XCPT_DB; + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); +} + + +/** + * Sets a page fault (\#PF) exception as pending-for-injection into the VM. + * + * @param pVCpu The cross context virtual CPU structure. + * @param u32ErrCode The error-code for the page-fault. + * @param uFaultAddress The page fault address (CR2). + * + * @remarks This updates the guest CR2 with @a uFaultAddress! 
+ */ +DECLINLINE(void) hmR0SvmSetPendingXcptPF(PVMCPU pVCpu, uint32_t u32ErrCode, RTGCUINTPTR uFaultAddress) +{ + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u3Type = SVM_EVENT_EXCEPTION; + Event.n.u8Vector = X86_XCPT_PF; + Event.n.u1ErrorCodeValid = 1; + Event.n.u32ErrorCode = u32ErrCode; + + /* Update CR2 of the guest. */ + HMSVM_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR2); + if (pVCpu->cpum.GstCtx.cr2 != uFaultAddress) + { + pVCpu->cpum.GstCtx.cr2 = uFaultAddress; + /* The VMCB clean bit for CR2 will be updated while re-loading the guest state. */ + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_CR2); + } + + hmR0SvmSetPendingEvent(pVCpu, &Event, uFaultAddress); +} + + +/** + * Sets a math-fault (\#MF) exception as pending-for-injection into the VM. + * + * @param pVCpu The cross context virtual CPU structure. + */ +DECLINLINE(void) hmR0SvmSetPendingXcptMF(PVMCPU pVCpu) +{ + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u3Type = SVM_EVENT_EXCEPTION; + Event.n.u8Vector = X86_XCPT_MF; + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); +} + + +/** + * Sets a double fault (\#DF) exception as pending-for-injection into the VM. + * + * @param pVCpu The cross context virtual CPU structure. + */ +DECLINLINE(void) hmR0SvmSetPendingXcptDF(PVMCPU pVCpu) +{ + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u3Type = SVM_EVENT_EXCEPTION; + Event.n.u8Vector = X86_XCPT_DF; + Event.n.u1ErrorCodeValid = 1; + Event.n.u32ErrorCode = 0; + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); +} + + +/** + * Injects an event into the guest upon VMRUN by updating the relevant field + * in the VMCB. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the guest VM control block. + * @param pEvent Pointer to the event. + * + * @remarks No-long-jump zone!!! + * @remarks Requires CR0! 
+ */ +DECLINLINE(void) hmR0SvmInjectEventVmcb(PVMCPU pVCpu, PSVMVMCB pVmcb, PSVMEVENT pEvent) +{ + Assert(!pVmcb->ctrl.EventInject.n.u1Valid); + pVmcb->ctrl.EventInject.u = pEvent->u; + STAM_COUNTER_INC(&pVCpu->hm.s.paStatInjectedIrqsR0[pEvent->n.u8Vector & MASK_INJECT_IRQ_STAT]); + RT_NOREF(pVCpu); + + Log4Func(("u=%#RX64 u8Vector=%#x Type=%#x ErrorCodeValid=%RTbool ErrorCode=%#RX32\n", pEvent->u, pEvent->n.u8Vector, + (uint8_t)pEvent->n.u3Type, !!pEvent->n.u1ErrorCodeValid, pEvent->n.u32ErrorCode)); +} + + + +/** + * Converts any TRPM trap into a pending HM event. This is typically used when + * entering from ring-3 (not longjmp returns). + * + * @param pVCpu The cross context virtual CPU structure. + */ +static void hmR0SvmTrpmTrapToPendingEvent(PVMCPU pVCpu) +{ + Assert(TRPMHasTrap(pVCpu)); + Assert(!pVCpu->hm.s.Event.fPending); + + uint8_t uVector; + TRPMEVENT enmTrpmEvent; + RTGCUINT uErrCode; + RTGCUINTPTR GCPtrFaultAddress; + uint8_t cbInstr; + + int rc = TRPMQueryTrapAll(pVCpu, &uVector, &enmTrpmEvent, &uErrCode, &GCPtrFaultAddress, &cbInstr); + AssertRC(rc); + + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u8Vector = uVector; + + /* Refer AMD spec. 15.20 "Event Injection" for the format. 
*/ + if (enmTrpmEvent == TRPM_TRAP) + { + Event.n.u3Type = SVM_EVENT_EXCEPTION; + switch (uVector) + { + case X86_XCPT_NMI: + { + Event.n.u3Type = SVM_EVENT_NMI; + break; + } + + case X86_XCPT_PF: + case X86_XCPT_DF: + case X86_XCPT_TS: + case X86_XCPT_NP: + case X86_XCPT_SS: + case X86_XCPT_GP: + case X86_XCPT_AC: + { + Event.n.u1ErrorCodeValid = 1; + Event.n.u32ErrorCode = uErrCode; + break; + } + } + } + else if (enmTrpmEvent == TRPM_HARDWARE_INT) + Event.n.u3Type = SVM_EVENT_EXTERNAL_IRQ; + else if (enmTrpmEvent == TRPM_SOFTWARE_INT) + Event.n.u3Type = SVM_EVENT_SOFTWARE_INT; + else + AssertMsgFailed(("Invalid TRPM event type %d\n", enmTrpmEvent)); + + rc = TRPMResetTrap(pVCpu); + AssertRC(rc); + + Log4(("TRPM->HM event: u=%#RX64 u8Vector=%#x uErrorCodeValid=%RTbool uErrorCode=%#RX32\n", Event.u, Event.n.u8Vector, + !!Event.n.u1ErrorCodeValid, Event.n.u32ErrorCode)); + + hmR0SvmSetPendingEvent(pVCpu, &Event, GCPtrFaultAddress); +} + + +/** + * Converts any pending SVM event into a TRPM trap. Typically used when leaving + * AMD-V to execute any instruction. + * + * @param pVCpu The cross context virtual CPU structure. 
+ */ +static void hmR0SvmPendingEventToTrpmTrap(PVMCPU pVCpu) +{ + Assert(pVCpu->hm.s.Event.fPending); + Assert(TRPMQueryTrap(pVCpu, NULL /* pu8TrapNo */, NULL /* pEnmType */) == VERR_TRPM_NO_ACTIVE_TRAP); + + SVMEVENT Event; + Event.u = pVCpu->hm.s.Event.u64IntInfo; + + uint8_t uVector = Event.n.u8Vector; + uint8_t uVectorType = Event.n.u3Type; + TRPMEVENT enmTrapType = HMSvmEventToTrpmEventType(&Event); + + Log4(("HM event->TRPM: uVector=%#x enmTrapType=%d\n", uVector, uVectorType)); + + int rc = TRPMAssertTrap(pVCpu, uVector, enmTrapType); + AssertRC(rc); + + if (Event.n.u1ErrorCodeValid) + TRPMSetErrorCode(pVCpu, Event.n.u32ErrorCode); + + if ( uVectorType == SVM_EVENT_EXCEPTION + && uVector == X86_XCPT_PF) + { + TRPMSetFaultAddress(pVCpu, pVCpu->hm.s.Event.GCPtrFaultAddress); + Assert(pVCpu->hm.s.Event.GCPtrFaultAddress == CPUMGetGuestCR2(pVCpu)); + } + else if (uVectorType == SVM_EVENT_SOFTWARE_INT) + { + AssertMsg( uVectorType == SVM_EVENT_SOFTWARE_INT + || (uVector == X86_XCPT_BP || uVector == X86_XCPT_OF), + ("Invalid vector: uVector=%#x uVectorType=%#x\n", uVector, uVectorType)); + TRPMSetInstrLength(pVCpu, pVCpu->hm.s.Event.cbInstr); + } + pVCpu->hm.s.Event.fPending = false; +} + + +/** + * Checks if the guest (or nested-guest) has an interrupt shadow active right + * now. + * + * @returns @c true if the interrupt shadow is active, @c false otherwise. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + * @remarks Has side-effects with VMCPU_FF_INHIBIT_INTERRUPTS force-flag. + */ +static bool hmR0SvmIsIntrShadowActive(PVMCPU pVCpu) +{ + /* + * Instructions like STI and MOV SS inhibit interrupts till the next instruction + * completes. Check if we should inhibit interrupts or clear any existing + * interrupt inhibition. 
+ */ + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS)) + { + if (pVCpu->cpum.GstCtx.rip != EMGetInhibitInterruptsPC(pVCpu)) + { + /* + * We can clear the inhibit force flag as even if we go back to the recompiler + * without executing guest code in AMD-V, the flag's condition to be cleared is + * met and thus the cleared state is correct. + */ + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS); + return false; + } + return true; + } + return false; +} + + +/** + * Sets the virtual interrupt intercept control in the VMCB. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + */ +static void hmR0SvmSetIntWindowExiting(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + /* + * When AVIC isn't supported, set up an interrupt window to cause a #VMEXIT when the guest + * is ready to accept interrupts. At #VMEXIT, we then get the interrupt from the APIC + * (updating ISR at the right time) and inject the interrupt. + * + * With AVIC is supported, we could make use of the asynchronously delivery without + * #VMEXIT and we would be passing the AVIC page to SVM. + * + * In AMD-V, an interrupt window is achieved using a combination of V_IRQ (an interrupt + * is pending), V_IGN_TPR (ignore TPR priorities) and the VINTR intercept all being set. + */ +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + /* + * Currently we don't overlay interupt windows and if there's any V_IRQ pending in the + * nested-guest VMCB, we avoid setting up any interrupt window on behalf of the outer + * guest. + */ + /** @todo Does this mean we end up prioritizing virtual interrupt + * delivery/window over a physical interrupt (from the outer guest) + * might be pending? 
*/ + bool const fEnableIntWindow = !VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INTERRUPT_NESTED_GUEST); + if (!fEnableIntWindow) + { + Assert(CPUMIsGuestInSvmNestedHwVirtMode(&pVCpu->cpum.GstCtx)); + Log4(("Nested-guest V_IRQ already pending\n")); + } +#else + bool const fEnableIntWindow = true; + RT_NOREF(pVCpu); +#endif + if (fEnableIntWindow) + { + Assert(pVmcb->ctrl.IntCtrl.n.u1IgnoreTPR); + pVmcb->ctrl.IntCtrl.n.u1VIrqPending = 1; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INT_CTRL; + hmR0SvmSetCtrlIntercept(pVmcb, SVM_CTRL_INTERCEPT_VINTR); + Log4(("Set VINTR intercept\n")); + } +} + + +/** + * Clears the virtual interrupt intercept control in the VMCB as + * we are figured the guest is unable process any interrupts + * at this point of time. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + */ +static void hmR0SvmClearIntWindowExiting(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + PSVMVMCBCTRL pVmcbCtrl = &pVmcb->ctrl; + if ( pVmcbCtrl->IntCtrl.n.u1VIrqPending + || (pVmcbCtrl->u64InterceptCtrl & SVM_CTRL_INTERCEPT_VINTR)) + { + pVmcbCtrl->IntCtrl.n.u1VIrqPending = 0; + pVmcbCtrl->u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INT_CTRL; + hmR0SvmClearCtrlIntercept(pVCpu, pVmcb, SVM_CTRL_INTERCEPT_VINTR); + Log4(("Cleared VINTR intercept\n")); + } +} + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +/** + * Evaluates the event to be delivered to the nested-guest and sets it as the + * pending event. + * + * @returns VBox strict status code. + * @param pVCpu The cross context virtual CPU structure. 
+ */ +static VBOXSTRICTRC hmR0SvmEvaluatePendingEventNested(PVMCPU pVCpu) +{ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + HMSVM_ASSERT_IN_NESTED_GUEST(pCtx); + HMSVM_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_HWVIRT + | CPUMCTX_EXTRN_RFLAGS + | CPUMCTX_EXTRN_HM_SVM_INT_SHADOW + | CPUMCTX_EXTRN_HM_SVM_HWVIRT_VIRQ); + + Assert(!pVCpu->hm.s.Event.fPending); + PSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + Assert(pVmcb); + + bool const fGif = CPUMGetGuestGif(pCtx); + bool const fIntShadow = hmR0SvmIsIntrShadowActive(pVCpu); + bool const fBlockNmi = VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS); + + Log4Func(("fGif=%RTbool fBlockNmi=%RTbool fIntShadow=%RTbool fIntPending=%RTbool fNmiPending=%RTbool\n", + fGif, fBlockNmi, fIntShadow, VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC), + VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INTERRUPT_NMI))); + + /** @todo SMI. SMIs take priority over NMIs. */ + + /* + * Check if the guest can receive NMIs. + * Nested NMIs are not allowed, see AMD spec. 8.1.4 "Masking External Interrupts". + * NMIs take priority over maskable interrupts, see AMD spec. 8.5 "Priorities". 
+ */ + if ( VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INTERRUPT_NMI) + && !fBlockNmi) + { + if ( fGif + && !fIntShadow) + { + if (CPUMIsGuestSvmCtrlInterceptSet(pVCpu, pCtx, SVM_CTRL_INTERCEPT_NMI)) + { + Log4(("Intercepting NMI -> #VMEXIT\n")); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + return IEMExecSvmVmexit(pVCpu, SVM_EXIT_NMI, 0, 0); + } + + Log4(("Setting NMI pending for injection\n")); + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u8Vector = X86_XCPT_NMI; + Event.n.u3Type = SVM_EVENT_NMI; + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INTERRUPT_NMI); + } + else if (!fGif) + hmR0SvmSetCtrlIntercept(pVmcb, SVM_CTRL_INTERCEPT_STGI); + else + hmR0SvmSetIntWindowExiting(pVCpu, pVmcb); + } + /* + * Check if the nested-guest can receive external interrupts (generated by the guest's + * PIC/APIC). + * + * External intercepts, NMI, SMI etc. from the physical CPU are -always- intercepted + * when executing using hardware-assisted SVM, see HMSVM_MANDATORY_GUEST_CTRL_INTERCEPTS. + * + * External interrupts that are generated for the outer guest may be intercepted + * depending on how the nested-guest VMCB was programmed by guest software. + * + * Physical interrupts always take priority over virtual interrupts, + * see AMD spec. 15.21.4 "Injecting Virtual (INTR) Interrupts". + * + * We don't need to inject nested-guest virtual interrupts here, we can let the hardware + * do that work when we execute nested guest code esp. since all the required information + * is in the VMCB, unlike physical interrupts where we need to fetch the interrupt from + * the virtual interrupt controller. 
+ */ + else if ( VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC) + && !pVCpu->hm.s.fSingleInstruction) + { + if ( fGif + && !fIntShadow + && CPUMIsGuestSvmPhysIntrEnabled(pVCpu, pCtx)) + { + if (CPUMIsGuestSvmCtrlInterceptSet(pVCpu, pCtx, SVM_CTRL_INTERCEPT_INTR)) + { + Log4(("Intercepting INTR -> #VMEXIT\n")); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + return IEMExecSvmVmexit(pVCpu, SVM_EXIT_INTR, 0, 0); + } + + uint8_t u8Interrupt; + int rc = PDMGetInterrupt(pVCpu, &u8Interrupt); + if (RT_SUCCESS(rc)) + { + Log4(("Setting external interrupt %#x pending for injection\n", u8Interrupt)); + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u8Vector = u8Interrupt; + Event.n.u3Type = SVM_EVENT_EXTERNAL_IRQ; + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); + } + else if (rc == VERR_APIC_INTR_MASKED_BY_TPR) + { + /* + * AMD-V has no TPR thresholding feature. TPR and the force-flag will be + * updated eventually when the TPR is written by the guest. + */ + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchTprMaskedIrq); + } + else + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchGuestIrq); + } + else if (!fGif) + hmR0SvmSetCtrlIntercept(pVmcb, SVM_CTRL_INTERCEPT_STGI); + else + hmR0SvmSetIntWindowExiting(pVCpu, pVmcb); + } + + return VINF_SUCCESS; +} +#endif + +/** + * Evaluates the event to be delivered to the guest and sets it as the pending + * event. + * + * @param pVCpu The cross context virtual CPU structure. 
+ */ +static void hmR0SvmEvaluatePendingEvent(PVMCPU pVCpu) +{ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + HMSVM_ASSERT_NOT_IN_NESTED_GUEST(pCtx); + HMSVM_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_HWVIRT + | CPUMCTX_EXTRN_RFLAGS + | CPUMCTX_EXTRN_HM_SVM_INT_SHADOW); + + Assert(!pVCpu->hm.s.Event.fPending); + PSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + Assert(pVmcb); + + bool const fGif = CPUMGetGuestGif(pCtx); + bool const fIntShadow = hmR0SvmIsIntrShadowActive(pVCpu); + bool const fBlockInt = !(pCtx->eflags.u32 & X86_EFL_IF); + bool const fBlockNmi = VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS); + + Log4Func(("fGif=%RTbool fBlockNmi=%RTbool fBlockInt=%RTbool fIntShadow=%RTbool fIntPending=%RTbool NMI pending=%RTbool\n", + fGif, fBlockNmi, fBlockInt, fIntShadow, + VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC), + VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INTERRUPT_NMI))); + + /** @todo SMI. SMIs take priority over NMIs. */ + + /* + * Check if the guest can receive NMIs. + * Nested NMIs are not allowed, see AMD spec. 8.1.4 "Masking External Interrupts". + * NMIs take priority over maskable interrupts, see AMD spec. 8.5 "Priorities". + */ + if ( VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INTERRUPT_NMI) + && !fBlockNmi) + { + if ( fGif + && !fIntShadow) + { + Log4(("Setting NMI pending for injection\n")); + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u8Vector = X86_XCPT_NMI; + Event.n.u3Type = SVM_EVENT_NMI; + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INTERRUPT_NMI); + } + else if (!fGif) + hmR0SvmSetCtrlIntercept(pVmcb, SVM_CTRL_INTERCEPT_STGI); + else + hmR0SvmSetIntWindowExiting(pVCpu, pVmcb); + } + /* + * Check if the guest can receive external interrupts (PIC/APIC). Once PDMGetInterrupt() + * returns a valid interrupt we -must- deliver the interrupt. We can no longer re-request + * it from the APIC device. 
+ */ + else if ( VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC) + && !pVCpu->hm.s.fSingleInstruction) + { + if ( fGif + && !fBlockInt + && !fIntShadow) + { + uint8_t u8Interrupt; + int rc = PDMGetInterrupt(pVCpu, &u8Interrupt); + if (RT_SUCCESS(rc)) + { + Log4(("Setting external interrupt %#x pending for injection\n", u8Interrupt)); + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u8Vector = u8Interrupt; + Event.n.u3Type = SVM_EVENT_EXTERNAL_IRQ; + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); + } + else if (rc == VERR_APIC_INTR_MASKED_BY_TPR) + { + /* + * AMD-V has no TPR thresholding feature. TPR and the force-flag will be + * updated eventually when the TPR is written by the guest. + */ + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchTprMaskedIrq); + } + else + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchGuestIrq); + } + else if (!fGif) + hmR0SvmSetCtrlIntercept(pVmcb, SVM_CTRL_INTERCEPT_STGI); + else + hmR0SvmSetIntWindowExiting(pVCpu, pVmcb); + } +} + + +/** + * Injects any pending events into the guest (or nested-guest). + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * + * @remarks Must only be called when we are guaranteed to enter + * hardware-assisted SVM execution and not return to ring-3 + * prematurely. + */ +static void hmR0SvmInjectPendingEvent(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + Assert(!TRPMHasTrap(pVCpu)); + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + + bool const fIntShadow = hmR0SvmIsIntrShadowActive(pVCpu); +#ifdef VBOX_STRICT + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + bool const fGif = pCtx->hwvirt.fGif; + bool fAllowInt = fGif; + if (fGif) + { + /* + * For nested-guests we have no way to determine if we're injecting a physical or + * virtual interrupt at this point. Hence the partial verification below. 
+ */ + if (CPUMIsGuestInSvmNestedHwVirtMode(pCtx)) + fAllowInt = CPUMIsGuestSvmPhysIntrEnabled(pVCpu, pCtx) || CPUMIsGuestSvmVirtIntrEnabled(pVCpu, pCtx); + else + fAllowInt = RT_BOOL(pCtx->eflags.u32 & X86_EFL_IF); + } +#endif + + if (pVCpu->hm.s.Event.fPending) + { + SVMEVENT Event; + Event.u = pVCpu->hm.s.Event.u64IntInfo; + Assert(Event.n.u1Valid); + + /* + * Validate event injection pre-conditions. + */ + if (Event.n.u3Type == SVM_EVENT_EXTERNAL_IRQ) + { + Assert(fAllowInt); + Assert(!fIntShadow); + } + else if (Event.n.u3Type == SVM_EVENT_NMI) + { + Assert(fGif); + Assert(!fIntShadow); + } + + /* + * Before injecting an NMI we must set VMCPU_FF_BLOCK_NMIS to prevent nested NMIs. We + * do this only when we are surely going to inject the NMI as otherwise if we return + * to ring-3 prematurely we could leave NMIs blocked indefinitely upon re-entry into + * SVM R0. + * + * With VT-x, this is handled by the Guest interruptibility information VMCS field + * which will set the VMCS field after actually delivering the NMI which we read on + * VM-exit to determine the state. + */ + if ( Event.n.u3Type == SVM_EVENT_NMI + && Event.n.u8Vector == X86_XCPT_NMI + && !VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS)) + { + VMCPU_FF_SET(pVCpu, VMCPU_FF_BLOCK_NMIS); + } + + /* + * Inject it (update VMCB for injection by the hardware). + */ + Log4(("Injecting pending HM event\n")); + hmR0SvmInjectEventVmcb(pVCpu, pVmcb, &Event); + pVCpu->hm.s.Event.fPending = false; + + if (Event.n.u3Type == SVM_EVENT_EXTERNAL_IRQ) + STAM_COUNTER_INC(&pVCpu->hm.s.StatInjectInterrupt); + else + STAM_COUNTER_INC(&pVCpu->hm.s.StatInjectXcpt); + } + else + Assert(pVmcb->ctrl.EventInject.n.u1Valid == 0); + + /* + * We could have injected an NMI through IEM and continue guest execution using + * hardware-assisted SVM. In which case, we would not have any events pending (above) + * but we still need to intercept IRET in order to eventually clear NMI inhibition. 
+ */ + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS)) + hmR0SvmSetCtrlIntercept(pVmcb, SVM_CTRL_INTERCEPT_IRET); + + /* + * Update the guest interrupt shadow in the guest (or nested-guest) VMCB. + * + * For nested-guests: We need to update it too for the scenario where IEM executes + * the nested-guest but execution later continues here with an interrupt shadow active. + */ + pVmcb->ctrl.IntShadow.n.u1IntShadow = fIntShadow; +} + + +/** + * Reports world-switch error and dumps some useful debug info. + * + * @param pVCpu The cross context virtual CPU structure. + * @param rcVMRun The return code from VMRUN (or + * VERR_SVM_INVALID_GUEST_STATE for invalid + * guest-state). + */ +static void hmR0SvmReportWorldSwitchError(PVMCPU pVCpu, int rcVMRun) +{ + HMSVM_ASSERT_PREEMPT_SAFE(pVCpu); + HMSVM_ASSERT_NOT_IN_NESTED_GUEST(&pVCpu->cpum.GstCtx); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + + if (rcVMRun == VERR_SVM_INVALID_GUEST_STATE) + { +#ifdef VBOX_STRICT + hmR0DumpRegs(pVCpu); + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + Log4(("ctrl.u32VmcbCleanBits %#RX32\n", pVmcb->ctrl.u32VmcbCleanBits)); + Log4(("ctrl.u16InterceptRdCRx %#x\n", pVmcb->ctrl.u16InterceptRdCRx)); + Log4(("ctrl.u16InterceptWrCRx %#x\n", pVmcb->ctrl.u16InterceptWrCRx)); + Log4(("ctrl.u16InterceptRdDRx %#x\n", pVmcb->ctrl.u16InterceptRdDRx)); + Log4(("ctrl.u16InterceptWrDRx %#x\n", pVmcb->ctrl.u16InterceptWrDRx)); + Log4(("ctrl.u32InterceptXcpt %#x\n", pVmcb->ctrl.u32InterceptXcpt)); + Log4(("ctrl.u64InterceptCtrl %#RX64\n", pVmcb->ctrl.u64InterceptCtrl)); + Log4(("ctrl.u64IOPMPhysAddr %#RX64\n", pVmcb->ctrl.u64IOPMPhysAddr)); + Log4(("ctrl.u64MSRPMPhysAddr %#RX64\n", pVmcb->ctrl.u64MSRPMPhysAddr)); + Log4(("ctrl.u64TSCOffset %#RX64\n", pVmcb->ctrl.u64TSCOffset)); + + Log4(("ctrl.TLBCtrl.u32ASID %#x\n", pVmcb->ctrl.TLBCtrl.n.u32ASID)); + Log4(("ctrl.TLBCtrl.u8TLBFlush %#x\n", pVmcb->ctrl.TLBCtrl.n.u8TLBFlush)); + Log4(("ctrl.TLBCtrl.u24Reserved %#x\n", 
pVmcb->ctrl.TLBCtrl.n.u24Reserved)); + + Log4(("ctrl.IntCtrl.u8VTPR %#x\n", pVmcb->ctrl.IntCtrl.n.u8VTPR)); + Log4(("ctrl.IntCtrl.u1VIrqPending %#x\n", pVmcb->ctrl.IntCtrl.n.u1VIrqPending)); + Log4(("ctrl.IntCtrl.u1VGif %#x\n", pVmcb->ctrl.IntCtrl.n.u1VGif)); + Log4(("ctrl.IntCtrl.u6Reserved0 %#x\n", pVmcb->ctrl.IntCtrl.n.u6Reserved)); + Log4(("ctrl.IntCtrl.u4VIntrPrio %#x\n", pVmcb->ctrl.IntCtrl.n.u4VIntrPrio)); + Log4(("ctrl.IntCtrl.u1IgnoreTPR %#x\n", pVmcb->ctrl.IntCtrl.n.u1IgnoreTPR)); + Log4(("ctrl.IntCtrl.u3Reserved %#x\n", pVmcb->ctrl.IntCtrl.n.u3Reserved)); + Log4(("ctrl.IntCtrl.u1VIntrMasking %#x\n", pVmcb->ctrl.IntCtrl.n.u1VIntrMasking)); + Log4(("ctrl.IntCtrl.u1VGifEnable %#x\n", pVmcb->ctrl.IntCtrl.n.u1VGifEnable)); + Log4(("ctrl.IntCtrl.u5Reserved1 %#x\n", pVmcb->ctrl.IntCtrl.n.u5Reserved)); + Log4(("ctrl.IntCtrl.u8VIntrVector %#x\n", pVmcb->ctrl.IntCtrl.n.u8VIntrVector)); + Log4(("ctrl.IntCtrl.u24Reserved %#x\n", pVmcb->ctrl.IntCtrl.n.u24Reserved)); + + Log4(("ctrl.IntShadow.u1IntShadow %#x\n", pVmcb->ctrl.IntShadow.n.u1IntShadow)); + Log4(("ctrl.IntShadow.u1GuestIntMask %#x\n", pVmcb->ctrl.IntShadow.n.u1GuestIntMask)); + Log4(("ctrl.u64ExitCode %#RX64\n", pVmcb->ctrl.u64ExitCode)); + Log4(("ctrl.u64ExitInfo1 %#RX64\n", pVmcb->ctrl.u64ExitInfo1)); + Log4(("ctrl.u64ExitInfo2 %#RX64\n", pVmcb->ctrl.u64ExitInfo2)); + Log4(("ctrl.ExitIntInfo.u8Vector %#x\n", pVmcb->ctrl.ExitIntInfo.n.u8Vector)); + Log4(("ctrl.ExitIntInfo.u3Type %#x\n", pVmcb->ctrl.ExitIntInfo.n.u3Type)); + Log4(("ctrl.ExitIntInfo.u1ErrorCodeValid %#x\n", pVmcb->ctrl.ExitIntInfo.n.u1ErrorCodeValid)); + Log4(("ctrl.ExitIntInfo.u19Reserved %#x\n", pVmcb->ctrl.ExitIntInfo.n.u19Reserved)); + Log4(("ctrl.ExitIntInfo.u1Valid %#x\n", pVmcb->ctrl.ExitIntInfo.n.u1Valid)); + Log4(("ctrl.ExitIntInfo.u32ErrorCode %#x\n", pVmcb->ctrl.ExitIntInfo.n.u32ErrorCode)); + Log4(("ctrl.NestedPagingCtrl.u1NestedPaging %#x\n", pVmcb->ctrl.NestedPagingCtrl.n.u1NestedPaging)); + Log4(("ctrl.NestedPagingCtrl.u1Sev 
%#x\n", pVmcb->ctrl.NestedPagingCtrl.n.u1Sev)); + Log4(("ctrl.NestedPagingCtrl.u1SevEs %#x\n", pVmcb->ctrl.NestedPagingCtrl.n.u1SevEs)); + Log4(("ctrl.EventInject.u8Vector %#x\n", pVmcb->ctrl.EventInject.n.u8Vector)); + Log4(("ctrl.EventInject.u3Type %#x\n", pVmcb->ctrl.EventInject.n.u3Type)); + Log4(("ctrl.EventInject.u1ErrorCodeValid %#x\n", pVmcb->ctrl.EventInject.n.u1ErrorCodeValid)); + Log4(("ctrl.EventInject.u19Reserved %#x\n", pVmcb->ctrl.EventInject.n.u19Reserved)); + Log4(("ctrl.EventInject.u1Valid %#x\n", pVmcb->ctrl.EventInject.n.u1Valid)); + Log4(("ctrl.EventInject.u32ErrorCode %#x\n", pVmcb->ctrl.EventInject.n.u32ErrorCode)); + + Log4(("ctrl.u64NestedPagingCR3 %#RX64\n", pVmcb->ctrl.u64NestedPagingCR3)); + + Log4(("ctrl.LbrVirt.u1LbrVirt %#x\n", pVmcb->ctrl.LbrVirt.n.u1LbrVirt)); + Log4(("ctrl.LbrVirt.u1VirtVmsaveVmload %#x\n", pVmcb->ctrl.LbrVirt.n.u1VirtVmsaveVmload)); + + Log4(("guest.CS.u16Sel %RTsel\n", pVmcb->guest.CS.u16Sel)); + Log4(("guest.CS.u16Attr %#x\n", pVmcb->guest.CS.u16Attr)); + Log4(("guest.CS.u32Limit %#RX32\n", pVmcb->guest.CS.u32Limit)); + Log4(("guest.CS.u64Base %#RX64\n", pVmcb->guest.CS.u64Base)); + Log4(("guest.DS.u16Sel %#RTsel\n", pVmcb->guest.DS.u16Sel)); + Log4(("guest.DS.u16Attr %#x\n", pVmcb->guest.DS.u16Attr)); + Log4(("guest.DS.u32Limit %#RX32\n", pVmcb->guest.DS.u32Limit)); + Log4(("guest.DS.u64Base %#RX64\n", pVmcb->guest.DS.u64Base)); + Log4(("guest.ES.u16Sel %RTsel\n", pVmcb->guest.ES.u16Sel)); + Log4(("guest.ES.u16Attr %#x\n", pVmcb->guest.ES.u16Attr)); + Log4(("guest.ES.u32Limit %#RX32\n", pVmcb->guest.ES.u32Limit)); + Log4(("guest.ES.u64Base %#RX64\n", pVmcb->guest.ES.u64Base)); + Log4(("guest.FS.u16Sel %RTsel\n", pVmcb->guest.FS.u16Sel)); + Log4(("guest.FS.u16Attr %#x\n", pVmcb->guest.FS.u16Attr)); + Log4(("guest.FS.u32Limit %#RX32\n", pVmcb->guest.FS.u32Limit)); + Log4(("guest.FS.u64Base %#RX64\n", pVmcb->guest.FS.u64Base)); + Log4(("guest.GS.u16Sel %RTsel\n", pVmcb->guest.GS.u16Sel)); + Log4(("guest.GS.u16Attr 
%#x\n", pVmcb->guest.GS.u16Attr)); + Log4(("guest.GS.u32Limit %#RX32\n", pVmcb->guest.GS.u32Limit)); + Log4(("guest.GS.u64Base %#RX64\n", pVmcb->guest.GS.u64Base)); + + Log4(("guest.GDTR.u32Limit %#RX32\n", pVmcb->guest.GDTR.u32Limit)); + Log4(("guest.GDTR.u64Base %#RX64\n", pVmcb->guest.GDTR.u64Base)); + + Log4(("guest.LDTR.u16Sel %RTsel\n", pVmcb->guest.LDTR.u16Sel)); + Log4(("guest.LDTR.u16Attr %#x\n", pVmcb->guest.LDTR.u16Attr)); + Log4(("guest.LDTR.u32Limit %#RX32\n", pVmcb->guest.LDTR.u32Limit)); + Log4(("guest.LDTR.u64Base %#RX64\n", pVmcb->guest.LDTR.u64Base)); + + Log4(("guest.IDTR.u32Limit %#RX32\n", pVmcb->guest.IDTR.u32Limit)); + Log4(("guest.IDTR.u64Base %#RX64\n", pVmcb->guest.IDTR.u64Base)); + + Log4(("guest.TR.u16Sel %RTsel\n", pVmcb->guest.TR.u16Sel)); + Log4(("guest.TR.u16Attr %#x\n", pVmcb->guest.TR.u16Attr)); + Log4(("guest.TR.u32Limit %#RX32\n", pVmcb->guest.TR.u32Limit)); + Log4(("guest.TR.u64Base %#RX64\n", pVmcb->guest.TR.u64Base)); + + Log4(("guest.u8CPL %#x\n", pVmcb->guest.u8CPL)); + Log4(("guest.u64CR0 %#RX64\n", pVmcb->guest.u64CR0)); + Log4(("guest.u64CR2 %#RX64\n", pVmcb->guest.u64CR2)); + Log4(("guest.u64CR3 %#RX64\n", pVmcb->guest.u64CR3)); + Log4(("guest.u64CR4 %#RX64\n", pVmcb->guest.u64CR4)); + Log4(("guest.u64DR6 %#RX64\n", pVmcb->guest.u64DR6)); + Log4(("guest.u64DR7 %#RX64\n", pVmcb->guest.u64DR7)); + + Log4(("guest.u64RIP %#RX64\n", pVmcb->guest.u64RIP)); + Log4(("guest.u64RSP %#RX64\n", pVmcb->guest.u64RSP)); + Log4(("guest.u64RAX %#RX64\n", pVmcb->guest.u64RAX)); + Log4(("guest.u64RFlags %#RX64\n", pVmcb->guest.u64RFlags)); + + Log4(("guest.u64SysEnterCS %#RX64\n", pVmcb->guest.u64SysEnterCS)); + Log4(("guest.u64SysEnterEIP %#RX64\n", pVmcb->guest.u64SysEnterEIP)); + Log4(("guest.u64SysEnterESP %#RX64\n", pVmcb->guest.u64SysEnterESP)); + + Log4(("guest.u64EFER %#RX64\n", pVmcb->guest.u64EFER)); + Log4(("guest.u64STAR %#RX64\n", pVmcb->guest.u64STAR)); + Log4(("guest.u64LSTAR %#RX64\n", pVmcb->guest.u64LSTAR)); + 
Log4(("guest.u64CSTAR %#RX64\n", pVmcb->guest.u64CSTAR)); + Log4(("guest.u64SFMASK %#RX64\n", pVmcb->guest.u64SFMASK)); + Log4(("guest.u64KernelGSBase %#RX64\n", pVmcb->guest.u64KernelGSBase)); + Log4(("guest.u64PAT %#RX64\n", pVmcb->guest.u64PAT)); + Log4(("guest.u64DBGCTL %#RX64\n", pVmcb->guest.u64DBGCTL)); + Log4(("guest.u64BR_FROM %#RX64\n", pVmcb->guest.u64BR_FROM)); + Log4(("guest.u64BR_TO %#RX64\n", pVmcb->guest.u64BR_TO)); + Log4(("guest.u64LASTEXCPFROM %#RX64\n", pVmcb->guest.u64LASTEXCPFROM)); + Log4(("guest.u64LASTEXCPTO %#RX64\n", pVmcb->guest.u64LASTEXCPTO)); + + NOREF(pVmcb); +#endif /* VBOX_STRICT */ + } + else + Log4Func(("rcVMRun=%d\n", rcVMRun)); +} + + +/** + * Check per-VM and per-VCPU force flag actions that require us to go back to + * ring-3 for one reason or another. + * + * @returns VBox status code (informational status codes included). + * @retval VINF_SUCCESS if we don't have any actions that require going back to + * ring-3. + * @retval VINF_PGM_SYNC_CR3 if we have pending PGM CR3 sync. + * @retval VINF_EM_PENDING_REQUEST if we have pending requests (like hardware + * interrupts) + * @retval VINF_PGM_POOL_FLUSH_PENDING if PGM is doing a pool flush and requires + * all EMTs to be in ring-3. + * @retval VINF_EM_RAW_TO_R3 if there are pending DMA requests or pending + * HM-to-ring-3 force-flags. + * @retval VINF_EM_NO_MEMORY PGM is out of memory, we need to return + * to the EM loop. + * + * @param pVCpu The cross context virtual CPU structure. + */ +static int hmR0SvmCheckForceFlags(PVMCPU pVCpu) +{ + Assert(VMMRZCallRing3IsEnabled(pVCpu)); + Assert(!VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_PAE_PDPES)); + + /* Could happen as a result of longjump. */ + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_CR3)) + PGMUpdateCR3(pVCpu, CPUMGetGuestCR3(pVCpu)); + + /* Update pending interrupts into the APIC's IRR. 
*/ + if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_UPDATE_APIC)) + APICUpdatePendingInterrupts(pVCpu); + + PVM pVM = pVCpu->CTX_SUFF(pVM); + if ( VM_FF_IS_ANY_SET(pVM, !pVCpu->hm.s.fSingleInstruction + ? VM_FF_HP_R0_PRE_HM_MASK : VM_FF_HP_R0_PRE_HM_STEP_MASK) + || VMCPU_FF_IS_ANY_SET(pVCpu, !pVCpu->hm.s.fSingleInstruction + ? VMCPU_FF_HP_R0_PRE_HM_MASK : VMCPU_FF_HP_R0_PRE_HM_STEP_MASK) ) + { + /* Pending PGM CR3 sync. */ + if (VMCPU_FF_IS_ANY_SET(pVCpu,VMCPU_FF_PGM_SYNC_CR3 | VMCPU_FF_PGM_SYNC_CR3_NON_GLOBAL)) + { + int rc = PGMSyncCR3(pVCpu, pVCpu->cpum.GstCtx.cr0, pVCpu->cpum.GstCtx.cr3, pVCpu->cpum.GstCtx.cr4, + VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3)); + if (rc != VINF_SUCCESS) + { + Log4Func(("PGMSyncCR3 forcing us back to ring-3. rc=%d\n", rc)); + return rc; + } + } + + /* Pending HM-to-R3 operations (critsects, timers, EMT rendezvous etc.) */ + /* -XXX- what was that about single stepping? */ + if ( VM_FF_IS_ANY_SET(pVM, VM_FF_HM_TO_R3_MASK) + || VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_HM_TO_R3_MASK)) + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchHmToR3FF); + int rc = RT_LIKELY(!VM_FF_IS_SET(pVM, VM_FF_PGM_NO_MEMORY)) ? VINF_EM_RAW_TO_R3 : VINF_EM_NO_MEMORY; + Log4Func(("HM_TO_R3 forcing us back to ring-3. rc=%d\n", rc)); + return rc; + } + + /* Pending VM request packets, such as hardware interrupts. */ + if ( VM_FF_IS_SET(pVM, VM_FF_REQUEST) + || VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_REQUEST)) + { + Log4Func(("Pending VM request forcing us back to ring-3\n")); + return VINF_EM_PENDING_REQUEST; + } + + /* Pending PGM pool flushes. */ + if (VM_FF_IS_SET(pVM, VM_FF_PGM_POOL_FLUSH_PENDING)) + { + Log4Func(("PGM pool flush pending forcing us back to ring-3\n")); + return VINF_PGM_POOL_FLUSH_PENDING; + } + + /* Pending DMA requests. 
*/ + if (VM_FF_IS_SET(pVM, VM_FF_PDM_DMA)) + { + Log4Func(("Pending DMA request forcing us back to ring-3\n")); + return VINF_EM_RAW_TO_R3; + } + } + + return VINF_SUCCESS; +} + + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +/** + * Does the preparations before executing nested-guest code in AMD-V. + * + * @returns VBox status code (informational status codes included). + * @retval VINF_SUCCESS if we can proceed with running the guest. + * @retval VINF_* scheduling changes, we have to go back to ring-3. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pSvmTransient Pointer to the SVM transient structure. + * + * @remarks Same caveats regarding longjumps as hmR0SvmPreRunGuest apply. + * @sa hmR0SvmPreRunGuest. + */ +static int hmR0SvmPreRunGuestNested(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + HMSVM_ASSERT_PREEMPT_SAFE(pVCpu); + HMSVM_ASSERT_IN_NESTED_GUEST(pCtx); + +#ifdef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM + if (CPUMIsGuestInSvmNestedHwVirtMode(pCtx)) /* Redundant check to avoid unreachable code warning. */ + { + Log2(("hmR0SvmPreRunGuest: Rescheduling to IEM due to nested-hwvirt or forced IEM exec -> VINF_EM_RESCHEDULE_REM\n")); + return VINF_EM_RESCHEDULE_REM; + } +#endif + + /* Check force flag actions that might require us to go back to ring-3. */ + int rc = hmR0SvmCheckForceFlags(pVCpu); + if (rc != VINF_SUCCESS) + return rc; + + if (TRPMHasTrap(pVCpu)) + hmR0SvmTrpmTrapToPendingEvent(pVCpu); + else if (!pVCpu->hm.s.Event.fPending) + { + VBOXSTRICTRC rcStrict = hmR0SvmEvaluatePendingEventNested(pVCpu); + if ( rcStrict != VINF_SUCCESS + || !CPUMIsGuestInSvmNestedHwVirtMode(pCtx)) + return VBOXSTRICTRC_VAL(rcStrict); + } + + HMSVM_ASSERT_IN_NESTED_GUEST(pCtx); + + /* + * On the oldest AMD-V systems, we may not get enough information to reinject an NMI. + * Just do it in software, see @bugref{8411}. + * NB: If we could continue a task switch exit we wouldn't need to do this. 
+ */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + if (RT_UNLIKELY( !pVM->hm.s.svm.u32Features + && pVCpu->hm.s.Event.fPending + && SVM_EVENT_GET_TYPE(pVCpu->hm.s.Event.u64IntInfo) == SVM_EVENT_NMI)) + { + return VINF_EM_RAW_INJECT_TRPM_EVENT; + } + +#ifdef HMSVM_SYNC_FULL_GUEST_STATE + Assert(!(pCtx->fExtrn & HMSVM_CPUMCTX_EXTRN_ALL)); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); +#endif + + /* + * Export the nested-guest state bits that are not shared with the host in any way as we + * can longjmp or get preempted in the midst of exporting some of the state. + */ + rc = hmR0SvmExportGuestStateNested(pVCpu); + AssertRCReturn(rc, rc); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExportFull); + + /* Ensure we've cached (and hopefully modified) the VMCB for execution using hardware-assisted SVM. */ + Assert(pVCpu->hm.s.svm.NstGstVmcbCache.fCacheValid); + + /* + * No longjmps to ring-3 from this point on!!! + * + * Asserts() will still longjmp to ring-3 (but won't return), which is intentional, + * better than a kernel panic. This also disables flushing of the R0-logger instance. + */ + VMMRZCallRing3Disable(pVCpu); + + /* + * We disable interrupts so that we don't miss any interrupts that would flag preemption + * (IPI/timers etc.) when thread-context hooks aren't used and we've been running with + * preemption disabled for a while. Since this is purely to aid the + * RTThreadPreemptIsPending() code, it doesn't matter that it may temporarily reenable and + * disable interrupt on NT. + * + * We need to check for force-flags that could've possibly been altered since we last + * checked them (e.g. by PDMGetInterrupt() leaving the PDM critical section, + * see @bugref{6398}). + * + * We also check a couple of other force-flags as a last opportunity to get the EMT back + * to ring-3 before executing guest code. 
+ */ + pSvmTransient->fEFlags = ASMIntDisableFlags(); + if ( VM_FF_IS_ANY_SET(pVM, VM_FF_EMT_RENDEZVOUS | VM_FF_TM_VIRTUAL_SYNC) + || VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_HM_TO_R3_MASK)) + { + ASMSetFlags(pSvmTransient->fEFlags); + VMMRZCallRing3Enable(pVCpu); + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchHmToR3FF); + return VINF_EM_RAW_TO_R3; + } + if (RTThreadPreemptIsPending(NIL_RTTHREAD)) + { + ASMSetFlags(pSvmTransient->fEFlags); + VMMRZCallRing3Enable(pVCpu); + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchPendingHostIrq); + return VINF_EM_RAW_INTERRUPT; + } + return VINF_SUCCESS; +} +#endif + + +/** + * Does the preparations before executing guest code in AMD-V. + * + * This may cause longjmps to ring-3 and may even result in rescheduling to the + * recompiler. We must be cautious what we do here regarding committing + * guest-state information into the VMCB assuming we assuredly execute the guest + * in AMD-V. If we fall back to the recompiler after updating the VMCB and + * clearing the common-state (TRPM/forceflags), we must undo those changes so + * that the recompiler can (and should) use them when it resumes guest + * execution. Otherwise such operations must be done when we can no longer + * exit to ring-3. + * + * @returns VBox status code (informational status codes included). + * @retval VINF_SUCCESS if we can proceed with running the guest. + * @retval VINF_* scheduling changes, we have to go back to ring-3. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pSvmTransient Pointer to the SVM transient structure. + * + * @remarks When VINF_SUCCESS is returned, ring-3 calls (longjmps) and interrupts + * have been disabled. On every other return path they were either never + * disabled or are restored before returning. + */ +static int hmR0SvmPreRunGuest(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_ASSERT_PREEMPT_SAFE(pVCpu); + HMSVM_ASSERT_NOT_IN_NESTED_GUEST(&pVCpu->cpum.GstCtx); + + /* Check force flag actions that might require us to go back to ring-3. 
*/ + int rc = hmR0SvmCheckForceFlags(pVCpu); + if (rc != VINF_SUCCESS) + return rc; + + if (TRPMHasTrap(pVCpu)) + hmR0SvmTrpmTrapToPendingEvent(pVCpu); + else if (!pVCpu->hm.s.Event.fPending) + hmR0SvmEvaluatePendingEvent(pVCpu); + + /* + * On the oldest AMD-V systems, we may not get enough information to reinject an NMI. + * Just do it in software, see @bugref{8411}. + * NB: If we could continue a task switch exit we wouldn't need to do this. + */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + if (RT_UNLIKELY(pVCpu->hm.s.Event.fPending && (((pVCpu->hm.s.Event.u64IntInfo >> 8) & 7) == SVM_EVENT_NMI))) + if (RT_UNLIKELY(!pVM->hm.s.svm.u32Features)) + return VINF_EM_RAW_INJECT_TRPM_EVENT; + +#ifdef HMSVM_SYNC_FULL_GUEST_STATE + Assert(!(pVCpu->cpum.GstCtx.fExtrn & HMSVM_CPUMCTX_EXTRN_ALL)); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); +#endif + + /* + * Export the guest state bits that are not shared with the host in any way as we can + * longjmp or get preempted in the midst of exporting some of the state. + */ + rc = hmR0SvmExportGuestState(pVCpu); + AssertRCReturn(rc, rc); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExportFull); + + /* + * If we're not intercepting TPR changes in the guest, save the guest TPR before the + * world-switch so we can update it on the way back if the guest changed the TPR. + */ + if (pVCpu->hm.s.svm.fSyncVTpr) + { + PCSVMVMCB pVmcb = pVCpu->hm.s.svm.pVmcb; + if (pVM->hm.s.fTPRPatchingActive) + pSvmTransient->u8GuestTpr = pVmcb->guest.u64LSTAR & 0xff; /* Only the low byte of LSTAR is compared against the saved TPR after the world-switch. */ + else + pSvmTransient->u8GuestTpr = pVmcb->ctrl.IntCtrl.n.u8VTPR; + } + + /* + * No longjmps to ring-3 from this point on!!! + * + * Asserts() will still longjmp to ring-3 (but won't return), which is intentional, + * better than a kernel panic. This also disables flushing of the R0-logger instance. + */ + VMMRZCallRing3Disable(pVCpu); + + /* + * We disable interrupts so that we don't miss any interrupts that would flag preemption + * (IPI/timers etc.) 
when thread-context hooks aren't used and we've been running with + * preemption disabled for a while. Since this is purely to aid the + * RTThreadPreemptIsPending() code, it doesn't matter that it may temporarily reenable and + * disable interrupt on NT. + * + * We need to check for force-flags that could've possibly been altered since we last + * checked them (e.g. by PDMGetInterrupt() leaving the PDM critical section, + * see @bugref{6398}). + * + * We also check a couple of other force-flags as a last opportunity to get the EMT back + * to ring-3 before executing guest code. + */ + pSvmTransient->fEFlags = ASMIntDisableFlags(); + if ( VM_FF_IS_ANY_SET(pVM, VM_FF_EMT_RENDEZVOUS | VM_FF_TM_VIRTUAL_SYNC) + || VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_HM_TO_R3_MASK)) + { + ASMSetFlags(pSvmTransient->fEFlags); + VMMRZCallRing3Enable(pVCpu); + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchHmToR3FF); + return VINF_EM_RAW_TO_R3; + } + if (RTThreadPreemptIsPending(NIL_RTTHREAD)) + { + ASMSetFlags(pSvmTransient->fEFlags); + VMMRZCallRing3Enable(pVCpu); + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchPendingHostIrq); + return VINF_EM_RAW_INTERRUPT; + } + + return VINF_SUCCESS; +} + + +/** + * Prepares to run guest (or nested-guest) code in AMD-V and we've committed to + * doing so. + * + * This means there is no backing out to ring-3 or anywhere else at this point. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pSvmTransient Pointer to the SVM transient structure. + * + * @remarks Called with preemption disabled. + * @remarks No-long-jump zone!!! + */ +static void hmR0SvmPreRunGuestCommitted(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + Assert(VMMR0IsLogFlushDisabled(pVCpu)); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + VMCPU_ASSERT_STATE(pVCpu, VMCPUSTATE_STARTED_HM); + VMCPU_SET_STATE(pVCpu, VMCPUSTATE_STARTED_EXEC); /* Indicate the start of guest execution. 
*/ + + PVM pVM = pVCpu->CTX_SUFF(pVM); + PSVMVMCB pVmcb = pSvmTransient->pVmcb; + + hmR0SvmInjectPendingEvent(pVCpu, pVmcb); + + if (!CPUMIsGuestFPUStateActive(pVCpu)) + { + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatLoadGuestFpuState, x); + CPUMR0LoadGuestFPU(pVM, pVCpu); /* (Ignore rc, no need to set HM_CHANGED_HOST_CONTEXT for SVM.) */ + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatLoadGuestFpuState, x); + STAM_COUNTER_INC(&pVCpu->hm.s.StatLoadGuestFpu); + } + + /* Load the state shared between host and guest (FPU, debug). */ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_SVM_HOST_GUEST_SHARED_STATE) + hmR0SvmExportSharedState(pVCpu, pVmcb); + + pVCpu->hm.s.fCtxChanged &= ~HM_CHANGED_HOST_CONTEXT; /* Preemption might set this, nothing to do on AMD-V. */ + AssertMsg(!pVCpu->hm.s.fCtxChanged, ("fCtxChanged=%#RX64\n", pVCpu->hm.s.fCtxChanged)); + + PHMPHYSCPU pHostCpu = hmR0GetCurrentCpu(); + RTCPUID const idHostCpu = pHostCpu->idCpu; + bool const fMigratedHostCpu = idHostCpu != pVCpu->hm.s.idLastCpu; + + /* Setup TSC offsetting. */ + if ( pSvmTransient->fUpdateTscOffsetting + || fMigratedHostCpu) + { + hmR0SvmUpdateTscOffsetting(pVCpu, pVmcb); + pSvmTransient->fUpdateTscOffsetting = false; + } + + /* If we've migrated to a different host CPU, mark all the VMCB Clean bits as dirty. */ + if (fMigratedHostCpu) + pVmcb->ctrl.u32VmcbCleanBits = 0; + + /* Store status of the shared guest-host state at the time of VMRUN. 
*/ +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + if (CPUMIsGuestInLongModeEx(&pVCpu->cpum.GstCtx)) + { + pSvmTransient->fWasGuestDebugStateActive = CPUMIsGuestDebugStateActivePending(pVCpu); + pSvmTransient->fWasHyperDebugStateActive = CPUMIsHyperDebugStateActivePending(pVCpu); + } + else +#endif + { + pSvmTransient->fWasGuestDebugStateActive = CPUMIsGuestDebugStateActive(pVCpu); + pSvmTransient->fWasHyperDebugStateActive = CPUMIsHyperDebugStateActive(pVCpu); + } + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + uint8_t *pbMsrBitmap; + if (!pSvmTransient->fIsNestedGuest) + pbMsrBitmap = (uint8_t *)pVCpu->hm.s.svm.pvMsrBitmap; + else + { + hmR0SvmMergeMsrpmNested(pHostCpu, pVCpu); + + /* Update the nested-guest VMCB with the newly merged MSRPM (clean bits updated below). */ + pVmcb->ctrl.u64MSRPMPhysAddr = pHostCpu->n.svm.HCPhysNstGstMsrpm; + pbMsrBitmap = (uint8_t *)pHostCpu->n.svm.pvNstGstMsrpm; + } +#else + uint8_t *pbMsrBitmap = (uint8_t *)pVCpu->hm.s.svm.pvMsrBitmap; +#endif + + ASMAtomicWriteBool(&pVCpu->hm.s.fCheckedTLBFlush, true); /* Used for TLB flushing, set this across the world switch. */ + /* Flush the appropriate tagged-TLB entries. */ + hmR0SvmFlushTaggedTlb(pHostCpu, pVCpu, pVmcb); + Assert(pVCpu->hm.s.idLastCpu == idHostCpu); + + STAM_PROFILE_ADV_STOP_START(&pVCpu->hm.s.StatEntry, &pVCpu->hm.s.StatInGC, x); + + TMNotifyStartOfExecution(pVCpu); /* Finally, notify TM to resume its clocks as we're about + to start executing. */ + + /* + * Save the current Host TSC_AUX and write the guest TSC_AUX to the host, so that RDTSCPs + * (that don't cause exits) read the guest MSR, see @bugref{3324}. + * + * This should be done -after- any RDTSCPs for obtaining the host timestamp (TM, STAM etc). 
+ */ + if ( pVM->cpum.ro.HostFeatures.fRdTscP + && !(pVmcb->ctrl.u64InterceptCtrl & SVM_CTRL_INTERCEPT_RDTSCP)) + { + uint64_t const uGuestTscAux = CPUMGetGuestTscAux(pVCpu); + pVCpu->hm.s.u64HostTscAux = ASMRdMsr(MSR_K8_TSC_AUX); + if (uGuestTscAux != pVCpu->hm.s.u64HostTscAux) + ASMWrMsr(MSR_K8_TSC_AUX, uGuestTscAux); + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_K8_TSC_AUX, SVMMSREXIT_PASSTHRU_READ, SVMMSREXIT_PASSTHRU_WRITE); + pSvmTransient->fRestoreTscAuxMsr = true; + } + else + { + hmR0SvmSetMsrPermission(pVCpu, pbMsrBitmap, MSR_K8_TSC_AUX, SVMMSREXIT_INTERCEPT_READ, SVMMSREXIT_INTERCEPT_WRITE); + pSvmTransient->fRestoreTscAuxMsr = false; + } + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_IOPM_MSRPM; + + /* + * If the VMCB Clean bits aren't supported by the CPU or exposed to the guest in the nested + * virtualization case, mark all state-bits as dirty indicating to the CPU to re-load + * from the VMCB. + */ + bool const fSupportsVmcbCleanBits = hmR0SvmSupportsVmcbCleanBits(pVCpu); + if (!fSupportsVmcbCleanBits) + pVmcb->ctrl.u32VmcbCleanBits = 0; +} + + +/** + * Wrapper for running the guest (or nested-guest) code in AMD-V. + * + * @returns VBox status code, as returned by the VMRUN worker (pfnVMRun). + * @param pVCpu The cross context virtual CPU structure. + * @param HCPhysVmcb The host physical address of the VMCB. + * + * @remarks No-long-jump zone!!! + */ +DECLINLINE(int) hmR0SvmRunGuest(PVMCPU pVCpu, RTHCPHYS HCPhysVmcb) +{ + /* Mark that HM is the keeper of all guest-CPU registers now that we're going to execute guest code. */ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + pCtx->fExtrn |= HMSVM_CPUMCTX_EXTRN_ALL | CPUMCTX_EXTRN_KEEPER_HM; + + /* + * 64-bit Windows uses XMM registers in the kernel as the Microsoft compiler expresses + * floating-point operations using SSE instructions. Some XMM registers (XMM6-XMM15) are + * callee-saved and thus the need for this XMM wrapper. + * + * Refer MSDN "Configuring Programs for 64-bit/x64 Software Conventions / Register Usage". 
+ */ + PVM pVM = pVCpu->CTX_SUFF(pVM); +#ifdef VBOX_WITH_KERNEL_USING_XMM + return hmR0SVMRunWrapXMM(pVCpu->hm.s.svm.HCPhysVmcbHost, HCPhysVmcb, pCtx, pVM, pVCpu, pVCpu->hm.s.svm.pfnVMRun); +#else + return pVCpu->hm.s.svm.pfnVMRun(pVCpu->hm.s.svm.HCPhysVmcbHost, HCPhysVmcb, pCtx, pVM, pVCpu); +#endif +} + + +/** + * Undoes the TSC offset applied for an SVM nested-guest and returns the TSC + * value for the guest. + * + * @returns The TSC value obtained by subtracting the cached nested-guest TSC + * offset from @a uTicks. + * @param pVCpu The cross context virtual CPU structure of the calling EMT. + * @param uTicks The nested-guest TSC. + * + * @note If you make any changes to this function, please check if the + * function referenced below (see @sa) needs adjusting. + * + * @sa HMApplySvmNstGstTscOffset(). + */ +DECLINLINE(uint64_t) hmR0SvmNstGstUndoTscOffset(PVMCPU pVCpu, uint64_t uTicks) +{ + PCSVMNESTEDVMCBCACHE pVmcbNstGstCache = &pVCpu->hm.s.svm.NstGstVmcbCache; + Assert(pVmcbNstGstCache->fCacheValid); + return uTicks - pVmcbNstGstCache->u64TSCOffset; +} + + +/** + * Performs some essential restoration of state after running guest (or + * nested-guest) code in AMD-V. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pSvmTransient Pointer to the SVM transient structure. + * @param rcVMRun Return code of VMRUN. + * + * @remarks Called with interrupts disabled. + * @remarks No-long-jump zone!!! This function will however re-enable longjmps + * unconditionally when it is safe to do so. + */ +static void hmR0SvmPostRunGuest(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient, int rcVMRun) +{ + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + + uint64_t const uHostTsc = ASMReadTSC(); /* Read the TSC as soon as possible. */ + ASMAtomicWriteBool(&pVCpu->hm.s.fCheckedTLBFlush, false); /* See HMInvalidatePageOnAllVCpus(): used for TLB flushing. */ + ASMAtomicIncU32(&pVCpu->hm.s.cWorldSwitchExits); /* Initialized in vmR3CreateUVM(): used for EMT poking. 
*/ + + PSVMVMCB pVmcb = pSvmTransient->pVmcb; + PSVMVMCBCTRL pVmcbCtrl = &pVmcb->ctrl; + + /* TSC read must be done early for maximum accuracy. */ + if (!(pVmcbCtrl->u64InterceptCtrl & SVM_CTRL_INTERCEPT_RDTSC)) + { + if (!pSvmTransient->fIsNestedGuest) + TMCpuTickSetLastSeen(pVCpu, uHostTsc + pVmcbCtrl->u64TSCOffset); +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + else + { + /* The nested-guest VMCB TSC offset shall eventually be restored on #VMEXIT via HMNotifySvmNstGstVmexit(). */ + uint64_t const uGstTsc = hmR0SvmNstGstUndoTscOffset(pVCpu, uHostTsc + pVmcbCtrl->u64TSCOffset); + TMCpuTickSetLastSeen(pVCpu, uGstTsc); + } +#endif + } + + /* Capture the current guest TSC_AUX value and write the host value back if it differs. */ + if (pSvmTransient->fRestoreTscAuxMsr) + { + uint64_t u64GuestTscAuxMsr = ASMRdMsr(MSR_K8_TSC_AUX); + CPUMSetGuestTscAux(pVCpu, u64GuestTscAuxMsr); + if (u64GuestTscAuxMsr != pVCpu->hm.s.u64HostTscAux) + ASMWrMsr(MSR_K8_TSC_AUX, pVCpu->hm.s.u64HostTscAux); + } + + STAM_PROFILE_ADV_STOP_START(&pVCpu->hm.s.StatInGC, &pVCpu->hm.s.StatPreExit, x); + TMNotifyEndOfExecution(pVCpu); /* Notify TM that the guest is no longer running. */ + VMCPU_SET_STATE(pVCpu, VMCPUSTATE_STARTED_HM); + + Assert(!(ASMGetFlags() & X86_EFL_IF)); + ASMSetFlags(pSvmTransient->fEFlags); /* Restore the EFLAGS saved when interrupts were disabled before VMRUN. */ + VMMRZCallRing3Enable(pVCpu); /* It is now safe to do longjmps to ring-3!!! */ + + /* If VMRUN failed, we can bail out early. This does -not- cover SVM_EXIT_INVALID. */ + if (RT_UNLIKELY(rcVMRun != VINF_SUCCESS)) + { + Log4Func(("VMRUN failure: rcVMRun=%Rrc\n", rcVMRun)); + return; + } + + pSvmTransient->u64ExitCode = pVmcbCtrl->u64ExitCode; /* Save the #VMEXIT reason. */ + pVmcbCtrl->u32VmcbCleanBits = HMSVM_VMCB_CLEAN_ALL; /* Mark the VMCB-state cache as unmodified by VMM. */ + pSvmTransient->fVectoringDoublePF = false; /* Vectoring double page-fault needs to be determined later. */ + pSvmTransient->fVectoringPF = false; /* Vectoring page-fault needs to be determined later. 
*/ + +#ifdef HMSVM_SYNC_FULL_GUEST_STATE + hmR0SvmImportGuestState(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + Assert(!(pVCpu->cpum.GstCtx.fExtrn & HMSVM_CPUMCTX_EXTRN_ALL)); +#else + /* + * Always import the following: + * + * - RIP for exit optimizations and evaluating event injection on re-entry. + * - RFLAGS for evaluating event injection on VM re-entry and for exporting shared debug + * state on preemption. + * - Interrupt shadow, GIF for evaluating event injection on VM re-entry. + * - CS for exit optimizations. + * - RAX, RSP for simplifying assumptions on GPRs. All other GPRs are swapped by the + * assembly switcher code. + * - Shared state (only DR7 currently) for exporting shared debug state on preemption. + */ + hmR0SvmImportGuestState(pVCpu, CPUMCTX_EXTRN_RIP + | CPUMCTX_EXTRN_RFLAGS + | CPUMCTX_EXTRN_RAX + | CPUMCTX_EXTRN_RSP + | CPUMCTX_EXTRN_CS + | CPUMCTX_EXTRN_HWVIRT + | CPUMCTX_EXTRN_HM_SVM_INT_SHADOW + | CPUMCTX_EXTRN_HM_SVM_HWVIRT_VIRQ + | HMSVM_CPUMCTX_SHARED_STATE); +#endif + + if ( pSvmTransient->u64ExitCode != SVM_EXIT_INVALID + && pVCpu->hm.s.svm.fSyncVTpr) + { + Assert(!pSvmTransient->fIsNestedGuest); + /* TPR patching (for 32-bit guests) uses LSTAR MSR for holding the TPR value, otherwise uses the VTPR. */ + if ( pVCpu->CTX_SUFF(pVM)->hm.s.fTPRPatchingActive + && (pVmcb->guest.u64LSTAR & 0xff) != pSvmTransient->u8GuestTpr) + { + int rc = APICSetTpr(pVCpu, pVmcb->guest.u64LSTAR & 0xff); + AssertRC(rc); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_APIC_TPR); + } + /* Sync TPR when we aren't intercepting CR8 writes. 
*/ + else if (pSvmTransient->u8GuestTpr != pVmcbCtrl->IntCtrl.n.u8VTPR) + { + int rc = APICSetTpr(pVCpu, pVmcbCtrl->IntCtrl.n.u8VTPR << 4); + AssertRC(rc); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_APIC_TPR); + } + } + +#ifdef DEBUG_ramshankar + if (CPUMIsGuestInSvmNestedHwVirtMode(&pVCpu->cpum.GstCtx)) + { + hmR0SvmImportGuestState(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + /* NOTE(review): GstCtx is passed by value here while the other call sites in this file pass &pVCpu->cpum.GstCtx; confirm against the hmR0SvmLogState() signature. */ + hmR0SvmLogState(pVCpu, pVmcb, pVCpu->cpum.GstCtx, "hmR0SvmPostRunGuestNested", HMSVM_LOG_ALL & ~HMSVM_LOG_LBR, + 0 /* uVerbose */); + } +#endif + + HMSVM_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_RIP); + EMHistoryAddExit(pVCpu, EMEXIT_MAKE_FT(EMEXIT_F_KIND_SVM, pSvmTransient->u64ExitCode & EMEXIT_F_TYPE_MASK), + pVCpu->cpum.GstCtx.cs.u64Base + pVCpu->cpum.GstCtx.rip, uHostTsc); +} + + +/** + * Runs the guest code using AMD-V. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param pcLoops Pointer to the number of executed loops. Incremented after + * each successfully handled #VMEXIT; once it reaches the VM's + * cMaxResumeLoops the loop exits with VINF_EM_RAW_INTERRUPT. + */ +static int hmR0SvmRunGuestCodeNormal(PVMCPU pVCpu, uint32_t *pcLoops) +{ + uint32_t const cMaxResumeLoops = pVCpu->CTX_SUFF(pVM)->hm.s.cMaxResumeLoops; + Assert(pcLoops); + Assert(*pcLoops <= cMaxResumeLoops); + + SVMTRANSIENT SvmTransient; + RT_ZERO(SvmTransient); + SvmTransient.fUpdateTscOffsetting = true; + SvmTransient.pVmcb = pVCpu->hm.s.svm.pVmcb; + + int rc = VERR_INTERNAL_ERROR_5; + for (;;) + { + Assert(!HMR0SuspendPending()); + HMSVM_ASSERT_CPU_SAFE(pVCpu); + + /* Preparatory work for running guest code, this may force us to return to + ring-3. This bugger disables interrupts on VINF_SUCCESS! */ + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatEntry, x); + rc = hmR0SvmPreRunGuest(pVCpu, &SvmTransient); + if (rc != VINF_SUCCESS) + break; + + /* + * No longjmps to ring-3 from this point on!!! + * + * Asserts() will still longjmp to ring-3 (but won't return), which is intentional, + * better than a kernel panic. This also disables flushing of the R0-logger instance. 
+ */ + hmR0SvmPreRunGuestCommitted(pVCpu, &SvmTransient); + rc = hmR0SvmRunGuest(pVCpu, pVCpu->hm.s.svm.HCPhysVmcb); + + /* Restore any residual host-state and save any bits shared between host and guest + into the guest-CPU state. Re-enables interrupts! */ + hmR0SvmPostRunGuest(pVCpu, &SvmTransient, rc); + + if (RT_UNLIKELY( rc != VINF_SUCCESS /* Check for VMRUN errors. */ + || SvmTransient.u64ExitCode == SVM_EXIT_INVALID)) /* Check for invalid guest-state errors. */ + { + if (rc == VINF_SUCCESS) + rc = VERR_SVM_INVALID_GUEST_STATE; + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatPreExit, x); + hmR0SvmReportWorldSwitchError(pVCpu, rc); + break; + } + + /* Handle the #VMEXIT. */ + HMSVM_EXITCODE_STAM_COUNTER_INC(SvmTransient.u64ExitCode); + STAM_PROFILE_ADV_STOP_START(&pVCpu->hm.s.StatPreExit, &pVCpu->hm.s.StatExitHandling, x); + VBOXVMM_R0_HMSVM_VMEXIT(pVCpu, &pVCpu->cpum.GstCtx, SvmTransient.u64ExitCode, pVCpu->hm.s.svm.pVmcb); + rc = hmR0SvmHandleExit(pVCpu, &SvmTransient); + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExitHandling, x); + if (rc != VINF_SUCCESS) + break; + if (++(*pcLoops) >= cMaxResumeLoops) + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchMaxResumeLoops); + rc = VINF_EM_RAW_INTERRUPT; + break; + } + } + + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatEntry, x); + return rc; +} + + +/** + * Runs the guest code using AMD-V in single step mode. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param pcLoops Pointer to the number of executed loops. 
+ */ +static int hmR0SvmRunGuestCodeStep(PVMCPU pVCpu, uint32_t *pcLoops) +{ + uint32_t const cMaxResumeLoops = pVCpu->CTX_SUFF(pVM)->hm.s.cMaxResumeLoops; + Assert(pcLoops); + Assert(*pcLoops <= cMaxResumeLoops); + + SVMTRANSIENT SvmTransient; + RT_ZERO(SvmTransient); + SvmTransient.fUpdateTscOffsetting = true; + SvmTransient.pVmcb = pVCpu->hm.s.svm.pVmcb; + + /* Remember the starting CS:RIP so the loop can tell when the guest has moved on. */ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + uint16_t uCsStart = pCtx->cs.Sel; + uint64_t uRipStart = pCtx->rip; + + int rc = VERR_INTERNAL_ERROR_5; + for (;;) + { + Assert(!HMR0SuspendPending()); + AssertMsg(pVCpu->hm.s.idEnteredCpu == RTMpCpuId(), + ("Illegal migration! Entered on CPU %u Current %u cLoops=%u\n", (unsigned)pVCpu->hm.s.idEnteredCpu, + (unsigned)RTMpCpuId(), *pcLoops)); + + /* Preparatory work for running guest code, this may force us to return to + ring-3. This bugger disables interrupts on VINF_SUCCESS! */ + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatEntry, x); + rc = hmR0SvmPreRunGuest(pVCpu, &SvmTransient); + if (rc != VINF_SUCCESS) + break; + + /* + * No longjmps to ring-3 from this point on!!! + * + * Asserts() will still longjmp to ring-3 (but won't return), which is intentional, + * better than a kernel panic. This also disables flushing of the R0-logger instance. + */ + /* NOTE(review): hmR0SvmPreRunGuest() already calls VMMRZCallRing3Disable() before returning VINF_SUCCESS and hmR0SvmPostRunGuest() calls VMMRZCallRing3Enable() once; hmR0SvmRunGuestCodeNormal() makes neither of the next two calls. Confirm the extra disable and the notification removal are intended. */ + VMMRZCallRing3Disable(pVCpu); + VMMRZCallRing3RemoveNotification(pVCpu); + hmR0SvmPreRunGuestCommitted(pVCpu, &SvmTransient); + + rc = hmR0SvmRunGuest(pVCpu, pVCpu->hm.s.svm.HCPhysVmcb); + + /* Restore any residual host-state and save any bits shared between host and guest + into the guest-CPU state. Re-enables interrupts! */ + hmR0SvmPostRunGuest(pVCpu, &SvmTransient, rc); + + if (RT_UNLIKELY( rc != VINF_SUCCESS /* Check for VMRUN errors. */ + || SvmTransient.u64ExitCode == SVM_EXIT_INVALID)) /* Check for invalid guest-state errors. 
*/ + { + if (rc == VINF_SUCCESS) + rc = VERR_SVM_INVALID_GUEST_STATE; + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatPreExit, x); + hmR0SvmReportWorldSwitchError(pVCpu, rc); + /* NOTE(review): returns directly, so the fClearTrapFlag handling and the final StatEntry stop after the loop are skipped on this path (hmR0SvmRunGuestCodeNormal() uses break here); confirm this is intended. */ + return rc; + } + + /* Handle the #VMEXIT. */ + HMSVM_EXITCODE_STAM_COUNTER_INC(SvmTransient.u64ExitCode); + STAM_PROFILE_ADV_STOP_START(&pVCpu->hm.s.StatPreExit, &pVCpu->hm.s.StatExitHandling, x); + VBOXVMM_R0_HMSVM_VMEXIT(pVCpu, pCtx, SvmTransient.u64ExitCode, pVCpu->hm.s.svm.pVmcb); + rc = hmR0SvmHandleExit(pVCpu, &SvmTransient); + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExitHandling, x); + if (rc != VINF_SUCCESS) + break; + if (++(*pcLoops) >= cMaxResumeLoops) + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchMaxResumeLoops); + rc = VINF_EM_RAW_INTERRUPT; + break; + } + + /* + * Did CS:RIP change? If so, consider it a single step. + * Otherwise, make sure one of the TFs gets set. + */ + if ( pCtx->rip != uRipStart + || pCtx->cs.Sel != uCsStart) + { + rc = VINF_EM_DBG_STEPPED; + break; + } + pVCpu->hm.s.fCtxChanged |= HM_CHANGED_GUEST_DR_MASK; + } + + /* + * Clear the X86_EFL_TF if necessary. + */ + if (pVCpu->hm.s.fClearTrapFlag) + { + pVCpu->hm.s.fClearTrapFlag = false; + pCtx->eflags.Bits.u1TF = 0; + } + + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatEntry, x); + return rc; +} + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +/** + * Runs the nested-guest code using AMD-V. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param pcLoops Pointer to the number of executed loops. If we're switching + * from the guest-code execution loop to this nested-guest + * execution loop pass the remainder value, else pass 0. 
+ */ +static int hmR0SvmRunGuestCodeNested(PVMCPU pVCpu, uint32_t *pcLoops) +{ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + HMSVM_ASSERT_IN_NESTED_GUEST(pCtx); + Assert(pcLoops); + Assert(*pcLoops <= pVCpu->CTX_SUFF(pVM)->hm.s.cMaxResumeLoops); + + SVMTRANSIENT SvmTransient; + RT_ZERO(SvmTransient); + SvmTransient.fUpdateTscOffsetting = true; + SvmTransient.pVmcb = pCtx->hwvirt.svm.CTX_SUFF(pVmcb); + SvmTransient.fIsNestedGuest = true; + + int rc = VERR_INTERNAL_ERROR_4; + for (;;) + { + Assert(!HMR0SuspendPending()); + HMSVM_ASSERT_CPU_SAFE(pVCpu); + + /* Preparatory work for running nested-guest code, this may force us to return to + ring-3. This bugger disables interrupts on VINF_SUCCESS! */ + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatEntry, x); + rc = hmR0SvmPreRunGuestNested(pVCpu, &SvmTransient); + if ( rc != VINF_SUCCESS + || !CPUMIsGuestInSvmNestedHwVirtMode(pCtx)) + { + break; + } + + /* + * No longjmps to ring-3 from this point on!!! + * + * Asserts() will still longjmp to ring-3 (but won't return), which is intentional, + * better than a kernel panic. This also disables flushing of the R0-logger instance. + */ + hmR0SvmPreRunGuestCommitted(pVCpu, &SvmTransient); + + rc = hmR0SvmRunGuest(pVCpu, pCtx->hwvirt.svm.HCPhysVmcb); + + /* Restore any residual host-state and save any bits shared between host and guest + into the guest-CPU state. Re-enables interrupts! */ + hmR0SvmPostRunGuest(pVCpu, &SvmTransient, rc); + + if (RT_LIKELY( rc == VINF_SUCCESS + && SvmTransient.u64ExitCode != SVM_EXIT_INVALID)) + { /* extremely likely */ } + else + { + /* VMRUN failed, shouldn't really happen, Guru. */ + if (rc != VINF_SUCCESS) + break; + + /* Invalid nested-guest state. Cause a #VMEXIT but assert on strict builds. */ + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + AssertMsgFailed(("Invalid nested-guest state. 
rc=%Rrc u64ExitCode=%#RX64\n", rc, SvmTransient.u64ExitCode)); + rc = VBOXSTRICTRC_TODO(IEMExecSvmVmexit(pVCpu, SVM_EXIT_INVALID, 0, 0)); + break; + } + + /* Handle the #VMEXIT. */ + HMSVM_NESTED_EXITCODE_STAM_COUNTER_INC(SvmTransient.u64ExitCode); + STAM_PROFILE_ADV_STOP_START(&pVCpu->hm.s.StatPreExit, &pVCpu->hm.s.StatExitHandling, x); + VBOXVMM_R0_HMSVM_VMEXIT(pVCpu, pCtx, SvmTransient.u64ExitCode, pCtx->hwvirt.svm.CTX_SUFF(pVmcb)); + rc = hmR0SvmHandleExitNested(pVCpu, &SvmTransient); + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExitHandling, x); + if ( rc != VINF_SUCCESS + || !CPUMIsGuestInSvmNestedHwVirtMode(pCtx)) + break; + if (++(*pcLoops) >= pVCpu->CTX_SUFF(pVM)->hm.s.cMaxResumeLoops) + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchMaxResumeLoops); + rc = VINF_EM_RAW_INTERRUPT; + break; + } + + /** @todo handle single-stepping */ + } + + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatEntry, x); + return rc; +} +#endif + + +/** + * Runs the guest code using AMD-V. + * + * @returns Strict VBox status code. + * @param pVCpu The cross context virtual CPU structure. + */ +VMMR0DECL(VBOXSTRICTRC) SVMR0RunGuestCode(PVMCPU pVCpu) +{ + Assert(VMMRZCallRing3IsEnabled(pVCpu)); + HMSVM_ASSERT_PREEMPT_SAFE(pVCpu); + VMMRZCallRing3SetNotification(pVCpu, hmR0SvmCallRing3Callback, NULL /* pvUser */); + + uint32_t cLoops = 0; + int rc; +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + if (!CPUMIsGuestInSvmNestedHwVirtMode(&pVCpu->cpum.GstCtx)) +#endif + { + if (!pVCpu->hm.s.fSingleInstruction) + rc = hmR0SvmRunGuestCodeNormal(pVCpu, &cLoops); + else + rc = hmR0SvmRunGuestCodeStep(pVCpu, &cLoops); + } +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + else + { + rc = VINF_SVM_VMRUN; + } + + /* Re-check the nested-guest condition here as we may be transitioning from the normal + execution loop into the nested-guest, hence this is not placed in the 'else' part above. 
*/ + if (rc == VINF_SVM_VMRUN) + { + rc = hmR0SvmRunGuestCodeNested(pVCpu, &cLoops); + if (rc == VINF_SVM_VMEXIT) + rc = VINF_SUCCESS; + } +#endif + + /* Fixup error codes. */ + if (rc == VERR_EM_INTERPRETER) + rc = VINF_EM_RAW_EMULATE_INSTR; + else if (rc == VINF_EM_RESET) + rc = VINF_EM_TRIPLE_FAULT; + + /* Prepare to return to ring-3. This will remove longjmp notifications. */ + rc = hmR0SvmExitToRing3(pVCpu, rc); + Assert(!VMMRZCallRing3IsNotificationSet(pVCpu)); + return rc; +} + + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +/** + * Determines whether an IOIO intercept is active for the nested-guest or not. + * + * @param pvIoBitmap Pointer to the nested-guest IO bitmap. + * @param pIoExitInfo Pointer to the SVMIOIOEXITINFO. + */ +static bool hmR0SvmIsIoInterceptActive(void *pvIoBitmap, PSVMIOIOEXITINFO pIoExitInfo) +{ + const uint16_t u16Port = pIoExitInfo->n.u16Port; + const SVMIOIOTYPE enmIoType = (SVMIOIOTYPE)pIoExitInfo->n.u1Type; + const uint8_t cbReg = (pIoExitInfo->u >> SVM_IOIO_OP_SIZE_SHIFT) & 7; + const uint8_t cAddrSizeBits = ((pIoExitInfo->u >> SVM_IOIO_ADDR_SIZE_SHIFT) & 7) << 4; + const uint8_t iEffSeg = pIoExitInfo->n.u3Seg; + const bool fRep = pIoExitInfo->n.u1Rep; + const bool fStrIo = pIoExitInfo->n.u1Str; + + return HMIsSvmIoInterceptActive(pvIoBitmap, u16Port, enmIoType, cbReg, cAddrSizeBits, iEffSeg, fRep, fStrIo, + NULL /* pIoExitInfo */); +} + + +/** + * Handles a nested-guest \#VMEXIT (for all EXITCODE values except + * SVM_EXIT_INVALID). + * + * @returns VBox status code (informational status codes included). + * @param pVCpu The cross context virtual CPU structure. + * @param pSvmTransient Pointer to the SVM transient structure. 
+ */ +static int hmR0SvmHandleExitNested(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_ASSERT_IN_NESTED_GUEST(&pVCpu->cpum.GstCtx); + Assert(pSvmTransient->u64ExitCode != SVM_EXIT_INVALID); + Assert(pSvmTransient->u64ExitCode <= SVM_EXIT_MAX); + + /* + * We import the complete state here because we use separate VMCBs for the guest and the + * nested-guest, and the guest's VMCB is used after the #VMEXIT. We can only save/restore + * the #VMEXIT specific state if we used the same VMCB for both guest and nested-guest. + */ +#define NST_GST_VMEXIT_CALL_RET(a_pVCpu, a_uExitCode, a_uExitInfo1, a_uExitInfo2) \ + do { \ + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); \ + return VBOXSTRICTRC_TODO(IEMExecSvmVmexit((a_pVCpu), (a_uExitCode), (a_uExitInfo1), (a_uExitInfo2))); \ + } while (0) + + /* + * For all the #VMEXITs here we primarily figure out if the #VMEXIT is expected by the + * nested-guest. If it isn't, it should be handled by the (outer) guest. + */ + PSVMVMCB pVmcbNstGst = pVCpu->cpum.GstCtx.hwvirt.svm.CTX_SUFF(pVmcb); + PSVMVMCBCTRL pVmcbNstGstCtrl = &pVmcbNstGst->ctrl; + uint64_t const uExitCode = pVmcbNstGstCtrl->u64ExitCode; + uint64_t const uExitInfo1 = pVmcbNstGstCtrl->u64ExitInfo1; + uint64_t const uExitInfo2 = pVmcbNstGstCtrl->u64ExitInfo2; + + Assert(uExitCode == pVmcbNstGstCtrl->u64ExitCode); + switch (uExitCode) + { + case SVM_EXIT_CPUID: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_CPUID)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitCpuid(pVCpu, pSvmTransient); + } + + case SVM_EXIT_RDTSC: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_RDTSC)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitRdtsc(pVCpu, pSvmTransient); + } + + case SVM_EXIT_RDTSCP: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_RDTSCP)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return 
hmR0SvmExitRdtscp(pVCpu, pSvmTransient); + } + + case SVM_EXIT_MONITOR: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_MONITOR)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitMonitor(pVCpu, pSvmTransient); + } + + case SVM_EXIT_MWAIT: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_MWAIT)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitMwait(pVCpu, pSvmTransient); + } + + case SVM_EXIT_HLT: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_HLT)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitHlt(pVCpu, pSvmTransient); + } + + case SVM_EXIT_MSR: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_MSR_PROT)) + { + uint32_t const idMsr = pVCpu->cpum.GstCtx.ecx; + uint16_t offMsrpm; + uint8_t uMsrpmBit; + int rc = HMGetSvmMsrpmOffsetAndBit(idMsr, &offMsrpm, &uMsrpmBit); + if (RT_SUCCESS(rc)) + { + Assert(uMsrpmBit == 0 || uMsrpmBit == 2 || uMsrpmBit == 4 || uMsrpmBit == 6); + Assert(offMsrpm < SVM_MSRPM_PAGES << X86_PAGE_4K_SHIFT); + + uint8_t const *pbMsrBitmap = (uint8_t const *)pVCpu->cpum.GstCtx.hwvirt.svm.CTX_SUFF(pvMsrBitmap); + pbMsrBitmap += offMsrpm; + bool const fInterceptRead = RT_BOOL(*pbMsrBitmap & RT_BIT(uMsrpmBit)); + bool const fInterceptWrite = RT_BOOL(*pbMsrBitmap & RT_BIT(uMsrpmBit + 1)); + + if ( (fInterceptWrite && pVmcbNstGstCtrl->u64ExitInfo1 == SVM_EXIT1_MSR_WRITE) + || (fInterceptRead && pVmcbNstGstCtrl->u64ExitInfo1 == SVM_EXIT1_MSR_READ)) + { + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + } + } + else + { + /* + * MSRs not covered by the MSRPM automatically cause an #VMEXIT. + * See AMD-V spec. "15.11 MSR Intercepts". 
+ */ + Assert(rc == VERR_OUT_OF_RANGE); + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + } + } + return hmR0SvmExitMsr(pVCpu, pSvmTransient); + } + + case SVM_EXIT_IOIO: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_IOIO_PROT)) + { + void *pvIoBitmap = pVCpu->cpum.GstCtx.hwvirt.svm.CTX_SUFF(pvIoBitmap); + SVMIOIOEXITINFO IoExitInfo; + IoExitInfo.u = pVmcbNstGst->ctrl.u64ExitInfo1; + bool const fIntercept = hmR0SvmIsIoInterceptActive(pvIoBitmap, &IoExitInfo); + if (fIntercept) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + } + return hmR0SvmExitIOInstr(pVCpu, pSvmTransient); + } + + case SVM_EXIT_XCPT_PF: + { + PVM pVM = pVCpu->CTX_SUFF(pVM); + if (pVM->hm.s.fNestedPaging) + { + uint32_t const u32ErrCode = pVmcbNstGstCtrl->u64ExitInfo1; + uint64_t const uFaultAddress = pVmcbNstGstCtrl->u64ExitInfo2; + + /* If the nested-guest is intercepting #PFs, cause a #PF #VMEXIT. */ + if (HMIsGuestSvmXcptInterceptSet(pVCpu, X86_XCPT_PF)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, u32ErrCode, uFaultAddress); + + /* If the nested-guest is not intercepting #PFs, forward the #PF to the guest. 
*/ + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CR2); + hmR0SvmSetPendingXcptPF(pVCpu, u32ErrCode, uFaultAddress); + return VINF_SUCCESS; + } + return hmR0SvmExitXcptPF(pVCpu, pSvmTransient); + } + + case SVM_EXIT_XCPT_UD: + { + if (HMIsGuestSvmXcptInterceptSet(pVCpu, X86_XCPT_UD)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + hmR0SvmSetPendingXcptUD(pVCpu); + return VINF_SUCCESS; + } + + case SVM_EXIT_XCPT_MF: + { + if (HMIsGuestSvmXcptInterceptSet(pVCpu, X86_XCPT_MF)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitXcptMF(pVCpu, pSvmTransient); + } + + case SVM_EXIT_XCPT_DB: + { + if (HMIsGuestSvmXcptInterceptSet(pVCpu, X86_XCPT_DB)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmNestedExitXcptDB(pVCpu, pSvmTransient); + } + + case SVM_EXIT_XCPT_AC: + { + if (HMIsGuestSvmXcptInterceptSet(pVCpu, X86_XCPT_AC)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitXcptAC(pVCpu, pSvmTransient); + } + + case SVM_EXIT_XCPT_BP: + { + if (HMIsGuestSvmXcptInterceptSet(pVCpu, X86_XCPT_BP)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmNestedExitXcptBP(pVCpu, pSvmTransient); + } + + case SVM_EXIT_READ_CR0: + case SVM_EXIT_READ_CR3: + case SVM_EXIT_READ_CR4: + { + uint8_t const uCr = uExitCode - SVM_EXIT_READ_CR0; + if (HMIsGuestSvmReadCRxInterceptSet(pVCpu, uCr)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitReadCRx(pVCpu, pSvmTransient); + } + + case SVM_EXIT_CR0_SEL_WRITE: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_CR0_SEL_WRITE)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitWriteCRx(pVCpu, pSvmTransient); + } + + case SVM_EXIT_WRITE_CR0: + case SVM_EXIT_WRITE_CR3: + case SVM_EXIT_WRITE_CR4: + case SVM_EXIT_WRITE_CR8: /* CR8 writes would go to the V_TPR rather than here, since we run 
with V_INTR_MASKING. */ + { + uint8_t const uCr = uExitCode - SVM_EXIT_WRITE_CR0; + Log4Func(("Write CR%u: uExitInfo1=%#RX64 uExitInfo2=%#RX64\n", uCr, uExitInfo1, uExitInfo2)); + + if (HMIsGuestSvmWriteCRxInterceptSet(pVCpu, uCr)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitWriteCRx(pVCpu, pSvmTransient); + } + + case SVM_EXIT_PAUSE: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_PAUSE)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitPause(pVCpu, pSvmTransient); + } + + case SVM_EXIT_VINTR: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_VINTR)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitUnexpected(pVCpu, pSvmTransient); + } + + case SVM_EXIT_INTR: + case SVM_EXIT_NMI: + case SVM_EXIT_SMI: + case SVM_EXIT_XCPT_NMI: /* Should not occur, SVM_EXIT_NMI is used instead. */ + { + /* + * We shouldn't direct physical interrupts, NMIs, SMIs to the nested-guest. + * + * Although we don't intercept SMIs, the nested-guest might. Therefore, we might + * get an SMI #VMEXIT here so simply ignore rather than causing a corresponding + * nested-guest #VMEXIT. + * + * We shall import the complete state here as we may cause #VMEXITs from ring-3 + * while trying to inject interrupts, see comment at the top of this function. 
+ */ + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_ALL); + return hmR0SvmExitIntr(pVCpu, pSvmTransient); + } + + case SVM_EXIT_FERR_FREEZE: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_FERR_FREEZE)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitFerrFreeze(pVCpu, pSvmTransient); + } + + case SVM_EXIT_INVLPG: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_INVLPG)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitInvlpg(pVCpu, pSvmTransient); + } + + case SVM_EXIT_WBINVD: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_WBINVD)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitWbinvd(pVCpu, pSvmTransient); + } + + case SVM_EXIT_INVD: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_INVD)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitInvd(pVCpu, pSvmTransient); + } + + case SVM_EXIT_RDPMC: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_RDPMC)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitRdpmc(pVCpu, pSvmTransient); + } + + default: + { + switch (uExitCode) + { + case SVM_EXIT_READ_DR0: case SVM_EXIT_READ_DR1: case SVM_EXIT_READ_DR2: case SVM_EXIT_READ_DR3: + case SVM_EXIT_READ_DR6: case SVM_EXIT_READ_DR7: case SVM_EXIT_READ_DR8: case SVM_EXIT_READ_DR9: + case SVM_EXIT_READ_DR10: case SVM_EXIT_READ_DR11: case SVM_EXIT_READ_DR12: case SVM_EXIT_READ_DR13: + case SVM_EXIT_READ_DR14: case SVM_EXIT_READ_DR15: + { + uint8_t const uDr = uExitCode - SVM_EXIT_READ_DR0; + if (HMIsGuestSvmReadDRxInterceptSet(pVCpu, uDr)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitReadDRx(pVCpu, pSvmTransient); + } + + case SVM_EXIT_WRITE_DR0: case SVM_EXIT_WRITE_DR1: case SVM_EXIT_WRITE_DR2: case SVM_EXIT_WRITE_DR3: + case SVM_EXIT_WRITE_DR6: case 
SVM_EXIT_WRITE_DR7: case SVM_EXIT_WRITE_DR8: case SVM_EXIT_WRITE_DR9: + case SVM_EXIT_WRITE_DR10: case SVM_EXIT_WRITE_DR11: case SVM_EXIT_WRITE_DR12: case SVM_EXIT_WRITE_DR13: + case SVM_EXIT_WRITE_DR14: case SVM_EXIT_WRITE_DR15: + { + uint8_t const uDr = uExitCode - SVM_EXIT_WRITE_DR0; + if (HMIsGuestSvmWriteDRxInterceptSet(pVCpu, uDr)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitWriteDRx(pVCpu, pSvmTransient); + } + + case SVM_EXIT_XCPT_DE: + /* SVM_EXIT_XCPT_DB: */ /* Handled above. */ + /* SVM_EXIT_XCPT_NMI: */ /* Handled above. */ + /* SVM_EXIT_XCPT_BP: */ /* Handled above. */ + case SVM_EXIT_XCPT_OF: + case SVM_EXIT_XCPT_BR: + /* SVM_EXIT_XCPT_UD: */ /* Handled above. */ + case SVM_EXIT_XCPT_NM: + case SVM_EXIT_XCPT_DF: + case SVM_EXIT_XCPT_CO_SEG_OVERRUN: + case SVM_EXIT_XCPT_TS: + case SVM_EXIT_XCPT_NP: + case SVM_EXIT_XCPT_SS: + case SVM_EXIT_XCPT_GP: + /* SVM_EXIT_XCPT_PF: */ /* Handled above. */ + case SVM_EXIT_XCPT_15: /* Reserved. */ + /* SVM_EXIT_XCPT_MF: */ /* Handled above. */ + /* SVM_EXIT_XCPT_AC: */ /* Handled above. 
*/ + case SVM_EXIT_XCPT_MC: + case SVM_EXIT_XCPT_XF: + case SVM_EXIT_XCPT_20: case SVM_EXIT_XCPT_21: case SVM_EXIT_XCPT_22: case SVM_EXIT_XCPT_23: + case SVM_EXIT_XCPT_24: case SVM_EXIT_XCPT_25: case SVM_EXIT_XCPT_26: case SVM_EXIT_XCPT_27: + case SVM_EXIT_XCPT_28: case SVM_EXIT_XCPT_29: case SVM_EXIT_XCPT_30: case SVM_EXIT_XCPT_31: + { + uint8_t const uVector = uExitCode - SVM_EXIT_XCPT_0; + if (HMIsGuestSvmXcptInterceptSet(pVCpu, uVector)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitXcptGeneric(pVCpu, pSvmTransient); + } + + case SVM_EXIT_XSETBV: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_XSETBV)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitXsetbv(pVCpu, pSvmTransient); + } + + case SVM_EXIT_TASK_SWITCH: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_TASK_SWITCH)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitTaskSwitch(pVCpu, pSvmTransient); + } + + case SVM_EXIT_IRET: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_IRET)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitIret(pVCpu, pSvmTransient); + } + + case SVM_EXIT_SHUTDOWN: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_SHUTDOWN)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitShutdown(pVCpu, pSvmTransient); + } + + case SVM_EXIT_VMMCALL: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_VMMCALL)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitVmmCall(pVCpu, pSvmTransient); + } + + case SVM_EXIT_CLGI: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_CLGI)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitClgi(pVCpu, pSvmTransient); + } + + case SVM_EXIT_STGI: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, 
SVM_CTRL_INTERCEPT_STGI)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitStgi(pVCpu, pSvmTransient); + } + + case SVM_EXIT_VMLOAD: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_VMLOAD)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitVmload(pVCpu, pSvmTransient); + } + + case SVM_EXIT_VMSAVE: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_VMSAVE)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitVmsave(pVCpu, pSvmTransient); + } + + case SVM_EXIT_INVLPGA: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_INVLPGA)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitInvlpga(pVCpu, pSvmTransient); + } + + case SVM_EXIT_VMRUN: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_VMRUN)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + return hmR0SvmExitVmrun(pVCpu, pSvmTransient); + } + + case SVM_EXIT_RSM: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_RSM)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + hmR0SvmSetPendingXcptUD(pVCpu); + return VINF_SUCCESS; + } + + case SVM_EXIT_SKINIT: + { + if (HMIsGuestSvmCtrlInterceptSet(pVCpu, SVM_CTRL_INTERCEPT_SKINIT)) + NST_GST_VMEXIT_CALL_RET(pVCpu, uExitCode, uExitInfo1, uExitInfo2); + hmR0SvmSetPendingXcptUD(pVCpu); + return VINF_SUCCESS; + } + + case SVM_EXIT_NPF: + { + Assert(pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging); + return hmR0SvmExitNestedPF(pVCpu, pSvmTransient); + } + + case SVM_EXIT_INIT: /* We shouldn't get INIT signals while executing a nested-guest. 
*/ + return hmR0SvmExitUnexpected(pVCpu, pSvmTransient); + + default: + { + AssertMsgFailed(("hmR0SvmHandleExitNested: Unknown exit code %#x\n", pSvmTransient->u64ExitCode)); + pVCpu->hm.s.u32HMError = pSvmTransient->u64ExitCode; + return VERR_SVM_UNKNOWN_EXIT; + } + } + } + } + /* not reached */ + +#undef NST_GST_VMEXIT_CALL_RET +} +#endif + + +/** + * Handles a guest \#VMEXIT (for all EXITCODE values except SVM_EXIT_INVALID). + * + * @returns VBox status code (informational status codes included). + * @param pVCpu The cross context virtual CPU structure. + * @param pSvmTransient Pointer to the SVM transient structure. + */ +static int hmR0SvmHandleExit(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + Assert(pSvmTransient->u64ExitCode != SVM_EXIT_INVALID); + Assert(pSvmTransient->u64ExitCode <= SVM_EXIT_MAX); + +#ifdef DEBUG_ramshankar +# define VMEXIT_CALL_RET(a_fDbg, a_CallExpr) \ + do { \ + if ((a_fDbg) == 1) \ + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); \ + int rc = a_CallExpr; \ + if ((a_fDbg) == 1) \ + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); \ + return rc; \ + } while (0) +#else +# define VMEXIT_CALL_RET(a_fDbg, a_CallExpr) return a_CallExpr +#endif + + /* + * The ordering of the case labels is based on most-frequently-occurring #VMEXITs + * for most guests under normal workloads (for some definition of "normal"). 
+ */ + uint64_t const uExitCode = pSvmTransient->u64ExitCode; + switch (uExitCode) + { + case SVM_EXIT_NPF: VMEXIT_CALL_RET(0, hmR0SvmExitNestedPF(pVCpu, pSvmTransient)); + case SVM_EXIT_IOIO: VMEXIT_CALL_RET(0, hmR0SvmExitIOInstr(pVCpu, pSvmTransient)); + case SVM_EXIT_RDTSC: VMEXIT_CALL_RET(0, hmR0SvmExitRdtsc(pVCpu, pSvmTransient)); + case SVM_EXIT_RDTSCP: VMEXIT_CALL_RET(0, hmR0SvmExitRdtscp(pVCpu, pSvmTransient)); + case SVM_EXIT_CPUID: VMEXIT_CALL_RET(0, hmR0SvmExitCpuid(pVCpu, pSvmTransient)); + case SVM_EXIT_XCPT_PF: VMEXIT_CALL_RET(0, hmR0SvmExitXcptPF(pVCpu, pSvmTransient)); + case SVM_EXIT_MSR: VMEXIT_CALL_RET(0, hmR0SvmExitMsr(pVCpu, pSvmTransient)); + case SVM_EXIT_MONITOR: VMEXIT_CALL_RET(0, hmR0SvmExitMonitor(pVCpu, pSvmTransient)); + case SVM_EXIT_MWAIT: VMEXIT_CALL_RET(0, hmR0SvmExitMwait(pVCpu, pSvmTransient)); + case SVM_EXIT_HLT: VMEXIT_CALL_RET(0, hmR0SvmExitHlt(pVCpu, pSvmTransient)); + + case SVM_EXIT_XCPT_NMI: /* Should not occur, SVM_EXIT_NMI is used instead. */ + case SVM_EXIT_INTR: + case SVM_EXIT_NMI: VMEXIT_CALL_RET(0, hmR0SvmExitIntr(pVCpu, pSvmTransient)); + + case SVM_EXIT_READ_CR0: + case SVM_EXIT_READ_CR3: + case SVM_EXIT_READ_CR4: VMEXIT_CALL_RET(0, hmR0SvmExitReadCRx(pVCpu, pSvmTransient)); + + case SVM_EXIT_CR0_SEL_WRITE: + case SVM_EXIT_WRITE_CR0: + case SVM_EXIT_WRITE_CR3: + case SVM_EXIT_WRITE_CR4: + case SVM_EXIT_WRITE_CR8: VMEXIT_CALL_RET(0, hmR0SvmExitWriteCRx(pVCpu, pSvmTransient)); + + case SVM_EXIT_VINTR: VMEXIT_CALL_RET(0, hmR0SvmExitVIntr(pVCpu, pSvmTransient)); + case SVM_EXIT_PAUSE: VMEXIT_CALL_RET(0, hmR0SvmExitPause(pVCpu, pSvmTransient)); + case SVM_EXIT_VMMCALL: VMEXIT_CALL_RET(0, hmR0SvmExitVmmCall(pVCpu, pSvmTransient)); + case SVM_EXIT_INVLPG: VMEXIT_CALL_RET(0, hmR0SvmExitInvlpg(pVCpu, pSvmTransient)); + case SVM_EXIT_WBINVD: VMEXIT_CALL_RET(0, hmR0SvmExitWbinvd(pVCpu, pSvmTransient)); + case SVM_EXIT_INVD: VMEXIT_CALL_RET(0, hmR0SvmExitInvd(pVCpu, pSvmTransient)); + case SVM_EXIT_RDPMC: VMEXIT_CALL_RET(0, 
hmR0SvmExitRdpmc(pVCpu, pSvmTransient)); + case SVM_EXIT_IRET: VMEXIT_CALL_RET(0, hmR0SvmExitIret(pVCpu, pSvmTransient)); + case SVM_EXIT_XCPT_UD: VMEXIT_CALL_RET(0, hmR0SvmExitXcptUD(pVCpu, pSvmTransient)); + case SVM_EXIT_XCPT_MF: VMEXIT_CALL_RET(0, hmR0SvmExitXcptMF(pVCpu, pSvmTransient)); + case SVM_EXIT_XCPT_DB: VMEXIT_CALL_RET(0, hmR0SvmExitXcptDB(pVCpu, pSvmTransient)); + case SVM_EXIT_XCPT_AC: VMEXIT_CALL_RET(0, hmR0SvmExitXcptAC(pVCpu, pSvmTransient)); + case SVM_EXIT_XCPT_BP: VMEXIT_CALL_RET(0, hmR0SvmExitXcptBP(pVCpu, pSvmTransient)); + case SVM_EXIT_XCPT_GP: VMEXIT_CALL_RET(0, hmR0SvmExitXcptGP(pVCpu, pSvmTransient)); + case SVM_EXIT_XSETBV: VMEXIT_CALL_RET(0, hmR0SvmExitXsetbv(pVCpu, pSvmTransient)); + case SVM_EXIT_FERR_FREEZE: VMEXIT_CALL_RET(0, hmR0SvmExitFerrFreeze(pVCpu, pSvmTransient)); + + default: + { + switch (pSvmTransient->u64ExitCode) + { + case SVM_EXIT_READ_DR0: case SVM_EXIT_READ_DR1: case SVM_EXIT_READ_DR2: case SVM_EXIT_READ_DR3: + case SVM_EXIT_READ_DR6: case SVM_EXIT_READ_DR7: case SVM_EXIT_READ_DR8: case SVM_EXIT_READ_DR9: + case SVM_EXIT_READ_DR10: case SVM_EXIT_READ_DR11: case SVM_EXIT_READ_DR12: case SVM_EXIT_READ_DR13: + case SVM_EXIT_READ_DR14: case SVM_EXIT_READ_DR15: + VMEXIT_CALL_RET(0, hmR0SvmExitReadDRx(pVCpu, pSvmTransient)); + + case SVM_EXIT_WRITE_DR0: case SVM_EXIT_WRITE_DR1: case SVM_EXIT_WRITE_DR2: case SVM_EXIT_WRITE_DR3: + case SVM_EXIT_WRITE_DR6: case SVM_EXIT_WRITE_DR7: case SVM_EXIT_WRITE_DR8: case SVM_EXIT_WRITE_DR9: + case SVM_EXIT_WRITE_DR10: case SVM_EXIT_WRITE_DR11: case SVM_EXIT_WRITE_DR12: case SVM_EXIT_WRITE_DR13: + case SVM_EXIT_WRITE_DR14: case SVM_EXIT_WRITE_DR15: + VMEXIT_CALL_RET(0, hmR0SvmExitWriteDRx(pVCpu, pSvmTransient)); + + case SVM_EXIT_TASK_SWITCH: VMEXIT_CALL_RET(0, hmR0SvmExitTaskSwitch(pVCpu, pSvmTransient)); + case SVM_EXIT_SHUTDOWN: VMEXIT_CALL_RET(0, hmR0SvmExitShutdown(pVCpu, pSvmTransient)); + + case SVM_EXIT_SMI: + case SVM_EXIT_INIT: + { + /* + * We don't intercept SMIs. 
As for INIT signals, it really shouldn't ever happen here. + * If it ever does, we want to know about it so log the exit code and bail. + */ + VMEXIT_CALL_RET(0, hmR0SvmExitUnexpected(pVCpu, pSvmTransient)); + } + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + case SVM_EXIT_CLGI: VMEXIT_CALL_RET(0, hmR0SvmExitClgi(pVCpu, pSvmTransient)); + case SVM_EXIT_STGI: VMEXIT_CALL_RET(0, hmR0SvmExitStgi(pVCpu, pSvmTransient)); + case SVM_EXIT_VMLOAD: VMEXIT_CALL_RET(0, hmR0SvmExitVmload(pVCpu, pSvmTransient)); + case SVM_EXIT_VMSAVE: VMEXIT_CALL_RET(0, hmR0SvmExitVmsave(pVCpu, pSvmTransient)); + case SVM_EXIT_INVLPGA: VMEXIT_CALL_RET(0, hmR0SvmExitInvlpga(pVCpu, pSvmTransient)); + case SVM_EXIT_VMRUN: VMEXIT_CALL_RET(0, hmR0SvmExitVmrun(pVCpu, pSvmTransient)); +#else + case SVM_EXIT_CLGI: + case SVM_EXIT_STGI: + case SVM_EXIT_VMLOAD: + case SVM_EXIT_VMSAVE: + case SVM_EXIT_INVLPGA: + case SVM_EXIT_VMRUN: +#endif + case SVM_EXIT_RSM: + case SVM_EXIT_SKINIT: + { + hmR0SvmSetPendingXcptUD(pVCpu); + return VINF_SUCCESS; + } + +#ifdef HMSVM_ALWAYS_TRAP_ALL_XCPTS + case SVM_EXIT_XCPT_DE: + /* SVM_EXIT_XCPT_DB: */ /* Handled above. */ + /* SVM_EXIT_XCPT_NMI: */ /* Handled above. */ + /* SVM_EXIT_XCPT_BP: */ /* Handled above. */ + case SVM_EXIT_XCPT_OF: + case SVM_EXIT_XCPT_BR: + /* SVM_EXIT_XCPT_UD: */ /* Handled above. */ + case SVM_EXIT_XCPT_NM: + case SVM_EXIT_XCPT_DF: + case SVM_EXIT_XCPT_CO_SEG_OVERRUN: + case SVM_EXIT_XCPT_TS: + case SVM_EXIT_XCPT_NP: + case SVM_EXIT_XCPT_SS: + /* SVM_EXIT_XCPT_GP: */ /* Handled above. */ + /* SVM_EXIT_XCPT_PF: */ + case SVM_EXIT_XCPT_15: /* Reserved. */ + /* SVM_EXIT_XCPT_MF: */ /* Handled above. */ + /* SVM_EXIT_XCPT_AC: */ /* Handled above. 
*/ + case SVM_EXIT_XCPT_MC: + case SVM_EXIT_XCPT_XF: + case SVM_EXIT_XCPT_20: case SVM_EXIT_XCPT_21: case SVM_EXIT_XCPT_22: case SVM_EXIT_XCPT_23: + case SVM_EXIT_XCPT_24: case SVM_EXIT_XCPT_25: case SVM_EXIT_XCPT_26: case SVM_EXIT_XCPT_27: + case SVM_EXIT_XCPT_28: case SVM_EXIT_XCPT_29: case SVM_EXIT_XCPT_30: case SVM_EXIT_XCPT_31: + VMEXIT_CALL_RET(0, hmR0SvmExitXcptGeneric(pVCpu, pSvmTransient)); +#endif /* HMSVM_ALWAYS_TRAP_ALL_XCPTS */ + + default: + { + AssertMsgFailed(("hmR0SvmHandleExit: Unknown exit code %#RX64\n", uExitCode)); + pVCpu->hm.s.u32HMError = uExitCode; + return VERR_SVM_UNKNOWN_EXIT; + } + } + } + } + /* not reached */ +#undef VMEXIT_CALL_RET +} + + +#ifdef VBOX_STRICT +/* Is there some generic IPRT define for this that are not in Runtime/internal/\* ?? */ +# define HMSVM_ASSERT_PREEMPT_CPUID_VAR() \ + RTCPUID const idAssertCpu = RTThreadPreemptIsEnabled(NIL_RTTHREAD) ? NIL_RTCPUID : RTMpCpuId() + +# define HMSVM_ASSERT_PREEMPT_CPUID() \ + do \ + { \ + RTCPUID const idAssertCpuNow = RTThreadPreemptIsEnabled(NIL_RTTHREAD) ? NIL_RTCPUID : RTMpCpuId(); \ + AssertMsg(idAssertCpu == idAssertCpuNow, ("SVM %#x, %#x\n", idAssertCpu, idAssertCpuNow)); \ + } while (0) + +# define HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(a_pVCpu, a_pSvmTransient) \ + do { \ + AssertPtr((a_pVCpu)); \ + AssertPtr((a_pSvmTransient)); \ + Assert(ASMIntAreEnabled()); \ + HMSVM_ASSERT_PREEMPT_SAFE((a_pVCpu)); \ + HMSVM_ASSERT_PREEMPT_CPUID_VAR(); \ + Log4Func(("vcpu[%u] -v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-\n", (a_pVCpu)->idCpu)); \ + HMSVM_ASSERT_PREEMPT_SAFE((a_pVCpu)); \ + if (VMMR0IsLogFlushDisabled((a_pVCpu))) \ + HMSVM_ASSERT_PREEMPT_CPUID(); \ + } while (0) +#else +# define HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(a_pVCpu, a_pSvmTransient) \ + do { \ + RT_NOREF2(a_pVCpu, a_pSvmTransient); \ + } while (0) +#endif + + +/** + * Gets the IEM exception flags for the specified SVM event. + * + * @returns The IEM exception flags. 
+ * @param pEvent Pointer to the SVM event. + * + * @remarks This function currently only constructs flags required for + * IEMEvaluateRecursiveXcpt and not the complete flags (e.g. error-code + * and CR2 aspects of an exception are not included). + */ +static uint32_t hmR0SvmGetIemXcptFlags(PCSVMEVENT pEvent) +{ + uint8_t const uEventType = pEvent->n.u3Type; + uint32_t fIemXcptFlags; + switch (uEventType) + { + case SVM_EVENT_EXCEPTION: + /* + * Only INT3 and INTO instructions can raise #BP and #OF exceptions. + * See AMD spec. Table 8-1. "Interrupt Vector Source and Cause". + */ + if (pEvent->n.u8Vector == X86_XCPT_BP) + { + fIemXcptFlags = IEM_XCPT_FLAGS_T_SOFT_INT | IEM_XCPT_FLAGS_BP_INSTR; + break; + } + if (pEvent->n.u8Vector == X86_XCPT_OF) + { + fIemXcptFlags = IEM_XCPT_FLAGS_T_SOFT_INT | IEM_XCPT_FLAGS_OF_INSTR; + break; + } + /** @todo How do we distinguish ICEBP \#DB from the regular one? */ + RT_FALL_THRU(); + case SVM_EVENT_NMI: + fIemXcptFlags = IEM_XCPT_FLAGS_T_CPU_XCPT; + break; + + case SVM_EVENT_EXTERNAL_IRQ: + fIemXcptFlags = IEM_XCPT_FLAGS_T_EXT_INT; + break; + + case SVM_EVENT_SOFTWARE_INT: + fIemXcptFlags = IEM_XCPT_FLAGS_T_SOFT_INT; + break; + + default: + fIemXcptFlags = 0; + AssertMsgFailed(("Unexpected event type! uEventType=%#x uVector=%#x", uEventType, pEvent->n.u8Vector)); + break; + } + return fIemXcptFlags; +} + + +/** + * Handle a condition that occurred while delivering an event through the guest + * IDT. + * + * @returns VBox status code (informational error codes included). + * @retval VINF_SUCCESS if we should continue handling the \#VMEXIT. + * @retval VINF_HM_DOUBLE_FAULT if a \#DF condition was detected and we ought to + * continue execution of the guest which will delivery the \#DF. + * @retval VINF_EM_RESET if we detected a triple-fault condition. + * @retval VERR_EM_GUEST_CPU_HANG if we detected a guest CPU hang. + * + * @param pVCpu The cross context virtual CPU structure. 
+ * @param pSvmTransient Pointer to the SVM transient structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0SvmCheckExitDueToEventDelivery(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + int rc = VINF_SUCCESS; + PSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CR2); + + Log4(("EXITINTINFO: Pending vectoring event %#RX64 Valid=%RTbool ErrValid=%RTbool Err=%#RX32 Type=%u Vector=%u\n", + pVmcb->ctrl.ExitIntInfo.u, !!pVmcb->ctrl.ExitIntInfo.n.u1Valid, !!pVmcb->ctrl.ExitIntInfo.n.u1ErrorCodeValid, + pVmcb->ctrl.ExitIntInfo.n.u32ErrorCode, pVmcb->ctrl.ExitIntInfo.n.u3Type, pVmcb->ctrl.ExitIntInfo.n.u8Vector)); + + /* + * The EXITINTINFO (if valid) contains the prior exception (IDT vector) that was trying to + * be delivered to the guest which caused a #VMEXIT which was intercepted (Exit vector). + * + * See AMD spec. 15.7.3 "EXITINFO Pseudo-Code". + */ + if (pVmcb->ctrl.ExitIntInfo.n.u1Valid) + { + IEMXCPTRAISE enmRaise; + IEMXCPTRAISEINFO fRaiseInfo; + bool const fExitIsHwXcpt = pSvmTransient->u64ExitCode - SVM_EXIT_XCPT_0 <= SVM_EXIT_XCPT_31; + uint8_t const uIdtVector = pVmcb->ctrl.ExitIntInfo.n.u8Vector; + if (fExitIsHwXcpt) + { + uint8_t const uExitVector = pSvmTransient->u64ExitCode - SVM_EXIT_XCPT_0; + uint32_t const fIdtVectorFlags = hmR0SvmGetIemXcptFlags(&pVmcb->ctrl.ExitIntInfo); + uint32_t const fExitVectorFlags = IEM_XCPT_FLAGS_T_CPU_XCPT; + enmRaise = IEMEvaluateRecursiveXcpt(pVCpu, fIdtVectorFlags, uIdtVector, fExitVectorFlags, uExitVector, &fRaiseInfo); + } + else + { + /* + * If delivery of an event caused a #VMEXIT that is not an exception (e.g. #NPF) + * then we end up here. + * + * If the event was: + * - a software interrupt, we can re-execute the instruction which will + * regenerate the event. + * - an NMI, we need to clear NMI blocking and re-inject the NMI. + * - a hardware exception or external interrupt, we re-inject it. 
+ */ + fRaiseInfo = IEMXCPTRAISEINFO_NONE; + if (pVmcb->ctrl.ExitIntInfo.n.u3Type == SVM_EVENT_SOFTWARE_INT) + enmRaise = IEMXCPTRAISE_REEXEC_INSTR; + else + enmRaise = IEMXCPTRAISE_PREV_EVENT; + } + + switch (enmRaise) + { + case IEMXCPTRAISE_CURRENT_XCPT: + case IEMXCPTRAISE_PREV_EVENT: + { + /* For software interrupts, we shall re-execute the instruction. */ + if (!(fRaiseInfo & IEMXCPTRAISEINFO_SOFT_INT_XCPT)) + { + RTGCUINTPTR GCPtrFaultAddress = 0; + + /* If we are re-injecting an NMI, clear NMI blocking. */ + if (pVmcb->ctrl.ExitIntInfo.n.u3Type == SVM_EVENT_NMI) + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_BLOCK_NMIS); + + /* Determine a vectoring #PF condition, see comment in hmR0SvmExitXcptPF(). */ + if (fRaiseInfo & (IEMXCPTRAISEINFO_EXT_INT_PF | IEMXCPTRAISEINFO_NMI_PF)) + { + pSvmTransient->fVectoringPF = true; + Log4Func(("IDT: Pending vectoring #PF due to delivery of Ext-Int/NMI. uCR2=%#RX64\n", + pVCpu->cpum.GstCtx.cr2)); + } + else if ( pVmcb->ctrl.ExitIntInfo.n.u3Type == SVM_EVENT_EXCEPTION + && uIdtVector == X86_XCPT_PF) + { + /* + * If the previous exception was a #PF, we need to recover the CR2 value. + * This can't happen with shadow paging. + */ + GCPtrFaultAddress = pVCpu->cpum.GstCtx.cr2; + } + + /* + * Without nested paging, when uExitVector is #PF, CR2 value will be updated from the VMCB's + * exit info. fields, if it's a guest #PF, see hmR0SvmExitXcptPF(). 
+ */ + Assert(pVmcb->ctrl.ExitIntInfo.n.u3Type != SVM_EVENT_SOFTWARE_INT); + STAM_COUNTER_INC(&pVCpu->hm.s.StatInjectPendingReflect); + hmR0SvmSetPendingEvent(pVCpu, &pVmcb->ctrl.ExitIntInfo, GCPtrFaultAddress); + + Log4Func(("IDT: Pending vectoring event %#RX64 ErrValid=%RTbool Err=%#RX32 GCPtrFaultAddress=%#RX64\n", + pVmcb->ctrl.ExitIntInfo.u, RT_BOOL(pVmcb->ctrl.ExitIntInfo.n.u1ErrorCodeValid), + pVmcb->ctrl.ExitIntInfo.n.u32ErrorCode, GCPtrFaultAddress)); + } + break; + } + + case IEMXCPTRAISE_REEXEC_INSTR: + { + Assert(rc == VINF_SUCCESS); + break; + } + + case IEMXCPTRAISE_DOUBLE_FAULT: + { + /* + * Determing a vectoring double #PF condition. Used later, when PGM evaluates + * the second #PF as a guest #PF (and not a shadow #PF) and needs to be + * converted into a #DF. + */ + if (fRaiseInfo & IEMXCPTRAISEINFO_PF_PF) + { + Log4Func(("IDT: Pending vectoring double #PF uCR2=%#RX64\n", pVCpu->cpum.GstCtx.cr2)); + pSvmTransient->fVectoringDoublePF = true; + Assert(rc == VINF_SUCCESS); + } + else + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatInjectPendingReflect); + hmR0SvmSetPendingXcptDF(pVCpu); + rc = VINF_HM_DOUBLE_FAULT; + } + break; + } + + case IEMXCPTRAISE_TRIPLE_FAULT: + { + rc = VINF_EM_RESET; + break; + } + + case IEMXCPTRAISE_CPU_HANG: + { + rc = VERR_EM_GUEST_CPU_HANG; + break; + } + + default: + AssertMsgFailedBreakStmt(("Bogus enmRaise value: %d (%#x)\n", enmRaise, enmRaise), rc = VERR_SVM_IPE_2); + } + } + Assert(rc == VINF_SUCCESS || rc == VINF_HM_DOUBLE_FAULT || rc == VINF_EM_RESET || rc == VERR_EM_GUEST_CPU_HANG); + return rc; +} + + +/** + * Advances the guest RIP by the number of bytes specified in @a cb. + * + * @param pVCpu The cross context virtual CPU structure. + * @param cb RIP increment value in bytes. + */ +DECLINLINE(void) hmR0SvmAdvanceRip(PVMCPU pVCpu, uint32_t cb) +{ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + pCtx->rip += cb; + + /* Update interrupt shadow. 
*/ + if ( VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS) + && pCtx->rip != EMGetInhibitInterruptsPC(pVCpu)) + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS); +} + + +/* -=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= */ +/* -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- #VMEXIT handlers -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- */ +/* -=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= */ + +/** @name \#VMEXIT handlers. + * @{ + */ + +/** + * \#VMEXIT handler for external interrupts, NMIs, FPU assertion freeze and INIT + * signals (SVM_EXIT_INTR, SVM_EXIT_NMI, SVM_EXIT_FERR_FREEZE, SVM_EXIT_INIT). + */ +HMSVM_EXIT_DECL hmR0SvmExitIntr(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + if (pSvmTransient->u64ExitCode == SVM_EXIT_NMI) + STAM_REL_COUNTER_INC(&pVCpu->hm.s.StatExitHostNmiInGC); + else if (pSvmTransient->u64ExitCode == SVM_EXIT_INTR) + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitExtInt); + + /* + * AMD-V has no preemption timer and the generic periodic preemption timer has no way to + * signal -before- the timer fires if the current interrupt is our own timer or a some + * other host interrupt. We also cannot examine what interrupt it is until the host + * actually take the interrupt. + * + * Going back to executing guest code here unconditionally causes random scheduling + * problems (observed on an AMD Phenom 9850 Quad-Core on Windows 64-bit host). + */ + return VINF_EM_RAW_INTERRUPT; +} + + +/** + * \#VMEXIT handler for WBINVD (SVM_EXIT_WBINVD). Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitWbinvd(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK); + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedWbinvd(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for INVD (SVM_EXIT_INVD). Unconditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitInvd(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK); + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedInvd(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for INVD (SVM_EXIT_CPUID). Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitCpuid(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK | CPUMCTX_EXTRN_RAX | CPUMCTX_EXTRN_RCX); + VBOXSTRICTRC rcStrict; + PCEMEXITREC pExitRec = EMHistoryUpdateFlagsAndTypeAndPC(pVCpu, + EMEXIT_MAKE_FT(EMEXIT_F_KIND_EM | EMEXIT_F_HM, EMEXITTYPE_CPUID), + pVCpu->cpum.GstCtx.rip + pVCpu->cpum.GstCtx.cs.u64Base); + if (!pExitRec) + { + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedCpuid(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + } + else + { + /* + * Frequent exit or something needing probing. Get state and call EMHistoryExec. + */ + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + + Log4(("CpuIdExit/%u: %04x:%08RX64: %#x/%#x -> EMHistoryExec\n", + pVCpu->idCpu, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, pVCpu->cpum.GstCtx.eax, pVCpu->cpum.GstCtx.ecx)); + + rcStrict = EMHistoryExec(pVCpu, pExitRec, 0); + + Log4(("CpuIdExit/%u: %04x:%08RX64: EMHistoryExec -> %Rrc + %04x:%08RX64\n", + pVCpu->idCpu, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, + VBOXSTRICTRC_VAL(rcStrict), pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip)); + } + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for RDTSC (SVM_EXIT_RDTSC). Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitRdtsc(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK | CPUMCTX_EXTRN_CR4); + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedRdtsc(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_SUCCESS) + pSvmTransient->fUpdateTscOffsetting = true; + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for RDTSCP (SVM_EXIT_RDTSCP). Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitRdtscp(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK | CPUMCTX_EXTRN_CR4 | CPUMCTX_EXTRN_TSC_AUX); + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedRdtscp(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_SUCCESS) + pSvmTransient->fUpdateTscOffsetting = true; + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for RDPMC (SVM_EXIT_RDPMC). Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitRdpmc(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK | CPUMCTX_EXTRN_CR4); + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedRdpmc(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for INVLPG (SVM_EXIT_INVLPG). Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitInvlpg(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + Assert(!pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging); + + VBOXSTRICTRC rcStrict; + bool const fSupportsDecodeAssists = hmR0SvmSupportsDecodeAssists(pVCpu); + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if ( fSupportsDecodeAssists + && fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_MEM_MASK); + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + RTGCPTR const GCPtrPage = pVmcb->ctrl.u64ExitInfo1; + rcStrict = IEMExecDecodedInvlpg(pVCpu, cbInstr, GCPtrPage); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, 
HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_VAL(rcStrict); +} + + +/** + * \#VMEXIT handler for HLT (SVM_EXIT_HLT). Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitHlt(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK); + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedHlt(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rcStrict = IEMExecOne(pVCpu); + } + + if ( rcStrict == VINF_EM_HALT + || rcStrict == VINF_SUCCESS) + rcStrict = EMShouldContinueAfterHalt(pVCpu, &pVCpu->cpum.GstCtx) ? VINF_SUCCESS : VINF_EM_HALT; + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitHlt); + if (rcStrict != VINF_SUCCESS) + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchHltToR3); + return VBOXSTRICTRC_VAL(rcStrict);; +} + + +/** + * \#VMEXIT handler for MONITOR (SVM_EXIT_MONITOR). Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitMonitor(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + /* + * If the instruction length is supplied by the CPU is 3 bytes, we can be certain that no + * segment override prefix is present (and thus use the default segment DS). Otherwise, a + * segment override prefix or other prefixes might be used, in which case we fallback to + * IEMExecOne() to figure out. 
+ */ + VBOXSTRICTRC rcStrict; + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const cbInstr = hmR0SvmSupportsNextRipSave(pVCpu) ? pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip : 0; + if (cbInstr) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_MEM_MASK | CPUMCTX_EXTRN_DS); + rcStrict = IEMExecDecodedMonitor(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitMonitor); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for MWAIT (SVM_EXIT_MWAIT). Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitMwait(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK); + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedMwait(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rcStrict = IEMExecOne(pVCpu); + } + + if ( rcStrict == VINF_EM_HALT + && EMMonitorWaitShouldContinue(pVCpu, &pVCpu->cpum.GstCtx)) + rcStrict = VINF_SUCCESS; + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitMwait); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for shutdown (triple-fault) (SVM_EXIT_SHUTDOWN). 
Conditional + * \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitShutdown(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + return VINF_EM_RESET; +} + + +/** + * \#VMEXIT handler for unexpected exits. Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitUnexpected(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + AssertMsgFailed(("hmR0SvmExitUnexpected: ExitCode=%#RX64 uExitInfo1=%#RX64 uExitInfo2=%#RX64\n", pSvmTransient->u64ExitCode, + pVmcb->ctrl.u64ExitInfo1, pVmcb->ctrl.u64ExitInfo2)); + RT_NOREF(pVmcb); + pVCpu->hm.s.u32HMError = (uint32_t)pSvmTransient->u64ExitCode; + return VERR_SVM_UNEXPECTED_EXIT; +} + + +/** + * \#VMEXIT handler for CRx reads (SVM_EXIT_READ_CR*). Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitReadCRx(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + Log4Func(("CS:RIP=%04x:%#RX64\n", pCtx->cs.Sel, pCtx->rip)); +#ifdef VBOX_WITH_STATISTICS + switch (pSvmTransient->u64ExitCode) + { + case SVM_EXIT_READ_CR0: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR0Read); break; + case SVM_EXIT_READ_CR2: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR2Read); break; + case SVM_EXIT_READ_CR3: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR3Read); break; + case SVM_EXIT_READ_CR4: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR4Read); break; + case SVM_EXIT_READ_CR8: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR8Read); break; + } +#endif + + bool const fSupportsDecodeAssists = hmR0SvmSupportsDecodeAssists(pVCpu); + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if ( fSupportsDecodeAssists + && fSupportsNextRipSave) + { + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + bool const fMovCRx = RT_BOOL(pVmcb->ctrl.u64ExitInfo1 & 
SVM_EXIT1_MOV_CRX_MASK); + if (fMovCRx) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK | CPUMCTX_EXTRN_CR_MASK + | CPUMCTX_EXTRN_APIC_TPR); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pCtx->rip; + uint8_t const iCrReg = pSvmTransient->u64ExitCode - SVM_EXIT_READ_CR0; + uint8_t const iGReg = pVmcb->ctrl.u64ExitInfo1 & SVM_EXIT1_MOV_CRX_GPR_NUMBER; + VBOXSTRICTRC rcStrict = IEMExecDecodedMovCRxRead(pVCpu, cbInstr, iGReg, iCrReg); + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_VAL(rcStrict); + } + /* else: SMSW instruction, fall back below to IEM for this. */ + } + + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + VBOXSTRICTRC rcStrict = IEMExecOne(pVCpu); + AssertMsg( rcStrict == VINF_SUCCESS + || rcStrict == VINF_PGM_SYNC_CR3 + || rcStrict == VINF_IEM_RAISED_XCPT, + ("hmR0SvmExitReadCRx: IEMExecOne failed rc=%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); + Assert((pSvmTransient->u64ExitCode - SVM_EXIT_READ_CR0) <= 15); + if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for CRx writes (SVM_EXIT_WRITE_CR*). Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitWriteCRx(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + uint64_t const uExitCode = pSvmTransient->u64ExitCode; + uint8_t const iCrReg = uExitCode == SVM_EXIT_CR0_SEL_WRITE ? 
0 : (pSvmTransient->u64ExitCode - SVM_EXIT_WRITE_CR0); + Assert(iCrReg <= 15); + + VBOXSTRICTRC rcStrict = VERR_SVM_IPE_5; + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + bool fDecodedInstr = false; + bool const fSupportsDecodeAssists = hmR0SvmSupportsDecodeAssists(pVCpu); + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if ( fSupportsDecodeAssists + && fSupportsNextRipSave) + { + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + bool const fMovCRx = RT_BOOL(pVmcb->ctrl.u64ExitInfo1 & SVM_EXIT1_MOV_CRX_MASK); + if (fMovCRx) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_MEM_MASK | CPUMCTX_EXTRN_CR3 | CPUMCTX_EXTRN_CR4 + | CPUMCTX_EXTRN_APIC_TPR); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pCtx->rip; + uint8_t const iGReg = pVmcb->ctrl.u64ExitInfo1 & SVM_EXIT1_MOV_CRX_GPR_NUMBER; + Log4Func(("Mov CR%u w/ iGReg=%#x\n", iCrReg, iGReg)); + rcStrict = IEMExecDecodedMovCRxWrite(pVCpu, cbInstr, iCrReg, iGReg); + fDecodedInstr = true; + } + /* else: LMSW or CLTS instruction, fall back below to IEM for this. 
*/ + } + + if (!fDecodedInstr) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + Log4Func(("iCrReg=%#x\n", iCrReg)); + rcStrict = IEMExecOne(pVCpu); + if (RT_UNLIKELY( rcStrict == VERR_IEM_ASPECT_NOT_IMPLEMENTED + || rcStrict == VERR_IEM_INSTR_NOT_IMPLEMENTED)) + rcStrict = VERR_EM_INTERPRETER; + } + + if (rcStrict == VINF_SUCCESS) + { + switch (iCrReg) + { + case 0: + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_CR0); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR0Write); + break; + + case 2: + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_CR2); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR2Write); + break; + + case 3: + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_CR3); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR3Write); + break; + + case 4: + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_CR4); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR4Write); + break; + + case 8: + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_APIC_TPR); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR8Write); + break; + + default: + { + AssertMsgFailed(("hmR0SvmExitWriteCRx: Invalid/Unexpected Write-CRx exit. u64ExitCode=%#RX64 %#x\n", + pSvmTransient->u64ExitCode, iCrReg)); + break; + } + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + } + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + } + else + Assert(rcStrict == VERR_EM_INTERPRETER || rcStrict == VINF_PGM_SYNC_CR3); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT helper for read MSRs, see hmR0SvmExitMsr. + * + * @returns Strict VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. 
+ */ +static VBOXSTRICTRC hmR0SvmExitReadMsr(PVMCPU pVCpu, PSVMVMCB pVmcb) +{ + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitRdmsr); + Log4Func(("idMsr=%#RX32\n", pVCpu->cpum.GstCtx.ecx)); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + /** @todo Optimize this: Only retrieve the MSR bits we need here. CPUMAllMsrs.cpp + * can ask for what it needs instead of using CPUMCTX_EXTRN_ALL_MSRS. */ + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK | CPUMCTX_EXTRN_ALL_MSRS); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedRdmsr(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK | CPUMCTX_EXTRN_ALL_MSRS); + rcStrict = IEMExecOne(pVCpu); + } + + AssertMsg( rcStrict == VINF_SUCCESS + || rcStrict == VINF_IEM_RAISED_XCPT + || rcStrict == VINF_CPUM_R3_MSR_READ, + ("hmR0SvmExitReadMsr: Unexpected status %Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); + + if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return rcStrict; +} + + +/** + * \#VMEXIT helper for write MSRs, see hmR0SvmExitMsr. + * + * @returns Strict VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param pVmcb Pointer to the VM control block. + * @param pSvmTransient Pointer to the SVM-transient structure. + */ +static VBOXSTRICTRC hmR0SvmExitWriteMsr(PVMCPU pVCpu, PSVMVMCB pVmcb, PSVMTRANSIENT pSvmTransient) +{ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + uint32_t const idMsr = pCtx->ecx; + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitWrmsr); + Log4Func(("idMsr=%#RX32\n", idMsr)); + + /* + * Handle TPR patching MSR writes. + * We utilitize the LSTAR MSR for patching. 
+ */ + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if ( pVCpu->CTX_SUFF(pVM)->hm.s.fTPRPatchingActive + && idMsr == MSR_K8_LSTAR) + { + unsigned cbInstr; + if (fSupportsNextRipSave) + cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + else + { + PDISCPUSTATE pDis = &pVCpu->hm.s.DisState; + int rc = EMInterpretDisasCurrent(pVCpu->CTX_SUFF(pVM), pVCpu, pDis, &cbInstr); + if ( rc == VINF_SUCCESS + && pDis->pCurInstr->uOpcode == OP_WRMSR) + Assert(cbInstr > 0); + else + cbInstr = 0; + } + + /* Our patch code uses LSTAR for TPR caching for 32-bit guests. */ + if ((pCtx->eax & 0xff) != pSvmTransient->u8GuestTpr) + { + int rc = APICSetTpr(pVCpu, pCtx->eax & 0xff); + AssertRCReturn(rc, rc); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_APIC_TPR); + } + + int rc = VINF_SUCCESS; + hmR0SvmAdvanceRip(pVCpu, cbInstr); + HMSVM_CHECK_SINGLE_STEP(pVCpu, rc); + return rc; + } + + /* + * Handle regular MSR writes. + */ + VBOXSTRICTRC rcStrict; + if (fSupportsNextRipSave) + { + /** @todo Optimize this: We don't need to get much of the MSR state here + * since we're only updating. CPUMAllMsrs.cpp can ask for what it needs and + * clear the applicable extern flags. */ + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK | CPUMCTX_EXTRN_ALL_MSRS); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedWrmsr(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK | CPUMCTX_EXTRN_ALL_MSRS); + rcStrict = IEMExecOne(pVCpu); + } + + AssertMsg( rcStrict == VINF_SUCCESS + || rcStrict == VINF_IEM_RAISED_XCPT + || rcStrict == VINF_CPUM_R3_MSR_WRITE, + ("hmR0SvmExitWriteMsr: Unexpected status %Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); + + if (rcStrict == VINF_SUCCESS) + { + /* If this is an X2APIC WRMSR access, update the APIC TPR state. 
*/ + if ( idMsr >= MSR_IA32_X2APIC_START + && idMsr <= MSR_IA32_X2APIC_END) + { + /* + * We've already saved the APIC related guest-state (TPR) in hmR0SvmPostRunGuest(). + * When full APIC register virtualization is implemented we'll have to make sure + * APIC state is saved from the VMCB before IEM changes it. + */ + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_APIC_TPR); + } + else + { + switch (idMsr) + { + case MSR_IA32_TSC: pSvmTransient->fUpdateTscOffsetting = true; break; + case MSR_K6_EFER: ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_EFER_MSR); break; + case MSR_K8_FS_BASE: ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_FS); break; + case MSR_K8_GS_BASE: ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_GS); break; + case MSR_IA32_SYSENTER_CS: ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_SYSENTER_CS_MSR); break; + case MSR_IA32_SYSENTER_EIP: ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_SYSENTER_EIP_MSR); break; + case MSR_IA32_SYSENTER_ESP: ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_SYSENTER_ESP_MSR); break; + } + } + } + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return rcStrict; +} + + +/** + * \#VMEXIT handler for MSR read and writes (SVM_EXIT_MSR). Conditional + * \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitMsr(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + PSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + if (pVmcb->ctrl.u64ExitInfo1 == SVM_EXIT1_MSR_READ) + return VBOXSTRICTRC_TODO(hmR0SvmExitReadMsr(pVCpu, pVmcb)); + + Assert(pVmcb->ctrl.u64ExitInfo1 == SVM_EXIT1_MSR_WRITE); + return VBOXSTRICTRC_TODO(hmR0SvmExitWriteMsr(pVCpu, pVmcb, pSvmTransient)); +} + + +/** + * \#VMEXIT handler for DRx read (SVM_EXIT_READ_DRx). Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitReadDRx(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitDRxRead); + + /** @todo Stepping with nested-guest. */ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if (!CPUMIsGuestInSvmNestedHwVirtMode(pCtx)) + { + /* We should -not- get this #VMEXIT if the guest's debug registers were active. */ + if (pSvmTransient->fWasGuestDebugStateActive) + { + AssertMsgFailed(("hmR0SvmExitReadDRx: Unexpected exit %#RX32\n", (uint32_t)pSvmTransient->u64ExitCode)); + pVCpu->hm.s.u32HMError = (uint32_t)pSvmTransient->u64ExitCode; + return VERR_SVM_UNEXPECTED_EXIT; + } + + /* + * Lazy DR0-3 loading: the hypervisor debug state was not active for this run, so stop + * intercepting DRx accesses, load the guest debug registers and return VINF_SUCCESS so + * the guest re-executes the MOV DRx instruction. + */ + if (!pSvmTransient->fWasHyperDebugStateActive) + { + Assert(!DBGFIsStepping(pVCpu)); Assert(!pVCpu->hm.s.fSingleInstruction); + Log5(("hmR0SvmExitReadDRx: Lazy loading guest debug registers\n")); + + /* Don't intercept DRx reads and writes; clear the intercepts clean bit since the VMCB intercept fields changed. */ + PSVMVMCB pVmcb = pVCpu->hm.s.svm.pVmcb; + pVmcb->ctrl.u16InterceptRdDRx = 0; + pVmcb->ctrl.u16InterceptWrDRx = 0; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_INTERCEPTS; + + /* We're playing with the host CPU state here, make sure we don't preempt or longjmp. */ + VMMRZCallRing3Disable(pVCpu); + HM_DISABLE_PREEMPT(pVCpu); + + /* Save the host & load the guest debug state, restart execution of the MOV DRx instruction. */ + CPUMR0LoadGuestDebugState(pVCpu, false /* include DR6 */); + Assert(CPUMIsGuestDebugStateActive(pVCpu) || HC_ARCH_BITS == 32); + + HM_RESTORE_PREEMPT(); + VMMRZCallRing3Enable(pVCpu); + + STAM_COUNTER_INC(&pVCpu->hm.s.StatDRxContextSwitch); + return VINF_SUCCESS; + } + } + + /* + * Interpret the reading/writing of DRx. + */ + /** @todo Decode assist. 
*/ + VBOXSTRICTRC rc = EMInterpretInstruction(pVCpu, CPUMCTX2CORE(pCtx), 0 /* pvFault */); + Log5(("hmR0SvmExitReadDRx: Emulated DRx access: rc=%Rrc\n", VBOXSTRICTRC_VAL(rc))); + if (RT_LIKELY(rc == VINF_SUCCESS)) + { + /* Not necessary for read accesses but whatever doesn't hurt for now, will be fixed with decode assist. */ + /** @todo CPUM should set this flag! */ + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_DR_MASK); + HMSVM_CHECK_SINGLE_STEP(pVCpu, rc); + } + else + Assert(rc == VERR_EM_INTERPRETER); + return VBOXSTRICTRC_TODO(rc); +} + + +/** + * \#VMEXIT handler for DRx write (SVM_EXIT_WRITE_DRx). Conditional \#VMEXIT. + * + * Delegates to hmR0SvmExitReadDRx() and returns its status, then moves the exit + * count from the DRx-read statistic to the DRx-write statistic. + */ +HMSVM_EXIT_DECL hmR0SvmExitWriteDRx(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + /* For now it's the same since we interpret the instruction anyway. This will change once use of Decode Assist is implemented. */ + int rc = hmR0SvmExitReadDRx(pVCpu, pSvmTransient); + /* The shared handler counted this exit as a DRx read; correct the statistics. */ STAM_COUNTER_INC(&pVCpu->hm.s.StatExitDRxWrite); + STAM_COUNTER_DEC(&pVCpu->hm.s.StatExitDRxRead); + return rc; +} + + +/** + * \#VMEXIT handler for XCRx write (SVM_EXIT_XSETBV). Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitXsetbv(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + + /** @todo decode assists... 
*/ + VBOXSTRICTRC rcStrict = IEMExecOne(pVCpu); + if (RT_LIKELY(rcStrict == VINF_SUCCESS)) + { + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + /* Set only when the guest has CR4.OSXSAVE enabled and its XCR0 differs from what ASMGetXcr0() returns. */ pVCpu->hm.s.fLoadSaveGuestXcr0 = (pCtx->cr4 & X86_CR4_OSXSAVE) && pCtx->aXcr[0] != ASMGetXcr0(); + Log4Func(("New XCR0=%#RX64 fLoadSaveGuestXcr0=%RTbool (cr4=%#RX64)\n", pCtx->aXcr[0], pVCpu->hm.s.fLoadSaveGuestXcr0, + pCtx->cr4)); + } + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + /* IEM raised an exception: report success and flag the HM_CHANGED_RAISED_XCPT_MASK state as changed. */ rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for I/O instructions (SVM_EXIT_IOIO). Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitIOInstr(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK | CPUMCTX_EXTRN_SREG_MASK); + + /* I/O operation lookup arrays, indexed by the 3-bit size field extracted from EXITINFO1 below. */ + static uint32_t const s_aIOSize[8] = { 0, 1, 2, 0, 4, 0, 0, 0 }; /* Size of the I/O accesses in bytes (0 = invalid encoding, rejected below). */ + static uint32_t const s_aIOOpAnd[8] = { 0, 0xff, 0xffff, 0, 0xffffffff, 0, 0, 0 }; /* AND masks for saving + the result (in AL/AX/EAX). */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + PSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + + Log4Func(("CS:RIP=%04x:%#RX64\n", pCtx->cs.Sel, pCtx->rip)); + + /* Refer to AMD spec. 15.10.2 "IN and OUT Behaviour" and Figure 15-2. "EXITINFO1 for IOIO Intercept" for the format. */ + SVMIOIOEXITINFO IoExitInfo; + IoExitInfo.u = (uint32_t)pVmcb->ctrl.u64ExitInfo1; + uint32_t uIOWidth = (IoExitInfo.u >> 4) & 0x7; + uint32_t cbValue = s_aIOSize[uIOWidth]; + uint32_t uAndVal = s_aIOOpAnd[uIOWidth]; + + if (RT_UNLIKELY(!cbValue)) + { + AssertMsgFailed(("hmR0SvmExitIOInstr: Invalid IO operation. 
uIOWidth=%u\n", uIOWidth)); + return VERR_EM_INTERPRETER; + } + + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_RIP | CPUMCTX_EXTRN_RFLAGS); + VBOXSTRICTRC rcStrict; + PCEMEXITREC pExitRec = NULL; + if ( !pVCpu->hm.s.fSingleInstruction + && !pVCpu->cpum.GstCtx.eflags.Bits.u1TF) + pExitRec = EMHistoryUpdateFlagsAndTypeAndPC(pVCpu, + !IoExitInfo.n.u1Str + ? IoExitInfo.n.u1Type == SVM_IOIO_READ + ? EMEXIT_MAKE_FT(EMEXIT_F_KIND_EM | EMEXIT_F_HM, EMEXITTYPE_IO_PORT_READ) + : EMEXIT_MAKE_FT(EMEXIT_F_KIND_EM | EMEXIT_F_HM, EMEXITTYPE_IO_PORT_WRITE) + : IoExitInfo.n.u1Type == SVM_IOIO_READ + ? EMEXIT_MAKE_FT(EMEXIT_F_KIND_EM | EMEXIT_F_HM, EMEXITTYPE_IO_PORT_STR_READ) + : EMEXIT_MAKE_FT(EMEXIT_F_KIND_EM | EMEXIT_F_HM, EMEXITTYPE_IO_PORT_STR_WRITE), + pVCpu->cpum.GstCtx.rip + pVCpu->cpum.GstCtx.cs.u64Base); + if (!pExitRec) + { + bool fUpdateRipAlready = false; + if (IoExitInfo.n.u1Str) + { + /* INS/OUTS - I/O String instruction. */ + /** @todo Huh? why can't we use the segment prefix information given by AMD-V + * in EXITINFO1? Investigate once this thing is up and running. */ + Log4Func(("CS:RIP=%04x:%08RX64 %#06x/%u %c str\n", pCtx->cs.Sel, pCtx->rip, IoExitInfo.n.u16Port, cbValue, + IoExitInfo.n.u1Type == SVM_IOIO_WRITE ? 'w' : 'r')); + AssertReturn(pCtx->dx == IoExitInfo.n.u16Port, VERR_SVM_IPE_2); + static IEMMODE const s_aenmAddrMode[8] = + { + (IEMMODE)-1, IEMMODE_16BIT, IEMMODE_32BIT, (IEMMODE)-1, IEMMODE_64BIT, (IEMMODE)-1, (IEMMODE)-1, (IEMMODE)-1 + }; + IEMMODE enmAddrMode = s_aenmAddrMode[(IoExitInfo.u >> 7) & 0x7]; + if (enmAddrMode != (IEMMODE)-1) + { + uint64_t cbInstr = pVmcb->ctrl.u64ExitInfo2 - pCtx->rip; + if (cbInstr <= 15 && cbInstr >= 1) + { + Assert(cbInstr >= 1U + IoExitInfo.n.u1Rep); + if (IoExitInfo.n.u1Type == SVM_IOIO_WRITE) + { + /* Don't know exactly how to detect whether u3Seg is valid, currently + only enabling it for Bulldozer and later with NRIP. OS/2 broke on + 2384 Opterons when only checking NRIP. 
*/ + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if ( fSupportsNextRipSave + && pVM->cpum.ro.GuestFeatures.enmMicroarch >= kCpumMicroarch_AMD_15h_First) + { + AssertMsg(IoExitInfo.n.u3Seg == X86_SREG_DS || cbInstr > 1U + IoExitInfo.n.u1Rep, + ("u32Seg=%d cbInstr=%d u1REP=%d", IoExitInfo.n.u3Seg, cbInstr, IoExitInfo.n.u1Rep)); + rcStrict = IEMExecStringIoWrite(pVCpu, cbValue, enmAddrMode, IoExitInfo.n.u1Rep, (uint8_t)cbInstr, + IoExitInfo.n.u3Seg, true /*fIoChecked*/); + } + else if (cbInstr == 1U + IoExitInfo.n.u1Rep) + rcStrict = IEMExecStringIoWrite(pVCpu, cbValue, enmAddrMode, IoExitInfo.n.u1Rep, (uint8_t)cbInstr, + X86_SREG_DS, true /*fIoChecked*/); + else + rcStrict = IEMExecOne(pVCpu); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitIOStringWrite); + } + else + { + AssertMsg(IoExitInfo.n.u3Seg == X86_SREG_ES /*=0*/, ("%#x\n", IoExitInfo.n.u3Seg)); + rcStrict = IEMExecStringIoRead(pVCpu, cbValue, enmAddrMode, IoExitInfo.n.u1Rep, (uint8_t)cbInstr, + true /*fIoChecked*/); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitIOStringRead); + } + } + else + { + AssertMsgFailed(("rip=%RX64 nrip=%#RX64 cbInstr=%#RX64\n", pCtx->rip, pVmcb->ctrl.u64ExitInfo2, cbInstr)); + rcStrict = IEMExecOne(pVCpu); + } + } + else + { + AssertMsgFailed(("IoExitInfo=%RX64\n", IoExitInfo.u)); + rcStrict = IEMExecOne(pVCpu); + } + fUpdateRipAlready = true; + } + else + { + /* IN/OUT - I/O instruction. 
*/ + Assert(!IoExitInfo.n.u1Rep); + + /* EXITINFO2 holds the RIP following the I/O instruction, so the difference is the instruction length. */ uint8_t const cbInstr = pVmcb->ctrl.u64ExitInfo2 - pCtx->rip; + if (IoExitInfo.n.u1Type == SVM_IOIO_WRITE) + { + rcStrict = IOMIOPortWrite(pVM, pVCpu, IoExitInfo.n.u16Port, pCtx->eax & uAndVal, cbValue); + if ( rcStrict == VINF_IOM_R3_IOPORT_WRITE + && !pCtx->eflags.Bits.u1TF) + rcStrict = EMRZSetPendingIoPortWrite(pVCpu, IoExitInfo.n.u16Port, cbInstr, cbValue, pCtx->eax & uAndVal); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitIOWrite); + } + else + { + uint32_t u32Val = 0; + rcStrict = IOMIOPortRead(pVM, pVCpu, IoExitInfo.n.u16Port, &u32Val, cbValue); + if (IOM_SUCCESS(rcStrict)) + { + /* Save result of I/O IN instr. in AL/AX/EAX. */ + /** @todo r=bird: 32-bit op size should clear high bits of rax! */ + pCtx->eax = (pCtx->eax & ~uAndVal) | (u32Val & uAndVal); + } + else if ( rcStrict == VINF_IOM_R3_IOPORT_READ + && !pCtx->eflags.Bits.u1TF) + rcStrict = EMRZSetPendingIoPortRead(pVCpu, IoExitInfo.n.u16Port, cbInstr, cbValue); + + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitIORead); + } + } + + if (IOM_SUCCESS(rcStrict)) + { + /* AMD-V saves the RIP of the instruction following the IO instruction in EXITINFO2. */ + if (!fUpdateRipAlready) + pCtx->rip = pVmcb->ctrl.u64ExitInfo2; + + /* + * If any I/O breakpoints are armed, we need to check if one triggered + * and take appropriate action. + * Note that the I/O breakpoint type is undefined if CR4.DE is 0. + */ + /** @todo Optimize away the DBGFBpIsHwIoArmed call by having DBGF tell the + * execution engines about whether hyper BPs and such are pending. */ + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_DR7); + uint32_t const uDr7 = pCtx->dr[7]; + if (RT_UNLIKELY( ( (uDr7 & X86_DR7_ENABLED_MASK) + && X86_DR7_ANY_RW_IO(uDr7) + && (pCtx->cr4 & X86_CR4_DE)) + || DBGFBpIsHwIoArmed(pVM))) + { + /* We're playing with the host CPU state here, make sure we don't preempt or longjmp. 
*/ + VMMRZCallRing3Disable(pVCpu); + HM_DISABLE_PREEMPT(pVCpu); + + STAM_COUNTER_INC(&pVCpu->hm.s.StatDRxIoCheck); + CPUMR0DebugStateMaybeSaveGuest(pVCpu, false /*fDr6*/); + + VBOXSTRICTRC rcStrict2 = DBGFBpCheckIo(pVM, pVCpu, &pVCpu->cpum.GstCtx, IoExitInfo.n.u16Port, cbValue); + if (rcStrict2 == VINF_EM_RAW_GUEST_TRAP) + { + /* Raise #DB: copy DR6/DR7 from the guest context into the VMCB and clear the DRx clean bit. */ + pVmcb->guest.u64DR6 = pCtx->dr[6]; + pVmcb->guest.u64DR7 = pCtx->dr[7]; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_DRX; + hmR0SvmSetPendingXcptDB(pVCpu); + } + /* rcStrict is VINF_SUCCESS, VINF_IOM_R3_IOPORT_COMMIT_WRITE, or in [VINF_EM_FIRST..VINF_EM_LAST], + however we can ditch VINF_IOM_R3_IOPORT_COMMIT_WRITE as it has VMCPU_FF_IOM as backup. */ + else if ( rcStrict2 != VINF_SUCCESS + && (rcStrict == VINF_SUCCESS || rcStrict2 < rcStrict)) + rcStrict = rcStrict2; + AssertCompile(VINF_EM_LAST < VINF_IOM_R3_IOPORT_COMMIT_WRITE); + + HM_RESTORE_PREEMPT(); + VMMRZCallRing3Enable(pVCpu); + } + + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + } + +#ifdef VBOX_STRICT + if ( rcStrict == VINF_IOM_R3_IOPORT_READ + || rcStrict == VINF_EM_PENDING_R3_IOPORT_READ) + Assert(IoExitInfo.n.u1Type == SVM_IOIO_READ); + else if ( rcStrict == VINF_IOM_R3_IOPORT_WRITE + || rcStrict == VINF_IOM_R3_IOPORT_COMMIT_WRITE + || rcStrict == VINF_EM_PENDING_R3_IOPORT_WRITE) + Assert(IoExitInfo.n.u1Type == SVM_IOIO_WRITE); + else + { + /** @todo r=bird: This is missing a bunch of VINF_EM_FIRST..VINF_EM_LAST + * statuses, that the VMM device and some others may return. See + * IOM_SUCCESS() for guidance. */ + AssertMsg( RT_FAILURE(rcStrict) + || rcStrict == VINF_SUCCESS + || rcStrict == VINF_EM_RAW_EMULATE_INSTR + || rcStrict == VINF_EM_DBG_BREAKPOINT + || rcStrict == VINF_EM_RAW_GUEST_TRAP + || rcStrict == VINF_EM_RAW_TO_R3 + || rcStrict == VINF_TRPM_XCPT_DISPATCHED + || rcStrict == VINF_EM_TRIPLE_FAULT, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); + } +#endif + } + else + { + /* + * Frequent exit or something needing probing. 
Get state and call EMHistoryExec. + */ + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + STAM_COUNTER_INC(!IoExitInfo.n.u1Str + ? IoExitInfo.n.u1Type == SVM_IOIO_WRITE ? &pVCpu->hm.s.StatExitIOWrite : &pVCpu->hm.s.StatExitIORead + : IoExitInfo.n.u1Type == SVM_IOIO_WRITE ? &pVCpu->hm.s.StatExitIOStringWrite : &pVCpu->hm.s.StatExitIOStringRead); + Log4(("IOExit/%u: %04x:%08RX64: %s%s%s %#x LB %u -> EMHistoryExec\n", + pVCpu->idCpu, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, IoExitInfo.n.u1Rep ? "REP " : "", + IoExitInfo.n.u1Type == SVM_IOIO_WRITE ? "OUT" : "IN", IoExitInfo.n.u1Str ? "S" : "", IoExitInfo.n.u16Port, uIOWidth)); + + rcStrict = EMHistoryExec(pVCpu, pExitRec, 0); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); + + Log4(("IOExit/%u: %04x:%08RX64: EMHistoryExec -> %Rrc + %04x:%08RX64\n", + pVCpu->idCpu, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, + VBOXSTRICTRC_VAL(rcStrict), pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip)); + } + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for Nested Page-faults (SVM_EXIT_NPF). Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitNestedPF(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + HMSVM_CHECK_EXIT_DUE_TO_EVENT_DELIVERY(pVCpu, pSvmTransient); + + PVM pVM = pVCpu->CTX_SUFF(pVM); + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + Assert(pVM->hm.s.fNestedPaging); + + /* See AMD spec. 15.25.6 "Nested versus Guest Page Faults, Fault Ordering" for VMCB details for #NPF. */ + PSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + RTGCPHYS GCPhysFaultAddr = pVmcb->ctrl.u64ExitInfo2; + uint32_t u32ErrCode = pVmcb->ctrl.u64ExitInfo1; /* Note! High bits in EXITINFO1 may contain additional info and are + thus intentionally not copied into u32ErrCode. 
*/ + + Log4Func(("#NPF at CS:RIP=%04x:%#RX64 GCPhysFaultAddr=%RGp ErrCode=%#x \n", pCtx->cs.Sel, pCtx->rip, GCPhysFaultAddr, + u32ErrCode)); + + /* + * TPR patching for 32-bit guests, using the reserved bit in the page tables for MMIO regions. + */ + if ( pVM->hm.s.fTprPatchingAllowed + && (GCPhysFaultAddr & PAGE_OFFSET_MASK) == XAPIC_OFF_TPR + && ( !(u32ErrCode & X86_TRAP_PF_P) /* Not present */ + || (u32ErrCode & (X86_TRAP_PF_P | X86_TRAP_PF_RSVD)) == (X86_TRAP_PF_P | X86_TRAP_PF_RSVD)) /* MMIO page. */ + && !CPUMIsGuestInSvmNestedHwVirtMode(pCtx) + && !CPUMIsGuestInLongModeEx(pCtx) + && !CPUMGetGuestCPL(pVCpu) + && pVM->hm.s.cPatches < RT_ELEMENTS(pVM->hm.s.aPatches)) + { + RTGCPHYS GCPhysApicBase = APICGetBaseMsrNoCheck(pVCpu); + GCPhysApicBase &= PAGE_BASE_GC_MASK; + + if (GCPhysFaultAddr == GCPhysApicBase + XAPIC_OFF_TPR) + { + /* Only attempt to patch the instruction once: skip if the patch tree already has an entry keyed by this EIP. */ + PHMTPRPATCH pPatch = (PHMTPRPATCH)RTAvloU32Get(&pVM->hm.s.PatchTree, (AVLOU32KEY)pCtx->eip); + if (!pPatch) + return VINF_EM_HM_PATCH_TPR_INSTR; + } + } + + /* + * Determine the nested paging mode. + */ +/** @todo r=bird: Gotta love this nested paging hacking we're still carrying with us... (Split PGM_TYPE_NESTED.) */ + PGMMODE enmNestedPagingMode; +#if HC_ARCH_BITS == 32 + if (CPUMIsGuestInLongModeEx(pCtx)) + enmNestedPagingMode = PGMMODE_AMD64_NX; + else +#endif + enmNestedPagingMode = PGMGetHostMode(pVM); + + /* + * MMIO optimization using the reserved (RSVD) bit in the guest page tables for MMIO pages. + */ + Assert((u32ErrCode & (X86_TRAP_PF_RSVD | X86_TRAP_PF_P)) != X86_TRAP_PF_RSVD); + if ((u32ErrCode & (X86_TRAP_PF_RSVD | X86_TRAP_PF_P)) == (X86_TRAP_PF_RSVD | X86_TRAP_PF_P)) + { + /* + * If event delivery causes an MMIO #NPF, go back to instruction emulation as otherwise + * injecting the original pending event would most likely cause the same MMIO #NPF. 
+ */ + if (pVCpu->hm.s.Event.fPending) + return VINF_EM_RAW_INJECT_TRPM_EVENT; + + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_RIP); + VBOXSTRICTRC rcStrict; + PCEMEXITREC pExitRec = EMHistoryUpdateFlagsAndTypeAndPC(pVCpu, + EMEXIT_MAKE_FT(EMEXIT_F_KIND_EM | EMEXIT_F_HM, EMEXITTYPE_MMIO), + pVCpu->cpum.GstCtx.rip + pVCpu->cpum.GstCtx.cs.u64Base); + if (!pExitRec) + { + + rcStrict = PGMR0Trap0eHandlerNPMisconfig(pVM, pVCpu, enmNestedPagingMode, CPUMCTX2CORE(pCtx), GCPhysFaultAddr, + u32ErrCode); + + /* + * If we succeed, resume guest execution. + * + * If we fail in interpreting the instruction because we couldn't get the guest + * physical address of the page containing the instruction via the guest's page + * tables (we would invalidate the guest page in the host TLB), resume execution + * which would cause a guest page fault to let the guest handle this weird case. + * + * See @bugref{6043}. + */ + if ( rcStrict == VINF_SUCCESS + || rcStrict == VERR_PAGE_TABLE_NOT_PRESENT + || rcStrict == VERR_PAGE_NOT_PRESENT) + { + /* Successfully handled MMIO operation. */ + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_APIC_TPR); + rcStrict = VINF_SUCCESS; + } + } + else + { + /* + * Frequent exit or something needing probing. Get state and call EMHistoryExec. 
+ */ + Assert(pCtx == &pVCpu->cpum.GstCtx); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + Log4(("EptMisscfgExit/%u: %04x:%08RX64: %RGp -> EMHistoryExec\n", + pVCpu->idCpu, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, GCPhysFaultAddr)); + + rcStrict = EMHistoryExec(pVCpu, pExitRec, 0); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); + + Log4(("EptMisscfgExit/%u: %04x:%08RX64: EMHistoryExec -> %Rrc + %04x:%08RX64\n", + pVCpu->idCpu, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, + VBOXSTRICTRC_VAL(rcStrict), pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip)); + } + return VBOXSTRICTRC_TODO(rcStrict); + } + + TRPMAssertXcptPF(pVCpu, GCPhysFaultAddr, u32ErrCode); + int rc = PGMR0Trap0eHandlerNestedPaging(pVM, pVCpu, enmNestedPagingMode, u32ErrCode, CPUMCTX2CORE(pCtx), GCPhysFaultAddr); + TRPMResetTrap(pVCpu); + + Log4Func(("#NPF: PGMR0Trap0eHandlerNestedPaging returns %Rrc CS:RIP=%04x:%#RX64\n", rc, pCtx->cs.Sel, pCtx->rip)); + + /* + * Same case as PGMR0Trap0eHandlerNPMisconfig(). See comment above, @bugref{6043}. + */ + if ( rc == VINF_SUCCESS + || rc == VERR_PAGE_TABLE_NOT_PRESENT + || rc == VERR_PAGE_NOT_PRESENT) + { + /* We've successfully synced our shadow page tables. */ + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitShadowPF); + rc = VINF_SUCCESS; + } + + return rc; +} + + +/** + * \#VMEXIT handler for virtual interrupt (SVM_EXIT_VINTR). Conditional + * \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitVIntr(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_ASSERT_NOT_IN_NESTED_GUEST(&pVCpu->cpum.GstCtx); + + /* Indicate that we no longer need to #VMEXIT when the guest is ready to receive interrupts, it is now ready. */ + PSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + hmR0SvmClearIntWindowExiting(pVCpu, pVmcb); + + /* Deliver the pending interrupt via hmR0SvmEvaluatePendingEvent() and resume guest execution. 
*/ + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitIntWindow); + return VINF_SUCCESS; +} + + +/** + * \#VMEXIT handler for task switches (SVM_EXIT_TASK_SWITCH). Conditional + * \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitTaskSwitch(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CHECK_EXIT_DUE_TO_EVENT_DELIVERY(pVCpu, pSvmTransient); + +#ifndef HMSVM_ALWAYS_TRAP_TASK_SWITCH + Assert(!pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging); +#endif + + /* Check if this task-switch occurred while delivering an event through the guest IDT. */ + if (pVCpu->hm.s.Event.fPending) /* Can happen with exceptions/NMI. See @bugref{8411}. */ + { + /* + * AMD-V provides us with the exception which caused the TS; we collect + * the information in the call to hmR0SvmCheckExitDueToEventDelivery(). + */ + Log4Func(("TS occurred during event delivery\n")); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitTaskSwitch); + return VINF_EM_RAW_INJECT_TRPM_EVENT; + } + + /** @todo Emulate task switch someday, currently just going back to ring-3 for + * emulation. */ + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitTaskSwitch); + return VERR_EM_INTERPRETER; +} + + +/** + * \#VMEXIT handler for VMMCALL (SVM_EXIT_VMMCALL). Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitVmmCall(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + + /* First give TPR-patch emulation a go; VERR_NOT_FOUND falls through to the hypercall handling, any other status is returned as-is. */ + if (pVCpu->CTX_SUFF(pVM)->hm.s.fTprPatchingAllowed) + { + int rc = hmSvmEmulateMovTpr(pVCpu); + if (rc != VERR_NOT_FOUND) + { + Log4Func(("hmSvmEmulateMovTpr returns %Rrc\n", rc)); + return rc; + } + } + + if (EMAreHypercallInstructionsEnabled(pVCpu)) + { + unsigned cbInstr; + if (hmR0SvmSupportsNextRipSave(pVCpu)) + { + /* Instruction length = VMCB next-RIP field minus the current RIP. */ PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + } + else + { + /* No next-RIP support: disassemble the current instruction; cbInstr is forced to 0 unless that succeeds and the opcode is VMMCALL. */ PDISCPUSTATE pDis = &pVCpu->hm.s.DisState; + int rc = EMInterpretDisasCurrent(pVCpu->CTX_SUFF(pVM), pVCpu, pDis, &cbInstr); + if ( rc == VINF_SUCCESS + && pDis->pCurInstr->uOpcode == OP_VMMCALL) + Assert(cbInstr > 0); + else + cbInstr = 0; + } + + VBOXSTRICTRC rcStrict = GIMHypercall(pVCpu, &pVCpu->cpum.GstCtx); + if (RT_SUCCESS(rcStrict)) + { + /* Only update the RIP if we're continuing guest execution and not in the case + of say VINF_GIM_R3_HYPERCALL. */ + if (rcStrict == VINF_SUCCESS) + hmR0SvmAdvanceRip(pVCpu, cbInstr); + + return VBOXSTRICTRC_VAL(rcStrict); + } + else + Log4Func(("GIMHypercall returns %Rrc -> #UD\n", VBOXSTRICTRC_VAL(rcStrict))); + } + + /* Hypercalls are disabled or GIMHypercall() failed: queue a #UD for the guest. */ hmR0SvmSetPendingXcptUD(pVCpu); + return VINF_SUCCESS; +} + + +/** + * \#VMEXIT handler for PAUSE (SVM_EXIT_PAUSE). Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitPause(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + unsigned cbInstr; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + /* Instruction length = VMCB next-RIP field minus the current RIP. */ PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + } + else + { + /* No next-RIP support: disassemble the current instruction; cbInstr is forced to 0 unless that succeeds and the opcode is PAUSE. */ PDISCPUSTATE pDis = &pVCpu->hm.s.DisState; + int rc = EMInterpretDisasCurrent(pVCpu->CTX_SUFF(pVM), pVCpu, pDis, &cbInstr); + if ( rc == VINF_SUCCESS + && pDis->pCurInstr->uOpcode == OP_PAUSE) + Assert(cbInstr > 0); + else + cbInstr = 0; + } + + /** @todo The guest has likely hit a contended spinlock. We might want to + * poke/schedule a different guest VCPU. */ + hmR0SvmAdvanceRip(pVCpu, cbInstr); + return VINF_EM_RAW_INTERRUPT; +} + + +/** + * \#VMEXIT handler for FERR intercept (SVM_EXIT_FERR_FREEZE). Conditional + * \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitFerrFreeze(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CR0); + Assert(!(pVCpu->cpum.GstCtx.cr0 & X86_CR0_NE)); + + Log4Func(("Raising IRQ 13 in response to #FERR\n")); + return PDMIsaSetIrq(pVCpu->CTX_SUFF(pVM), 13 /* u8Irq */, 1 /* u8Level */, 0 /* uTagSrc */); +} + + +/** + * \#VMEXIT handler for IRET (SVM_EXIT_IRET). Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitIret(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + /* Clear NMI blocking. */ + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS)) + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_BLOCK_NMIS); + + /* Indicate that we no longer need to #VMEXIT when the guest is ready to receive NMIs, it is now ready. 
*/ + PSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + hmR0SvmClearCtrlIntercept(pVCpu, pVmcb, SVM_CTRL_INTERCEPT_IRET); + + /* Deliver the pending NMI via hmR0SvmEvaluatePendingEvent() and resume guest execution. */ + return VINF_SUCCESS; +} + + +/** + * \#VMEXIT handler for page-fault exceptions (SVM_EXIT_XCPT_14). + * Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitXcptPF(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + HMSVM_CHECK_EXIT_DUE_TO_EVENT_DELIVERY(pVCpu, pSvmTransient); + + /* See AMD spec. 15.12.15 "#PF (Page Fault)". */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + PSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint32_t uErrCode = pVmcb->ctrl.u64ExitInfo1; + uint64_t const uFaultAddress = pVmcb->ctrl.u64ExitInfo2; + +#if defined(HMSVM_ALWAYS_TRAP_ALL_XCPTS) || defined(HMSVM_ALWAYS_TRAP_PF) + if (pVM->hm.s.fNestedPaging) + { + pVCpu->hm.s.Event.fPending = false; /* In case it's a contributory or vectoring #PF. */ + if ( !pSvmTransient->fVectoringDoublePF + || CPUMIsGuestInSvmNestedHwVirtMode(pCtx)) + { + /* A genuine guest #PF, reflect it to the guest. */ + hmR0SvmSetPendingXcptPF(pVCpu, uErrCode, uFaultAddress); + Log4Func(("#PF: Guest page fault at %04X:%RGv FaultAddr=%RX64 ErrCode=%#x\n", pCtx->cs.Sel, (RTGCPTR)pCtx->rip, + uFaultAddress, uErrCode)); + } + else + { + /* A guest page-fault occurred during delivery of a page-fault. Inject #DF. */ + hmR0SvmSetPendingXcptDF(pVCpu); + Log4Func(("Pending #DF due to vectoring #PF. NP\n")); + } + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestPF); + return VINF_SUCCESS; + } +#endif + + Assert(!pVM->hm.s.fNestedPaging); + + /* + * TPR patching shortcut for APIC TPR reads and writes; only applicable to 32-bit guests. + */ + if ( pVM->hm.s.fTprPatchingAllowed + && (uFaultAddress & 0xfff) == XAPIC_OFF_TPR + && !(uErrCode & X86_TRAP_PF_P) /* Not present. 
*/ + && !CPUMIsGuestInSvmNestedHwVirtMode(pCtx) + && !CPUMIsGuestInLongModeEx(pCtx) + && !CPUMGetGuestCPL(pVCpu) + && pVM->hm.s.cPatches < RT_ELEMENTS(pVM->hm.s.aPatches)) + { + RTGCPHYS GCPhysApicBase; + GCPhysApicBase = APICGetBaseMsrNoCheck(pVCpu); + GCPhysApicBase &= PAGE_BASE_GC_MASK; + + /* Check if the page at the fault-address is the APIC base. */ + RTGCPHYS GCPhysPage; + int rc2 = PGMGstGetPage(pVCpu, (RTGCPTR)uFaultAddress, NULL /* pfFlags */, &GCPhysPage); + if ( rc2 == VINF_SUCCESS + && GCPhysPage == GCPhysApicBase) + { + /* Only attempt to patch the instruction once: skip if the patch tree already has an entry keyed by this EIP. */ + PHMTPRPATCH pPatch = (PHMTPRPATCH)RTAvloU32Get(&pVM->hm.s.PatchTree, (AVLOU32KEY)pCtx->eip); + if (!pPatch) + return VINF_EM_HM_PATCH_TPR_INSTR; + } + } + + Log4Func(("#PF: uFaultAddress=%#RX64 CS:RIP=%#04x:%#RX64 uErrCode %#RX32 cr3=%#RX64\n", uFaultAddress, pCtx->cs.Sel, + pCtx->rip, uErrCode, pCtx->cr3)); + + /* + * If it's a vectoring #PF, emulate injecting the original event injection as + * PGMTrap0eHandler() is incapable of differentiating between instruction emulation and + * event injection that caused a #PF. See @bugref{6607}. + */ + if (pSvmTransient->fVectoringPF) + { + Assert(pVCpu->hm.s.Event.fPending); + return VINF_EM_RAW_INJECT_TRPM_EVENT; + } + + TRPMAssertXcptPF(pVCpu, uFaultAddress, uErrCode); + int rc = PGMTrap0eHandler(pVCpu, uErrCode, CPUMCTX2CORE(pCtx), (RTGCPTR)uFaultAddress); + + Log4Func(("#PF: rc=%Rrc\n", rc)); + + if (rc == VINF_SUCCESS) + { + /* Successfully synced shadow pages tables or emulated an MMIO instruction. */ + TRPMResetTrap(pVCpu); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitShadowPF); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); + return rc; + } + + if (rc == VINF_EM_RAW_GUEST_TRAP) + { + pVCpu->hm.s.Event.fPending = false; /* In case it's a contributory or vectoring #PF. 
*/ + + /* + * If a nested-guest delivers a #PF and that causes a #PF which is -not- a shadow #PF, + * we should simply forward the #PF to the guest and it is up to the nested-hypervisor to + * determine whether it is a nested-shadow #PF or a #DF, see @bugref{7243#c121}. + */ + if ( !pSvmTransient->fVectoringDoublePF + || CPUMIsGuestInSvmNestedHwVirtMode(pCtx)) + { + /* It's a guest (or nested-guest) page fault and needs to be reflected. */ + uErrCode = TRPMGetErrorCode(pVCpu); /* The error code might have been changed. */ + TRPMResetTrap(pVCpu); + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM + /* If the nested-guest is intercepting #PFs, cause a #PF #VMEXIT. */ + if ( CPUMIsGuestInSvmNestedHwVirtMode(pCtx) + && HMIsGuestSvmXcptInterceptSet(pVCpu, X86_XCPT_PF)) + return VBOXSTRICTRC_TODO(IEMExecSvmVmexit(pVCpu, SVM_EXIT_XCPT_PF, uErrCode, uFaultAddress)); +#endif + + hmR0SvmSetPendingXcptPF(pVCpu, uErrCode, uFaultAddress); + } + else + { + /* A guest page-fault occurred during delivery of a page-fault. Inject #DF. */ + TRPMResetTrap(pVCpu); + hmR0SvmSetPendingXcptDF(pVCpu); + Log4Func(("#PF: Pending #DF due to vectoring #PF\n")); + } + + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestPF); + return VINF_SUCCESS; + } + + /* Any other status: reset the trap and hand the PGMTrap0eHandler() status back to the caller. */ TRPMResetTrap(pVCpu); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitShadowPFEM); + return rc; +} + + +/** + * \#VMEXIT handler for undefined opcode (SVM_EXIT_XCPT_6). + * Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitXcptUD(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_ASSERT_NOT_IN_NESTED_GUEST(&pVCpu->cpum.GstCtx); + + /* Paranoia; Ensure we cannot be called as a result of event delivery. 
*/ + PSVMVMCB pVmcb = pVCpu->hm.s.svm.pVmcb; + Assert(!pVmcb->ctrl.ExitIntInfo.n.u1Valid); NOREF(pVmcb); + + /* Starts out as a failure status so that a #UD is raised below unless GIM handles the exit. */ int rc = VERR_SVM_UNEXPECTED_XCPT_EXIT; + if (pVCpu->hm.s.fGIMTrapXcptUD) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + uint8_t cbInstr = 0; + VBOXSTRICTRC rcStrict = GIMXcptUD(pVCpu, &pVCpu->cpum.GstCtx, NULL /* pDis */, &cbInstr); + if (rcStrict == VINF_SUCCESS) + { + /* #UD #VMEXIT does not have valid NRIP information, manually advance RIP. See @bugref{7270#c170}. */ + hmR0SvmAdvanceRip(pVCpu, cbInstr); + rc = VINF_SUCCESS; + HMSVM_CHECK_SINGLE_STEP(pVCpu, rc); + } + else if (rcStrict == VINF_GIM_HYPERCALL_CONTINUING) + rc = VINF_SUCCESS; + else if (rcStrict == VINF_GIM_R3_HYPERCALL) + rc = VINF_GIM_R3_HYPERCALL; + else + Assert(RT_FAILURE(VBOXSTRICTRC_VAL(rcStrict))); + } + + /* If the GIM #UD exception handler didn't succeed for some reason or wasn't needed, raise #UD. */ + if (RT_FAILURE(rc)) + { + hmR0SvmSetPendingXcptUD(pVCpu); + rc = VINF_SUCCESS; + } + + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestUD); + return rc; +} + + +/** + * \#VMEXIT handler for math-fault exceptions (SVM_EXIT_XCPT_16). + * Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitXcptMF(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + PSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + + /* Paranoia; Ensure we cannot be called as a result of event delivery. */ + Assert(!pVmcb->ctrl.ExitIntInfo.n.u1Valid); NOREF(pVmcb); + + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestMF); + + /* With CR0.NE clear the fault is signalled as IRQ 13 instead of being injected as #MF. */ + if (!(pCtx->cr0 & X86_CR0_NE)) + { + PVM pVM = pVCpu->CTX_SUFF(pVM); + PDISSTATE pDis = &pVCpu->hm.s.DisState; + unsigned cbInstr; + int rc = EMInterpretDisasCurrent(pVM, pVCpu, pDis, &cbInstr); + if (RT_SUCCESS(rc)) + { + /* Convert a #MF into a FERR -> IRQ 13. See @bugref{6117}. 
*/ + rc = PDMIsaSetIrq(pVCpu->CTX_SUFF(pVM), 13 /* u8Irq */, 1 /* u8Level */, 0 /* uTagSrc */); + if (RT_SUCCESS(rc)) + hmR0SvmAdvanceRip(pVCpu, cbInstr); + } + else + Log4Func(("EMInterpretDisasCurrent returned %Rrc uOpCode=%#x\n", rc, pDis->pCurInstr->uOpcode)); + return rc; + } + + hmR0SvmSetPendingXcptMF(pVCpu); + return VINF_SUCCESS; +} + + +/** + * \#VMEXIT handler for debug exceptions (SVM_EXIT_XCPT_1). Conditional + * \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitXcptDB(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + HMSVM_CHECK_EXIT_DUE_TO_EVENT_DELIVERY(pVCpu, pSvmTransient); + + if (RT_UNLIKELY(pVCpu->hm.s.Event.fPending)) + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatInjectPendingInterpret); + return VINF_EM_RAW_INJECT_TRPM_EVENT; + } + + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestDB); + + /* + * This can be a fault-type #DB (instruction breakpoint) or a trap-type #DB (data + * breakpoint). However, for both cases DR6 and DR7 are updated to what the exception + * handler expects. See AMD spec. 15.12.2 "#DB (Debug)". + */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + PSVMVMCB pVmcb = pVCpu->hm.s.svm.pVmcb; + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + int rc = DBGFRZTrap01Handler(pVM, pVCpu, CPUMCTX2CORE(pCtx), pVmcb->guest.u64DR6, pVCpu->hm.s.fSingleInstruction); + if (rc == VINF_EM_RAW_GUEST_TRAP) + { + Log5(("hmR0SvmExitXcptDB: DR6=%#RX64 -> guest trap\n", pVmcb->guest.u64DR6)); + if (CPUMIsHyperDebugStateActive(pVCpu)) + CPUMSetGuestDR6(pVCpu, CPUMGetGuestDR6(pVCpu) | pVmcb->guest.u64DR6); + + /* Reflect the exception back to the guest. */ + hmR0SvmSetPendingXcptDB(pVCpu); + rc = VINF_SUCCESS; + } + + /* + * Update DR6. 
+ * While the hypervisor debug state is active, the VMCB DR6 is reset to its initial value + * and the DRx clean bit is cleared. + */ + if (CPUMIsHyperDebugStateActive(pVCpu)) + { + Log5(("hmR0SvmExitXcptDB: DR6=%#RX64 -> %Rrc\n", pVmcb->guest.u64DR6, rc)); + pVmcb->guest.u64DR6 = X86_DR6_INIT_VAL; + pVmcb->ctrl.u32VmcbCleanBits &= ~HMSVM_VMCB_CLEAN_DRX; + } + else + { + AssertMsg(rc == VINF_SUCCESS, ("rc=%Rrc\n", rc)); + Assert(!pVCpu->hm.s.fSingleInstruction && !DBGFIsStepping(pVCpu)); + } + + return rc; +} + + +/** + * \#VMEXIT handler for alignment check exceptions (SVM_EXIT_XCPT_17). + * Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitXcptAC(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CHECK_EXIT_DUE_TO_EVENT_DELIVERY(pVCpu, pSvmTransient); + + /* Reflect the #AC back to the guest. */ SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u3Type = SVM_EVENT_EXCEPTION; + Event.n.u8Vector = X86_XCPT_AC; + /* Marked as carrying an error code; u32ErrorCode is not assigned here, presumably left at 0 by Event.u = 0 - TODO confirm against SVMEVENT's layout. */ Event.n.u1ErrorCodeValid = 1; + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); + return VINF_SUCCESS; +} + + +/** + * \#VMEXIT handler for breakpoint exceptions (SVM_EXIT_XCPT_3). + * Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitXcptBP(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + HMSVM_CHECK_EXIT_DUE_TO_EVENT_DELIVERY(pVCpu, pSvmTransient); + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + int rc = DBGFRZTrap03Handler(pVCpu->CTX_SUFF(pVM), pVCpu, CPUMCTX2CORE(pCtx)); + /* VINF_EM_RAW_GUEST_TRAP from DBGF: queue a #BP for the guest. The DBGF status is returned unchanged either way. */ + if (rc == VINF_EM_RAW_GUEST_TRAP) + { + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u3Type = SVM_EVENT_EXCEPTION; + Event.n.u8Vector = X86_XCPT_BP; + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); + } + + Assert(rc == VINF_SUCCESS || rc == VINF_EM_RAW_GUEST_TRAP || rc == VINF_EM_DBG_BREAKPOINT); + return rc; +} + + +/** + * Hacks its way around the lovely mesa driver's backdoor accesses. 
+ * + * @sa hmR0VmxHandleMesaDrvGp + */ +static int hmR0SvmHandleMesaDrvGp(PVMCPU pVCpu, PCPUMCTX pCtx, PCSVMVMCB pVmcb) +{ + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_RIP | CPUMCTX_EXTRN_RFLAGS | CPUMCTX_EXTRN_GPRS_MASK); + Log(("hmR0SvmHandleMesaDrvGp: at %04x:%08RX64 rcx=%RX64 rbx=%RX64\n", + pVmcb->guest.CS.u16Sel, pVmcb->guest.u64RIP, pCtx->rcx, pCtx->rbx)); + RT_NOREF(pCtx, pVmcb); + + /* For now we'll just skip the instruction (one byte; hmR0SvmIsMesaDrvGp() matches the single opcode byte 0xed). */ + hmR0SvmAdvanceRip(pVCpu, 1); + return VINF_SUCCESS; +} + + +/** + * Checks if the \#GP'ing instruction is the mesa driver doing its lovely + * backdoor logging w/o checking what it is running inside. + * + * This recognizes an "IN EAX,DX" instruction executed in flat ring-3, with the + * backdoor port and magic numbers loaded in registers. + * + * @returns true if it is, false if it isn't. + * @sa hmR0VmxIsMesaDrvGp + */ +DECLINLINE(bool) hmR0SvmIsMesaDrvGp(PVMCPU pVCpu, PCPUMCTX pCtx, PCSVMVMCB pVmcb) +{ + /* Check magic and port. */ + Assert(!(pCtx->fExtrn & (CPUMCTX_EXTRN_RDX | CPUMCTX_EXTRN_RCX))); + /*Log8(("hmR0SvmIsMesaDrvGp: rax=%RX64 rdx=%RX64\n", pCtx->fExtrn & CPUMCTX_EXTRN_RAX ? pVmcb->guest.u64RAX : pCtx->rax, pCtx->rdx));*/ + if (pCtx->dx != UINT32_C(0x5658)) + return false; + /* RAX is read from the VMCB while it is still marked external (not yet imported) in the context. */ if ((pCtx->fExtrn & CPUMCTX_EXTRN_RAX ? pVmcb->guest.u64RAX : pCtx->rax) != UINT32_C(0x564d5868)) + return false; + + /* Check that it is #GP(0). */ + if (pVmcb->ctrl.u64ExitInfo1 != 0) + return false; + + /* Flat ring-3 CS. */ + /*Log8(("hmR0SvmIsMesaDrvGp: u8CPL=%d base=%RX64\n", pVmcb->guest.u8CPL, pCtx->fExtrn & CPUMCTX_EXTRN_CS ? pVmcb->guest.CS.u64Base : pCtx->cs.u64Base));*/ + if (pVmcb->guest.u8CPL != 3) + return false; + if ((pCtx->fExtrn & CPUMCTX_EXTRN_CS ? pVmcb->guest.CS.u64Base : pCtx->cs.u64Base) != 0) + return false; + + /* 0xed: IN eAX,dx */ + if (pVmcb->ctrl.cbInstrFetched < 1) /* unlikely, it turns out. 
*/ + { + /* No instruction bytes were provided in the VMCB: read the opcode byte from guest memory at RIP instead. */ HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_RIP | CPUMCTX_EXTRN_GPRS_MASK + | CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_CR3 | CPUMCTX_EXTRN_CR4 | CPUMCTX_EXTRN_EFER); + uint8_t abInstr[1]; + int rc = PGMPhysSimpleReadGCPtr(pVCpu, abInstr, pCtx->rip, sizeof(abInstr)); + /*Log8(("hmR0SvmIsMesaDrvGp: PGMPhysSimpleReadGCPtr -> %Rrc %#x\n", rc, abInstr[0])); */ + if (RT_FAILURE(rc)) + return false; + if (abInstr[0] != 0xed) + return false; + } + else + { + /*Log8(("hmR0SvmIsMesaDrvGp: %#x\n", pVmcb->ctrl.abInstr));*/ + if (pVmcb->ctrl.abInstr[0] != 0xed) + return false; + } + return true; +} + + +/** + * \#VMEXIT handler for general protection faults (SVM_EXIT_XCPT_13). + * Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitXcptGP(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CHECK_EXIT_DUE_TO_EVENT_DELIVERY(pVCpu, pSvmTransient); + + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + Assert(pSvmTransient->u64ExitCode == pVmcb->ctrl.u64ExitCode); + + /* Reflect the #GP (with the error code from EXITINFO1) unless the mesa-driver workaround is enabled and recognises the instruction. */ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if ( !pVCpu->hm.s.fTrapXcptGpForLovelyMesaDrv + || !hmR0SvmIsMesaDrvGp(pVCpu, pCtx, pVmcb)) + { + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u3Type = SVM_EVENT_EXCEPTION; + Event.n.u8Vector = X86_XCPT_GP; + Event.n.u1ErrorCodeValid = 1; + Event.n.u32ErrorCode = (uint32_t)pVmcb->ctrl.u64ExitInfo1; + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); + return VINF_SUCCESS; + } + return hmR0SvmHandleMesaDrvGp(pVCpu, pCtx, pVmcb); +} + + +#if defined(HMSVM_ALWAYS_TRAP_ALL_XCPTS) || defined(VBOX_WITH_NESTED_HWVIRT_SVM) +/** + * \#VMEXIT handler for generic exceptions. Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitXcptGeneric(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CHECK_EXIT_DUE_TO_EVENT_DELIVERY(pVCpu, pSvmTransient); + + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const uVector = pVmcb->ctrl.u64ExitCode - SVM_EXIT_XCPT_0; + uint32_t const uErrCode = pVmcb->ctrl.u64ExitInfo1; + Assert(pSvmTransient->u64ExitCode == pVmcb->ctrl.u64ExitCode); + Assert(uVector <= X86_XCPT_LAST); + Log4Func(("uVector=%#x uErrCode=%u\n", uVector, uErrCode)); + + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u3Type = SVM_EVENT_EXCEPTION; + Event.n.u8Vector = uVector; + switch (uVector) + { + /* Shouldn't be here for reflecting #PFs (among other things, the fault address isn't passed along). */ + case X86_XCPT_PF: AssertMsgFailed(("hmR0SvmExitXcptGeneric: Unexpected exception")); return VERR_SVM_IPE_5; + case X86_XCPT_DF: + case X86_XCPT_TS: + case X86_XCPT_NP: + case X86_XCPT_SS: + case X86_XCPT_GP: + case X86_XCPT_AC: + { + Event.n.u1ErrorCodeValid = 1; + Event.n.u32ErrorCode = uErrCode; + break; + } + } + + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); + return VINF_SUCCESS; +} +#endif + +#ifdef VBOX_WITH_NESTED_HWVIRT_SVM +/** + * \#VMEXIT handler for CLGI (SVM_EXIT_CLGI). Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitClgi(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + Assert(pVmcb); + Assert(!pVmcb->ctrl.IntCtrl.n.u1VGifEnable); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + uint64_t const fImport = CPUMCTX_EXTRN_HWVIRT; + if (fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK | fImport); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedClgi(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK | fImport); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_SUCCESS) + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_HWVIRT); + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for STGI (SVM_EXIT_STGI). Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitStgi(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + /* + * When VGIF is not used we always intercept STGI instructions. When VGIF is used, + * we only intercept STGI when events are pending for GIF to become 1. 
+ */ + PSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + if (pVmcb->ctrl.IntCtrl.n.u1VGifEnable) + hmR0SvmClearCtrlIntercept(pVCpu, pVmcb, SVM_CTRL_INTERCEPT_STGI); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + uint64_t const fImport = CPUMCTX_EXTRN_HWVIRT; + if (fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK | fImport); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedStgi(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK | fImport); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_SUCCESS) + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_HWVIRT); + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for VMLOAD (SVM_EXIT_VMLOAD). Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitVmload(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + Assert(pVmcb); + Assert(!pVmcb->ctrl.LbrVirt.n.u1VirtVmsaveVmload); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + uint64_t const fImport = CPUMCTX_EXTRN_FS | CPUMCTX_EXTRN_GS | CPUMCTX_EXTRN_KERNEL_GS_BASE + | CPUMCTX_EXTRN_TR | CPUMCTX_EXTRN_LDTR | CPUMCTX_EXTRN_SYSCALL_MSRS + | CPUMCTX_EXTRN_SYSENTER_MSRS; + if (fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK | fImport); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedVmload(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK | fImport); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_SUCCESS) + { + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_FS | HM_CHANGED_GUEST_GS + | HM_CHANGED_GUEST_TR | HM_CHANGED_GUEST_LDTR + | HM_CHANGED_GUEST_KERNEL_GS_BASE | HM_CHANGED_GUEST_SYSCALL_MSRS + | HM_CHANGED_GUEST_SYSENTER_MSR_MASK); + } + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for VMSAVE (SVM_EXIT_VMSAVE). Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitVmsave(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + Assert(!pVmcb->ctrl.LbrVirt.n.u1VirtVmsaveVmload); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedVmsave(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for INVLPGA (SVM_EXIT_INVLPGA). Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmExitInvlpga(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK); + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedInvlpga(pVCpu, cbInstr); + } + else + { + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rcStrict = IEMExecOne(pVCpu); + } + + if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * \#VMEXIT handler for VMRUN (SVM_EXIT_VMRUN). Conditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmExitVmrun(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + /* We shall import the entire state here, just in case we enter and continue execution of + the nested-guest with hardware-assisted SVM in ring-0, we would be switching VMCBs and + could lose part of CPU state. */ + HMSVM_CPUMCTX_IMPORT_STATE(pVCpu, HMSVM_CPUMCTX_EXTRN_ALL); + + VBOXSTRICTRC rcStrict; + bool const fSupportsNextRipSave = hmR0SvmSupportsNextRipSave(pVCpu); + if (fSupportsNextRipSave) + { + PCSVMVMCB pVmcb = hmR0SvmGetCurrentVmcb(pVCpu); + uint8_t const cbInstr = pVmcb->ctrl.u64NextRIP - pVCpu->cpum.GstCtx.rip; + rcStrict = IEMExecDecodedVmrun(pVCpu, cbInstr); + } + else + { + /* We use IEMExecOneBypassEx() here as it suppresses attempts to continue emulating any + instruction(s) when interrupt inhibition is set as part of emulating the VMRUN + instruction itself, see @bugref{7243#c126} */ + rcStrict = IEMExecOneBypassEx(pVCpu, CPUMCTX2CORE(&pVCpu->cpum.GstCtx), NULL /* pcbWritten */); + } + + if (rcStrict == VINF_SUCCESS) + { + rcStrict = VINF_SVM_VMRUN; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_SVM_VMRUN_MASK); + } + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + HMSVM_CHECK_SINGLE_STEP(pVCpu, rcStrict); + return VBOXSTRICTRC_TODO(rcStrict); +} + + +/** + * Nested-guest \#VMEXIT handler for debug exceptions (SVM_EXIT_XCPT_1). + * Unconditional \#VMEXIT. 
+ */ +HMSVM_EXIT_DECL hmR0SvmNestedExitXcptDB(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CHECK_EXIT_DUE_TO_EVENT_DELIVERY(pVCpu, pSvmTransient); + + if (pVCpu->hm.s.Event.fPending) + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatInjectPendingInterpret); + return VINF_EM_RAW_INJECT_TRPM_EVENT; + } + + hmR0SvmSetPendingXcptDB(pVCpu); + return VINF_SUCCESS; +} + + +/** + * Nested-guest \#VMEXIT handler for breakpoint exceptions (SVM_EXIT_XCPT_3). + * Conditional \#VMEXIT. + */ +HMSVM_EXIT_DECL hmR0SvmNestedExitXcptBP(PVMCPU pVCpu, PSVMTRANSIENT pSvmTransient) +{ + HMSVM_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pSvmTransient); + HMSVM_CHECK_EXIT_DUE_TO_EVENT_DELIVERY(pVCpu, pSvmTransient); + + SVMEVENT Event; + Event.u = 0; + Event.n.u1Valid = 1; + Event.n.u3Type = SVM_EVENT_EXCEPTION; + Event.n.u8Vector = X86_XCPT_BP; + hmR0SvmSetPendingEvent(pVCpu, &Event, 0 /* GCPtrFaultAddress */); + return VINF_SUCCESS; +} +#endif /* VBOX_WITH_NESTED_HWVIRT_SVM */ + +/** @} */ + diff --git a/src/VBox/VMM/VMMR0/HMSVMR0.h b/src/VBox/VMM/VMMR0/HMSVMR0.h new file mode 100644 index 00000000..19b15ede --- /dev/null +++ b/src/VBox/VMM/VMMR0/HMSVMR0.h @@ -0,0 +1,99 @@ +/* $Id: HMSVMR0.h $ */ +/** @file + * HM SVM (AMD-V) - Internal header file. + */ + +/* + * Copyright (C) 2006-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ */ + +#ifndef VMM_INCLUDED_SRC_VMMR0_HMSVMR0_h +#define VMM_INCLUDED_SRC_VMMR0_HMSVMR0_h +#ifndef RT_WITHOUT_PRAGMA_ONCE +# pragma once +#endif + +#include <VBox/cdefs.h> +#include <VBox/types.h> +#include <VBox/vmm/hm.h> +#include <VBox/vmm/hm_svm.h> + +RT_C_DECLS_BEGIN + +/** @defgroup grp_svm_int Internal + * @ingroup grp_svm + * @internal + * @{ + */ + +#ifdef IN_RING0 + +VMMR0DECL(int) SVMR0GlobalInit(void); +VMMR0DECL(void) SVMR0GlobalTerm(void); +VMMR0DECL(int) SVMR0Enter(PVMCPU pVCpu); +VMMR0DECL(void) SVMR0ThreadCtxCallback(RTTHREADCTXEVENT enmEvent, PVMCPU pVCpu, bool fGlobalInit); +VMMR0DECL(int) SVMR0EnableCpu(PHMPHYSCPU pHostCpu, PVM pVM, void *pvPageCpu, RTHCPHYS HCPhysCpuPage, + bool fEnabledBySystem, PCSUPHWVIRTMSRS pHwvirtMsrs); +VMMR0DECL(int) SVMR0DisableCpu(void *pvPageCpu, RTHCPHYS pPageCpuPhys); +VMMR0DECL(int) SVMR0InitVM(PVM pVM); +VMMR0DECL(int) SVMR0TermVM(PVM pVM); +VMMR0DECL(int) SVMR0SetupVM(PVM pVM); +VMMR0DECL(VBOXSTRICTRC) SVMR0RunGuestCode(PVMCPU pVCpu); +VMMR0DECL(int) SVMR0ExportHostState(PVMCPU pVCpu); +VMMR0DECL(int) SVMR0ImportStateOnDemand(PVMCPU pVCpu, uint64_t fWhat); +VMMR0DECL(int) SVMR0InvalidatePage(PVMCPU pVCpu, RTGCPTR GCVirt); + +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) +DECLASM(int) SVMR0VMSwitcherRun64(RTHCPHYS pVMCBHostPhys, RTHCPHYS pVMCBPhys, PCPUMCTX pCtx, PVM pVM, PVMCPU pVCpu); +VMMR0DECL(int) SVMR0Execute64BitsHandler(PVMCPU pVCpu, HM64ON32OP enmOp, uint32_t cbParam, uint32_t *paParam); +#endif /* HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) */ + +/** + * Prepares for and executes VMRUN (32-bit guests). + * + * @returns VBox status code. + * @param pVMCBHostPhys Physical address of host VMCB. + * @param pVMCBPhys Physical address of the VMCB. + * @param pCtx Pointer to the guest CPU context. + * @param pVM The cross context VM structure. (Not used.) + * @param pVCpu The cross context virtual CPU structure. (Not used.) 
+ */ +DECLASM(int) SVMR0VMRun(RTHCPHYS pVMCBHostPhys, RTHCPHYS pVMCBPhys, PCPUMCTX pCtx, PVM pVM, PVMCPU pVCpu); + + +/** + * Prepares for and executes VMRUN (64-bit guests). + * + * @returns VBox status code. + * @param pVMCBHostPhys Physical address of host VMCB. + * @param pVMCBPhys Physical address of the VMCB. + * @param pCtx Pointer to the guest CPU context. + * @param pVM The cross context VM structure. (Not used.) + * @param pVCpu The cross context virtual CPU structure. (Not used.) + */ +DECLASM(int) SVMR0VMRun64(RTHCPHYS pVMCBHostPhys, RTHCPHYS pVMCBPhys, PCPUMCTX pCtx, PVM pVM, PVMCPU pVCpu); + +/** + * Executes INVLPGA. + * + * @param pPageGC Virtual page to invalidate. + * @param u32ASID Tagged TLB id. + */ +DECLASM(void) SVMR0InvlpgA(RTGCPTR pPageGC, uint32_t u32ASID); + +#endif /* IN_RING0 */ + +/** @} */ + +RT_C_DECLS_END + +#endif /* !VMM_INCLUDED_SRC_VMMR0_HMSVMR0_h */ + diff --git a/src/VBox/VMM/VMMR0/HMVMXR0.cpp b/src/VBox/VMM/VMMR0/HMVMXR0.cpp new file mode 100644 index 00000000..62d2b7e1 --- /dev/null +++ b/src/VBox/VMM/VMMR0/HMVMXR0.cpp @@ -0,0 +1,13777 @@ +/* $Id: HMVMXR0.cpp $ */ +/** @file + * HM VMX (Intel VT-x) - Host Context Ring-0. + */ + +/* + * Copyright (C) 2012-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_HM +#define VMCPU_INCL_CPUM_GST_CTX +#include <iprt/x86.h> +#include <iprt/asm-amd64-x86.h> +#include <iprt/thread.h> + +#include <VBox/vmm/pdmapi.h> +#include <VBox/vmm/dbgf.h> +#include <VBox/vmm/iem.h> +#include <VBox/vmm/iom.h> +#include <VBox/vmm/selm.h> +#include <VBox/vmm/tm.h> +#include <VBox/vmm/em.h> +#include <VBox/vmm/gim.h> +#include <VBox/vmm/apic.h> +#ifdef VBOX_WITH_REM +# include <VBox/vmm/rem.h> +#endif +#include "HMInternal.h" +#include <VBox/vmm/vm.h> +#include <VBox/vmm/hmvmxinline.h> +#include "HMVMXR0.h" +#include "dtrace/VBoxVMM.h" + +#ifdef DEBUG_ramshankar +# define HMVMX_ALWAYS_SAVE_GUEST_RFLAGS +# define HMVMX_ALWAYS_SAVE_FULL_GUEST_STATE +# define HMVMX_ALWAYS_SYNC_FULL_GUEST_STATE +# define HMVMX_ALWAYS_CHECK_GUEST_STATE +# define HMVMX_ALWAYS_TRAP_ALL_XCPTS +# define HMVMX_ALWAYS_TRAP_PF +# define HMVMX_ALWAYS_FLUSH_TLB +# define HMVMX_ALWAYS_SWAP_EFER +#endif + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ +/** Use the function table. */ +#define HMVMX_USE_FUNCTION_TABLE + +/** Determine which tagged-TLB flush handler to use. */ +#define HMVMX_FLUSH_TAGGED_TLB_EPT_VPID 0 +#define HMVMX_FLUSH_TAGGED_TLB_EPT 1 +#define HMVMX_FLUSH_TAGGED_TLB_VPID 2 +#define HMVMX_FLUSH_TAGGED_TLB_NONE 3 + +/** @name HMVMX_READ_XXX + * Flags to skip redundant reads of some common VMCS fields that are not part of + * the guest-CPU or VCPU state but are needed while handling VM-exits. 
+ */ +#define HMVMX_READ_IDT_VECTORING_INFO RT_BIT_32(0) +#define HMVMX_READ_IDT_VECTORING_ERROR_CODE RT_BIT_32(1) +#define HMVMX_READ_EXIT_QUALIFICATION RT_BIT_32(2) +#define HMVMX_READ_EXIT_INSTR_LEN RT_BIT_32(3) +#define HMVMX_READ_EXIT_INTERRUPTION_INFO RT_BIT_32(4) +#define HMVMX_READ_EXIT_INTERRUPTION_ERROR_CODE RT_BIT_32(5) +#define HMVMX_READ_EXIT_INSTR_INFO RT_BIT_32(6) +#define HMVMX_READ_GUEST_LINEAR_ADDR RT_BIT_32(7) +/** @} */ + +/** + * States of the VMCS. + * + * This does not reflect all possible VMCS states but currently only those + * needed for maintaining the VMCS consistently even when thread-context hooks + * are used. Maybe later this can be extended (i.e. Nested Virtualization). + */ +#define HMVMX_VMCS_STATE_CLEAR RT_BIT(0) +#define HMVMX_VMCS_STATE_ACTIVE RT_BIT(1) +#define HMVMX_VMCS_STATE_LAUNCHED RT_BIT(2) + +/** + * Subset of the guest-CPU state that is kept by VMX R0 code while executing the + * guest using hardware-assisted VMX. + * + * This excludes state like GPRs (other than RSP) which are always + * swapped and restored across the world-switch and also registers like EFER, + * MSR which cannot be modified by the guest without causing a VM-exit. + */ +#define HMVMX_CPUMCTX_EXTRN_ALL ( CPUMCTX_EXTRN_RIP \ + | CPUMCTX_EXTRN_RFLAGS \ + | CPUMCTX_EXTRN_RSP \ + | CPUMCTX_EXTRN_SREG_MASK \ + | CPUMCTX_EXTRN_TABLE_MASK \ + | CPUMCTX_EXTRN_KERNEL_GS_BASE \ + | CPUMCTX_EXTRN_SYSCALL_MSRS \ + | CPUMCTX_EXTRN_SYSENTER_MSRS \ + | CPUMCTX_EXTRN_TSC_AUX \ + | CPUMCTX_EXTRN_OTHER_MSRS \ + | CPUMCTX_EXTRN_CR0 \ + | CPUMCTX_EXTRN_CR3 \ + | CPUMCTX_EXTRN_CR4 \ + | CPUMCTX_EXTRN_DR7 \ + | CPUMCTX_EXTRN_HM_VMX_MASK) + +/** + * Exception bitmap mask for real-mode guests (real-on-v86). + * + * We need to intercept all exceptions manually except: + * - \#AC and \#DB are always intercepted to prevent the CPU from deadlocking + * due to bugs in Intel CPUs. + * - \#PF need not be intercepted even in real-mode if we have Nested Paging + * support. 
+ */ +#define HMVMX_REAL_MODE_XCPT_MASK ( RT_BIT(X86_XCPT_DE) /* always: | RT_BIT(X86_XCPT_DB) */ | RT_BIT(X86_XCPT_NMI) \ + | RT_BIT(X86_XCPT_BP) | RT_BIT(X86_XCPT_OF) | RT_BIT(X86_XCPT_BR) \ + | RT_BIT(X86_XCPT_UD) | RT_BIT(X86_XCPT_NM) | RT_BIT(X86_XCPT_DF) \ + | RT_BIT(X86_XCPT_CO_SEG_OVERRUN) | RT_BIT(X86_XCPT_TS) | RT_BIT(X86_XCPT_NP) \ + | RT_BIT(X86_XCPT_SS) | RT_BIT(X86_XCPT_GP) /* RT_BIT(X86_XCPT_PF) */ \ + | RT_BIT(X86_XCPT_MF) /* always: | RT_BIT(X86_XCPT_AC) */ | RT_BIT(X86_XCPT_MC) \ + | RT_BIT(X86_XCPT_XF)) + +/** Maximum VM-instruction error number. */ +#define HMVMX_INSTR_ERROR_MAX 28 + +/** Profiling macro. */ +#ifdef HM_PROFILE_EXIT_DISPATCH +# define HMVMX_START_EXIT_DISPATCH_PROF() STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatExitDispatch, ed) +# define HMVMX_STOP_EXIT_DISPATCH_PROF() STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExitDispatch, ed) +#else +# define HMVMX_START_EXIT_DISPATCH_PROF() do { } while (0) +# define HMVMX_STOP_EXIT_DISPATCH_PROF() do { } while (0) +#endif + +/** Assert that preemption is disabled or covered by thread-context hooks. */ +#define HMVMX_ASSERT_PREEMPT_SAFE(a_pVCpu) Assert( VMMR0ThreadCtxHookIsEnabled((a_pVCpu)) \ + || !RTThreadPreemptIsEnabled(NIL_RTTHREAD)) + +/** Assert that we haven't migrated CPUs when thread-context hooks are not + * used. */ +#define HMVMX_ASSERT_CPU_SAFE(a_pVCpu) AssertMsg( VMMR0ThreadCtxHookIsEnabled((a_pVCpu)) \ + || (a_pVCpu)->hm.s.idEnteredCpu == RTMpCpuId(), \ + ("Illegal migration! Entered on CPU %u Current %u\n", \ + (a_pVCpu)->hm.s.idEnteredCpu, RTMpCpuId())) + +/** Asserts that the given CPUMCTX_EXTRN_XXX bits are present in the guest-CPU + * context. */ +#define HMVMX_CPUMCTX_ASSERT(a_pVCpu, a_fExtrnMbz) AssertMsg(!((a_pVCpu)->cpum.GstCtx.fExtrn & (a_fExtrnMbz)), \ + ("fExtrn=%#RX64 fExtrnMbz=%#RX64\n", \ + (a_pVCpu)->cpum.GstCtx.fExtrn, (a_fExtrnMbz))) + +/** Macro for importing guest state from the VMCS back into CPUMCTX (intended to be + * used only from VM-exit handlers). 
*/ +#define HMVMX_CPUMCTX_IMPORT_STATE(a_pVCpu, a_fWhat) (hmR0VmxImportGuestState((a_pVCpu), (a_fWhat))) + +/** Helper macro for VM-exit handlers called unexpectedly. */ +#define HMVMX_UNEXPECTED_EXIT_RET(a_pVCpu, a_pVmxTransient) \ + do { \ + (a_pVCpu)->hm.s.u32HMError = (a_pVmxTransient)->uExitReason; \ + return VERR_VMX_UNEXPECTED_EXIT; \ + } while (0) + +/** Macro for importing segment registers from the VMCS into the guest-CPU context (uses pVCpu from the expansion scope). */ +#ifdef VMX_USE_CACHED_VMCS_ACCESSES +# define HMVMX_IMPORT_SREG(Sel, a_pCtxSelReg) \ + hmR0VmxImportGuestSegmentReg(pVCpu, VMX_VMCS16_GUEST_##Sel##_SEL, VMX_VMCS32_GUEST_##Sel##_LIMIT, \ + VMX_VMCS_GUEST_##Sel##_BASE_CACHE_IDX, VMX_VMCS32_GUEST_##Sel##_ACCESS_RIGHTS, (a_pCtxSelReg)) +#else +# define HMVMX_IMPORT_SREG(Sel, a_pCtxSelReg) \ + hmR0VmxImportGuestSegmentReg(pVCpu, VMX_VMCS16_GUEST_##Sel##_SEL, VMX_VMCS32_GUEST_##Sel##_LIMIT, \ + VMX_VMCS_GUEST_##Sel##_BASE, VMX_VMCS32_GUEST_##Sel##_ACCESS_RIGHTS, (a_pCtxSelReg)) +#endif + +/** Macro for exporting segment registers to the VMCS from the guest-CPU context (uses pVCpu from the expansion scope). */ +#define HMVMX_EXPORT_SREG(Sel, a_pCtxSelReg) \ + hmR0VmxExportGuestSegmentReg(pVCpu, VMX_VMCS16_GUEST_##Sel##_SEL, VMX_VMCS32_GUEST_##Sel##_LIMIT, \ + VMX_VMCS_GUEST_##Sel##_BASE, VMX_VMCS32_GUEST_##Sel##_ACCESS_RIGHTS, (a_pCtxSelReg)) + +#ifdef VBOX_WITH_NESTED_HWVIRT_VMX +/** Macro that does the necessary privilege checks and intercepted VM-exits for + * guests that attempted to execute a VMX instruction. 
*/ +# define HMVMX_CHECK_EXIT_DUE_TO_VMX_INSTR(a_pVCpu, a_uExitReason) \ + do \ + { \ + VBOXSTRICTRC rcStrictTmp = hmR0VmxCheckExitDueToVmxInstr((a_pVCpu), (a_uExitReason)); \ + if (rcStrictTmp == VINF_SUCCESS) \ + { /* likely */ } \ + else if (rcStrictTmp == VINF_HM_PENDING_XCPT) \ + { \ + Assert((a_pVCpu)->hm.s.Event.fPending); \ + Log4Func(("Privilege checks failed -> %#x\n", VMX_ENTRY_INT_INFO_VECTOR((a_pVCpu)->hm.s.Event.u64IntInfo))); \ + return VINF_SUCCESS; \ + } \ + else \ + { \ + int rcTmp = VBOXSTRICTRC_VAL(rcStrictTmp); \ + AssertMsgFailedReturn(("Unexpected failure. rc=%Rrc", rcTmp), rcTmp); \ + } \ + } while (0) + +/** Macro that decodes a memory operand for an instruction VM-exit. */ +# define HMVMX_DECODE_MEM_OPERAND(a_pVCpu, a_uExitInstrInfo, a_uExitQual, a_enmMemAccess, a_pGCPtrEffAddr) \ + do \ + { \ + VBOXSTRICTRC rcStrictTmp = hmR0VmxDecodeMemOperand((a_pVCpu), (a_uExitInstrInfo), (a_uExitQual), (a_enmMemAccess), \ + (a_pGCPtrEffAddr)); \ + if (rcStrictTmp == VINF_SUCCESS) \ + { /* likely */ } \ + else if (rcStrictTmp == VINF_HM_PENDING_XCPT) \ + { \ + uint8_t const uXcptTmp = VMX_ENTRY_INT_INFO_VECTOR((a_pVCpu)->hm.s.Event.u64IntInfo); \ + Log4Func(("Memory operand decoding failed, raising xcpt %#x\n", uXcptTmp)); \ + NOREF(uXcptTmp); \ + return VINF_SUCCESS; \ + } \ + else \ + { \ + Log4Func(("hmR0VmxDecodeMemOperand failed. rc=%Rrc\n", VBOXSTRICTRC_VAL(rcStrictTmp))); \ + return rcStrictTmp; \ + } \ + } while (0) + +# ifdef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM +/** Macro that executes a VMX instruction in IEM. 
*/ +# define HMVMX_IEM_EXEC_VMX_INSTR_RET(a_pVCpu) \ + do { \ + int rc = HMVMX_CPUMCTX_IMPORT_STATE((a_pVCpu), HMVMX_CPUMCTX_EXTRN_ALL); \ + AssertRCReturn(rc, rc); \ + VBOXSTRICTRC rcStrict = IEMExecOne((a_pVCpu)); \ + if (rcStrict == VINF_SUCCESS) \ + ASMAtomicUoOrU64(&(a_pVCpu)->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); \ + else if (rcStrict == VINF_IEM_RAISED_XCPT) \ + { \ + rcStrict = VINF_SUCCESS; \ + ASMAtomicUoOrU64(&(a_pVCpu)->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); \ + } \ + return VBOXSTRICTRC_VAL(rcStrict); \ + } while (0) + +# endif /* VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM */ +#endif /* VBOX_WITH_NESTED_HWVIRT_VMX */ + + +/********************************************************************************************************************************* +* Structures and Typedefs * +*********************************************************************************************************************************/ +/** + * VMX transient state. + * + * A state structure for holding miscellaneous information across + * VMX non-root operation and restored after the transition. + */ +typedef struct VMXTRANSIENT +{ + /** The host's rflags/eflags. */ + RTCCUINTREG fEFlags; +#if HC_ARCH_BITS == 32 + uint32_t u32Alignment0; +#endif + /** The guest's TPR value used for TPR shadowing. */ + uint8_t u8GuestTpr; + /** Alignment. */ + uint8_t abAlignment0[7]; + + /** The basic VM-exit reason. */ + uint16_t uExitReason; + /** Alignment. */ + uint16_t u16Alignment0; + /** The VM-exit interruption error code. */ + uint32_t uExitIntErrorCode; + /** The VM-exit exit code qualification. */ + uint64_t uExitQual; + /** The Guest-linear address. */ + uint64_t uGuestLinearAddr; + + /** The VM-exit interruption-information field. */ + uint32_t uExitIntInfo; + /** The VM-exit instruction-length field. */ + uint32_t cbInstr; + /** The VM-exit instruction-information field. */ + VMXEXITINSTRINFO ExitInstrInfo; + /** Whether the VM-entry failed or not. 
*/ + bool fVMEntryFailed; + /** Alignment. */ + uint8_t abAlignment1[3]; + + /** The VM-entry interruption-information field. */ + uint32_t uEntryIntInfo; + /** The VM-entry exception error code field. */ + uint32_t uEntryXcptErrorCode; + /** The VM-entry instruction length field. */ + uint32_t cbEntryInstr; + + /** IDT-vectoring information field. */ + uint32_t uIdtVectoringInfo; + /** IDT-vectoring error code. */ + uint32_t uIdtVectoringErrorCode; + + /** Mask of currently read VMCS fields; HMVMX_READ_XXX. */ + uint32_t fVmcsFieldsRead; + + /** Whether the guest debug state was active at the time of VM-exit. */ + bool fWasGuestDebugStateActive; + /** Whether the hyper debug state was active at the time of VM-exit. */ + bool fWasHyperDebugStateActive; + /** Whether TSC-offsetting should be setup before VM-entry. */ + bool fUpdateTscOffsettingAndPreemptTimer; + /** Whether the VM-exit was caused by a page-fault during delivery of a + * contributory exception or a page-fault. */ + bool fVectoringDoublePF; + /** Whether the VM-exit was caused by a page-fault during delivery of an + * external interrupt or NMI. */ + bool fVectoringPF; +} VMXTRANSIENT; +AssertCompileMemberAlignment(VMXTRANSIENT, uExitReason, sizeof(uint64_t)); +AssertCompileMemberAlignment(VMXTRANSIENT, uExitIntInfo, sizeof(uint64_t)); +AssertCompileMemberAlignment(VMXTRANSIENT, uEntryIntInfo, sizeof(uint64_t)); +AssertCompileMemberAlignment(VMXTRANSIENT, fWasGuestDebugStateActive, sizeof(uint64_t)); +AssertCompileMemberSize(VMXTRANSIENT, ExitInstrInfo, sizeof(uint32_t)); +/** Pointer to VMX transient state. */ +typedef VMXTRANSIENT *PVMXTRANSIENT; + +/** + * Memory operand read or write access. + */ +typedef enum VMXMEMACCESS +{ + VMXMEMACCESS_READ = 0, + VMXMEMACCESS_WRITE = 1 +} VMXMEMACCESS; + +/** + * VMX VM-exit handler. + * + * @returns Strict VBox status code (i.e. informational status codes too). + * @param pVCpu The cross context virtual CPU structure. 
+ * @param pVmxTransient Pointer to the VMX-transient structure. + */ +#ifndef HMVMX_USE_FUNCTION_TABLE +typedef VBOXSTRICTRC FNVMXEXITHANDLER(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient); +#else +typedef DECLCALLBACK(VBOXSTRICTRC) FNVMXEXITHANDLER(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient); +/** Pointer to VM-exit handler. */ +typedef FNVMXEXITHANDLER *PFNVMXEXITHANDLER; +#endif + +/** + * VMX VM-exit handler, non-strict status code. + * + * This is generally the same as FNVMXEXITHANDLER, the NSRC bit is just FYI. + * + * @returns VBox status code, no informational status code returned. + * @param pVCpu The cross context virtual CPU structure. + * @param pVmxTransient Pointer to the VMX-transient structure. + * + * @remarks This is not used on anything returning VERR_EM_INTERPRETER as the + * use of that status code will be replaced with VINF_EM_SOMETHING + * later when switching over to IEM. + */ +#ifndef HMVMX_USE_FUNCTION_TABLE +typedef int FNVMXEXITHANDLERNSRC(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient); +#else +typedef FNVMXEXITHANDLER FNVMXEXITHANDLERNSRC; +#endif + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +static void hmR0VmxFlushEpt(PVMCPU pVCpu, VMXTLBFLUSHEPT enmTlbFlush); +static void hmR0VmxFlushVpid(PVMCPU pVCpu, VMXTLBFLUSHVPID enmTlbFlush, RTGCPTR GCPtr); +static void hmR0VmxClearIntNmiWindowsVmcs(PVMCPU pVCpu); +static int hmR0VmxImportGuestState(PVMCPU pVCpu, uint64_t fWhat); +static VBOXSTRICTRC hmR0VmxInjectEventVmcs(PVMCPU pVCpu, uint64_t u64IntInfo, uint32_t cbInstr, uint32_t u32ErrCode, + RTGCUINTREG GCPtrFaultAddress, bool fStepping, uint32_t *pfIntrState); +#if HC_ARCH_BITS == 32 +static int hmR0VmxInitVmcsReadCache(PVMCPU pVCpu); +#endif +#ifndef HMVMX_USE_FUNCTION_TABLE +DECLINLINE(VBOXSTRICTRC) 
hmR0VmxHandleExit(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient, uint32_t rcReason); +# define HMVMX_EXIT_DECL DECLINLINE(VBOXSTRICTRC) +# define HMVMX_EXIT_NSRC_DECL DECLINLINE(int) +#else +# define HMVMX_EXIT_DECL static DECLCALLBACK(VBOXSTRICTRC) +# define HMVMX_EXIT_NSRC_DECL HMVMX_EXIT_DECL +#endif + +/** @name VM-exit handlers. + * @{ + */ +static FNVMXEXITHANDLER hmR0VmxExitXcptOrNmi; +static FNVMXEXITHANDLER hmR0VmxExitExtInt; +static FNVMXEXITHANDLER hmR0VmxExitTripleFault; +static FNVMXEXITHANDLERNSRC hmR0VmxExitInitSignal; +static FNVMXEXITHANDLERNSRC hmR0VmxExitSipi; +static FNVMXEXITHANDLERNSRC hmR0VmxExitIoSmi; +static FNVMXEXITHANDLERNSRC hmR0VmxExitSmi; +static FNVMXEXITHANDLERNSRC hmR0VmxExitIntWindow; +static FNVMXEXITHANDLERNSRC hmR0VmxExitNmiWindow; +static FNVMXEXITHANDLER hmR0VmxExitTaskSwitch; +static FNVMXEXITHANDLER hmR0VmxExitCpuid; +static FNVMXEXITHANDLER hmR0VmxExitGetsec; +static FNVMXEXITHANDLER hmR0VmxExitHlt; +static FNVMXEXITHANDLERNSRC hmR0VmxExitInvd; +static FNVMXEXITHANDLER hmR0VmxExitInvlpg; +static FNVMXEXITHANDLER hmR0VmxExitRdpmc; +static FNVMXEXITHANDLER hmR0VmxExitVmcall; +#ifdef VBOX_WITH_NESTED_HWVIRT_VMX +static FNVMXEXITHANDLER hmR0VmxExitVmclear; +static FNVMXEXITHANDLER hmR0VmxExitVmlaunch; +static FNVMXEXITHANDLER hmR0VmxExitVmptrld; +static FNVMXEXITHANDLER hmR0VmxExitVmptrst; +static FNVMXEXITHANDLER hmR0VmxExitVmread; +static FNVMXEXITHANDLER hmR0VmxExitVmresume; +static FNVMXEXITHANDLER hmR0VmxExitVmwrite; +static FNVMXEXITHANDLER hmR0VmxExitVmxoff; +static FNVMXEXITHANDLER hmR0VmxExitVmxon; +#endif +static FNVMXEXITHANDLER hmR0VmxExitRdtsc; +static FNVMXEXITHANDLERNSRC hmR0VmxExitRsm; +static FNVMXEXITHANDLERNSRC hmR0VmxExitSetPendingXcptUD; +static FNVMXEXITHANDLER hmR0VmxExitMovCRx; +static FNVMXEXITHANDLER hmR0VmxExitMovDRx; +static FNVMXEXITHANDLER hmR0VmxExitIoInstr; +static FNVMXEXITHANDLER hmR0VmxExitRdmsr; +static FNVMXEXITHANDLER hmR0VmxExitWrmsr; +static FNVMXEXITHANDLERNSRC 
hmR0VmxExitErrInvalidGuestState; +static FNVMXEXITHANDLERNSRC hmR0VmxExitErrMsrLoad; +static FNVMXEXITHANDLERNSRC hmR0VmxExitErrUndefined; +static FNVMXEXITHANDLER hmR0VmxExitMwait; +static FNVMXEXITHANDLER hmR0VmxExitMtf; +static FNVMXEXITHANDLER hmR0VmxExitMonitor; +static FNVMXEXITHANDLER hmR0VmxExitPause; +static FNVMXEXITHANDLERNSRC hmR0VmxExitErrMachineCheck; +static FNVMXEXITHANDLERNSRC hmR0VmxExitTprBelowThreshold; +static FNVMXEXITHANDLER hmR0VmxExitApicAccess; +static FNVMXEXITHANDLER hmR0VmxExitXdtrAccess; +static FNVMXEXITHANDLER hmR0VmxExitEptViolation; +static FNVMXEXITHANDLER hmR0VmxExitEptMisconfig; +static FNVMXEXITHANDLER hmR0VmxExitRdtscp; +static FNVMXEXITHANDLER hmR0VmxExitPreemptTimer; +static FNVMXEXITHANDLERNSRC hmR0VmxExitWbinvd; +static FNVMXEXITHANDLER hmR0VmxExitXsetbv; +static FNVMXEXITHANDLER hmR0VmxExitRdrand; +static FNVMXEXITHANDLER hmR0VmxExitInvpcid; +/** @} */ + +static int hmR0VmxExitXcptPF(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient); +static int hmR0VmxExitXcptMF(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient); +static int hmR0VmxExitXcptDB(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient); +static int hmR0VmxExitXcptBP(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient); +static int hmR0VmxExitXcptGP(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient); +static int hmR0VmxExitXcptAC(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient); +static int hmR0VmxExitXcptGeneric(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient); +static uint32_t hmR0VmxCheckGuestState(PVMCPU pVCpu); + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +#ifdef HMVMX_USE_FUNCTION_TABLE + +/** + * VMX_EXIT dispatch table. 
/**
 * VMX_EXIT dispatch table.
 *
 * One handler per VM-exit reason number (0..VMX_EXIT_MAX). The number leading
 * each entry's comment is the array index of that entry, and the name is the
 * exit reason assigned to that number.
 */
static const PFNVMXEXITHANDLER g_apfnVMExitHandlers[VMX_EXIT_MAX + 1] =
{
 /* 00  VMX_EXIT_XCPT_OR_NMI             */  hmR0VmxExitXcptOrNmi,
 /* 01  VMX_EXIT_EXT_INT                 */  hmR0VmxExitExtInt,
 /* 02  VMX_EXIT_TRIPLE_FAULT            */  hmR0VmxExitTripleFault,
 /* 03  VMX_EXIT_INIT_SIGNAL             */  hmR0VmxExitInitSignal,
 /* 04  VMX_EXIT_SIPI                    */  hmR0VmxExitSipi,
 /* 05  VMX_EXIT_IO_SMI                  */  hmR0VmxExitIoSmi,
 /* 06  VMX_EXIT_SMI                     */  hmR0VmxExitSmi,
 /* 07  VMX_EXIT_INT_WINDOW              */  hmR0VmxExitIntWindow,
 /* 08  VMX_EXIT_NMI_WINDOW              */  hmR0VmxExitNmiWindow,
 /* 09  VMX_EXIT_TASK_SWITCH             */  hmR0VmxExitTaskSwitch,
 /* 10  VMX_EXIT_CPUID                   */  hmR0VmxExitCpuid,
 /* 11  VMX_EXIT_GETSEC                  */  hmR0VmxExitGetsec,
 /* 12  VMX_EXIT_HLT                     */  hmR0VmxExitHlt,
 /* 13  VMX_EXIT_INVD                    */  hmR0VmxExitInvd,
 /* 14  VMX_EXIT_INVLPG                  */  hmR0VmxExitInvlpg,
 /* 15  VMX_EXIT_RDPMC                   */  hmR0VmxExitRdpmc,
 /* 16  VMX_EXIT_RDTSC                   */  hmR0VmxExitRdtsc,
 /* 17  VMX_EXIT_RSM                     */  hmR0VmxExitRsm,
 /* 18  VMX_EXIT_VMCALL                  */  hmR0VmxExitVmcall,
#ifdef VBOX_WITH_NESTED_HWVIRT_VMX
 /* 19  VMX_EXIT_VMCLEAR                 */  hmR0VmxExitVmclear,
 /* 20  VMX_EXIT_VMLAUNCH                */  hmR0VmxExitVmlaunch,
 /* 21  VMX_EXIT_VMPTRLD                 */  hmR0VmxExitVmptrld,
 /* 22  VMX_EXIT_VMPTRST                 */  hmR0VmxExitVmptrst,
 /* 23  VMX_EXIT_VMREAD                  */  hmR0VmxExitVmread,
 /* 24  VMX_EXIT_VMRESUME                */  hmR0VmxExitVmresume,
 /* 25  VMX_EXIT_VMWRITE                 */  hmR0VmxExitVmwrite,
 /* 26  VMX_EXIT_VMXOFF                  */  hmR0VmxExitVmxoff,
 /* 27  VMX_EXIT_VMXON                   */  hmR0VmxExitVmxon,
#else
 /* Without nested VMX support every VMX instruction exit goes to the pending-#UD handler. */
 /* 19  VMX_EXIT_VMCLEAR                 */  hmR0VmxExitSetPendingXcptUD,
 /* 20  VMX_EXIT_VMLAUNCH                */  hmR0VmxExitSetPendingXcptUD,
 /* 21  VMX_EXIT_VMPTRLD                 */  hmR0VmxExitSetPendingXcptUD,
 /* 22  VMX_EXIT_VMPTRST                 */  hmR0VmxExitSetPendingXcptUD,
 /* 23  VMX_EXIT_VMREAD                  */  hmR0VmxExitSetPendingXcptUD,
 /* 24  VMX_EXIT_VMRESUME                */  hmR0VmxExitSetPendingXcptUD,
 /* 25  VMX_EXIT_VMWRITE                 */  hmR0VmxExitSetPendingXcptUD,
 /* 26  VMX_EXIT_VMXOFF                  */  hmR0VmxExitSetPendingXcptUD,
 /* 27  VMX_EXIT_VMXON                   */  hmR0VmxExitSetPendingXcptUD,
#endif
 /* 28  VMX_EXIT_MOV_CRX                 */  hmR0VmxExitMovCRx,
 /* 29  VMX_EXIT_MOV_DRX                 */  hmR0VmxExitMovDRx,
 /* 30  VMX_EXIT_IO_INSTR                */  hmR0VmxExitIoInstr,
 /* 31  VMX_EXIT_RDMSR                   */  hmR0VmxExitRdmsr,
 /* 32  VMX_EXIT_WRMSR                   */  hmR0VmxExitWrmsr,
 /* 33  VMX_EXIT_ERR_INVALID_GUEST_STATE */  hmR0VmxExitErrInvalidGuestState,
 /* 34  VMX_EXIT_ERR_MSR_LOAD            */  hmR0VmxExitErrMsrLoad,
 /* 35  UNDEFINED                        */  hmR0VmxExitErrUndefined,
 /* 36  VMX_EXIT_MWAIT                   */  hmR0VmxExitMwait,
 /* 37  VMX_EXIT_MTF                     */  hmR0VmxExitMtf,
 /* 38  UNDEFINED                        */  hmR0VmxExitErrUndefined,
 /* 39  VMX_EXIT_MONITOR                 */  hmR0VmxExitMonitor,
 /* 40  VMX_EXIT_PAUSE                   */  hmR0VmxExitPause,
 /* 41  VMX_EXIT_ERR_MACHINE_CHECK       */  hmR0VmxExitErrMachineCheck,
 /* 42  UNDEFINED                        */  hmR0VmxExitErrUndefined,
 /* 43  VMX_EXIT_TPR_BELOW_THRESHOLD     */  hmR0VmxExitTprBelowThreshold,
 /* 44  VMX_EXIT_APIC_ACCESS             */  hmR0VmxExitApicAccess,
 /* 45  UNDEFINED                        */  hmR0VmxExitErrUndefined,
 /* 46  VMX_EXIT_GDTR_IDTR_ACCESS        */  hmR0VmxExitXdtrAccess,
 /* 47  VMX_EXIT_LDTR_TR_ACCESS          */  hmR0VmxExitXdtrAccess,
 /* 48  VMX_EXIT_EPT_VIOLATION           */  hmR0VmxExitEptViolation,
 /* 49  VMX_EXIT_EPT_MISCONFIG           */  hmR0VmxExitEptMisconfig,
 /* 50  VMX_EXIT_INVEPT                  */  hmR0VmxExitSetPendingXcptUD,
 /* 51  VMX_EXIT_RDTSCP                  */  hmR0VmxExitRdtscp,
 /* 52  VMX_EXIT_PREEMPT_TIMER           */  hmR0VmxExitPreemptTimer,
 /* 53  VMX_EXIT_INVVPID                 */  hmR0VmxExitSetPendingXcptUD,
 /* 54  VMX_EXIT_WBINVD                  */  hmR0VmxExitWbinvd,
 /* 55  VMX_EXIT_XSETBV                  */  hmR0VmxExitXsetbv,
 /* 56  VMX_EXIT_APIC_WRITE              */  hmR0VmxExitErrUndefined,
 /* 57  VMX_EXIT_RDRAND                  */  hmR0VmxExitRdrand,
 /* 58  VMX_EXIT_INVPCID                 */  hmR0VmxExitInvpcid,
 /* 59  VMX_EXIT_VMFUNC                  */  hmR0VmxExitSetPendingXcptUD,
 /* 60  VMX_EXIT_ENCLS                   */  hmR0VmxExitErrUndefined,
 /* 61  VMX_EXIT_RDSEED                  */  hmR0VmxExitErrUndefined, /* only spurious exits, so undefined */
 /* 62  VMX_EXIT_PML_FULL                */  hmR0VmxExitErrUndefined,
 /* 63  VMX_EXIT_XSAVES                  */  hmR0VmxExitSetPendingXcptUD,
 /* 64  VMX_EXIT_XRSTORS                 */  hmR0VmxExitSetPendingXcptUD,
};
#if defined(VBOX_STRICT) && defined(LOG_ENABLED)
/**
 * Descriptions of the VM-instruction error numbers.
 *
 * The number in each entry's comment is the array index; the table has
 * HMVMX_INSTR_ERROR_MAX + 1 slots. Only compiled into strict builds that also
 * have logging enabled.
 */
static const char * const g_apszVmxInstrErrors[HMVMX_INSTR_ERROR_MAX + 1] =
{
    /*  0 */ "(Not Used)",
    /*  1 */ "VMCALL executed in VMX root operation.",
    /*  2 */ "VMCLEAR with invalid physical address.",
    /*  3 */ "VMCLEAR with VMXON pointer.",
    /*  4 */ "VMLAUNCH with non-clear VMCS.",
    /*  5 */ "VMRESUME with non-launched VMCS.",
    /*  6 */ "VMRESUME after VMXOFF",
    /*  7 */ "VM-entry with invalid control fields.",
    /*  8 */ "VM-entry with invalid host state fields.",
    /*  9 */ "VMPTRLD with invalid physical address.",
    /* 10 */ "VMPTRLD with VMXON pointer.",
    /* 11 */ "VMPTRLD with incorrect revision identifier.",
    /* 12 */ "VMREAD/VMWRITE from/to unsupported VMCS component.",
    /* 13 */ "VMWRITE to read-only VMCS component.",
    /* 14 */ "(Not Used)",
    /* 15 */ "VMXON executed in VMX root operation.",
    /* 16 */ "VM-entry with invalid executive-VMCS pointer.",
    /* 17 */ "VM-entry with non-launched executive VMCS.",   /* "executive" (not "executing"), matching entries 16, 18 and 25. */
    /* 18 */ "VM-entry with executive-VMCS pointer not VMXON pointer.",
    /* 19 */ "VMCALL with non-clear VMCS.",
    /* 20 */ "VMCALL with invalid VM-exit control fields.",
    /* 21 */ "(Not Used)",
    /* 22 */ "VMCALL with incorrect MSEG revision identifier.",
    /* 23 */ "VMXOFF under dual monitor treatment of SMIs and SMM.",
    /* 24 */ "VMCALL with invalid SMM-monitor features.",
    /* 25 */ "VM-entry with invalid VM-execution control fields in executive VMCS.",
    /* 26 */ "VM-entry with events blocked by MOV SS.",
    /* 27 */ "(Not Used)",
    /* 28 */ "Invalid operand to INVEPT/INVVPID."
};
#endif /* VBOX_STRICT && LOG_ENABLED */
+ */ +static void hmR0VmxUpdateErrorRecord(PVMCPU pVCpu, int rc) +{ + if ( rc == VERR_VMX_INVALID_VMCS_FIELD + || rc == VERR_VMX_UNABLE_TO_START_VM) + { + AssertPtrReturnVoid(pVCpu); + VMXReadVmcs32(VMX_VMCS32_RO_VM_INSTR_ERROR, &pVCpu->hm.s.vmx.LastError.u32InstrError); + } + pVCpu->CTX_SUFF(pVM)->hm.s.rcInit = rc; +} + + +/** + * Reads the VM-entry interruption-information field from the VMCS into the VMX + * transient structure. + * + * @returns VBox status code. + * @param pVmxTransient Pointer to the VMX transient structure. + * + * @remarks No-long-jump zone!!! + */ +DECLINLINE(int) hmR0VmxReadEntryIntInfoVmcs(PVMXTRANSIENT pVmxTransient) +{ + int rc = VMXReadVmcs32(VMX_VMCS32_CTRL_ENTRY_INTERRUPTION_INFO, &pVmxTransient->uEntryIntInfo); + AssertRCReturn(rc, rc); + return VINF_SUCCESS; +} + +#ifdef VBOX_STRICT +/** + * Reads the VM-entry exception error code field from the VMCS into + * the VMX transient structure. + * + * @returns VBox status code. + * @param pVmxTransient Pointer to the VMX transient structure. + * + * @remarks No-long-jump zone!!! + */ +DECLINLINE(int) hmR0VmxReadEntryXcptErrorCodeVmcs(PVMXTRANSIENT pVmxTransient) +{ + int rc = VMXReadVmcs32(VMX_VMCS32_CTRL_ENTRY_EXCEPTION_ERRCODE, &pVmxTransient->uEntryXcptErrorCode); + AssertRCReturn(rc, rc); + return VINF_SUCCESS; +} + + +/** + * Reads the VM-entry exception error code field from the VMCS into + * the VMX transient structure. + * + * @returns VBox status code. + * @param pVmxTransient Pointer to the VMX transient structure. + * + * @remarks No-long-jump zone!!! + */ +DECLINLINE(int) hmR0VmxReadEntryInstrLenVmcs(PVMXTRANSIENT pVmxTransient) +{ + int rc = VMXReadVmcs32(VMX_VMCS32_CTRL_ENTRY_INSTR_LENGTH, &pVmxTransient->cbEntryInstr); + AssertRCReturn(rc, rc); + return VINF_SUCCESS; +} +#endif /* VBOX_STRICT */ + + +/** + * Reads the VM-exit interruption-information field from the VMCS into the VMX + * transient structure. + * + * @returns VBox status code. 
+ * @param pVmxTransient Pointer to the VMX transient structure. + */ +DECLINLINE(int) hmR0VmxReadExitIntInfoVmcs(PVMXTRANSIENT pVmxTransient) +{ + if (!(pVmxTransient->fVmcsFieldsRead & HMVMX_READ_EXIT_INTERRUPTION_INFO)) + { + int rc = VMXReadVmcs32(VMX_VMCS32_RO_EXIT_INTERRUPTION_INFO, &pVmxTransient->uExitIntInfo); + AssertRCReturn(rc,rc); + pVmxTransient->fVmcsFieldsRead |= HMVMX_READ_EXIT_INTERRUPTION_INFO; + } + return VINF_SUCCESS; +} + + +/** + * Reads the VM-exit interruption error code from the VMCS into the VMX + * transient structure. + * + * @returns VBox status code. + * @param pVmxTransient Pointer to the VMX transient structure. + */ +DECLINLINE(int) hmR0VmxReadExitIntErrorCodeVmcs(PVMXTRANSIENT pVmxTransient) +{ + if (!(pVmxTransient->fVmcsFieldsRead & HMVMX_READ_EXIT_INTERRUPTION_ERROR_CODE)) + { + int rc = VMXReadVmcs32(VMX_VMCS32_RO_EXIT_INTERRUPTION_ERROR_CODE, &pVmxTransient->uExitIntErrorCode); + AssertRCReturn(rc, rc); + pVmxTransient->fVmcsFieldsRead |= HMVMX_READ_EXIT_INTERRUPTION_ERROR_CODE; + } + return VINF_SUCCESS; +} + + +/** + * Reads the VM-exit instruction length field from the VMCS into the VMX + * transient structure. + * + * @returns VBox status code. + * @param pVmxTransient Pointer to the VMX transient structure. + */ +DECLINLINE(int) hmR0VmxReadExitInstrLenVmcs(PVMXTRANSIENT pVmxTransient) +{ + if (!(pVmxTransient->fVmcsFieldsRead & HMVMX_READ_EXIT_INSTR_LEN)) + { + int rc = VMXReadVmcs32(VMX_VMCS32_RO_EXIT_INSTR_LENGTH, &pVmxTransient->cbInstr); + AssertRCReturn(rc, rc); + pVmxTransient->fVmcsFieldsRead |= HMVMX_READ_EXIT_INSTR_LEN; + } + return VINF_SUCCESS; +} + + +/** + * Reads the VM-exit instruction-information field from the VMCS into + * the VMX transient structure. + * + * @returns VBox status code. + * @param pVmxTransient Pointer to the VMX transient structure. 
+ */ +DECLINLINE(int) hmR0VmxReadExitInstrInfoVmcs(PVMXTRANSIENT pVmxTransient) +{ + if (!(pVmxTransient->fVmcsFieldsRead & HMVMX_READ_EXIT_INSTR_INFO)) + { + int rc = VMXReadVmcs32(VMX_VMCS32_RO_EXIT_INSTR_INFO, &pVmxTransient->ExitInstrInfo.u); + AssertRCReturn(rc, rc); + pVmxTransient->fVmcsFieldsRead |= HMVMX_READ_EXIT_INSTR_INFO; + } + return VINF_SUCCESS; +} + + +/** + * Reads the VM-exit Qualification from the VMCS into the VMX transient structure. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure of the + * calling EMT. (Required for the VMCS cache case.) + * @param pVmxTransient Pointer to the VMX transient structure. + */ +DECLINLINE(int) hmR0VmxReadExitQualVmcs(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + if (!(pVmxTransient->fVmcsFieldsRead & HMVMX_READ_EXIT_QUALIFICATION)) + { + int rc = VMXReadVmcsGstN(VMX_VMCS_RO_EXIT_QUALIFICATION, &pVmxTransient->uExitQual); NOREF(pVCpu); + AssertRCReturn(rc, rc); + pVmxTransient->fVmcsFieldsRead |= HMVMX_READ_EXIT_QUALIFICATION; + } + return VINF_SUCCESS; +} + + +/** + * Reads the Guest-linear address from the VMCS into the VMX transient structure. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure of the + * calling EMT. (Required for the VMCS cache case.) + * @param pVmxTransient Pointer to the VMX transient structure. + */ +DECLINLINE(int) hmR0VmxReadGuestLinearAddrVmcs(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + if (!(pVmxTransient->fVmcsFieldsRead & HMVMX_READ_GUEST_LINEAR_ADDR)) + { + int rc = VMXReadVmcsGstN(VMX_VMCS_RO_GUEST_LINEAR_ADDR, &pVmxTransient->uGuestLinearAddr); NOREF(pVCpu); + AssertRCReturn(rc, rc); + pVmxTransient->fVmcsFieldsRead |= HMVMX_READ_GUEST_LINEAR_ADDR; + } + return VINF_SUCCESS; +} + + +/** + * Reads the IDT-vectoring information field from the VMCS into the VMX + * transient structure. + * + * @returns VBox status code. + * @param pVmxTransient Pointer to the VMX transient structure. 
+ * + * @remarks No-long-jump zone!!! + */ +DECLINLINE(int) hmR0VmxReadIdtVectoringInfoVmcs(PVMXTRANSIENT pVmxTransient) +{ + if (!(pVmxTransient->fVmcsFieldsRead & HMVMX_READ_IDT_VECTORING_INFO)) + { + int rc = VMXReadVmcs32(VMX_VMCS32_RO_IDT_VECTORING_INFO, &pVmxTransient->uIdtVectoringInfo); + AssertRCReturn(rc, rc); + pVmxTransient->fVmcsFieldsRead |= HMVMX_READ_IDT_VECTORING_INFO; + } + return VINF_SUCCESS; +} + + +/** + * Reads the IDT-vectoring error code from the VMCS into the VMX + * transient structure. + * + * @returns VBox status code. + * @param pVmxTransient Pointer to the VMX transient structure. + */ +DECLINLINE(int) hmR0VmxReadIdtVectoringErrorCodeVmcs(PVMXTRANSIENT pVmxTransient) +{ + if (!(pVmxTransient->fVmcsFieldsRead & HMVMX_READ_IDT_VECTORING_ERROR_CODE)) + { + int rc = VMXReadVmcs32(VMX_VMCS32_RO_IDT_VECTORING_ERROR_CODE, &pVmxTransient->uIdtVectoringErrorCode); + AssertRCReturn(rc, rc); + pVmxTransient->fVmcsFieldsRead |= HMVMX_READ_IDT_VECTORING_ERROR_CODE; + } + return VINF_SUCCESS; +} + + +/** + * Enters VMX root mode operation on the current CPU. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. Can be + * NULL, after a resume. + * @param HCPhysCpuPage Physical address of the VMXON region. + * @param pvCpuPage Pointer to the VMXON region. + */ +static int hmR0VmxEnterRootMode(PVM pVM, RTHCPHYS HCPhysCpuPage, void *pvCpuPage) +{ + Assert(HCPhysCpuPage && HCPhysCpuPage != NIL_RTHCPHYS); + Assert(RT_ALIGN_T(HCPhysCpuPage, _4K, RTHCPHYS) == HCPhysCpuPage); + Assert(pvCpuPage); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + if (pVM) + { + /* Write the VMCS revision dword to the VMXON region. */ + *(uint32_t *)pvCpuPage = RT_BF_GET(pVM->hm.s.vmx.Msrs.u64Basic, VMX_BF_BASIC_VMCS_ID); + } + + /* Paranoid: Disable interrupts as, in theory, interrupt handlers might mess with CR4. */ + RTCCUINTREG fEFlags = ASMIntDisableFlags(); + + /* Enable the VMX bit in CR4 if necessary. 
*/ + RTCCUINTREG uOldCr4 = SUPR0ChangeCR4(X86_CR4_VMXE, RTCCUINTREG_MAX); + + /* Enter VMX root mode. */ + int rc = VMXEnable(HCPhysCpuPage); + if (RT_FAILURE(rc)) + { + if (!(uOldCr4 & X86_CR4_VMXE)) + SUPR0ChangeCR4(0, ~X86_CR4_VMXE); + + if (pVM) + pVM->hm.s.vmx.HCPhysVmxEnableError = HCPhysCpuPage; + } + + /* Restore interrupts. */ + ASMSetFlags(fEFlags); + return rc; +} + + +/** + * Exits VMX root mode operation on the current CPU. + * + * @returns VBox status code. + */ +static int hmR0VmxLeaveRootMode(void) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + /* Paranoid: Disable interrupts as, in theory, interrupts handlers might mess with CR4. */ + RTCCUINTREG fEFlags = ASMIntDisableFlags(); + + /* If we're for some reason not in VMX root mode, then don't leave it. */ + RTCCUINTREG uHostCR4 = ASMGetCR4(); + + int rc; + if (uHostCR4 & X86_CR4_VMXE) + { + /* Exit VMX root mode and clear the VMX bit in CR4. */ + VMXDisable(); + SUPR0ChangeCR4(0, ~X86_CR4_VMXE); + rc = VINF_SUCCESS; + } + else + rc = VERR_VMX_NOT_IN_VMX_ROOT_MODE; + + /* Restore interrupts. */ + ASMSetFlags(fEFlags); + return rc; +} + + +/** + * Allocates and maps one physically contiguous page. The allocated page is + * zero'd out. (Used by various VT-x structures). + * + * @returns IPRT status code. + * @param pMemObj Pointer to the ring-0 memory object. + * @param ppVirt Where to store the virtual address of the + * allocation. + * @param pHCPhys Where to store the physical address of the + * allocation. 
+ */ +static int hmR0VmxPageAllocZ(PRTR0MEMOBJ pMemObj, PRTR0PTR ppVirt, PRTHCPHYS pHCPhys) +{ + AssertPtrReturn(pMemObj, VERR_INVALID_PARAMETER); + AssertPtrReturn(ppVirt, VERR_INVALID_PARAMETER); + AssertPtrReturn(pHCPhys, VERR_INVALID_PARAMETER); + + int rc = RTR0MemObjAllocCont(pMemObj, PAGE_SIZE, false /* fExecutable */); + if (RT_FAILURE(rc)) + return rc; + *ppVirt = RTR0MemObjAddress(*pMemObj); + *pHCPhys = RTR0MemObjGetPagePhysAddr(*pMemObj, 0 /* iPage */); + ASMMemZero32(*ppVirt, PAGE_SIZE); + return VINF_SUCCESS; +} + + +/** + * Frees and unmaps an allocated physical page. + * + * @param pMemObj Pointer to the ring-0 memory object. + * @param ppVirt Where to re-initialize the virtual address of + * allocation as 0. + * @param pHCPhys Where to re-initialize the physical address of the + * allocation as 0. + */ +static void hmR0VmxPageFree(PRTR0MEMOBJ pMemObj, PRTR0PTR ppVirt, PRTHCPHYS pHCPhys) +{ + AssertPtr(pMemObj); + AssertPtr(ppVirt); + AssertPtr(pHCPhys); + if (*pMemObj != NIL_RTR0MEMOBJ) + { + int rc = RTR0MemObjFree(*pMemObj, true /* fFreeMappings */); + AssertRC(rc); + *pMemObj = NIL_RTR0MEMOBJ; + *ppVirt = 0; + *pHCPhys = 0; + } +} + + +/** + * Worker function to free VT-x related structures. + * + * @returns IPRT status code. + * @param pVM The cross context VM structure. 
+ */ +static void hmR0VmxStructsFree(PVM pVM) +{ + for (VMCPUID i = 0; i < pVM->cCpus; i++) + { + PVMCPU pVCpu = &pVM->aCpus[i]; + AssertPtr(pVCpu); + + hmR0VmxPageFree(&pVCpu->hm.s.vmx.hMemObjHostMsr, &pVCpu->hm.s.vmx.pvHostMsr, &pVCpu->hm.s.vmx.HCPhysHostMsr); + hmR0VmxPageFree(&pVCpu->hm.s.vmx.hMemObjGuestMsr, &pVCpu->hm.s.vmx.pvGuestMsr, &pVCpu->hm.s.vmx.HCPhysGuestMsr); + + if (pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed1 & VMX_PROC_CTLS_USE_MSR_BITMAPS) + hmR0VmxPageFree(&pVCpu->hm.s.vmx.hMemObjMsrBitmap, &pVCpu->hm.s.vmx.pvMsrBitmap, &pVCpu->hm.s.vmx.HCPhysMsrBitmap); + + hmR0VmxPageFree(&pVCpu->hm.s.vmx.hMemObjVmcs, &pVCpu->hm.s.vmx.pvVmcs, &pVCpu->hm.s.vmx.HCPhysVmcs); + } + + hmR0VmxPageFree(&pVM->hm.s.vmx.hMemObjApicAccess, (PRTR0PTR)&pVM->hm.s.vmx.pbApicAccess, &pVM->hm.s.vmx.HCPhysApicAccess); +#ifdef VBOX_WITH_CRASHDUMP_MAGIC + hmR0VmxPageFree(&pVM->hm.s.vmx.hMemObjScratch, &pVM->hm.s.vmx.pbScratch, &pVM->hm.s.vmx.HCPhysScratch); +#endif +} + + +/** + * Worker function to allocate VT-x related VM structures. + * + * @returns IPRT status code. + * @param pVM The cross context VM structure. + */ +static int hmR0VmxStructsAlloc(PVM pVM) +{ + /* + * Initialize members up-front so we can cleanup properly on allocation failure. 
+ */ +#define VMXLOCAL_INIT_VM_MEMOBJ(a_Name, a_VirtPrefix) \ + pVM->hm.s.vmx.hMemObj##a_Name = NIL_RTR0MEMOBJ; \ + pVM->hm.s.vmx.a_VirtPrefix##a_Name = 0; \ + pVM->hm.s.vmx.HCPhys##a_Name = 0; + +#define VMXLOCAL_INIT_VMCPU_MEMOBJ(a_Name, a_VirtPrefix) \ + pVCpu->hm.s.vmx.hMemObj##a_Name = NIL_RTR0MEMOBJ; \ + pVCpu->hm.s.vmx.a_VirtPrefix##a_Name = 0; \ + pVCpu->hm.s.vmx.HCPhys##a_Name = 0; + +#ifdef VBOX_WITH_CRASHDUMP_MAGIC + VMXLOCAL_INIT_VM_MEMOBJ(Scratch, pv); +#endif + VMXLOCAL_INIT_VM_MEMOBJ(ApicAccess, pb); + + AssertCompile(sizeof(VMCPUID) == sizeof(pVM->cCpus)); + for (VMCPUID i = 0; i < pVM->cCpus; i++) + { + PVMCPU pVCpu = &pVM->aCpus[i]; + VMXLOCAL_INIT_VMCPU_MEMOBJ(Vmcs, pv); + VMXLOCAL_INIT_VMCPU_MEMOBJ(MsrBitmap, pv); + VMXLOCAL_INIT_VMCPU_MEMOBJ(GuestMsr, pv); + VMXLOCAL_INIT_VMCPU_MEMOBJ(HostMsr, pv); + } +#undef VMXLOCAL_INIT_VMCPU_MEMOBJ +#undef VMXLOCAL_INIT_VM_MEMOBJ + + /* The VMCS size cannot be more than 4096 bytes. See Intel spec. Appendix A.1 "Basic VMX Information". */ + AssertReturnStmt(RT_BF_GET(pVM->hm.s.vmx.Msrs.u64Basic, VMX_BF_BASIC_VMCS_SIZE) <= PAGE_SIZE, + (&pVM->aCpus[0])->hm.s.u32HMError = VMX_UFC_INVALID_VMCS_SIZE, + VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO); + + /* + * Allocate all the VT-x structures. + */ + int rc = VINF_SUCCESS; +#ifdef VBOX_WITH_CRASHDUMP_MAGIC + rc = hmR0VmxPageAllocZ(&pVM->hm.s.vmx.hMemObjScratch, &pVM->hm.s.vmx.pbScratch, &pVM->hm.s.vmx.HCPhysScratch); + if (RT_FAILURE(rc)) + goto cleanup; + strcpy((char *)pVM->hm.s.vmx.pbScratch, "SCRATCH Magic"); + *(uint64_t *)(pVM->hm.s.vmx.pbScratch + 16) = UINT64_C(0xdeadbeefdeadbeef); +#endif + + /* Allocate the APIC-access page for trapping APIC accesses from the guest. 
*/ + if (pVM->hm.s.vmx.Msrs.ProcCtls2.n.allowed1 & VMX_PROC_CTLS2_VIRT_APIC_ACCESS) + { + rc = hmR0VmxPageAllocZ(&pVM->hm.s.vmx.hMemObjApicAccess, (PRTR0PTR)&pVM->hm.s.vmx.pbApicAccess, + &pVM->hm.s.vmx.HCPhysApicAccess); + if (RT_FAILURE(rc)) + goto cleanup; + } + + /* + * Initialize per-VCPU VT-x structures. + */ + for (VMCPUID i = 0; i < pVM->cCpus; i++) + { + PVMCPU pVCpu = &pVM->aCpus[i]; + AssertPtr(pVCpu); + + /* Allocate the VM control structure (VMCS). */ + rc = hmR0VmxPageAllocZ(&pVCpu->hm.s.vmx.hMemObjVmcs, &pVCpu->hm.s.vmx.pvVmcs, &pVCpu->hm.s.vmx.HCPhysVmcs); + if (RT_FAILURE(rc)) + goto cleanup; + + /* Get the allocated virtual-APIC page from the APIC device for transparent TPR accesses. */ + if ( PDMHasApic(pVM) + && (pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed1 & VMX_PROC_CTLS_USE_TPR_SHADOW)) + { + rc = APICGetApicPageForCpu(pVCpu, &pVCpu->hm.s.vmx.HCPhysVirtApic, (PRTR0PTR)&pVCpu->hm.s.vmx.pbVirtApic, + NULL /* pR3Ptr */, NULL /* pRCPtr */); + if (RT_FAILURE(rc)) + goto cleanup; + } + + /* + * Allocate the MSR-bitmap if supported by the CPU. The MSR-bitmap is for + * transparent accesses of specific MSRs. + * + * If the condition for enabling MSR bitmaps changes here, don't forget to + * update HMAreMsrBitmapsAvailable(). + */ + if (pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed1 & VMX_PROC_CTLS_USE_MSR_BITMAPS) + { + rc = hmR0VmxPageAllocZ(&pVCpu->hm.s.vmx.hMemObjMsrBitmap, &pVCpu->hm.s.vmx.pvMsrBitmap, + &pVCpu->hm.s.vmx.HCPhysMsrBitmap); + if (RT_FAILURE(rc)) + goto cleanup; + ASMMemFill32(pVCpu->hm.s.vmx.pvMsrBitmap, PAGE_SIZE, UINT32_C(0xffffffff)); + } + + /* Allocate the VM-entry MSR-load and VM-exit MSR-store page for the guest MSRs. */ + rc = hmR0VmxPageAllocZ(&pVCpu->hm.s.vmx.hMemObjGuestMsr, &pVCpu->hm.s.vmx.pvGuestMsr, &pVCpu->hm.s.vmx.HCPhysGuestMsr); + if (RT_FAILURE(rc)) + goto cleanup; + + /* Allocate the VM-exit MSR-load page for the host MSRs. 
*/ + rc = hmR0VmxPageAllocZ(&pVCpu->hm.s.vmx.hMemObjHostMsr, &pVCpu->hm.s.vmx.pvHostMsr, &pVCpu->hm.s.vmx.HCPhysHostMsr); + if (RT_FAILURE(rc)) + goto cleanup; + } + + return VINF_SUCCESS; + +cleanup: + hmR0VmxStructsFree(pVM); + return rc; +} + + +/** + * Does global VT-x initialization (called during module initialization). + * + * @returns VBox status code. + */ +VMMR0DECL(int) VMXR0GlobalInit(void) +{ +#ifdef HMVMX_USE_FUNCTION_TABLE + AssertCompile(VMX_EXIT_MAX + 1 == RT_ELEMENTS(g_apfnVMExitHandlers)); +# ifdef VBOX_STRICT + for (unsigned i = 0; i < RT_ELEMENTS(g_apfnVMExitHandlers); i++) + Assert(g_apfnVMExitHandlers[i]); +# endif +#endif + return VINF_SUCCESS; +} + + +/** + * Does global VT-x termination (called during module termination). + */ +VMMR0DECL(void) VMXR0GlobalTerm() +{ + /* Nothing to do currently. */ +} + + +/** + * Sets up and activates VT-x on the current CPU. + * + * @returns VBox status code. + * @param pHostCpu The HM physical-CPU structure. + * @param pVM The cross context VM structure. Can be + * NULL after a host resume operation. + * @param pvCpuPage Pointer to the VMXON region (can be NULL if @a + * fEnabledByHost is @c true). + * @param HCPhysCpuPage Physical address of the VMXON region (can be 0 if + * @a fEnabledByHost is @c true). + * @param fEnabledByHost Set if SUPR0EnableVTx() or similar was used to + * enable VT-x on the host. + * @param pHwvirtMsrs Pointer to the hardware-virtualization MSRs. + */ +VMMR0DECL(int) VMXR0EnableCpu(PHMPHYSCPU pHostCpu, PVM pVM, void *pvCpuPage, RTHCPHYS HCPhysCpuPage, bool fEnabledByHost, + PCSUPHWVIRTMSRS pHwvirtMsrs) +{ + Assert(pHostCpu); + Assert(pHwvirtMsrs); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + /* Enable VT-x if it's not already enabled by the host. 
*/ + if (!fEnabledByHost) + { + int rc = hmR0VmxEnterRootMode(pVM, HCPhysCpuPage, pvCpuPage); + if (RT_FAILURE(rc)) + return rc; + } + + /* + * Flush all EPT tagged-TLB entries (in case VirtualBox or any other hypervisor have been + * using EPTPs) so we don't retain any stale guest-physical mappings which won't get + * invalidated when flushing by VPID. + */ + if (pHwvirtMsrs->u.vmx.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_INVEPT_ALL_CONTEXTS) + { + hmR0VmxFlushEpt(NULL /* pVCpu */, VMXTLBFLUSHEPT_ALL_CONTEXTS); + pHostCpu->fFlushAsidBeforeUse = false; + } + else + pHostCpu->fFlushAsidBeforeUse = true; + + /* Ensure each VCPU scheduled on this CPU gets a new VPID on resume. See @bugref{6255}. */ + ++pHostCpu->cTlbFlushes; + + return VINF_SUCCESS; +} + + +/** + * Deactivates VT-x on the current CPU. + * + * @returns VBox status code. + * @param pvCpuPage Pointer to the VMXON region. + * @param HCPhysCpuPage Physical address of the VMXON region. + * + * @remarks This function should never be called when SUPR0EnableVTx() or + * similar was used to enable VT-x on the host. + */ +VMMR0DECL(int) VMXR0DisableCpu(void *pvCpuPage, RTHCPHYS HCPhysCpuPage) +{ + RT_NOREF2(pvCpuPage, HCPhysCpuPage); + + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + return hmR0VmxLeaveRootMode(); +} + + +/** + * Sets the permission bits for the specified MSR in the MSR bitmap. + * + * @param pVCpu The cross context virtual CPU structure. + * @param uMsr The MSR value. + * @param enmRead Whether reading this MSR causes a VM-exit. + * @param enmWrite Whether writing this MSR causes a VM-exit. + */ +static void hmR0VmxSetMsrPermission(PVMCPU pVCpu, uint32_t uMsr, VMXMSREXITREAD enmRead, VMXMSREXITWRITE enmWrite) +{ + int32_t iBit; + uint8_t *pbMsrBitmap = (uint8_t *)pVCpu->hm.s.vmx.pvMsrBitmap; + + /* + * MSR Layout: + * Byte index MSR range Interpreted as + * 0x000 - 0x3ff 0x00000000 - 0x00001fff Low MSR read bits. + * 0x400 - 0x7ff 0xc0000000 - 0xc0001fff High MSR read bits. 
+ * 0x800 - 0xbff 0x00000000 - 0x00001fff Low MSR write bits. + * 0xc00 - 0xfff 0xc0000000 - 0xc0001fff High MSR write bits. + * + * A bit corresponding to an MSR within the above range causes a VM-exit + * if the bit is 1 on executions of RDMSR/WRMSR. + * + * If an MSR falls out of the MSR range, it always cause a VM-exit. + * + * See Intel spec. 24.6.9 "MSR-Bitmap Address". + */ + if (uMsr <= 0x00001fff) + iBit = uMsr; + else if (uMsr - UINT32_C(0xc0000000) <= UINT32_C(0x00001fff)) + { + iBit = uMsr - UINT32_C(0xc0000000); + pbMsrBitmap += 0x400; + } + else + AssertMsgFailedReturnVoid(("hmR0VmxSetMsrPermission: Invalid MSR %#RX32\n", uMsr)); + + Assert(iBit <= 0x1fff); + if (enmRead == VMXMSREXIT_INTERCEPT_READ) + ASMBitSet(pbMsrBitmap, iBit); + else + ASMBitClear(pbMsrBitmap, iBit); + + if (enmWrite == VMXMSREXIT_INTERCEPT_WRITE) + ASMBitSet(pbMsrBitmap + 0x800, iBit); + else + ASMBitClear(pbMsrBitmap + 0x800, iBit); +} + + +/** + * Updates the VMCS with the number of effective MSRs in the auto-load/store MSR + * area. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param cMsrs The number of MSRs. + */ +static int hmR0VmxSetAutoLoadStoreMsrCount(PVMCPU pVCpu, uint32_t cMsrs) +{ + /* Shouldn't ever happen but there -is- a number. We're well within the recommended 512. */ + uint64_t const uVmxMiscMsr = pVCpu->CTX_SUFF(pVM)->hm.s.vmx.Msrs.u64Misc; + uint32_t const cMaxSupportedMsrs = VMX_MISC_MAX_MSRS(uVmxMiscMsr); + if (RT_UNLIKELY(cMsrs > cMaxSupportedMsrs)) + { + LogRel(("CPU auto-load/store MSR count in VMCS exceeded cMsrs=%u Supported=%u.\n", cMsrs, cMaxSupportedMsrs)); + pVCpu->hm.s.u32HMError = VMX_UFC_INSUFFICIENT_GUEST_MSR_STORAGE; + return VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO; + } + + /* Update number of guest MSRs to load/store across the world-switch. 
*/ + int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_ENTRY_MSR_LOAD_COUNT, cMsrs); + rc |= VMXWriteVmcs32(VMX_VMCS32_CTRL_EXIT_MSR_STORE_COUNT, cMsrs); + + /* Update number of host MSRs to load after the world-switch. Identical to guest-MSR count as it's always paired. */ + rc |= VMXWriteVmcs32(VMX_VMCS32_CTRL_EXIT_MSR_LOAD_COUNT, cMsrs); + AssertRCReturn(rc, rc); + + /* Update the VCPU's copy of the MSR count. */ + pVCpu->hm.s.vmx.cMsrs = cMsrs; + + return VINF_SUCCESS; +} + + +/** + * Adds a new (or updates the value of an existing) guest/host MSR + * pair to be swapped during the world-switch as part of the + * auto-load/store MSR area in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param uMsr The MSR. + * @param uGuestMsrValue Value of the guest MSR. + * @param fUpdateHostMsr Whether to update the value of the host MSR if + * necessary. + * @param pfAddedAndUpdated Where to store whether the MSR was added -and- + * its value was updated. Optional, can be NULL. + */ +static int hmR0VmxAddAutoLoadStoreMsr(PVMCPU pVCpu, uint32_t uMsr, uint64_t uGuestMsrValue, bool fUpdateHostMsr, + bool *pfAddedAndUpdated) +{ + PVMXAUTOMSR pGuestMsr = (PVMXAUTOMSR)pVCpu->hm.s.vmx.pvGuestMsr; + uint32_t cMsrs = pVCpu->hm.s.vmx.cMsrs; + uint32_t i; + for (i = 0; i < cMsrs; i++) + { + if (pGuestMsr->u32Msr == uMsr) + break; + pGuestMsr++; + } + + bool fAdded = false; + if (i == cMsrs) + { + ++cMsrs; + int rc = hmR0VmxSetAutoLoadStoreMsrCount(pVCpu, cMsrs); + AssertMsgRCReturn(rc, ("hmR0VmxAddAutoLoadStoreMsr: Insufficient space to add MSR %u\n", uMsr), rc); + + /* Now that we're swapping MSRs during the world-switch, allow the guest to read/write them without causing VM-exits. */ + if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_MSR_BITMAPS) + hmR0VmxSetMsrPermission(pVCpu, uMsr, VMXMSREXIT_PASSTHRU_READ, VMXMSREXIT_PASSTHRU_WRITE); + + fAdded = true; + } + + /* Update the MSR values in the auto-load/store MSR area. 
*/ + pGuestMsr->u32Msr = uMsr; + pGuestMsr->u64Value = uGuestMsrValue; + + /* Create/update the MSR slot in the host MSR area. */ + PVMXAUTOMSR pHostMsr = (PVMXAUTOMSR)pVCpu->hm.s.vmx.pvHostMsr; + pHostMsr += i; + pHostMsr->u32Msr = uMsr; + + /* + * Update the host MSR only when requested by the caller AND when we're + * adding it to the auto-load/store area. Otherwise, it would have been + * updated by hmR0VmxExportHostMsrs(). We do this for performance reasons. + */ + bool fUpdatedMsrValue = false; + if ( fAdded + && fUpdateHostMsr) + { + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + pHostMsr->u64Value = ASMRdMsr(pHostMsr->u32Msr); + fUpdatedMsrValue = true; + } + + if (pfAddedAndUpdated) + *pfAddedAndUpdated = fUpdatedMsrValue; + return VINF_SUCCESS; +} + + +/** + * Removes a guest/host MSR pair to be swapped during the world-switch from the + * auto-load/store MSR area in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param uMsr The MSR. + */ +static int hmR0VmxRemoveAutoLoadStoreMsr(PVMCPU pVCpu, uint32_t uMsr) +{ + PVMXAUTOMSR pGuestMsr = (PVMXAUTOMSR)pVCpu->hm.s.vmx.pvGuestMsr; + uint32_t cMsrs = pVCpu->hm.s.vmx.cMsrs; + for (uint32_t i = 0; i < cMsrs; i++) + { + /* Find the MSR. */ + if (pGuestMsr->u32Msr == uMsr) + { + /* If it's the last MSR, simply reduce the count. */ + if (i == cMsrs - 1) + { + --cMsrs; + break; + } + + /* Remove it by swapping the last MSR in place of it, and reducing the count. 
*/ + PVMXAUTOMSR pLastGuestMsr = (PVMXAUTOMSR)pVCpu->hm.s.vmx.pvGuestMsr; + pLastGuestMsr += cMsrs - 1; + pGuestMsr->u32Msr = pLastGuestMsr->u32Msr; + pGuestMsr->u64Value = pLastGuestMsr->u64Value; + + PVMXAUTOMSR pHostMsr = (PVMXAUTOMSR)pVCpu->hm.s.vmx.pvHostMsr; + PVMXAUTOMSR pLastHostMsr = (PVMXAUTOMSR)pVCpu->hm.s.vmx.pvHostMsr; + pLastHostMsr += cMsrs - 1; + pHostMsr->u32Msr = pLastHostMsr->u32Msr; + pHostMsr->u64Value = pLastHostMsr->u64Value; + --cMsrs; + break; + } + pGuestMsr++; + } + + /* Update the VMCS if the count changed (meaning the MSR was found). */ + if (cMsrs != pVCpu->hm.s.vmx.cMsrs) + { + int rc = hmR0VmxSetAutoLoadStoreMsrCount(pVCpu, cMsrs); + AssertRCReturn(rc, rc); + + /* We're no longer swapping MSRs during the world-switch, intercept guest read/writes to them. */ + if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_MSR_BITMAPS) + hmR0VmxSetMsrPermission(pVCpu, uMsr, VMXMSREXIT_INTERCEPT_READ, VMXMSREXIT_INTERCEPT_WRITE); + + Log4Func(("Removed MSR %#RX32 new cMsrs=%u\n", uMsr, pVCpu->hm.s.vmx.cMsrs)); + return VINF_SUCCESS; + } + + return VERR_NOT_FOUND; +} + + +/** + * Checks if the specified guest MSR is part of the auto-load/store area in + * the VMCS. + * + * @returns true if found, false otherwise. + * @param pVCpu The cross context virtual CPU structure. + * @param uMsr The MSR to find. + */ +static bool hmR0VmxIsAutoLoadStoreGuestMsr(PVMCPU pVCpu, uint32_t uMsr) +{ + PVMXAUTOMSR pGuestMsr = (PVMXAUTOMSR)pVCpu->hm.s.vmx.pvGuestMsr; + uint32_t const cMsrs = pVCpu->hm.s.vmx.cMsrs; + + for (uint32_t i = 0; i < cMsrs; i++, pGuestMsr++) + { + if (pGuestMsr->u32Msr == uMsr) + return true; + } + return false; +} + + +/** + * Updates the value of all host MSRs in the auto-load/store area in the VMCS. + * + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! 
+ */ +static void hmR0VmxUpdateAutoLoadStoreHostMsrs(PVMCPU pVCpu) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + PVMXAUTOMSR pHostMsr = (PVMXAUTOMSR)pVCpu->hm.s.vmx.pvHostMsr; + PVMXAUTOMSR pGuestMsr = (PVMXAUTOMSR)pVCpu->hm.s.vmx.pvGuestMsr; + uint32_t const cMsrs = pVCpu->hm.s.vmx.cMsrs; + + for (uint32_t i = 0; i < cMsrs; i++, pHostMsr++, pGuestMsr++) + { + AssertReturnVoid(pHostMsr->u32Msr == pGuestMsr->u32Msr); + + /* + * Performance hack for the host EFER MSR. We use the cached value rather than re-read it. + * Strict builds will catch mismatches in hmR0VmxCheckAutoLoadStoreMsrs(). See @bugref{7368}. + */ + if (pHostMsr->u32Msr == MSR_K6_EFER) + pHostMsr->u64Value = pVCpu->CTX_SUFF(pVM)->hm.s.vmx.u64HostEfer; + else + pHostMsr->u64Value = ASMRdMsr(pHostMsr->u32Msr); + } + + pVCpu->hm.s.vmx.fUpdatedHostMsrs = true; +} + + +/** + * Saves a set of host MSRs to allow read/write passthru access to the guest and + * perform lazy restoration of the host MSRs while leaving VT-x. + * + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static void hmR0VmxLazySaveHostMsrs(PVMCPU pVCpu) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + /* + * Note: If you're adding MSRs here, make sure to update the MSR-bitmap permissions in hmR0VmxSetupProcCtls(). + */ + if (!(pVCpu->hm.s.vmx.fLazyMsrs & VMX_LAZY_MSRS_SAVED_HOST)) + { + Assert(!(pVCpu->hm.s.vmx.fLazyMsrs & VMX_LAZY_MSRS_LOADED_GUEST)); /* Guest MSRs better not be loaded now. 
*/ +#if HC_ARCH_BITS == 64 + if (pVCpu->CTX_SUFF(pVM)->hm.s.fAllow64BitGuests) + { + pVCpu->hm.s.vmx.u64HostLStarMsr = ASMRdMsr(MSR_K8_LSTAR); + pVCpu->hm.s.vmx.u64HostStarMsr = ASMRdMsr(MSR_K6_STAR); + pVCpu->hm.s.vmx.u64HostSFMaskMsr = ASMRdMsr(MSR_K8_SF_MASK); + pVCpu->hm.s.vmx.u64HostKernelGSBaseMsr = ASMRdMsr(MSR_K8_KERNEL_GS_BASE); + } +#endif + pVCpu->hm.s.vmx.fLazyMsrs |= VMX_LAZY_MSRS_SAVED_HOST; + } +} + + +/** + * Checks whether the MSR belongs to the set of guest MSRs that we restore + * lazily while leaving VT-x. + * + * @returns true if it does, false otherwise. + * @param pVCpu The cross context virtual CPU structure. + * @param uMsr The MSR to check. + */ +static bool hmR0VmxIsLazyGuestMsr(PVMCPU pVCpu, uint32_t uMsr) +{ + NOREF(pVCpu); +#if HC_ARCH_BITS == 64 + if (pVCpu->CTX_SUFF(pVM)->hm.s.fAllow64BitGuests) + { + switch (uMsr) + { + case MSR_K8_LSTAR: + case MSR_K6_STAR: + case MSR_K8_SF_MASK: + case MSR_K8_KERNEL_GS_BASE: + return true; + } + } +#else + RT_NOREF(pVCpu, uMsr); +#endif + return false; +} + + +/** + * Loads a set of guests MSRs to allow read/passthru to the guest. + * + * The name of this function is slightly confusing. This function does NOT + * postpone loading, but loads the MSR right now. "hmR0VmxLazy" is simply a + * common prefix for functions dealing with "lazy restoration" of the shared + * MSRs. + * + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static void hmR0VmxLazyLoadGuestMsrs(PVMCPU pVCpu) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + + Assert(pVCpu->hm.s.vmx.fLazyMsrs & VMX_LAZY_MSRS_SAVED_HOST); +#if HC_ARCH_BITS == 64 + if (pVCpu->CTX_SUFF(pVM)->hm.s.fAllow64BitGuests) + { + /* + * If the guest MSRs are not loaded -and- if all the guest MSRs are identical + * to the MSRs on the CPU (which are the saved host MSRs, see assertion above) then + * we can skip a few MSR writes. 
+ * + * Otherwise, it implies either 1. they're not loaded, or 2. they're loaded but the + * guest MSR values in the guest-CPU context might be different to what's currently + * loaded in the CPU. In either case, we need to write the new guest MSR values to the + * CPU, see @bugref{8728}. + */ + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if ( !(pVCpu->hm.s.vmx.fLazyMsrs & VMX_LAZY_MSRS_LOADED_GUEST) + && pCtx->msrKERNELGSBASE == pVCpu->hm.s.vmx.u64HostKernelGSBaseMsr + && pCtx->msrLSTAR == pVCpu->hm.s.vmx.u64HostLStarMsr + && pCtx->msrSTAR == pVCpu->hm.s.vmx.u64HostStarMsr + && pCtx->msrSFMASK == pVCpu->hm.s.vmx.u64HostSFMaskMsr) + { +#ifdef VBOX_STRICT + Assert(ASMRdMsr(MSR_K8_KERNEL_GS_BASE) == pCtx->msrKERNELGSBASE); + Assert(ASMRdMsr(MSR_K8_LSTAR) == pCtx->msrLSTAR); + Assert(ASMRdMsr(MSR_K6_STAR) == pCtx->msrSTAR); + Assert(ASMRdMsr(MSR_K8_SF_MASK) == pCtx->msrSFMASK); +#endif + } + else + { + ASMWrMsr(MSR_K8_KERNEL_GS_BASE, pCtx->msrKERNELGSBASE); + ASMWrMsr(MSR_K8_LSTAR, pCtx->msrLSTAR); + ASMWrMsr(MSR_K6_STAR, pCtx->msrSTAR); + ASMWrMsr(MSR_K8_SF_MASK, pCtx->msrSFMASK); + } + } +#endif + pVCpu->hm.s.vmx.fLazyMsrs |= VMX_LAZY_MSRS_LOADED_GUEST; +} + + +/** + * Performs lazy restoration of the set of host MSRs if they were previously + * loaded with guest MSR values. + * + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + * @remarks The guest MSRs should have been saved back into the guest-CPU + * context by hmR0VmxImportGuestState()!!! 
+ */ +static void hmR0VmxLazyRestoreHostMsrs(PVMCPU pVCpu) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + + if (pVCpu->hm.s.vmx.fLazyMsrs & VMX_LAZY_MSRS_LOADED_GUEST) + { + Assert(pVCpu->hm.s.vmx.fLazyMsrs & VMX_LAZY_MSRS_SAVED_HOST); +#if HC_ARCH_BITS == 64 + if (pVCpu->CTX_SUFF(pVM)->hm.s.fAllow64BitGuests) + { + ASMWrMsr(MSR_K8_LSTAR, pVCpu->hm.s.vmx.u64HostLStarMsr); + ASMWrMsr(MSR_K6_STAR, pVCpu->hm.s.vmx.u64HostStarMsr); + ASMWrMsr(MSR_K8_SF_MASK, pVCpu->hm.s.vmx.u64HostSFMaskMsr); + ASMWrMsr(MSR_K8_KERNEL_GS_BASE, pVCpu->hm.s.vmx.u64HostKernelGSBaseMsr); + } +#endif + } + pVCpu->hm.s.vmx.fLazyMsrs &= ~(VMX_LAZY_MSRS_LOADED_GUEST | VMX_LAZY_MSRS_SAVED_HOST); +} + + +/** + * Verifies that our cached values of the VMCS fields are all consistent with + * what's actually present in the VMCS. + * + * @returns VBox status code. + * @retval VINF_SUCCESS if all our caches match their respective VMCS fields. + * @retval VERR_VMX_VMCS_FIELD_CACHE_INVALID if a cache field doesn't match the + * VMCS content. HMCPU error-field is + * updated, see VMX_VCI_XXX. + * @param pVCpu The cross context virtual CPU structure. 
+ */ +static int hmR0VmxCheckVmcsCtls(PVMCPU pVCpu) +{ + uint32_t u32Val; + int rc = VMXReadVmcs32(VMX_VMCS32_CTRL_ENTRY, &u32Val); + AssertRCReturn(rc, rc); + AssertMsgReturnStmt(pVCpu->hm.s.vmx.u32EntryCtls == u32Val, + ("Cache=%#RX32 VMCS=%#RX32\n", pVCpu->hm.s.vmx.u32EntryCtls, u32Val), + pVCpu->hm.s.u32HMError = VMX_VCI_CTRL_ENTRY, + VERR_VMX_VMCS_FIELD_CACHE_INVALID); + + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_EXIT, &u32Val); + AssertRCReturn(rc, rc); + AssertMsgReturnStmt(pVCpu->hm.s.vmx.u32ExitCtls == u32Val, + ("Cache=%#RX32 VMCS=%#RX32\n", pVCpu->hm.s.vmx.u32ExitCtls, u32Val), + pVCpu->hm.s.u32HMError = VMX_VCI_CTRL_EXIT, + VERR_VMX_VMCS_FIELD_CACHE_INVALID); + + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_PIN_EXEC, &u32Val); + AssertRCReturn(rc, rc); + AssertMsgReturnStmt(pVCpu->hm.s.vmx.u32PinCtls == u32Val, + ("Cache=%#RX32 VMCS=%#RX32\n", pVCpu->hm.s.vmx.u32PinCtls, u32Val), + pVCpu->hm.s.u32HMError = VMX_VCI_CTRL_PIN_EXEC, + VERR_VMX_VMCS_FIELD_CACHE_INVALID); + + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, &u32Val); + AssertRCReturn(rc, rc); + AssertMsgReturnStmt(pVCpu->hm.s.vmx.u32ProcCtls == u32Val, + ("Cache=%#RX32 VMCS=%#RX32\n", pVCpu->hm.s.vmx.u32ProcCtls, u32Val), + pVCpu->hm.s.u32HMError = VMX_VCI_CTRL_PROC_EXEC, + VERR_VMX_VMCS_FIELD_CACHE_INVALID); + + if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_SECONDARY_CTLS) + { + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_PROC_EXEC2, &u32Val); + AssertRCReturn(rc, rc); + AssertMsgReturnStmt(pVCpu->hm.s.vmx.u32ProcCtls2 == u32Val, + ("Cache=%#RX32 VMCS=%#RX32\n", pVCpu->hm.s.vmx.u32ProcCtls2, u32Val), + pVCpu->hm.s.u32HMError = VMX_VCI_CTRL_PROC_EXEC2, + VERR_VMX_VMCS_FIELD_CACHE_INVALID); + } + + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_EXCEPTION_BITMAP, &u32Val); + AssertRCReturn(rc, rc); + AssertMsgReturnStmt(pVCpu->hm.s.vmx.u32XcptBitmap == u32Val, + ("Cache=%#RX32 VMCS=%#RX32\n", pVCpu->hm.s.vmx.u32XcptBitmap, u32Val), + pVCpu->hm.s.u32HMError = VMX_VCI_CTRL_XCPT_BITMAP, + VERR_VMX_VMCS_FIELD_CACHE_INVALID); + + 
uint64_t u64Val; + rc = VMXReadVmcs64(VMX_VMCS64_CTRL_TSC_OFFSET_FULL, &u64Val); + AssertRCReturn(rc, rc); + AssertMsgReturnStmt(pVCpu->hm.s.vmx.u64TscOffset == u64Val, + ("Cache=%#RX64 VMCS=%#RX64\n", pVCpu->hm.s.vmx.u64TscOffset, u64Val), + pVCpu->hm.s.u32HMError = VMX_VCI_CTRL_TSC_OFFSET, + VERR_VMX_VMCS_FIELD_CACHE_INVALID); + + return VINF_SUCCESS; +} + + +#ifdef VBOX_STRICT +/** + * Verifies that our cached host EFER value has not changed + * since we cached it. + * + * @param pVCpu The cross context virtual CPU structure. + */ +static void hmR0VmxCheckHostEferMsr(PVMCPU pVCpu) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + if (pVCpu->hm.s.vmx.u32ExitCtls & VMX_EXIT_CTLS_LOAD_EFER_MSR) + { + uint64_t u64Val; + int rc = VMXReadVmcs64(VMX_VMCS64_HOST_EFER_FULL, &u64Val); + AssertRC(rc); + + uint64_t u64HostEferMsr = ASMRdMsr(MSR_K6_EFER); + AssertMsgReturnVoid(u64HostEferMsr == u64Val, ("u64HostEferMsr=%#RX64 u64Val=%#RX64\n", u64HostEferMsr, u64Val)); + } +} + + +/** + * Verifies whether the guest/host MSR pairs in the auto-load/store area in the + * VMCS are correct. + * + * @param pVCpu The cross context virtual CPU structure. + */ +static void hmR0VmxCheckAutoLoadStoreMsrs(PVMCPU pVCpu) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + /* Verify MSR counts in the VMCS are what we think it should be. 
*/ + uint32_t cMsrs; + int rc = VMXReadVmcs32(VMX_VMCS32_CTRL_ENTRY_MSR_LOAD_COUNT, &cMsrs); AssertRC(rc); + Assert(cMsrs == pVCpu->hm.s.vmx.cMsrs); + + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_EXIT_MSR_STORE_COUNT, &cMsrs); AssertRC(rc); + Assert(cMsrs == pVCpu->hm.s.vmx.cMsrs); + + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_EXIT_MSR_LOAD_COUNT, &cMsrs); AssertRC(rc); + Assert(cMsrs == pVCpu->hm.s.vmx.cMsrs); + + PCVMXAUTOMSR pHostMsr = (PCVMXAUTOMSR)pVCpu->hm.s.vmx.pvHostMsr; + PCVMXAUTOMSR pGuestMsr = (PCVMXAUTOMSR)pVCpu->hm.s.vmx.pvGuestMsr; + for (uint32_t i = 0; i < cMsrs; i++, pHostMsr++, pGuestMsr++) + { + /* Verify that the MSRs are paired properly and that the host MSR has the correct value. */ + AssertMsgReturnVoid(pHostMsr->u32Msr == pGuestMsr->u32Msr, ("HostMsr=%#RX32 GuestMsr=%#RX32 cMsrs=%u\n", pHostMsr->u32Msr, + pGuestMsr->u32Msr, cMsrs)); + + uint64_t u64Msr = ASMRdMsr(pHostMsr->u32Msr); + AssertMsgReturnVoid(pHostMsr->u64Value == u64Msr, ("u32Msr=%#RX32 VMCS Value=%#RX64 ASMRdMsr=%#RX64 cMsrs=%u\n", + pHostMsr->u32Msr, pHostMsr->u64Value, u64Msr, cMsrs)); + + /* Verify that the permissions are as expected in the MSR bitmap. */ + if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_MSR_BITMAPS) + { + VMXMSREXITREAD enmRead; + VMXMSREXITWRITE enmWrite; + rc = HMGetVmxMsrPermission(pVCpu->hm.s.vmx.pvMsrBitmap, pGuestMsr->u32Msr, &enmRead, &enmWrite); + AssertMsgReturnVoid(rc == VINF_SUCCESS, ("HMGetVmxMsrPermission! failed. 
rc=%Rrc\n", rc)); + if (pGuestMsr->u32Msr == MSR_K6_EFER) + { + AssertMsgReturnVoid(enmRead == VMXMSREXIT_INTERCEPT_READ, ("Passthru read for EFER!?\n")); + AssertMsgReturnVoid(enmWrite == VMXMSREXIT_INTERCEPT_WRITE, ("Passthru write for EFER!?\n")); + } + else + { + AssertMsgReturnVoid(enmRead == VMXMSREXIT_PASSTHRU_READ, ("u32Msr=%#RX32 cMsrs=%u No passthru read!\n", + pGuestMsr->u32Msr, cMsrs)); + AssertMsgReturnVoid(enmWrite == VMXMSREXIT_PASSTHRU_WRITE, ("u32Msr=%#RX32 cMsrs=%u No passthru write!\n", + pGuestMsr->u32Msr, cMsrs)); + } + } + } +} +#endif /* VBOX_STRICT */ + + +/** + * Flushes the TLB using EPT. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure of the calling + * EMT. Can be NULL depending on @a enmTlbFlush. + * @param enmTlbFlush Type of flush. + * + * @remarks Caller is responsible for making sure this function is called only + * when NestedPaging is supported and providing @a enmTlbFlush that is + * supported by the CPU. + * @remarks Can be called with interrupts disabled. + */ +static void hmR0VmxFlushEpt(PVMCPU pVCpu, VMXTLBFLUSHEPT enmTlbFlush) +{ + uint64_t au64Descriptor[2]; + if (enmTlbFlush == VMXTLBFLUSHEPT_ALL_CONTEXTS) + au64Descriptor[0] = 0; + else + { + Assert(pVCpu); + au64Descriptor[0] = pVCpu->hm.s.vmx.HCPhysEPTP; + } + au64Descriptor[1] = 0; /* MBZ. Intel spec. 33.3 "VMX Instructions" */ + + int rc = VMXR0InvEPT(enmTlbFlush, &au64Descriptor[0]); + AssertMsg(rc == VINF_SUCCESS, + ("VMXR0InvEPT %#x %RGv failed with %Rrc\n", enmTlbFlush, pVCpu ? pVCpu->hm.s.vmx.HCPhysEPTP : 0, rc)); + + if ( RT_SUCCESS(rc) + && pVCpu) + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushNestedPaging); +} + + +/** + * Flushes the TLB using VPID. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure of the calling + * EMT. Can be NULL depending on @a enmTlbFlush. + * @param enmTlbFlush Type of flush. 
+ * @param GCPtr Virtual address of the page to flush (can be 0 depending + * on @a enmTlbFlush). + * + * @remarks Can be called with interrupts disabled. + */ +static void hmR0VmxFlushVpid(PVMCPU pVCpu, VMXTLBFLUSHVPID enmTlbFlush, RTGCPTR GCPtr) +{ + Assert(pVCpu->CTX_SUFF(pVM)->hm.s.vmx.fVpid); + + uint64_t au64Descriptor[2]; + if (enmTlbFlush == VMXTLBFLUSHVPID_ALL_CONTEXTS) + { + au64Descriptor[0] = 0; + au64Descriptor[1] = 0; + } + else + { + AssertPtr(pVCpu); + AssertMsg(pVCpu->hm.s.uCurrentAsid != 0, ("VMXR0InvVPID: invalid ASID %lu\n", pVCpu->hm.s.uCurrentAsid)); + AssertMsg(pVCpu->hm.s.uCurrentAsid <= UINT16_MAX, ("VMXR0InvVPID: invalid ASID %lu\n", pVCpu->hm.s.uCurrentAsid)); + au64Descriptor[0] = pVCpu->hm.s.uCurrentAsid; + au64Descriptor[1] = GCPtr; + } + + int rc = VMXR0InvVPID(enmTlbFlush, &au64Descriptor[0]); + AssertMsg(rc == VINF_SUCCESS, + ("VMXR0InvVPID %#x %u %RGv failed with %Rrc\n", enmTlbFlush, pVCpu ? pVCpu->hm.s.uCurrentAsid : 0, GCPtr, rc)); + + if ( RT_SUCCESS(rc) + && pVCpu) + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushAsid); + NOREF(rc); +} + + +/** + * Invalidates a guest page by guest virtual address. Only relevant for + * EPT/VPID, otherwise there is nothing really to invalidate. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param GCVirt Guest virtual address of the page to invalidate. + */ +VMMR0DECL(int) VMXR0InvalidatePage(PVMCPU pVCpu, RTGCPTR GCVirt) +{ + AssertPtr(pVCpu); + LogFlowFunc(("pVCpu=%p GCVirt=%RGv\n", pVCpu, GCVirt)); + + bool fFlushPending = VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_TLB_FLUSH); + if (!fFlushPending) + { + /* + * We must invalidate the guest TLB entry in either case, we cannot ignore it even for + * the EPT case. See @bugref{6043} and @bugref{6177}. + * + * Set the VMCPU_FF_TLB_FLUSH force flag and flush before VM-entry in hmR0VmxFlushTLB*() + * as this function maybe called in a loop with individual addresses. 
+ */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + if (pVM->hm.s.vmx.fVpid) + { + bool fVpidFlush = RT_BOOL(pVM->hm.s.vmx.Msrs.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_INVVPID_INDIV_ADDR); + +#if HC_ARCH_BITS == 32 && defined(VBOX_ENABLE_64_BITS_GUESTS) + /* + * Workaround Erratum BV75, AAJ159 and others that affect several Intel CPUs + * where executing INVVPID outside 64-bit mode does not flush translations of + * 64-bit linear addresses, see @bugref{6208#c72}. + */ + if (RT_HI_U32(GCVirt)) + fVpidFlush = false; +#endif + + if (fVpidFlush) + { + hmR0VmxFlushVpid(pVCpu, VMXTLBFLUSHVPID_INDIV_ADDR, GCVirt); + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushTlbInvlpgVirt); + } + else + VMCPU_FF_SET(pVCpu, VMCPU_FF_TLB_FLUSH); + } + else if (pVM->hm.s.fNestedPaging) + VMCPU_FF_SET(pVCpu, VMCPU_FF_TLB_FLUSH); + } + + return VINF_SUCCESS; +} + + +/** + * Dummy placeholder for tagged-TLB flush handling before VM-entry. Used in the + * case where neither EPT nor VPID is supported by the CPU. + * + * @param pHostCpu The HM physical-CPU structure. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Called with interrupts disabled. + */ +static void hmR0VmxFlushTaggedTlbNone(PHMPHYSCPU pHostCpu, PVMCPU pVCpu) +{ + AssertPtr(pVCpu); + AssertPtr(pHostCpu); + + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_TLB_FLUSH); + + Assert(pHostCpu->idCpu != NIL_RTCPUID); + pVCpu->hm.s.idLastCpu = pHostCpu->idCpu; + pVCpu->hm.s.cTlbFlushes = pHostCpu->cTlbFlushes; + pVCpu->hm.s.fForceTLBFlush = false; + return; +} + + +/** + * Flushes the tagged-TLB entries for EPT+VPID CPUs as necessary. + * + * @param pHostCpu The HM physical-CPU structure. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks All references to "ASID" in this function pertains to "VPID" in Intel's + * nomenclature. The reason is, to avoid confusion in compare statements + * since the host-CPU copies are named "ASID". + * + * @remarks Called with interrupts disabled. 
+ */ +static void hmR0VmxFlushTaggedTlbBoth(PHMPHYSCPU pHostCpu, PVMCPU pVCpu) +{ +#ifdef VBOX_WITH_STATISTICS + bool fTlbFlushed = false; +# define HMVMX_SET_TAGGED_TLB_FLUSHED() do { fTlbFlushed = true; } while (0) +# define HMVMX_UPDATE_FLUSH_SKIPPED_STAT() do { \ + if (!fTlbFlushed) \ + STAM_COUNTER_INC(&pVCpu->hm.s.StatNoFlushTlbWorldSwitch); \ + } while (0) +#else +# define HMVMX_SET_TAGGED_TLB_FLUSHED() do { } while (0) +# define HMVMX_UPDATE_FLUSH_SKIPPED_STAT() do { } while (0) +#endif + + AssertPtr(pVCpu); + AssertPtr(pHostCpu); + Assert(pHostCpu->idCpu != NIL_RTCPUID); + + PVM pVM = pVCpu->CTX_SUFF(pVM); + AssertMsg(pVM->hm.s.fNestedPaging && pVM->hm.s.vmx.fVpid, + ("hmR0VmxFlushTaggedTlbBoth cannot be invoked unless NestedPaging & VPID are enabled." + "fNestedPaging=%RTbool fVpid=%RTbool", pVM->hm.s.fNestedPaging, pVM->hm.s.vmx.fVpid)); + + /* + * Force a TLB flush for the first world-switch if the current CPU differs from the one we + * ran on last. If the TLB flush count changed, another VM (VCPU rather) has hit the ASID + * limit while flushing the TLB or the host CPU is online after a suspend/resume, so we + * cannot reuse the current ASID anymore. + */ + if ( pVCpu->hm.s.idLastCpu != pHostCpu->idCpu + || pVCpu->hm.s.cTlbFlushes != pHostCpu->cTlbFlushes) + { + ++pHostCpu->uCurrentAsid; + if (pHostCpu->uCurrentAsid >= pVM->hm.s.uMaxAsid) + { + pHostCpu->uCurrentAsid = 1; /* Wraparound to 1; host uses 0. */ + pHostCpu->cTlbFlushes++; /* All VCPUs that run on this host CPU must use a new VPID. */ + pHostCpu->fFlushAsidBeforeUse = true; /* All VCPUs that run on this host CPU must flush their new VPID before use. */ + } + + pVCpu->hm.s.uCurrentAsid = pHostCpu->uCurrentAsid; + pVCpu->hm.s.idLastCpu = pHostCpu->idCpu; + pVCpu->hm.s.cTlbFlushes = pHostCpu->cTlbFlushes; + + /* + * Flush by EPT when we get rescheduled to a new host CPU to ensure EPT-only tagged mappings are also + * invalidated. 
We don't need to flush-by-VPID here as flushing by EPT covers it. See @bugref{6568}. + */ + hmR0VmxFlushEpt(pVCpu, pVM->hm.s.vmx.enmTlbFlushEpt); + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushTlbWorldSwitch); + HMVMX_SET_TAGGED_TLB_FLUSHED(); + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_TLB_FLUSH); + } + else if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_TLB_FLUSH)) /* Check for explicit TLB flushes. */ + { + /* + * Changes to the EPT paging structure by VMM requires flushing-by-EPT as the CPU + * creates guest-physical (ie. only EPT-tagged) mappings while traversing the EPT + * tables when EPT is in use. Flushing-by-VPID will only flush linear (only + * VPID-tagged) and combined (EPT+VPID tagged) mappings but not guest-physical + * mappings, see @bugref{6568}. + * + * See Intel spec. 28.3.2 "Creating and Using Cached Translation Information". + */ + hmR0VmxFlushEpt(pVCpu, pVM->hm.s.vmx.enmTlbFlushEpt); + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushTlb); + HMVMX_SET_TAGGED_TLB_FLUSHED(); + } + + pVCpu->hm.s.fForceTLBFlush = false; + HMVMX_UPDATE_FLUSH_SKIPPED_STAT(); + + Assert(pVCpu->hm.s.idLastCpu == pHostCpu->idCpu); + Assert(pVCpu->hm.s.cTlbFlushes == pHostCpu->cTlbFlushes); + AssertMsg(pVCpu->hm.s.cTlbFlushes == pHostCpu->cTlbFlushes, + ("Flush count mismatch for cpu %d (%u vs %u)\n", pHostCpu->idCpu, pVCpu->hm.s.cTlbFlushes, pHostCpu->cTlbFlushes)); + AssertMsg(pHostCpu->uCurrentAsid >= 1 && pHostCpu->uCurrentAsid < pVM->hm.s.uMaxAsid, + ("Cpu[%u] uCurrentAsid=%u cTlbFlushes=%u pVCpu->idLastCpu=%u pVCpu->cTlbFlushes=%u\n", pHostCpu->idCpu, + pHostCpu->uCurrentAsid, pHostCpu->cTlbFlushes, pVCpu->hm.s.idLastCpu, pVCpu->hm.s.cTlbFlushes)); + AssertMsg(pVCpu->hm.s.uCurrentAsid >= 1 && pVCpu->hm.s.uCurrentAsid < pVM->hm.s.uMaxAsid, + ("Cpu[%u] pVCpu->uCurrentAsid=%u\n", pHostCpu->idCpu, pVCpu->hm.s.uCurrentAsid)); + + /* Update VMCS with the VPID. 
*/ + int rc = VMXWriteVmcs32(VMX_VMCS16_VPID, pVCpu->hm.s.uCurrentAsid); + AssertRC(rc); + +#undef HMVMX_SET_TAGGED_TLB_FLUSHED +} + + +/** + * Flushes the tagged-TLB entries for EPT CPUs as necessary. + * + * @param pHostCpu The HM physical-CPU structure. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Called with interrupts disabled. + */ +static void hmR0VmxFlushTaggedTlbEpt(PHMPHYSCPU pHostCpu, PVMCPU pVCpu) +{ + AssertPtr(pVCpu); + AssertPtr(pHostCpu); + Assert(pHostCpu->idCpu != NIL_RTCPUID); + AssertMsg(pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging, ("hmR0VmxFlushTaggedTlbEpt cannot be invoked without NestedPaging.")); + AssertMsg(!pVCpu->CTX_SUFF(pVM)->hm.s.vmx.fVpid, ("hmR0VmxFlushTaggedTlbEpt cannot be invoked with VPID.")); + + /* + * Force a TLB flush for the first world-switch if the current CPU differs from the one we ran on last. + * A change in the TLB flush count implies the host CPU is online after a suspend/resume. + */ + if ( pVCpu->hm.s.idLastCpu != pHostCpu->idCpu + || pVCpu->hm.s.cTlbFlushes != pHostCpu->cTlbFlushes) + { + pVCpu->hm.s.fForceTLBFlush = true; + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushTlbWorldSwitch); + } + + /* Check for explicit TLB flushes. */ + if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_TLB_FLUSH)) + { + pVCpu->hm.s.fForceTLBFlush = true; + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushTlb); + } + + pVCpu->hm.s.idLastCpu = pHostCpu->idCpu; + pVCpu->hm.s.cTlbFlushes = pHostCpu->cTlbFlushes; + + if (pVCpu->hm.s.fForceTLBFlush) + { + hmR0VmxFlushEpt(pVCpu, pVCpu->CTX_SUFF(pVM)->hm.s.vmx.enmTlbFlushEpt); + pVCpu->hm.s.fForceTLBFlush = false; + } +} + + +/** + * Flushes the tagged-TLB entries for VPID CPUs as necessary. + * + * @param pHostCpu The HM physical-CPU structure. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Called with interrupts disabled. 
+ */ +static void hmR0VmxFlushTaggedTlbVpid(PHMPHYSCPU pHostCpu, PVMCPU pVCpu) +{ + AssertPtr(pVCpu); + AssertPtr(pHostCpu); + Assert(pHostCpu->idCpu != NIL_RTCPUID); + AssertMsg(pVCpu->CTX_SUFF(pVM)->hm.s.vmx.fVpid, ("hmR0VmxFlushTlbVpid cannot be invoked without VPID.")); + AssertMsg(!pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging, ("hmR0VmxFlushTlbVpid cannot be invoked with NestedPaging")); + + /* + * Force a TLB flush for the first world switch if the current CPU differs from the one we + * ran on last. If the TLB flush count changed, another VM (VCPU rather) has hit the ASID + * limit while flushing the TLB or the host CPU is online after a suspend/resume, so we + * cannot reuse the current ASID anymore. + */ + if ( pVCpu->hm.s.idLastCpu != pHostCpu->idCpu + || pVCpu->hm.s.cTlbFlushes != pHostCpu->cTlbFlushes) + { + pVCpu->hm.s.fForceTLBFlush = true; + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushTlbWorldSwitch); + } + + /* Check for explicit TLB flushes. */ + if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_TLB_FLUSH)) + { + /* + * If we ever support VPID flush combinations other than ALL or SINGLE-context (see + * hmR0VmxSetupTaggedTlb()) we would need to explicitly flush in this case (add an + * fExplicitFlush = true here and change the pHostCpu->fFlushAsidBeforeUse check below to + * include fExplicitFlush's too) - an obscure corner case. + */ + pVCpu->hm.s.fForceTLBFlush = true; + STAM_COUNTER_INC(&pVCpu->hm.s.StatFlushTlb); + } + + PVM pVM = pVCpu->CTX_SUFF(pVM); + pVCpu->hm.s.idLastCpu = pHostCpu->idCpu; + if (pVCpu->hm.s.fForceTLBFlush) + { + ++pHostCpu->uCurrentAsid; + if (pHostCpu->uCurrentAsid >= pVM->hm.s.uMaxAsid) + { + pHostCpu->uCurrentAsid = 1; /* Wraparound to 1; host uses 0 */ + pHostCpu->cTlbFlushes++; /* All VCPUs that run on this host CPU must use a new VPID. */ + pHostCpu->fFlushAsidBeforeUse = true; /* All VCPUs that run on this host CPU must flush their new VPID before use. 
*/ + } + + pVCpu->hm.s.fForceTLBFlush = false; + pVCpu->hm.s.cTlbFlushes = pHostCpu->cTlbFlushes; + pVCpu->hm.s.uCurrentAsid = pHostCpu->uCurrentAsid; + if (pHostCpu->fFlushAsidBeforeUse) + { + if (pVM->hm.s.vmx.enmTlbFlushVpid == VMXTLBFLUSHVPID_SINGLE_CONTEXT) + hmR0VmxFlushVpid(pVCpu, VMXTLBFLUSHVPID_SINGLE_CONTEXT, 0 /* GCPtr */); + else if (pVM->hm.s.vmx.enmTlbFlushVpid == VMXTLBFLUSHVPID_ALL_CONTEXTS) + { + hmR0VmxFlushVpid(pVCpu, VMXTLBFLUSHVPID_ALL_CONTEXTS, 0 /* GCPtr */); + pHostCpu->fFlushAsidBeforeUse = false; + } + else + { + /* hmR0VmxSetupTaggedTlb() ensures we never get here. Paranoia. */ + AssertMsgFailed(("Unsupported VPID-flush context type.\n")); + } + } + } + + AssertMsg(pVCpu->hm.s.cTlbFlushes == pHostCpu->cTlbFlushes, + ("Flush count mismatch for cpu %d (%u vs %u)\n", pHostCpu->idCpu, pVCpu->hm.s.cTlbFlushes, pHostCpu->cTlbFlushes)); + AssertMsg(pHostCpu->uCurrentAsid >= 1 && pHostCpu->uCurrentAsid < pVM->hm.s.uMaxAsid, + ("Cpu[%u] uCurrentAsid=%u cTlbFlushes=%u pVCpu->idLastCpu=%u pVCpu->cTlbFlushes=%u\n", pHostCpu->idCpu, + pHostCpu->uCurrentAsid, pHostCpu->cTlbFlushes, pVCpu->hm.s.idLastCpu, pVCpu->hm.s.cTlbFlushes)); + AssertMsg(pVCpu->hm.s.uCurrentAsid >= 1 && pVCpu->hm.s.uCurrentAsid < pVM->hm.s.uMaxAsid, + ("Cpu[%u] pVCpu->uCurrentAsid=%u\n", pHostCpu->idCpu, pVCpu->hm.s.uCurrentAsid)); + + int rc = VMXWriteVmcs32(VMX_VMCS16_VPID, pVCpu->hm.s.uCurrentAsid); + AssertRC(rc); +} + + +/** + * Flushes the guest TLB entry based on CPU capabilities. + * + * @param pHostCpu The HM physical-CPU structure. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Called with interrupts disabled. 
+ */ +DECLINLINE(void) hmR0VmxFlushTaggedTlb(PHMPHYSCPU pHostCpu, PVMCPU pVCpu) +{ +#ifdef HMVMX_ALWAYS_FLUSH_TLB + VMCPU_FF_SET(pVCpu, VMCPU_FF_TLB_FLUSH); +#endif + PVM pVM = pVCpu->CTX_SUFF(pVM); + switch (pVM->hm.s.vmx.enmTlbFlushType) + { + case VMXTLBFLUSHTYPE_EPT_VPID: hmR0VmxFlushTaggedTlbBoth(pHostCpu, pVCpu); break; + case VMXTLBFLUSHTYPE_EPT: hmR0VmxFlushTaggedTlbEpt(pHostCpu, pVCpu); break; + case VMXTLBFLUSHTYPE_VPID: hmR0VmxFlushTaggedTlbVpid(pHostCpu, pVCpu); break; + case VMXTLBFLUSHTYPE_NONE: hmR0VmxFlushTaggedTlbNone(pHostCpu, pVCpu); break; + default: + AssertMsgFailed(("Invalid flush-tag function identifier\n")); + break; + } + /* Don't assert that VMCPU_FF_TLB_FLUSH should no longer be pending. It can be set by other EMTs. */ +} + + +/** + * Sets up the appropriate tagged TLB-flush level and handler for flushing guest + * TLB entries from the host TLB before VM-entry. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +static int hmR0VmxSetupTaggedTlb(PVM pVM) +{ + /* + * Determine optimal flush type for Nested Paging. + * We cannot ignore EPT if no suitable flush-types is supported by the CPU as we've already setup unrestricted + * guest execution (see hmR3InitFinalizeR0()). + */ + if (pVM->hm.s.fNestedPaging) + { + if (pVM->hm.s.vmx.Msrs.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_INVEPT) + { + if (pVM->hm.s.vmx.Msrs.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_INVEPT_SINGLE_CONTEXT) + pVM->hm.s.vmx.enmTlbFlushEpt = VMXTLBFLUSHEPT_SINGLE_CONTEXT; + else if (pVM->hm.s.vmx.Msrs.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_INVEPT_ALL_CONTEXTS) + pVM->hm.s.vmx.enmTlbFlushEpt = VMXTLBFLUSHEPT_ALL_CONTEXTS; + else + { + /* Shouldn't happen. EPT is supported but no suitable flush-types supported. 
*/ + pVM->hm.s.vmx.enmTlbFlushEpt = VMXTLBFLUSHEPT_NOT_SUPPORTED; + pVM->aCpus[0].hm.s.u32HMError = VMX_UFC_EPT_FLUSH_TYPE_UNSUPPORTED; + return VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO; + } + + /* Make sure the write-back cacheable memory type for EPT is supported. */ + if (RT_UNLIKELY(!(pVM->hm.s.vmx.Msrs.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_EMT_WB))) + { + pVM->hm.s.vmx.enmTlbFlushEpt = VMXTLBFLUSHEPT_NOT_SUPPORTED; + pVM->aCpus[0].hm.s.u32HMError = VMX_UFC_EPT_MEM_TYPE_NOT_WB; + return VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO; + } + + /* EPT requires a page-walk length of 4. */ + if (RT_UNLIKELY(!(pVM->hm.s.vmx.Msrs.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_PAGE_WALK_LENGTH_4))) + { + pVM->hm.s.vmx.enmTlbFlushEpt = VMXTLBFLUSHEPT_NOT_SUPPORTED; + pVM->aCpus[0].hm.s.u32HMError = VMX_UFC_EPT_PAGE_WALK_LENGTH_UNSUPPORTED; + return VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO; + } + } + else + { + /* Shouldn't happen. EPT is supported but INVEPT instruction is not supported. */ + pVM->hm.s.vmx.enmTlbFlushEpt = VMXTLBFLUSHEPT_NOT_SUPPORTED; + pVM->aCpus[0].hm.s.u32HMError = VMX_UFC_EPT_INVEPT_UNAVAILABLE; + return VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO; + } + } + + /* + * Determine optimal flush type for VPID. + */ + if (pVM->hm.s.vmx.fVpid) + { + if (pVM->hm.s.vmx.Msrs.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_INVVPID) + { + if (pVM->hm.s.vmx.Msrs.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_INVVPID_SINGLE_CONTEXT) + pVM->hm.s.vmx.enmTlbFlushVpid = VMXTLBFLUSHVPID_SINGLE_CONTEXT; + else if (pVM->hm.s.vmx.Msrs.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_INVVPID_ALL_CONTEXTS) + pVM->hm.s.vmx.enmTlbFlushVpid = VMXTLBFLUSHVPID_ALL_CONTEXTS; + else + { + /* Neither SINGLE nor ALL-context flush types for VPID is supported by the CPU. Ignore VPID capability. */ + if (pVM->hm.s.vmx.Msrs.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_INVVPID_INDIV_ADDR) + LogRelFunc(("Only INDIV_ADDR supported. 
Ignoring VPID.\n")); + if (pVM->hm.s.vmx.Msrs.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_INVVPID_SINGLE_CONTEXT_RETAIN_GLOBALS) + LogRelFunc(("Only SINGLE_CONTEXT_RETAIN_GLOBALS supported. Ignoring VPID.\n")); + pVM->hm.s.vmx.enmTlbFlushVpid = VMXTLBFLUSHVPID_NOT_SUPPORTED; + pVM->hm.s.vmx.fVpid = false; + } + } + else + { + /* Shouldn't happen. VPID is supported but INVVPID is not supported by the CPU. Ignore VPID capability. */ + Log4Func(("VPID supported without INVEPT support. Ignoring VPID.\n")); + pVM->hm.s.vmx.enmTlbFlushVpid = VMXTLBFLUSHVPID_NOT_SUPPORTED; + pVM->hm.s.vmx.fVpid = false; + } + } + + /* + * Setup the handler for flushing tagged-TLBs. + */ + if (pVM->hm.s.fNestedPaging && pVM->hm.s.vmx.fVpid) + pVM->hm.s.vmx.enmTlbFlushType = VMXTLBFLUSHTYPE_EPT_VPID; + else if (pVM->hm.s.fNestedPaging) + pVM->hm.s.vmx.enmTlbFlushType = VMXTLBFLUSHTYPE_EPT; + else if (pVM->hm.s.vmx.fVpid) + pVM->hm.s.vmx.enmTlbFlushType = VMXTLBFLUSHTYPE_VPID; + else + pVM->hm.s.vmx.enmTlbFlushType = VMXTLBFLUSHTYPE_NONE; + return VINF_SUCCESS; +} + + +/** + * Sets up pin-based VM-execution controls in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks We don't really care about optimizing vmwrites here as it's done only + * once per VM and hence we don't care about VMCS-field cache comparisons. + */ +static int hmR0VmxSetupPinCtls(PVMCPU pVCpu) +{ + PVM pVM = pVCpu->CTX_SUFF(pVM); + uint32_t fVal = pVM->hm.s.vmx.Msrs.PinCtls.n.allowed0; /* Bits set here must always be set. */ + uint32_t const fZap = pVM->hm.s.vmx.Msrs.PinCtls.n.allowed1; /* Bits cleared here must always be cleared. */ + + fVal |= VMX_PIN_CTLS_EXT_INT_EXIT /* External interrupts cause a VM-exit. */ + | VMX_PIN_CTLS_NMI_EXIT; /* Non-maskable interrupts (NMIs) cause a VM-exit. */ + + if (pVM->hm.s.vmx.Msrs.PinCtls.n.allowed1 & VMX_PIN_CTLS_VIRT_NMI) + fVal |= VMX_PIN_CTLS_VIRT_NMI; /* Use virtual NMIs and virtual-NMI blocking features. 
*/ + + /* Enable the VMX preemption timer. */ + if (pVM->hm.s.vmx.fUsePreemptTimer) + { + Assert(pVM->hm.s.vmx.Msrs.PinCtls.n.allowed1 & VMX_PIN_CTLS_PREEMPT_TIMER); + fVal |= VMX_PIN_CTLS_PREEMPT_TIMER; + } + +#if 0 + /* Enable posted-interrupt processing. */ + if (pVM->hm.s.fPostedIntrs) + { + Assert(pVM->hm.s.vmx.Msrs.PinCtls.n.allowed1 & VMX_PIN_CTLS_POSTED_INT); + Assert(pVM->hm.s.vmx.Msrs.ExitCtls.n.allowed1 & VMX_EXIT_CTLS_ACK_EXT_INT); + fVal |= VMX_PIN_CTL_POSTED_INT; + } +#endif + + if ((fVal & fZap) != fVal) + { + LogRelFunc(("Invalid pin-based VM-execution controls combo! Cpu=%#RX32 fVal=%#RX32 fZap=%#RX32\n", + pVM->hm.s.vmx.Msrs.PinCtls.n.allowed0, fVal, fZap)); + pVCpu->hm.s.u32HMError = VMX_UFC_CTRL_PIN_EXEC; + return VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO; + } + + /* Commit it to the VMCS and update our cache. */ + int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PIN_EXEC, fVal); + AssertRCReturn(rc, rc); + pVCpu->hm.s.vmx.u32PinCtls = fVal; + + return VINF_SUCCESS; +} + + +/** + * Sets up secondary processor-based VM-execution controls in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks We don't really care about optimizing vmwrites here as it's done only + * once per VM and hence we don't care about VMCS-field cache comparisons. + */ +static int hmR0VmxSetupProcCtls2(PVMCPU pVCpu) +{ + PVM pVM = pVCpu->CTX_SUFF(pVM); + uint32_t fVal = pVM->hm.s.vmx.Msrs.ProcCtls2.n.allowed0; /* Bits set here must be set in the VMCS. */ + uint32_t const fZap = pVM->hm.s.vmx.Msrs.ProcCtls2.n.allowed1; /* Bits cleared here must be cleared in the VMCS. */ + + /* WBINVD causes a VM-exit. */ + if (pVM->hm.s.vmx.Msrs.ProcCtls2.n.allowed1 & VMX_PROC_CTLS2_WBINVD_EXIT) + fVal |= VMX_PROC_CTLS2_WBINVD_EXIT; + + /* Enable EPT (aka nested-paging). */ + if (pVM->hm.s.fNestedPaging) + fVal |= VMX_PROC_CTLS2_EPT; + + /* + * Enable the INVPCID instruction if supported by the hardware and we expose + * it to the guest. 
Without this, guest executing INVPCID would cause a #UD. + */ + if ( (pVM->hm.s.vmx.Msrs.ProcCtls2.n.allowed1 & VMX_PROC_CTLS2_INVPCID) + && pVM->cpum.ro.GuestFeatures.fInvpcid) + fVal |= VMX_PROC_CTLS2_INVPCID; + + /* Enable VPID. */ + if (pVM->hm.s.vmx.fVpid) + fVal |= VMX_PROC_CTLS2_VPID; + + /* Enable Unrestricted guest execution. */ + if (pVM->hm.s.vmx.fUnrestrictedGuest) + fVal |= VMX_PROC_CTLS2_UNRESTRICTED_GUEST; + +#if 0 + if (pVM->hm.s.fVirtApicRegs) + { + /* Enable APIC-register virtualization. */ + Assert(pVM->hm.s.vmx.Msrs.ProcCtls2.n.allowed1 & VMX_PROC_CTLS2_APIC_REG_VIRT); + fVal |= VMX_PROC_CTLS2_APIC_REG_VIRT; + + /* Enable virtual-interrupt delivery. */ + Assert(pVM->hm.s.vmx.Msrs.ProcCtls2.n.allowed1 & VMX_PROC_CTLS2_VIRT_INTR_DELIVERY); + fVal |= VMX_PROC_CTLS2_VIRT_INTR_DELIVERY; + } +#endif + + /* Virtualize-APIC accesses if supported by the CPU. The virtual-APIC page is where the TPR shadow resides. */ + /** @todo VIRT_X2APIC support, it's mutually exclusive with this. So must be + * done dynamically. */ + if (pVM->hm.s.vmx.Msrs.ProcCtls2.n.allowed1 & VMX_PROC_CTLS2_VIRT_APIC_ACCESS) + { + Assert(pVM->hm.s.vmx.HCPhysApicAccess); + Assert(!(pVM->hm.s.vmx.HCPhysApicAccess & 0xfff)); /* Bits 11:0 MBZ. */ + fVal |= VMX_PROC_CTLS2_VIRT_APIC_ACCESS; /* Virtualize APIC accesses. */ + int rc = VMXWriteVmcs64(VMX_VMCS64_CTRL_APIC_ACCESSADDR_FULL, pVM->hm.s.vmx.HCPhysApicAccess); + AssertRCReturn(rc, rc); + } + + /* Enable RDTSCP. */ + if (pVM->hm.s.vmx.Msrs.ProcCtls2.n.allowed1 & VMX_PROC_CTLS2_RDTSCP) + fVal |= VMX_PROC_CTLS2_RDTSCP; + + /* Enable Pause-Loop exiting. 
*/ + if ( pVM->hm.s.vmx.Msrs.ProcCtls2.n.allowed1 & VMX_PROC_CTLS2_PAUSE_LOOP_EXIT + && pVM->hm.s.vmx.cPleGapTicks + && pVM->hm.s.vmx.cPleWindowTicks) + { + fVal |= VMX_PROC_CTLS2_PAUSE_LOOP_EXIT; + + int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PLE_GAP, pVM->hm.s.vmx.cPleGapTicks); + rc |= VMXWriteVmcs32(VMX_VMCS32_CTRL_PLE_WINDOW, pVM->hm.s.vmx.cPleWindowTicks); + AssertRCReturn(rc, rc); + } + + if ((fVal & fZap) != fVal) + { + LogRelFunc(("Invalid secondary processor-based VM-execution controls combo! cpu=%#RX32 fVal=%#RX32 fZap=%#RX32\n", + pVM->hm.s.vmx.Msrs.ProcCtls2.n.allowed0, fVal, fZap)); + pVCpu->hm.s.u32HMError = VMX_UFC_CTRL_PROC_EXEC2; + return VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO; + } + + /* Commit it to the VMCS and update our cache. */ + int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC2, fVal); + AssertRCReturn(rc, rc); + pVCpu->hm.s.vmx.u32ProcCtls2 = fVal; + + return VINF_SUCCESS; +} + + +/** + * Sets up processor-based VM-execution controls in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks We don't really care about optimizing vmwrites here as it's done only + * once per VM and hence we don't care about VMCS-field cache comparisons. + */ +static int hmR0VmxSetupProcCtls(PVMCPU pVCpu) +{ + PVM pVM = pVCpu->CTX_SUFF(pVM); + uint32_t fVal = pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed0; /* Bits set here must be set in the VMCS. */ + uint32_t const fZap = pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed1; /* Bits cleared here must be cleared in the VMCS. */ + + fVal |= VMX_PROC_CTLS_HLT_EXIT /* HLT causes a VM-exit. */ + | VMX_PROC_CTLS_USE_TSC_OFFSETTING /* Use TSC-offsetting. */ + | VMX_PROC_CTLS_MOV_DR_EXIT /* MOV DRx causes a VM-exit. */ + | VMX_PROC_CTLS_UNCOND_IO_EXIT /* All IO instructions cause a VM-exit. */ + | VMX_PROC_CTLS_RDPMC_EXIT /* RDPMC causes a VM-exit. */ + | VMX_PROC_CTLS_MONITOR_EXIT /* MONITOR causes a VM-exit. */ + | VMX_PROC_CTLS_MWAIT_EXIT; /* MWAIT causes a VM-exit. 
*/ + + /* We toggle VMX_PROC_CTLS_MOV_DR_EXIT later, check if it's not -always- needed to be set or clear. */ + if ( !(pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed1 & VMX_PROC_CTLS_MOV_DR_EXIT) + || (pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed0 & VMX_PROC_CTLS_MOV_DR_EXIT)) + { + LogRelFunc(("Unsupported VMX_PROC_CTLS_MOV_DR_EXIT combo!")); + pVCpu->hm.s.u32HMError = VMX_UFC_CTRL_PROC_MOV_DRX_EXIT; + return VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO; + } + + /* Without Nested Paging, INVLPG (also affects INVPCID) and MOV CR3 instructions should cause VM-exits. */ + if (!pVM->hm.s.fNestedPaging) + { + Assert(!pVM->hm.s.vmx.fUnrestrictedGuest); /* Paranoia. */ + fVal |= VMX_PROC_CTLS_INVLPG_EXIT + | VMX_PROC_CTLS_CR3_LOAD_EXIT + | VMX_PROC_CTLS_CR3_STORE_EXIT; + } + + /* Use TPR shadowing if supported by the CPU. */ + if ( PDMHasApic(pVM) + && pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed1 & VMX_PROC_CTLS_USE_TPR_SHADOW) + { + Assert(pVCpu->hm.s.vmx.HCPhysVirtApic); + Assert(!(pVCpu->hm.s.vmx.HCPhysVirtApic & 0xfff)); /* Bits 11:0 MBZ. */ + int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_TPR_THRESHOLD, 0); + rc |= VMXWriteVmcs64(VMX_VMCS64_CTRL_VIRT_APIC_PAGEADDR_FULL, pVCpu->hm.s.vmx.HCPhysVirtApic); + AssertRCReturn(rc, rc); + + fVal |= VMX_PROC_CTLS_USE_TPR_SHADOW; /* CR8 reads from the Virtual-APIC page. */ + /* CR8 writes cause a VM-exit based on TPR threshold. */ + Assert(!(fVal & VMX_PROC_CTLS_CR8_STORE_EXIT)); + Assert(!(fVal & VMX_PROC_CTLS_CR8_LOAD_EXIT)); + } + else + { + /* + * Some 32-bit CPUs do not support CR8 load/store exiting as MOV CR8 is invalid on 32-bit Intel CPUs. + * Set this control only for 64-bit guests. + */ + if (pVM->hm.s.fAllow64BitGuests) + { + fVal |= VMX_PROC_CTLS_CR8_STORE_EXIT /* CR8 reads cause a VM-exit. */ + | VMX_PROC_CTLS_CR8_LOAD_EXIT; /* CR8 writes cause a VM-exit. */ + } + } + + /* Use MSR-bitmaps if supported by the CPU. 
*/ + if (pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed1 & VMX_PROC_CTLS_USE_MSR_BITMAPS) + { + fVal |= VMX_PROC_CTLS_USE_MSR_BITMAPS; + + Assert(pVCpu->hm.s.vmx.HCPhysMsrBitmap); + Assert(!(pVCpu->hm.s.vmx.HCPhysMsrBitmap & 0xfff)); /* Bits 11:0 MBZ. */ + int rc = VMXWriteVmcs64(VMX_VMCS64_CTRL_MSR_BITMAP_FULL, pVCpu->hm.s.vmx.HCPhysMsrBitmap); + AssertRCReturn(rc, rc); + + /* + * The guest can access the following MSRs (read, write) without causing VM-exits; they are loaded/stored + * automatically using dedicated fields in the VMCS. + */ + hmR0VmxSetMsrPermission(pVCpu, MSR_IA32_SYSENTER_CS, VMXMSREXIT_PASSTHRU_READ, VMXMSREXIT_PASSTHRU_WRITE); + hmR0VmxSetMsrPermission(pVCpu, MSR_IA32_SYSENTER_ESP, VMXMSREXIT_PASSTHRU_READ, VMXMSREXIT_PASSTHRU_WRITE); + hmR0VmxSetMsrPermission(pVCpu, MSR_IA32_SYSENTER_EIP, VMXMSREXIT_PASSTHRU_READ, VMXMSREXIT_PASSTHRU_WRITE); + hmR0VmxSetMsrPermission(pVCpu, MSR_K8_GS_BASE, VMXMSREXIT_PASSTHRU_READ, VMXMSREXIT_PASSTHRU_WRITE); + hmR0VmxSetMsrPermission(pVCpu, MSR_K8_FS_BASE, VMXMSREXIT_PASSTHRU_READ, VMXMSREXIT_PASSTHRU_WRITE); +#if HC_ARCH_BITS == 64 + /* + * Set passthru permissions for the following MSRs (mandatory for VT-x) required for 64-bit guests. + */ + if (pVM->hm.s.fAllow64BitGuests) + { + hmR0VmxSetMsrPermission(pVCpu, MSR_K8_LSTAR, VMXMSREXIT_PASSTHRU_READ, VMXMSREXIT_PASSTHRU_WRITE); + hmR0VmxSetMsrPermission(pVCpu, MSR_K6_STAR, VMXMSREXIT_PASSTHRU_READ, VMXMSREXIT_PASSTHRU_WRITE); + hmR0VmxSetMsrPermission(pVCpu, MSR_K8_SF_MASK, VMXMSREXIT_PASSTHRU_READ, VMXMSREXIT_PASSTHRU_WRITE); + hmR0VmxSetMsrPermission(pVCpu, MSR_K8_KERNEL_GS_BASE, VMXMSREXIT_PASSTHRU_READ, VMXMSREXIT_PASSTHRU_WRITE); + } +#endif + /* + * The IA32_PRED_CMD and IA32_FLUSH_CMD MSRs are write-only and has no state + * associated with then. We never need to intercept access (writes need to + * be executed without exiting, reads will #GP-fault anyway). 
+ */ + if (pVM->cpum.ro.GuestFeatures.fIbpb) + hmR0VmxSetMsrPermission(pVCpu, MSR_IA32_PRED_CMD, VMXMSREXIT_PASSTHRU_READ, VMXMSREXIT_PASSTHRU_WRITE); + if (pVM->cpum.ro.GuestFeatures.fFlushCmd) + hmR0VmxSetMsrPermission(pVCpu, MSR_IA32_FLUSH_CMD, VMXMSREXIT_PASSTHRU_READ, VMXMSREXIT_PASSTHRU_WRITE); + + /* Though MSR_IA32_PERF_GLOBAL_CTRL is saved/restored lazily, we want intercept reads/write to it for now. */ + } + + /* Use the secondary processor-based VM-execution controls if supported by the CPU. */ + if (pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed1 & VMX_PROC_CTLS_USE_SECONDARY_CTLS) + fVal |= VMX_PROC_CTLS_USE_SECONDARY_CTLS; + + if ((fVal & fZap) != fVal) + { + LogRelFunc(("Invalid processor-based VM-execution controls combo! cpu=%#RX32 fVal=%#RX32 fZap=%#RX32\n", + pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed0, fVal, fZap)); + pVCpu->hm.s.u32HMError = VMX_UFC_CTRL_PROC_EXEC; + return VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO; + } + + /* Commit it to the VMCS and update our cache. */ + int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, fVal); + AssertRCReturn(rc, rc); + pVCpu->hm.s.vmx.u32ProcCtls = fVal; + + /* Set up secondary processor-based VM-execution controls if the CPU supports it. */ + if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_SECONDARY_CTLS) + return hmR0VmxSetupProcCtls2(pVCpu); + + /* Sanity check, should not really happen. */ + if (RT_UNLIKELY(pVM->hm.s.vmx.fUnrestrictedGuest)) + { + LogRelFunc(("Unrestricted Guest enabled when secondary processor-based VM-execution controls not available\n")); + pVCpu->hm.s.u32HMError = VMX_UFC_INVALID_UX_COMBO; + return VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO; + } + + /* Old CPUs without secondary processor-based VM-execution controls would end up here. */ + return VINF_SUCCESS; +} + + +/** + * Sets up miscellaneous (everything other than Pin & Processor-based + * VM-execution) control fields in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. 
+ */ +static int hmR0VmxSetupMiscCtls(PVMCPU pVCpu) +{ + AssertPtr(pVCpu); + + int rc = VERR_GENERAL_FAILURE; + + /* All fields are zero-initialized during allocation; but don't remove the commented block below. */ +#if 0 + /* All CR3 accesses cause VM-exits. Later we optimize CR3 accesses (see hmR0VmxExportGuestCR3AndCR4())*/ + rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_CR3_TARGET_COUNT, 0); + rc |= VMXWriteVmcs64(VMX_VMCS64_CTRL_TSC_OFFSET_FULL, 0); + + /* + * Set MASK & MATCH to 0. VMX checks if GuestPFErrCode & MASK == MATCH. If equal (in our case it always is) + * and if the X86_XCPT_PF bit in the exception bitmap is set it causes a VM-exit, if clear doesn't cause an exit. + * We thus use the exception bitmap to control it rather than use both. + */ + rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PAGEFAULT_ERROR_MASK, 0); + rc |= VMXWriteVmcs32(VMX_VMCS32_CTRL_PAGEFAULT_ERROR_MATCH, 0); + + /* All IO & IOIO instructions cause VM-exits. */ + rc |= VMXWriteVmcs64(VMX_VMCS64_CTRL_IO_BITMAP_A_FULL, 0); + rc |= VMXWriteVmcs64(VMX_VMCS64_CTRL_IO_BITMAP_B_FULL, 0); + + /* Initialize the MSR-bitmap area. */ + rc |= VMXWriteVmcs32(VMX_VMCS32_CTRL_ENTRY_MSR_LOAD_COUNT, 0); + rc |= VMXWriteVmcs32(VMX_VMCS32_CTRL_EXIT_MSR_STORE_COUNT, 0); + rc |= VMXWriteVmcs32(VMX_VMCS32_CTRL_EXIT_MSR_LOAD_COUNT, 0); + AssertRCReturn(rc, rc); +#endif + + /* Setup MSR auto-load/store area. */ + Assert(pVCpu->hm.s.vmx.HCPhysGuestMsr); + Assert(!(pVCpu->hm.s.vmx.HCPhysGuestMsr & 0xf)); /* Lower 4 bits MBZ. */ + rc = VMXWriteVmcs64(VMX_VMCS64_CTRL_ENTRY_MSR_LOAD_FULL, pVCpu->hm.s.vmx.HCPhysGuestMsr); + rc |= VMXWriteVmcs64(VMX_VMCS64_CTRL_EXIT_MSR_STORE_FULL, pVCpu->hm.s.vmx.HCPhysGuestMsr); + AssertRCReturn(rc, rc); + + Assert(pVCpu->hm.s.vmx.HCPhysHostMsr); + Assert(!(pVCpu->hm.s.vmx.HCPhysHostMsr & 0xf)); /* Lower 4 bits MBZ. */ + rc = VMXWriteVmcs64(VMX_VMCS64_CTRL_EXIT_MSR_LOAD_FULL, pVCpu->hm.s.vmx.HCPhysHostMsr); + AssertRCReturn(rc, rc); + + /* Set VMCS link pointer. 
Reserved for future use, must be -1. Intel spec. 24.4 "Guest-State Area". */ + rc = VMXWriteVmcs64(VMX_VMCS64_GUEST_VMCS_LINK_PTR_FULL, UINT64_C(0xffffffffffffffff)); + AssertRCReturn(rc, rc); + + /* All fields are zero-initialized during allocation; but don't remove the commented block below. */ +#if 0 + /* Setup debug controls */ + rc = VMXWriteVmcs64(VMX_VMCS64_GUEST_DEBUGCTL_FULL, 0); + rc |= VMXWriteVmcs32(VMX_VMCS_GUEST_PENDING_DEBUG_XCPTS, 0); + AssertRCReturn(rc, rc); +#endif + + return rc; +} + + +/** + * Sets up the initial exception bitmap in the VMCS based on static conditions. + * + * We shall setup those exception intercepts that don't change during the + * lifetime of the VM here. The rest are done dynamically while loading the + * guest state. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + */ +static int hmR0VmxInitXcptBitmap(PVMCPU pVCpu) +{ + AssertPtr(pVCpu); + + uint32_t uXcptBitmap; + + /* Must always intercept #AC to prevent the guest from hanging the CPU. */ + uXcptBitmap = RT_BIT_32(X86_XCPT_AC); + + /* Because we need to maintain the DR6 state even when intercepting DRx reads + and writes, and because recursive #DBs can cause the CPU hang, we must always + intercept #DB. */ + uXcptBitmap |= RT_BIT_32(X86_XCPT_DB); + + /* Without Nested Paging, #PF must cause a VM-exit so we can sync our shadow page tables. */ + if (!pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging) + uXcptBitmap |= RT_BIT(X86_XCPT_PF); + + /* Commit it to the VMCS. */ + int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_EXCEPTION_BITMAP, uXcptBitmap); + AssertRCReturn(rc, rc); + + /* Update our cache of the exception bitmap. */ + pVCpu->hm.s.vmx.u32XcptBitmap = uXcptBitmap; + return VINF_SUCCESS; +} + + +/** + * Does per-VM VT-x initialization. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. 
+ */ +VMMR0DECL(int) VMXR0InitVM(PVM pVM) +{ + LogFlowFunc(("pVM=%p\n", pVM)); + + int rc = hmR0VmxStructsAlloc(pVM); + if (RT_FAILURE(rc)) + { + LogRelFunc(("hmR0VmxStructsAlloc failed! rc=%Rrc\n", rc)); + return rc; + } + + return VINF_SUCCESS; +} + + +/** + * Does per-VM VT-x termination. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMR0DECL(int) VMXR0TermVM(PVM pVM) +{ + LogFlowFunc(("pVM=%p\n", pVM)); + +#ifdef VBOX_WITH_CRASHDUMP_MAGIC + if (pVM->hm.s.vmx.hMemObjScratch != NIL_RTR0MEMOBJ) + ASMMemZero32(pVM->hm.s.vmx.pvScratch, PAGE_SIZE); +#endif + hmR0VmxStructsFree(pVM); + return VINF_SUCCESS; +} + + +/** + * Sets up the VM for execution under VT-x. + * This function is only called once per-VM during initialization. + * + * @returns VBox status code. + * @param pVM The cross context VM structure. + */ +VMMR0DECL(int) VMXR0SetupVM(PVM pVM) +{ + AssertPtrReturn(pVM, VERR_INVALID_PARAMETER); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + LogFlowFunc(("pVM=%p\n", pVM)); + + /* + * Without UnrestrictedGuest, pRealModeTSS and pNonPagingModeEPTPageTable *must* always be + * allocated. We no longer support the highly unlikely case of UnrestrictedGuest without + * pRealModeTSS, see hmR3InitFinalizeR0Intel(). + */ + if ( !pVM->hm.s.vmx.fUnrestrictedGuest + && ( !pVM->hm.s.vmx.pNonPagingModeEPTPageTable + || !pVM->hm.s.vmx.pRealModeTSS)) + { + LogRelFunc(("Invalid real-on-v86 state.\n")); + return VERR_INTERNAL_ERROR; + } + + /* Initialize these always, see hmR3InitFinalizeR0().*/ + pVM->hm.s.vmx.enmTlbFlushEpt = VMXTLBFLUSHEPT_NONE; + pVM->hm.s.vmx.enmTlbFlushVpid = VMXTLBFLUSHVPID_NONE; + + /* Setup the tagged-TLB flush handlers. */ + int rc = hmR0VmxSetupTaggedTlb(pVM); + if (RT_FAILURE(rc)) + { + LogRelFunc(("hmR0VmxSetupTaggedTlb failed! rc=%Rrc\n", rc)); + return rc; + } + + /* Check if we can use the VMCS controls for swapping the EFER MSR. 
*/ + Assert(!pVM->hm.s.vmx.fSupportsVmcsEfer); +#if HC_ARCH_BITS == 64 + if ( (pVM->hm.s.vmx.Msrs.EntryCtls.n.allowed1 & VMX_ENTRY_CTLS_LOAD_EFER_MSR) + && (pVM->hm.s.vmx.Msrs.ExitCtls.n.allowed1 & VMX_EXIT_CTLS_LOAD_EFER_MSR) + && (pVM->hm.s.vmx.Msrs.ExitCtls.n.allowed1 & VMX_EXIT_CTLS_SAVE_EFER_MSR)) + { + pVM->hm.s.vmx.fSupportsVmcsEfer = true; + } +#endif + + /* At least verify VMX is enabled, since we can't check if we're in VMX root mode without #GP'ing. */ + RTCCUINTREG const uHostCR4 = ASMGetCR4(); + if (RT_UNLIKELY(!(uHostCR4 & X86_CR4_VMXE))) + return VERR_VMX_NOT_IN_VMX_ROOT_MODE; + + for (VMCPUID i = 0; i < pVM->cCpus; i++) + { + PVMCPU pVCpu = &pVM->aCpus[i]; + AssertPtr(pVCpu); + AssertPtr(pVCpu->hm.s.vmx.pvVmcs); + + /* Log the VCPU pointers, useful for debugging SMP VMs. */ + Log4Func(("pVCpu=%p idCpu=%RU32\n", pVCpu, pVCpu->idCpu)); + + /* Set revision dword at the beginning of the VMCS structure. */ + *(uint32_t *)pVCpu->hm.s.vmx.pvVmcs = RT_BF_GET(pVM->hm.s.vmx.Msrs.u64Basic, VMX_BF_BASIC_VMCS_ID); + + /* Set the VMCS launch state to "clear", see Intel spec. 31.6 "Preparation and launch a virtual machine". */ + rc = VMXClearVmcs(pVCpu->hm.s.vmx.HCPhysVmcs); + AssertLogRelMsgRCReturnStmt(rc, ("VMXR0SetupVM: VMXClearVmcs failed! rc=%Rrc\n", rc), + hmR0VmxUpdateErrorRecord(pVCpu, rc), rc); + + /* Load this VMCS as the current VMCS. */ + rc = VMXActivateVmcs(pVCpu->hm.s.vmx.HCPhysVmcs); + AssertLogRelMsgRCReturnStmt(rc, ("VMXR0SetupVM: VMXActivateVmcs failed! rc=%Rrc\n", rc), + hmR0VmxUpdateErrorRecord(pVCpu, rc), rc); + + rc = hmR0VmxSetupPinCtls(pVCpu); + AssertLogRelMsgRCReturnStmt(rc, ("VMXR0SetupVM: hmR0VmxSetupPinCtls failed! rc=%Rrc\n", rc), + hmR0VmxUpdateErrorRecord(pVCpu, rc), rc); + + rc = hmR0VmxSetupProcCtls(pVCpu); + AssertLogRelMsgRCReturnStmt(rc, ("VMXR0SetupVM: hmR0VmxSetupProcCtls failed! 
rc=%Rrc\n", rc), + hmR0VmxUpdateErrorRecord(pVCpu, rc), rc); + + rc = hmR0VmxSetupMiscCtls(pVCpu); + AssertLogRelMsgRCReturnStmt(rc, ("VMXR0SetupVM: hmR0VmxSetupMiscCtls failed! rc=%Rrc\n", rc), + hmR0VmxUpdateErrorRecord(pVCpu, rc), rc); + + rc = hmR0VmxInitXcptBitmap(pVCpu); + AssertLogRelMsgRCReturnStmt(rc, ("VMXR0SetupVM: hmR0VmxInitXcptBitmap failed! rc=%Rrc\n", rc), + hmR0VmxUpdateErrorRecord(pVCpu, rc), rc); + +#if HC_ARCH_BITS == 32 + rc = hmR0VmxInitVmcsReadCache(pVCpu); + AssertLogRelMsgRCReturnStmt(rc, ("VMXR0SetupVM: hmR0VmxInitVmcsReadCache failed! rc=%Rrc\n", rc), + hmR0VmxUpdateErrorRecord(pVCpu, rc), rc); +#endif + + /* Sync any CPU internal VMCS data back into our VMCS in memory. */ + rc = VMXClearVmcs(pVCpu->hm.s.vmx.HCPhysVmcs); + AssertLogRelMsgRCReturnStmt(rc, ("VMXR0SetupVM: VMXClearVmcs(2) failed! rc=%Rrc\n", rc), + hmR0VmxUpdateErrorRecord(pVCpu, rc), rc); + + pVCpu->hm.s.vmx.fVmcsState = HMVMX_VMCS_STATE_CLEAR; + + hmR0VmxUpdateErrorRecord(pVCpu, rc); + } + + return VINF_SUCCESS; +} + + +/** + * Saves the host control registers (CR0, CR3, CR4) into the host-state area in + * the VMCS. + * + * @returns VBox status code. + */ +static int hmR0VmxExportHostControlRegs(void) +{ + RTCCUINTREG uReg = ASMGetCR0(); + int rc = VMXWriteVmcsHstN(VMX_VMCS_HOST_CR0, uReg); + AssertRCReturn(rc, rc); + + uReg = ASMGetCR3(); + rc = VMXWriteVmcsHstN(VMX_VMCS_HOST_CR3, uReg); + AssertRCReturn(rc, rc); + + uReg = ASMGetCR4(); + rc = VMXWriteVmcsHstN(VMX_VMCS_HOST_CR4, uReg); + AssertRCReturn(rc, rc); + return rc; +} + + +/** + * Saves the host segment registers and GDTR, IDTR, (TR, GS and FS bases) into + * the host-state area in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + */ +static int hmR0VmxExportHostSegmentRegs(PVMCPU pVCpu) +{ +#if HC_ARCH_BITS == 64 +/** + * Macro for adjusting host segment selectors to satisfy VT-x's VM-entry + * requirements. See hmR0VmxExportHostSegmentRegs(). 
+ */ +# define VMXLOCAL_ADJUST_HOST_SEG(seg, selValue) \ + if ((selValue) & (X86_SEL_RPL | X86_SEL_LDT)) \ + { \ + bool fValidSelector = true; \ + if ((selValue) & X86_SEL_LDT) \ + { \ + uint32_t uAttr = ASMGetSegAttr((selValue)); \ + fValidSelector = RT_BOOL(uAttr != UINT32_MAX && (uAttr & X86_DESC_P)); \ + } \ + if (fValidSelector) \ + { \ + pVCpu->hm.s.vmx.fRestoreHostFlags |= VMX_RESTORE_HOST_SEL_##seg; \ + pVCpu->hm.s.vmx.RestoreHost.uHostSel##seg = (selValue); \ + } \ + (selValue) = 0; \ + } + + /* + * If we've executed guest code using VT-x, the host-state bits will be messed up. We + * should -not- save the messed up state without restoring the original host-state, + * see @bugref{7240}. + * + * This apparently can happen (most likely the FPU changes), deal with it rather than + * asserting. Was observed booting Solaris 10u10 32-bit guest. + */ + if ( (pVCpu->hm.s.vmx.fRestoreHostFlags & VMX_RESTORE_HOST_REQUIRED) + && (pVCpu->hm.s.vmx.fRestoreHostFlags & ~VMX_RESTORE_HOST_REQUIRED)) + { + Log4Func(("Restoring Host State: fRestoreHostFlags=%#RX32 HostCpuId=%u\n", pVCpu->hm.s.vmx.fRestoreHostFlags, + pVCpu->idCpu)); + VMXRestoreHostState(pVCpu->hm.s.vmx.fRestoreHostFlags, &pVCpu->hm.s.vmx.RestoreHost); + } + pVCpu->hm.s.vmx.fRestoreHostFlags = 0; +#else + RT_NOREF(pVCpu); +#endif + + /* + * Host DS, ES, FS and GS segment registers. + */ +#if HC_ARCH_BITS == 64 + RTSEL uSelDS = ASMGetDS(); + RTSEL uSelES = ASMGetES(); + RTSEL uSelFS = ASMGetFS(); + RTSEL uSelGS = ASMGetGS(); +#else + RTSEL uSelDS = 0; + RTSEL uSelES = 0; + RTSEL uSelFS = 0; + RTSEL uSelGS = 0; +#endif + + /* + * Host CS and SS segment registers. + */ + RTSEL uSelCS = ASMGetCS(); + RTSEL uSelSS = ASMGetSS(); + + /* + * Host TR segment register. + */ + RTSEL uSelTR = ASMGetTR(); + +#if HC_ARCH_BITS == 64 + /* + * Determine if the host segment registers are suitable for VT-x. Otherwise use zero to + * gain VM-entry and restore them before we get preempted. + * + * See Intel spec. 
26.2.3 "Checks on Host Segment and Descriptor-Table Registers". + */ + VMXLOCAL_ADJUST_HOST_SEG(DS, uSelDS); + VMXLOCAL_ADJUST_HOST_SEG(ES, uSelES); + VMXLOCAL_ADJUST_HOST_SEG(FS, uSelFS); + VMXLOCAL_ADJUST_HOST_SEG(GS, uSelGS); +# undef VMXLOCAL_ADJUST_HOST_SEG +#endif + + /* Verification based on Intel spec. 26.2.3 "Checks on Host Segment and Descriptor-Table Registers" */ + Assert(!(uSelCS & X86_SEL_RPL)); Assert(!(uSelCS & X86_SEL_LDT)); + Assert(!(uSelSS & X86_SEL_RPL)); Assert(!(uSelSS & X86_SEL_LDT)); + Assert(!(uSelDS & X86_SEL_RPL)); Assert(!(uSelDS & X86_SEL_LDT)); + Assert(!(uSelES & X86_SEL_RPL)); Assert(!(uSelES & X86_SEL_LDT)); + Assert(!(uSelFS & X86_SEL_RPL)); Assert(!(uSelFS & X86_SEL_LDT)); + Assert(!(uSelGS & X86_SEL_RPL)); Assert(!(uSelGS & X86_SEL_LDT)); + Assert(!(uSelTR & X86_SEL_RPL)); Assert(!(uSelTR & X86_SEL_LDT)); + Assert(uSelCS); + Assert(uSelTR); + + /* Assertion is right but we would not have updated u32ExitCtls yet. */ +#if 0 + if (!(pVCpu->hm.s.vmx.u32ExitCtls & VMX_EXIT_CTLS_HOST_ADDR_SPACE_SIZE)) + Assert(uSelSS != 0); +#endif + + /* Write these host selector fields into the host-state area in the VMCS. */ + int rc = VMXWriteVmcs32(VMX_VMCS16_HOST_CS_SEL, uSelCS); + rc |= VMXWriteVmcs32(VMX_VMCS16_HOST_SS_SEL, uSelSS); +#if HC_ARCH_BITS == 64 + rc |= VMXWriteVmcs32(VMX_VMCS16_HOST_DS_SEL, uSelDS); + rc |= VMXWriteVmcs32(VMX_VMCS16_HOST_ES_SEL, uSelES); + rc |= VMXWriteVmcs32(VMX_VMCS16_HOST_FS_SEL, uSelFS); + rc |= VMXWriteVmcs32(VMX_VMCS16_HOST_GS_SEL, uSelGS); +#else + NOREF(uSelDS); + NOREF(uSelES); + NOREF(uSelFS); + NOREF(uSelGS); +#endif + rc |= VMXWriteVmcs32(VMX_VMCS16_HOST_TR_SEL, uSelTR); + AssertRCReturn(rc, rc); + + /* + * Host GDTR and IDTR. 
+ */ + RTGDTR Gdtr; + RTIDTR Idtr; + RT_ZERO(Gdtr); + RT_ZERO(Idtr); + ASMGetGDTR(&Gdtr); + ASMGetIDTR(&Idtr); + rc = VMXWriteVmcsHstN(VMX_VMCS_HOST_GDTR_BASE, Gdtr.pGdt); + rc |= VMXWriteVmcsHstN(VMX_VMCS_HOST_IDTR_BASE, Idtr.pIdt); + AssertRCReturn(rc, rc); + +#if HC_ARCH_BITS == 64 + /* + * Determine if we need to manually need to restore the GDTR and IDTR limits as VT-x zaps + * them to the maximum limit (0xffff) on every VM-exit. + */ + if (Gdtr.cbGdt != 0xffff) + pVCpu->hm.s.vmx.fRestoreHostFlags |= VMX_RESTORE_HOST_GDTR; + + /* + * IDT limit is effectively capped at 0xfff. (See Intel spec. 6.14.1 "64-Bit Mode IDT" and + * Intel spec. 6.2 "Exception and Interrupt Vectors".) Therefore if the host has the limit + * as 0xfff, VT-x bloating the limit to 0xffff shouldn't cause any different CPU behavior. + * However, several hosts either insists on 0xfff being the limit (Windows Patch Guard) or + * uses the limit for other purposes (darwin puts the CPU ID in there but botches sidt + * alignment in at least one consumer). So, we're only allowing the IDTR.LIMIT to be left + * at 0xffff on hosts where we are sure it won't cause trouble. + */ +# if defined(RT_OS_LINUX) || defined(RT_OS_SOLARIS) + if (Idtr.cbIdt < 0x0fff) +# else + if (Idtr.cbIdt != 0xffff) +# endif + { + pVCpu->hm.s.vmx.fRestoreHostFlags |= VMX_RESTORE_HOST_IDTR; + AssertCompile(sizeof(Idtr) == sizeof(X86XDTR64)); + memcpy(&pVCpu->hm.s.vmx.RestoreHost.HostIdtr, &Idtr, sizeof(X86XDTR64)); + } +#endif + + /* + * Host TR base. Verify that TR selector doesn't point past the GDT. Masking off the TI + * and RPL bits is effectively what the CPU does for "scaling by 8". TI is always 0 and + * RPL should be too in most cases. + */ + AssertMsgReturn((uSelTR | X86_SEL_RPL_LDT) <= Gdtr.cbGdt, + ("TR selector exceeds limit. 
TR=%RTsel cbGdt=%#x\n", uSelTR, Gdtr.cbGdt), VERR_VMX_INVALID_HOST_STATE); + + PCX86DESCHC pDesc = (PCX86DESCHC)(Gdtr.pGdt + (uSelTR & X86_SEL_MASK)); +#if HC_ARCH_BITS == 64 + uintptr_t uTRBase = X86DESC64_BASE(pDesc); + + /* + * VT-x unconditionally restores the TR limit to 0x67 and type to 11 (32-bit busy TSS) on + * all VM-exits. The type is the same for 64-bit busy TSS[1]. The limit needs manual + * restoration if the host has something else. Task switching is not supported in 64-bit + * mode[2], but the limit still matters as IOPM is supported in 64-bit mode. Restoring the + * limit lazily while returning to ring-3 is safe because IOPM is not applicable in ring-0. + * + * [1] See Intel spec. 3.5 "System Descriptor Types". + * [2] See Intel spec. 7.2.3 "TSS Descriptor in 64-bit mode". + */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + Assert(pDesc->System.u4Type == 11); + if ( pDesc->System.u16LimitLow != 0x67 + || pDesc->System.u4LimitHigh) + { + pVCpu->hm.s.vmx.fRestoreHostFlags |= VMX_RESTORE_HOST_SEL_TR; + /* If the host has made GDT read-only, we would need to temporarily toggle CR0.WP before writing the GDT. */ + if (pVM->hm.s.fHostKernelFeatures & SUPKERNELFEATURES_GDT_READ_ONLY) + pVCpu->hm.s.vmx.fRestoreHostFlags |= VMX_RESTORE_HOST_GDT_READ_ONLY; + pVCpu->hm.s.vmx.RestoreHost.uHostSelTR = uSelTR; + } + + /* + * Store the GDTR as we need it when restoring the GDT and while restoring the TR. + */ + if (pVCpu->hm.s.vmx.fRestoreHostFlags & (VMX_RESTORE_HOST_GDTR | VMX_RESTORE_HOST_SEL_TR)) + { + AssertCompile(sizeof(Gdtr) == sizeof(X86XDTR64)); + memcpy(&pVCpu->hm.s.vmx.RestoreHost.HostGdtr, &Gdtr, sizeof(X86XDTR64)); + if (pVM->hm.s.fHostKernelFeatures & SUPKERNELFEATURES_GDT_NEED_WRITABLE) + { + /* The GDT is read-only but the writable GDT is available. 
*/ + pVCpu->hm.s.vmx.fRestoreHostFlags |= VMX_RESTORE_HOST_GDT_NEED_WRITABLE; + pVCpu->hm.s.vmx.RestoreHost.HostGdtrRw.cb = Gdtr.cbGdt; + rc = SUPR0GetCurrentGdtRw(&pVCpu->hm.s.vmx.RestoreHost.HostGdtrRw.uAddr); + AssertRCReturn(rc, rc); + } + } +#else + uintptr_t uTRBase = X86DESC_BASE(pDesc); +#endif + rc = VMXWriteVmcsHstN(VMX_VMCS_HOST_TR_BASE, uTRBase); + AssertRCReturn(rc, rc); + + /* + * Host FS base and GS base. + */ +#if HC_ARCH_BITS == 64 + uint64_t u64FSBase = ASMRdMsr(MSR_K8_FS_BASE); + uint64_t u64GSBase = ASMRdMsr(MSR_K8_GS_BASE); + rc = VMXWriteVmcs64(VMX_VMCS_HOST_FS_BASE, u64FSBase); + rc |= VMXWriteVmcs64(VMX_VMCS_HOST_GS_BASE, u64GSBase); + AssertRCReturn(rc, rc); + + /* Store the base if we have to restore FS or GS manually as we need to restore the base as well. */ + if (pVCpu->hm.s.vmx.fRestoreHostFlags & VMX_RESTORE_HOST_SEL_FS) + pVCpu->hm.s.vmx.RestoreHost.uHostFSBase = u64FSBase; + if (pVCpu->hm.s.vmx.fRestoreHostFlags & VMX_RESTORE_HOST_SEL_GS) + pVCpu->hm.s.vmx.RestoreHost.uHostGSBase = u64GSBase; +#endif + return VINF_SUCCESS; +} + + +/** + * Exports certain host MSRs in the VM-exit MSR-load area and some in the + * host-state area of the VMCS. + * + * Theses MSRs will be automatically restored on the host after every successful + * VM-exit. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportHostMsrs(PVMCPU pVCpu) +{ + AssertPtr(pVCpu); + AssertPtr(pVCpu->hm.s.vmx.pvHostMsr); + + /* + * Save MSRs that we restore lazily (due to preemption or transition to ring-3) + * rather than swapping them on every VM-entry. + */ + hmR0VmxLazySaveHostMsrs(pVCpu); + + /* + * Host Sysenter MSRs. 
+ */ + int rc = VMXWriteVmcs32(VMX_VMCS32_HOST_SYSENTER_CS, ASMRdMsr_Low(MSR_IA32_SYSENTER_CS)); +#if HC_ARCH_BITS == 32 + rc |= VMXWriteVmcs32(VMX_VMCS_HOST_SYSENTER_ESP, ASMRdMsr_Low(MSR_IA32_SYSENTER_ESP)); + rc |= VMXWriteVmcs32(VMX_VMCS_HOST_SYSENTER_EIP, ASMRdMsr_Low(MSR_IA32_SYSENTER_EIP)); +#else + rc |= VMXWriteVmcs64(VMX_VMCS_HOST_SYSENTER_ESP, ASMRdMsr(MSR_IA32_SYSENTER_ESP)); + rc |= VMXWriteVmcs64(VMX_VMCS_HOST_SYSENTER_EIP, ASMRdMsr(MSR_IA32_SYSENTER_EIP)); +#endif + AssertRCReturn(rc, rc); + + /* + * Host EFER MSR. + * + * If the CPU supports the newer VMCS controls for managing EFER, use it. Otherwise it's + * done as part of auto-load/store MSR area in the VMCS, see hmR0VmxExportGuestMsrs(). + */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + if (pVM->hm.s.vmx.fSupportsVmcsEfer) + { + rc = VMXWriteVmcs64(VMX_VMCS64_HOST_EFER_FULL, pVM->hm.s.vmx.u64HostEfer); + AssertRCReturn(rc, rc); + } + + /** @todo IA32_PERF_GLOBALCTRL, IA32_PAT also see hmR0VmxExportGuestExitCtls(). */ + + return VINF_SUCCESS; +} + + +/** + * Figures out if we need to swap the EFER MSR which is particularly expensive. + * + * We check all relevant bits. For now, that's everything besides LMA/LME, as + * these two bits are handled by VM-entry, see hmR0VmxExportGuestExitCtls() and + * hmR0VMxExportGuestEntryCtls(). + * + * @returns true if we need to load guest EFER, false otherwise. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Requires EFER, CR4. + * @remarks No-long-jump zone!!! + */ +static bool hmR0VmxShouldSwapEferMsr(PVMCPU pVCpu) +{ +#ifdef HMVMX_ALWAYS_SWAP_EFER + RT_NOREF(pVCpu); + return true; +#else + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; +#if HC_ARCH_BITS == 32 && defined(VBOX_ENABLE_64_BITS_GUESTS) + /* For 32-bit hosts running 64-bit guests, we always swap EFER in the world-switcher. Nothing to do here. 
*/ + if (CPUMIsGuestInLongModeEx(pCtx)) + return false; +#endif + + PVM pVM = pVCpu->CTX_SUFF(pVM); + uint64_t const u64HostEfer = pVM->hm.s.vmx.u64HostEfer; + uint64_t const u64GuestEfer = pCtx->msrEFER; + + /* + * For 64-bit guests, if EFER.SCE bit differs, we need to swap EFER to ensure that the + * guest's SYSCALL behaviour isn't broken, see @bugref{7386}. + */ + if ( CPUMIsGuestInLongModeEx(pCtx) + && (u64GuestEfer & MSR_K6_EFER_SCE) != (u64HostEfer & MSR_K6_EFER_SCE)) + { + return true; + } + + /* + * If the guest uses PAE and EFER.NXE bit differs, we need to swap EFER as it + * affects guest paging. 64-bit paging implies CR4.PAE as well. + * See Intel spec. 4.5 "IA-32e Paging" and Intel spec. 4.1.1 "Three Paging Modes". + */ + if ( (pCtx->cr4 & X86_CR4_PAE) + && (pCtx->cr0 & X86_CR0_PG) + && (u64GuestEfer & MSR_K6_EFER_NXE) != (u64HostEfer & MSR_K6_EFER_NXE)) + { + /* Assert that host is NX capable. */ + Assert(pVCpu->CTX_SUFF(pVM)->cpum.ro.HostFeatures.fNoExecute); + return true; + } + + return false; +#endif +} + + +/** + * Exports the guest state with appropriate VM-entry controls in the VMCS. + * + * These controls can affect things done on VM-exit; e.g. "load debug controls", + * see Intel spec. 24.8.1 "VM-entry controls". + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Requires EFER. + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportGuestEntryCtls(PVMCPU pVCpu) +{ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_VMX_ENTRY_CTLS) + { + PVM pVM = pVCpu->CTX_SUFF(pVM); + uint32_t fVal = pVM->hm.s.vmx.Msrs.EntryCtls.n.allowed0; /* Bits set here must be set in the VMCS. */ + uint32_t const fZap = pVM->hm.s.vmx.Msrs.EntryCtls.n.allowed1; /* Bits cleared here must be cleared in the VMCS. */ + + /* Load debug controls (DR7 & IA32_DEBUGCTL_MSR). The first VT-x capable CPUs only supports the 1-setting of this bit. 
*/ + fVal |= VMX_ENTRY_CTLS_LOAD_DEBUG; + + /* Set if the guest is in long mode. This will set/clear the EFER.LMA bit on VM-entry. */ + if (CPUMIsGuestInLongModeEx(&pVCpu->cpum.GstCtx)) + { + fVal |= VMX_ENTRY_CTLS_IA32E_MODE_GUEST; + Log4Func(("VMX_ENTRY_CTLS_IA32E_MODE_GUEST\n")); + } + else + Assert(!(fVal & VMX_ENTRY_CTLS_IA32E_MODE_GUEST)); + + /* If the CPU supports the newer VMCS controls for managing guest/host EFER, use it. */ + if ( pVM->hm.s.vmx.fSupportsVmcsEfer + && hmR0VmxShouldSwapEferMsr(pVCpu)) + { + fVal |= VMX_ENTRY_CTLS_LOAD_EFER_MSR; + Log4Func(("VMX_ENTRY_CTLS_LOAD_EFER_MSR\n")); + } + + /* + * The following should -not- be set (since we're not in SMM mode): + * - VMX_ENTRY_CTLS_ENTRY_TO_SMM + * - VMX_ENTRY_CTLS_DEACTIVATE_DUAL_MON + */ + + /** @todo VMX_ENTRY_CTLS_LOAD_PERF_MSR, + * VMX_ENTRY_CTLS_LOAD_PAT_MSR. */ + + if ((fVal & fZap) != fVal) + { + Log4Func(("Invalid VM-entry controls combo! Cpu=%#RX32 fVal=%#RX32 fZap=%#RX32\n", + pVM->hm.s.vmx.Msrs.EntryCtls.n.allowed0, fVal, fZap)); + pVCpu->hm.s.u32HMError = VMX_UFC_CTRL_ENTRY; + return VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO; + } + + /* Commit it to the VMCS and update our cache. */ + if (pVCpu->hm.s.vmx.u32EntryCtls != fVal) + { + int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_ENTRY, fVal); + AssertRCReturn(rc, rc); + pVCpu->hm.s.vmx.u32EntryCtls = fVal; + } + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_VMX_ENTRY_CTLS); + } + return VINF_SUCCESS; +} + + +/** + * Exports the guest state with appropriate VM-exit controls in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Requires EFER. + */ +static int hmR0VmxExportGuestExitCtls(PVMCPU pVCpu) +{ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_VMX_EXIT_CTLS) + { + PVM pVM = pVCpu->CTX_SUFF(pVM); + uint32_t fVal = pVM->hm.s.vmx.Msrs.ExitCtls.n.allowed0; /* Bits set here must be set in the VMCS. 
*/ + uint32_t const fZap = pVM->hm.s.vmx.Msrs.ExitCtls.n.allowed1; /* Bits cleared here must be cleared in the VMCS. */ + + /* Save debug controls (DR7 & IA32_DEBUGCTL_MSR). The first VT-x CPUs only supported the 1-setting of this bit. */ + fVal |= VMX_EXIT_CTLS_SAVE_DEBUG; + + /* + * Set the host long mode active (EFER.LMA) bit (which Intel calls "Host address-space size") if necessary. + * On VM-exit, VT-x sets both the host EFER.LMA and EFER.LME bit to this value. See assertion in + * hmR0VmxExportHostMsrs(). + */ +#if HC_ARCH_BITS == 64 + fVal |= VMX_EXIT_CTLS_HOST_ADDR_SPACE_SIZE; + Log4Func(("VMX_EXIT_CTLS_HOST_ADDR_SPACE_SIZE\n")); +#else + Assert( pVCpu->hm.s.vmx.pfnStartVM == VMXR0SwitcherStartVM64 + || pVCpu->hm.s.vmx.pfnStartVM == VMXR0StartVM32); + /* Set the host address-space size based on the switcher, not guest state. See @bugref{8432}. */ + if (pVCpu->hm.s.vmx.pfnStartVM == VMXR0SwitcherStartVM64) + { + /* The switcher returns to long mode, EFER is managed by the switcher. */ + fVal |= VMX_EXIT_CTLS_HOST_ADDR_SPACE_SIZE; + Log4Func(("VMX_EXIT_CTLS_HOST_ADDR_SPACE_SIZE\n")); + } + else + Assert(!(fVal & VMX_EXIT_CTLS_HOST_ADDR_SPACE_SIZE)); +#endif + + /* If the newer VMCS fields for managing EFER exists, use it. */ + if ( pVM->hm.s.vmx.fSupportsVmcsEfer + && hmR0VmxShouldSwapEferMsr(pVCpu)) + { + fVal |= VMX_EXIT_CTLS_SAVE_EFER_MSR + | VMX_EXIT_CTLS_LOAD_EFER_MSR; + Log4Func(("VMX_EXIT_CTLS_SAVE_EFER_MSR and VMX_EXIT_CTLS_LOAD_EFER_MSR\n")); + } + + /* Don't acknowledge external interrupts on VM-exit. We want to let the host do that. */ + Assert(!(fVal & VMX_EXIT_CTLS_ACK_EXT_INT)); + + /** @todo VMX_EXIT_CTLS_LOAD_PERF_MSR, + * VMX_EXIT_CTLS_SAVE_PAT_MSR, + * VMX_EXIT_CTLS_LOAD_PAT_MSR. */ + + /* Enable saving of the VMX preemption timer value on VM-exit. 
*/ + if ( pVM->hm.s.vmx.fUsePreemptTimer + && (pVM->hm.s.vmx.Msrs.ExitCtls.n.allowed1 & VMX_EXIT_CTLS_SAVE_PREEMPT_TIMER)) + fVal |= VMX_EXIT_CTLS_SAVE_PREEMPT_TIMER; + + if ((fVal & fZap) != fVal) + { + LogRelFunc(("Invalid VM-exit controls combo! cpu=%#RX32 fVal=%#RX32 fZap=%R#X32\n", + pVM->hm.s.vmx.Msrs.ExitCtls.n.allowed0, fVal, fZap)); + pVCpu->hm.s.u32HMError = VMX_UFC_CTRL_EXIT; + return VERR_HM_UNSUPPORTED_CPU_FEATURE_COMBO; + } + + /* Commit it to the VMCS and update our cache. */ + if (pVCpu->hm.s.vmx.u32ExitCtls != fVal) + { + int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_EXIT, fVal); + AssertRCReturn(rc, rc); + pVCpu->hm.s.vmx.u32ExitCtls = fVal; + } + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_VMX_EXIT_CTLS); + } + return VINF_SUCCESS; +} + + +/** + * Sets the TPR threshold in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param u32TprThreshold The TPR threshold (task-priority class only). + */ +DECLINLINE(int) hmR0VmxApicSetTprThreshold(PVMCPU pVCpu, uint32_t u32TprThreshold) +{ + Assert(!(u32TprThreshold & ~VMX_TPR_THRESHOLD_MASK)); /* Bits 31:4 MBZ. */ + Assert(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_TPR_SHADOW); RT_NOREF_PV(pVCpu); + return VMXWriteVmcs32(VMX_VMCS32_CTRL_TPR_THRESHOLD, u32TprThreshold); +} + + +/** + * Exports the guest APIC TPR state into the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportGuestApicTpr(PVMCPU pVCpu) +{ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_APIC_TPR) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_APIC_TPR); + + if ( PDMHasApic(pVCpu->CTX_SUFF(pVM)) + && APICIsEnabled(pVCpu)) + { + /* + * Setup TPR shadowing. 
+ */ + if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_TPR_SHADOW) + { + Assert(pVCpu->hm.s.vmx.HCPhysVirtApic); + + bool fPendingIntr = false; + uint8_t u8Tpr = 0; + uint8_t u8PendingIntr = 0; + int rc = APICGetTpr(pVCpu, &u8Tpr, &fPendingIntr, &u8PendingIntr); + AssertRCReturn(rc, rc); + + /* + * If there are interrupts pending but masked by the TPR, instruct VT-x to + * cause a TPR-below-threshold VM-exit when the guest lowers its TPR below the + * priority of the pending interrupt so we can deliver the interrupt. If there + * are no interrupts pending, set threshold to 0 to not cause any + * TPR-below-threshold VM-exits. + */ + pVCpu->hm.s.vmx.pbVirtApic[XAPIC_OFF_TPR] = u8Tpr; + uint32_t u32TprThreshold = 0; + if (fPendingIntr) + { + /* Bits 3:0 of the TPR threshold field correspond to bits 7:4 of the TPR (which is the Task-Priority Class). */ + const uint8_t u8PendingPriority = u8PendingIntr >> 4; + const uint8_t u8TprPriority = u8Tpr >> 4; + if (u8PendingPriority <= u8TprPriority) + u32TprThreshold = u8PendingPriority; + } + + rc = hmR0VmxApicSetTprThreshold(pVCpu, u32TprThreshold); + AssertRCReturn(rc, rc); + } + } + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_APIC_TPR); + } + return VINF_SUCCESS; +} + + +/** + * Gets the guest's interruptibility-state ("interrupt shadow" as AMD calls it). + * + * @returns Guest's interruptibility-state. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static uint32_t hmR0VmxGetGuestIntrState(PVMCPU pVCpu) +{ + /* + * Check if we should inhibit interrupt delivery due to instructions like STI and MOV SS. + */ + uint32_t fIntrState = 0; + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS)) + { + /* If inhibition is active, RIP & RFLAGS should've been accessed + (i.e. read previously from the VMCS or from ring-3). 
*/ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; +#ifdef VBOX_STRICT + uint64_t const fExtrn = ASMAtomicUoReadU64(&pCtx->fExtrn); + AssertMsg(!(fExtrn & (CPUMCTX_EXTRN_RIP | CPUMCTX_EXTRN_RFLAGS)), ("%#x\n", fExtrn)); +#endif + if (pCtx->rip == EMGetInhibitInterruptsPC(pVCpu)) + { + if (pCtx->eflags.Bits.u1IF) + fIntrState = VMX_VMCS_GUEST_INT_STATE_BLOCK_STI; + else + fIntrState = VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS; + } + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS)) + { + /* + * We can clear the inhibit force flag as even if we go back to the recompiler + * without executing guest code in VT-x, the flag's condition to be cleared is + * met and thus the cleared state is correct. + */ + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS); + } + } + + /* + * NMIs to the guest are blocked after an NMI is injected until the guest executes an IRET. We only + * bother with virtual-NMI blocking when we have support for virtual NMIs in the CPU, otherwise + * setting this would block host-NMIs and IRET will not clear the blocking. + * + * See Intel spec. 26.6.1 "Interruptibility state". See @bugref{7445}. + */ + if ( VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS) + && (pVCpu->hm.s.vmx.u32PinCtls & VMX_PIN_CTLS_VIRT_NMI)) + { + fIntrState |= VMX_VMCS_GUEST_INT_STATE_BLOCK_NMI; + } + + return fIntrState; +} + + +/** + * Exports the exception intercepts required for guest execution in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportGuestXcptIntercepts(PVMCPU pVCpu) +{ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_VMX_GUEST_XCPT_INTERCEPTS) + { + uint32_t uXcptBitmap = pVCpu->hm.s.vmx.u32XcptBitmap; + + /* The remaining exception intercepts are handled elsewhere, e.g. in hmR0VmxExportGuestCR0(). 
*/ + if (pVCpu->hm.s.fGIMTrapXcptUD) + uXcptBitmap |= RT_BIT(X86_XCPT_UD); +#ifndef HMVMX_ALWAYS_TRAP_ALL_XCPTS + else + uXcptBitmap &= ~RT_BIT(X86_XCPT_UD); +#endif + + Assert(uXcptBitmap & RT_BIT_32(X86_XCPT_AC)); + Assert(uXcptBitmap & RT_BIT_32(X86_XCPT_DB)); + + if (uXcptBitmap != pVCpu->hm.s.vmx.u32XcptBitmap) + { + int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_EXCEPTION_BITMAP, uXcptBitmap); + AssertRCReturn(rc, rc); + pVCpu->hm.s.vmx.u32XcptBitmap = uXcptBitmap; + } + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_VMX_GUEST_XCPT_INTERCEPTS); + Log4Func(("VMX_VMCS32_CTRL_EXCEPTION_BITMAP=%#RX64\n", uXcptBitmap)); + } + return VINF_SUCCESS; +} + + +/** + * Exports the guest's RIP into the guest-state area in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportGuestRip(PVMCPU pVCpu) +{ + int rc = VINF_SUCCESS; + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_RIP) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_RIP); + + rc = VMXWriteVmcsGstN(VMX_VMCS_GUEST_RIP, pVCpu->cpum.GstCtx.rip); + AssertRCReturn(rc, rc); + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_RIP); + Log4Func(("RIP=%#RX64\n", pVCpu->cpum.GstCtx.rip)); + } + return rc; +} + + +/** + * Exports the guest's RSP into the guest-state area in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportGuestRsp(PVMCPU pVCpu) +{ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_RSP) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_RSP); + + int rc = VMXWriteVmcsGstN(VMX_VMCS_GUEST_RSP, pVCpu->cpum.GstCtx.rsp); + AssertRCReturn(rc, rc); + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_RSP); + } + return VINF_SUCCESS; +} + + +/** + * Exports the guest's RFLAGS into the guest-state area in the VMCS. 
+ * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportGuestRflags(PVMCPU pVCpu) +{ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_RFLAGS) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_RFLAGS); + + /* Intel spec. 2.3.1 "System Flags and Fields in IA-32e Mode" claims the upper 32-bits of RFLAGS are reserved (MBZ). + Let us assert it as such and use 32-bit VMWRITE. */ + Assert(!RT_HI_U32(pVCpu->cpum.GstCtx.rflags.u64)); + X86EFLAGS fEFlags = pVCpu->cpum.GstCtx.eflags; + Assert(fEFlags.u32 & X86_EFL_RA1_MASK); + Assert(!(fEFlags.u32 & ~(X86_EFL_1 | X86_EFL_LIVE_MASK))); + + /* + * If we're emulating real-mode using Virtual 8086 mode, save the real-mode eflags so + * we can restore them on VM-exit. Modify the real-mode guest's eflags so that VT-x + * can run the real-mode guest code under Virtual 8086 mode. + */ + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + { + Assert(pVCpu->CTX_SUFF(pVM)->hm.s.vmx.pRealModeTSS); + Assert(PDMVmmDevHeapIsEnabled(pVCpu->CTX_SUFF(pVM))); + pVCpu->hm.s.vmx.RealMode.Eflags.u32 = fEFlags.u32; /* Save the original eflags of the real-mode guest. */ + fEFlags.Bits.u1VM = 1; /* Set the Virtual 8086 mode bit. */ + fEFlags.Bits.u2IOPL = 0; /* Change IOPL to 0, otherwise certain instructions won't fault. */ + } + + int rc = VMXWriteVmcs32(VMX_VMCS_GUEST_RFLAGS, fEFlags.u32); + AssertRCReturn(rc, rc); + + /* + * Setup pending debug exceptions if the guest is single-stepping using EFLAGS.TF. + * + * We must avoid setting any automatic debug exceptions delivery when single-stepping + * through the hypervisor debugger using EFLAGS.TF. + */ + if ( !pVCpu->hm.s.fSingleInstruction + && fEFlags.Bits.u1TF) + { + /** @todo r=ramshankar: Warning! We ASSUME EFLAGS.TF will not cleared on + * premature trips to ring-3 esp since IEM does not yet handle it. 
*/ + rc = VMXWriteVmcs32(VMX_VMCS_GUEST_PENDING_DEBUG_XCPTS, VMX_VMCS_GUEST_PENDING_DEBUG_XCPT_BS); + AssertRCReturn(rc, rc); + } + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_RFLAGS); + Log4Func(("EFlags=%#RX32\n", fEFlags.u32)); + } + return VINF_SUCCESS; +} + + +/** + * Exports the guest CR0 control register into the guest-state area in the VMCS. + * + * The guest FPU state is always pre-loaded hence we don't need to bother about + * sharing FPU related CR0 bits between the guest and host. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportGuestCR0(PVMCPU pVCpu) +{ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_CR0) + { + PVM pVM = pVCpu->CTX_SUFF(pVM); + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR0); + Assert(!RT_HI_U32(pVCpu->cpum.GstCtx.cr0)); + + uint32_t const u32ShadowCr0 = pVCpu->cpum.GstCtx.cr0; + uint32_t u32GuestCr0 = pVCpu->cpum.GstCtx.cr0; + + /* + * Setup VT-x's view of the guest CR0. + * Minimize VM-exits due to CR3 changes when we have NestedPaging. + */ + uint32_t uProcCtls = pVCpu->hm.s.vmx.u32ProcCtls; + if (pVM->hm.s.fNestedPaging) + { + if (CPUMIsGuestPagingEnabled(pVCpu)) + { + /* The guest has paging enabled, let it access CR3 without causing a VM-exit if supported. */ + uProcCtls &= ~( VMX_PROC_CTLS_CR3_LOAD_EXIT + | VMX_PROC_CTLS_CR3_STORE_EXIT); + } + else + { + /* The guest doesn't have paging enabled, make CR3 access cause a VM-exit to update our shadow. */ + uProcCtls |= VMX_PROC_CTLS_CR3_LOAD_EXIT + | VMX_PROC_CTLS_CR3_STORE_EXIT; + } + + /* If we have unrestricted guest execution, we never have to intercept CR3 reads. */ + if (pVM->hm.s.vmx.fUnrestrictedGuest) + uProcCtls &= ~VMX_PROC_CTLS_CR3_STORE_EXIT; + } + else + { + /* Guest CPL 0 writes to its read-only pages should cause a #PF VM-exit. */ + u32GuestCr0 |= X86_CR0_WP; + } + + /* + * Guest FPU bits. 
+ * + * Since we pre-load the guest FPU always before VM-entry there is no need to track lazy state + * using CR0.TS. + * + * Intel spec. 23.8 "Restrictions on VMX operation" mentions that CR0.NE bit must always be + * set on the first CPUs to support VT-x and no mention of with regards to UX in VM-entry checks. + */ + u32GuestCr0 |= X86_CR0_NE; + + /* If CR0.NE isn't set, we need to intercept #MF exceptions and report them to the guest differently. */ + bool const fInterceptMF = !(u32ShadowCr0 & X86_CR0_NE); + + /* + * Update exception intercepts. + */ + uint32_t uXcptBitmap = pVCpu->hm.s.vmx.u32XcptBitmap; + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + { + Assert(PDMVmmDevHeapIsEnabled(pVM)); + Assert(pVM->hm.s.vmx.pRealModeTSS); + uXcptBitmap |= HMVMX_REAL_MODE_XCPT_MASK; + } + else + { + /* For now, cleared here as mode-switches can happen outside HM/VT-x. See @bugref{7626#c11}. */ + uXcptBitmap &= ~HMVMX_REAL_MODE_XCPT_MASK; + if (fInterceptMF) + uXcptBitmap |= RT_BIT(X86_XCPT_MF); + } + + /* Additional intercepts for debugging, define these yourself explicitly. */ +#ifdef HMVMX_ALWAYS_TRAP_ALL_XCPTS + uXcptBitmap |= 0 + | RT_BIT(X86_XCPT_BP) + | RT_BIT(X86_XCPT_DE) + | RT_BIT(X86_XCPT_NM) + | RT_BIT(X86_XCPT_TS) + | RT_BIT(X86_XCPT_UD) + | RT_BIT(X86_XCPT_NP) + | RT_BIT(X86_XCPT_SS) + | RT_BIT(X86_XCPT_GP) + | RT_BIT(X86_XCPT_PF) + | RT_BIT(X86_XCPT_MF) + ; +#elif defined(HMVMX_ALWAYS_TRAP_PF) + uXcptBitmap |= RT_BIT(X86_XCPT_PF); +#endif + if (pVCpu->hm.s.fTrapXcptGpForLovelyMesaDrv) + uXcptBitmap |= RT_BIT(X86_XCPT_GP); + Assert(pVM->hm.s.fNestedPaging || (uXcptBitmap & RT_BIT(X86_XCPT_PF))); + + /* + * Set/clear the CR0 specific bits along with their exceptions (PE, PG, CD, NW). 
+ */ + uint32_t fSetCr0 = (uint32_t)(pVM->hm.s.vmx.Msrs.u64Cr0Fixed0 & pVM->hm.s.vmx.Msrs.u64Cr0Fixed1); + uint32_t fZapCr0 = (uint32_t)(pVM->hm.s.vmx.Msrs.u64Cr0Fixed0 | pVM->hm.s.vmx.Msrs.u64Cr0Fixed1); + if (pVM->hm.s.vmx.fUnrestrictedGuest) /* Exceptions for unrestricted-guests for fixed CR0 bits (PE, PG). */ + fSetCr0 &= ~(X86_CR0_PE | X86_CR0_PG); + else + Assert((fSetCr0 & (X86_CR0_PE | X86_CR0_PG)) == (X86_CR0_PE | X86_CR0_PG)); + + u32GuestCr0 |= fSetCr0; + u32GuestCr0 &= fZapCr0; + u32GuestCr0 &= ~(X86_CR0_CD | X86_CR0_NW); /* Always enable caching. */ + + /* + * CR0 is shared between host and guest along with a CR0 read shadow. Therefore, certain bits must not be changed + * by the guest because VT-x ignores saving/restoring them (namely CD, ET, NW) and for certain other bits + * we want to be notified immediately of guest CR0 changes (e.g. PG to update our shadow page tables). + */ + uint32_t u32Cr0Mask = X86_CR0_PE + | X86_CR0_NE + | (pVM->hm.s.fNestedPaging ? 0 : X86_CR0_WP) + | X86_CR0_PG + | X86_CR0_ET /* Bit ignored on VM-entry and VM-exit. Don't let the guest modify the host CR0.ET */ + | X86_CR0_CD /* Bit ignored on VM-entry and VM-exit. Don't let the guest modify the host CR0.CD */ + | X86_CR0_NW; /* Bit ignored on VM-entry and VM-exit. Don't let the guest modify the host CR0.NW */ + + /** @todo Avoid intercepting CR0.PE with unrestricted guests. Fix PGM + * enmGuestMode to be in-sync with the current mode. See @bugref{6398} + * and @bugref{6944}. */ +#if 0 + if (pVM->hm.s.vmx.fUnrestrictedGuest) + u32Cr0Mask &= ~X86_CR0_PE; +#endif + /* + * Finally, update VMCS fields with the CR0 values and the exception bitmap. 
+ */ + int rc = VMXWriteVmcs32(VMX_VMCS_GUEST_CR0, u32GuestCr0); + rc |= VMXWriteVmcs32(VMX_VMCS_CTRL_CR0_READ_SHADOW, u32ShadowCr0); + if (u32Cr0Mask != pVCpu->hm.s.vmx.u32Cr0Mask) + rc |= VMXWriteVmcs32(VMX_VMCS_CTRL_CR0_MASK, u32Cr0Mask); + if (uProcCtls != pVCpu->hm.s.vmx.u32ProcCtls) + rc |= VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, uProcCtls); + if (uXcptBitmap != pVCpu->hm.s.vmx.u32XcptBitmap) + rc |= VMXWriteVmcs32(VMX_VMCS32_CTRL_EXCEPTION_BITMAP, uXcptBitmap); + AssertRCReturn(rc, rc); + + /* Update our caches. */ + pVCpu->hm.s.vmx.u32Cr0Mask = u32Cr0Mask; + pVCpu->hm.s.vmx.u32ProcCtls = uProcCtls; + pVCpu->hm.s.vmx.u32XcptBitmap = uXcptBitmap; + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_CR0); + + Log4Func(("u32Cr0Mask=%#RX32 u32ShadowCr0=%#RX32 u32GuestCr0=%#RX32 (fSetCr0=%#RX32 fZapCr0=%#RX32\n", u32Cr0Mask, + u32ShadowCr0, u32GuestCr0, fSetCr0, fZapCr0)); + } + + return VINF_SUCCESS; +} + + +/** + * Exports the guest control registers (CR3, CR4) into the guest-state area + * in the VMCS. + * + * @returns VBox strict status code. + * @retval VINF_EM_RESCHEDULE_REM if we try to emulate non-paged guest code + * without unrestricted guest access and the VMMDev is not presently + * mapped (e.g. EFI32). + * + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static VBOXSTRICTRC hmR0VmxExportGuestCR3AndCR4(PVMCPU pVCpu) +{ + int rc = VINF_SUCCESS; + PVM pVM = pVCpu->CTX_SUFF(pVM); + + /* + * Guest CR2. + * It's always loaded in the assembler code. Nothing to do here. + */ + + /* + * Guest CR3. + */ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_CR3) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR3); + + RTGCPHYS GCPhysGuestCR3 = NIL_RTGCPHYS; + if (pVM->hm.s.fNestedPaging) + { + pVCpu->hm.s.vmx.HCPhysEPTP = PGMGetHyperCR3(pVCpu); + + /* Validate. See Intel spec. 
28.2.2 "EPT Translation Mechanism" and 24.6.11 "Extended-Page-Table Pointer (EPTP)" */ + Assert(pVCpu->hm.s.vmx.HCPhysEPTP); + Assert(!(pVCpu->hm.s.vmx.HCPhysEPTP & UINT64_C(0xfff0000000000000))); + Assert(!(pVCpu->hm.s.vmx.HCPhysEPTP & 0xfff)); + + /* VMX_EPT_MEMTYPE_WB support is already checked in hmR0VmxSetupTaggedTlb(). */ + pVCpu->hm.s.vmx.HCPhysEPTP |= VMX_EPT_MEMTYPE_WB + | (VMX_EPT_PAGE_WALK_LENGTH_DEFAULT << VMX_EPT_PAGE_WALK_LENGTH_SHIFT); + + /* Validate. See Intel spec. 26.2.1 "Checks on VMX Controls" */ + AssertMsg( ((pVCpu->hm.s.vmx.HCPhysEPTP >> 3) & 0x07) == 3 /* Bits 3:5 (EPT page walk length - 1) must be 3. */ + && ((pVCpu->hm.s.vmx.HCPhysEPTP >> 7) & 0x1f) == 0, /* Bits 7:11 MBZ. */ + ("EPTP %#RX64\n", pVCpu->hm.s.vmx.HCPhysEPTP)); + AssertMsg( !((pVCpu->hm.s.vmx.HCPhysEPTP >> 6) & 0x01) /* Bit 6 (EPT accessed & dirty bit). */ + || (pVM->hm.s.vmx.Msrs.u64EptVpidCaps & MSR_IA32_VMX_EPT_VPID_CAP_EPT_ACCESS_DIRTY), + ("EPTP accessed/dirty bit not supported by CPU but set %#RX64\n", pVCpu->hm.s.vmx.HCPhysEPTP)); + + rc = VMXWriteVmcs64(VMX_VMCS64_CTRL_EPTP_FULL, pVCpu->hm.s.vmx.HCPhysEPTP); + AssertRCReturn(rc, rc); + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if ( pVM->hm.s.vmx.fUnrestrictedGuest + || CPUMIsGuestPagingEnabledEx(pCtx)) + { + /* If the guest is in PAE mode, pass the PDPEs to VT-x using the VMCS fields. 
*/ + if (CPUMIsGuestInPAEModeEx(pCtx)) + { + rc = PGMGstGetPaePdpes(pVCpu, &pVCpu->hm.s.aPdpes[0]); + AssertRCReturn(rc, rc); + rc = VMXWriteVmcs64(VMX_VMCS64_GUEST_PDPTE0_FULL, pVCpu->hm.s.aPdpes[0].u); + rc |= VMXWriteVmcs64(VMX_VMCS64_GUEST_PDPTE1_FULL, pVCpu->hm.s.aPdpes[1].u); + rc |= VMXWriteVmcs64(VMX_VMCS64_GUEST_PDPTE2_FULL, pVCpu->hm.s.aPdpes[2].u); + rc |= VMXWriteVmcs64(VMX_VMCS64_GUEST_PDPTE3_FULL, pVCpu->hm.s.aPdpes[3].u); + AssertRCReturn(rc, rc); + } + + /* + * The guest's view of its CR3 is unblemished with Nested Paging when the + * guest is using paging or we have unrestricted guest execution to handle + * the guest when it's not using paging. + */ + GCPhysGuestCR3 = pCtx->cr3; + } + else + { + /* + * The guest is not using paging, but the CPU (VT-x) has to. While the guest + * thinks it accesses physical memory directly, we use our identity-mapped + * page table to map guest-linear to guest-physical addresses. EPT takes care + * of translating it to host-physical addresses. + */ + RTGCPHYS GCPhys; + Assert(pVM->hm.s.vmx.pNonPagingModeEPTPageTable); + + /* We obtain it here every time as the guest could have relocated this PCI region. */ + rc = PDMVmmDevHeapR3ToGCPhys(pVM, pVM->hm.s.vmx.pNonPagingModeEPTPageTable, &GCPhys); + if (RT_SUCCESS(rc)) + { /* likely */ } + else if (rc == VERR_PDM_DEV_HEAP_R3_TO_GCPHYS) + { + Log4Func(("VERR_PDM_DEV_HEAP_R3_TO_GCPHYS -> VINF_EM_RESCHEDULE_REM\n")); + return VINF_EM_RESCHEDULE_REM; /* We cannot execute now, switch to REM/IEM till the guest maps in VMMDev. */ + } + else + AssertMsgFailedReturn(("%Rrc\n", rc), rc); + + GCPhysGuestCR3 = GCPhys; + } + + Log4Func(("u32GuestCr3=%#RGp (GstN)\n", GCPhysGuestCR3)); + rc = VMXWriteVmcsGstN(VMX_VMCS_GUEST_CR3, GCPhysGuestCR3); + AssertRCReturn(rc, rc); + } + else + { + /* Non-nested paging case, just use the hypervisor's CR3. 
*/ + RTHCPHYS HCPhysGuestCR3 = PGMGetHyperCR3(pVCpu); + + Log4Func(("u32GuestCr3=%#RHv (HstN)\n", HCPhysGuestCR3)); + rc = VMXWriteVmcsHstN(VMX_VMCS_GUEST_CR3, HCPhysGuestCR3); + AssertRCReturn(rc, rc); + } + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_CR3); + } + + /* + * Guest CR4. + * ASSUMES this is done everytime we get in from ring-3! (XCR0) + */ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_CR4) + { + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR4); + Assert(!RT_HI_U32(pCtx->cr4)); + + uint32_t u32GuestCr4 = pCtx->cr4; + uint32_t const u32ShadowCr4 = pCtx->cr4; + + /* + * Setup VT-x's view of the guest CR4. + * + * If we're emulating real-mode using virtual-8086 mode, we want to redirect software + * interrupts to the 8086 program interrupt handler. Clear the VME bit (the interrupt + * redirection bitmap is already all 0, see hmR3InitFinalizeR0()) + * + * See Intel spec. 20.2 "Software Interrupt Handling Methods While in Virtual-8086 Mode". + */ + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + { + Assert(pVM->hm.s.vmx.pRealModeTSS); + Assert(PDMVmmDevHeapIsEnabled(pVM)); + u32GuestCr4 &= ~X86_CR4_VME; + } + + if (pVM->hm.s.fNestedPaging) + { + if ( !CPUMIsGuestPagingEnabledEx(pCtx) + && !pVM->hm.s.vmx.fUnrestrictedGuest) + { + /* We use 4 MB pages in our identity mapping page table when the guest doesn't have paging. */ + u32GuestCr4 |= X86_CR4_PSE; + /* Our identity mapping is a 32-bit page directory. */ + u32GuestCr4 &= ~X86_CR4_PAE; + } + /* else use guest CR4.*/ + } + else + { + /* + * The shadow paging modes and guest paging modes are different, the shadow is in accordance with the host + * paging mode and thus we need to adjust VT-x's view of CR4 depending on our shadow page tables. + */ + switch (pVCpu->hm.s.enmShadowMode) + { + case PGMMODE_REAL: /* Real-mode. */ + case PGMMODE_PROTECTED: /* Protected mode without paging. */ + case PGMMODE_32_BIT: /* 32-bit paging. 
*/ + { + u32GuestCr4 &= ~X86_CR4_PAE; + break; + } + + case PGMMODE_PAE: /* PAE paging. */ + case PGMMODE_PAE_NX: /* PAE paging with NX. */ + { + u32GuestCr4 |= X86_CR4_PAE; + break; + } + + case PGMMODE_AMD64: /* 64-bit AMD paging (long mode). */ + case PGMMODE_AMD64_NX: /* 64-bit AMD paging (long mode) with NX enabled. */ +#ifdef VBOX_ENABLE_64_BITS_GUESTS + break; +#endif + default: + AssertFailed(); + return VERR_PGM_UNSUPPORTED_SHADOW_PAGING_MODE; + } + } + + /* We need to set and clear the CR4 specific bits here (mainly the X86_CR4_VMXE bit). */ + uint64_t const fSetCr4 = (pVM->hm.s.vmx.Msrs.u64Cr4Fixed0 & pVM->hm.s.vmx.Msrs.u64Cr4Fixed1); + uint64_t const fZapCr4 = (pVM->hm.s.vmx.Msrs.u64Cr4Fixed0 | pVM->hm.s.vmx.Msrs.u64Cr4Fixed1); + u32GuestCr4 |= fSetCr4; + u32GuestCr4 &= fZapCr4; + + /* Setup CR4 mask. CR4 flags owned by the host, if the guest attempts to change them, + that would cause a VM-exit. */ + uint32_t u32Cr4Mask = X86_CR4_VME + | X86_CR4_PAE + | X86_CR4_PGE + | X86_CR4_PSE + | X86_CR4_VMXE; + if (pVM->cpum.ro.HostFeatures.fXSaveRstor) + u32Cr4Mask |= X86_CR4_OSXSAVE; + if (pVM->cpum.ro.GuestFeatures.fPcid) + u32Cr4Mask |= X86_CR4_PCIDE; + + /* Write VT-x's view of the guest CR4, the CR4 modify mask and the read-only CR4 shadow + into the VMCS and update our cache. */ + rc = VMXWriteVmcs32(VMX_VMCS_GUEST_CR4, u32GuestCr4); + rc |= VMXWriteVmcs32(VMX_VMCS_CTRL_CR4_READ_SHADOW, u32ShadowCr4); + if (pVCpu->hm.s.vmx.u32Cr4Mask != u32Cr4Mask) + rc |= VMXWriteVmcs32(VMX_VMCS_CTRL_CR4_MASK, u32Cr4Mask); + AssertRCReturn(rc, rc); + pVCpu->hm.s.vmx.u32Cr4Mask = u32Cr4Mask; + + /* Whether to save/load/restore XCR0 during world switch depends on CR4.OSXSAVE and host+guest XCR0. 
*/ + pVCpu->hm.s.fLoadSaveGuestXcr0 = (pCtx->cr4 & X86_CR4_OSXSAVE) && pCtx->aXcr[0] != ASMGetXcr0(); + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_CR4); + + Log4Func(("u32GuestCr4=%#RX32 u32ShadowCr4=%#RX32 (fSetCr4=%#RX32 fZapCr4=%#RX32)\n", u32GuestCr4, u32ShadowCr4, fSetCr4, + fZapCr4)); + } + return rc; +} + + +/** + * Exports the guest debug registers into the guest-state area in the VMCS. + * The guest debug bits are partially shared with the host (e.g. DR6, DR0-3). + * + * This also sets up whether \#DB and MOV DRx accesses cause VM-exits. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportSharedDebugState(PVMCPU pVCpu) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + +#ifdef VBOX_STRICT + /* Validate. Intel spec. 26.3.1.1 "Checks on Guest Controls Registers, Debug Registers, MSRs" */ + if (pVCpu->hm.s.vmx.u32EntryCtls & VMX_ENTRY_CTLS_LOAD_DEBUG) + { + /* Validate. Intel spec. 17.2 "Debug Registers", recompiler paranoia checks. */ + Assert((pVCpu->cpum.GstCtx.dr[7] & (X86_DR7_MBZ_MASK | X86_DR7_RAZ_MASK)) == 0); + Assert((pVCpu->cpum.GstCtx.dr[7] & X86_DR7_RA1_MASK) == X86_DR7_RA1_MASK); + } +#endif + + bool fSteppingDB = false; + bool fInterceptMovDRx = false; + uint32_t uProcCtls = pVCpu->hm.s.vmx.u32ProcCtls; + if (pVCpu->hm.s.fSingleInstruction) + { + /* If the CPU supports the monitor trap flag, use it for single stepping in DBGF and avoid intercepting #DB. 
*/ + PVM pVM = pVCpu->CTX_SUFF(pVM); + if (pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed1 & VMX_PROC_CTLS_MONITOR_TRAP_FLAG) + { + uProcCtls |= VMX_PROC_CTLS_MONITOR_TRAP_FLAG; + Assert(fSteppingDB == false); + } + else + { + pVCpu->cpum.GstCtx.eflags.u32 |= X86_EFL_TF; + pVCpu->hm.s.fCtxChanged |= HM_CHANGED_GUEST_RFLAGS; + pVCpu->hm.s.fClearTrapFlag = true; + fSteppingDB = true; + } + } + + uint32_t u32GuestDr7; + if ( fSteppingDB + || (CPUMGetHyperDR7(pVCpu) & X86_DR7_ENABLED_MASK)) + { + /* + * Use the combined guest and host DRx values found in the hypervisor register set + * because the debugger has breakpoints active or someone is single stepping on the + * host side without a monitor trap flag. + * + * Note! DBGF expects a clean DR6 state before executing guest code. + */ +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + if ( CPUMIsGuestInLongModeEx(&pVCpu->cpum.GstCtx) + && !CPUMIsHyperDebugStateActivePending(pVCpu)) + { + CPUMR0LoadHyperDebugState(pVCpu, true /* include DR6 */); + Assert(CPUMIsHyperDebugStateActivePending(pVCpu)); + Assert(!CPUMIsGuestDebugStateActivePending(pVCpu)); + } + else +#endif + if (!CPUMIsHyperDebugStateActive(pVCpu)) + { + CPUMR0LoadHyperDebugState(pVCpu, true /* include DR6 */); + Assert(CPUMIsHyperDebugStateActive(pVCpu)); + Assert(!CPUMIsGuestDebugStateActive(pVCpu)); + } + + /* Update DR7 with the hypervisor value (other DRx registers are handled by CPUM one way or another). */ + u32GuestDr7 = (uint32_t)CPUMGetHyperDR7(pVCpu); + pVCpu->hm.s.fUsingHyperDR7 = true; + fInterceptMovDRx = true; + } + else + { + /* + * If the guest has enabled debug registers, we need to load them prior to + * executing guest code so they'll trigger at the right time. 
+ */ + if (pVCpu->cpum.GstCtx.dr[7] & (X86_DR7_ENABLED_MASK | X86_DR7_GD)) + { +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + if ( CPUMIsGuestInLongModeEx(&pVCpu->cpum.GstCtx) + && !CPUMIsGuestDebugStateActivePending(pVCpu)) + { + CPUMR0LoadGuestDebugState(pVCpu, true /* include DR6 */); + Assert(CPUMIsGuestDebugStateActivePending(pVCpu)); + Assert(!CPUMIsHyperDebugStateActivePending(pVCpu)); + STAM_COUNTER_INC(&pVCpu->hm.s.StatDRxArmed); + } + else +#endif + if (!CPUMIsGuestDebugStateActive(pVCpu)) + { + CPUMR0LoadGuestDebugState(pVCpu, true /* include DR6 */); + Assert(CPUMIsGuestDebugStateActive(pVCpu)); + Assert(!CPUMIsHyperDebugStateActive(pVCpu)); + STAM_COUNTER_INC(&pVCpu->hm.s.StatDRxArmed); + } + Assert(!fInterceptMovDRx); + } + /* + * If no debugging enabled, we'll lazy load DR0-3. Unlike on AMD-V, we + * must intercept #DB in order to maintain a correct DR6 guest value, and + * because we need to intercept it to prevent nested #DBs from hanging the + * CPU, we end up always having to intercept it. See hmR0VmxInitXcptBitmap. + */ +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + else if ( !CPUMIsGuestDebugStateActivePending(pVCpu) + && !CPUMIsGuestDebugStateActive(pVCpu)) +#else + else if (!CPUMIsGuestDebugStateActive(pVCpu)) +#endif + { + fInterceptMovDRx = true; + } + + /* Update DR7 with the actual guest value. */ + u32GuestDr7 = pVCpu->cpum.GstCtx.dr[7]; + pVCpu->hm.s.fUsingHyperDR7 = false; + } + + if (fInterceptMovDRx) + uProcCtls |= VMX_PROC_CTLS_MOV_DR_EXIT; + else + uProcCtls &= ~VMX_PROC_CTLS_MOV_DR_EXIT; + + /* + * Update the processor-based VM-execution controls with the MOV-DRx intercepts and the + * monitor-trap flag and update our cache. + */ + if (uProcCtls != pVCpu->hm.s.vmx.u32ProcCtls) + { + int rc2 = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, uProcCtls); + AssertRCReturn(rc2, rc2); + pVCpu->hm.s.vmx.u32ProcCtls = uProcCtls; + } + + /* + * Update guest DR7. 
+ */ + int rc = VMXWriteVmcs32(VMX_VMCS_GUEST_DR7, u32GuestDr7); + AssertRCReturn(rc, rc); + + /* + * If we have forced EFLAGS.TF to be set because we're single-stepping in the hypervisor debugger, + * we need to clear interrupt inhibition if any as otherwise it causes a VM-entry failure. + * + * See Intel spec. 26.3.1.5 "Checks on Guest Non-Register State". + */ + if (fSteppingDB) + { + Assert(pVCpu->hm.s.fSingleInstruction); + Assert(pVCpu->cpum.GstCtx.eflags.Bits.u1TF); + + uint32_t fIntrState = 0; + rc = VMXReadVmcs32(VMX_VMCS32_GUEST_INT_STATE, &fIntrState); + AssertRCReturn(rc, rc); + + if (fIntrState & (VMX_VMCS_GUEST_INT_STATE_BLOCK_STI | VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS)) + { + fIntrState &= ~(VMX_VMCS_GUEST_INT_STATE_BLOCK_STI | VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS); + rc = VMXWriteVmcs32(VMX_VMCS32_GUEST_INT_STATE, fIntrState); + AssertRCReturn(rc, rc); + } + } + + return VINF_SUCCESS; +} + + +#ifdef VBOX_STRICT +/** + * Strict function to validate segment registers. + * + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Will import guest CR0 on strict builds during validation of + * segments. + */ +static void hmR0VmxValidateSegmentRegs(PVMCPU pVCpu) +{ + /* + * Validate segment registers. See Intel spec. 26.3.1.2 "Checks on Guest Segment Registers". + * + * The reason we check for attribute value 0 in this function and not just the unusable bit is + * because hmR0VmxExportGuestSegmentReg() only updates the VMCS' copy of the value with the unusable bit + * and doesn't change the guest-context value. 
+ */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + hmR0VmxImportGuestState(pVCpu, CPUMCTX_EXTRN_CR0); + if ( !pVM->hm.s.vmx.fUnrestrictedGuest + && ( !CPUMIsGuestInRealModeEx(pCtx) + && !CPUMIsGuestInV86ModeEx(pCtx))) + { + /* Protected mode checks */ + /* CS */ + Assert(pCtx->cs.Attr.n.u1Present); + Assert(!(pCtx->cs.Attr.u & 0xf00)); + Assert(!(pCtx->cs.Attr.u & 0xfffe0000)); + Assert( (pCtx->cs.u32Limit & 0xfff) == 0xfff + || !(pCtx->cs.Attr.n.u1Granularity)); + Assert( !(pCtx->cs.u32Limit & 0xfff00000) + || (pCtx->cs.Attr.n.u1Granularity)); + /* CS cannot be loaded with NULL in protected mode. */ + Assert(pCtx->cs.Attr.u && !(pCtx->cs.Attr.u & X86DESCATTR_UNUSABLE)); /** @todo is this really true even for 64-bit CS? */ + if (pCtx->cs.Attr.n.u4Type == 9 || pCtx->cs.Attr.n.u4Type == 11) + Assert(pCtx->cs.Attr.n.u2Dpl == pCtx->ss.Attr.n.u2Dpl); + else if (pCtx->cs.Attr.n.u4Type == 13 || pCtx->cs.Attr.n.u4Type == 15) + Assert(pCtx->cs.Attr.n.u2Dpl <= pCtx->ss.Attr.n.u2Dpl); + else + AssertMsgFailed(("Invalid CS Type %#x\n", pCtx->cs.Attr.n.u4Type)); + /* SS */ + Assert((pCtx->ss.Sel & X86_SEL_RPL) == (pCtx->cs.Sel & X86_SEL_RPL)); + Assert(pCtx->ss.Attr.n.u2Dpl == (pCtx->ss.Sel & X86_SEL_RPL)); + if ( !(pCtx->cr0 & X86_CR0_PE) + || pCtx->cs.Attr.n.u4Type == 3) + { + Assert(!pCtx->ss.Attr.n.u2Dpl); + } + if (pCtx->ss.Attr.u && !(pCtx->ss.Attr.u & X86DESCATTR_UNUSABLE)) + { + Assert((pCtx->ss.Sel & X86_SEL_RPL) == (pCtx->cs.Sel & X86_SEL_RPL)); + Assert(pCtx->ss.Attr.n.u4Type == 3 || pCtx->ss.Attr.n.u4Type == 7); + Assert(pCtx->ss.Attr.n.u1Present); + Assert(!(pCtx->ss.Attr.u & 0xf00)); + Assert(!(pCtx->ss.Attr.u & 0xfffe0000)); + Assert( (pCtx->ss.u32Limit & 0xfff) == 0xfff + || !(pCtx->ss.Attr.n.u1Granularity)); + Assert( !(pCtx->ss.u32Limit & 0xfff00000) + || (pCtx->ss.Attr.n.u1Granularity)); + } + /* DS, ES, FS, GS - only check for usable selectors, see hmR0VmxExportGuestSegmentReg().
*/ + if (pCtx->ds.Attr.u && !(pCtx->ds.Attr.u & X86DESCATTR_UNUSABLE)) + { + Assert(pCtx->ds.Attr.n.u4Type & X86_SEL_TYPE_ACCESSED); + Assert(pCtx->ds.Attr.n.u1Present); + Assert(pCtx->ds.Attr.n.u4Type > 11 || pCtx->ds.Attr.n.u2Dpl >= (pCtx->ds.Sel & X86_SEL_RPL)); + Assert(!(pCtx->ds.Attr.u & 0xf00)); + Assert(!(pCtx->ds.Attr.u & 0xfffe0000)); + Assert( (pCtx->ds.u32Limit & 0xfff) == 0xfff + || !(pCtx->ds.Attr.n.u1Granularity)); + Assert( !(pCtx->ds.u32Limit & 0xfff00000) + || (pCtx->ds.Attr.n.u1Granularity)); + Assert( !(pCtx->ds.Attr.n.u4Type & X86_SEL_TYPE_CODE) + || (pCtx->ds.Attr.n.u4Type & X86_SEL_TYPE_READ)); + } + if (pCtx->es.Attr.u && !(pCtx->es.Attr.u & X86DESCATTR_UNUSABLE)) + { + Assert(pCtx->es.Attr.n.u4Type & X86_SEL_TYPE_ACCESSED); + Assert(pCtx->es.Attr.n.u1Present); + Assert(pCtx->es.Attr.n.u4Type > 11 || pCtx->es.Attr.n.u2Dpl >= (pCtx->es.Sel & X86_SEL_RPL)); + Assert(!(pCtx->es.Attr.u & 0xf00)); + Assert(!(pCtx->es.Attr.u & 0xfffe0000)); + Assert( (pCtx->es.u32Limit & 0xfff) == 0xfff + || !(pCtx->es.Attr.n.u1Granularity)); + Assert( !(pCtx->es.u32Limit & 0xfff00000) + || (pCtx->es.Attr.n.u1Granularity)); + Assert( !(pCtx->es.Attr.n.u4Type & X86_SEL_TYPE_CODE) + || (pCtx->es.Attr.n.u4Type & X86_SEL_TYPE_READ)); + } + if (pCtx->fs.Attr.u && !(pCtx->fs.Attr.u & X86DESCATTR_UNUSABLE)) + { + Assert(pCtx->fs.Attr.n.u4Type & X86_SEL_TYPE_ACCESSED); + Assert(pCtx->fs.Attr.n.u1Present); + Assert(pCtx->fs.Attr.n.u4Type > 11 || pCtx->fs.Attr.n.u2Dpl >= (pCtx->fs.Sel & X86_SEL_RPL)); + Assert(!(pCtx->fs.Attr.u & 0xf00)); + Assert(!(pCtx->fs.Attr.u & 0xfffe0000)); + Assert( (pCtx->fs.u32Limit & 0xfff) == 0xfff + || !(pCtx->fs.Attr.n.u1Granularity)); + Assert( !(pCtx->fs.u32Limit & 0xfff00000) + || (pCtx->fs.Attr.n.u1Granularity)); + Assert( !(pCtx->fs.Attr.n.u4Type & X86_SEL_TYPE_CODE) + || (pCtx->fs.Attr.n.u4Type & X86_SEL_TYPE_READ)); + } + if (pCtx->gs.Attr.u && !(pCtx->gs.Attr.u & X86DESCATTR_UNUSABLE)) + { + Assert(pCtx->gs.Attr.n.u4Type & 
X86_SEL_TYPE_ACCESSED); + Assert(pCtx->gs.Attr.n.u1Present); + Assert(pCtx->gs.Attr.n.u4Type > 11 || pCtx->gs.Attr.n.u2Dpl >= (pCtx->gs.Sel & X86_SEL_RPL)); + Assert(!(pCtx->gs.Attr.u & 0xf00)); + Assert(!(pCtx->gs.Attr.u & 0xfffe0000)); + Assert( (pCtx->gs.u32Limit & 0xfff) == 0xfff + || !(pCtx->gs.Attr.n.u1Granularity)); + Assert( !(pCtx->gs.u32Limit & 0xfff00000) + || (pCtx->gs.Attr.n.u1Granularity)); + Assert( !(pCtx->gs.Attr.n.u4Type & X86_SEL_TYPE_CODE) + || (pCtx->gs.Attr.n.u4Type & X86_SEL_TYPE_READ)); + } + /* 64-bit capable CPUs. */ +# if HC_ARCH_BITS == 64 + Assert(!RT_HI_U32(pCtx->cs.u64Base)); + Assert(!pCtx->ss.Attr.u || !RT_HI_U32(pCtx->ss.u64Base)); + Assert(!pCtx->ds.Attr.u || !RT_HI_U32(pCtx->ds.u64Base)); + Assert(!pCtx->es.Attr.u || !RT_HI_U32(pCtx->es.u64Base)); +# endif + } + else if ( CPUMIsGuestInV86ModeEx(pCtx) + || ( CPUMIsGuestInRealModeEx(pCtx) + && !pVM->hm.s.vmx.fUnrestrictedGuest)) + { + /* Real and v86 mode checks. */ + /* hmR0VmxExportGuestSegmentReg() writes the modified in VMCS. We want what we're feeding to VT-x. 
*/ + uint32_t u32CSAttr, u32SSAttr, u32DSAttr, u32ESAttr, u32FSAttr, u32GSAttr; + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + { + u32CSAttr = 0xf3; u32SSAttr = 0xf3; u32DSAttr = 0xf3; u32ESAttr = 0xf3; u32FSAttr = 0xf3; u32GSAttr = 0xf3; + } + else + { + u32CSAttr = pCtx->cs.Attr.u; u32SSAttr = pCtx->ss.Attr.u; u32DSAttr = pCtx->ds.Attr.u; + u32ESAttr = pCtx->es.Attr.u; u32FSAttr = pCtx->fs.Attr.u; u32GSAttr = pCtx->gs.Attr.u; + } + + /* CS */ + AssertMsg((pCtx->cs.u64Base == (uint64_t)pCtx->cs.Sel << 4), ("CS base %#RX64 %#x\n", pCtx->cs.u64Base, pCtx->cs.Sel)); + Assert(pCtx->cs.u32Limit == 0xffff); + Assert(u32CSAttr == 0xf3); + /* SS */ + Assert(pCtx->ss.u64Base == (uint64_t)pCtx->ss.Sel << 4); + Assert(pCtx->ss.u32Limit == 0xffff); + Assert(u32SSAttr == 0xf3); + /* DS */ + Assert(pCtx->ds.u64Base == (uint64_t)pCtx->ds.Sel << 4); + Assert(pCtx->ds.u32Limit == 0xffff); + Assert(u32DSAttr == 0xf3); + /* ES */ + Assert(pCtx->es.u64Base == (uint64_t)pCtx->es.Sel << 4); + Assert(pCtx->es.u32Limit == 0xffff); + Assert(u32ESAttr == 0xf3); + /* FS */ + Assert(pCtx->fs.u64Base == (uint64_t)pCtx->fs.Sel << 4); + Assert(pCtx->fs.u32Limit == 0xffff); + Assert(u32FSAttr == 0xf3); + /* GS */ + Assert(pCtx->gs.u64Base == (uint64_t)pCtx->gs.Sel << 4); + Assert(pCtx->gs.u32Limit == 0xffff); + Assert(u32GSAttr == 0xf3); + /* 64-bit capable CPUs. */ +# if HC_ARCH_BITS == 64 + Assert(!RT_HI_U32(pCtx->cs.u64Base)); + Assert(!u32SSAttr || !RT_HI_U32(pCtx->ss.u64Base)); + Assert(!u32DSAttr || !RT_HI_U32(pCtx->ds.u64Base)); + Assert(!u32ESAttr || !RT_HI_U32(pCtx->es.u64Base)); +# endif + } +} +#endif /* VBOX_STRICT */ + + +/** + * Exports a guest segment register into the guest-state area in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param idxSel Index of the selector in the VMCS. + * @param idxLimit Index of the segment limit in the VMCS. + * @param idxBase Index of the segment base in the VMCS.
+ * @param idxAccess Index of the access rights of the segment in the VMCS. + * @param pSelReg Pointer to the segment selector. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportGuestSegmentReg(PVMCPU pVCpu, uint32_t idxSel, uint32_t idxLimit, uint32_t idxBase, uint32_t idxAccess, + PCCPUMSELREG pSelReg) +{ + int rc = VMXWriteVmcs32(idxSel, pSelReg->Sel); /* 16-bit guest selector field. */ + rc |= VMXWriteVmcs32(idxLimit, pSelReg->u32Limit); /* 32-bit guest segment limit field. */ + rc |= VMXWriteVmcsGstN(idxBase, pSelReg->u64Base); /* Natural width guest segment base field.*/ + AssertRCReturn(rc, rc); + + uint32_t u32Access = pSelReg->Attr.u; + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + { + /* VT-x requires our real-using-v86 mode hack to override the segment access-right bits. */ + u32Access = 0xf3; + Assert(pVCpu->CTX_SUFF(pVM)->hm.s.vmx.pRealModeTSS); + Assert(PDMVmmDevHeapIsEnabled(pVCpu->CTX_SUFF(pVM))); + } + else + { + /* + * The way to differentiate between whether this is really a null selector or was just + * a selector loaded with 0 in real-mode is using the segment attributes. A selector + * loaded in real-mode with the value 0 is valid and usable in protected-mode and we + * should -not- mark it as an unusable segment. Both the recompiler & VT-x ensures + * NULL selectors loaded in protected-mode have their attribute as 0. + */ + if (!u32Access) + u32Access = X86DESCATTR_UNUSABLE; + } + + /* Validate segment access rights. Refer to Intel spec. "26.3.1.2 Checks on Guest Segment Registers". */ + AssertMsg((u32Access & X86DESCATTR_UNUSABLE) || (u32Access & X86_SEL_TYPE_ACCESSED), + ("Access bit not set for usable segment. idx=%#x sel=%#x attr %#x\n", idxBase, pSelReg->Sel, pSelReg->Attr.u)); + + rc = VMXWriteVmcs32(idxAccess, u32Access); /* 32-bit guest segment access-rights field.
*/ + AssertRCReturn(rc, rc); + return rc; +} + + +/** + * Exports the guest segment registers, GDTR, IDTR, LDTR, (TR, FS and GS bases) + * into the guest-state area in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Will import guest CR0 on strict builds during validation of + * segments. + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportGuestSegmentRegs(PVMCPU pVCpu) +{ + int rc = VERR_INTERNAL_ERROR_5; + PVM pVM = pVCpu->CTX_SUFF(pVM); + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + + /* + * Guest Segment registers: CS, SS, DS, ES, FS, GS. + */ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_SREG_MASK) + { +#ifdef VBOX_WITH_REM + if (!pVM->hm.s.vmx.fUnrestrictedGuest) + { + Assert(pVM->hm.s.vmx.pRealModeTSS); + AssertCompile(PGMMODE_REAL < PGMMODE_PROTECTED); + if ( pVCpu->hm.s.vmx.fWasInRealMode + && PGMGetGuestMode(pVCpu) >= PGMMODE_PROTECTED) + { + /* Signal that the recompiler must flush its code-cache as the guest -may- rewrite code it will later execute + in real-mode (e.g. 
OpenBSD 4.0) */ + REMFlushTBs(pVM); + Log4Func(("Switch to protected mode detected!\n")); + pVCpu->hm.s.vmx.fWasInRealMode = false; + } + } +#endif + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_CS) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CS); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + pVCpu->hm.s.vmx.RealMode.AttrCS.u = pCtx->cs.Attr.u; + rc = HMVMX_EXPORT_SREG(CS, &pCtx->cs); + AssertRCReturn(rc, rc); + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_CS); + } + + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_SS) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_SS); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + pVCpu->hm.s.vmx.RealMode.AttrSS.u = pCtx->ss.Attr.u; + rc = HMVMX_EXPORT_SREG(SS, &pCtx->ss); + AssertRCReturn(rc, rc); + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_SS); + } + + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_DS) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_DS); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + pVCpu->hm.s.vmx.RealMode.AttrDS.u = pCtx->ds.Attr.u; + rc = HMVMX_EXPORT_SREG(DS, &pCtx->ds); + AssertRCReturn(rc, rc); + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_DS); + } + + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_ES) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_ES); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + pVCpu->hm.s.vmx.RealMode.AttrES.u = pCtx->es.Attr.u; + rc = HMVMX_EXPORT_SREG(ES, &pCtx->es); + AssertRCReturn(rc, rc); + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_ES); + } + + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_FS) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_FS); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + pVCpu->hm.s.vmx.RealMode.AttrFS.u = pCtx->fs.Attr.u; + rc = HMVMX_EXPORT_SREG(FS, &pCtx->fs); + AssertRCReturn(rc, rc); + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, 
~HM_CHANGED_GUEST_FS); + } + + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_GS) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_GS); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + pVCpu->hm.s.vmx.RealMode.AttrGS.u = pCtx->gs.Attr.u; + rc = HMVMX_EXPORT_SREG(GS, &pCtx->gs); + AssertRCReturn(rc, rc); + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_GS); + } + +#ifdef VBOX_STRICT + hmR0VmxValidateSegmentRegs(pVCpu); +#endif + + Log4Func(("CS=%#RX16 Base=%#RX64 Limit=%#RX32 Attr=%#RX32\n", pCtx->cs.Sel, pCtx->cs.u64Base, + pCtx->cs.u32Limit, pCtx->cs.Attr.u)); + } + + /* + * Guest TR. + */ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_TR) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_TR); + + /* + * Real-mode emulation using virtual-8086 mode with CR4.VME. Interrupt redirection is + * achieved using the interrupt redirection bitmap (all bits cleared to let the guest + * handle INT-n's) in the TSS. See hmR3InitFinalizeR0() to see how pRealModeTSS is setup. + */ + uint16_t u16Sel = 0; + uint32_t u32Limit = 0; + uint64_t u64Base = 0; + uint32_t u32AccessRights = 0; + + if (!pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + { + u16Sel = pCtx->tr.Sel; + u32Limit = pCtx->tr.u32Limit; + u64Base = pCtx->tr.u64Base; + u32AccessRights = pCtx->tr.Attr.u; + } + else + { + Assert(pVM->hm.s.vmx.pRealModeTSS); + Assert(PDMVmmDevHeapIsEnabled(pVM)); /* Guaranteed by HMCanExecuteGuest() -XXX- what about inner loop changes? */ + + /* We obtain it here every time as PCI regions could be reconfigured in the guest, changing the VMMDev base. */ + RTGCPHYS GCPhys; + rc = PDMVmmDevHeapR3ToGCPhys(pVM, pVM->hm.s.vmx.pRealModeTSS, &GCPhys); + AssertRCReturn(rc, rc); + + X86DESCATTR DescAttr; + DescAttr.u = 0; + DescAttr.n.u1Present = 1; + DescAttr.n.u4Type = X86_SEL_TYPE_SYS_386_TSS_BUSY; + + u16Sel = 0; + u32Limit = HM_VTX_TSS_SIZE; + u64Base = GCPhys; /* in real-mode phys = virt. 
*/ + u32AccessRights = DescAttr.u; + } + + /* Validate the values being written to the VMCS (these differ from pCtx->tr when real-on-v86 mode is active). */ + Assert(!(u16Sel & RT_BIT(2))); + AssertMsg( (u32AccessRights & 0xf) == X86_SEL_TYPE_SYS_386_TSS_BUSY + || (u32AccessRights & 0xf) == X86_SEL_TYPE_SYS_286_TSS_BUSY, ("TSS is not busy!? %#x\n", u32AccessRights)); + AssertMsg(!(u32AccessRights & X86DESCATTR_UNUSABLE), ("TR unusable bit is not clear!? %#x\n", u32AccessRights)); + Assert(!(u32AccessRights & RT_BIT(4))); /* System MBZ.*/ + Assert(u32AccessRights & RT_BIT(7)); /* Present MB1.*/ + Assert(!(u32AccessRights & 0xf00)); /* 11:8 MBZ. */ + Assert(!(u32AccessRights & 0xfffe0000)); /* 31:17 MBZ. */ + Assert( (u32Limit & 0xfff) == 0xfff + || !(u32AccessRights & RT_BIT(15))); /* Granularity MBZ. */ + Assert( !(u32Limit & 0xfff00000) + || (u32AccessRights & RT_BIT(15))); /* Granularity MB1. */ + + rc = VMXWriteVmcs32(VMX_VMCS16_GUEST_TR_SEL, u16Sel); + rc |= VMXWriteVmcs32(VMX_VMCS32_GUEST_TR_LIMIT, u32Limit); + rc |= VMXWriteVmcs32(VMX_VMCS32_GUEST_TR_ACCESS_RIGHTS, u32AccessRights); + rc |= VMXWriteVmcsGstN(VMX_VMCS_GUEST_TR_BASE, u64Base); + AssertRCReturn(rc, rc); + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_TR); + Log4Func(("TR base=%#RX64\n", pCtx->tr.u64Base)); + } + + /* + * Guest GDTR. + */ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_GDTR) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_GDTR); + + rc = VMXWriteVmcs32(VMX_VMCS32_GUEST_GDTR_LIMIT, pCtx->gdtr.cbGdt); + rc |= VMXWriteVmcsGstN(VMX_VMCS_GUEST_GDTR_BASE, pCtx->gdtr.pGdt); + AssertRCReturn(rc, rc); + + /* Validate. */ + Assert(!(pCtx->gdtr.cbGdt & 0xffff0000)); /* Bits 31:16 MBZ. */ + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_GDTR); + Log4Func(("GDTR base=%#RX64\n", pCtx->gdtr.pGdt)); + } + + /* + * Guest LDTR.
+ */ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_LDTR) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_LDTR); + + /* The unusable bit is specific to VT-x, if it's a null selector mark it as an unusable segment. */ + uint32_t u32Access = 0; + if (!pCtx->ldtr.Attr.u) + u32Access = X86DESCATTR_UNUSABLE; + else + u32Access = pCtx->ldtr.Attr.u; + + rc = VMXWriteVmcs32(VMX_VMCS16_GUEST_LDTR_SEL, pCtx->ldtr.Sel); + rc |= VMXWriteVmcs32(VMX_VMCS32_GUEST_LDTR_LIMIT, pCtx->ldtr.u32Limit); + rc |= VMXWriteVmcs32(VMX_VMCS32_GUEST_LDTR_ACCESS_RIGHTS, u32Access); + rc |= VMXWriteVmcsGstN(VMX_VMCS_GUEST_LDTR_BASE, pCtx->ldtr.u64Base); + AssertRCReturn(rc, rc); + + /* Validate. */ + if (!(u32Access & X86DESCATTR_UNUSABLE)) + { + Assert(!(pCtx->ldtr.Sel & RT_BIT(2))); /* TI MBZ. */ + Assert(pCtx->ldtr.Attr.n.u4Type == 2); /* Type MB2 (LDT). */ + Assert(!pCtx->ldtr.Attr.n.u1DescType); /* System MBZ. */ + Assert(pCtx->ldtr.Attr.n.u1Present == 1); /* Present MB1. */ + Assert(!pCtx->ldtr.Attr.n.u4LimitHigh); /* 11:8 MBZ. */ + Assert(!(pCtx->ldtr.Attr.u & 0xfffe0000)); /* 31:17 MBZ. */ + Assert( (pCtx->ldtr.u32Limit & 0xfff) == 0xfff + || !pCtx->ldtr.Attr.n.u1Granularity); /* Granularity MBZ. */ + Assert( !(pCtx->ldtr.u32Limit & 0xfff00000) + || pCtx->ldtr.Attr.n.u1Granularity); /* Granularity MB1. */ + } + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_LDTR); + Log4Func(("LDTR base=%#RX64\n", pCtx->ldtr.u64Base)); + } + + /* + * Guest IDTR. + */ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_IDTR) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_IDTR); + + rc = VMXWriteVmcs32(VMX_VMCS32_GUEST_IDTR_LIMIT, pCtx->idtr.cbIdt); + rc |= VMXWriteVmcsGstN(VMX_VMCS_GUEST_IDTR_BASE, pCtx->idtr.pIdt); + AssertRCReturn(rc, rc); + + /* Validate. */ + Assert(!(pCtx->idtr.cbIdt & 0xffff0000)); /* Bits 31:16 MBZ. 
*/ + + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_IDTR); + Log4Func(("IDTR base=%#RX64\n", pCtx->idtr.pIdt)); + } + + return VINF_SUCCESS; +} + + +/** + * Exports certain guest MSRs into the VM-entry MSR-load and VM-exit MSR-store + * areas. + * + * These MSRs will automatically be loaded to the host CPU on every successful + * VM-entry and stored from the host CPU on every successful VM-exit. This also + * creates/updates MSR slots for the host MSRs. The actual host MSR values are + * -not- updated here for performance reasons. See hmR0VmxExportHostMsrs(). + * + * Also exports the guest sysenter MSRs into the guest-state area in the VMCS. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportGuestMsrs(PVMCPU pVCpu) +{ + AssertPtr(pVCpu); + AssertPtr(pVCpu->hm.s.vmx.pvGuestMsr); + + /* + * MSRs that we use the auto-load/store MSR area in the VMCS. + * For 64-bit hosts, we load/restore them lazily, see hmR0VmxLazyLoadGuestMsrs(). 
+ */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_VMX_GUEST_AUTO_MSRS) + { + if (pVM->hm.s.fAllow64BitGuests) + { +#if HC_ARCH_BITS == 32 + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_SYSCALL_MSRS | CPUMCTX_EXTRN_KERNEL_GS_BASE); + + int rc = hmR0VmxAddAutoLoadStoreMsr(pVCpu, MSR_K8_LSTAR, pCtx->msrLSTAR, false, NULL); + rc |= hmR0VmxAddAutoLoadStoreMsr(pVCpu, MSR_K6_STAR, pCtx->msrSTAR, false, NULL); + rc |= hmR0VmxAddAutoLoadStoreMsr(pVCpu, MSR_K8_SF_MASK, pCtx->msrSFMASK, false, NULL); + rc |= hmR0VmxAddAutoLoadStoreMsr(pVCpu, MSR_K8_KERNEL_GS_BASE, pCtx->msrKERNELGSBASE, false, NULL); + AssertRCReturn(rc, rc); +# ifdef LOG_ENABLED + PCVMXAUTOMSR pMsr = (PCVMXAUTOMSR)pVCpu->hm.s.vmx.pvGuestMsr; + for (uint32_t i = 0; i < pVCpu->hm.s.vmx.cMsrs; i++, pMsr++) + Log4Func(("MSR[%RU32]: u32Msr=%#RX32 u64Value=%#RX64\n", i, pMsr->u32Msr, pMsr->u64Value)); +# endif +#endif + } + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_VMX_GUEST_AUTO_MSRS); + } + + /* + * Guest Sysenter MSRs. + * These flags are only set when MSR-bitmaps are not supported by the CPU and we cause + * VM-exits on WRMSRs for these MSRs. 
+ */ + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_SYSENTER_MSR_MASK) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_SYSENTER_MSRS); + + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_SYSENTER_CS_MSR) + { + int rc = VMXWriteVmcs32(VMX_VMCS32_GUEST_SYSENTER_CS, pCtx->SysEnter.cs); + AssertRCReturn(rc, rc); + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_SYSENTER_CS_MSR); + } + + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_SYSENTER_EIP_MSR) + { + int rc = VMXWriteVmcsGstN(VMX_VMCS_GUEST_SYSENTER_EIP, pCtx->SysEnter.eip); + AssertRCReturn(rc, rc); + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_SYSENTER_EIP_MSR); + } + + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_SYSENTER_ESP_MSR) + { + int rc = VMXWriteVmcsGstN(VMX_VMCS_GUEST_SYSENTER_ESP, pCtx->SysEnter.esp); + AssertRCReturn(rc, rc); + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_SYSENTER_ESP_MSR); + } + } + + if (ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged) & HM_CHANGED_GUEST_EFER_MSR) + { + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_EFER); + + if (hmR0VmxShouldSwapEferMsr(pVCpu)) + { + /* + * If the CPU supports VMCS controls for swapping EFER, use it. Otherwise, we have no option + * but to use the auto-load store MSR area in the VMCS for swapping EFER. See @bugref{7368}. + */ + if (pVM->hm.s.vmx.fSupportsVmcsEfer) + { + int rc = VMXWriteVmcs64(VMX_VMCS64_GUEST_EFER_FULL, pCtx->msrEFER); + AssertRCReturn(rc,rc); + Log4Func(("EFER=%#RX64\n", pCtx->msrEFER)); + } + else + { + int rc = hmR0VmxAddAutoLoadStoreMsr(pVCpu, MSR_K6_EFER, pCtx->msrEFER, false /* fUpdateHostMsr */, + NULL /* pfAddedAndUpdated */); + AssertRCReturn(rc, rc); + + /* We need to intercept reads too, see @bugref{7386#c16}. 
*/ + if (pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed1 & VMX_PROC_CTLS_USE_MSR_BITMAPS) + hmR0VmxSetMsrPermission(pVCpu, MSR_K6_EFER, VMXMSREXIT_INTERCEPT_READ, VMXMSREXIT_INTERCEPT_WRITE); + Log4Func(("MSR[--]: u32Msr=%#RX32 u64Value=%#RX64 cMsrs=%u\n", MSR_K6_EFER, pCtx->msrEFER, + pVCpu->hm.s.vmx.cMsrs)); + } + } + else if (!pVM->hm.s.vmx.fSupportsVmcsEfer) + hmR0VmxRemoveAutoLoadStoreMsr(pVCpu, MSR_K6_EFER); + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~HM_CHANGED_GUEST_EFER_MSR); + } + + return VINF_SUCCESS; +} + + +#if HC_ARCH_BITS == 32 && defined(VBOX_ENABLE_64_BITS_GUESTS) +/** + * Check if guest state allows safe use of 32-bit switcher again. + * + * Segment bases and protected mode structures must be 32-bit addressable + * because the 32-bit switcher will ignore high dword when writing these VMCS + * fields. See @bugref{8432} for details. + * + * @returns true if safe, false if must continue to use the 64-bit switcher. + * @param pCtx Pointer to the guest-CPU context. + * + * @remarks No-long-jump zone!!! + */ +static bool hmR0VmxIs32BitSwitcherSafe(PCCPUMCTX pCtx) +{ + if (pCtx->gdtr.pGdt & UINT64_C(0xffffffff00000000)) return false; + if (pCtx->idtr.pIdt & UINT64_C(0xffffffff00000000)) return false; + if (pCtx->ldtr.u64Base & UINT64_C(0xffffffff00000000)) return false; + if (pCtx->tr.u64Base & UINT64_C(0xffffffff00000000)) return false; + if (pCtx->es.u64Base & UINT64_C(0xffffffff00000000)) return false; + if (pCtx->cs.u64Base & UINT64_C(0xffffffff00000000)) return false; + if (pCtx->ss.u64Base & UINT64_C(0xffffffff00000000)) return false; + if (pCtx->ds.u64Base & UINT64_C(0xffffffff00000000)) return false; + if (pCtx->fs.u64Base & UINT64_C(0xffffffff00000000)) return false; + if (pCtx->gs.u64Base & UINT64_C(0xffffffff00000000)) return false; + + /* All good, bases are 32-bit. */ + return true; +} +#endif + + +/** + * Selects up the appropriate function to run guest code. + * + * @returns VBox status code. 
+ * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxSelectVMRunHandler(PVMCPU pVCpu) +{ + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if (CPUMIsGuestInLongModeEx(pCtx)) + { +#ifndef VBOX_ENABLE_64_BITS_GUESTS + return VERR_PGM_UNSUPPORTED_SHADOW_PAGING_MODE; +#endif + Assert(pVCpu->CTX_SUFF(pVM)->hm.s.fAllow64BitGuests); /* Guaranteed by hmR3InitFinalizeR0(). */ +#if HC_ARCH_BITS == 32 + /* 32-bit host. We need to switch to 64-bit before running the 64-bit guest. */ + if (pVCpu->hm.s.vmx.pfnStartVM != VMXR0SwitcherStartVM64) + { +#ifdef VBOX_STRICT + if (pVCpu->hm.s.vmx.pfnStartVM != NULL) /* Very first entry would have saved host-state already, ignore it. */ + { + /* Currently, all mode changes sends us back to ring-3, so these should be set. See @bugref{6944}. */ + uint64_t const fCtxChanged = ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged); + RT_UNTRUSTED_NONVOLATILE_COPY_FENCE(); + AssertMsg(fCtxChanged & ( HM_CHANGED_VMX_EXIT_CTLS + | HM_CHANGED_VMX_ENTRY_CTLS + | HM_CHANGED_GUEST_EFER_MSR), ("fCtxChanged=%#RX64\n", fCtxChanged)); + } +#endif + pVCpu->hm.s.vmx.pfnStartVM = VMXR0SwitcherStartVM64; + + /* Mark that we've switched to 64-bit handler, we can't safely switch back to 32-bit for + the rest of the VM run (until VM reset). See @bugref{8432#c7}. */ + pVCpu->hm.s.vmx.fSwitchedTo64on32 = true; + Log4Func(("Selected 64-bit switcher\n")); + } +#else + /* 64-bit host. */ + pVCpu->hm.s.vmx.pfnStartVM = VMXR0StartVM64; +#endif + } + else + { + /* Guest is not in long mode, use the 32-bit handler. */ +#if HC_ARCH_BITS == 32 + if ( pVCpu->hm.s.vmx.pfnStartVM != VMXR0StartVM32 + && !pVCpu->hm.s.vmx.fSwitchedTo64on32 /* If set, guest mode change does not imply switcher change. */ + && pVCpu->hm.s.vmx.pfnStartVM != NULL) /* Very first entry would have saved host-state already, ignore it. */ + { +# ifdef VBOX_STRICT + /* Currently, all mode changes sends us back to ring-3, so these should be set. 
See @bugref{6944}. */ + uint64_t const fCtxChanged = ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged); + RT_UNTRUSTED_NONVOLATILE_COPY_FENCE(); + AssertMsg(fCtxChanged & ( HM_CHANGED_VMX_EXIT_CTLS + | HM_CHANGED_VMX_ENTRY_CTLS + | HM_CHANGED_GUEST_EFER_MSR), ("fCtxChanged=%#RX64\n", fCtxChanged)); +# endif + } +# ifdef VBOX_ENABLE_64_BITS_GUESTS + /* + * Keep using the 64-bit switcher even though we're in 32-bit because of bad Intel + * design, see @bugref{8432#c7}. If real-on-v86 mode is active, clear the 64-bit + * switcher flag because now we know the guest is in a sane state where it's safe + * to use the 32-bit switcher. Otherwise check the guest state if it's safe to use + * the much faster 32-bit switcher again. + */ + if (!pVCpu->hm.s.vmx.fSwitchedTo64on32) + { + if (pVCpu->hm.s.vmx.pfnStartVM != VMXR0StartVM32) + Log4Func(("Selected 32-bit switcher\n")); + pVCpu->hm.s.vmx.pfnStartVM = VMXR0StartVM32; + } + else + { + Assert(pVCpu->hm.s.vmx.pfnStartVM == VMXR0SwitcherStartVM64); + if ( pVCpu->hm.s.vmx.RealMode.fRealOnV86Active + || hmR0VmxIs32BitSwitcherSafe(pCtx)) + { + pVCpu->hm.s.vmx.fSwitchedTo64on32 = false; + pVCpu->hm.s.vmx.pfnStartVM = VMXR0StartVM32; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_EFER_MSR + | HM_CHANGED_VMX_ENTRY_CTLS + | HM_CHANGED_VMX_EXIT_CTLS + | HM_CHANGED_HOST_CONTEXT); + Log4Func(("Selected 32-bit switcher (safe)\n")); + } + } +# else + pVCpu->hm.s.vmx.pfnStartVM = VMXR0StartVM32; +# endif +#else + pVCpu->hm.s.vmx.pfnStartVM = VMXR0StartVM32; +#endif + } + Assert(pVCpu->hm.s.vmx.pfnStartVM); + return VINF_SUCCESS; +} + + +/** + * Wrapper for running the guest code in VT-x. + * + * @returns VBox status code, no informational status codes. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +DECLINLINE(int) hmR0VmxRunGuest(PVMCPU pVCpu) +{ + /* Mark that HM is the keeper of all guest-CPU registers now that we're going to execute guest code. 
*/ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + pCtx->fExtrn |= HMVMX_CPUMCTX_EXTRN_ALL | CPUMCTX_EXTRN_KEEPER_HM; + + /* + * 64-bit Windows uses XMM registers in the kernel as the Microsoft compiler expresses + * floating-point operations using SSE instructions. Some XMM registers (XMM6-XMM15) are + * callee-saved and thus the need for this XMM wrapper. + * + * See MSDN "Configuring Programs for 64-bit/x64 Software Conventions / Register Usage". + */ + bool const fResumeVM = RT_BOOL(pVCpu->hm.s.vmx.fVmcsState & HMVMX_VMCS_STATE_LAUNCHED); + /** @todo Add stats for resume vs launch. */ + PVM pVM = pVCpu->CTX_SUFF(pVM); +#ifdef VBOX_WITH_KERNEL_USING_XMM + int rc = hmR0VMXStartVMWrapXMM(fResumeVM, pCtx, &pVCpu->hm.s.vmx.VMCSCache, pVM, pVCpu, pVCpu->hm.s.vmx.pfnStartVM); +#else + int rc = pVCpu->hm.s.vmx.pfnStartVM(fResumeVM, pCtx, &pVCpu->hm.s.vmx.VMCSCache, pVM, pVCpu); +#endif + AssertMsg(rc <= VINF_SUCCESS, ("%Rrc\n", rc)); + return rc; +} + + +/** + * Reports world-switch error and dumps some useful debug info. + * + * @param pVCpu The cross context virtual CPU structure. + * @param rcVMRun The return code from VMLAUNCH/VMRESUME. + * @param pVmxTransient Pointer to the VMX transient structure (only + * exitReason updated). + */ +static void hmR0VmxReportWorldSwitchError(PVMCPU pVCpu, int rcVMRun, PVMXTRANSIENT pVmxTransient) +{ + Assert(pVCpu); + Assert(pVmxTransient); + HMVMX_ASSERT_PREEMPT_SAFE(pVCpu); + + Log4Func(("VM-entry failure: %Rrc\n", rcVMRun)); + switch (rcVMRun) + { + case VERR_VMX_INVALID_VMXON_PTR: + AssertFailed(); + break; + case VINF_SUCCESS: /* VMLAUNCH/VMRESUME succeeded but VM-entry failed... yeah, true story. */ + case VERR_VMX_UNABLE_TO_START_VM: /* VMLAUNCH/VMRESUME itself failed. 
*/ + { + int rc = VMXReadVmcs32(VMX_VMCS32_RO_EXIT_REASON, &pVCpu->hm.s.vmx.LastError.u32ExitReason); + rc |= VMXReadVmcs32(VMX_VMCS32_RO_VM_INSTR_ERROR, &pVCpu->hm.s.vmx.LastError.u32InstrError); + rc |= hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient); + AssertRC(rc); + + pVCpu->hm.s.vmx.LastError.idEnteredCpu = pVCpu->hm.s.idEnteredCpu; + /* LastError.idCurrentCpu was already updated in hmR0VmxPreRunGuestCommitted(). + Cannot do it here as we may have been long preempted. */ + +#ifdef VBOX_STRICT + Log4(("uExitReason %#RX32 (VmxTransient %#RX16)\n", pVCpu->hm.s.vmx.LastError.u32ExitReason, + pVmxTransient->uExitReason)); + Log4(("Exit Qualification %#RX64\n", pVmxTransient->uExitQual)); + Log4(("InstrError %#RX32\n", pVCpu->hm.s.vmx.LastError.u32InstrError)); + if (pVCpu->hm.s.vmx.LastError.u32InstrError <= HMVMX_INSTR_ERROR_MAX) + Log4(("InstrError Desc. \"%s\"\n", g_apszVmxInstrErrors[pVCpu->hm.s.vmx.LastError.u32InstrError])); + else + Log4(("InstrError Desc. Range exceeded %u\n", HMVMX_INSTR_ERROR_MAX)); + Log4(("Entered host CPU %u\n", pVCpu->hm.s.vmx.LastError.idEnteredCpu)); + Log4(("Current host CPU %u\n", pVCpu->hm.s.vmx.LastError.idCurrentCpu)); + + /* VMX control bits. 
*/ + uint32_t u32Val; + uint64_t u64Val; + RTHCUINTREG uHCReg; + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_PIN_EXEC, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_PIN_EXEC %#RX32\n", u32Val)); + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_PROC_EXEC %#RX32\n", u32Val)); + if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_SECONDARY_CTLS) + { + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_PROC_EXEC2, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_PROC_EXEC2 %#RX32\n", u32Val)); + } + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_ENTRY, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_ENTRY %#RX32\n", u32Val)); + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_EXIT, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_EXIT %#RX32\n", u32Val)); + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_CR3_TARGET_COUNT, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_CR3_TARGET_COUNT %#RX32\n", u32Val)); + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_ENTRY_INTERRUPTION_INFO, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_ENTRY_INTERRUPTION_INFO %#RX32\n", u32Val)); + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_ENTRY_EXCEPTION_ERRCODE, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_ENTRY_EXCEPTION_ERRCODE %#RX32\n", u32Val)); + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_ENTRY_INSTR_LENGTH, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_ENTRY_INSTR_LENGTH %u\n", u32Val)); + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_TPR_THRESHOLD, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_TPR_THRESHOLD %u\n", u32Val)); + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_EXIT_MSR_STORE_COUNT, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_EXIT_MSR_STORE_COUNT %u (guest MSRs)\n", u32Val)); + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_EXIT_MSR_LOAD_COUNT, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_EXIT_MSR_LOAD_COUNT %u (host MSRs)\n", u32Val)); + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_ENTRY_MSR_LOAD_COUNT, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_ENTRY_MSR_LOAD_COUNT %u (guest MSRs)\n", u32Val)); + rc = 
VMXReadVmcs32(VMX_VMCS32_CTRL_EXCEPTION_BITMAP, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_EXCEPTION_BITMAP %#RX32\n", u32Val)); + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_PAGEFAULT_ERROR_MASK, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_PAGEFAULT_ERROR_MASK %#RX32\n", u32Val)); + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_PAGEFAULT_ERROR_MATCH, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS32_CTRL_PAGEFAULT_ERROR_MATCH %#RX32\n", u32Val)); + rc = VMXReadVmcsHstN(VMX_VMCS_CTRL_CR0_MASK, &uHCReg); AssertRC(rc); + Log4(("VMX_VMCS_CTRL_CR0_MASK %#RHr\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_CTRL_CR0_READ_SHADOW, &uHCReg); AssertRC(rc); + Log4(("VMX_VMCS_CTRL_CR4_READ_SHADOW %#RHr\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_CTRL_CR4_MASK, &uHCReg); AssertRC(rc); + Log4(("VMX_VMCS_CTRL_CR4_MASK %#RHr\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_CTRL_CR4_READ_SHADOW, &uHCReg); AssertRC(rc); + Log4(("VMX_VMCS_CTRL_CR4_READ_SHADOW %#RHr\n", uHCReg)); + if (pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging) + { + rc = VMXReadVmcs64(VMX_VMCS64_CTRL_EPTP_FULL, &u64Val); AssertRC(rc); + Log4(("VMX_VMCS64_CTRL_EPTP_FULL %#RX64\n", u64Val)); + } + + /* Guest bits. */ + rc = VMXReadVmcsGstN(VMX_VMCS_GUEST_RIP, &u64Val); AssertRC(rc); + Log4(("Old Guest Rip %#RX64 New %#RX64\n", pVCpu->cpum.GstCtx.rip, u64Val)); + rc = VMXReadVmcsGstN(VMX_VMCS_GUEST_RSP, &u64Val); AssertRC(rc); + Log4(("Old Guest Rsp %#RX64 New %#RX64\n", pVCpu->cpum.GstCtx.rsp, u64Val)); + rc = VMXReadVmcs32(VMX_VMCS_GUEST_RFLAGS, &u32Val); AssertRC(rc); + Log4(("Old Guest Rflags %#RX32 New %#RX32\n", pVCpu->cpum.GstCtx.eflags.u32, u32Val)); + if (pVCpu->CTX_SUFF(pVM)->hm.s.vmx.fVpid) + { + rc = VMXReadVmcs32(VMX_VMCS16_VPID, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS16_VPID %u\n", u32Val)); + } + + /* Host bits. 
*/ + rc = VMXReadVmcsHstN(VMX_VMCS_HOST_CR0, &uHCReg); AssertRC(rc); + Log4(("Host CR0 %#RHr\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_HOST_CR3, &uHCReg); AssertRC(rc); + Log4(("Host CR3 %#RHr\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_HOST_CR4, &uHCReg); AssertRC(rc); + Log4(("Host CR4 %#RHr\n", uHCReg)); + + RTGDTR HostGdtr; + PCX86DESCHC pDesc; + ASMGetGDTR(&HostGdtr); + rc = VMXReadVmcs32(VMX_VMCS16_HOST_CS_SEL, &u32Val); AssertRC(rc); + Log4(("Host CS %#08x\n", u32Val)); + if (u32Val < HostGdtr.cbGdt) + { + pDesc = (PCX86DESCHC)(HostGdtr.pGdt + (u32Val & X86_SEL_MASK)); + hmR0DumpDescriptor(pDesc, u32Val, "CS: "); + } + + rc = VMXReadVmcs32(VMX_VMCS16_HOST_DS_SEL, &u32Val); AssertRC(rc); + Log4(("Host DS %#08x\n", u32Val)); + if (u32Val < HostGdtr.cbGdt) + { + pDesc = (PCX86DESCHC)(HostGdtr.pGdt + (u32Val & X86_SEL_MASK)); + hmR0DumpDescriptor(pDesc, u32Val, "DS: "); + } + + rc = VMXReadVmcs32(VMX_VMCS16_HOST_ES_SEL, &u32Val); AssertRC(rc); + Log4(("Host ES %#08x\n", u32Val)); + if (u32Val < HostGdtr.cbGdt) + { + pDesc = (PCX86DESCHC)(HostGdtr.pGdt + (u32Val & X86_SEL_MASK)); + hmR0DumpDescriptor(pDesc, u32Val, "ES: "); + } + + rc = VMXReadVmcs32(VMX_VMCS16_HOST_FS_SEL, &u32Val); AssertRC(rc); + Log4(("Host FS %#08x\n", u32Val)); + if (u32Val < HostGdtr.cbGdt) + { + pDesc = (PCX86DESCHC)(HostGdtr.pGdt + (u32Val & X86_SEL_MASK)); + hmR0DumpDescriptor(pDesc, u32Val, "FS: "); + } + + rc = VMXReadVmcs32(VMX_VMCS16_HOST_GS_SEL, &u32Val); AssertRC(rc); + Log4(("Host GS %#08x\n", u32Val)); + if (u32Val < HostGdtr.cbGdt) + { + pDesc = (PCX86DESCHC)(HostGdtr.pGdt + (u32Val & X86_SEL_MASK)); + hmR0DumpDescriptor(pDesc, u32Val, "GS: "); + } + + rc = VMXReadVmcs32(VMX_VMCS16_HOST_SS_SEL, &u32Val); AssertRC(rc); + Log4(("Host SS %#08x\n", u32Val)); + if (u32Val < HostGdtr.cbGdt) + { + pDesc = (PCX86DESCHC)(HostGdtr.pGdt + (u32Val & X86_SEL_MASK)); + hmR0DumpDescriptor(pDesc, u32Val, "SS: "); + } + + rc = VMXReadVmcs32(VMX_VMCS16_HOST_TR_SEL, &u32Val); AssertRC(rc); 
+ Log4(("Host TR %#08x\n", u32Val)); + if (u32Val < HostGdtr.cbGdt) + { + pDesc = (PCX86DESCHC)(HostGdtr.pGdt + (u32Val & X86_SEL_MASK)); + hmR0DumpDescriptor(pDesc, u32Val, "TR: "); + } + + rc = VMXReadVmcsHstN(VMX_VMCS_HOST_TR_BASE, &uHCReg); AssertRC(rc); + Log4(("Host TR Base %#RHv\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_HOST_GDTR_BASE, &uHCReg); AssertRC(rc); + Log4(("Host GDTR Base %#RHv\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_HOST_IDTR_BASE, &uHCReg); AssertRC(rc); + Log4(("Host IDTR Base %#RHv\n", uHCReg)); + rc = VMXReadVmcs32(VMX_VMCS32_HOST_SYSENTER_CS, &u32Val); AssertRC(rc); + Log4(("Host SYSENTER CS %#08x\n", u32Val)); + rc = VMXReadVmcsHstN(VMX_VMCS_HOST_SYSENTER_EIP, &uHCReg); AssertRC(rc); + Log4(("Host SYSENTER EIP %#RHv\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_HOST_SYSENTER_ESP, &uHCReg); AssertRC(rc); + Log4(("Host SYSENTER ESP %#RHv\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_HOST_RSP, &uHCReg); AssertRC(rc); + Log4(("Host RSP %#RHv\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_HOST_RIP, &uHCReg); AssertRC(rc); + Log4(("Host RIP %#RHv\n", uHCReg)); +# if HC_ARCH_BITS == 64 + Log4(("MSR_K6_EFER = %#RX64\n", ASMRdMsr(MSR_K6_EFER))); + Log4(("MSR_K8_CSTAR = %#RX64\n", ASMRdMsr(MSR_K8_CSTAR))); + Log4(("MSR_K8_LSTAR = %#RX64\n", ASMRdMsr(MSR_K8_LSTAR))); + Log4(("MSR_K6_STAR = %#RX64\n", ASMRdMsr(MSR_K6_STAR))); + Log4(("MSR_K8_SF_MASK = %#RX64\n", ASMRdMsr(MSR_K8_SF_MASK))); + Log4(("MSR_K8_KERNEL_GS_BASE = %#RX64\n", ASMRdMsr(MSR_K8_KERNEL_GS_BASE))); +# endif +#endif /* VBOX_STRICT */ + break; + } + + default: + /* Impossible */ + AssertMsgFailed(("hmR0VmxReportWorldSwitchError %Rrc (%#x)\n", rcVMRun, rcVMRun)); + break; + } +} + + +#if HC_ARCH_BITS == 32 && defined(VBOX_ENABLE_64_BITS_GUESTS) +#ifndef VMX_USE_CACHED_VMCS_ACCESSES +# error "VMX_USE_CACHED_VMCS_ACCESSES not defined when it should be!" 
+#endif
+#ifdef VBOX_STRICT
+/**
+ * Strict-build helper: checks whether a VMCS field encoding is one of the
+ * natural-width guest-state fields accepted in the VMCS write cache.
+ *
+ * @returns true if @a idxField is in the list below, false otherwise.
+ * @param   idxField    The VMCS field encoding to check.
+ */
+static bool hmR0VmxIsValidWriteField(uint32_t idxField)
+{
+    static uint32_t const s_aidxWritableFields[] =
+    {
+        VMX_VMCS_GUEST_RIP,
+        VMX_VMCS_GUEST_RSP,
+        VMX_VMCS_GUEST_SYSENTER_EIP,
+        VMX_VMCS_GUEST_SYSENTER_ESP,
+        VMX_VMCS_GUEST_GDTR_BASE,
+        VMX_VMCS_GUEST_IDTR_BASE,
+        VMX_VMCS_GUEST_CS_BASE,
+        VMX_VMCS_GUEST_DS_BASE,
+        VMX_VMCS_GUEST_ES_BASE,
+        VMX_VMCS_GUEST_FS_BASE,
+        VMX_VMCS_GUEST_GS_BASE,
+        VMX_VMCS_GUEST_SS_BASE,
+        VMX_VMCS_GUEST_LDTR_BASE,
+        VMX_VMCS_GUEST_TR_BASE,
+        VMX_VMCS_GUEST_CR3
+    };
+    for (unsigned iField = 0; iField < RT_ELEMENTS(s_aidxWritableFields); iField++)
+        if (s_aidxWritableFields[iField] == idxField)
+            return true;
+    return false;
+}
+
+/**
+ * Strict-build helper: checks whether a VMCS field encoding is accepted in the
+ * VMCS read cache.
+ *
+ * @returns true for the read-only exit qualification field and for every
+ *          field hmR0VmxIsValidWriteField() accepts, false otherwise.
+ * @param   idxField    The VMCS field encoding to check.
+ */
+static bool hmR0VmxIsValidReadField(uint32_t idxField)
+{
+    /* The exit qualification is read-only; every other readable field should also be writable. */
+    return idxField == VMX_VMCS_RO_EXIT_QUALIFICATION
+        || hmR0VmxIsValidWriteField(idxField);
+}
+#endif /* VBOX_STRICT */
+
+
+/**
+ * Executes the specified handler in 64-bit mode.
+ *
+ * @returns VBox status code (no informational status codes).
+ * @param pVCpu The cross context virtual CPU structure.
+ * @param enmOp The operation to perform.
+ * @param cParams Number of parameters.
+ * @param paParam Array of 32-bit parameters.
+ */
+VMMR0DECL(int) VMXR0Execute64BitsHandler(PVMCPU pVCpu, HM64ON32OP enmOp, uint32_t cParams, uint32_t *paParam)
+{
+    PVM pVM = pVCpu->CTX_SUFF(pVM);
+    AssertReturn(pVM->hm.s.pfnHost32ToGuest64R0, VERR_HM_NO_32_TO_64_SWITCHER);
+    Assert(enmOp > HM64ON32OP_INVALID && enmOp < HM64ON32OP_END);
+    Assert(pVCpu->hm.s.vmx.VMCSCache.Write.cValidEntries <= RT_ELEMENTS(pVCpu->hm.s.vmx.VMCSCache.Write.aField));
+    Assert(pVCpu->hm.s.vmx.VMCSCache.Read.cValidEntries <= RT_ELEMENTS(pVCpu->hm.s.vmx.VMCSCache.Read.aField));
+
+#ifdef VBOX_STRICT
+    for (uint32_t i = 0; i < pVCpu->hm.s.vmx.VMCSCache.Write.cValidEntries; i++)
+        Assert(hmR0VmxIsValidWriteField(pVCpu->hm.s.vmx.VMCSCache.Write.aField[i]));
+
+    for (uint32_t i = 0; i <pVCpu->hm.s.vmx.VMCSCache.Read.cValidEntries; i++)
+        Assert(hmR0VmxIsValidReadField(pVCpu->hm.s.vmx.VMCSCache.Read.aField[i]));
+#endif
+
+    /* Disable interrupts. The saved flags are restored on both return paths below. */
+    RTCCUINTREG fOldEFlags = ASMIntDisableFlags();
+
+#ifdef VBOX_WITH_VMMR0_DISABLE_LAPIC_NMI
+    RTCPUID idHostCpu = RTMpCpuId();
+    CPUMR0SetLApic(pVCpu, idHostCpu);
+#endif
+
+    PCHMPHYSCPU pHostCpu = hmR0GetCurrentCpu();
+    RTHCPHYS HCPhysCpuPage = pHostCpu->HCPhysMemObj; /* Passed to VMXEnable() when re-entering VMX root mode below. */
+
+    /* Clear VMCS. Marking it inactive, clearing implementation-specific data and writing VMCS data back to memory. */
+    VMXClearVmcs(pVCpu->hm.s.vmx.HCPhysVmcs);
+    pVCpu->hm.s.vmx.fVmcsState = HMVMX_VMCS_STATE_CLEAR;
+
+    /* Leave VMX Root Mode. */
+    VMXDisable();
+
+    SUPR0ChangeCR4(0, ~X86_CR4_VMXE); /* NOTE(review): presumably (OR-mask, AND-mask), i.e. clears CR4.VMXE - confirm. */
+
+    CPUMSetHyperESP(pVCpu, VMMGetStackRC(pVCpu));
+    CPUMSetHyperEIP(pVCpu, enmOp);
+    for (int i = (int)cParams - 1; i >= 0; i--) /* Pushed in reverse order, so paParam[0] is pushed last. */
+        CPUMPushHyper(pVCpu, paParam[i]);
+
+    STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatWorldSwitch3264, z);
+
+    /* Call the switcher. */
+    int rc = pVM->hm.s.pfnHost32ToGuest64R0(pVM, RT_UOFFSETOF_DYN(VM, aCpus[pVCpu->idCpu].cpum) - RT_UOFFSETOF(VM, cpum));
+    STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatWorldSwitch3264, z);
+
+    /** @todo replace with hmR0VmxEnterRootMode() and hmR0VmxLeaveRootMode(). */
+    /* Make sure the VMX instructions don't cause #UD faults. */
+    SUPR0ChangeCR4(X86_CR4_VMXE, RTCCUINTREG_MAX);
+
+    /* Re-enter VMX Root Mode */
+    int rc2 = VMXEnable(HCPhysCpuPage);
+    if (RT_FAILURE(rc2))
+    {
+        /* Undo the CR4 change, restore the flags, record the page address and return the
+           VMXEnable() status; the switcher status in rc is dropped on this path. */
+        SUPR0ChangeCR4(0, ~X86_CR4_VMXE);
+        ASMSetFlags(fOldEFlags);
+        pVM->hm.s.vmx.HCPhysVmxEnableError = HCPhysCpuPage;
+        return rc2;
+    }
+
+    rc2 = VMXActivateVmcs(pVCpu->hm.s.vmx.HCPhysVmcs);
+    AssertRC(rc2); /* Only asserted; the function still returns the switcher status rc. */
+    pVCpu->hm.s.vmx.fVmcsState = HMVMX_VMCS_STATE_ACTIVE;
+    Assert(!(ASMGetFlags() & X86_EFL_IF));
+    ASMSetFlags(fOldEFlags);
+    return rc;
+}
+
+
+/**
+ * Prepares for and executes VMLAUNCH (64-bit guests) for 32-bit hosts
+ * supporting 64-bit guests.
+ *
+ * Builds a ten-dword parameter array and runs the HM64ON32OP_VMXRCStartVM64
+ * operation through VMXR0Execute64BitsHandler().
+ *
+ * @returns VBox status code.
+ * @param fResume Whether to VMLAUNCH or VMRESUME.  Not used by this
+ *                function (NOREF).
+ * @param pCtx Pointer to the guest-CPU context.  Only touched in
+ *             VBOX_WITH_CRASHDUMP_MAGIC builds.
+ * @param pCache Pointer to the VMCS cache.  Only touched in
+ *               VBOX_WITH_CRASHDUMP_MAGIC builds and in DEBUG builds with
+ *               VMX_USE_CACHED_VMCS_ACCESSES.
+ * @param pVM The cross context VM structure.
+ * @param pVCpu The cross context virtual CPU structure.
+ */
+DECLASM(int) VMXR0SwitcherStartVM64(RTHCUINT fResume, PCPUMCTX pCtx, PVMCSCACHE pCache, PVM pVM, PVMCPU pVCpu)
+{
+    NOREF(fResume);
+
+    PCHMPHYSCPU pHostCpu = hmR0GetCurrentCpu();
+    RTHCPHYS const HCPhysCpuPage = pHostCpu->HCPhysMemObj;
+
+#ifdef VBOX_WITH_CRASHDUMP_MAGIC
+    pCache->uPos = 1;
+    pCache->interPD = PGMGetInterPaeCR3(pVM);
+    pCache->pSwitcher = (uint64_t)pVM->hm.s.pfnHost32ToGuest64R0;
+#endif
+
+#if defined(DEBUG) && defined(VMX_USE_CACHED_VMCS_ACCESSES)
+    pCache->TestIn.HCPhysCpuPage = 0; /* The Test* fields are zeroed here and compared after the call below. */
+    pCache->TestIn.HCPhysVmcs = 0;
+    pCache->TestIn.pCache = 0;
+    pCache->TestOut.HCPhysVmcs = 0;
+    pCache->TestOut.pCache = 0;
+    pCache->TestOut.pCtx = 0;
+    pCache->TestOut.eflags = 0;
+#else
+    NOREF(pCache);
+#endif
+
+    uint32_t aParam[10];
+    aParam[0] = RT_LO_U32(HCPhysCpuPage); /* Param 1: VMXON physical address - Lo. */
+    aParam[1] = RT_HI_U32(HCPhysCpuPage); /* Param 1: VMXON physical address - Hi. */
+    aParam[2] = RT_LO_U32(pVCpu->hm.s.vmx.HCPhysVmcs); /* Param 2: VMCS physical address - Lo. */
+    aParam[3] = RT_HI_U32(pVCpu->hm.s.vmx.HCPhysVmcs); /* Param 2: VMCS physical address - Hi. */
+    aParam[4] = VM_RC_ADDR(pVM, &pVM->aCpus[pVCpu->idCpu].hm.s.vmx.VMCSCache); /* RC address of this VCPU's VMCS cache. */
+    aParam[5] = 0; /* NOTE(review): presumably the high dword of the preceding address - confirm against the switcher. */
+    aParam[6] = VM_RC_ADDR(pVM, pVM); /* RC address of the VM structure. */
+    aParam[7] = 0;
+    aParam[8] = VM_RC_ADDR(pVM, pVCpu); /* RC address of the VCPU structure. */
+    aParam[9] = 0;
+
+#ifdef VBOX_WITH_CRASHDUMP_MAGIC
+    pCtx->dr[4] = pVM->hm.s.vmx.pScratchPhys + 16 + 8;
+    *(uint32_t *)(pVM->hm.s.vmx.pScratch + 16 + 8) = 1;
+#endif
+    int rc = VMXR0Execute64BitsHandler(pVCpu, HM64ON32OP_VMXRCStartVM64, RT_ELEMENTS(aParam), &aParam[0]);
+
+#ifdef VBOX_WITH_CRASHDUMP_MAGIC
+    Assert(*(uint32_t *)(pVM->hm.s.vmx.pScratch + 16 + 8) == 5);
+    Assert(pCtx->dr[4] == 10);
+    *(uint32_t *)(pVM->hm.s.vmx.pScratch + 16 + 8) = 0xff;
+#endif
+
+#if defined(DEBUG) && defined(VMX_USE_CACHED_VMCS_ACCESSES)
+    AssertMsg(pCache->TestIn.HCPhysCpuPage == HCPhysCpuPage, ("%RHp vs %RHp\n", pCache->TestIn.HCPhysCpuPage, HCPhysCpuPage));
+    AssertMsg(pCache->TestIn.HCPhysVmcs == pVCpu->hm.s.vmx.HCPhysVmcs, ("%RHp vs %RHp\n", pCache->TestIn.HCPhysVmcs,
+                                                                       pVCpu->hm.s.vmx.HCPhysVmcs));
+    AssertMsg(pCache->TestIn.HCPhysVmcs == pCache->TestOut.HCPhysVmcs, ("%RHp vs %RHp\n", pCache->TestIn.HCPhysVmcs,
+                                                                       pCache->TestOut.HCPhysVmcs));
+    AssertMsg(pCache->TestIn.pCache == pCache->TestOut.pCache, ("%RGv vs %RGv\n", pCache->TestIn.pCache,
+                                                               pCache->TestOut.pCache));
+    AssertMsg(pCache->TestIn.pCache == VM_RC_ADDR(pVM, &pVM->aCpus[pVCpu->idCpu].hm.s.vmx.VMCSCache),
+              ("%RGv vs %RGv\n", pCache->TestIn.pCache, VM_RC_ADDR(pVM, &pVM->aCpus[pVCpu->idCpu].hm.s.vmx.VMCSCache)));
+    AssertMsg(pCache->TestIn.pCtx == pCache->TestOut.pCtx, ("%RGv vs %RGv\n", pCache->TestIn.pCtx,
+                                                           pCache->TestOut.pCtx));
+    Assert(!(pCache->TestOut.eflags & X86_EFL_IF));
+#endif
+    NOREF(pCtx);
+    return rc;
+}
+
+
+/**
+ * Initialize the VMCS-Read cache.
+ *
+ * The VMCS cache is used for 32-bit hosts running 64-bit guests (except 32-bit
+ * Darwin which runs with 64-bit paging in 32-bit mode) for 64-bit fields that
+ * cannot be accessed in 32-bit mode. Some 64-bit fields -can- be accessed
+ * (those that have a 32-bit FULL & HIGH part).
+ *
+ * Each field is registered through the function-local
+ * VMXLOCAL_INIT_READ_CACHE_FIELD macro, which stores the field encoding in the
+ * slot named by the field's _CACHE_IDX constant, zeroes the cached value and
+ * bumps the local field counter.  The counter is then asserted against the
+ * expected number of entries, and that expected number becomes
+ * Read.cValidEntries.
+ *
+ * @returns VBox status code (the code below always returns VINF_SUCCESS).
+ * @param pVCpu The cross context virtual CPU structure.
+ */
+static int hmR0VmxInitVmcsReadCache(PVMCPU pVCpu)
+{
+#define VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, idxField) \
+    do { \
+        Assert(pCache->Read.aField[idxField##_CACHE_IDX] == 0); \
+        pCache->Read.aField[idxField##_CACHE_IDX] = idxField; \
+        pCache->Read.aFieldVal[idxField##_CACHE_IDX] = 0; \
+        ++cReadFields; \
+    } while (0)
+
+    PVMCSCACHE pCache = &pVCpu->hm.s.vmx.VMCSCache;
+    uint32_t cReadFields = 0;
+
+    /*
+     * Don't remove the #if 0'd fields in this code. They're listed here for consistency
+     * and serve to indicate exceptions to the rules.
+     */
+
+    /* Guest-natural selector base fields. */
+#if 0
+    /* These are 32-bit in practice. See Intel spec. 2.5 "Control Registers". */
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_CR0);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_CR4);
+#endif
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_ES_BASE);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_CS_BASE);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_SS_BASE);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_DS_BASE);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_FS_BASE);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_GS_BASE);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_LDTR_BASE);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_TR_BASE);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_GDTR_BASE);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_IDTR_BASE);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_RSP);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_RIP);
+#if 0
+    /* Unused natural width guest-state fields. */
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_PENDING_DEBUG_XCPTS);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_CR3); /* Handled in Nested Paging case */
+#endif
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_SYSENTER_ESP);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_SYSENTER_EIP);
+
+    /* 64-bit guest-state fields; unused as we use two 32-bit VMREADs for
+       these 64-bit fields (using "FULL" and "HIGH" fields). */
+#if 0
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS64_GUEST_VMCS_LINK_PTR_FULL);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS64_GUEST_DEBUGCTL_FULL);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS64_GUEST_PAT_FULL);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS64_GUEST_EFER_FULL);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS64_GUEST_PERF_GLOBAL_CTRL_FULL);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS64_GUEST_PDPTE0_FULL);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS64_GUEST_PDPTE1_FULL);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS64_GUEST_PDPTE2_FULL);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS64_GUEST_PDPTE3_FULL);
+#endif
+
+    /* Natural-width read-only data fields (the _RO_ encodings), not guest-state fields. */
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_RO_EXIT_QUALIFICATION);
+    VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_RO_GUEST_LINEAR_ADDR);
+
+    if (pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging)
+    {
+        /* With nested paging the guest CR3 is cached as well, hence the larger expected count. */
+        VMXLOCAL_INIT_READ_CACHE_FIELD(pCache, VMX_VMCS_GUEST_CR3);
+        AssertMsg(cReadFields == VMX_VMCS_MAX_NESTED_PAGING_CACHE_IDX, ("cReadFields=%u expected %u\n", cReadFields,
+                                                                       VMX_VMCS_MAX_NESTED_PAGING_CACHE_IDX));
+        pCache->Read.cValidEntries = VMX_VMCS_MAX_NESTED_PAGING_CACHE_IDX;
+    }
+    else
+    {
+        AssertMsg(cReadFields == VMX_VMCS_MAX_CACHE_IDX, ("cReadFields=%u expected %u\n", cReadFields, VMX_VMCS_MAX_CACHE_IDX));
+        pCache->Read.cValidEntries = VMX_VMCS_MAX_CACHE_IDX;
+    }
+
+#undef VMXLOCAL_INIT_READ_CACHE_FIELD
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Writes a field into the VMCS. This can either directly invoke a VMWRITE or
+ * queue up the VMWRITE by using the VMCS write cache (on 32-bit hosts, except
+ * darwin, running 64-bit guests).
+ *
+ * @returns VBox status code.
+ * @param pVCpu The cross context virtual CPU structure.
+ * @param idxField The VMCS field encoding.
+ * @param u64Val 16, 32 or 64-bit value.
+ */
+VMMR0DECL(int) VMXWriteVmcs64Ex(PVMCPU pVCpu, uint32_t idxField, uint64_t u64Val)
+{
+    int rc;
+    switch (idxField)
+    {
+        /*
+         * These fields consist of a "FULL" and a "HIGH" part which can be written to individually.
+         */
+        /* 64-bit Control fields. */
+        case VMX_VMCS64_CTRL_IO_BITMAP_A_FULL:
+        case VMX_VMCS64_CTRL_IO_BITMAP_B_FULL:
+        case VMX_VMCS64_CTRL_MSR_BITMAP_FULL:
+        case VMX_VMCS64_CTRL_EXIT_MSR_STORE_FULL:
+        case VMX_VMCS64_CTRL_EXIT_MSR_LOAD_FULL:
+        case VMX_VMCS64_CTRL_ENTRY_MSR_LOAD_FULL:
+        case VMX_VMCS64_CTRL_EXEC_VMCS_PTR_FULL:
+        case VMX_VMCS64_CTRL_TSC_OFFSET_FULL:
+        case VMX_VMCS64_CTRL_VIRT_APIC_PAGEADDR_FULL:
+        case VMX_VMCS64_CTRL_APIC_ACCESSADDR_FULL:
+        case VMX_VMCS64_CTRL_VMFUNC_CTRLS_FULL:
+        case VMX_VMCS64_CTRL_EPTP_FULL:
+        case VMX_VMCS64_CTRL_EPTP_LIST_FULL:
+        /* 64-bit Guest-state fields. */
+        case VMX_VMCS64_GUEST_VMCS_LINK_PTR_FULL:
+        case VMX_VMCS64_GUEST_DEBUGCTL_FULL:
+        case VMX_VMCS64_GUEST_PAT_FULL:
+        case VMX_VMCS64_GUEST_EFER_FULL:
+        case VMX_VMCS64_GUEST_PERF_GLOBAL_CTRL_FULL:
+        case VMX_VMCS64_GUEST_PDPTE0_FULL:
+        case VMX_VMCS64_GUEST_PDPTE1_FULL:
+        case VMX_VMCS64_GUEST_PDPTE2_FULL:
+        case VMX_VMCS64_GUEST_PDPTE3_FULL:
+        /* 64-bit Host-state fields. */
+        case VMX_VMCS64_HOST_PAT_FULL:
+        case VMX_VMCS64_HOST_EFER_FULL:
+        case VMX_VMCS64_HOST_PERF_GLOBAL_CTRL_FULL:
+        {
+            rc = VMXWriteVmcs32(idxField, RT_LO_U32(u64Val)); /* Low dword goes to the "FULL" encoding. */
+            rc |= VMXWriteVmcs32(idxField + 1, RT_HI_U32(u64Val)); /* High dword goes to the next encoding (the "HIGH" part). */
+            break;
+        }
+
+        /*
+         * These fields do not have high and low parts. Queue up the VMWRITE by using the VMCS write-cache (for 64-bit
+         * values). When we switch the host to 64-bit mode for running 64-bit guests, these VMWRITEs get executed then.
+         */
+        /* Natural-width Guest-state fields. */
+        case VMX_VMCS_GUEST_CR3:
+        case VMX_VMCS_GUEST_ES_BASE:
+        case VMX_VMCS_GUEST_CS_BASE:
+        case VMX_VMCS_GUEST_SS_BASE:
+        case VMX_VMCS_GUEST_DS_BASE:
+        case VMX_VMCS_GUEST_FS_BASE:
+        case VMX_VMCS_GUEST_GS_BASE:
+        case VMX_VMCS_GUEST_LDTR_BASE:
+        case VMX_VMCS_GUEST_TR_BASE:
+        case VMX_VMCS_GUEST_GDTR_BASE:
+        case VMX_VMCS_GUEST_IDTR_BASE:
+        case VMX_VMCS_GUEST_RSP:
+        case VMX_VMCS_GUEST_RIP:
+        case VMX_VMCS_GUEST_SYSENTER_ESP:
+        case VMX_VMCS_GUEST_SYSENTER_EIP:
+        {
+            if (!(RT_HI_U32(u64Val)))
+            {
+                /* If this field is 64-bit, VT-x will zero out the top bits. */
+                rc = VMXWriteVmcs32(idxField, RT_LO_U32(u64Val));
+            }
+            else
+            {
+                /* Assert that only the 32->64 switcher case should ever come here. */
+                Assert(pVCpu->CTX_SUFF(pVM)->hm.s.fAllow64BitGuests);
+                rc = VMXWriteCachedVmcsEx(pVCpu, idxField, u64Val);
+            }
+            break;
+        }
+
+        default:
+        {
+            AssertMsgFailed(("VMXWriteVmcs64Ex: Invalid field %#RX32 (pVCpu=%p u64Val=%#RX64)\n", idxField, pVCpu, u64Val));
+            rc = VERR_INVALID_PARAMETER;
+            break;
+        }
+    }
+    AssertRCReturn(rc, rc);
+    return rc;
+}
+
+
+/**
+ * Queue up a VMWRITE by using the VMCS write cache.
+ * This is only used on 32-bit hosts (except darwin) for 64-bit guests.
+ *
+ * If the field is already queued its cached value is overwritten, otherwise a
+ * new entry is appended.
+ *
+ * @returns VINF_SUCCESS when the value was queued or updated,
+ *          VERR_ACCESS_DENIED when the capacity check at the top fails.
+ * @param pVCpu The cross context virtual CPU structure.
+ * @param idxField The VMCS field encoding.
+ * @param u64Val 16, 32 or 64-bit value.
+ */
+VMMR0DECL(int) VMXWriteCachedVmcsEx(PVMCPU pVCpu, uint32_t idxField, uint64_t u64Val)
+{
+    AssertPtr(pVCpu);
+    PVMCSCACHE pCache = &pVCpu->hm.s.vmx.VMCSCache;
+
+    AssertMsgReturn(pCache->Write.cValidEntries < VMCSCACHE_MAX_ENTRY - 1,
+                    ("entries=%u\n", pCache->Write.cValidEntries), VERR_ACCESS_DENIED); /* Checked before the duplicate scan, so it also rejects updates of already queued fields. */
+
+    /* Make sure there are no duplicates. */
+    for (uint32_t i = 0; i < pCache->Write.cValidEntries; i++)
+    {
+        if (pCache->Write.aField[i] == idxField)
+        {
+            pCache->Write.aFieldVal[i] = u64Val;
+            return VINF_SUCCESS;
+        }
+    }
+
+    pCache->Write.aField[pCache->Write.cValidEntries] = idxField;
+    pCache->Write.aFieldVal[pCache->Write.cValidEntries] = u64Val;
+    pCache->Write.cValidEntries++;
+    return VINF_SUCCESS;
+}
+#endif /* HC_ARCH_BITS == 32 && defined(VBOX_ENABLE_64_BITS_GUESTS) */
+
+
+/**
+ * Sets up the usage of TSC-offsetting and updates the VMCS.
+ *
+ * If offsetting is not possible, cause VM-exits on RDTSC(P)s. Also sets up the
+ * VMX preemption timer.
+ *
+ * Returns nothing; failures of the VMCS writes are only asserted.
+ *
+ * @param pVCpu The cross context virtual CPU structure.
+ *
+ * @remarks No-long-jump zone!!!
+ */
+static void hmR0VmxUpdateTscOffsettingAndPreemptTimer(PVMCPU pVCpu)
+{
+    bool fOffsettedTsc;
+    bool fParavirtTsc;
+    PVM pVM = pVCpu->CTX_SUFF(pVM);
+    uint64_t uTscOffset;
+    if (pVM->hm.s.vmx.fUsePreemptTimer)
+    {
+        uint64_t cTicksToDeadline = TMCpuTickGetDeadlineAndTscOffset(pVM, pVCpu, &uTscOffset, &fOffsettedTsc, &fParavirtTsc);
+
+        /* Make sure the returned values have sane upper and lower boundaries. */
+        uint64_t u64CpuHz = SUPGetCpuHzFromGipBySetIndex(g_pSUPGlobalInfoPage, pVCpu->iHostCpuSet);
+        cTicksToDeadline = RT_MIN(cTicksToDeadline, u64CpuHz / 64); /* 1/64th of a second */
+        cTicksToDeadline = RT_MAX(cTicksToDeadline, u64CpuHz / 2048); /* 1/2048th of a second */
+        cTicksToDeadline >>= pVM->hm.s.vmx.cPreemptTimerShift; /* Scale the TSC ticks down by the configured shift before programming the timer. */
+
+        uint32_t cPreemptionTickCount = (uint32_t)RT_MIN(cTicksToDeadline, UINT32_MAX - 16); /* Clamp to the 32-bit timer field. */
+        int rc = VMXWriteVmcs32(VMX_VMCS32_PREEMPT_TIMER_VALUE, cPreemptionTickCount);
+        AssertRC(rc);
+    }
+    else
+        fOffsettedTsc = TMCpuTickCanUseRealTSC(pVM, pVCpu, &uTscOffset, &fParavirtTsc);
+
+    if (fParavirtTsc)
+    {
+        /* Currently neither Hyper-V nor KVM need to update their paravirt. TSC
+           information before every VM-entry, hence disable it for performance sake. */
+#if 0
+        int rc = GIMR0UpdateParavirtTsc(pVM, 0 /* u64Offset */);
+        AssertRC(rc);
+#endif
+        STAM_COUNTER_INC(&pVCpu->hm.s.StatTscParavirt);
+    }
+
+    uint32_t uProcCtls = pVCpu->hm.s.vmx.u32ProcCtls;
+    if ( fOffsettedTsc
+        && RT_LIKELY(!pVCpu->hm.s.fDebugWantRdTscExit))
+    {
+        if (pVCpu->hm.s.vmx.u64TscOffset != uTscOffset) /* Only rewrite the VMCS field when the offset actually changed. */
+        {
+            int rc = VMXWriteVmcs64(VMX_VMCS64_CTRL_TSC_OFFSET_FULL, uTscOffset);
+            AssertRC(rc);
+            pVCpu->hm.s.vmx.u64TscOffset = uTscOffset;
+        }
+
+        if (uProcCtls & VMX_PROC_CTLS_RDTSC_EXIT) /* Stop intercepting RDTSC(P) if it is currently intercepted. */
+        {
+            uProcCtls &= ~VMX_PROC_CTLS_RDTSC_EXIT;
+            int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, uProcCtls);
+            AssertRC(rc);
+            pVCpu->hm.s.vmx.u32ProcCtls = uProcCtls;
+        }
+        STAM_COUNTER_INC(&pVCpu->hm.s.StatTscOffset);
+    }
+    else
+    {
+        /* We can't use TSC-offsetting (non-fixed TSC, warp drive active etc.), VM-exit on RDTSC(P). */
+        if (!(uProcCtls & VMX_PROC_CTLS_RDTSC_EXIT))
+        {
+            uProcCtls |= VMX_PROC_CTLS_RDTSC_EXIT;
+            int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, uProcCtls);
+            AssertRC(rc);
+            pVCpu->hm.s.vmx.u32ProcCtls = uProcCtls;
+        }
+        STAM_COUNTER_INC(&pVCpu->hm.s.StatTscIntercept);
+    }
+}
+
+
+/**
+ * Gets the IEM exception flags for the specified vector and IDT vectoring /
+ * VM-exit interruption info type.
+ *
+ * @returns The IEM exception flags.
+ * @param uVector The event vector.
+ * @param uVmxVectorType The VMX event type.
+ *
+ * @remarks This function currently only constructs flags required for
+ * IEMEvaluateRecursiveXcpt and not the complete flags (e.g, error-code
+ * and CR2 aspects of an exception are not included).
+ */
+static uint32_t hmR0VmxGetIemXcptFlags(uint8_t uVector, uint32_t uVmxVectorType)
+{
+    /* Every recognised type/vector combination returns straight from its case;
+       only unexpected input asserts and falls through to the zero return at the end. */
+    switch (uVmxVectorType)
+    {
+        case VMX_IDT_VECTORING_INFO_TYPE_HW_XCPT:
+        case VMX_IDT_VECTORING_INFO_TYPE_NMI:
+            return IEM_XCPT_FLAGS_T_CPU_XCPT;
+
+        case VMX_IDT_VECTORING_INFO_TYPE_EXT_INT:
+            return IEM_XCPT_FLAGS_T_EXT_INT;
+
+        case VMX_IDT_VECTORING_INFO_TYPE_PRIV_SW_XCPT:
+            return IEM_XCPT_FLAGS_T_SOFT_INT | IEM_XCPT_FLAGS_ICEBP_INSTR;
+
+        case VMX_IDT_VECTORING_INFO_TYPE_SW_INT:
+            return IEM_XCPT_FLAGS_T_SOFT_INT;
+
+        case VMX_IDT_VECTORING_INFO_TYPE_SW_XCPT:
+            /* Software exceptions are only expected for the #BP and #OF vectors. */
+            if (uVector == X86_XCPT_BP)
+                return IEM_XCPT_FLAGS_T_SOFT_INT | IEM_XCPT_FLAGS_BP_INSTR;
+            if (uVector == X86_XCPT_OF)
+                return IEM_XCPT_FLAGS_T_SOFT_INT | IEM_XCPT_FLAGS_OF_INSTR;
+            AssertMsgFailed(("Unexpected vector for software int. uVector=%#x", uVector));
+            break;
+
+        default:
+            AssertMsgFailed(("Unexpected vector type! uVmxVectorType=%#x uVector=%#x", uVmxVectorType, uVector));
+            break;
+    }
+    return 0;
+}
+
+
+/**
+ * Sets an event as a pending event to be injected into the guest.
+ *
+ * @param pVCpu The cross context virtual CPU structure.
+ * @param u32IntInfo The VM-entry interruption-information field.
+ * @param cbInstr The VM-entry instruction length in bytes (for software
+ * interrupts, exceptions and privileged software
+ * exceptions).
+ * @param u32ErrCode The VM-entry exception error code.
+ * @param GCPtrFaultAddress The fault-address (CR2) in case it's a
+ * page-fault.
+ *
+ * @remarks Statistics counter assumes this is a guest event being injected or
+ * re-injected into the guest, i.e. 'StatInjectPendingReflect' is
+ * always incremented.
+ */
+DECLINLINE(void) hmR0VmxSetPendingEvent(PVMCPU pVCpu, uint32_t u32IntInfo, uint32_t cbInstr, uint32_t u32ErrCode,
+                                        RTGCUINTPTR GCPtrFaultAddress)
+{
+    /* Only one event may be pending at a time. */
+    Assert(!pVCpu->hm.s.Event.fPending);
+
+    /* Fill in the event details, then flag the event as pending. */
+    pVCpu->hm.s.Event.GCPtrFaultAddress = GCPtrFaultAddress;
+    pVCpu->hm.s.Event.cbInstr = cbInstr;
+    pVCpu->hm.s.Event.u32ErrCode = u32ErrCode;
+    pVCpu->hm.s.Event.u64IntInfo = u32IntInfo;
+    pVCpu->hm.s.Event.fPending = true;
+}
+
+
+/**
+ * Queues a double-fault (\#DF) hardware exception for injection into the VM.
+ *
+ * The error-code-valid bit is set and an error code of zero is passed along.
+ *
+ * @param pVCpu The cross context virtual CPU structure.
+ */
+DECLINLINE(void) hmR0VmxSetPendingXcptDF(PVMCPU pVCpu)
+{
+    uint32_t const fXcptInfo = RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_VALID, 1)
+                             | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_ERR_CODE_VALID, 1)
+                             | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_TYPE, VMX_EXIT_INT_INFO_TYPE_HW_XCPT)
+                             | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_VECTOR, X86_XCPT_DF);
+    hmR0VmxSetPendingEvent(pVCpu, fXcptInfo, 0 /* cbInstr */, 0 /* u32ErrCode */, 0 /* GCPtrFaultAddress */);
+}
+
+
+/**
+ * Queues an invalid-opcode (\#UD) hardware exception for injection into the VM.
+ *
+ * The error-code-valid bit is left clear.
+ *
+ * @param pVCpu The cross context virtual CPU structure.
+ */
+DECLINLINE(void) hmR0VmxSetPendingXcptUD(PVMCPU pVCpu)
+{
+    uint32_t const fXcptInfo = RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_VALID, 1)
+                             | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_ERR_CODE_VALID, 0)
+                             | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_TYPE, VMX_EXIT_INT_INFO_TYPE_HW_XCPT)
+                             | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_VECTOR, X86_XCPT_UD);
+    hmR0VmxSetPendingEvent(pVCpu, fXcptInfo, 0 /* cbInstr */, 0 /* u32ErrCode */, 0 /* GCPtrFaultAddress */);
+}
+
+
+/**
+ * Sets a debug (\#DB) exception as pending-for-injection into the VM.
+ *
+ * @param pVCpu The cross context virtual CPU structure.
+ */ +DECLINLINE(void) hmR0VmxSetPendingXcptDB(PVMCPU pVCpu) +{ + uint32_t const u32IntInfo = RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_VECTOR, X86_XCPT_DB) + | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_TYPE, VMX_EXIT_INT_INFO_TYPE_HW_XCPT) + | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_ERR_CODE_VALID, 0) + | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_VALID, 1); + hmR0VmxSetPendingEvent(pVCpu, u32IntInfo, 0 /* cbInstr */, 0 /* u32ErrCode */, 0 /* GCPtrFaultAddress */); +} + + +#ifdef VBOX_WITH_NESTED_HWVIRT_VMX +/** + * Sets a general-protection (\#GP) exception as pending-for-injection into the VM. + * + * @param pVCpu The cross context virtual CPU structure. + * @param u32ErrCode The error code for the general-protection exception. + */ +DECLINLINE(void) hmR0VmxSetPendingXcptGP(PVMCPU pVCpu, uint32_t u32ErrCode) +{ + uint32_t const u32IntInfo = RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_VECTOR, X86_XCPT_GP) + | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_TYPE, VMX_EXIT_INT_INFO_TYPE_HW_XCPT) + | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_ERR_CODE_VALID, 1) + | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_VALID, 1); + hmR0VmxSetPendingEvent(pVCpu, u32IntInfo, 0 /* cbInstr */, u32ErrCode, 0 /* GCPtrFaultAddress */); +} + + +/** + * Sets a stack (\#SS) exception as pending-for-injection into the VM. + * + * @param pVCpu The cross context virtual CPU structure. + * @param u32ErrCode The error code for the stack exception. + */ +DECLINLINE(void) hmR0VmxSetPendingXcptSS(PVMCPU pVCpu, uint32_t u32ErrCode) +{ + uint32_t const u32IntInfo = RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_VECTOR, X86_XCPT_SS) + | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_TYPE, VMX_EXIT_INT_INFO_TYPE_HW_XCPT) + | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_ERR_CODE_VALID, 1) + | RT_BF_MAKE(VMX_BF_ENTRY_INT_INFO_VALID, 1); + hmR0VmxSetPendingEvent(pVCpu, u32IntInfo, 0 /* cbInstr */, u32ErrCode, 0 /* GCPtrFaultAddress */); +} + + +# ifndef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM +/** + * Decodes the memory operand of an instruction that caused a VM-exit. 
+ * + * The VM-exit qualification field provides the displacement field for memory + * operand instructions, if any. + * + * @returns Strict VBox status code (i.e. informational status codes too). + * @retval VINF_SUCCESS if the operand was successfully decoded. + * @retval VINF_HM_PENDING_XCPT if an exception was raised while decoding the + * operand. + * @param pVCpu The cross context virtual CPU structure. + * @param uExitInstrInfo The VM-exit instruction information field. + * @param enmMemAccess The memory operand's access type (read or write). + * @param GCPtrDisp The instruction displacement field, if any. For + * RIP-relative addressing pass RIP + displacement here. + * @param pGCPtrMem Where to store the effective destination memory address. + */ +static VBOXSTRICTRC hmR0VmxDecodeMemOperand(PVMCPU pVCpu, uint32_t uExitInstrInfo, RTGCPTR GCPtrDisp, VMXMEMACCESS enmMemAccess, + PRTGCPTR pGCPtrMem) +{ + Assert(pGCPtrMem); + Assert(!CPUMIsGuestInRealOrV86Mode(pVCpu)); + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_RIP | CPUMCTX_EXTRN_RSP | CPUMCTX_EXTRN_SREG_MASK | CPUMCTX_EXTRN_EFER + | CPUMCTX_EXTRN_CR0); + + static uint64_t const s_auAddrSizeMasks[] = { UINT64_C(0xffff), UINT64_C(0xffffffff), UINT64_C(0xffffffffffffffff) }; + static uint64_t const s_auAccessSizeMasks[] = { sizeof(uint16_t), sizeof(uint32_t), sizeof(uint64_t) }; + AssertCompile(RT_ELEMENTS(s_auAccessSizeMasks) == RT_ELEMENTS(s_auAddrSizeMasks)); + + VMXEXITINSTRINFO ExitInstrInfo; + ExitInstrInfo.u = uExitInstrInfo; + uint8_t const uAddrSize = ExitInstrInfo.All.u3AddrSize; + uint8_t const iSegReg = ExitInstrInfo.All.iSegReg; + bool const fIdxRegValid = !ExitInstrInfo.All.fIdxRegInvalid; + uint8_t const iIdxReg = ExitInstrInfo.All.iIdxReg; + uint8_t const uScale = ExitInstrInfo.All.u2Scaling; + bool const fBaseRegValid = !ExitInstrInfo.All.fBaseRegInvalid; + uint8_t const iBaseReg = ExitInstrInfo.All.iBaseReg; + bool const fIsMemOperand = !ExitInstrInfo.All.fIsRegOperand; + bool const 
fIsLongMode = CPUMIsGuestInLongModeEx(&pVCpu->cpum.GstCtx); + + /* + * Validate instruction information. + * This shouldn't happen on real hardware but useful while testing our nested hardware-virtualization code. + */ + AssertLogRelMsgReturn(uAddrSize < RT_ELEMENTS(s_auAddrSizeMasks), + ("Invalid address size. ExitInstrInfo=%#RX32\n", ExitInstrInfo.u), VERR_VMX_IPE_1); + AssertLogRelMsgReturn(iSegReg < X86_SREG_COUNT, + ("Invalid segment register. ExitInstrInfo=%#RX32\n", ExitInstrInfo.u), VERR_VMX_IPE_2); + AssertLogRelMsgReturn(fIsMemOperand, + ("Expected memory operand. ExitInstrInfo=%#RX32\n", ExitInstrInfo.u), VERR_VMX_IPE_3); + + /* + * Compute the complete effective address. + * + * See AMD instruction spec. 1.4.2 "SIB Byte Format" + * See AMD spec. 4.5.2 "Segment Registers". + */ + RTGCPTR GCPtrMem = GCPtrDisp; + if (fBaseRegValid) + GCPtrMem += pVCpu->cpum.GstCtx.aGRegs[iBaseReg].u64; + if (fIdxRegValid) + GCPtrMem += pVCpu->cpum.GstCtx.aGRegs[iIdxReg].u64 << uScale; + + RTGCPTR const GCPtrOff = GCPtrMem; + if ( !fIsLongMode + || iSegReg >= X86_SREG_FS) + GCPtrMem += pVCpu->cpum.GstCtx.aSRegs[iSegReg].u64Base; + GCPtrMem &= s_auAddrSizeMasks[uAddrSize]; + + /* + * Validate effective address. + * See AMD spec. 4.5.3 "Segment Registers in 64-Bit Mode". + */ + uint8_t const cbAccess = s_auAccessSizeMasks[uAddrSize]; + Assert(cbAccess > 0); + if (fIsLongMode) + { + if (X86_IS_CANONICAL(GCPtrMem)) + { + *pGCPtrMem = GCPtrMem; + return VINF_SUCCESS; + } + + /** @todo r=ramshankar: We should probably raise \#SS or \#GP. See AMD spec. 4.12.2 + * "Data Limit Checks in 64-bit Mode". */ + Log4Func(("Long mode effective address is not canonical GCPtrMem=%#RX64\n", GCPtrMem)); + hmR0VmxSetPendingXcptGP(pVCpu, 0); + return VINF_HM_PENDING_XCPT; + } + + /* + * This is a watered down version of iemMemApplySegment(). + * Parts that are not applicable for VMX instructions like real-or-v8086 mode + * and segment CPL/DPL checks are skipped. 
+ */ + RTGCPTR32 const GCPtrFirst32 = (RTGCPTR32)GCPtrOff; + RTGCPTR32 const GCPtrLast32 = GCPtrFirst32 + cbAccess - 1; + PCCPUMSELREG pSel = &pVCpu->cpum.GstCtx.aSRegs[iSegReg]; + + /* Check if the segment is present and usable. */ + if ( pSel->Attr.n.u1Present + && !pSel->Attr.n.u1Unusable) + { + Assert(pSel->Attr.n.u1DescType); + if (!(pSel->Attr.n.u4Type & X86_SEL_TYPE_CODE)) + { + /* Check permissions for the data segment. */ + if ( enmMemAccess == VMXMEMACCESS_WRITE + && !(pSel->Attr.n.u4Type & X86_SEL_TYPE_WRITE)) + { + Log4Func(("Data segment access invalid. iSegReg=%#x Attr=%#RX32\n", iSegReg, pSel->Attr.u)); + hmR0VmxSetPendingXcptGP(pVCpu, iSegReg); + return VINF_HM_PENDING_XCPT; + } + + /* Check limits if it's a normal data segment. */ + if (!(pSel->Attr.n.u4Type & X86_SEL_TYPE_DOWN)) + { + if ( GCPtrFirst32 > pSel->u32Limit + || GCPtrLast32 > pSel->u32Limit) + { + Log4Func(("Data segment limit exceeded." + "iSegReg=%#x GCPtrFirst32=%#RX32 GCPtrLast32=%#RX32 u32Limit=%#RX32\n", iSegReg, GCPtrFirst32, + GCPtrLast32, pSel->u32Limit)); + if (iSegReg == X86_SREG_SS) + hmR0VmxSetPendingXcptSS(pVCpu, 0); + else + hmR0VmxSetPendingXcptGP(pVCpu, 0); + return VINF_HM_PENDING_XCPT; + } + } + else + { + /* Check limits if it's an expand-down data segment. + Note! The upper boundary is defined by the B bit, not the G bit! */ + if ( GCPtrFirst32 < pSel->u32Limit + UINT32_C(1) + || GCPtrLast32 > (pSel->Attr.n.u1DefBig ? UINT32_MAX : UINT32_C(0xffff))) + { + Log4Func(("Expand-down data segment limit exceeded." + "iSegReg=%#x GCPtrFirst32=%#RX32 GCPtrLast32=%#RX32 u32Limit=%#RX32\n", iSegReg, GCPtrFirst32, + GCPtrLast32, pSel->u32Limit)); + if (iSegReg == X86_SREG_SS) + hmR0VmxSetPendingXcptSS(pVCpu, 0); + else + hmR0VmxSetPendingXcptGP(pVCpu, 0); + return VINF_HM_PENDING_XCPT; + } + } + } + else + { + /* Check permissions for the code segment. 
*/ + if ( enmMemAccess == VMXMEMACCESS_WRITE + || ( enmMemAccess == VMXMEMACCESS_READ + && !(pSel->Attr.n.u4Type & X86_SEL_TYPE_READ))) + { + Log4Func(("Code segment access invalid. Attr=%#RX32\n", pSel->Attr.u)); + Assert(!CPUMIsGuestInRealOrV86ModeEx(&pVCpu->cpum.GstCtx)); + hmR0VmxSetPendingXcptGP(pVCpu, 0); + return VINF_HM_PENDING_XCPT; + } + + /* Check limits for the code segment (normal/expand-down not applicable for code segments). */ + if ( GCPtrFirst32 > pSel->u32Limit + || GCPtrLast32 > pSel->u32Limit) + { + Log4Func(("Code segment limit exceeded. GCPtrFirst32=%#RX32 GCPtrLast32=%#RX32 u32Limit=%#RX32\n", + GCPtrFirst32, GCPtrLast32, pSel->u32Limit)); + if (iSegReg == X86_SREG_SS) + hmR0VmxSetPendingXcptSS(pVCpu, 0); + else + hmR0VmxSetPendingXcptGP(pVCpu, 0); + return VINF_HM_PENDING_XCPT; + } + } + } + else + { + Log4Func(("Not present or unusable segment. iSegReg=%#x Attr=%#RX32\n", iSegReg, pSel->Attr.u)); + hmR0VmxSetPendingXcptGP(pVCpu, 0); + return VINF_HM_PENDING_XCPT; + } + + *pGCPtrMem = GCPtrMem; + return VINF_SUCCESS; +} + + +/** + * Perform the relevant VMX instruction checks for VM-exits that occurred due to the + * guest attempting to execute a VMX instruction. + * + * @returns Strict VBox status code (i.e. informational status codes too). + * @retval VINF_SUCCESS if we should continue handling the VM-exit. + * @retval VINF_HM_PENDING_XCPT if an exception was raised. + * + * @param pVCpu The cross context virtual CPU structure. + * @param uExitReason The VM-exit reason. + * + * @todo NstVmx: Document other error codes when VM-exit is implemented. + * @remarks No-long-jump zone!!! 
+ */ +static VBOXSTRICTRC hmR0VmxCheckExitDueToVmxInstr(PVMCPU pVCpu, uint32_t uExitReason) +{ + HMVMX_CPUMCTX_ASSERT(pVCpu, CPUMCTX_EXTRN_CR4 | CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_RFLAGS | CPUMCTX_EXTRN_SS + | CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_EFER); + + if ( CPUMIsGuestInRealOrV86ModeEx(&pVCpu->cpum.GstCtx) + || ( CPUMIsGuestInLongModeEx(&pVCpu->cpum.GstCtx) + && !CPUMIsGuestIn64BitCodeEx(&pVCpu->cpum.GstCtx))) + { + Log4Func(("In real/v86-mode or long-mode outside 64-bit code segment -> #UD\n")); + hmR0VmxSetPendingXcptUD(pVCpu); + return VINF_HM_PENDING_XCPT; + } + + if (uExitReason == VMX_EXIT_VMXON) + { + /* + * We check CR4.VMXE because it is required to be always set while in VMX operation + * by physical CPUs and our CR4 read shadow is only consulted when executing specific + * instructions (CLTS, LMSW, MOV CR, and SMSW) and thus doesn't affect CPU operation + * otherwise (i.e. physical CPU won't automatically #UD if Cr4Shadow.VMXE is 0). + */ + if (!CPUMIsGuestVmxEnabled(&pVCpu->cpum.GstCtx)) + { + Log4Func(("CR4.VMXE is not set -> #UD\n")); + hmR0VmxSetPendingXcptUD(pVCpu); + return VINF_HM_PENDING_XCPT; + } + } + else if (!CPUMIsGuestInVmxRootMode(&pVCpu->cpum.GstCtx)) + { + /* + * The guest has not entered VMX operation but attempted to execute a VMX instruction + * (other than VMXON), we need to raise a #UD. + */ + Log4Func(("Not in VMX root mode -> #UD\n")); + hmR0VmxSetPendingXcptUD(pVCpu); + return VINF_HM_PENDING_XCPT; + } + + if (CPUMIsGuestInVmxNonRootMode(&pVCpu->cpum.GstCtx)) + { + /* + * The nested-guest attempted to execute a VMX instruction, cause a VM-exit and let + * the guest hypervisor deal with it. + */ + /** @todo NSTVMX: Trigger a VM-exit */ + } + + /* + * VMX instructions require CPL 0 except in VMX non-root mode where the VM-exit intercept + * (above) takes preceedence over the CPL check. 
+ */ + if (CPUMGetGuestCPL(pVCpu) > 0) + { + Log4Func(("CPL > 0 -> #GP(0)\n")); + hmR0VmxSetPendingXcptGP(pVCpu, 0); + return VINF_HM_PENDING_XCPT; + } + + return VINF_SUCCESS; +} +# endif /* !VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM */ +#endif /* VBOX_WITH_NESTED_HWVIRT_VMX */ + + +/** + * Handle a condition that occurred while delivering an event through the guest + * IDT. + * + * @returns Strict VBox status code (i.e. informational status codes too). + * @retval VINF_SUCCESS if we should continue handling the VM-exit. + * @retval VINF_HM_DOUBLE_FAULT if a \#DF condition was detected and we ought + * to continue execution of the guest which will delivery the \#DF. + * @retval VINF_EM_RESET if we detected a triple-fault condition. + * @retval VERR_EM_GUEST_CPU_HANG if we detected a guest CPU hang. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmxTransient Pointer to the VMX transient structure. + * + * @remarks No-long-jump zone!!! + */ +static VBOXSTRICTRC hmR0VmxCheckExitDueToEventDelivery(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + uint32_t const uExitVector = VMX_EXIT_INT_INFO_VECTOR(pVmxTransient->uExitIntInfo); + + int rc2 = hmR0VmxReadIdtVectoringInfoVmcs(pVmxTransient); + rc2 |= hmR0VmxReadExitIntInfoVmcs(pVmxTransient); + AssertRCReturn(rc2, rc2); + + VBOXSTRICTRC rcStrict = VINF_SUCCESS; + if (VMX_IDT_VECTORING_INFO_IS_VALID(pVmxTransient->uIdtVectoringInfo)) + { + uint32_t const uIdtVectorType = VMX_IDT_VECTORING_INFO_TYPE(pVmxTransient->uIdtVectoringInfo); + uint32_t const uIdtVector = VMX_IDT_VECTORING_INFO_VECTOR(pVmxTransient->uIdtVectoringInfo); + + /* + * If the event was a software interrupt (generated with INT n) or a software exception + * (generated by INT3/INTO) or a privileged software exception (generated by INT1), we + * can handle the VM-exit and continue guest execution which will re-execute the + * instruction rather than re-injecting the exception, as that can cause premature + * trips to ring-3 before 
injection and involve TRPM which currently has no way of + * storing that these exceptions were caused by these instructions (ICEBP's #DB poses + * the problem). + */ + IEMXCPTRAISE enmRaise; + IEMXCPTRAISEINFO fRaiseInfo; + if ( uIdtVectorType == VMX_IDT_VECTORING_INFO_TYPE_SW_INT + || uIdtVectorType == VMX_IDT_VECTORING_INFO_TYPE_SW_XCPT + || uIdtVectorType == VMX_IDT_VECTORING_INFO_TYPE_PRIV_SW_XCPT) + { + enmRaise = IEMXCPTRAISE_REEXEC_INSTR; + fRaiseInfo = IEMXCPTRAISEINFO_NONE; + } + else if (VMX_EXIT_INT_INFO_IS_VALID(pVmxTransient->uExitIntInfo)) + { + uint32_t const uExitVectorType = VMX_IDT_VECTORING_INFO_TYPE(pVmxTransient->uExitIntInfo); + uint32_t const fIdtVectorFlags = hmR0VmxGetIemXcptFlags(uIdtVector, uIdtVectorType); + uint32_t const fExitVectorFlags = hmR0VmxGetIemXcptFlags(uExitVector, uExitVectorType); + /** @todo Make AssertMsgReturn as just AssertMsg later. */ + AssertMsgReturn(uExitVectorType == VMX_EXIT_INT_INFO_TYPE_HW_XCPT, + ("hmR0VmxCheckExitDueToEventDelivery: Unexpected VM-exit interruption info. %#x!\n", + uExitVectorType), VERR_VMX_IPE_5); + + enmRaise = IEMEvaluateRecursiveXcpt(pVCpu, fIdtVectorFlags, uIdtVector, fExitVectorFlags, uExitVector, &fRaiseInfo); + + /* Determine a vectoring #PF condition, see comment in hmR0VmxExitXcptPF(). */ + if (fRaiseInfo & (IEMXCPTRAISEINFO_EXT_INT_PF | IEMXCPTRAISEINFO_NMI_PF)) + { + pVmxTransient->fVectoringPF = true; + enmRaise = IEMXCPTRAISE_PREV_EVENT; + } + } + else + { + /* + * If an exception or hardware interrupt delivery caused an EPT violation/misconfig or APIC access + * VM-exit, then the VM-exit interruption-information will not be valid and we end up here. + * It is sufficient to reflect the original event to the guest after handling the VM-exit. 
+ */ + Assert( uIdtVectorType == VMX_IDT_VECTORING_INFO_TYPE_HW_XCPT + || uIdtVectorType == VMX_IDT_VECTORING_INFO_TYPE_NMI + || uIdtVectorType == VMX_IDT_VECTORING_INFO_TYPE_EXT_INT); + enmRaise = IEMXCPTRAISE_PREV_EVENT; + fRaiseInfo = IEMXCPTRAISEINFO_NONE; + } + + /* + * On CPUs that support Virtual NMIs, if this VM-exit (be it an exception or EPT violation/misconfig + * etc.) occurred while delivering the NMI, we need to clear the block-by-NMI field in the guest + * interruptibility-state before re-delivering the NMI after handling the VM-exit. Otherwise the + * subsequent VM-entry would fail. + * + * See Intel spec. 30.7.1.2 "Resuming Guest Software after Handling an Exception". See @bugref{7445}. + */ + if ( VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS) + && uIdtVectorType == VMX_IDT_VECTORING_INFO_TYPE_NMI + && ( enmRaise == IEMXCPTRAISE_PREV_EVENT + || (fRaiseInfo & IEMXCPTRAISEINFO_NMI_PF)) + && (pVCpu->hm.s.vmx.u32PinCtls & VMX_PIN_CTLS_VIRT_NMI)) + { + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_BLOCK_NMIS); + } + + switch (enmRaise) + { + case IEMXCPTRAISE_CURRENT_XCPT: + { + Log4Func(("IDT: Pending secondary Xcpt: uIdtVectoringInfo=%#RX64 uExitIntInfo=%#RX64\n", + pVmxTransient->uIdtVectoringInfo, pVmxTransient->uExitIntInfo)); + Assert(rcStrict == VINF_SUCCESS); + break; + } + + case IEMXCPTRAISE_PREV_EVENT: + { + uint32_t u32ErrCode; + if (VMX_IDT_VECTORING_INFO_IS_ERROR_CODE_VALID(pVmxTransient->uIdtVectoringInfo)) + { + rc2 = hmR0VmxReadIdtVectoringErrorCodeVmcs(pVmxTransient); + AssertRCReturn(rc2, rc2); + u32ErrCode = pVmxTransient->uIdtVectoringErrorCode; + } + else + u32ErrCode = 0; + + /* If uExitVector is #PF, CR2 value will be updated from the VMCS if it's a guest #PF, see hmR0VmxExitXcptPF(). 
*/ + STAM_COUNTER_INC(&pVCpu->hm.s.StatInjectPendingReflect); + hmR0VmxSetPendingEvent(pVCpu, VMX_ENTRY_INT_INFO_FROM_EXIT_IDT_INFO(pVmxTransient->uIdtVectoringInfo), + 0 /* cbInstr */, u32ErrCode, pVCpu->cpum.GstCtx.cr2); + + Log4Func(("IDT: Pending vectoring event %#RX64 Err=%#RX32\n", pVCpu->hm.s.Event.u64IntInfo, + pVCpu->hm.s.Event.u32ErrCode)); + Assert(rcStrict == VINF_SUCCESS); + break; + } + + case IEMXCPTRAISE_REEXEC_INSTR: + Assert(rcStrict == VINF_SUCCESS); + break; + + case IEMXCPTRAISE_DOUBLE_FAULT: + { + /* + * Determing a vectoring double #PF condition. Used later, when PGM evaluates the + * second #PF as a guest #PF (and not a shadow #PF) and needs to be converted into a #DF. + */ + if (fRaiseInfo & IEMXCPTRAISEINFO_PF_PF) + { + pVmxTransient->fVectoringDoublePF = true; + Log4Func(("IDT: Vectoring double #PF %#RX64 cr2=%#RX64\n", pVCpu->hm.s.Event.u64IntInfo, + pVCpu->cpum.GstCtx.cr2)); + rcStrict = VINF_SUCCESS; + } + else + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatInjectPendingReflect); + hmR0VmxSetPendingXcptDF(pVCpu); + Log4Func(("IDT: Pending vectoring #DF %#RX64 uIdtVector=%#x uExitVector=%#x\n", pVCpu->hm.s.Event.u64IntInfo, + uIdtVector, uExitVector)); + rcStrict = VINF_HM_DOUBLE_FAULT; + } + break; + } + + case IEMXCPTRAISE_TRIPLE_FAULT: + { + Log4Func(("IDT: Pending vectoring triple-fault uIdt=%#x uExit=%#x\n", uIdtVector, uExitVector)); + rcStrict = VINF_EM_RESET; + break; + } + + case IEMXCPTRAISE_CPU_HANG: + { + Log4Func(("IDT: Bad guest! Entering CPU hang. fRaiseInfo=%#x\n", fRaiseInfo)); + rcStrict = VERR_EM_GUEST_CPU_HANG; + break; + } + + default: + { + AssertMsgFailed(("IDT: vcpu[%RU32] Unexpected/invalid value! 
enmRaise=%#x\n", pVCpu->idCpu, enmRaise)); + rcStrict = VERR_VMX_IPE_2; + break; + } + } + } + else if ( VMX_EXIT_INT_INFO_IS_VALID(pVmxTransient->uExitIntInfo) + && VMX_EXIT_INT_INFO_IS_NMI_UNBLOCK_IRET(pVmxTransient->uExitIntInfo) + && uExitVector != X86_XCPT_DF + && (pVCpu->hm.s.vmx.u32PinCtls & VMX_PIN_CTLS_VIRT_NMI)) + { + /* + * Execution of IRET caused this fault when NMI blocking was in effect (i.e we're in the guest NMI handler). + * We need to set the block-by-NMI field so that NMIs remain blocked until the IRET execution is restarted. + * See Intel spec. 30.7.1.2 "Resuming guest software after handling an exception". + */ + if (!VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS)) + { + Log4Func(("Setting VMCPU_FF_BLOCK_NMIS. fValid=%RTbool uExitReason=%u\n", + VMX_EXIT_INT_INFO_IS_VALID(pVmxTransient->uExitIntInfo), pVmxTransient->uExitReason)); + VMCPU_FF_SET(pVCpu, VMCPU_FF_BLOCK_NMIS); + } + } + + Assert( rcStrict == VINF_SUCCESS || rcStrict == VINF_HM_DOUBLE_FAULT + || rcStrict == VINF_EM_RESET || rcStrict == VERR_EM_GUEST_CPU_HANG); + return rcStrict; +} + + +/** + * Imports a guest segment register from the current VMCS into + * the guest-CPU context. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param idxSel Index of the selector in the VMCS. + * @param idxLimit Index of the segment limit in the VMCS. + * @param idxBase Index of the segment base in the VMCS. + * @param idxAccess Index of the access rights of the segment in the VMCS. + * @param pSelReg Pointer to the segment selector. + * + * @remarks Called with interrupts and/or preemption disabled, try not to assert and + * do not log! + * + * @remarks Never call this function directly!!! Use the + * HMVMX_IMPORT_SREG() macro as that takes care + * of whether to read from the VMCS cache or not. 
+ */ +static int hmR0VmxImportGuestSegmentReg(PVMCPU pVCpu, uint32_t idxSel, uint32_t idxLimit, uint32_t idxBase, uint32_t idxAccess, + PCPUMSELREG pSelReg) +{ + NOREF(pVCpu); + + uint32_t u32Sel; + uint32_t u32Limit; + uint32_t u32Attr; + uint64_t u64Base; + int rc = VMXReadVmcs32(idxSel, &u32Sel); + rc |= VMXReadVmcs32(idxLimit, &u32Limit); + rc |= VMXReadVmcs32(idxAccess, &u32Attr); + rc |= VMXReadVmcsGstNByIdxVal(idxBase, &u64Base); + AssertRCReturn(rc, rc); + + pSelReg->Sel = (uint16_t)u32Sel; + pSelReg->ValidSel = (uint16_t)u32Sel; + pSelReg->fFlags = CPUMSELREG_FLAGS_VALID; + pSelReg->u32Limit = u32Limit; + pSelReg->u64Base = u64Base; + pSelReg->Attr.u = u32Attr; + + /* + * If VT-x marks the segment as unusable, most other bits remain undefined: + * - For CS the L, D and G bits have meaning. + * - For SS the DPL has meaning (it -is- the CPL for Intel and VBox). + * - For the remaining data segments no bits are defined. + * + * The present bit and the unusable bit has been observed to be set at the + * same time (the selector was supposed to be invalid as we started executing + * a V8086 interrupt in ring-0). + * + * What should be important for the rest of the VBox code, is that the P bit is + * cleared. Some of the other VBox code recognizes the unusable bit, but + * AMD-V certainly don't, and REM doesn't really either. So, to be on the + * safe side here, we'll strip off P and other bits we don't care about. If + * any code breaks because Attr.u != 0 when Sel < 4, it should be fixed. + * + * See Intel spec. 27.3.2 "Saving Segment Registers and Descriptor-Table Registers". + */ + if (pSelReg->Attr.u & X86DESCATTR_UNUSABLE) + { + Assert(idxSel != VMX_VMCS16_GUEST_TR_SEL); /* TR is the only selector that can never be unusable. */ + + /* Masking off: X86DESCATTR_P, X86DESCATTR_LIMIT_HIGH, and X86DESCATTR_AVL. The latter two are really irrelevant. 
*/ + pSelReg->Attr.u &= X86DESCATTR_UNUSABLE | X86DESCATTR_L | X86DESCATTR_D | X86DESCATTR_G + | X86DESCATTR_DPL | X86DESCATTR_TYPE | X86DESCATTR_DT; +#ifdef VBOX_STRICT + VMMRZCallRing3Disable(pVCpu); + Log4Func(("Unusable idxSel=%#x attr=%#x -> %#x\n", idxSel, u32Sel, pSelReg->Attr.u)); +# ifdef DEBUG_bird + AssertMsg((u32Attr & ~X86DESCATTR_P) == pSelReg->Attr.u, + ("%#x: %#x != %#x (sel=%#x base=%#llx limit=%#x)\n", + idxSel, u32Sel, pSelReg->Attr.u, pSelReg->Sel, pSelReg->u64Base, pSelReg->u32Limit)); +# endif + VMMRZCallRing3Enable(pVCpu); +#endif + } + return VINF_SUCCESS; +} + + +/** + * Imports the guest RIP from the VMCS back into the guest-CPU context. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Called with interrupts and/or preemption disabled, should not assert! + * @remarks Do -not- call this function directly, use hmR0VmxImportGuestState() + * instead!!! + */ +DECLINLINE(int) hmR0VmxImportGuestRip(PVMCPU pVCpu) +{ + uint64_t u64Val; + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if (pCtx->fExtrn & CPUMCTX_EXTRN_RIP) + { + int rc = VMXReadVmcsGstN(VMX_VMCS_GUEST_RIP, &u64Val); + if (RT_SUCCESS(rc)) + { + pCtx->rip = u64Val; + EMR0HistoryUpdatePC(pVCpu, pCtx->rip, false); + pCtx->fExtrn &= ~CPUMCTX_EXTRN_RIP; + } + return rc; + } + return VINF_SUCCESS; +} + + +/** + * Imports the guest RFLAGS from the VMCS back into the guest-CPU context. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Called with interrupts and/or preemption disabled, should not assert! + * @remarks Do -not- call this function directly, use hmR0VmxImportGuestState() + * instead!!! 
+ */ +DECLINLINE(int) hmR0VmxImportGuestRFlags(PVMCPU pVCpu) +{ + uint32_t u32Val; + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if (pCtx->fExtrn & CPUMCTX_EXTRN_RFLAGS) + { + int rc = VMXReadVmcs32(VMX_VMCS_GUEST_RFLAGS, &u32Val); + if (RT_SUCCESS(rc)) + { + pCtx->eflags.u32 = u32Val; + + /* Restore eflags for real-on-v86-mode hack. */ + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + { + pCtx->eflags.Bits.u1VM = 0; + pCtx->eflags.Bits.u2IOPL = pVCpu->hm.s.vmx.RealMode.Eflags.Bits.u2IOPL; + } + } + pCtx->fExtrn &= ~CPUMCTX_EXTRN_RFLAGS; + return rc; + } + return VINF_SUCCESS; +} + + +/** + * Imports the guest interruptibility-state from the VMCS back into the guest-CPU + * context. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Called with interrupts and/or preemption disabled, try not to assert and + * do not log! + * @remarks Do -not- call this function directly, use hmR0VmxImportGuestState() + * instead!!! + */ +DECLINLINE(int) hmR0VmxImportGuestIntrState(PVMCPU pVCpu) +{ + uint32_t u32Val; + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + int rc = VMXReadVmcs32(VMX_VMCS32_GUEST_INT_STATE, &u32Val); + AssertRCReturn(rc, rc); + + /* + * We additionally have a requirement to import RIP, RFLAGS depending on whether we + * might need them in hmR0VmxEvaluatePendingEvent(). 
+ */ + if (!u32Val) + { + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS)) + { + rc = hmR0VmxImportGuestRip(pVCpu); + rc |= hmR0VmxImportGuestRFlags(pVCpu); + AssertRCReturn(rc, rc); + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS); + } + + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS)) + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_BLOCK_NMIS); + } + else + { + rc = hmR0VmxImportGuestRip(pVCpu); + rc |= hmR0VmxImportGuestRFlags(pVCpu); + AssertRCReturn(rc, rc); + + if (u32Val & ( VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS + | VMX_VMCS_GUEST_INT_STATE_BLOCK_STI)) + { + EMSetInhibitInterruptsPC(pVCpu, pCtx->rip); + } + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS)) + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS); + + if (u32Val & VMX_VMCS_GUEST_INT_STATE_BLOCK_NMI) + { + if (!VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS)) + VMCPU_FF_SET(pVCpu, VMCPU_FF_BLOCK_NMIS); + } + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS)) + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_BLOCK_NMIS); + } + + return VINF_SUCCESS; +} + + +/** + * Worker for VMXR0ImportStateOnDemand. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param fWhat What to import, CPUMCTX_EXTRN_XXX. + */ +static int hmR0VmxImportGuestState(PVMCPU pVCpu, uint64_t fWhat) +{ +#define VMXLOCAL_BREAK_RC(a_rc) \ + if (RT_FAILURE(a_rc)) \ + break + + int rc = VINF_SUCCESS; + PVM pVM = pVCpu->CTX_SUFF(pVM); + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + uint64_t u64Val; + uint32_t u32Val; + + Log4Func(("fExtrn=%#RX64 fWhat=%#RX64\n", pCtx->fExtrn, fWhat)); + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatImportGuestState, x); + + /* + * We disable interrupts to make the updating of the state and in particular + * the fExtrn modification atomic wrt to preemption hooks. 
+ */ + RTCCUINTREG const fEFlags = ASMIntDisableFlags(); + + fWhat &= pCtx->fExtrn; + if (fWhat) + { + do + { + if (fWhat & CPUMCTX_EXTRN_RIP) + { + rc = hmR0VmxImportGuestRip(pVCpu); + VMXLOCAL_BREAK_RC(rc); + } + + if (fWhat & CPUMCTX_EXTRN_RFLAGS) + { + rc = hmR0VmxImportGuestRFlags(pVCpu); + VMXLOCAL_BREAK_RC(rc); + } + + if (fWhat & CPUMCTX_EXTRN_HM_VMX_INT_STATE) + { + rc = hmR0VmxImportGuestIntrState(pVCpu); + VMXLOCAL_BREAK_RC(rc); + } + + if (fWhat & CPUMCTX_EXTRN_RSP) + { + rc = VMXReadVmcsGstN(VMX_VMCS_GUEST_RSP, &u64Val); + VMXLOCAL_BREAK_RC(rc); + pCtx->rsp = u64Val; + } + + if (fWhat & CPUMCTX_EXTRN_SREG_MASK) + { + if (fWhat & CPUMCTX_EXTRN_CS) + { + rc = HMVMX_IMPORT_SREG(CS, &pCtx->cs); + rc |= hmR0VmxImportGuestRip(pVCpu); + VMXLOCAL_BREAK_RC(rc); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + pCtx->cs.Attr.u = pVCpu->hm.s.vmx.RealMode.AttrCS.u; + EMR0HistoryUpdatePC(pVCpu, pCtx->cs.u64Base + pCtx->rip, true); + } + if (fWhat & CPUMCTX_EXTRN_SS) + { + rc = HMVMX_IMPORT_SREG(SS, &pCtx->ss); + VMXLOCAL_BREAK_RC(rc); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + pCtx->ss.Attr.u = pVCpu->hm.s.vmx.RealMode.AttrSS.u; + } + if (fWhat & CPUMCTX_EXTRN_DS) + { + rc = HMVMX_IMPORT_SREG(DS, &pCtx->ds); + VMXLOCAL_BREAK_RC(rc); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + pCtx->ds.Attr.u = pVCpu->hm.s.vmx.RealMode.AttrDS.u; + } + if (fWhat & CPUMCTX_EXTRN_ES) + { + rc = HMVMX_IMPORT_SREG(ES, &pCtx->es); + VMXLOCAL_BREAK_RC(rc); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + pCtx->es.Attr.u = pVCpu->hm.s.vmx.RealMode.AttrES.u; + } + if (fWhat & CPUMCTX_EXTRN_FS) + { + rc = HMVMX_IMPORT_SREG(FS, &pCtx->fs); + VMXLOCAL_BREAK_RC(rc); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + pCtx->fs.Attr.u = pVCpu->hm.s.vmx.RealMode.AttrFS.u; + } + if (fWhat & CPUMCTX_EXTRN_GS) + { + rc = HMVMX_IMPORT_SREG(GS, &pCtx->gs); + VMXLOCAL_BREAK_RC(rc); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + pCtx->gs.Attr.u = pVCpu->hm.s.vmx.RealMode.AttrGS.u; + } 
+ } + + if (fWhat & CPUMCTX_EXTRN_TABLE_MASK) + { + if (fWhat & CPUMCTX_EXTRN_LDTR) + { + rc = HMVMX_IMPORT_SREG(LDTR, &pCtx->ldtr); + VMXLOCAL_BREAK_RC(rc); + } + + if (fWhat & CPUMCTX_EXTRN_GDTR) + { + rc = VMXReadVmcsGstN(VMX_VMCS_GUEST_GDTR_BASE, &u64Val); + rc |= VMXReadVmcs32(VMX_VMCS32_GUEST_GDTR_LIMIT, &u32Val); + VMXLOCAL_BREAK_RC(rc); + pCtx->gdtr.pGdt = u64Val; + pCtx->gdtr.cbGdt = u32Val; + } + + /* Guest IDTR. */ + if (fWhat & CPUMCTX_EXTRN_IDTR) + { + rc = VMXReadVmcsGstN(VMX_VMCS_GUEST_IDTR_BASE, &u64Val); + rc |= VMXReadVmcs32(VMX_VMCS32_GUEST_IDTR_LIMIT, &u32Val); + VMXLOCAL_BREAK_RC(rc); + pCtx->idtr.pIdt = u64Val; + pCtx->idtr.cbIdt = u32Val; + } + + /* Guest TR. */ + if (fWhat & CPUMCTX_EXTRN_TR) + { + /* Real-mode emulation using virtual-8086 mode has the fake TSS (pRealModeTSS) in TR, don't save that one. */ + if (!pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + { + rc = HMVMX_IMPORT_SREG(TR, &pCtx->tr); + VMXLOCAL_BREAK_RC(rc); + } + } + } + + if (fWhat & CPUMCTX_EXTRN_SYSENTER_MSRS) + { + rc = VMXReadVmcsGstN(VMX_VMCS_GUEST_SYSENTER_EIP, &pCtx->SysEnter.eip); + rc |= VMXReadVmcsGstN(VMX_VMCS_GUEST_SYSENTER_ESP, &pCtx->SysEnter.esp); + rc |= VMXReadVmcs32(VMX_VMCS32_GUEST_SYSENTER_CS, &u32Val); + pCtx->SysEnter.cs = u32Val; + VMXLOCAL_BREAK_RC(rc); + } + +#if HC_ARCH_BITS == 64 + if (fWhat & CPUMCTX_EXTRN_KERNEL_GS_BASE) + { + if ( pVM->hm.s.fAllow64BitGuests + && (pVCpu->hm.s.vmx.fLazyMsrs & VMX_LAZY_MSRS_LOADED_GUEST)) + pCtx->msrKERNELGSBASE = ASMRdMsr(MSR_K8_KERNEL_GS_BASE); + } + + if (fWhat & CPUMCTX_EXTRN_SYSCALL_MSRS) + { + if ( pVM->hm.s.fAllow64BitGuests + && (pVCpu->hm.s.vmx.fLazyMsrs & VMX_LAZY_MSRS_LOADED_GUEST)) + { + pCtx->msrLSTAR = ASMRdMsr(MSR_K8_LSTAR); + pCtx->msrSTAR = ASMRdMsr(MSR_K6_STAR); + pCtx->msrSFMASK = ASMRdMsr(MSR_K8_SF_MASK); + } + } +#endif + + if ( (fWhat & (CPUMCTX_EXTRN_TSC_AUX | CPUMCTX_EXTRN_OTHER_MSRS)) +#if HC_ARCH_BITS == 32 + || (fWhat & (CPUMCTX_EXTRN_KERNEL_GS_BASE | CPUMCTX_EXTRN_SYSCALL_MSRS)) 
+#endif + ) + { + PCVMXAUTOMSR pMsr = (PVMXAUTOMSR)pVCpu->hm.s.vmx.pvGuestMsr; + uint32_t const cMsrs = pVCpu->hm.s.vmx.cMsrs; + for (uint32_t i = 0; i < cMsrs; i++, pMsr++) + { + switch (pMsr->u32Msr) + { +#if HC_ARCH_BITS == 32 + case MSR_K8_LSTAR: pCtx->msrLSTAR = pMsr->u64Value; break; + case MSR_K6_STAR: pCtx->msrSTAR = pMsr->u64Value; break; + case MSR_K8_SF_MASK: pCtx->msrSFMASK = pMsr->u64Value; break; + case MSR_K8_KERNEL_GS_BASE: pCtx->msrKERNELGSBASE = pMsr->u64Value; break; +#endif + case MSR_IA32_SPEC_CTRL: CPUMSetGuestSpecCtrl(pVCpu, pMsr->u64Value); break; + case MSR_K8_TSC_AUX: CPUMSetGuestTscAux(pVCpu, pMsr->u64Value); break; + case MSR_K6_EFER: /* EFER can't be changed without causing a VM-exit */ break; + default: + { + pVCpu->hm.s.u32HMError = pMsr->u32Msr; + ASMSetFlags(fEFlags); + AssertMsgFailed(("Unexpected MSR in auto-load/store area. uMsr=%#RX32 cMsrs=%u\n", pMsr->u32Msr, + cMsrs)); + return VERR_HM_UNEXPECTED_LD_ST_MSR; + } + } + } + } + + if (fWhat & CPUMCTX_EXTRN_DR7) + { + if (!pVCpu->hm.s.fUsingHyperDR7) + { + /* Upper 32-bits are always zero. See Intel spec. 2.7.3 "Loading and Storing Debug Registers". */ + rc = VMXReadVmcs32(VMX_VMCS_GUEST_DR7, &u32Val); + VMXLOCAL_BREAK_RC(rc); + pCtx->dr[7] = u32Val; + } + } + + if (fWhat & CPUMCTX_EXTRN_CR_MASK) + { + uint32_t u32Shadow; + if (fWhat & CPUMCTX_EXTRN_CR0) + { + rc = VMXReadVmcs32(VMX_VMCS_GUEST_CR0, &u32Val); + rc |= VMXReadVmcs32(VMX_VMCS_CTRL_CR0_READ_SHADOW, &u32Shadow); + VMXLOCAL_BREAK_RC(rc); + u32Val = (u32Val & ~pVCpu->hm.s.vmx.u32Cr0Mask) + | (u32Shadow & pVCpu->hm.s.vmx.u32Cr0Mask); + VMMRZCallRing3Disable(pVCpu); /* Calls into PGM which has Log statements. 
*/ + CPUMSetGuestCR0(pVCpu, u32Val); + VMMRZCallRing3Enable(pVCpu); + } + + if (fWhat & CPUMCTX_EXTRN_CR4) + { + rc = VMXReadVmcs32(VMX_VMCS_GUEST_CR4, &u32Val); + rc |= VMXReadVmcs32(VMX_VMCS_CTRL_CR4_READ_SHADOW, &u32Shadow); + VMXLOCAL_BREAK_RC(rc); + u32Val = (u32Val & ~pVCpu->hm.s.vmx.u32Cr4Mask) + | (u32Shadow & pVCpu->hm.s.vmx.u32Cr4Mask); + CPUMSetGuestCR4(pVCpu, u32Val); + } + + if (fWhat & CPUMCTX_EXTRN_CR3) + { + /* CR0.PG bit changes are always intercepted, so it's up to date. */ + if ( pVM->hm.s.vmx.fUnrestrictedGuest + || ( pVM->hm.s.fNestedPaging + && CPUMIsGuestPagingEnabledEx(pCtx))) + { + rc = VMXReadVmcsGstN(VMX_VMCS_GUEST_CR3, &u64Val); + if (pCtx->cr3 != u64Val) + { + CPUMSetGuestCR3(pVCpu, u64Val); + VMCPU_FF_SET(pVCpu, VMCPU_FF_HM_UPDATE_CR3); + } + + /* If the guest is in PAE mode, sync back the PDPE's into the guest state. + Note: CR4.PAE, CR0.PG, EFER bit changes are always intercepted, so they're up to date. */ + if (CPUMIsGuestInPAEModeEx(pCtx)) + { + rc = VMXReadVmcs64(VMX_VMCS64_GUEST_PDPTE0_FULL, &pVCpu->hm.s.aPdpes[0].u); + rc |= VMXReadVmcs64(VMX_VMCS64_GUEST_PDPTE1_FULL, &pVCpu->hm.s.aPdpes[1].u); + rc |= VMXReadVmcs64(VMX_VMCS64_GUEST_PDPTE2_FULL, &pVCpu->hm.s.aPdpes[2].u); + rc |= VMXReadVmcs64(VMX_VMCS64_GUEST_PDPTE3_FULL, &pVCpu->hm.s.aPdpes[3].u); + VMXLOCAL_BREAK_RC(rc); + VMCPU_FF_SET(pVCpu, VMCPU_FF_HM_UPDATE_PAE_PDPES); + } + } + } + } + } while (0); + + if (RT_SUCCESS(rc)) + { + /* Update fExtrn. */ + pCtx->fExtrn &= ~fWhat; + + /* If everything has been imported, clear the HM keeper bit. */ + if (!(pCtx->fExtrn & HMVMX_CPUMCTX_EXTRN_ALL)) + { + pCtx->fExtrn &= ~CPUMCTX_EXTRN_KEEPER_HM; + Assert(!pCtx->fExtrn); + } + } + } + else + AssertMsg(!pCtx->fExtrn || (pCtx->fExtrn & HMVMX_CPUMCTX_EXTRN_ALL), ("%#RX64\n", pCtx->fExtrn)); + + ASMSetFlags(fEFlags); + + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatImportGuestState, x); + + /* + * Honor any pending CR3 updates. 
+ * + * Consider this scenario: VM-exit -> VMMRZCallRing3Enable() -> do stuff that causes a longjmp -> hmR0VmxCallRing3Callback() + * -> VMMRZCallRing3Disable() -> hmR0VmxImportGuestState() -> Sets VMCPU_FF_HM_UPDATE_CR3 pending -> return from the longjmp + * -> continue with VM-exit handling -> hmR0VmxImportGuestState() and here we are. + * + * The reason for such complicated handling is because VM-exits that call into PGM expect CR3 to be up-to-date and thus + * if any CR3-saves -before- the VM-exit (longjmp) postponed the CR3 update via the force-flag, any VM-exit handler that + * calls into PGM when it re-saves CR3 will end up here and we call PGMUpdateCR3(). This is why the code below should + * -NOT- check if CPUMCTX_EXTRN_CR3 is set! + * + * The longjmp exit path can't check these CR3 force-flags and call code that takes a lock again. We cover for it here. + */ + if (VMMRZCallRing3IsEnabled(pVCpu)) + { + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_CR3)) + { + Assert(!(ASMAtomicUoReadU64(&pCtx->fExtrn) & CPUMCTX_EXTRN_CR3)); + PGMUpdateCR3(pVCpu, CPUMGetGuestCR3(pVCpu)); + } + + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_PAE_PDPES)) + PGMGstUpdatePaePdpes(pVCpu, &pVCpu->hm.s.aPdpes[0]); + + Assert(!VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_CR3)); + Assert(!VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_PAE_PDPES)); + } + + return VINF_SUCCESS; +#undef VMXLOCAL_BREAK_RC +} + + +/** + * Saves the guest state from the VMCS into the guest-CPU context. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @param fWhat What to import, CPUMCTX_EXTRN_XXX. + */ +VMMR0DECL(int) VMXR0ImportStateOnDemand(PVMCPU pVCpu, uint64_t fWhat) +{ + return hmR0VmxImportGuestState(pVCpu, fWhat); +} + + +/** + * Check per-VM and per-VCPU force flag actions that require us to go back to + * ring-3 for one reason or another. + * + * @returns Strict VBox status code (i.e. 
informational status codes too)
 * @retval  VINF_SUCCESS if we don't have any actions that require going back to
 *          ring-3.
 * @retval  VINF_PGM_SYNC_CR3 if we have pending PGM CR3 sync.
 * @retval  VINF_EM_PENDING_REQUEST if we have pending requests (like hardware
 *          interrupts)
 * @retval  VINF_PGM_POOL_FLUSH_PENDING if PGM is doing a pool flush and requires
 *          all EMTs to be in ring-3.
 * @retval  VINF_EM_RAW_TO_R3 if there is pending DMA requests.
 * @retval  VINF_EM_NO_MEMORY PGM is out of memory, we need to return
 *          to the EM loop.
 *
 * @param   pVCpu           The cross context virtual CPU structure.
 * @param   fStepping       Running in hmR0VmxRunGuestCodeStep().
 */
static VBOXSTRICTRC hmR0VmxCheckForceFlags(PVMCPU pVCpu, bool fStepping)
{
    Assert(VMMRZCallRing3IsEnabled(pVCpu));

    /*
     * Anything pending?  Should be more likely than not if we're doing a good job.
     *
     * A different pair of VM/VCPU masks is consulted when single-stepping; if none
     * of the selected force-flags is set there is nothing to do.
     */
    PVM pVM = pVCpu->CTX_SUFF(pVM);
    if (  !fStepping
        ? !VM_FF_IS_ANY_SET(pVM, VM_FF_HP_R0_PRE_HM_MASK)
          && !VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_HP_R0_PRE_HM_MASK)
        : !VM_FF_IS_ANY_SET(pVM, VM_FF_HP_R0_PRE_HM_STEP_MASK)
          && !VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_HP_R0_PRE_HM_STEP_MASK) )
        return VINF_SUCCESS;

    /*
     * The checks below are evaluated in order; the first one that fires decides
     * the status code returned to the caller.
     */

    /* Pending PGM CR3 sync. */
    if (VMCPU_FF_IS_ANY_SET(pVCpu,VMCPU_FF_PGM_SYNC_CR3 | VMCPU_FF_PGM_SYNC_CR3_NON_GLOBAL))
    {
        PCPUMCTX pCtx = &pVCpu->cpum.GstCtx;
        /* CR0, CR3 and CR4 are read from the guest-CPU context below, so they must not be marked as external. */
        Assert(!(ASMAtomicUoReadU64(&pCtx->fExtrn) & (CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_CR3 | CPUMCTX_EXTRN_CR4)));
        VBOXSTRICTRC rcStrict2 = PGMSyncCR3(pVCpu, pCtx->cr0, pCtx->cr3, pCtx->cr4,
                                            VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3));
        if (rcStrict2 != VINF_SUCCESS)
        {
            AssertRC(VBOXSTRICTRC_VAL(rcStrict2));
            Log4Func(("PGMSyncCR3 forcing us back to ring-3. rc2=%d\n", VBOXSTRICTRC_VAL(rcStrict2)));
            return rcStrict2;
        }
    }

    /* Pending HM-to-R3 operations (critsects, timers, EMT rendezvous etc.) */
    if (   VM_FF_IS_ANY_SET(pVM, VM_FF_HM_TO_R3_MASK)
        || VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_HM_TO_R3_MASK))
    {
        STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchHmToR3FF);
        /* The out-of-memory force-flag selects the status code used for the trip to ring-3. */
        int rc2 = RT_LIKELY(!VM_FF_IS_SET(pVM, VM_FF_PGM_NO_MEMORY)) ? VINF_EM_RAW_TO_R3 : VINF_EM_NO_MEMORY;
        Log4Func(("HM_TO_R3 forcing us back to ring-3. rc=%d\n", rc2));
        return rc2;
    }

    /* Pending VM request packets, such as hardware interrupts. */
    if (   VM_FF_IS_SET(pVM, VM_FF_REQUEST)
        || VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_REQUEST))
    {
        Log4Func(("Pending VM request forcing us back to ring-3\n"));
        return VINF_EM_PENDING_REQUEST;
    }

    /* Pending PGM pool flushes. */
    if (VM_FF_IS_SET(pVM, VM_FF_PGM_POOL_FLUSH_PENDING))
    {
        Log4Func(("PGM pool flush pending forcing us back to ring-3\n"));
        return VINF_PGM_POOL_FLUSH_PENDING;
    }

    /* Pending DMA requests. */
    if (VM_FF_IS_SET(pVM, VM_FF_PDM_DMA))
    {
        Log4Func(("Pending DMA request forcing us back to ring-3\n"));
        return VINF_EM_RAW_TO_R3;
    }

    return VINF_SUCCESS;
}


/**
 * Converts any TRPM trap into a pending HM event. This is typically used when
 * entering from ring-3 (not longjmp returns).
 *
 * The trap is queried from TRPM, translated into an interruption-information
 * value, reset in TRPM and finally recorded via hmR0VmxSetPendingEvent().
 *
 * @param   pVCpu           The cross context virtual CPU structure.
 */
static void hmR0VmxTrpmTrapToPendingEvent(PVMCPU pVCpu)
{
    Assert(TRPMHasTrap(pVCpu));
    Assert(!pVCpu->hm.s.Event.fPending);

    uint8_t     uVector;
    TRPMEVENT   enmTrpmEvent;
    RTGCUINT    uErrCode;
    RTGCUINTPTR GCPtrFaultAddress;
    uint8_t     cbInstr;

    int rc = TRPMQueryTrapAll(pVCpu, &uVector, &enmTrpmEvent, &uErrCode, &GCPtrFaultAddress, &cbInstr);
    AssertRC(rc);

    /* Refer Intel spec. 24.8.3 "VM-entry Controls for Event Injection" for the format of u32IntInfo. */
    uint32_t u32IntInfo = uVector | VMX_EXIT_INT_INFO_VALID;
    if (enmTrpmEvent == TRPM_TRAP)
    {
        switch (uVector)
        {
            case X86_XCPT_NMI:
                u32IntInfo |= (VMX_EXIT_INT_INFO_TYPE_NMI << VMX_EXIT_INT_INFO_TYPE_SHIFT);
                break;

            case X86_XCPT_BP:
            case X86_XCPT_OF:
                u32IntInfo |= (VMX_EXIT_INT_INFO_TYPE_SW_XCPT << VMX_EXIT_INT_INFO_TYPE_SHIFT);
                break;

            /* These vectors additionally get the error-code-valid bit before being typed as hardware exceptions. */
            case X86_XCPT_PF:
            case X86_XCPT_DF:
            case X86_XCPT_TS:
            case X86_XCPT_NP:
            case X86_XCPT_SS:
            case X86_XCPT_GP:
            case X86_XCPT_AC:
                u32IntInfo |= VMX_EXIT_INT_INFO_ERROR_CODE_VALID;
                RT_FALL_THRU();
            default:
                u32IntInfo |= (VMX_EXIT_INT_INFO_TYPE_HW_XCPT << VMX_EXIT_INT_INFO_TYPE_SHIFT);
                break;
        }
    }
    else if (enmTrpmEvent == TRPM_HARDWARE_INT)
        u32IntInfo |= (VMX_EXIT_INT_INFO_TYPE_EXT_INT << VMX_EXIT_INT_INFO_TYPE_SHIFT);
    else if (enmTrpmEvent == TRPM_SOFTWARE_INT)
        u32IntInfo |= (VMX_EXIT_INT_INFO_TYPE_SW_INT << VMX_EXIT_INT_INFO_TYPE_SHIFT);
    else
        AssertMsgFailed(("Invalid TRPM event type %d\n", enmTrpmEvent));

    /* The trap now lives in the HM pending-event state, so drop it from TRPM. */
    rc = TRPMResetTrap(pVCpu);
    AssertRC(rc);
    Log4(("TRPM->HM event: u32IntInfo=%#RX32 enmTrpmEvent=%d cbInstr=%u uErrCode=%#RX32 GCPtrFaultAddress=%#RGv\n",
          u32IntInfo, enmTrpmEvent, cbInstr, uErrCode, GCPtrFaultAddress));

    hmR0VmxSetPendingEvent(pVCpu, u32IntInfo, cbInstr, uErrCode, GCPtrFaultAddress);
}


/**
 * Converts the pending HM event into a TRPM trap.
 *
 * Besides asserting the trap in TRPM this also writes zero to the VM-entry
 * interruption-information field of the VMCS and clears the pending flag of
 * the HM event.
 *
 * @param   pVCpu           The cross context virtual CPU structure.
 */
static void hmR0VmxPendingEventToTrpmTrap(PVMCPU pVCpu)
{
    Assert(pVCpu->hm.s.Event.fPending);

    uint32_t uVectorType     = VMX_IDT_VECTORING_INFO_TYPE(pVCpu->hm.s.Event.u64IntInfo);
    uint32_t uVector         = VMX_IDT_VECTORING_INFO_VECTOR(pVCpu->hm.s.Event.u64IntInfo);
    bool     fErrorCodeValid = VMX_IDT_VECTORING_INFO_IS_ERROR_CODE_VALID(pVCpu->hm.s.Event.u64IntInfo);
    uint32_t uErrorCode      = pVCpu->hm.s.Event.u32ErrCode;

    /* If a trap was already pending, we did something wrong! */
    Assert(TRPMQueryTrap(pVCpu, NULL /* pu8TrapNo */, NULL /* pEnmType */) == VERR_TRPM_NO_ACTIVE_TRAP);

    /* Map the VT-x vectoring type onto the TRPM event type. */
    TRPMEVENT enmTrapType;
    switch (uVectorType)
    {
        case VMX_IDT_VECTORING_INFO_TYPE_EXT_INT:
           enmTrapType = TRPM_HARDWARE_INT;
           break;

        case VMX_IDT_VECTORING_INFO_TYPE_SW_INT:
            enmTrapType = TRPM_SOFTWARE_INT;
            break;

        case VMX_IDT_VECTORING_INFO_TYPE_NMI:
        case VMX_IDT_VECTORING_INFO_TYPE_PRIV_SW_XCPT:
        case VMX_IDT_VECTORING_INFO_TYPE_SW_XCPT:      /* #BP and #OF */
        case VMX_IDT_VECTORING_INFO_TYPE_HW_XCPT:
            enmTrapType = TRPM_TRAP;
            break;

        default:
            AssertMsgFailed(("Invalid trap type %#x\n", uVectorType));
            enmTrapType = TRPM_32BIT_HACK;
            break;
    }

    Log4(("HM event->TRPM: uVector=%#x enmTrapType=%d\n", uVector, enmTrapType));

    int rc = TRPMAssertTrap(pVCpu, uVector, enmTrapType);
    AssertRC(rc);

    if (fErrorCodeValid)
        TRPMSetErrorCode(pVCpu, uErrorCode);

    /* Only hardware #PF carries a fault address; software-originated events carry an instruction length instead. */
    if (   uVectorType == VMX_IDT_VECTORING_INFO_TYPE_HW_XCPT
        && uVector == X86_XCPT_PF)
    {
        TRPMSetFaultAddress(pVCpu, pVCpu->hm.s.Event.GCPtrFaultAddress);
    }
    else if (   uVectorType == VMX_IDT_VECTORING_INFO_TYPE_SW_INT
             || uVectorType == VMX_IDT_VECTORING_INFO_TYPE_SW_XCPT
             || uVectorType == VMX_IDT_VECTORING_INFO_TYPE_PRIV_SW_XCPT)
    {
        AssertMsg(   uVectorType == VMX_IDT_VECTORING_INFO_TYPE_SW_INT
                  || (uVector == X86_XCPT_BP || uVector == X86_XCPT_OF),
                  ("Invalid vector: uVector=%#x uVectorType=%#x\n", uVector, uVectorType));
        TRPMSetInstrLength(pVCpu, pVCpu->hm.s.Event.cbInstr);
    }

    /* Clear the events from the VMCS. */
    VMXWriteVmcs32(VMX_VMCS32_CTRL_ENTRY_INTERRUPTION_INFO, 0);

    /* We're now done converting the pending event. */
    pVCpu->hm.s.Event.fPending = false;
}


/**
 * Does the necessary state syncing before returning to ring-3 for any reason
 * (longjmp, preemption, voluntary exits to ring-3) from VT-x.
 *
 * @returns VBox status code.
 * @param   pVCpu           The cross context virtual CPU structure.
 * @param   fImportState    Whether to import the guest state from the VMCS back
 *                          to the guest-CPU context.
 *
 * @remarks No-long-jmp zone!!!
 */
static int hmR0VmxLeave(PVMCPU pVCpu, bool fImportState)
{
    /* Must be entered with preemption disabled and ring-3 calls disabled. */
    Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));
    Assert(!VMMRZCallRing3IsEnabled(pVCpu));

    RTCPUID idCpu = RTMpCpuId();
    Log4Func(("HostCpuId=%u\n", idCpu));

    /*
     * !!! IMPORTANT !!!
     * If you modify code here, check whether hmR0VmxCallRing3Callback() needs to be updated too.
     */

    /* Save the guest state if necessary. */
    if (fImportState)
    {
        int rc = hmR0VmxImportGuestState(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL);
        AssertRCReturn(rc, rc);
    }

    /* Restore host FPU state if necessary. We will resync on next R0 reentry. */
    CPUMR0FpuStateMaybeSaveGuestAndRestoreHost(pVCpu);
    Assert(!CPUMIsGuestFPUStateActive(pVCpu));

    /* Restore host debug registers if necessary. We will resync on next R0 reentry. */
#ifdef VBOX_STRICT
    if (CPUMIsHyperDebugStateActive(pVCpu))
        Assert(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_MOV_DR_EXIT);
#endif
    CPUMR0DebugStateMaybeSaveGuestAndRestoreHost(pVCpu, true /* save DR6 */);
    Assert(!CPUMIsGuestDebugStateActive(pVCpu) && !CPUMIsGuestDebugStateActivePending(pVCpu));
    Assert(!CPUMIsHyperDebugStateActive(pVCpu) && !CPUMIsHyperDebugStateActivePending(pVCpu));

#if HC_ARCH_BITS == 64
    /*
     * Restore host-state bits that VT-x only restores partially.
     * The restore is only done when the "required" flag is set together with at
     * least one other flag; the flags are cleared in either case.
     */
    if (   (pVCpu->hm.s.vmx.fRestoreHostFlags & VMX_RESTORE_HOST_REQUIRED)
        && (pVCpu->hm.s.vmx.fRestoreHostFlags & ~VMX_RESTORE_HOST_REQUIRED))
    {
        Log4Func(("Restoring Host State: fRestoreHostFlags=%#RX32 HostCpuId=%u\n", pVCpu->hm.s.vmx.fRestoreHostFlags, idCpu));
        VMXRestoreHostState(pVCpu->hm.s.vmx.fRestoreHostFlags, &pVCpu->hm.s.vmx.RestoreHost);
    }
    pVCpu->hm.s.vmx.fRestoreHostFlags = 0;
#endif

    /* Restore the lazy host MSRs as we're leaving VT-x context. */
    if (pVCpu->hm.s.vmx.fLazyMsrs & VMX_LAZY_MSRS_LOADED_GUEST)
    {
        /* We shouldn't restore the host MSRs without saving the guest MSRs first. */
        if (!fImportState)
        {
            int rc = hmR0VmxImportGuestState(pVCpu, CPUMCTX_EXTRN_KERNEL_GS_BASE | CPUMCTX_EXTRN_SYSCALL_MSRS);
            AssertRCReturn(rc, rc);
        }
        hmR0VmxLazyRestoreHostMsrs(pVCpu);
        Assert(!pVCpu->hm.s.vmx.fLazyMsrs);
    }
    else
        pVCpu->hm.s.vmx.fLazyMsrs = 0;

    /* Update auto-load/store host MSRs values when we re-enter VT-x (as we could be on a different CPU). */
    pVCpu->hm.s.vmx.fUpdatedHostMsrs = false;

    /* Mark all the HM profiling samples as stopped since we are leaving HM context. */
    STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatEntry);
    STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatImportGuestState);
    STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatExportGuestState);
    STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatPreExit);
    STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatExitHandling);
    STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatExitIO);
    STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatExitMovCRx);
    STAM_PROFILE_ADV_SET_STOPPED(&pVCpu->hm.s.StatExitXcptNmi);
    STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchLongJmpToR3);

    VMCPU_CMPXCHG_STATE(pVCpu, VMCPUSTATE_STARTED_HM, VMCPUSTATE_STARTED_EXEC);

    /** @todo This partially defeats the purpose of having preemption hooks.
     *  The problem is, deregistering the hooks should be moved to a place that
     *  lasts until the EMT is about to be destroyed not everytime while leaving HM
     *  context.
     */
    if (pVCpu->hm.s.vmx.fVmcsState & HMVMX_VMCS_STATE_ACTIVE)
    {
        int rc = VMXClearVmcs(pVCpu->hm.s.vmx.HCPhysVmcs);
        AssertRCReturn(rc, rc);

        pVCpu->hm.s.vmx.fVmcsState = HMVMX_VMCS_STATE_CLEAR;
        Log4Func(("Cleared Vmcs. HostCpuId=%u\n", idCpu));
    }
    Assert(!(pVCpu->hm.s.vmx.fVmcsState & HMVMX_VMCS_STATE_LAUNCHED));
    NOREF(idCpu);   /* idCpu is otherwise only referenced from logging statements. */

    return VINF_SUCCESS;
}


/**
 * Leaves the VT-x session.
 *
 * @returns VBox status code.
 * @param   pVCpu       The cross context virtual CPU structure.
 *
 * @remarks No-long-jmp zone!!!
 */
static int hmR0VmxLeaveSession(PVMCPU pVCpu)
{
    /* Preemption stays disabled until HM_RESTORE_PREEMPT() on every exit path below. */
    HM_DISABLE_PREEMPT(pVCpu);
    HMVMX_ASSERT_CPU_SAFE(pVCpu);
    Assert(!VMMRZCallRing3IsEnabled(pVCpu));
    Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));

    /* When thread-context hooks are used, we can avoid doing the leave again if we had been preempted before
       and done this from the VMXR0ThreadCtxCallback(). */
    if (!pVCpu->hm.s.fLeaveDone)
    {
        int rc2 = hmR0VmxLeave(pVCpu, true /* fImportState */);
        /* On failure, restore preemption before bailing out with the failure status. */
        AssertRCReturnStmt(rc2, HM_RESTORE_PREEMPT(), rc2);
        pVCpu->hm.s.fLeaveDone = true;
    }
    /* The complete guest state is expected to have been imported by now. */
    Assert(!pVCpu->cpum.GstCtx.fExtrn);

    /*
     * !!! IMPORTANT !!!
     * If you modify code here, make sure to check whether hmR0VmxCallRing3Callback() needs to be updated too.
     */

    /* Deregister hook now that we've left HM context before re-enabling preemption. */
    /** @todo Deregistering here means we need to VMCLEAR always
     *        (longjmp/exit-to-r3) in VT-x which is not efficient, eliminate need
     *        for calling VMMR0ThreadCtxHookDisable here! */
    VMMR0ThreadCtxHookDisable(pVCpu);

    /* Leave HM context. This takes care of local init (term). */
    int rc = HMR0LeaveCpu(pVCpu);

    HM_RESTORE_PREEMPT();
    return rc;
}


/**
 * Does the necessary state syncing before doing a longjmp to ring-3.
 *
 * This is currently a plain wrapper around hmR0VmxLeaveSession().
 *
 * @returns VBox status code.
 * @param   pVCpu       The cross context virtual CPU structure.
 *
 * @remarks No-long-jmp zone!!!
 */
DECLINLINE(int) hmR0VmxLongJmpToRing3(PVMCPU pVCpu)
{
    return hmR0VmxLeaveSession(pVCpu);
}


/**
 * Take necessary actions before going back to ring-3.
 *
 * An action requires us to go back to ring-3. This function does the necessary
 * steps before we can safely return to ring-3. This is not the same as longjmps
 * to ring-3, this is voluntary and prepares the guest so it may continue
 * executing outside HM (recompiler/IEM).
 *
 * @returns VBox status code.
 * @param   pVCpu       The cross context virtual CPU structure.
 * @param   rcExit      The reason for exiting to ring-3.
Can be
 *                      VINF_VMM_UNKNOWN_RING3_CALL.
 */
static int hmR0VmxExitToRing3(PVMCPU pVCpu, VBOXSTRICTRC rcExit)
{
    Assert(pVCpu);
    HMVMX_ASSERT_PREEMPT_SAFE(pVCpu);

    /* Capture diagnostics about the VMCS for the invalid-VMCS-pointer failure. */
    if (RT_UNLIKELY(rcExit == VERR_VMX_INVALID_VMCS_PTR))
    {
        VMXGetActivatedVmcs(&pVCpu->hm.s.vmx.LastError.u64VmcsPhys);
        pVCpu->hm.s.vmx.LastError.u32VmcsRev   = *(uint32_t *)pVCpu->hm.s.vmx.pvVmcs;
        pVCpu->hm.s.vmx.LastError.idEnteredCpu = pVCpu->hm.s.idEnteredCpu;
        /* LastError.idCurrentCpu was updated in hmR0VmxPreRunGuestCommitted(). */
    }

    /* Please, no longjumps here (any logging shouldn't flush jump back to ring-3). NO LOGGING BEFORE THIS POINT! */
    VMMRZCallRing3Disable(pVCpu);
    Log4Func(("rcExit=%d\n", VBOXSTRICTRC_VAL(rcExit)));

    /* We need to do this only while truly exiting the "inner loop" back to ring-3 and -not- for any longjmp to ring3. */
    if (pVCpu->hm.s.Event.fPending)
    {
        hmR0VmxPendingEventToTrpmTrap(pVCpu);
        Assert(!pVCpu->hm.s.Event.fPending);
    }

    /* Clear interrupt-window and NMI-window controls as we re-evaluate it when we return from ring-3. */
    hmR0VmxClearIntNmiWindowsVmcs(pVCpu);

    /* If we're emulating an instruction, we shouldn't have any TRPM traps pending
       and if we're injecting an event we should have a TRPM trap pending. */
    AssertMsg(rcExit != VINF_EM_RAW_INJECT_TRPM_EVENT || TRPMHasTrap(pVCpu), ("%Rrc\n", VBOXSTRICTRC_VAL(rcExit)));
#ifndef DEBUG_bird /* Triggered after firing an NMI against NT4SP1, possibly a triple fault in progress. */
    AssertMsg(rcExit != VINF_EM_RAW_EMULATE_INSTR || !TRPMHasTrap(pVCpu), ("%Rrc\n", VBOXSTRICTRC_VAL(rcExit)));
#endif

    /* Save guest state and restore host state bits. */
    int rc = hmR0VmxLeaveSession(pVCpu);
    AssertRCReturn(rc, rc);
    /* Presumably compensates for the StatSwitchLongJmpToR3 increment done by hmR0VmxLeave(),
       as this is a voluntary exit rather than a longjmp - TODO confirm. */
    STAM_COUNTER_DEC(&pVCpu->hm.s.StatSwitchLongJmpToR3);
    /* Thread-context hooks are unregistered at this point!!! */

    /* Sync recompiler state. */
    VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_TO_R3);
    CPUMSetChangedFlags(pVCpu, CPUM_CHANGED_SYSENTER_MSR
                             | CPUM_CHANGED_LDTR
                             | CPUM_CHANGED_GDTR
                             | CPUM_CHANGED_IDTR
                             | CPUM_CHANGED_TR
                             | CPUM_CHANGED_HIDDEN_SEL_REGS);
    if (   pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging
        && CPUMIsGuestPagingEnabledEx(&pVCpu->cpum.GstCtx))
    {
        CPUMSetChangedFlags(pVCpu, CPUM_CHANGED_GLOBAL_TLB_FLUSH);
    }

    Assert(!pVCpu->hm.s.fClearTrapFlag);

    /* Update the exit-to-ring 3 reason. */
    pVCpu->hm.s.rcLastExitToR3 = VBOXSTRICTRC_VAL(rcExit);

    /* On our way back from ring-3 reload the guest state if there is a possibility of it being changed. */
    if (rcExit != VINF_EM_RAW_INTERRUPT)
        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST);

    STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchExitToR3);

    /* We do -not- want any longjmp notifications after this! We must return to ring-3 ASAP. */
    VMMRZCallRing3RemoveNotification(pVCpu);
    VMMRZCallRing3Enable(pVCpu);

    return rc;
}


/**
 * VMMRZCallRing3() callback wrapper which saves the guest state before we
 * longjump to ring-3 and possibly get preempted.
 *
 * Ring-0 assertions take a stripped-down path that ignores the status codes of
 * the individual steps; all other operations go through
 * hmR0VmxLongJmpToRing3().
 *
 * @returns VBox status code.
 * @param   pVCpu           The cross context virtual CPU structure.
 * @param   enmOperation    The operation causing the ring-3 longjump.
 * @param   pvUser          User argument, currently unused, NULL.
 *
 * @note    NOTE(review): the text above says pvUser is NULL, yet the
 *          non-assertion path contains Assert(pvUser) - confirm which is
 *          intended.
 */
static DECLCALLBACK(int) hmR0VmxCallRing3Callback(PVMCPU pVCpu, VMMCALLRING3 enmOperation, void *pvUser)
{
    RT_NOREF(pvUser);
    if (enmOperation == VMMCALLRING3_VM_R0_ASSERTION)
    {
        /*
         * !!! IMPORTANT !!!
         * If you modify code here, check whether hmR0VmxLeave() and hmR0VmxLeaveSession() needs to be updated too.
         * This is a stripped down version which gets out ASAP, trying to not trigger any further assertions.
         */
        VMMRZCallRing3RemoveNotification(pVCpu);
        VMMRZCallRing3Disable(pVCpu);
        RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER;
        RTThreadPreemptDisable(&PreemptState);

        /* Status codes are deliberately not checked on this path. */
        hmR0VmxImportGuestState(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL);
        CPUMR0FpuStateMaybeSaveGuestAndRestoreHost(pVCpu);
        CPUMR0DebugStateMaybeSaveGuestAndRestoreHost(pVCpu, true /* save DR6 */);

#if HC_ARCH_BITS == 64
        /* Restore host-state bits that VT-x only restores partially. */
        if (   (pVCpu->hm.s.vmx.fRestoreHostFlags & VMX_RESTORE_HOST_REQUIRED)
            && (pVCpu->hm.s.vmx.fRestoreHostFlags & ~VMX_RESTORE_HOST_REQUIRED))
            VMXRestoreHostState(pVCpu->hm.s.vmx.fRestoreHostFlags, &pVCpu->hm.s.vmx.RestoreHost);
        pVCpu->hm.s.vmx.fRestoreHostFlags = 0;
#endif

        /* Restore the lazy host MSRs as we're leaving VT-x context. */
        if (pVCpu->hm.s.vmx.fLazyMsrs & VMX_LAZY_MSRS_LOADED_GUEST)
            hmR0VmxLazyRestoreHostMsrs(pVCpu);

        /* Update auto-load/store host MSRs values when we re-enter VT-x (as we could be on a different CPU). */
        pVCpu->hm.s.vmx.fUpdatedHostMsrs = false;
        VMCPU_CMPXCHG_STATE(pVCpu, VMCPUSTATE_STARTED_HM, VMCPUSTATE_STARTED_EXEC);
        if (pVCpu->hm.s.vmx.fVmcsState & HMVMX_VMCS_STATE_ACTIVE)
        {
            VMXClearVmcs(pVCpu->hm.s.vmx.HCPhysVmcs);
            pVCpu->hm.s.vmx.fVmcsState = HMVMX_VMCS_STATE_CLEAR;
        }

        /** @todo eliminate the need for calling VMMR0ThreadCtxHookDisable here!  */
        VMMR0ThreadCtxHookDisable(pVCpu);
        HMR0LeaveCpu(pVCpu);
        RTThreadPreemptRestore(&PreemptState);
        return VINF_SUCCESS;
    }

    Assert(pVCpu);
    Assert(pvUser);
    Assert(VMMRZCallRing3IsEnabled(pVCpu));
    HMVMX_ASSERT_PREEMPT_SAFE(pVCpu);

    /* Ring-3 calls stay disabled while the session is being left. */
    VMMRZCallRing3Disable(pVCpu);
    Assert(VMMR0IsLogFlushDisabled(pVCpu));

    Log4Func((" -> hmR0VmxLongJmpToRing3 enmOperation=%d\n", enmOperation));

    int rc = hmR0VmxLongJmpToRing3(pVCpu);
    AssertRCReturn(rc, rc);

    VMMRZCallRing3Enable(pVCpu);
    return VINF_SUCCESS;
}


/**
 * Sets the interrupt-window exiting control in the VMCS which instructs VT-x to
 * cause a VM-exit as soon as the guest is in a state to receive interrupts.
 *
 * Nothing is written when the control is not in the allowed-1 set of the
 * processor-based controls or when it is already set in the cached value.
 *
 * @param   pVCpu       The cross context virtual CPU structure.
 */
DECLINLINE(void) hmR0VmxSetIntWindowExitVmcs(PVMCPU pVCpu)
{
    if (RT_LIKELY(pVCpu->CTX_SUFF(pVM)->hm.s.vmx.Msrs.ProcCtls.n.allowed1 & VMX_PROC_CTLS_INT_WINDOW_EXIT))
    {
        if (!(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_INT_WINDOW_EXIT))
        {
            /* Update the cached controls first, then write them to the VMCS. */
            pVCpu->hm.s.vmx.u32ProcCtls |= VMX_PROC_CTLS_INT_WINDOW_EXIT;
            int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, pVCpu->hm.s.vmx.u32ProcCtls);
            AssertRC(rc);
            Log4Func(("Setup interrupt-window exiting\n"));
        }
    } /* else we will deliver interrupts whenever the guest exits next and is in a state to receive events. */
}


/**
 * Clears the interrupt-window exiting control in the VMCS.
 *
 * The control is expected to be set in the cached processor-based controls
 * when this is called (asserted).
 *
 * @param   pVCpu       The cross context virtual CPU structure.
 */
DECLINLINE(void) hmR0VmxClearIntWindowExitVmcs(PVMCPU pVCpu)
{
    Assert(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_INT_WINDOW_EXIT);
    pVCpu->hm.s.vmx.u32ProcCtls &= ~VMX_PROC_CTLS_INT_WINDOW_EXIT;
    int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, pVCpu->hm.s.vmx.u32ProcCtls);
    AssertRC(rc);
    Log4Func(("Cleared interrupt-window exiting\n"));
}


/**
 * Sets the NMI-window exiting control in the VMCS which instructs VT-x to
 * cause a VM-exit as soon as the guest is in a state to receive NMIs.
 *
 * @param   pVCpu       The cross context virtual CPU structure.
 */
DECLINLINE(void) hmR0VmxSetNmiWindowExitVmcs(PVMCPU pVCpu)
{
    /* Only touch the VMCS when the control is allowed and not already set in the cached value. */
    if (RT_LIKELY(pVCpu->CTX_SUFF(pVM)->hm.s.vmx.Msrs.ProcCtls.n.allowed1 & VMX_PROC_CTLS_NMI_WINDOW_EXIT))
    {
        if (!(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_NMI_WINDOW_EXIT))
        {
            pVCpu->hm.s.vmx.u32ProcCtls |= VMX_PROC_CTLS_NMI_WINDOW_EXIT;
            int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, pVCpu->hm.s.vmx.u32ProcCtls);
            AssertRC(rc);
            Log4Func(("Setup NMI-window exiting\n"));
        }
    } /* else we will deliver NMIs whenever we VM-exit next, even possibly nesting NMIs. Can't be helped on ancient CPUs. */
}


/**
 * Clears the NMI-window exiting control in the VMCS.
 *
 * The control is expected to be set in the cached processor-based controls
 * when this is called (asserted).
 *
 * @param   pVCpu       The cross context virtual CPU structure.
 */
DECLINLINE(void) hmR0VmxClearNmiWindowExitVmcs(PVMCPU pVCpu)
{
    Assert(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_NMI_WINDOW_EXIT);
    pVCpu->hm.s.vmx.u32ProcCtls &= ~VMX_PROC_CTLS_NMI_WINDOW_EXIT;
    int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, pVCpu->hm.s.vmx.u32ProcCtls);
    AssertRC(rc);
    Log4Func(("Cleared NMI-window exiting\n"));
}


/**
 * Evaluates the event to be delivered to the guest and sets it as the pending
 * event.
 *
 * A pending NMI is considered before external (PIC/APIC) interrupts. When the
 * candidate event cannot be made pending right now, the corresponding
 * NMI-window or interrupt-window exiting control is requested instead.
 *
 * @returns The VT-x guest-interruptibility state. Note that 0 is returned if
 *          importing RFLAGS fails.
 * @param   pVCpu           The cross context virtual CPU structure.
 */
static uint32_t hmR0VmxEvaluatePendingEvent(PVMCPU pVCpu)
{
    /* Get the current interruptibility-state of the guest and then figure out what can be injected. */
    PCPUMCTX       pCtx        = &pVCpu->cpum.GstCtx;
    uint32_t const fIntrState  = hmR0VmxGetGuestIntrState(pVCpu);
    bool const     fBlockMovSS = RT_BOOL(fIntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS);
    bool const     fBlockSti   = RT_BOOL(fIntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_STI);
    bool const     fBlockNmi   = RT_BOOL(fIntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_NMI);

    Assert(!fBlockSti || !(ASMAtomicUoReadU64(&pCtx->fExtrn) & CPUMCTX_EXTRN_RFLAGS));
    Assert(!(fIntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_SMI));    /* We don't support block-by-SMI yet.*/
    Assert(!fBlockSti || pCtx->eflags.Bits.u1IF);                  /* Cannot set block-by-STI when interrupts are disabled. */
    Assert(!TRPMHasTrap(pVCpu));

    if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_UPDATE_APIC))
        APICUpdatePendingInterrupts(pVCpu);

    /*
     * Toggling of interrupt force-flags here is safe since we update TRPM on premature exits
     * to ring-3 before executing guest code, see hmR0VmxExitToRing3(). We must NOT restore these force-flags.
     */
                                                               /** @todo SMI. SMIs take priority over NMIs. */
    if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INTERRUPT_NMI))        /* NMI. NMIs take priority over regular interrupts. */
    {
        /* On some CPUs block-by-STI also blocks NMIs. See Intel spec. 26.3.1.5 "Checks On Guest Non-Register State". */
        if (   !pVCpu->hm.s.Event.fPending
            && !fBlockNmi
            && !fBlockSti
            && !fBlockMovSS)
        {
            Log4Func(("Pending NMI\n"));
            uint32_t u32IntInfo = X86_XCPT_NMI | VMX_EXIT_INT_INFO_VALID;
            u32IntInfo         |= (VMX_EXIT_INT_INFO_TYPE_NMI << VMX_EXIT_INT_INFO_TYPE_SHIFT);

            hmR0VmxSetPendingEvent(pVCpu, u32IntInfo, 0 /* cbInstr */, 0 /* u32ErrCode */, 0 /* GCPtrFaultAddress */);
            VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INTERRUPT_NMI);
        }
        else
            hmR0VmxSetNmiWindowExitVmcs(pVCpu);    /* Can't make it pending now; ask for an NMI-window VM-exit. */
    }
    /*
     * Check if the guest can receive external interrupts (PIC/APIC). Once PDMGetInterrupt() returns
     * a valid interrupt we -must- deliver the interrupt. We can no longer re-request it from the APIC.
     */
    else if (   VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC)
             && !pVCpu->hm.s.fSingleInstruction)
    {
        Assert(!DBGFIsStepping(pVCpu));
        /* RFLAGS.IF is inspected below, so make sure RFLAGS is up to date in the guest-CPU context. */
        int rc = hmR0VmxImportGuestState(pVCpu, CPUMCTX_EXTRN_RFLAGS);
        AssertRCReturn(rc, 0);
        bool const fBlockInt = !(pCtx->eflags.u32 & X86_EFL_IF);
        if (   !pVCpu->hm.s.Event.fPending
            && !fBlockInt
            && !fBlockSti
            && !fBlockMovSS)
        {
            uint8_t u8Interrupt;
            rc = PDMGetInterrupt(pVCpu, &u8Interrupt);
            if (RT_SUCCESS(rc))
            {
                Log4Func(("Pending external interrupt u8Interrupt=%#x\n", u8Interrupt));
                uint32_t u32IntInfo = u8Interrupt
                                    | VMX_EXIT_INT_INFO_VALID
                                    | (VMX_EXIT_INT_INFO_TYPE_EXT_INT << VMX_EXIT_INT_INFO_TYPE_SHIFT);

                hmR0VmxSetPendingEvent(pVCpu, u32IntInfo, 0 /* cbInstr */, 0 /* u32ErrCode */, 0 /* GCPtrfaultAddress */);
            }
            else if (rc == VERR_APIC_INTR_MASKED_BY_TPR)
            {
                /* The TPR threshold is set to the upper four bits of the returned vector. */
                if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_TPR_SHADOW)
                    hmR0VmxApicSetTprThreshold(pVCpu, u8Interrupt >> 4);
                STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchTprMaskedIrq);

                /*
                 * If the CPU doesn't have TPR shadowing, we will always get a VM-exit on TPR changes and
                 * APICSetTpr() will end up setting the VMCPU_FF_INTERRUPT_APIC if required, so there is no
                 * need to re-set this force-flag here.
                 */
            }
            else
                STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchGuestIrq);
        }
        else
            hmR0VmxSetIntWindowExitVmcs(pVCpu);    /* Can't make it pending now; ask for an interrupt-window VM-exit. */
    }

    return fIntrState;
}


/**
 * Injects any pending events into the guest if the guest is in a state to
 * receive them.
 *
 * @returns Strict VBox status code (i.e. informational status codes too).
 * @param   pVCpu           The cross context virtual CPU structure.
 * @param   fIntrState      The VT-x guest-interruptibility state.
 * @param   fStepping       Running in hmR0VmxRunGuestCodeStep() and we should
 *                          return VINF_EM_DBG_STEPPED if the event was
 *                          dispatched directly.
 */
static VBOXSTRICTRC hmR0VmxInjectPendingEvent(PVMCPU pVCpu, uint32_t fIntrState, bool fStepping)
{
    HMVMX_ASSERT_PREEMPT_SAFE(pVCpu);
    Assert(VMMRZCallRing3IsEnabled(pVCpu));

    bool const fBlockMovSS = RT_BOOL(fIntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS);
    bool const fBlockSti   = RT_BOOL(fIntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_STI);

    Assert(!fBlockSti || !(ASMAtomicUoReadU64(&pVCpu->cpum.GstCtx.fExtrn) & CPUMCTX_EXTRN_RFLAGS));
    Assert(!(fIntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_SMI));     /* We don't support block-by-SMI yet.*/
    Assert(!fBlockSti || pVCpu->cpum.GstCtx.eflags.Bits.u1IF);     /* Cannot set block-by-STI when interrupts are disabled. */
    Assert(!TRPMHasTrap(pVCpu));

    VBOXSTRICTRC rcStrict = VINF_SUCCESS;
    if (pVCpu->hm.s.Event.fPending)
    {
        /*
         * Do -not- clear any interrupt-window exiting control here. We might have an interrupt
         * pending even while injecting an event and in this case, we want a VM-exit as soon as
         * the guest is ready for the next interrupt, see @bugref{6208#c45}.
         *
         * See Intel spec. 26.6.5 "Interrupt-Window Exiting and Virtual-Interrupt Delivery".
         */
        uint32_t const uIntType = VMX_ENTRY_INT_INFO_TYPE(pVCpu->hm.s.Event.u64IntInfo);
#ifdef VBOX_STRICT
        /* Strict builds only: sanity check that the guest is not blocking the kind of event being injected. */
        if (uIntType == VMX_ENTRY_INT_INFO_TYPE_EXT_INT)
        {
            bool const fBlockInt = !(pVCpu->cpum.GstCtx.eflags.u32 & X86_EFL_IF);
            Assert(!fBlockInt);
            Assert(!fBlockSti);
            Assert(!fBlockMovSS);
        }
        else if (uIntType == VMX_ENTRY_INT_INFO_TYPE_NMI)
        {
            bool const fBlockNmi = RT_BOOL(fIntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_NMI);
            Assert(!fBlockSti);
            Assert(!fBlockMovSS);
            Assert(!fBlockNmi);
        }
#endif
        Log4(("Injecting pending event vcpu[%RU32] u64IntInfo=%#RX64 Type=%#RX32\n", pVCpu->idCpu, pVCpu->hm.s.Event.u64IntInfo,
              uIntType));

        /*
         * Inject the event and get any changes to the guest-interruptibility state.
         *
         * The guest-interruptibility state may need to be updated if we inject the event
         * into the guest IDT ourselves (for real-on-v86 guest injecting software interrupts).
         */
        rcStrict = hmR0VmxInjectEventVmcs(pVCpu, pVCpu->hm.s.Event.u64IntInfo, pVCpu->hm.s.Event.cbInstr,
                                          pVCpu->hm.s.Event.u32ErrCode, pVCpu->hm.s.Event.GCPtrFaultAddress, fStepping,
                                          &fIntrState);
        AssertRCReturn(VBOXSTRICTRC_VAL(rcStrict), rcStrict);

        /* Statistics: external interrupts are counted separately from everything else. */
        if (uIntType == VMX_ENTRY_INT_INFO_TYPE_EXT_INT)
            STAM_COUNTER_INC(&pVCpu->hm.s.StatInjectInterrupt);
        else
            STAM_COUNTER_INC(&pVCpu->hm.s.StatInjectXcpt);
    }

    /*
     * Update the guest-interruptibility state.
     *
     * This is required for the real-on-v86 software interrupt injection case above, as well as
     * updates to the guest state from ring-3 or IEM/REM.
     *
     * Note that this write happens whether or not an event was pending.
     */
    int rc = VMXWriteVmcs32(VMX_VMCS32_GUEST_INT_STATE, fIntrState);
    AssertRCReturn(rc, rc);

    /*
     * There's no need to clear the VM-entry interruption-information field here if we're not
     * injecting anything. VT-x clears the valid bit on every VM-exit.
     *
     * See Intel spec. 24.8.3 "VM-Entry Controls for Event Injection".
     */

    Assert(rcStrict == VINF_SUCCESS || rcStrict == VINF_EM_RESET || (rcStrict == VINF_EM_DBG_STEPPED && fStepping));
    NOREF(fBlockMovSS); NOREF(fBlockSti);  /* Only referenced from assertions. */
    return rcStrict;
}


/**
 * Injects a double-fault (\#DF) exception into the VM.
 *
 * @returns Strict VBox status code (i.e. informational status codes too).
 * @param   pVCpu           The cross context virtual CPU structure.
 * @param   fStepping       Whether we're running in hmR0VmxRunGuestCodeStep()
 *                          and should return VINF_EM_DBG_STEPPED if the event
 *                          is injected directly (register modified by us, not
 *                          by hardware on VM-entry).
 * @param   pfIntrState     Pointer to the current guest interruptibility-state.
 *                          This interruptibility-state will be updated if
 *                          necessary. This cannot be NULL.
+ */ +DECLINLINE(VBOXSTRICTRC) hmR0VmxInjectXcptDF(PVMCPU pVCpu, bool fStepping, uint32_t *pfIntrState) +{ + uint32_t const u32IntInfo = X86_XCPT_DF | VMX_EXIT_INT_INFO_VALID + | (VMX_EXIT_INT_INFO_TYPE_HW_XCPT << VMX_EXIT_INT_INFO_TYPE_SHIFT) + | VMX_EXIT_INT_INFO_ERROR_CODE_VALID; + return hmR0VmxInjectEventVmcs(pVCpu, u32IntInfo, 0 /* cbInstr */, 0 /* u32ErrCode */, 0 /* GCPtrFaultAddress */, fStepping, + pfIntrState); +} + + +/** + * Injects a general-protection (\#GP) fault into the VM. + * + * @returns Strict VBox status code (i.e. informational status codes too). + * @param pVCpu The cross context virtual CPU structure. + * @param fErrorCodeValid Whether the error code is valid (depends on the CPU + * mode, i.e. in real-mode it's not valid). + * @param u32ErrorCode The error code associated with the \#GP. + * @param fStepping Whether we're running in + * hmR0VmxRunGuestCodeStep() and should return + * VINF_EM_DBG_STEPPED if the event is injected + * directly (register modified by us, not by + * hardware on VM-entry). + * @param pfIntrState Pointer to the current guest interruptibility-state. + * This interruptibility-state will be updated if + * necessary. This cannot not be NULL. + */ +DECLINLINE(VBOXSTRICTRC) hmR0VmxInjectXcptGP(PVMCPU pVCpu, bool fErrorCodeValid, uint32_t u32ErrorCode, bool fStepping, + uint32_t *pfIntrState) +{ + uint32_t const u32IntInfo = X86_XCPT_GP | VMX_EXIT_INT_INFO_VALID + | (VMX_EXIT_INT_INFO_TYPE_HW_XCPT << VMX_EXIT_INT_INFO_TYPE_SHIFT) + | (fErrorCodeValid ? VMX_EXIT_INT_INFO_ERROR_CODE_VALID : 0); + return hmR0VmxInjectEventVmcs(pVCpu, u32IntInfo, 0 /* cbInstr */, u32ErrorCode, 0 /* GCPtrFaultAddress */, fStepping, + pfIntrState); +} + + +/** + * Pushes a 2-byte value onto the real-mode (in virtual-8086 mode) guest's + * stack. + * + * @returns Strict VBox status code (i.e. informational status codes too). + * @retval VINF_EM_RESET if pushing a value to the stack caused a triple-fault. 
+ * @param pVCpu The cross context virtual CPU structure. + * @param uValue The value to push to the guest stack. + */ +static VBOXSTRICTRC hmR0VmxRealModeGuestStackPush(PVMCPU pVCpu, uint16_t uValue) +{ + /* + * The stack limit is 0xffff in real-on-virtual 8086 mode. Real-mode with weird stack limits cannot be run in + * virtual 8086 mode in VT-x. See Intel spec. 26.3.1.2 "Checks on Guest Segment Registers". + * See Intel Instruction reference for PUSH and Intel spec. 22.33.1 "Segment Wraparound". + */ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if (pCtx->sp == 1) + return VINF_EM_RESET; + pCtx->sp -= sizeof(uint16_t); /* May wrap around which is expected behaviour. */ + int rc = PGMPhysSimpleWriteGCPhys(pVCpu->CTX_SUFF(pVM), pCtx->ss.u64Base + pCtx->sp, &uValue, sizeof(uint16_t)); + AssertRC(rc); + return rc; +} + + +/** + * Injects an event into the guest upon VM-entry by updating the relevant fields + * in the VM-entry area in the VMCS. + * + * @returns Strict VBox status code (i.e. informational status codes too). + * @retval VINF_SUCCESS if the event is successfully injected into the VMCS. + * @retval VINF_EM_RESET if event injection resulted in a triple-fault. + * + * @param pVCpu The cross context virtual CPU structure. + * @param u64IntInfo The VM-entry interruption-information field. + * @param cbInstr The VM-entry instruction length in bytes (for + * software interrupts, exceptions and privileged + * software exceptions). + * @param u32ErrCode The VM-entry exception error code. + * @param GCPtrFaultAddress The page-fault address for \#PF exceptions. + * @param pfIntrState Pointer to the current guest interruptibility-state. + * This interruptibility-state will be updated if + * necessary. This cannot not be NULL. + * @param fStepping Whether we're running in + * hmR0VmxRunGuestCodeStep() and should return + * VINF_EM_DBG_STEPPED if the event is injected + * directly (register modified by us, not by + * hardware on VM-entry). 
+ */ +static VBOXSTRICTRC hmR0VmxInjectEventVmcs(PVMCPU pVCpu, uint64_t u64IntInfo, uint32_t cbInstr, uint32_t u32ErrCode, + RTGCUINTREG GCPtrFaultAddress, bool fStepping, uint32_t *pfIntrState) +{ + /* Intel spec. 24.8.3 "VM-Entry Controls for Event Injection" specifies the interruption-information field to be 32-bits. */ + AssertMsg(!RT_HI_U32(u64IntInfo), ("%#RX64\n", u64IntInfo)); + Assert(pfIntrState); + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + uint32_t u32IntInfo = (uint32_t)u64IntInfo; + uint32_t const uVector = VMX_ENTRY_INT_INFO_VECTOR(u32IntInfo); + uint32_t const uIntType = VMX_ENTRY_INT_INFO_TYPE(u32IntInfo); + +#ifdef VBOX_STRICT + /* + * Validate the error-code-valid bit for hardware exceptions. + * No error codes for exceptions in real-mode. + * + * See Intel spec. 20.1.4 "Interrupt and Exception Handling" + */ + if ( uIntType == VMX_EXIT_INT_INFO_TYPE_HW_XCPT + && !CPUMIsGuestInRealModeEx(pCtx)) + { + switch (uVector) + { + case X86_XCPT_PF: + case X86_XCPT_DF: + case X86_XCPT_TS: + case X86_XCPT_NP: + case X86_XCPT_SS: + case X86_XCPT_GP: + case X86_XCPT_AC: + AssertMsg(VMX_ENTRY_INT_INFO_IS_ERROR_CODE_VALID(u32IntInfo), + ("Error-code-valid bit not set for exception that has an error code uVector=%#x\n", uVector)); + RT_FALL_THRU(); + default: + break; + } + } +#endif + + /* Cannot inject an NMI when block-by-MOV SS is in effect. */ + Assert( uIntType != VMX_EXIT_INT_INFO_TYPE_NMI + || !(*pfIntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS)); + + STAM_COUNTER_INC(&pVCpu->hm.s.paStatInjectedIrqsR0[uVector & MASK_INJECT_IRQ_STAT]); + + /* + * Hardware interrupts & exceptions cannot be delivered through the software interrupt + * redirection bitmap to the real mode task in virtual-8086 mode. We must jump to the + * interrupt handler in the (real-mode) guest. + * + * See Intel spec. 20.3 "Interrupt and Exception handling in Virtual-8086 Mode". + * See Intel spec. 20.1.4 "Interrupt and Exception Handling" for real-mode interrupt handling. 
+ */ + if (CPUMIsGuestInRealModeEx(pCtx)) /* CR0.PE bit changes are always intercepted, so it's up to date. */ + { + if (pVCpu->CTX_SUFF(pVM)->hm.s.vmx.fUnrestrictedGuest) + { + /* + * For unrestricted execution enabled CPUs running real-mode guests, we must not + * set the deliver-error-code bit. + * + * See Intel spec. 26.2.1.3 "VM-Entry Control Fields". + */ + u32IntInfo &= ~VMX_ENTRY_INT_INFO_ERROR_CODE_VALID; + } + else + { + PVM pVM = pVCpu->CTX_SUFF(pVM); + Assert(PDMVmmDevHeapIsEnabled(pVM)); + Assert(pVM->hm.s.vmx.pRealModeTSS); + + /* We require RIP, RSP, RFLAGS, CS, IDTR, import them. */ + int rc2 = hmR0VmxImportGuestState(pVCpu, CPUMCTX_EXTRN_SREG_MASK | CPUMCTX_EXTRN_TABLE_MASK | CPUMCTX_EXTRN_RIP + | CPUMCTX_EXTRN_RSP | CPUMCTX_EXTRN_RFLAGS); + AssertRCReturn(rc2, rc2); + + /* Check if the interrupt handler is present in the IVT (real-mode IDT). IDT limit is (4N - 1). */ + size_t const cbIdtEntry = sizeof(X86IDTR16); + if (uVector * cbIdtEntry + (cbIdtEntry - 1) > pCtx->idtr.cbIdt) + { + /* If we are trying to inject a #DF with no valid IDT entry, return a triple-fault. */ + if (uVector == X86_XCPT_DF) + return VINF_EM_RESET; + + /* If we're injecting a #GP with no valid IDT entry, inject a double-fault. */ + if (uVector == X86_XCPT_GP) + return hmR0VmxInjectXcptDF(pVCpu, fStepping, pfIntrState); + + /* + * If we're injecting an event with no valid IDT entry, inject a #GP. + * No error codes for exceptions in real-mode. + * + * See Intel spec. 20.1.4 "Interrupt and Exception Handling" + */ + return hmR0VmxInjectXcptGP(pVCpu, false /* fErrCodeValid */, 0 /* u32ErrCode */, fStepping, pfIntrState); + } + + /* Software exceptions (#BP and #OF exceptions thrown as a result of INT3 or INTO) */ + uint16_t uGuestIp = pCtx->ip; + if (uIntType == VMX_ENTRY_INT_INFO_TYPE_SW_XCPT) + { + Assert(uVector == X86_XCPT_BP || uVector == X86_XCPT_OF); + /* #BP and #OF are both benign traps, we need to resume the next instruction. 
*/ + uGuestIp = pCtx->ip + (uint16_t)cbInstr; + } + else if (uIntType == VMX_ENTRY_INT_INFO_TYPE_SW_INT) + uGuestIp = pCtx->ip + (uint16_t)cbInstr; + + /* Get the code segment selector and offset from the IDT entry for the interrupt handler. */ + X86IDTR16 IdtEntry; + RTGCPHYS GCPhysIdtEntry = (RTGCPHYS)pCtx->idtr.pIdt + uVector * cbIdtEntry; + rc2 = PGMPhysSimpleReadGCPhys(pVM, &IdtEntry, GCPhysIdtEntry, cbIdtEntry); + AssertRCReturn(rc2, rc2); + + /* Construct the stack frame for the interrupt/exception handler. */ + VBOXSTRICTRC rcStrict; + rcStrict = hmR0VmxRealModeGuestStackPush(pVCpu, pCtx->eflags.u32); + if (rcStrict == VINF_SUCCESS) + rcStrict = hmR0VmxRealModeGuestStackPush(pVCpu, pCtx->cs.Sel); + if (rcStrict == VINF_SUCCESS) + rcStrict = hmR0VmxRealModeGuestStackPush(pVCpu, uGuestIp); + + /* Clear the required eflag bits and jump to the interrupt/exception handler. */ + if (rcStrict == VINF_SUCCESS) + { + pCtx->eflags.u32 &= ~(X86_EFL_IF | X86_EFL_TF | X86_EFL_RF | X86_EFL_AC); + pCtx->rip = IdtEntry.offSel; + pCtx->cs.Sel = IdtEntry.uSel; + pCtx->cs.ValidSel = IdtEntry.uSel; + pCtx->cs.u64Base = IdtEntry.uSel << cbIdtEntry; + if ( uIntType == VMX_ENTRY_INT_INFO_TYPE_HW_XCPT + && uVector == X86_XCPT_PF) + pCtx->cr2 = GCPtrFaultAddress; + + /* If any other guest-state bits are changed here, make sure to update + hmR0VmxPreRunGuestCommitted() when thread-context hooks are used. */ + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_CS | HM_CHANGED_GUEST_CR2 + | HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS + | HM_CHANGED_GUEST_RSP); + + /* We're clearing interrupts, which means no block-by-STI interrupt-inhibition. 
*/ + if (*pfIntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_STI) + { + Assert( uIntType != VMX_ENTRY_INT_INFO_TYPE_NMI + && uIntType != VMX_ENTRY_INT_INFO_TYPE_EXT_INT); + Log4Func(("Clearing inhibition due to STI\n")); + *pfIntrState &= ~VMX_VMCS_GUEST_INT_STATE_BLOCK_STI; + } + Log4(("Injecting real-mode: u32IntInfo=%#x u32ErrCode=%#x cbInstr=%#x Eflags=%#x CS:EIP=%04x:%04x\n", + u32IntInfo, u32ErrCode, cbInstr, pCtx->eflags.u, pCtx->cs.Sel, pCtx->eip)); + + /* The event has been truly dispatched. Mark it as no longer pending so we don't attempt to 'undo' + it, if we are returning to ring-3 before executing guest code. */ + pVCpu->hm.s.Event.fPending = false; + + /* Make hmR0VmxPreRunGuest() return if we're stepping since we've changed cs:rip. */ + if (fStepping) + rcStrict = VINF_EM_DBG_STEPPED; + } + AssertMsg(rcStrict == VINF_SUCCESS || rcStrict == VINF_EM_RESET || (rcStrict == VINF_EM_DBG_STEPPED && fStepping), + ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); + return rcStrict; + } + } + + /* Validate. */ + Assert(VMX_ENTRY_INT_INFO_IS_VALID(u32IntInfo)); /* Bit 31 (Valid bit) must be set by caller. */ + Assert(!(u32IntInfo & VMX_BF_ENTRY_INT_INFO_RSVD_12_30_MASK)); /* Bits 30:12 MBZ. */ + + /* Inject. */ + int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_ENTRY_INTERRUPTION_INFO, u32IntInfo); + if (VMX_ENTRY_INT_INFO_IS_ERROR_CODE_VALID(u32IntInfo)) + rc |= VMXWriteVmcs32(VMX_VMCS32_CTRL_ENTRY_EXCEPTION_ERRCODE, u32ErrCode); + rc |= VMXWriteVmcs32(VMX_VMCS32_CTRL_ENTRY_INSTR_LENGTH, cbInstr); + AssertRCReturn(rc, rc); + + /* Update CR2. */ + if ( VMX_ENTRY_INT_INFO_TYPE(u32IntInfo) == VMX_EXIT_INT_INFO_TYPE_HW_XCPT + && uVector == X86_XCPT_PF) + pCtx->cr2 = GCPtrFaultAddress; + + Log4(("Injecting u32IntInfo=%#x u32ErrCode=%#x cbInstr=%#x CR2=%#RX64\n", u32IntInfo, u32ErrCode, cbInstr, pCtx->cr2)); + + return VINF_SUCCESS; +} + + +/** + * Clears the interrupt-window exiting control in the VMCS and if necessary + * clears the current event in the VMCS as well. 
+ * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks Use this function only to clear events that have not yet been + * delivered to the guest but are injected in the VMCS! + * @remarks No-long-jump zone!!! + */ +static void hmR0VmxClearIntNmiWindowsVmcs(PVMCPU pVCpu) +{ + if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_INT_WINDOW_EXIT) + { + hmR0VmxClearIntWindowExitVmcs(pVCpu); + Log4Func(("Cleared interrupt window\n")); + } + + if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_NMI_WINDOW_EXIT) + { + hmR0VmxClearNmiWindowExitVmcs(pVCpu); + Log4Func(("Cleared NMI window\n")); + } +} + + +/** + * Enters the VT-x session. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + */ +VMMR0DECL(int) VMXR0Enter(PVMCPU pVCpu) +{ + AssertPtr(pVCpu); + Assert(pVCpu->CTX_SUFF(pVM)->hm.s.vmx.fSupported); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + LogFlowFunc(("pVCpu=%p\n", pVCpu)); + Assert((pVCpu->hm.s.fCtxChanged & (HM_CHANGED_HOST_CONTEXT | HM_CHANGED_VMX_HOST_GUEST_SHARED_STATE)) + == (HM_CHANGED_HOST_CONTEXT | HM_CHANGED_VMX_HOST_GUEST_SHARED_STATE)); + +#ifdef VBOX_STRICT + /* At least verify VMX is enabled, since we can't check if we're in VMX root mode without #GP'ing. */ + RTCCUINTREG uHostCR4 = ASMGetCR4(); + if (!(uHostCR4 & X86_CR4_VMXE)) + { + LogRelFunc(("X86_CR4_VMXE bit in CR4 is not set!\n")); + return VERR_VMX_X86_CR4_VMXE_CLEARED; + } +#endif + + /* + * Load the VCPU's VMCS as the current (and active) one. + */ + Assert(pVCpu->hm.s.vmx.fVmcsState & HMVMX_VMCS_STATE_CLEAR); + int rc = VMXActivateVmcs(pVCpu->hm.s.vmx.HCPhysVmcs); + if (RT_SUCCESS(rc)) + { + pVCpu->hm.s.vmx.fVmcsState = HMVMX_VMCS_STATE_ACTIVE; + pVCpu->hm.s.fLeaveDone = false; + Log4Func(("Activated Vmcs. HostCpuId=%u\n", RTMpCpuId())); + + /* + * Do the EMT scheduled L1D flush here if needed. 
+ */ + if (pVCpu->CTX_SUFF(pVM)->hm.s.fL1dFlushOnSched) + ASMWrMsr(MSR_IA32_FLUSH_CMD, MSR_IA32_FLUSH_CMD_F_L1D); + } + return rc; +} + + +/** + * The thread-context callback (only on platforms which support it). + * + * @param enmEvent The thread-context event. + * @param pVCpu The cross context virtual CPU structure. + * @param fGlobalInit Whether global VT-x/AMD-V init. was used. + * @thread EMT(pVCpu) + */ +VMMR0DECL(void) VMXR0ThreadCtxCallback(RTTHREADCTXEVENT enmEvent, PVMCPU pVCpu, bool fGlobalInit) +{ + NOREF(fGlobalInit); + + switch (enmEvent) + { + case RTTHREADCTXEVENT_OUT: + { + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(VMMR0ThreadCtxHookIsEnabled(pVCpu)); + VMCPU_ASSERT_EMT(pVCpu); + + /* No longjmps (logger flushes, locks) in this fragile context. */ + VMMRZCallRing3Disable(pVCpu); + Log4Func(("Preempting: HostCpuId=%u\n", RTMpCpuId())); + + /* + * Restore host-state (FPU, debug etc.) + */ + if (!pVCpu->hm.s.fLeaveDone) + { + /* + * Do -not- import the guest-state here as we might already be in the middle of importing + * it, esp. bad if we're holding the PGM lock, see comment in hmR0VmxImportGuestState(). + */ + hmR0VmxLeave(pVCpu, false /* fImportState */); + pVCpu->hm.s.fLeaveDone = true; + } + + /* Leave HM context, takes care of local init (term). */ + int rc = HMR0LeaveCpu(pVCpu); + AssertRC(rc); NOREF(rc); + + /* Restore longjmp state. */ + VMMRZCallRing3Enable(pVCpu); + STAM_REL_COUNTER_INC(&pVCpu->hm.s.StatSwitchPreempt); + break; + } + + case RTTHREADCTXEVENT_IN: + { + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(VMMR0ThreadCtxHookIsEnabled(pVCpu)); + VMCPU_ASSERT_EMT(pVCpu); + + /* No longjmps here, as we don't want to trigger preemption (& its hook) while resuming. */ + VMMRZCallRing3Disable(pVCpu); + Log4Func(("Resumed: HostCpuId=%u\n", RTMpCpuId())); + + /* Initialize the bare minimum state required for HM. This takes care of + initializing VT-x if necessary (onlined CPUs, local init etc.) 
*/ + int rc = hmR0EnterCpu(pVCpu); + AssertRC(rc); + Assert((pVCpu->hm.s.fCtxChanged & (HM_CHANGED_HOST_CONTEXT | HM_CHANGED_VMX_HOST_GUEST_SHARED_STATE)) + == (HM_CHANGED_HOST_CONTEXT | HM_CHANGED_VMX_HOST_GUEST_SHARED_STATE)); + + /* Load the active VMCS as the current one. */ + if (pVCpu->hm.s.vmx.fVmcsState & HMVMX_VMCS_STATE_CLEAR) + { + rc = VMXActivateVmcs(pVCpu->hm.s.vmx.HCPhysVmcs); + AssertRC(rc); NOREF(rc); + pVCpu->hm.s.vmx.fVmcsState = HMVMX_VMCS_STATE_ACTIVE; + Log4Func(("Resumed: Activated Vmcs. HostCpuId=%u\n", RTMpCpuId())); + } + pVCpu->hm.s.fLeaveDone = false; + + /* Do the EMT scheduled L1D flush if needed. */ + if (pVCpu->CTX_SUFF(pVM)->hm.s.fL1dFlushOnSched) + ASMWrMsr(MSR_IA32_FLUSH_CMD, MSR_IA32_FLUSH_CMD_F_L1D); + + /* Restore longjmp state. */ + VMMRZCallRing3Enable(pVCpu); + break; + } + + default: + break; + } +} + + +/** + * Exports the host state into the VMCS host-state area. + * Sets up the VM-exit MSR-load area. + * + * The CPU state will be loaded from these fields on every successful VM-exit. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxExportHostState(PVMCPU pVCpu) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + int rc = VINF_SUCCESS; + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_HOST_CONTEXT) + { + rc = hmR0VmxExportHostControlRegs(); + AssertLogRelMsgRCReturn(rc, ("rc=%Rrc\n", rc), rc); + + rc = hmR0VmxExportHostSegmentRegs(pVCpu); + AssertLogRelMsgRCReturn(rc, ("rc=%Rrc\n", rc), rc); + + rc = hmR0VmxExportHostMsrs(pVCpu); + AssertLogRelMsgRCReturn(rc, ("rc=%Rrc\n", rc), rc); + + pVCpu->hm.s.fCtxChanged &= ~HM_CHANGED_HOST_CONTEXT; + } + return rc; +} + + +/** + * Saves the host state in the VMCS host-state. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! 
+ */ +VMMR0DECL(int) VMXR0ExportHostState(PVMCPU pVCpu) +{ + AssertPtr(pVCpu); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + /* + * Export the host state here while entering HM context. + * When thread-context hooks are used, we might get preempted and have to re-save the host + * state but most of the time we won't be, so do it here before we disable interrupts. + */ + return hmR0VmxExportHostState(pVCpu); +} + + +/** + * Exports the guest state into the VMCS guest-state area. + * + * The will typically be done before VM-entry when the guest-CPU state and the + * VMCS state may potentially be out of sync. + * + * Sets up the VM-entry MSR-load and VM-exit MSR-store areas. Sets up the + * VM-entry controls. + * Sets up the appropriate VMX non-root function to execute guest code based on + * the guest CPU mode. + * + * @returns VBox strict status code. + * @retval VINF_EM_RESCHEDULE_REM if we try to emulate non-paged guest code + * without unrestricted guest access and the VMMDev is not presently + * mapped (e.g. EFI32). + * + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static VBOXSTRICTRC hmR0VmxExportGuestState(PVMCPU pVCpu) +{ + AssertPtr(pVCpu); + HMVMX_ASSERT_PREEMPT_SAFE(pVCpu); + + LogFlowFunc(("pVCpu=%p\n", pVCpu)); + + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatExportGuestState, x); + + /* Determine real-on-v86 mode. */ + pVCpu->hm.s.vmx.RealMode.fRealOnV86Active = false; + if ( !pVCpu->CTX_SUFF(pVM)->hm.s.vmx.fUnrestrictedGuest + && CPUMIsGuestInRealModeEx(&pVCpu->cpum.GstCtx)) + pVCpu->hm.s.vmx.RealMode.fRealOnV86Active = true; + + /* + * Any ordering dependency among the sub-functions below must be explicitly stated using comments. + * Ideally, assert that the cross-dependent bits are up-to-date at the point of using it. 
+ */ + int rc = hmR0VmxSelectVMRunHandler(pVCpu); + AssertLogRelMsgRCReturn(rc, ("rc=%Rrc\n", rc), rc); + + /* This needs to be done after hmR0VmxSelectVMRunHandler() as changing pfnStartVM may require VM-entry control updates. */ + rc = hmR0VmxExportGuestEntryCtls(pVCpu); + AssertLogRelMsgRCReturn(rc, ("rc=%Rrc\n", rc), rc); + + /* This needs to be done after hmR0VmxSelectVMRunHandler() as changing pfnStartVM may require VM-exit control updates. */ + rc = hmR0VmxExportGuestExitCtls(pVCpu); + AssertLogRelMsgRCReturn(rc, ("rc=%Rrc\n", rc), rc); + + rc = hmR0VmxExportGuestCR0(pVCpu); + AssertLogRelMsgRCReturn(rc, ("rc=%Rrc\n", rc), rc); + + VBOXSTRICTRC rcStrict = hmR0VmxExportGuestCR3AndCR4(pVCpu); + if (rcStrict == VINF_SUCCESS) + { /* likely */ } + else + { + Assert(rcStrict == VINF_EM_RESCHEDULE_REM || RT_FAILURE_NP(rcStrict)); + return rcStrict; + } + + rc = hmR0VmxExportGuestSegmentRegs(pVCpu); + AssertLogRelMsgRCReturn(rc, ("rc=%Rrc\n", rc), rc); + + /* This needs to be done after hmR0VmxExportGuestEntryCtls() and hmR0VmxExportGuestExitCtls() as it + may alter controls if we determine we don't have to swap EFER after all. */ + rc = hmR0VmxExportGuestMsrs(pVCpu); + AssertLogRelMsgRCReturn(rc, ("rc=%Rrc\n", rc), rc); + + rc = hmR0VmxExportGuestApicTpr(pVCpu); + AssertLogRelMsgRCReturn(rc, ("rc=%Rrc\n", rc), rc); + + rc = hmR0VmxExportGuestXcptIntercepts(pVCpu); + AssertLogRelMsgRCReturn(rc, ("rc=%Rrc\n", rc), rc); + + rc = hmR0VmxExportGuestRip(pVCpu); + rc |= hmR0VmxExportGuestRsp(pVCpu); + rc |= hmR0VmxExportGuestRflags(pVCpu); + AssertLogRelMsgRCReturn(rc, ("rc=%Rrc\n", rc), rc); + + /* Clear any bits that may be set but exported unconditionally or unused/reserved bits. 
*/ + ASMAtomicUoAndU64(&pVCpu->hm.s.fCtxChanged, ~( (HM_CHANGED_GUEST_GPRS_MASK & ~HM_CHANGED_GUEST_RSP) + | HM_CHANGED_GUEST_CR2 + | (HM_CHANGED_GUEST_DR_MASK & ~HM_CHANGED_GUEST_DR7) + | HM_CHANGED_GUEST_X87 + | HM_CHANGED_GUEST_SSE_AVX + | HM_CHANGED_GUEST_OTHER_XSAVE + | HM_CHANGED_GUEST_XCRx + | HM_CHANGED_GUEST_KERNEL_GS_BASE /* Part of lazy or auto load-store MSRs. */ + | HM_CHANGED_GUEST_SYSCALL_MSRS /* Part of lazy or auto load-store MSRs. */ + | HM_CHANGED_GUEST_TSC_AUX + | HM_CHANGED_GUEST_OTHER_MSRS + | HM_CHANGED_GUEST_HWVIRT + | (HM_CHANGED_KEEPER_STATE_MASK & ~HM_CHANGED_VMX_MASK))); + + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExportGuestState, x); + return rc; +} + + +/** + * Exports the state shared between the host and guest into the VMCS. + * + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static void hmR0VmxExportSharedState(PVMCPU pVCpu) +{ + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_DR_MASK) + { + int rc = hmR0VmxExportSharedDebugState(pVCpu); + AssertRC(rc); + pVCpu->hm.s.fCtxChanged &= ~HM_CHANGED_GUEST_DR_MASK; + + /* Loading shared debug bits might have changed eflags.TF bit for debugging purposes. */ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_GUEST_RFLAGS) + { + rc = hmR0VmxExportGuestRflags(pVCpu); + AssertRC(rc); + } + } + + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_VMX_GUEST_LAZY_MSRS) + { + hmR0VmxLazyLoadGuestMsrs(pVCpu); + pVCpu->hm.s.fCtxChanged &= ~HM_CHANGED_VMX_GUEST_LAZY_MSRS; + } + + AssertMsg(!(pVCpu->hm.s.fCtxChanged & HM_CHANGED_VMX_HOST_GUEST_SHARED_STATE), + ("fCtxChanged=%#RX64\n", pVCpu->hm.s.fCtxChanged)); +} + + +/** + * Worker for loading the guest-state bits in the inner VT-x execution loop. + * + * @returns Strict VBox status code (i.e. informational status codes too). 
+ * @retval VINF_EM_RESCHEDULE_REM if we try to emulate non-paged guest code + * without unrestricted guest access and the VMMDev is not presently + * mapped (e.g. EFI32). + * + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks No-long-jump zone!!! + */ +static VBOXSTRICTRC hmR0VmxExportGuestStateOptimal(PVMCPU pVCpu) +{ + HMVMX_ASSERT_PREEMPT_SAFE(pVCpu); + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + Assert(VMMR0IsLogFlushDisabled(pVCpu)); + +#ifdef HMVMX_ALWAYS_SYNC_FULL_GUEST_STATE + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); +#endif + + /* + * For many exits it's only RIP that changes and hence try to export it first + * without going through a lot of change flag checks. + */ + VBOXSTRICTRC rcStrict; + uint64_t fCtxChanged = ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged); + RT_UNTRUSTED_NONVOLATILE_COPY_FENCE(); + if ((fCtxChanged & (HM_CHANGED_ALL_GUEST & ~HM_CHANGED_VMX_HOST_GUEST_SHARED_STATE)) == HM_CHANGED_GUEST_RIP) + { + rcStrict = hmR0VmxExportGuestRip(pVCpu); + if (RT_LIKELY(rcStrict == VINF_SUCCESS)) + { /* likely */} + else + AssertMsgFailedReturn(("hmR0VmxExportGuestRip failed! rc=%Rrc\n", VBOXSTRICTRC_VAL(rcStrict)), rcStrict); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExportMinimal); + } + else if (fCtxChanged & (HM_CHANGED_ALL_GUEST & ~HM_CHANGED_VMX_HOST_GUEST_SHARED_STATE)) + { + rcStrict = hmR0VmxExportGuestState(pVCpu); + if (RT_LIKELY(rcStrict == VINF_SUCCESS)) + { /* likely */} + else + { + AssertMsg(rcStrict == VINF_EM_RESCHEDULE_REM, ("hmR0VmxExportGuestState failed! rc=%Rrc\n", + VBOXSTRICTRC_VAL(rcStrict))); + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + return rcStrict; + } + STAM_COUNTER_INC(&pVCpu->hm.s.StatExportFull); + } + else + rcStrict = VINF_SUCCESS; + +#ifdef VBOX_STRICT + /* All the guest state bits should be loaded except maybe the host context and/or the shared host/guest bits. 
*/ + fCtxChanged = ASMAtomicUoReadU64(&pVCpu->hm.s.fCtxChanged); + RT_UNTRUSTED_NONVOLATILE_COPY_FENCE(); + AssertMsg(!(fCtxChanged & (HM_CHANGED_ALL_GUEST & ~HM_CHANGED_VMX_HOST_GUEST_SHARED_STATE)), + ("fCtxChanged=%#RX64\n", fCtxChanged)); +#endif + return rcStrict; +} + + +/** + * Does the preparations before executing guest code in VT-x. + * + * This may cause longjmps to ring-3 and may even result in rescheduling to the + * recompiler/IEM. We must be cautious what we do here regarding committing + * guest-state information into the VMCS assuming we assuredly execute the + * guest in VT-x mode. + * + * If we fall back to the recompiler/IEM after updating the VMCS and clearing + * the common-state (TRPM/forceflags), we must undo those changes so that the + * recompiler/IEM can (and should) use them when it resumes guest execution. + * Otherwise such operations must be done when we can no longer exit to ring-3. + * + * @returns Strict VBox status code (i.e. informational status codes too). + * @retval VINF_SUCCESS if we can proceed with running the guest, interrupts + * have been disabled. + * @retval VINF_EM_RESET if a triple-fault occurs while injecting a + * double-fault into the guest. + * @retval VINF_EM_DBG_STEPPED if @a fStepping is true and an event was + * dispatched directly. + * @retval VINF_* scheduling changes, we have to go back to ring-3. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmxTransient Pointer to the VMX transient structure. + * @param fStepping Set if called from hmR0VmxRunGuestCodeStep(). Makes + * us ignore some of the reasons for returning to + * ring-3, and return VINF_EM_DBG_STEPPED if event + * dispatching took place. 
+ */ +static VBOXSTRICTRC hmR0VmxPreRunGuest(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient, bool fStepping) +{ + Assert(VMMRZCallRing3IsEnabled(pVCpu)); + +#ifdef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM + if (CPUMIsGuestInVmxNonRootMode(&pVCpu->cpum.GstCtx)) + { + Log2(("hmR0VmxPreRunGuest: Rescheduling to IEM due to nested-hwvirt or forced IEM exec -> VINF_EM_RESCHEDULE_REM\n")); + RT_NOREF3(pVCpu, pVmxTransient, fStepping); + return VINF_EM_RESCHEDULE_REM; + } +#endif + +#ifdef VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0 + PGMRZDynMapFlushAutoSet(pVCpu); +#endif + + /* Check force flag actions that might require us to go back to ring-3. */ + VBOXSTRICTRC rcStrict = hmR0VmxCheckForceFlags(pVCpu, fStepping); + if (rcStrict == VINF_SUCCESS) + { /* FFs doesn't get set all the time. */ } + else + return rcStrict; + + /* + * Setup the virtualized-APIC accesses. + * + * Note! This can cause a longjumps to R3 due to the acquisition of the PGM lock + * in both PGMHandlerPhysicalReset() and IOMMMIOMapMMIOHCPage(), see @bugref{8721}. + * + * This is the reason we do it here and not in hmR0VmxExportGuestState(). + */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + if ( !pVCpu->hm.s.vmx.u64MsrApicBase + && (pVCpu->hm.s.vmx.u32ProcCtls2 & VMX_PROC_CTLS2_VIRT_APIC_ACCESS) + && PDMHasApic(pVM)) + { + uint64_t const u64MsrApicBase = APICGetBaseMsrNoCheck(pVCpu); + Assert(u64MsrApicBase); + Assert(pVM->hm.s.vmx.HCPhysApicAccess); + + RTGCPHYS const GCPhysApicBase = u64MsrApicBase & PAGE_BASE_GC_MASK; + + /* Unalias any existing mapping. */ + int rc = PGMHandlerPhysicalReset(pVM, GCPhysApicBase); + AssertRCReturn(rc, rc); + + /* Map the HC APIC-access page in place of the MMIO page, also updates the shadow page tables if necessary. */ + Log4Func(("Mapped HC APIC-access page at %#RGp\n", GCPhysApicBase)); + rc = IOMMMIOMapMMIOHCPage(pVM, pVCpu, GCPhysApicBase, pVM->hm.s.vmx.HCPhysApicAccess, X86_PTE_RW | X86_PTE_P); + AssertRCReturn(rc, rc); + + /* Update the per-VCPU cache of the APIC base MSR. 
*/ + pVCpu->hm.s.vmx.u64MsrApicBase = u64MsrApicBase; + } + + if (TRPMHasTrap(pVCpu)) + hmR0VmxTrpmTrapToPendingEvent(pVCpu); + uint32_t fIntrState = hmR0VmxEvaluatePendingEvent(pVCpu); + + /* + * Event injection may take locks (currently the PGM lock for real-on-v86 case) and thus + * needs to be done with longjmps or interrupts + preemption enabled. Event injection might + * also result in triple-faulting the VM. + */ + rcStrict = hmR0VmxInjectPendingEvent(pVCpu, fIntrState, fStepping); + if (RT_LIKELY(rcStrict == VINF_SUCCESS)) + { /* likely */ } + else + { + AssertMsg(rcStrict == VINF_EM_RESET || (rcStrict == VINF_EM_DBG_STEPPED && fStepping), + ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); + return rcStrict; + } + + /* + * A longjump might result in importing CR3 even for VM-exits that don't necessarily + * import CR3 themselves. We will need to update them here, as even as late as the above + * hmR0VmxInjectPendingEvent() call may lazily import guest-CPU state on demand causing + * the below force flags to be set. + */ + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_CR3)) + { + Assert(!(ASMAtomicUoReadU64(&pVCpu->cpum.GstCtx.fExtrn) & CPUMCTX_EXTRN_CR3)); + int rc2 = PGMUpdateCR3(pVCpu, CPUMGetGuestCR3(pVCpu)); + AssertMsgReturn(rc2 == VINF_SUCCESS || rc2 == VINF_PGM_SYNC_CR3, + ("%Rrc\n", rc2), RT_FAILURE_NP(rc2) ? rc2 : VERR_IPE_UNEXPECTED_INFO_STATUS); + Assert(!VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_CR3)); + } + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_PAE_PDPES)) + { + PGMGstUpdatePaePdpes(pVCpu, &pVCpu->hm.s.aPdpes[0]); + Assert(!VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_PAE_PDPES)); + } + + /* + * No longjmps to ring-3 from this point on!!! + * Asserts() will still longjmp to ring-3 (but won't return), which is intentional, better than a kernel panic. + * This also disables flushing of the R0-logger instance (if any). + */ + VMMRZCallRing3Disable(pVCpu); + + /* + * Export the guest state bits. 
+ * + * We cannot perform longjmps while loading the guest state because we do not preserve the + * host/guest state (although the VMCS will be preserved) across longjmps which can cause + * CPU migration. + * + * If we are injecting events to a real-on-v86 mode guest, we will have to update + * RIP and some segment registers, i.e. hmR0VmxInjectPendingEvent()->hmR0VmxInjectEventVmcs(). + * Hence, loading of the guest state needs to be done -after- injection of events. + */ + rcStrict = hmR0VmxExportGuestStateOptimal(pVCpu); + if (RT_LIKELY(rcStrict == VINF_SUCCESS)) + { /* likely */ } + else + { + VMMRZCallRing3Enable(pVCpu); + return rcStrict; + } + + /* + * We disable interrupts so that we don't miss any interrupts that would flag preemption + * (IPI/timers etc.) when thread-context hooks aren't used and we've been running with + * preemption disabled for a while. Since this is purly to aid the + * RTThreadPreemptIsPending() code, it doesn't matter that it may temporarily reenable and + * disable interrupt on NT. + * + * We need to check for force-flags that could've possible been altered since we last + * checked them (e.g. by PDMGetInterrupt() leaving the PDM critical section, + * see @bugref{6398}). + * + * We also check a couple of other force-flags as a last opportunity to get the EMT back + * to ring-3 before executing guest code. + */ + pVmxTransient->fEFlags = ASMIntDisableFlags(); + + if ( ( !VM_FF_IS_ANY_SET(pVM, VM_FF_EMT_RENDEZVOUS | VM_FF_TM_VIRTUAL_SYNC) + && !VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_HM_TO_R3_MASK)) + || ( fStepping /* Optimized for the non-stepping case, so a bit of unnecessary work when stepping. */ + && !VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_HM_TO_R3_MASK & ~(VMCPU_FF_TIMER | VMCPU_FF_PDM_CRITSECT))) ) + { + if (!RTThreadPreemptIsPending(NIL_RTTHREAD)) + { + pVCpu->hm.s.Event.fPending = false; + + /* + * We've injected any pending events. This is really the point of no return (to ring-3). + * + * Note! 
The caller expects to continue with interrupts & longjmps disabled on successful + * returns from this function, so don't enable them here. + */ + return VINF_SUCCESS; + } + + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchPendingHostIrq); + rcStrict = VINF_EM_RAW_INTERRUPT; + } + else + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchHmToR3FF); + rcStrict = VINF_EM_RAW_TO_R3; + } + + ASMSetFlags(pVmxTransient->fEFlags); + VMMRZCallRing3Enable(pVCpu); + + return rcStrict; +} + + +/** + * Prepares to run guest code in VT-x and we've committed to doing so. This + * means there is no backing out to ring-3 or anywhere else at this + * point. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmxTransient Pointer to the VMX transient structure. + * + * @remarks Called with preemption disabled. + * @remarks No-long-jump zone!!! + */ +static void hmR0VmxPreRunGuestCommitted(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + Assert(VMMR0IsLogFlushDisabled(pVCpu)); + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + + /* + * Indicate start of guest execution and where poking EMT out of guest-context is recognized. + */ + VMCPU_ASSERT_STATE(pVCpu, VMCPUSTATE_STARTED_HM); + VMCPU_SET_STATE(pVCpu, VMCPUSTATE_STARTED_EXEC); + + PVM pVM = pVCpu->CTX_SUFF(pVM); + if (!CPUMIsGuestFPUStateActive(pVCpu)) + { + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatLoadGuestFpuState, x); + if (CPUMR0LoadGuestFPU(pVM, pVCpu) == VINF_CPUM_HOST_CR0_MODIFIED) + pVCpu->hm.s.fCtxChanged |= HM_CHANGED_HOST_CONTEXT; + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatLoadGuestFpuState, x); + STAM_COUNTER_INC(&pVCpu->hm.s.StatLoadGuestFpu); + } + + /* + * Lazy-update of the host MSRs values in the auto-load/store MSR area. 
+ */ + if ( !pVCpu->hm.s.vmx.fUpdatedHostMsrs + && pVCpu->hm.s.vmx.cMsrs > 0) + hmR0VmxUpdateAutoLoadStoreHostMsrs(pVCpu); + + /* + * Re-save the host state bits as we may've been preempted (only happens when + * thread-context hooks are used or when hmR0VmxSetupVMRunHandler() changes pfnStartVM). + * Note that the 64-on-32 switcher saves the (64-bit) host state into the VMCS and + * if we change the switcher back to 32-bit, we *must* save the 32-bit host state here. + * See @bugref{8432}. + */ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_HOST_CONTEXT) + { + int rc = hmR0VmxExportHostState(pVCpu); + AssertRC(rc); + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchPreemptExportHostState); + } + Assert(!(pVCpu->hm.s.fCtxChanged & HM_CHANGED_HOST_CONTEXT)); + + /* + * Export the state shared between host and guest (FPU, debug, lazy MSRs). + */ + if (pVCpu->hm.s.fCtxChanged & HM_CHANGED_VMX_HOST_GUEST_SHARED_STATE) + hmR0VmxExportSharedState(pVCpu); + AssertMsg(!pVCpu->hm.s.fCtxChanged, ("fCtxChanged=%#RX64\n", pVCpu->hm.s.fCtxChanged)); + + /* Store status of the shared guest-host state at the time of VM-entry. */ +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + if (CPUMIsGuestInLongModeEx(&pVCpu->cpum.GstCtx)) + { + pVmxTransient->fWasGuestDebugStateActive = CPUMIsGuestDebugStateActivePending(pVCpu); + pVmxTransient->fWasHyperDebugStateActive = CPUMIsHyperDebugStateActivePending(pVCpu); + } + else +#endif + { + pVmxTransient->fWasGuestDebugStateActive = CPUMIsGuestDebugStateActive(pVCpu); + pVmxTransient->fWasHyperDebugStateActive = CPUMIsHyperDebugStateActive(pVCpu); + } + + /* + * Cache the TPR-shadow for checking on every VM-exit if it might have changed. 
+ */ + if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_TPR_SHADOW) + pVmxTransient->u8GuestTpr = pVCpu->hm.s.vmx.pbVirtApic[XAPIC_OFF_TPR]; + + PHMPHYSCPU pHostCpu = hmR0GetCurrentCpu(); + RTCPUID idCurrentCpu = pHostCpu->idCpu; + if ( pVmxTransient->fUpdateTscOffsettingAndPreemptTimer + || idCurrentCpu != pVCpu->hm.s.idLastCpu) + { + hmR0VmxUpdateTscOffsettingAndPreemptTimer(pVCpu); + pVmxTransient->fUpdateTscOffsettingAndPreemptTimer = false; + } + + ASMAtomicWriteBool(&pVCpu->hm.s.fCheckedTLBFlush, true); /* Used for TLB flushing, set this across the world switch. */ + hmR0VmxFlushTaggedTlb(pHostCpu, pVCpu); /* Invalidate the appropriate guest entries from the TLB. */ + Assert(idCurrentCpu == pVCpu->hm.s.idLastCpu); + pVCpu->hm.s.vmx.LastError.idCurrentCpu = idCurrentCpu; /* Update the error reporting info. with the current host CPU. */ + + STAM_PROFILE_ADV_STOP_START(&pVCpu->hm.s.StatEntry, &pVCpu->hm.s.StatInGC, x); + + TMNotifyStartOfExecution(pVCpu); /* Finally, notify TM to resume its clocks as we're about + to start executing. */ + + /* + * Load the TSC_AUX MSR when we are not intercepting RDTSCP. + */ + if (pVCpu->hm.s.vmx.u32ProcCtls2 & VMX_PROC_CTLS2_RDTSCP) + { + if (!(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_RDTSC_EXIT)) + { + bool fMsrUpdated; + hmR0VmxImportGuestState(pVCpu, CPUMCTX_EXTRN_TSC_AUX); + int rc2 = hmR0VmxAddAutoLoadStoreMsr(pVCpu, MSR_K8_TSC_AUX, CPUMGetGuestTscAux(pVCpu), true /* fUpdateHostMsr */, + &fMsrUpdated); + AssertRC(rc2); + Assert(fMsrUpdated || pVCpu->hm.s.vmx.fUpdatedHostMsrs); + /* Finally, mark that all host MSR values are updated so we don't redo it without leaving VT-x. See @bugref{6956}. 
*/ + pVCpu->hm.s.vmx.fUpdatedHostMsrs = true; + } + else + { + hmR0VmxRemoveAutoLoadStoreMsr(pVCpu, MSR_K8_TSC_AUX); + Assert(!pVCpu->hm.s.vmx.cMsrs || pVCpu->hm.s.vmx.fUpdatedHostMsrs); + } + } + + if (pVM->cpum.ro.GuestFeatures.fIbrs) + { + bool fMsrUpdated; + hmR0VmxImportGuestState(pVCpu, CPUMCTX_EXTRN_OTHER_MSRS); + int rc2 = hmR0VmxAddAutoLoadStoreMsr(pVCpu, MSR_IA32_SPEC_CTRL, CPUMGetGuestSpecCtrl(pVCpu), true /* fUpdateHostMsr */, + &fMsrUpdated); + AssertRC(rc2); + Assert(fMsrUpdated || pVCpu->hm.s.vmx.fUpdatedHostMsrs); + /* Finally, mark that all host MSR values are updated so we don't redo it without leaving VT-x. See @bugref{6956}. */ + pVCpu->hm.s.vmx.fUpdatedHostMsrs = true; + } + +#ifdef VBOX_STRICT + hmR0VmxCheckAutoLoadStoreMsrs(pVCpu); + hmR0VmxCheckHostEferMsr(pVCpu); + AssertRC(hmR0VmxCheckVmcsCtls(pVCpu)); +#endif +#ifdef HMVMX_ALWAYS_CHECK_GUEST_STATE + if (!(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_MSR_BITMAPS)) + { + uint32_t uInvalidReason = hmR0VmxCheckGuestState(pVCpu); + if (uInvalidReason != VMX_IGS_REASON_NOT_FOUND) + Log4(("hmR0VmxCheckGuestState returned %#x\n", uInvalidReason)); + } +#endif +} + + +/** + * Performs some essential restoration of state after running guest code in + * VT-x. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pVmxTransient Pointer to the VMX transient structure. + * @param rcVMRun Return code of VMLAUNCH/VMRESUME. + * + * @remarks Called with interrupts disabled, and returns with interrupts enabled! + * + * @remarks No-long-jump zone!!! This function will however re-enable longjmps + * unconditionally when it is safe to do so. + */ +static void hmR0VmxPostRunGuest(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient, int rcVMRun) +{ + uint64_t const uHostTsc = ASMReadTSC(); + Assert(!VMMRZCallRing3IsEnabled(pVCpu)); + + ASMAtomicWriteBool(&pVCpu->hm.s.fCheckedTLBFlush, false); /* See HMInvalidatePageOnAllVCpus(): used for TLB flushing. 
*/ + ASMAtomicIncU32(&pVCpu->hm.s.cWorldSwitchExits); /* Initialized in vmR3CreateUVM(): used for EMT poking. */ + pVCpu->hm.s.fCtxChanged = 0; /* Exits/longjmps to ring-3 require saving the guest state. */ + pVmxTransient->fVmcsFieldsRead = 0; /* Transient fields need to be read from the VMCS. */ + pVmxTransient->fVectoringPF = false; /* Vectoring page-fault needs to be determined later. */ + pVmxTransient->fVectoringDoublePF = false; /* Vectoring double page-fault needs to be determined later. */ + + if (!(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_RDTSC_EXIT)) + TMCpuTickSetLastSeen(pVCpu, uHostTsc + pVCpu->hm.s.vmx.u64TscOffset); + + STAM_PROFILE_ADV_STOP_START(&pVCpu->hm.s.StatInGC, &pVCpu->hm.s.StatPreExit, x); + TMNotifyEndOfExecution(pVCpu); /* Notify TM that the guest is no longer running. */ + Assert(!ASMIntAreEnabled()); + VMCPU_SET_STATE(pVCpu, VMCPUSTATE_STARTED_HM); + +#if HC_ARCH_BITS == 64 + pVCpu->hm.s.vmx.fRestoreHostFlags |= VMX_RESTORE_HOST_REQUIRED; /* Host state messed up by VT-x, we must restore. */ +#endif +#if HC_ARCH_BITS == 32 && defined(VBOX_ENABLE_64_BITS_GUESTS) + /* The 64-on-32 switcher maintains fVmcsState on its own and we need to leave it alone here. */ + if (pVCpu->hm.s.vmx.pfnStartVM != VMXR0SwitcherStartVM64) + pVCpu->hm.s.vmx.fVmcsState |= HMVMX_VMCS_STATE_LAUNCHED; /* Use VMRESUME instead of VMLAUNCH in the next run. */ +#else + pVCpu->hm.s.vmx.fVmcsState |= HMVMX_VMCS_STATE_LAUNCHED; /* Use VMRESUME instead of VMLAUNCH in the next run. */ +#endif +#ifdef VBOX_STRICT + hmR0VmxCheckHostEferMsr(pVCpu); /* Verify that VMLAUNCH/VMRESUME didn't modify host EFER. */ +#endif + ASMSetFlags(pVmxTransient->fEFlags); /* Enable interrupts. */ + + /* Save the basic VM-exit reason. Refer Intel spec. 24.9.1 "Basic VM-exit Information". 
*/ + uint32_t uExitReason; + int rc = VMXReadVmcs32(VMX_VMCS32_RO_EXIT_REASON, &uExitReason); + rc |= hmR0VmxReadEntryIntInfoVmcs(pVmxTransient); + AssertRC(rc); + pVmxTransient->uExitReason = VMX_EXIT_REASON_BASIC(uExitReason); + pVmxTransient->fVMEntryFailed = VMX_EXIT_REASON_HAS_ENTRY_FAILED(uExitReason); + + if (rcVMRun == VINF_SUCCESS) + { + /* + * Update the VM-exit history array here even if the VM-entry failed due to: + * - Invalid guest state. + * - MSR loading. + * - Machine-check event. + * + * In any of the above cases we will still have a "valid" VM-exit reason + * despite @a fVMEntryFailed being true. + * + * See Intel spec. 26.7 "VM-Entry failures during or after loading guest state". + * + * Note! We don't have CS or RIP at this point. Will probably address that later + * by amending the history entry added here. + */ + EMHistoryAddExit(pVCpu, EMEXIT_MAKE_FT(EMEXIT_F_KIND_VMX, pVmxTransient->uExitReason & EMEXIT_F_TYPE_MASK), + UINT64_MAX, uHostTsc); + + if (!pVmxTransient->fVMEntryFailed) + { + VMMRZCallRing3Enable(pVCpu); + + Assert(!VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_CR3)); + Assert(!VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_HM_UPDATE_PAE_PDPES)); + +#if defined(HMVMX_ALWAYS_SYNC_FULL_GUEST_STATE) || defined(HMVMX_ALWAYS_SAVE_FULL_GUEST_STATE) + rc = hmR0VmxImportGuestState(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL); + AssertRC(rc); +#elif defined(HMVMX_ALWAYS_SAVE_GUEST_RFLAGS) + rc = hmR0VmxImportGuestState(pVCpu, HMVMX_CPUMCTX_EXTRN_RFLAGS); + AssertRC(rc); +#else + /* + * Import the guest-interruptibility state always as we need it while evaluating + * injecting events on re-entry. + * + * We don't import CR0 (when Unrestricted guest execution is unavailable) despite + * checking for real-mode while exporting the state because all bits that cause + * mode changes wrt CR0 are intercepted. + */ + rc = hmR0VmxImportGuestState(pVCpu, CPUMCTX_EXTRN_HM_VMX_INT_STATE); + AssertRC(rc); +#endif + + /* + * Sync the TPR shadow with our APIC state. 
+ */ + if ( (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_TPR_SHADOW) + && pVmxTransient->u8GuestTpr != pVCpu->hm.s.vmx.pbVirtApic[XAPIC_OFF_TPR]) + { + rc = APICSetTpr(pVCpu, pVCpu->hm.s.vmx.pbVirtApic[XAPIC_OFF_TPR]); + AssertRC(rc); + ASMAtomicOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_APIC_TPR); + } + + Assert(VMMRZCallRing3IsEnabled(pVCpu)); + return; + } + } + else + Log4Func(("VM-entry failure: rcVMRun=%Rrc fVMEntryFailed=%RTbool\n", rcVMRun, pVmxTransient->fVMEntryFailed)); + + VMMRZCallRing3Enable(pVCpu); +} + + +/** + * Runs the guest code using VT-x the normal way. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * + * @note Mostly the same as hmR0VmxRunGuestCodeStep(). + */ +static VBOXSTRICTRC hmR0VmxRunGuestCodeNormal(PVMCPU pVCpu) +{ + VMXTRANSIENT VmxTransient; + VmxTransient.fUpdateTscOffsettingAndPreemptTimer = true; + VBOXSTRICTRC rcStrict = VERR_INTERNAL_ERROR_5; + uint32_t cLoops = 0; + + for (;; cLoops++) + { + Assert(!HMR0SuspendPending()); + HMVMX_ASSERT_CPU_SAFE(pVCpu); + + /* Preparatory work for running guest code, this may force us to return + to ring-3. This bugger disables interrupts on VINF_SUCCESS! */ + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatEntry, x); + rcStrict = hmR0VmxPreRunGuest(pVCpu, &VmxTransient, false /* fStepping */); + if (rcStrict != VINF_SUCCESS) + break; + + hmR0VmxPreRunGuestCommitted(pVCpu, &VmxTransient); + int rcRun = hmR0VmxRunGuest(pVCpu); + + /* Restore any residual host-state and save any bits shared between host + and guest into the guest-CPU state. Re-enables interrupts! */ + hmR0VmxPostRunGuest(pVCpu, &VmxTransient, rcRun); + + /* Check for errors with running the VM (VMLAUNCH/VMRESUME). */ + if (RT_SUCCESS(rcRun)) + { /* very likely */ } + else + { + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatPreExit, x); + hmR0VmxReportWorldSwitchError(pVCpu, rcRun, &VmxTransient); + return rcRun; + } + + /* Profile the VM-exit. 
*/ + AssertMsg(VmxTransient.uExitReason <= VMX_EXIT_MAX, ("%#x\n", VmxTransient.uExitReason)); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitAll); + STAM_COUNTER_INC(&pVCpu->hm.s.paStatExitReasonR0[VmxTransient.uExitReason & MASK_EXITREASON_STAT]); + STAM_PROFILE_ADV_STOP_START(&pVCpu->hm.s.StatPreExit, &pVCpu->hm.s.StatExitHandling, x); + HMVMX_START_EXIT_DISPATCH_PROF(); + + VBOXVMM_R0_HMVMX_VMEXIT_NOCTX(pVCpu, &pVCpu->cpum.GstCtx, VmxTransient.uExitReason); + + /* Handle the VM-exit. */ +#ifdef HMVMX_USE_FUNCTION_TABLE + rcStrict = g_apfnVMExitHandlers[VmxTransient.uExitReason](pVCpu, &VmxTransient); +#else + rcStrict = hmR0VmxHandleExit(pVCpu, &VmxTransient, VmxTransient.uExitReason); +#endif + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExitHandling, x); + if (rcStrict == VINF_SUCCESS) + { + if (cLoops <= pVCpu->CTX_SUFF(pVM)->hm.s.cMaxResumeLoops) + continue; /* likely */ + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchMaxResumeLoops); + rcStrict = VINF_EM_RAW_INTERRUPT; + } + break; + } + + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatEntry, x); + return rcStrict; +} + + + +/** @name Execution loop for single stepping, DBGF events and expensive Dtrace + * probes. + * + * The following few functions and associated structure contain the bloat + * necessary for providing detailed debug events and dtrace probes as well as + * reliable host side single stepping. This works on the principle of + * "subclassing" the normal execution loop and workers. We replace the loop + * method completely and override selected helpers to add necessary adjustments + * to their core operation. + * + * The goal is to keep the "parent" code lean and mean, so as not to sacrifice + * any performance for debug and analysis features. + * + * @{ + */ + +/** + * Transient per-VCPU debug state of VMCS and related info that we save/restore in + * the debug run loop. + */ +typedef struct VMXRUNDBGSTATE +{ + /** The RIP we started executing at. This is for detecting that we stepped. 
*/ + uint64_t uRipStart; + /** The CS we started executing with. */ + uint16_t uCsStart; + + /** Whether we've actually modified the 1st execution control field. */ + bool fModifiedProcCtls : 1; + /** Whether we've actually modified the 2nd execution control field. */ + bool fModifiedProcCtls2 : 1; + /** Whether we've actually modified the exception bitmap. */ + bool fModifiedXcptBitmap : 1; + + /** Set when we want the CR0 mask to be cleared. */ + bool fClearCr0Mask : 1; + /** Set when we want the CR4 mask to be cleared. */ + bool fClearCr4Mask : 1; + /** Stuff we need in VMX_VMCS32_CTRL_PROC_EXEC. */ + uint32_t fCpe1Extra; + /** Stuff we do not want in VMX_VMCS32_CTRL_PROC_EXEC. */ + uint32_t fCpe1Unwanted; + /** Stuff we need in VMX_VMCS32_CTRL_PROC_EXEC2. */ + uint32_t fCpe2Extra; + /** Extra stuff we need in VMX_VMCS32_CTRL_EXCEPTION_BITMAP. */ + uint32_t bmXcptExtra; + /** The sequence number of the Dtrace provider settings the state was + * configured against. */ + uint32_t uDtraceSettingsSeqNo; + /** VM-exits to check (one bit per VM-exit). The compile-time assertion + * following the structure ties the array size to VMX_EXIT_MAX. */ + uint32_t bmExitsToCheck[3]; + + /** The initial VMX_VMCS32_CTRL_PROC_EXEC value (helps with restore). */ + uint32_t fProcCtlsInitial; + /** The initial VMX_VMCS32_CTRL_PROC_EXEC2 value (helps with restore). */ + uint32_t fProcCtls2Initial; + /** The initial VMX_VMCS32_CTRL_EXCEPTION_BITMAP value (helps with restore). */ + uint32_t bmXcptInitial; +} VMXRUNDBGSTATE; +AssertCompileMemberSize(VMXRUNDBGSTATE, bmExitsToCheck, (VMX_EXIT_MAX + 1 + 31) / 32 * 4); +typedef VMXRUNDBGSTATE *PVMXRUNDBGSTATE; + + +/** + * Initializes the VMXRUNDBGSTATE structure. + * + * @param pVCpu The cross context virtual CPU structure of the + * calling EMT. + * @param pDbgState The structure to initialize. 
+ */ +static void hmR0VmxRunDebugStateInit(PVMCPU pVCpu, PVMXRUNDBGSTATE pDbgState) +{ + pDbgState->uRipStart = pVCpu->cpum.GstCtx.rip; + pDbgState->uCsStart = pVCpu->cpum.GstCtx.cs.Sel; + + pDbgState->fModifiedProcCtls = false; + pDbgState->fModifiedProcCtls2 = false; + pDbgState->fModifiedXcptBitmap = false; + pDbgState->fClearCr0Mask = false; + pDbgState->fClearCr4Mask = false; + pDbgState->fCpe1Extra = 0; + pDbgState->fCpe1Unwanted = 0; + pDbgState->fCpe2Extra = 0; + pDbgState->bmXcptExtra = 0; + pDbgState->fProcCtlsInitial = pVCpu->hm.s.vmx.u32ProcCtls; + pDbgState->fProcCtls2Initial = pVCpu->hm.s.vmx.u32ProcCtls2; + pDbgState->bmXcptInitial = pVCpu->hm.s.vmx.u32XcptBitmap; +} + + +/** + * Updates the VMCS fields with changes requested by @a pDbgState. + * + * This is performed after hmR0VmxPreRunGuestDebugStateUpdate as well as + * immediately before executing guest code, i.e. when interrupts are disabled. + * We don't check status codes here as we cannot easily assert or return in the + * latter case. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pDbgState The debug state. + * + * @note Sets the matching fModified* flag in @a pDbgState for each control + * field or exception bitmap it rewrites; clearing the CR0/CR4 masks is + * not tracked by such a flag. + */ +static void hmR0VmxPreRunGuestDebugStateApply(PVMCPU pVCpu, PVMXRUNDBGSTATE pDbgState) +{ + /* + * Ensure desired flags in VMCS control fields are set. + * (Ignoring write failure here, as we're committed and it's just debug extras.) + * + * Note! We load the shadow CR0 & CR4 bits when we flag the clearing, so + * there should be no stale data in pCtx at this point. 
+ */ + if ( (pVCpu->hm.s.vmx.u32ProcCtls & pDbgState->fCpe1Extra) != pDbgState->fCpe1Extra + || (pVCpu->hm.s.vmx.u32ProcCtls & pDbgState->fCpe1Unwanted)) + { + pVCpu->hm.s.vmx.u32ProcCtls |= pDbgState->fCpe1Extra; + pVCpu->hm.s.vmx.u32ProcCtls &= ~pDbgState->fCpe1Unwanted; + VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, pVCpu->hm.s.vmx.u32ProcCtls); + Log6Func(("VMX_VMCS32_CTRL_PROC_EXEC: %#RX32\n", pVCpu->hm.s.vmx.u32ProcCtls)); + pDbgState->fModifiedProcCtls = true; + } + + if ((pVCpu->hm.s.vmx.u32ProcCtls2 & pDbgState->fCpe2Extra) != pDbgState->fCpe2Extra) + { + pVCpu->hm.s.vmx.u32ProcCtls2 |= pDbgState->fCpe2Extra; + VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC2, pVCpu->hm.s.vmx.u32ProcCtls2); + Log6Func(("VMX_VMCS32_CTRL_PROC_EXEC2: %#RX32\n", pVCpu->hm.s.vmx.u32ProcCtls2)); + pDbgState->fModifiedProcCtls2 = true; + } + + if ((pVCpu->hm.s.vmx.u32XcptBitmap & pDbgState->bmXcptExtra) != pDbgState->bmXcptExtra) + { + pVCpu->hm.s.vmx.u32XcptBitmap |= pDbgState->bmXcptExtra; + VMXWriteVmcs32(VMX_VMCS32_CTRL_EXCEPTION_BITMAP, pVCpu->hm.s.vmx.u32XcptBitmap); + Log6Func(("VMX_VMCS32_CTRL_EXCEPTION_BITMAP: %#RX32\n", pVCpu->hm.s.vmx.u32XcptBitmap)); + pDbgState->fModifiedXcptBitmap = true; + } + + if (pDbgState->fClearCr0Mask && pVCpu->hm.s.vmx.u32Cr0Mask != 0) + { + pVCpu->hm.s.vmx.u32Cr0Mask = 0; + VMXWriteVmcs32(VMX_VMCS_CTRL_CR0_MASK, 0); + Log6Func(("VMX_VMCS_CTRL_CR0_MASK: 0\n")); + } + + if (pDbgState->fClearCr4Mask && pVCpu->hm.s.vmx.u32Cr4Mask != 0) + { + pVCpu->hm.s.vmx.u32Cr4Mask = 0; + VMXWriteVmcs32(VMX_VMCS_CTRL_CR4_MASK, 0); + Log6Func(("VMX_VMCS_CTRL_CR4_MASK: 0\n")); + } +} + + +/** + * Restores VMCS fields that were changed by hmR0VmxPreRunGuestDebugStateApply for + * re-entry next time around. + * + * @returns Strict VBox status code (i.e. informational status codes too). + * @param pVCpu The cross context virtual CPU structure. + * @param pDbgState The debug state. + * @param rcStrict The return code from executing the guest using single + * stepping. 
+ */ +static VBOXSTRICTRC hmR0VmxRunDebugStateRevert(PVMCPU pVCpu, PVMXRUNDBGSTATE pDbgState, VBOXSTRICTRC rcStrict) +{ + /* + * Restore VM-exit control settings as we may not reenter this function the + * next time around. + */ + /* We reload the initial value, trigger what we can of recalculations the + next time around. From the looks of things, that's all that's required atm. */ + if (pDbgState->fModifiedProcCtls) + { + if (!(pDbgState->fProcCtlsInitial & VMX_PROC_CTLS_MOV_DR_EXIT) && CPUMIsHyperDebugStateActive(pVCpu)) + pDbgState->fProcCtlsInitial |= VMX_PROC_CTLS_MOV_DR_EXIT; /* Avoid assertion in hmR0VmxLeave */ + int rc2 = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, pDbgState->fProcCtlsInitial); + AssertRCReturn(rc2, rc2); + pVCpu->hm.s.vmx.u32ProcCtls = pDbgState->fProcCtlsInitial; + } + + /* We're currently the only ones messing with this one, so just restore the + cached value and reload the field. */ + if ( pDbgState->fModifiedProcCtls2 + && pVCpu->hm.s.vmx.u32ProcCtls2 != pDbgState->fProcCtls2Initial) + { + int rc2 = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC2, pDbgState->fProcCtls2Initial); + AssertRCReturn(rc2, rc2); + pVCpu->hm.s.vmx.u32ProcCtls2 = pDbgState->fProcCtls2Initial; + } + + /* If we've modified the exception bitmap, we restore it and trigger + reloading and partial recalculation the next time around. */ + if (pDbgState->fModifiedXcptBitmap) + pVCpu->hm.s.vmx.u32XcptBitmap = pDbgState->bmXcptInitial; + + return rcStrict; +} + + +/** + * Configures VM-exit controls for current DBGF and DTrace settings. + * + * This updates @a pDbgState and the VMCS execution control fields to reflect + * the necessary VM-exits demanded by DBGF and DTrace. + * + * @param pVCpu The cross context virtual CPU structure. + * @param pDbgState The debug state. + * @param pVmxTransient Pointer to the VMX transient structure. May update + * fUpdateTscOffsettingAndPreemptTimer. 
+ */ +static void hmR0VmxPreRunGuestDebugStateUpdate(PVMCPU pVCpu, PVMXRUNDBGSTATE pDbgState, PVMXTRANSIENT pVmxTransient) +{ + /* + * Take down the dtrace serial number so we can spot changes. + */ + pDbgState->uDtraceSettingsSeqNo = VBOXVMM_GET_SETTINGS_SEQ_NO(); + ASMCompilerBarrier(); + + /* + * We'll rebuild most of the middle block of data members (holding the + * current settings) as we go along here, so start by clearing it all. + */ + pDbgState->bmXcptExtra = 0; + pDbgState->fCpe1Extra = 0; + pDbgState->fCpe1Unwanted = 0; + pDbgState->fCpe2Extra = 0; + for (unsigned i = 0; i < RT_ELEMENTS(pDbgState->bmExitsToCheck); i++) + pDbgState->bmExitsToCheck[i] = 0; + + /* + * Software interrupts (INT XXh) - no idea how to trigger these... + */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + if ( DBGF_IS_EVENT_ENABLED(pVM, DBGFEVENT_INTERRUPT_SOFTWARE) + || VBOXVMM_INT_SOFTWARE_ENABLED()) + { + ASMBitSet(pDbgState->bmExitsToCheck, VMX_EXIT_XCPT_OR_NMI); + } + + /* + * INT3 breakpoints - triggered by #BP exceptions. + */ + if (pVM->dbgf.ro.cEnabledInt3Breakpoints > 0) + pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_BP); + + /* + * Exception bitmap and XCPT events+probes. 
+ */ + for (int iXcpt = 0; iXcpt < (DBGFEVENT_XCPT_LAST - DBGFEVENT_XCPT_FIRST + 1); iXcpt++) + if (DBGF_IS_EVENT_ENABLED(pVM, (DBGFEVENTTYPE)(DBGFEVENT_XCPT_FIRST + iXcpt))) + pDbgState->bmXcptExtra |= RT_BIT_32(iXcpt); + + if (VBOXVMM_XCPT_DE_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_DE); + if (VBOXVMM_XCPT_DB_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_DB); + if (VBOXVMM_XCPT_BP_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_BP); + if (VBOXVMM_XCPT_OF_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_OF); + if (VBOXVMM_XCPT_BR_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_BR); + if (VBOXVMM_XCPT_UD_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_UD); + if (VBOXVMM_XCPT_NM_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_NM); + if (VBOXVMM_XCPT_DF_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_DF); + if (VBOXVMM_XCPT_TS_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_TS); + if (VBOXVMM_XCPT_NP_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_NP); + if (VBOXVMM_XCPT_SS_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_SS); + if (VBOXVMM_XCPT_GP_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_GP); + if (VBOXVMM_XCPT_PF_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_PF); + if (VBOXVMM_XCPT_MF_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_MF); + if (VBOXVMM_XCPT_AC_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_AC); + if (VBOXVMM_XCPT_XF_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_XF); + if (VBOXVMM_XCPT_VE_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_VE); + if (VBOXVMM_XCPT_SX_ENABLED()) pDbgState->bmXcptExtra |= RT_BIT_32(X86_XCPT_SX); + + if (pDbgState->bmXcptExtra) + ASMBitSet(pDbgState->bmExitsToCheck, VMX_EXIT_XCPT_OR_NMI); + + /* + * Process events and probes for VM-exits, making sure we get the wanted VM-exits. + * + * Note! This is the reverse of what hmR0VmxHandleExitDtraceEvents does. 
+ * So, when adding/changing/removing please don't forget to update it. + * + * Some of the macros are picking up local variables to save horizontal space, + * (being able to see it in a table is the lesser evil here). + */ +#define IS_EITHER_ENABLED(a_pVM, a_EventSubName) \ + ( DBGF_IS_EVENT_ENABLED(a_pVM, RT_CONCAT(DBGFEVENT_, a_EventSubName)) \ + || RT_CONCAT3(VBOXVMM_, a_EventSubName, _ENABLED)() ) +#define SET_ONLY_XBM_IF_EITHER_EN(a_EventSubName, a_uExit) \ + if (IS_EITHER_ENABLED(pVM, a_EventSubName)) \ + { AssertCompile((unsigned)(a_uExit) < sizeof(pDbgState->bmExitsToCheck) * 8); \ + ASMBitSet((pDbgState)->bmExitsToCheck, a_uExit); \ + } else do { } while (0) +#define SET_CPE1_XBM_IF_EITHER_EN(a_EventSubName, a_uExit, a_fCtrlProcExec) \ + if (IS_EITHER_ENABLED(pVM, a_EventSubName)) \ + { \ + (pDbgState)->fCpe1Extra |= (a_fCtrlProcExec); \ + AssertCompile((unsigned)(a_uExit) < sizeof(pDbgState->bmExitsToCheck) * 8); \ + ASMBitSet((pDbgState)->bmExitsToCheck, a_uExit); \ + } else do { } while (0) +#define SET_CPEU_XBM_IF_EITHER_EN(a_EventSubName, a_uExit, a_fUnwantedCtrlProcExec) \ + if (IS_EITHER_ENABLED(pVM, a_EventSubName)) \ + { \ + (pDbgState)->fCpe1Unwanted |= (a_fUnwantedCtrlProcExec); \ + AssertCompile((unsigned)(a_uExit) < sizeof(pDbgState->bmExitsToCheck) * 8); \ + ASMBitSet((pDbgState)->bmExitsToCheck, a_uExit); \ + } else do { } while (0) +#define SET_CPE2_XBM_IF_EITHER_EN(a_EventSubName, a_uExit, a_fCtrlProcExec2) \ + if (IS_EITHER_ENABLED(pVM, a_EventSubName)) \ + { \ + (pDbgState)->fCpe2Extra |= (a_fCtrlProcExec2); \ + AssertCompile((unsigned)(a_uExit) < sizeof(pDbgState->bmExitsToCheck) * 8); \ + ASMBitSet((pDbgState)->bmExitsToCheck, a_uExit); \ + } else do { } while (0) + + SET_ONLY_XBM_IF_EITHER_EN(EXIT_TASK_SWITCH, VMX_EXIT_TASK_SWITCH); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN(EXIT_VMX_EPT_VIOLATION, VMX_EXIT_EPT_VIOLATION); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN(EXIT_VMX_EPT_MISCONFIG, VMX_EXIT_EPT_MISCONFIG); /* 
unconditional (unless #VE) */ + SET_ONLY_XBM_IF_EITHER_EN(EXIT_VMX_VAPIC_ACCESS, VMX_EXIT_APIC_ACCESS); /* feature dependent, nothing to enable here */ + SET_ONLY_XBM_IF_EITHER_EN(EXIT_VMX_VAPIC_WRITE, VMX_EXIT_APIC_WRITE); /* feature dependent, nothing to enable here */ + + SET_ONLY_XBM_IF_EITHER_EN(INSTR_CPUID, VMX_EXIT_CPUID); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_CPUID, VMX_EXIT_CPUID); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_GETSEC, VMX_EXIT_GETSEC); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_GETSEC, VMX_EXIT_GETSEC); + SET_CPE1_XBM_IF_EITHER_EN(INSTR_HALT, VMX_EXIT_HLT, VMX_PROC_CTLS_HLT_EXIT); /* paranoia */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_HALT, VMX_EXIT_HLT); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_INVD, VMX_EXIT_INVD); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_INVD, VMX_EXIT_INVD); + SET_CPE1_XBM_IF_EITHER_EN(INSTR_INVLPG, VMX_EXIT_INVLPG, VMX_PROC_CTLS_INVLPG_EXIT); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_INVLPG, VMX_EXIT_INVLPG); + SET_CPE1_XBM_IF_EITHER_EN(INSTR_RDPMC, VMX_EXIT_RDPMC, VMX_PROC_CTLS_RDPMC_EXIT); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_RDPMC, VMX_EXIT_RDPMC); + SET_CPE1_XBM_IF_EITHER_EN(INSTR_RDTSC, VMX_EXIT_RDTSC, VMX_PROC_CTLS_RDTSC_EXIT); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_RDTSC, VMX_EXIT_RDTSC); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_RSM, VMX_EXIT_RSM); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_RSM, VMX_EXIT_RSM); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMM_CALL, VMX_EXIT_VMCALL); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMM_CALL, VMX_EXIT_VMCALL); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMX_VMCLEAR, VMX_EXIT_VMCLEAR); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_VMCLEAR, VMX_EXIT_VMCLEAR); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMX_VMLAUNCH, VMX_EXIT_VMLAUNCH); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_VMLAUNCH, VMX_EXIT_VMLAUNCH); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMX_VMPTRLD, VMX_EXIT_VMPTRLD); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_VMPTRLD, VMX_EXIT_VMPTRLD); + 
SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMX_VMPTRST, VMX_EXIT_VMPTRST); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_VMPTRST, VMX_EXIT_VMPTRST); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMX_VMREAD, VMX_EXIT_VMREAD); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_VMREAD, VMX_EXIT_VMREAD); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMX_VMRESUME, VMX_EXIT_VMRESUME); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_VMRESUME, VMX_EXIT_VMRESUME); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMX_VMWRITE, VMX_EXIT_VMWRITE); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_VMWRITE, VMX_EXIT_VMWRITE); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMX_VMXOFF, VMX_EXIT_VMXOFF); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_VMXOFF, VMX_EXIT_VMXOFF); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMX_VMXON, VMX_EXIT_VMXON); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_VMXON, VMX_EXIT_VMXON); + + if ( IS_EITHER_ENABLED(pVM, INSTR_CRX_READ) + || IS_EITHER_ENABLED(pVM, INSTR_CRX_WRITE)) + { + int rc = hmR0VmxImportGuestState(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_CR4 | CPUMCTX_EXTRN_APIC_TPR); + AssertRC(rc); + +#if 0 /** @todo fix me */ + pDbgState->fClearCr0Mask = true; + pDbgState->fClearCr4Mask = true; +#endif + if (IS_EITHER_ENABLED(pVM, INSTR_CRX_READ)) + pDbgState->fCpe1Extra |= VMX_PROC_CTLS_CR3_STORE_EXIT | VMX_PROC_CTLS_CR8_STORE_EXIT; + if (IS_EITHER_ENABLED(pVM, INSTR_CRX_WRITE)) + pDbgState->fCpe1Extra |= VMX_PROC_CTLS_CR3_LOAD_EXIT | VMX_PROC_CTLS_CR8_LOAD_EXIT; + pDbgState->fCpe1Unwanted |= VMX_PROC_CTLS_USE_TPR_SHADOW; /* risky? */ + /* Note! We currently don't use VMX_VMCS32_CTRL_CR3_TARGET_COUNT. It would + require clearing here and in the loop if we start using it. 
*/ + ASMBitSet(pDbgState->bmExitsToCheck, VMX_EXIT_MOV_CRX); + } + else + { + if (pDbgState->fClearCr0Mask) + { + pDbgState->fClearCr0Mask = false; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_CR0); + } + if (pDbgState->fClearCr4Mask) + { + pDbgState->fClearCr4Mask = false; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_CR4); + } + } + SET_ONLY_XBM_IF_EITHER_EN( EXIT_CRX_READ, VMX_EXIT_MOV_CRX); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_CRX_WRITE, VMX_EXIT_MOV_CRX); + + if ( IS_EITHER_ENABLED(pVM, INSTR_DRX_READ) + || IS_EITHER_ENABLED(pVM, INSTR_DRX_WRITE)) + { + /** @todo later, need to fix handler as it assumes this won't usually happen. */ + ASMBitSet(pDbgState->bmExitsToCheck, VMX_EXIT_MOV_DRX); + } + SET_ONLY_XBM_IF_EITHER_EN( EXIT_DRX_READ, VMX_EXIT_MOV_DRX); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_DRX_WRITE, VMX_EXIT_MOV_DRX); + + SET_CPEU_XBM_IF_EITHER_EN(INSTR_RDMSR, VMX_EXIT_RDMSR, VMX_PROC_CTLS_USE_MSR_BITMAPS); /* risky clearing this? */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_RDMSR, VMX_EXIT_RDMSR); + SET_CPEU_XBM_IF_EITHER_EN(INSTR_WRMSR, VMX_EXIT_WRMSR, VMX_PROC_CTLS_USE_MSR_BITMAPS); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_WRMSR, VMX_EXIT_WRMSR); + SET_CPE1_XBM_IF_EITHER_EN(INSTR_MWAIT, VMX_EXIT_MWAIT, VMX_PROC_CTLS_MWAIT_EXIT); /* paranoia */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_MWAIT, VMX_EXIT_MWAIT); + SET_CPE1_XBM_IF_EITHER_EN(INSTR_MONITOR, VMX_EXIT_MONITOR, VMX_PROC_CTLS_MONITOR_EXIT); /* paranoia */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_MONITOR, VMX_EXIT_MONITOR); +#if 0 /** @todo too slow, fix handler. 
*/ + SET_CPE1_XBM_IF_EITHER_EN(INSTR_PAUSE, VMX_EXIT_PAUSE, VMX_PROC_CTLS_PAUSE_EXIT); +#endif + SET_ONLY_XBM_IF_EITHER_EN( EXIT_PAUSE, VMX_EXIT_PAUSE); + + if ( IS_EITHER_ENABLED(pVM, INSTR_SGDT) + || IS_EITHER_ENABLED(pVM, INSTR_SIDT) + || IS_EITHER_ENABLED(pVM, INSTR_LGDT) + || IS_EITHER_ENABLED(pVM, INSTR_LIDT)) + { + pDbgState->fCpe2Extra |= VMX_PROC_CTLS2_DESC_TABLE_EXIT; + ASMBitSet(pDbgState->bmExitsToCheck, VMX_EXIT_GDTR_IDTR_ACCESS); + } + SET_ONLY_XBM_IF_EITHER_EN( EXIT_SGDT, VMX_EXIT_GDTR_IDTR_ACCESS); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_SIDT, VMX_EXIT_GDTR_IDTR_ACCESS); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_LGDT, VMX_EXIT_GDTR_IDTR_ACCESS); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_LIDT, VMX_EXIT_GDTR_IDTR_ACCESS); + + if ( IS_EITHER_ENABLED(pVM, INSTR_SLDT) + || IS_EITHER_ENABLED(pVM, INSTR_STR) + || IS_EITHER_ENABLED(pVM, INSTR_LLDT) + || IS_EITHER_ENABLED(pVM, INSTR_LTR)) + { + pDbgState->fCpe2Extra |= VMX_PROC_CTLS2_DESC_TABLE_EXIT; + ASMBitSet(pDbgState->bmExitsToCheck, VMX_EXIT_LDTR_TR_ACCESS); + } + SET_ONLY_XBM_IF_EITHER_EN( EXIT_SLDT, VMX_EXIT_LDTR_TR_ACCESS); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_STR, VMX_EXIT_LDTR_TR_ACCESS); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_LLDT, VMX_EXIT_LDTR_TR_ACCESS); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_LTR, VMX_EXIT_LDTR_TR_ACCESS); + + SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMX_INVEPT, VMX_EXIT_INVEPT); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_INVEPT, VMX_EXIT_INVEPT); + SET_CPE1_XBM_IF_EITHER_EN(INSTR_RDTSCP, VMX_EXIT_RDTSCP, VMX_PROC_CTLS_RDTSC_EXIT); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_RDTSCP, VMX_EXIT_RDTSCP); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMX_INVVPID, VMX_EXIT_INVVPID); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_INVVPID, VMX_EXIT_INVVPID); + SET_CPE2_XBM_IF_EITHER_EN(INSTR_WBINVD, VMX_EXIT_WBINVD, VMX_PROC_CTLS2_WBINVD_EXIT); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_WBINVD, VMX_EXIT_WBINVD); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_XSETBV, VMX_EXIT_XSETBV); /* unconditional */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_XSETBV, 
VMX_EXIT_XSETBV); + SET_CPE2_XBM_IF_EITHER_EN(INSTR_RDRAND, VMX_EXIT_RDRAND, VMX_PROC_CTLS2_RDRAND_EXIT); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_RDRAND, VMX_EXIT_RDRAND); + SET_CPE1_XBM_IF_EITHER_EN(INSTR_VMX_INVPCID, VMX_EXIT_INVPCID, VMX_PROC_CTLS_INVLPG_EXIT); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_INVPCID, VMX_EXIT_INVPCID); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_VMX_VMFUNC, VMX_EXIT_VMFUNC); /* unconditional for the current setup */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_VMX_VMFUNC, VMX_EXIT_VMFUNC); + SET_CPE2_XBM_IF_EITHER_EN(INSTR_RDSEED, VMX_EXIT_RDSEED, VMX_PROC_CTLS2_RDSEED_EXIT); + SET_ONLY_XBM_IF_EITHER_EN( EXIT_RDSEED, VMX_EXIT_RDSEED); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_XSAVES, VMX_EXIT_XSAVES); /* unconditional (enabled by host, guest cfg) */ + SET_ONLY_XBM_IF_EITHER_EN(EXIT_XSAVES, VMX_EXIT_XSAVES); + SET_ONLY_XBM_IF_EITHER_EN(INSTR_XRSTORS, VMX_EXIT_XRSTORS); /* unconditional (enabled by host, guest cfg) */ + SET_ONLY_XBM_IF_EITHER_EN( EXIT_XRSTORS, VMX_EXIT_XRSTORS); + +#undef IS_EITHER_ENABLED +#undef SET_ONLY_XBM_IF_EITHER_EN +#undef SET_CPE1_XBM_IF_EITHER_EN +#undef SET_CPEU_XBM_IF_EITHER_EN +#undef SET_CPE2_XBM_IF_EITHER_EN + + /* + * Sanitize the control stuff. + */ + pDbgState->fCpe2Extra &= pVM->hm.s.vmx.Msrs.ProcCtls2.n.allowed1; + if (pDbgState->fCpe2Extra) + pDbgState->fCpe1Extra |= VMX_PROC_CTLS_USE_SECONDARY_CTLS; + pDbgState->fCpe1Extra &= pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed1; + pDbgState->fCpe1Unwanted &= ~pVM->hm.s.vmx.Msrs.ProcCtls.n.allowed0; + if (pVCpu->hm.s.fDebugWantRdTscExit != RT_BOOL(pDbgState->fCpe1Extra & VMX_PROC_CTLS_RDTSC_EXIT)) + { + pVCpu->hm.s.fDebugWantRdTscExit ^= true; + pVmxTransient->fUpdateTscOffsettingAndPreemptTimer = true; + } + + Log6(("HM: debug state: cpe1=%#RX32 cpeu=%#RX32 cpe2=%#RX32%s%s\n", + pDbgState->fCpe1Extra, pDbgState->fCpe1Unwanted, pDbgState->fCpe2Extra, + pDbgState->fClearCr0Mask ? " clr-cr0" : "", + pDbgState->fClearCr4Mask ? 
" clr-cr4" : ""));
}


/**
 * Fires off DBGF events and dtrace probes for a VM-exit, when it's
 * appropriate.
 *
 * The caller has checked the VM-exit against the
 * VMXRUNDBGSTATE::bmExitsToCheck bitmap.  The caller has checked for NMIs
 * already, so we don't have to do that either.
 *
 * The work is done in three steps: the VM-exit is translated into up to two
 * DBGF event types (enmEvent1 and enmEvent2), then the dtrace probes for
 * those events are fired if any of them is enabled, and finally at most one
 * DBGF event is raised.
 *
 * @returns Strict VBox status code (i.e. informational status codes too).
 *          VMX_EXIT_MTF is handed straight to hmR0VmxExitMtf() and its status
 *          is returned.  Otherwise VINF_SUCCESS is returned unless
 *          DBGFEventGenericWithArgs() returns something else.
 * @param   pVCpu           The cross context virtual CPU structure.
 * @param   pVmxTransient   Pointer to the VMX-transient structure.
 * @param   uExitReason     The VM-exit reason.
 *
 * @remarks The name of this function is displayed by dtrace, so keep it short
 *          and to the point.  No longer than 33 chars long, please.
 */
static VBOXSTRICTRC hmR0VmxHandleExitDtraceEvents(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient, uint32_t uExitReason)
{
    /*
     * Translate the event into a DBGF event (enmEvent + uEventArg) and at the
     * same time check whether any corresponding Dtrace event is enabled (fDtrace).
     *
     * Note! This is the reverse operation of what hmR0VmxPreRunGuestDebugStateUpdate
     *       does.  Must add/change/remove both places.  Same ordering, please.
     *
     *       Added/removed events must also be reflected in the next section
     *       where we dispatch dtrace events.
     */
    bool            fDtrace1   = false;             /* Whether the dtrace probe matching enmEvent1 is enabled. */
    bool            fDtrace2   = false;             /* Whether the dtrace probe matching enmEvent2 is enabled. */
    DBGFEVENTTYPE   enmEvent1  = DBGFEVENT_END;     /* DBGFEVENT_END means "no event". */
    DBGFEVENTTYPE   enmEvent2  = DBGFEVENT_END;     /* DBGFEVENT_END means "no event". */
    uint32_t        uEventArg  = 0;                 /* The single argument passed along with the DBGF event. */
/* Selects the DBGFEVENT_EXIT_xxx event only and samples its VBOXVMM_EXIT_xxx_ENABLED() state. */
#define SET_EXIT(a_EventSubName) \
        do { \
            enmEvent2 = RT_CONCAT(DBGFEVENT_EXIT_,  a_EventSubName); \
            fDtrace2  = RT_CONCAT3(VBOXVMM_EXIT_,   a_EventSubName, _ENABLED)(); \
        } while (0)
/* Selects both the DBGFEVENT_INSTR_xxx and the DBGFEVENT_EXIT_xxx event and samples both probe states. */
#define SET_BOTH(a_EventSubName) \
        do { \
            enmEvent1 = RT_CONCAT(DBGFEVENT_INSTR_, a_EventSubName); \
            enmEvent2 = RT_CONCAT(DBGFEVENT_EXIT_,  a_EventSubName); \
            fDtrace1  = RT_CONCAT3(VBOXVMM_INSTR_,  a_EventSubName, _ENABLED)(); \
            fDtrace2  = RT_CONCAT3(VBOXVMM_EXIT_,   a_EventSubName, _ENABLED)(); \
        } while (0)
    switch (uExitReason)
    {
        /* Monitor trap flag exits are not translated, the regular handler deals with them right away. */
        case VMX_EXIT_MTF:
            return hmR0VmxExitMtf(pVCpu, pVmxTransient);

        case VMX_EXIT_XCPT_OR_NMI:
        {
            uint8_t const idxVector = VMX_EXIT_INT_INFO_VECTOR(pVmxTransient->uExitIntInfo);
            switch (VMX_EXIT_INT_INFO_TYPE(pVmxTransient->uExitIntInfo))
            {
                case VMX_EXIT_INT_INFO_TYPE_HW_XCPT:
                case VMX_EXIT_INT_INFO_TYPE_SW_XCPT:
                case VMX_EXIT_INT_INFO_TYPE_PRIV_SW_XCPT:
                    /* The exception events are indexed by vector relative to DBGFEVENT_XCPT_FIRST. */
                    if (idxVector <= (unsigned)(DBGFEVENT_XCPT_LAST - DBGFEVENT_XCPT_FIRST))
                    {
                        /* The error code, when the exit info says it is valid, becomes the event argument. */
                        if (VMX_EXIT_INT_INFO_IS_ERROR_CODE_VALID(pVmxTransient->uExitIntInfo))
                        {
                            hmR0VmxReadExitIntErrorCodeVmcs(pVmxTransient);
                            uEventArg = pVmxTransient->uExitIntErrorCode;
                        }
                        enmEvent1 = (DBGFEVENTTYPE)(DBGFEVENT_XCPT_FIRST + idxVector);
                        switch (enmEvent1)
                        {
                            case DBGFEVENT_XCPT_DE: fDtrace1 = VBOXVMM_XCPT_DE_ENABLED(); break;
                            case DBGFEVENT_XCPT_DB: fDtrace1 = VBOXVMM_XCPT_DB_ENABLED(); break;
                            case DBGFEVENT_XCPT_BP: fDtrace1 = VBOXVMM_XCPT_BP_ENABLED(); break;
                            case DBGFEVENT_XCPT_OF: fDtrace1 = VBOXVMM_XCPT_OF_ENABLED(); break;
                            case DBGFEVENT_XCPT_BR: fDtrace1 = VBOXVMM_XCPT_BR_ENABLED(); break;
                            case DBGFEVENT_XCPT_UD: fDtrace1 = VBOXVMM_XCPT_UD_ENABLED(); break;
                            case DBGFEVENT_XCPT_NM: fDtrace1 = VBOXVMM_XCPT_NM_ENABLED(); break;
                            case DBGFEVENT_XCPT_DF: fDtrace1 = VBOXVMM_XCPT_DF_ENABLED(); break;
                            case DBGFEVENT_XCPT_TS: fDtrace1 = VBOXVMM_XCPT_TS_ENABLED(); break;
                            case DBGFEVENT_XCPT_NP: fDtrace1 = VBOXVMM_XCPT_NP_ENABLED(); break;
                            case DBGFEVENT_XCPT_SS: fDtrace1 = VBOXVMM_XCPT_SS_ENABLED(); break;
                            case DBGFEVENT_XCPT_GP: fDtrace1 = VBOXVMM_XCPT_GP_ENABLED(); break;
                            case DBGFEVENT_XCPT_PF: fDtrace1 = VBOXVMM_XCPT_PF_ENABLED(); break;
                            case DBGFEVENT_XCPT_MF: fDtrace1 = VBOXVMM_XCPT_MF_ENABLED(); break;
                            case DBGFEVENT_XCPT_AC: fDtrace1 = VBOXVMM_XCPT_AC_ENABLED(); break;
                            case DBGFEVENT_XCPT_XF: fDtrace1 = VBOXVMM_XCPT_XF_ENABLED(); break;
                            case DBGFEVENT_XCPT_VE: fDtrace1 = VBOXVMM_XCPT_VE_ENABLED(); break;
                            case DBGFEVENT_XCPT_SX: fDtrace1 = VBOXVMM_XCPT_SX_ENABLED(); break;
                            default:                                                      break;
                        }
                    }
                    else
                        AssertFailed();
                    break;

                /* Software interrupts: the vector is the event argument. */
                case VMX_EXIT_INT_INFO_TYPE_SW_INT:
                    uEventArg = idxVector;
                    enmEvent1 = DBGFEVENT_INTERRUPT_SOFTWARE;
                    fDtrace1  = VBOXVMM_INT_SOFTWARE_ENABLED();
                    break;
            }
            break;
        }

        case VMX_EXIT_TRIPLE_FAULT:
            enmEvent1 = DBGFEVENT_TRIPLE_FAULT;
            //fDtrace1  = VBOXVMM_EXIT_TRIPLE_FAULT_ENABLED();
            break;
        case VMX_EXIT_TASK_SWITCH:      SET_EXIT(TASK_SWITCH); break;
        case VMX_EXIT_EPT_VIOLATION:    SET_EXIT(VMX_EPT_VIOLATION); break;
        case VMX_EXIT_EPT_MISCONFIG:    SET_EXIT(VMX_EPT_MISCONFIG); break;
        case VMX_EXIT_APIC_ACCESS:      SET_EXIT(VMX_VAPIC_ACCESS); break;
        case VMX_EXIT_APIC_WRITE:       SET_EXIT(VMX_VAPIC_WRITE); break;

        /* Instruction specific VM-exits: */
        case VMX_EXIT_CPUID:            SET_BOTH(CPUID); break;
        case VMX_EXIT_GETSEC:           SET_BOTH(GETSEC); break;
        case VMX_EXIT_HLT:              SET_BOTH(HALT); break;
        case VMX_EXIT_INVD:             SET_BOTH(INVD); break;
        case VMX_EXIT_INVLPG:           SET_BOTH(INVLPG); break;
        case VMX_EXIT_RDPMC:            SET_BOTH(RDPMC); break;
        case VMX_EXIT_RDTSC:            SET_BOTH(RDTSC); break;
        case VMX_EXIT_RSM:              SET_BOTH(RSM); break;
        case VMX_EXIT_VMCALL:           SET_BOTH(VMM_CALL); break;
        case VMX_EXIT_VMCLEAR:          SET_BOTH(VMX_VMCLEAR); break;
        case VMX_EXIT_VMLAUNCH:         SET_BOTH(VMX_VMLAUNCH); break;
        case VMX_EXIT_VMPTRLD:          SET_BOTH(VMX_VMPTRLD); break;
        case VMX_EXIT_VMPTRST:          SET_BOTH(VMX_VMPTRST); break;
        case VMX_EXIT_VMREAD:           SET_BOTH(VMX_VMREAD); break;
        case VMX_EXIT_VMRESUME:         SET_BOTH(VMX_VMRESUME); break;
        case VMX_EXIT_VMWRITE:          SET_BOTH(VMX_VMWRITE); break;
        case VMX_EXIT_VMXOFF:           SET_BOTH(VMX_VMXOFF); break;
        case VMX_EXIT_VMXON:            SET_BOTH(VMX_VMXON); break;
        /* Control register access: direction picks the event, the register number is the argument. */
        case VMX_EXIT_MOV_CRX:
            hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient);
            if (VMX_EXIT_QUAL_CRX_ACCESS(pVmxTransient->uExitQual) == VMX_EXIT_QUAL_CRX_ACCESS_READ)
                SET_BOTH(CRX_READ);
            else
                SET_BOTH(CRX_WRITE);
            uEventArg = VMX_EXIT_QUAL_CRX_REGISTER(pVmxTransient->uExitQual);
            break;
        /* Debug register access: direction picks the event, the register number is the argument. */
        case VMX_EXIT_MOV_DRX:
            hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient);
            if (   VMX_EXIT_QUAL_DRX_DIRECTION(pVmxTransient->uExitQual)
                == VMX_EXIT_QUAL_DRX_DIRECTION_READ)
                SET_BOTH(DRX_READ);
            else
                SET_BOTH(DRX_WRITE);
            uEventArg = VMX_EXIT_QUAL_DRX_REGISTER(pVmxTransient->uExitQual);
            break;
        case VMX_EXIT_RDMSR:            SET_BOTH(RDMSR); break;
        case VMX_EXIT_WRMSR:            SET_BOTH(WRMSR); break;
        case VMX_EXIT_MWAIT:            SET_BOTH(MWAIT); break;
        case VMX_EXIT_MONITOR:          SET_BOTH(MONITOR); break;
        case VMX_EXIT_PAUSE:            SET_BOTH(PAUSE); break;
        /* The instruction ID in the exit instruction info tells which of the four instructions it was. */
        case VMX_EXIT_GDTR_IDTR_ACCESS:
            hmR0VmxReadExitInstrInfoVmcs(pVmxTransient);
            switch (RT_BF_GET(pVmxTransient->ExitInstrInfo.u, VMX_BF_XDTR_INSINFO_INSTR_ID))
            {
                case VMX_XDTR_INSINFO_II_SGDT: SET_BOTH(SGDT); break;
                case VMX_XDTR_INSINFO_II_SIDT: SET_BOTH(SIDT); break;
                case VMX_XDTR_INSINFO_II_LGDT: SET_BOTH(LGDT); break;
                case VMX_XDTR_INSINFO_II_LIDT: SET_BOTH(LIDT); break;
            }
            break;

        /* Same scheme as above for the LDTR and TR accessing instructions. */
        case VMX_EXIT_LDTR_TR_ACCESS:
            hmR0VmxReadExitInstrInfoVmcs(pVmxTransient);
            switch (RT_BF_GET(pVmxTransient->ExitInstrInfo.u, VMX_BF_YYTR_INSINFO_INSTR_ID))
            {
                case VMX_YYTR_INSINFO_II_SLDT: SET_BOTH(SLDT); break;
                case VMX_YYTR_INSINFO_II_STR:  SET_BOTH(STR); break;
                case VMX_YYTR_INSINFO_II_LLDT: SET_BOTH(LLDT); break;
                case VMX_YYTR_INSINFO_II_LTR:  SET_BOTH(LTR); break;
            }
            break;

        case VMX_EXIT_INVEPT:           SET_BOTH(VMX_INVEPT); break;
        case VMX_EXIT_RDTSCP:           SET_BOTH(RDTSCP); break;
        case VMX_EXIT_INVVPID:          SET_BOTH(VMX_INVVPID); break;
        case VMX_EXIT_WBINVD:           SET_BOTH(WBINVD); break;
        case VMX_EXIT_XSETBV:           SET_BOTH(XSETBV); break;
        case VMX_EXIT_RDRAND:           SET_BOTH(RDRAND); break;
        case VMX_EXIT_INVPCID:          SET_BOTH(VMX_INVPCID); break;
        case VMX_EXIT_VMFUNC:           SET_BOTH(VMX_VMFUNC); break;
        case VMX_EXIT_RDSEED:           SET_BOTH(RDSEED); break;
        case VMX_EXIT_XSAVES:           SET_BOTH(XSAVES); break;
        case VMX_EXIT_XRSTORS:          SET_BOTH(XRSTORS); break;

        /* Events that aren't relevant at this point. */
        case VMX_EXIT_EXT_INT:
        case VMX_EXIT_INT_WINDOW:
        case VMX_EXIT_NMI_WINDOW:
        case VMX_EXIT_TPR_BELOW_THRESHOLD:
        case VMX_EXIT_PREEMPT_TIMER:
        case VMX_EXIT_IO_INSTR:
            break;

        /* Errors and unexpected events. */
        case VMX_EXIT_INIT_SIGNAL:
        case VMX_EXIT_SIPI:
        case VMX_EXIT_IO_SMI:
        case VMX_EXIT_SMI:
        case VMX_EXIT_ERR_INVALID_GUEST_STATE:
        case VMX_EXIT_ERR_MSR_LOAD:
        case VMX_EXIT_ERR_MACHINE_CHECK:
            break;

        default:
            AssertMsgFailed(("Unexpected VM-exit=%#x\n", uExitReason));
            break;
    }
#undef SET_BOTH
#undef SET_EXIT

    /*
     * Dtrace tracepoints go first.   We do them here at once so we don't
     * have to copy the guest state saving and stuff a few dozen times.
     * Down side is that we've got to repeat the switch, though this time
     * we use enmEvent1 and enmEvent2 since the probes are a subset of what
     * DBGF does.
     */
    if (fDtrace1 || fDtrace2)
    {
        /* The probes get the full guest context, so import everything first. */
        hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient);
        /* NOTE(review): the status of this import is discarded here - confirm that is intended. */
        hmR0VmxImportGuestState(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL);
        PCPUMCTX pCtx = &pVCpu->cpum.GstCtx;
        switch (enmEvent1)
        {
            /** @todo consider which extra parameters would be helpful for each probe.   */
            case DBGFEVENT_END: break;
            case DBGFEVENT_XCPT_DE:                 VBOXVMM_XCPT_DE(pVCpu, pCtx); break;
            case DBGFEVENT_XCPT_DB:                 VBOXVMM_XCPT_DB(pVCpu, pCtx, pCtx->dr[6]); break;
            case DBGFEVENT_XCPT_BP:                 VBOXVMM_XCPT_BP(pVCpu, pCtx); break;
            case DBGFEVENT_XCPT_OF:                 VBOXVMM_XCPT_OF(pVCpu, pCtx); break;
            case DBGFEVENT_XCPT_BR:                 VBOXVMM_XCPT_BR(pVCpu, pCtx); break;
            case DBGFEVENT_XCPT_UD:                 VBOXVMM_XCPT_UD(pVCpu, pCtx); break;
            case DBGFEVENT_XCPT_NM:                 VBOXVMM_XCPT_NM(pVCpu, pCtx); break;
            case DBGFEVENT_XCPT_DF:                 VBOXVMM_XCPT_DF(pVCpu, pCtx); break;
            case DBGFEVENT_XCPT_TS:                 VBOXVMM_XCPT_TS(pVCpu, pCtx, uEventArg); break;
            case DBGFEVENT_XCPT_NP:                 VBOXVMM_XCPT_NP(pVCpu, pCtx, uEventArg); break;
            case DBGFEVENT_XCPT_SS:                 VBOXVMM_XCPT_SS(pVCpu, pCtx, uEventArg); break;
            case DBGFEVENT_XCPT_GP:                 VBOXVMM_XCPT_GP(pVCpu, pCtx, uEventArg); break;
            case DBGFEVENT_XCPT_PF:                 VBOXVMM_XCPT_PF(pVCpu, pCtx, uEventArg, pCtx->cr2); break;
            case DBGFEVENT_XCPT_MF:                 VBOXVMM_XCPT_MF(pVCpu, pCtx); break;
            case DBGFEVENT_XCPT_AC:                 VBOXVMM_XCPT_AC(pVCpu, pCtx); break;
            case DBGFEVENT_XCPT_XF:                 VBOXVMM_XCPT_XF(pVCpu, pCtx); break;
            case DBGFEVENT_XCPT_VE:                 VBOXVMM_XCPT_VE(pVCpu, pCtx); break;
            case DBGFEVENT_XCPT_SX:                 VBOXVMM_XCPT_SX(pVCpu, pCtx, uEventArg); break;
            case DBGFEVENT_INTERRUPT_SOFTWARE:      VBOXVMM_INT_SOFTWARE(pVCpu, pCtx, (uint8_t)uEventArg); break;
            case DBGFEVENT_INSTR_CPUID:             VBOXVMM_INSTR_CPUID(pVCpu, pCtx, pCtx->eax, pCtx->ecx); break;
            case DBGFEVENT_INSTR_GETSEC:            VBOXVMM_INSTR_GETSEC(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_HALT:              VBOXVMM_INSTR_HALT(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_INVD:              VBOXVMM_INSTR_INVD(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_INVLPG:            VBOXVMM_INSTR_INVLPG(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_RDPMC:             VBOXVMM_INSTR_RDPMC(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_RDTSC:             VBOXVMM_INSTR_RDTSC(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_RSM:               VBOXVMM_INSTR_RSM(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_CRX_READ:          VBOXVMM_INSTR_CRX_READ(pVCpu, pCtx, (uint8_t)uEventArg); break;
            case DBGFEVENT_INSTR_CRX_WRITE:         VBOXVMM_INSTR_CRX_WRITE(pVCpu, pCtx, (uint8_t)uEventArg); break;
            case DBGFEVENT_INSTR_DRX_READ:          VBOXVMM_INSTR_DRX_READ(pVCpu, pCtx, (uint8_t)uEventArg); break;
            case DBGFEVENT_INSTR_DRX_WRITE:         VBOXVMM_INSTR_DRX_WRITE(pVCpu, pCtx, (uint8_t)uEventArg); break;
            case DBGFEVENT_INSTR_RDMSR:             VBOXVMM_INSTR_RDMSR(pVCpu, pCtx, pCtx->ecx); break;
            case DBGFEVENT_INSTR_WRMSR:             VBOXVMM_INSTR_WRMSR(pVCpu, pCtx, pCtx->ecx,
                                                                        RT_MAKE_U64(pCtx->eax, pCtx->edx)); break;
            case DBGFEVENT_INSTR_MWAIT:             VBOXVMM_INSTR_MWAIT(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_MONITOR:           VBOXVMM_INSTR_MONITOR(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_PAUSE:             VBOXVMM_INSTR_PAUSE(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_SGDT:              VBOXVMM_INSTR_SGDT(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_SIDT:              VBOXVMM_INSTR_SIDT(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_LGDT:              VBOXVMM_INSTR_LGDT(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_LIDT:              VBOXVMM_INSTR_LIDT(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_SLDT:              VBOXVMM_INSTR_SLDT(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_STR:               VBOXVMM_INSTR_STR(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_LLDT:              VBOXVMM_INSTR_LLDT(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_LTR:               VBOXVMM_INSTR_LTR(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_RDTSCP:            VBOXVMM_INSTR_RDTSCP(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_WBINVD:            VBOXVMM_INSTR_WBINVD(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_XSETBV:            VBOXVMM_INSTR_XSETBV(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_RDRAND:            VBOXVMM_INSTR_RDRAND(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_RDSEED:            VBOXVMM_INSTR_RDSEED(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_XSAVES:            VBOXVMM_INSTR_XSAVES(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_XRSTORS:           VBOXVMM_INSTR_XRSTORS(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMM_CALL:          VBOXVMM_INSTR_VMM_CALL(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_VMCLEAR:       VBOXVMM_INSTR_VMX_VMCLEAR(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_VMLAUNCH:      VBOXVMM_INSTR_VMX_VMLAUNCH(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_VMPTRLD:       VBOXVMM_INSTR_VMX_VMPTRLD(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_VMPTRST:       VBOXVMM_INSTR_VMX_VMPTRST(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_VMREAD:        VBOXVMM_INSTR_VMX_VMREAD(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_VMRESUME:      VBOXVMM_INSTR_VMX_VMRESUME(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_VMWRITE:       VBOXVMM_INSTR_VMX_VMWRITE(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_VMXOFF:        VBOXVMM_INSTR_VMX_VMXOFF(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_VMXON:         VBOXVMM_INSTR_VMX_VMXON(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_INVEPT:        VBOXVMM_INSTR_VMX_INVEPT(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_INVVPID:       VBOXVMM_INSTR_VMX_INVVPID(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_INVPCID:       VBOXVMM_INSTR_VMX_INVPCID(pVCpu, pCtx); break;
            case DBGFEVENT_INSTR_VMX_VMFUNC:        VBOXVMM_INSTR_VMX_VMFUNC(pVCpu, pCtx); break;
            default: AssertMsgFailed(("enmEvent1=%d uExitReason=%d\n", enmEvent1, uExitReason)); break;
        }
        switch (enmEvent2)
        {
            /** @todo consider which extra parameters would be helpful for each probe. */
            case DBGFEVENT_END: break;
            case DBGFEVENT_EXIT_TASK_SWITCH:        VBOXVMM_EXIT_TASK_SWITCH(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_CPUID:              VBOXVMM_EXIT_CPUID(pVCpu, pCtx, pCtx->eax, pCtx->ecx); break;
            case DBGFEVENT_EXIT_GETSEC:             VBOXVMM_EXIT_GETSEC(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_HALT:               VBOXVMM_EXIT_HALT(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_INVD:               VBOXVMM_EXIT_INVD(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_INVLPG:             VBOXVMM_EXIT_INVLPG(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_RDPMC:              VBOXVMM_EXIT_RDPMC(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_RDTSC:              VBOXVMM_EXIT_RDTSC(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_RSM:                VBOXVMM_EXIT_RSM(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_CRX_READ:           VBOXVMM_EXIT_CRX_READ(pVCpu, pCtx, (uint8_t)uEventArg); break;
            case DBGFEVENT_EXIT_CRX_WRITE:          VBOXVMM_EXIT_CRX_WRITE(pVCpu, pCtx, (uint8_t)uEventArg); break;
            case DBGFEVENT_EXIT_DRX_READ:           VBOXVMM_EXIT_DRX_READ(pVCpu, pCtx, (uint8_t)uEventArg); break;
            case DBGFEVENT_EXIT_DRX_WRITE:          VBOXVMM_EXIT_DRX_WRITE(pVCpu, pCtx, (uint8_t)uEventArg); break;
            case DBGFEVENT_EXIT_RDMSR:              VBOXVMM_EXIT_RDMSR(pVCpu, pCtx, pCtx->ecx); break;
            case DBGFEVENT_EXIT_WRMSR:              VBOXVMM_EXIT_WRMSR(pVCpu, pCtx, pCtx->ecx,
                                                                       RT_MAKE_U64(pCtx->eax, pCtx->edx)); break;
            case DBGFEVENT_EXIT_MWAIT:              VBOXVMM_EXIT_MWAIT(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_MONITOR:            VBOXVMM_EXIT_MONITOR(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_PAUSE:              VBOXVMM_EXIT_PAUSE(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_SGDT:               VBOXVMM_EXIT_SGDT(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_SIDT:               VBOXVMM_EXIT_SIDT(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_LGDT:               VBOXVMM_EXIT_LGDT(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_LIDT:               VBOXVMM_EXIT_LIDT(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_SLDT:               VBOXVMM_EXIT_SLDT(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_STR:                VBOXVMM_EXIT_STR(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_LLDT:               VBOXVMM_EXIT_LLDT(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_LTR:                VBOXVMM_EXIT_LTR(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_RDTSCP:             VBOXVMM_EXIT_RDTSCP(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_WBINVD:             VBOXVMM_EXIT_WBINVD(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_XSETBV:             VBOXVMM_EXIT_XSETBV(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_RDRAND:             VBOXVMM_EXIT_RDRAND(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_RDSEED:             VBOXVMM_EXIT_RDSEED(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_XSAVES:             VBOXVMM_EXIT_XSAVES(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_XRSTORS:            VBOXVMM_EXIT_XRSTORS(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMM_CALL:           VBOXVMM_EXIT_VMM_CALL(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_VMCLEAR:        VBOXVMM_EXIT_VMX_VMCLEAR(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_VMLAUNCH:       VBOXVMM_EXIT_VMX_VMLAUNCH(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_VMPTRLD:        VBOXVMM_EXIT_VMX_VMPTRLD(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_VMPTRST:        VBOXVMM_EXIT_VMX_VMPTRST(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_VMREAD:         VBOXVMM_EXIT_VMX_VMREAD(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_VMRESUME:       VBOXVMM_EXIT_VMX_VMRESUME(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_VMWRITE:        VBOXVMM_EXIT_VMX_VMWRITE(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_VMXOFF:         VBOXVMM_EXIT_VMX_VMXOFF(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_VMXON:          VBOXVMM_EXIT_VMX_VMXON(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_INVEPT:         VBOXVMM_EXIT_VMX_INVEPT(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_INVVPID:        VBOXVMM_EXIT_VMX_INVVPID(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_INVPCID:        VBOXVMM_EXIT_VMX_INVPCID(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_VMFUNC:         VBOXVMM_EXIT_VMX_VMFUNC(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_EPT_MISCONFIG:  VBOXVMM_EXIT_VMX_EPT_MISCONFIG(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_EPT_VIOLATION:  VBOXVMM_EXIT_VMX_EPT_VIOLATION(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_VAPIC_ACCESS:   VBOXVMM_EXIT_VMX_VAPIC_ACCESS(pVCpu, pCtx); break;
            case DBGFEVENT_EXIT_VMX_VAPIC_WRITE:    VBOXVMM_EXIT_VMX_VAPIC_WRITE(pVCpu, pCtx); break;
            default: AssertMsgFailed(("enmEvent2=%d uExitReason=%d\n", enmEvent2, uExitReason)); break;
        }
    }

    /*
     * Fire off the DBGF event, if enabled (our check here is just a quick one,
     * the DBGF call will do a full check).
     *
     * Note! DBGF sets DBGFEVENT_INTERRUPT_SOFTWARE in the bitmap.
     * Note! If we have two events, we prioritize the first, i.e. the instruction
     *       one, in order to avoid event nesting.
     */
    PVM pVM = pVCpu->CTX_SUFF(pVM);
    if (   enmEvent1 != DBGFEVENT_END
        && DBGF_IS_EVENT_ENABLED(pVM, enmEvent1))
    {
        /* CS and RIP are imported before the event is raised. */
        HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_RIP);
        VBOXSTRICTRC rcStrict = DBGFEventGenericWithArgs(pVM, pVCpu, enmEvent1, DBGFEVENTCTX_HM, 1, uEventArg);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
    }
    else if (   enmEvent2 != DBGFEVENT_END
             && DBGF_IS_EVENT_ENABLED(pVM, enmEvent2))
    {
        HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_RIP);
        VBOXSTRICTRC rcStrict = DBGFEventGenericWithArgs(pVM, pVCpu, enmEvent2, DBGFEVENTCTX_HM, 1, uEventArg);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
    }

    return VINF_SUCCESS;
}


/**
 * Single-stepping VM-exit filtering.
 *
 * This is preprocessing the VM-exits and deciding whether we've gotten far
 * enough to return VINF_EM_DBG_STEPPED already.  If not, normal VM-exit
 * handling is performed.
 *
 * @returns Strict VBox status code (i.e. informational status codes too).
 * @param   pVCpu           The cross context virtual CPU structure of the calling EMT.
 * @param   pVmxTransient   Pointer to the VMX-transient structure.
 * @param   pDbgState       The debug state.
 */
DECLINLINE(VBOXSTRICTRC) hmR0VmxRunDebugHandleExit(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient, PVMXRUNDBGSTATE pDbgState)
{
    /*
     * Expensive (saves context) generic dtrace VM-exit probe.
     */
    uint32_t const uExitReason = pVmxTransient->uExitReason;
    if (!VBOXVMM_R0_HMVMX_VMEXIT_ENABLED())
    { /* more likely */ }
    else
    {
        /* The probe takes the exit qualification and the whole guest context, so fetch both first. */
        hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient);
        int rc = hmR0VmxImportGuestState(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL);
        AssertRC(rc);
        VBOXVMM_R0_HMVMX_VMEXIT(pVCpu, &pVCpu->cpum.GstCtx, pVmxTransient->uExitReason, pVmxTransient->uExitQual);
    }

    /*
     * Check for host NMI, just to get that out of the way.
     */
    if (uExitReason != VMX_EXIT_XCPT_OR_NMI)
    { /* normally likely */ }
    else
    {
        /* NMIs go straight to the regular handler, skipping the stepping and event checks below. */
        int rc2 = hmR0VmxReadExitIntInfoVmcs(pVmxTransient);
        AssertRCReturn(rc2, rc2);
        uint32_t uIntType = VMX_EXIT_INT_INFO_TYPE(pVmxTransient->uExitIntInfo);
        if (uIntType == VMX_EXIT_INT_INFO_TYPE_NMI)
            return hmR0VmxExitXcptOrNmi(pVCpu, pVmxTransient);
    }

    /*
     * Check for single stepping event if we're stepping.
     */
    if (pVCpu->hm.s.fSingleInstruction)
    {
        switch (uExitReason)
        {
            case VMX_EXIT_MTF:
                return hmR0VmxExitMtf(pVCpu, pVmxTransient);

            /* Various events: */
            case VMX_EXIT_XCPT_OR_NMI:
            case VMX_EXIT_EXT_INT:
            case VMX_EXIT_TRIPLE_FAULT:
            case VMX_EXIT_INT_WINDOW:
            case VMX_EXIT_NMI_WINDOW:
            case VMX_EXIT_TASK_SWITCH:
            case VMX_EXIT_TPR_BELOW_THRESHOLD:
            case VMX_EXIT_APIC_ACCESS:
            case VMX_EXIT_EPT_VIOLATION:
            case VMX_EXIT_EPT_MISCONFIG:
            case VMX_EXIT_PREEMPT_TIMER:

            /* Instruction specific VM-exits: */
            case VMX_EXIT_CPUID:
            case VMX_EXIT_GETSEC:
            case VMX_EXIT_HLT:
            case VMX_EXIT_INVD:
            case VMX_EXIT_INVLPG:
            case VMX_EXIT_RDPMC:
            case VMX_EXIT_RDTSC:
            case VMX_EXIT_RSM:
            case VMX_EXIT_VMCALL:
            case VMX_EXIT_VMCLEAR:
            case VMX_EXIT_VMLAUNCH:
            case VMX_EXIT_VMPTRLD:
            case VMX_EXIT_VMPTRST:
            case VMX_EXIT_VMREAD:
            case VMX_EXIT_VMRESUME:
            case VMX_EXIT_VMWRITE:
            case VMX_EXIT_VMXOFF:
            case VMX_EXIT_VMXON:
            case VMX_EXIT_MOV_CRX:
            case VMX_EXIT_MOV_DRX:
            case VMX_EXIT_IO_INSTR:
            case VMX_EXIT_RDMSR:
            case VMX_EXIT_WRMSR:
            case VMX_EXIT_MWAIT:
            case VMX_EXIT_MONITOR:
            case VMX_EXIT_PAUSE:
            case VMX_EXIT_GDTR_IDTR_ACCESS:
            case VMX_EXIT_LDTR_TR_ACCESS:
            case VMX_EXIT_INVEPT:
            case VMX_EXIT_RDTSCP:
            case VMX_EXIT_INVVPID:
            case VMX_EXIT_WBINVD:
            case VMX_EXIT_XSETBV:
            case VMX_EXIT_RDRAND:
            case VMX_EXIT_INVPCID:
            case VMX_EXIT_VMFUNC:
            case VMX_EXIT_RDSEED:
            case VMX_EXIT_XSAVES:
            case VMX_EXIT_XRSTORS:
            {
                /* If CS:RIP no longer matches the start values kept in the debug state, the step is done
                   and the exit is not dispatched to its regular handler. */
                int rc = hmR0VmxImportGuestState(pVCpu, CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_RIP);
                AssertRCReturn(rc, rc);
                if (   pVCpu->cpum.GstCtx.rip    != pDbgState->uRipStart
                    || pVCpu->cpum.GstCtx.cs.Sel != pDbgState->uCsStart)
                    return VINF_EM_DBG_STEPPED;
                break;
            }

            /* Errors and unexpected events: */
            case VMX_EXIT_INIT_SIGNAL:
            case VMX_EXIT_SIPI:
            case VMX_EXIT_IO_SMI:
            case VMX_EXIT_SMI:
            case VMX_EXIT_ERR_INVALID_GUEST_STATE:
            case VMX_EXIT_ERR_MSR_LOAD:
            case VMX_EXIT_ERR_MACHINE_CHECK:
            case VMX_EXIT_APIC_WRITE:  /* Some talk about this being fault like, so I guess we must process it? */
                break;

            default:
                AssertMsgFailed(("Unexpected VM-exit=%#x\n", uExitReason));
                break;
        }
    }

    /*
     * Check for debugger event breakpoints and dtrace probes.
     */
    /* The exit reason is range checked against the bitmap size (elements * 32 bits) before testing the bit. */
    if (   uExitReason < RT_ELEMENTS(pDbgState->bmExitsToCheck) * 32U
        && ASMBitTest(pDbgState->bmExitsToCheck, uExitReason) )
    {
        VBOXSTRICTRC rcStrict = hmR0VmxHandleExitDtraceEvents(pVCpu, pVmxTransient, uExitReason);
        if (rcStrict != VINF_SUCCESS)
            return rcStrict;
    }

    /*
     * Normal processing.
     */
    /* Either index the handler table with the exit reason or go through the hmR0VmxHandleExit() switch. */
#ifdef HMVMX_USE_FUNCTION_TABLE
    return g_apfnVMExitHandlers[uExitReason](pVCpu, pVmxTransient);
#else
    return hmR0VmxHandleExit(pVCpu, pVmxTransient, uExitReason);
#endif
}


/**
 * Single steps guest code using VT-x.
 *
 * @returns Strict VBox status code (i.e. informational status codes too).
 * @param   pVCpu       The cross context virtual CPU structure.
 *
 * @note    Mostly the same as hmR0VmxRunGuestCodeNormal().
 */
static VBOXSTRICTRC hmR0VmxRunGuestCodeDebug(PVMCPU pVCpu)
{
    VMXTRANSIENT VmxTransient;
    VmxTransient.fUpdateTscOffsettingAndPreemptTimer = true;

    /* Set HMCPU indicators.  */
    /* The single-instruction flag is saved so it can be put back before returning. */
    bool const fSavedSingleInstruction = pVCpu->hm.s.fSingleInstruction;
    pVCpu->hm.s.fSingleInstruction     = pVCpu->hm.s.fSingleInstruction || DBGFIsStepping(pVCpu);
    pVCpu->hm.s.fDebugWantRdTscExit    = false;
    pVCpu->hm.s.fUsingDebugLoop        = true;

    /* State we keep to help modify and later restore the VMCS fields we alter, and for detecting steps.  */
    VMXRUNDBGSTATE DbgState;
    hmR0VmxRunDebugStateInit(pVCpu, &DbgState);
    hmR0VmxPreRunGuestDebugStateUpdate(pVCpu, &DbgState, &VmxTransient);

    /*
     * The loop.
     */
    VBOXSTRICTRC rcStrict = VERR_INTERNAL_ERROR_5;
    for (uint32_t cLoops = 0; ; cLoops++)
    {
        Assert(!HMR0SuspendPending());
        HMVMX_ASSERT_CPU_SAFE(pVCpu);
        bool fStepping = pVCpu->hm.s.fSingleInstruction;

        /*
         * Preparatory work for running guest code, this may force us to return
         * to ring-3.  This bugger disables interrupts on VINF_SUCCESS!
         */
        STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatEntry, x);
        hmR0VmxPreRunGuestDebugStateApply(pVCpu, &DbgState); /* Set up execute controls the next two can respond to. */
        rcStrict = hmR0VmxPreRunGuest(pVCpu, &VmxTransient, fStepping);
        if (rcStrict != VINF_SUCCESS)
            break;

        hmR0VmxPreRunGuestCommitted(pVCpu, &VmxTransient);
        hmR0VmxPreRunGuestDebugStateApply(pVCpu, &DbgState); /* Override any obnoxious code in the above two calls. */

        /*
         * Now we can run the guest code.
         */
        int rcRun = hmR0VmxRunGuest(pVCpu);

        /*
         * Restore any residual host-state and save any bits shared between host
         * and guest into the guest-CPU state.  Re-enables interrupts!
         */
        hmR0VmxPostRunGuest(pVCpu, &VmxTransient, rcRun);

        /* Check for errors with running the VM (VMLAUNCH/VMRESUME). */
        if (RT_SUCCESS(rcRun))
        { /* very likely */ }
        else
        {
            STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatPreExit, x);
            hmR0VmxReportWorldSwitchError(pVCpu, rcRun, &VmxTransient);
            /* NOTE(review): this early return bypasses hmR0VmxRunDebugStateRevert() and the restoring of
               the HMCPU indicators done at the end of the function - confirm that is intended here. */
            return rcRun;
        }

        /* Profile the VM-exit. */
        AssertMsg(VmxTransient.uExitReason <= VMX_EXIT_MAX, ("%#x\n", VmxTransient.uExitReason));
        STAM_COUNTER_INC(&pVCpu->hm.s.StatExitAll);
        STAM_COUNTER_INC(&pVCpu->hm.s.paStatExitReasonR0[VmxTransient.uExitReason & MASK_EXITREASON_STAT]);
        STAM_PROFILE_ADV_STOP_START(&pVCpu->hm.s.StatPreExit, &pVCpu->hm.s.StatExitHandling, x);
        HMVMX_START_EXIT_DISPATCH_PROF();

        VBOXVMM_R0_HMVMX_VMEXIT_NOCTX(pVCpu, &pVCpu->cpum.GstCtx, VmxTransient.uExitReason);

        /*
         * Handle the VM-exit - we quit earlier on certain VM-exits, see hmR0VmxRunDebugHandleExit().
         */
        rcStrict = hmR0VmxRunDebugHandleExit(pVCpu, &VmxTransient, &DbgState);
        STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExitHandling, x);
        if (rcStrict != VINF_SUCCESS)
            break;
        /* Cap the number of iterations; once exceeded the loop is left with VINF_EM_RAW_INTERRUPT. */
        if (cLoops > pVCpu->CTX_SUFF(pVM)->hm.s.cMaxResumeLoops)
        {
            STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchMaxResumeLoops);
            rcStrict = VINF_EM_RAW_INTERRUPT;
            break;
        }

        /*
         * Stepping: Did the RIP change, if so, consider it a single step.
         * Otherwise, make sure one of the TFs gets set.
         */
        if (fStepping)
        {
            int rc = hmR0VmxImportGuestState(pVCpu, CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_RIP);
            AssertRC(rc);
            if (   pVCpu->cpum.GstCtx.rip    != DbgState.uRipStart
                || pVCpu->cpum.GstCtx.cs.Sel != DbgState.uCsStart)
            {
                rcStrict = VINF_EM_DBG_STEPPED;
                break;
            }
            /* Still at the starting CS:RIP: flag DR7 as changed for the next iteration. */
            ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_DR7);
        }

        /*
         * Update when dtrace settings changes (DBGF kicks us, so no need to check).
         */
        if (VBOXVMM_GET_SETTINGS_SEQ_NO() != DbgState.uDtraceSettingsSeqNo)
            hmR0VmxPreRunGuestDebugStateUpdate(pVCpu, &DbgState, &VmxTransient);
    }

    /*
     * Clear the X86_EFL_TF if necessary.
     */
    if (pVCpu->hm.s.fClearTrapFlag)
    {
        /* RFLAGS must be imported before the TF bit in the guest context can be modified. */
        int rc = hmR0VmxImportGuestState(pVCpu, CPUMCTX_EXTRN_RFLAGS);
        AssertRC(rc);
        pVCpu->hm.s.fClearTrapFlag = false;
        pVCpu->cpum.GstCtx.eflags.Bits.u1TF = 0;
    }
    /** @todo there seems to be issues with the resume flag when the monitor trap
     *        flag is pending without being used.  Seen early in bios init when
     *        accessing APIC page in protected mode. */

    /*
     * Restore VM-exit control settings as we may not reenter this function the
     * next time around.
     */
    rcStrict = hmR0VmxRunDebugStateRevert(pVCpu, &DbgState, rcStrict);

    /* Restore HMCPU indicators. */
    pVCpu->hm.s.fUsingDebugLoop     = false;
    pVCpu->hm.s.fDebugWantRdTscExit = false;
    pVCpu->hm.s.fSingleInstruction  = fSavedSingleInstruction;

    STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatEntry, x);
    return rcStrict;
}


/** @} */


/**
 * Checks if any expensive dtrace probes are enabled and we should go to the
 * debug loop.
 *
 * The raw enabled-state values of the probes are OR'ed together in three
 * groups (exceptions/interrupts, instructions, VM-exits) and each group is
 * compared against zero.
 *
 * @returns true if we should use debug loop, false if not.
 */
static bool hmR0VmxAnyExpensiveProbesEnabled(void)
{
    /* It's probably faster to OR the raw 32-bit counter variables together.
       Since the variables are in an array and the probes are next to one
       another (more or less), we have good locality.  So, better read
       eight-nine cache lines every time and only have one conditional, than
       128+ conditionals, right?
*/
    /* Group 1: the generic VM-exit probe plus the exception and interrupt probes. */
    return (  VBOXVMM_R0_HMVMX_VMEXIT_ENABLED_RAW() /* expensive too due to context */
            | VBOXVMM_XCPT_DE_ENABLED_RAW()
            | VBOXVMM_XCPT_DB_ENABLED_RAW()
            | VBOXVMM_XCPT_BP_ENABLED_RAW()
            | VBOXVMM_XCPT_OF_ENABLED_RAW()
            | VBOXVMM_XCPT_BR_ENABLED_RAW()
            | VBOXVMM_XCPT_UD_ENABLED_RAW()
            | VBOXVMM_XCPT_NM_ENABLED_RAW()
            | VBOXVMM_XCPT_DF_ENABLED_RAW()
            | VBOXVMM_XCPT_TS_ENABLED_RAW()
            | VBOXVMM_XCPT_NP_ENABLED_RAW()
            | VBOXVMM_XCPT_SS_ENABLED_RAW()
            | VBOXVMM_XCPT_GP_ENABLED_RAW()
            | VBOXVMM_XCPT_PF_ENABLED_RAW()
            | VBOXVMM_XCPT_MF_ENABLED_RAW()
            | VBOXVMM_XCPT_AC_ENABLED_RAW()
            | VBOXVMM_XCPT_XF_ENABLED_RAW()
            | VBOXVMM_XCPT_VE_ENABLED_RAW()
            | VBOXVMM_XCPT_SX_ENABLED_RAW()
            | VBOXVMM_INT_SOFTWARE_ENABLED_RAW()
            | VBOXVMM_INT_HARDWARE_ENABLED_RAW()
           ) != 0
        /* Group 2: the instruction probes. */
        || (  VBOXVMM_INSTR_HALT_ENABLED_RAW()
            | VBOXVMM_INSTR_MWAIT_ENABLED_RAW()
            | VBOXVMM_INSTR_MONITOR_ENABLED_RAW()
            | VBOXVMM_INSTR_CPUID_ENABLED_RAW()
            | VBOXVMM_INSTR_INVD_ENABLED_RAW()
            | VBOXVMM_INSTR_WBINVD_ENABLED_RAW()
            | VBOXVMM_INSTR_INVLPG_ENABLED_RAW()
            | VBOXVMM_INSTR_RDTSC_ENABLED_RAW()
            | VBOXVMM_INSTR_RDTSCP_ENABLED_RAW()
            | VBOXVMM_INSTR_RDPMC_ENABLED_RAW()
            | VBOXVMM_INSTR_RDMSR_ENABLED_RAW()
            | VBOXVMM_INSTR_WRMSR_ENABLED_RAW()
            | VBOXVMM_INSTR_CRX_READ_ENABLED_RAW()
            | VBOXVMM_INSTR_CRX_WRITE_ENABLED_RAW()
            | VBOXVMM_INSTR_DRX_READ_ENABLED_RAW()
            | VBOXVMM_INSTR_DRX_WRITE_ENABLED_RAW()
            | VBOXVMM_INSTR_PAUSE_ENABLED_RAW()
            | VBOXVMM_INSTR_XSETBV_ENABLED_RAW()
            | VBOXVMM_INSTR_SIDT_ENABLED_RAW()
            | VBOXVMM_INSTR_LIDT_ENABLED_RAW()
            | VBOXVMM_INSTR_SGDT_ENABLED_RAW()
            | VBOXVMM_INSTR_LGDT_ENABLED_RAW()
            | VBOXVMM_INSTR_SLDT_ENABLED_RAW()
            | VBOXVMM_INSTR_LLDT_ENABLED_RAW()
            | VBOXVMM_INSTR_STR_ENABLED_RAW()
            | VBOXVMM_INSTR_LTR_ENABLED_RAW()
            | VBOXVMM_INSTR_GETSEC_ENABLED_RAW()
            | VBOXVMM_INSTR_RSM_ENABLED_RAW()
            | VBOXVMM_INSTR_RDRAND_ENABLED_RAW()
            | VBOXVMM_INSTR_RDSEED_ENABLED_RAW()
            | VBOXVMM_INSTR_XSAVES_ENABLED_RAW()
            | VBOXVMM_INSTR_XRSTORS_ENABLED_RAW()
            | VBOXVMM_INSTR_VMM_CALL_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_VMCLEAR_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_VMLAUNCH_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_VMPTRLD_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_VMPTRST_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_VMREAD_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_VMRESUME_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_VMWRITE_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_VMXOFF_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_VMXON_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_VMFUNC_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_INVEPT_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_INVVPID_ENABLED_RAW()
            | VBOXVMM_INSTR_VMX_INVPCID_ENABLED_RAW()
           ) != 0
        /* Group 3: the VM-exit probes. */
        || (  VBOXVMM_EXIT_TASK_SWITCH_ENABLED_RAW()
            | VBOXVMM_EXIT_HALT_ENABLED_RAW()
            | VBOXVMM_EXIT_MWAIT_ENABLED_RAW()
            | VBOXVMM_EXIT_MONITOR_ENABLED_RAW()
            | VBOXVMM_EXIT_CPUID_ENABLED_RAW()
            | VBOXVMM_EXIT_INVD_ENABLED_RAW()
            | VBOXVMM_EXIT_WBINVD_ENABLED_RAW()
            | VBOXVMM_EXIT_INVLPG_ENABLED_RAW()
            | VBOXVMM_EXIT_RDTSC_ENABLED_RAW()
            | VBOXVMM_EXIT_RDTSCP_ENABLED_RAW()
            | VBOXVMM_EXIT_RDPMC_ENABLED_RAW()
            | VBOXVMM_EXIT_RDMSR_ENABLED_RAW()
            | VBOXVMM_EXIT_WRMSR_ENABLED_RAW()
            | VBOXVMM_EXIT_CRX_READ_ENABLED_RAW()
            | VBOXVMM_EXIT_CRX_WRITE_ENABLED_RAW()
            | VBOXVMM_EXIT_DRX_READ_ENABLED_RAW()
            | VBOXVMM_EXIT_DRX_WRITE_ENABLED_RAW()
            | VBOXVMM_EXIT_PAUSE_ENABLED_RAW()
            | VBOXVMM_EXIT_XSETBV_ENABLED_RAW()
            | VBOXVMM_EXIT_SIDT_ENABLED_RAW()
            | VBOXVMM_EXIT_LIDT_ENABLED_RAW()
            | VBOXVMM_EXIT_SGDT_ENABLED_RAW()
            | VBOXVMM_EXIT_LGDT_ENABLED_RAW()
            | VBOXVMM_EXIT_SLDT_ENABLED_RAW()
            | VBOXVMM_EXIT_LLDT_ENABLED_RAW()
            | VBOXVMM_EXIT_STR_ENABLED_RAW()
            | VBOXVMM_EXIT_LTR_ENABLED_RAW()
            | VBOXVMM_EXIT_GETSEC_ENABLED_RAW()
            | VBOXVMM_EXIT_RSM_ENABLED_RAW()
            | VBOXVMM_EXIT_RDRAND_ENABLED_RAW()
            | VBOXVMM_EXIT_RDSEED_ENABLED_RAW()
            | VBOXVMM_EXIT_XSAVES_ENABLED_RAW()
            | VBOXVMM_EXIT_XRSTORS_ENABLED_RAW()
            | VBOXVMM_EXIT_VMM_CALL_ENABLED_RAW()
            | VBOXVMM_EXIT_VMX_VMCLEAR_ENABLED_RAW()
            |
VBOXVMM_EXIT_VMX_VMLAUNCH_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_VMPTRLD_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_VMPTRST_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_VMREAD_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_VMRESUME_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_VMWRITE_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_VMXOFF_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_VMXON_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_VMFUNC_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_INVEPT_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_INVVPID_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_INVPCID_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_EPT_VIOLATION_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_EPT_MISCONFIG_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_VAPIC_ACCESS_ENABLED_RAW() + | VBOXVMM_EXIT_VMX_VAPIC_WRITE_ENABLED_RAW() + ) != 0; +} + + +/** + * Runs the guest code using VT-x. + * + * @returns Strict VBox status code (i.e. informational status codes too). + * @param pVCpu The cross context virtual CPU structure. + */ +VMMR0DECL(VBOXSTRICTRC) VMXR0RunGuestCode(PVMCPU pVCpu) +{ + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + Assert(VMMRZCallRing3IsEnabled(pVCpu)); + Assert(!ASMAtomicUoReadU64(&pCtx->fExtrn)); + HMVMX_ASSERT_PREEMPT_SAFE(pVCpu); + + VMMRZCallRing3SetNotification(pVCpu, hmR0VmxCallRing3Callback, pCtx); + + VBOXSTRICTRC rcStrict; + if ( !pVCpu->hm.s.fUseDebugLoop + && (!VBOXVMM_ANY_PROBES_ENABLED() || !hmR0VmxAnyExpensiveProbesEnabled()) + && !DBGFIsStepping(pVCpu) + && !pVCpu->CTX_SUFF(pVM)->dbgf.ro.cEnabledInt3Breakpoints) + rcStrict = hmR0VmxRunGuestCodeNormal(pVCpu); + else + rcStrict = hmR0VmxRunGuestCodeDebug(pVCpu); + + if (rcStrict == VERR_EM_INTERPRETER) + rcStrict = VINF_EM_RAW_EMULATE_INSTR; + else if (rcStrict == VINF_EM_RESET) + rcStrict = VINF_EM_TRIPLE_FAULT; + + int rc2 = hmR0VmxExitToRing3(pVCpu, rcStrict); + if (RT_FAILURE(rc2)) + { + pVCpu->hm.s.u32HMError = (uint32_t)VBOXSTRICTRC_VAL(rcStrict); + rcStrict = rc2; + } + Assert(!ASMAtomicUoReadU64(&pCtx->fExtrn)); + Assert(!VMMRZCallRing3IsNotificationSet(pVCpu)); + return rcStrict; +} + + +#ifndef 
HMVMX_USE_FUNCTION_TABLE +DECLINLINE(VBOXSTRICTRC) hmR0VmxHandleExit(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient, uint32_t rcReason) +{ +#ifdef DEBUG_ramshankar +#define VMEXIT_CALL_RET(a_fSave, a_CallExpr) \ + do { \ + if (a_fSave != 0) \ + hmR0VmxImportGuestState(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL); \ + VBOXSTRICTRC rcStrict = a_CallExpr; \ + if (a_fSave != 0) \ + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); \ + return rcStrict; \ + } while (0) +#else +# define VMEXIT_CALL_RET(a_fSave, a_CallExpr) return a_CallExpr +#endif + switch (rcReason) + { + case VMX_EXIT_EPT_MISCONFIG: VMEXIT_CALL_RET(0, hmR0VmxExitEptMisconfig(pVCpu, pVmxTransient)); + case VMX_EXIT_EPT_VIOLATION: VMEXIT_CALL_RET(0, hmR0VmxExitEptViolation(pVCpu, pVmxTransient)); + case VMX_EXIT_IO_INSTR: VMEXIT_CALL_RET(0, hmR0VmxExitIoInstr(pVCpu, pVmxTransient)); + case VMX_EXIT_CPUID: VMEXIT_CALL_RET(0, hmR0VmxExitCpuid(pVCpu, pVmxTransient)); + case VMX_EXIT_RDTSC: VMEXIT_CALL_RET(0, hmR0VmxExitRdtsc(pVCpu, pVmxTransient)); + case VMX_EXIT_RDTSCP: VMEXIT_CALL_RET(0, hmR0VmxExitRdtscp(pVCpu, pVmxTransient)); + case VMX_EXIT_APIC_ACCESS: VMEXIT_CALL_RET(0, hmR0VmxExitApicAccess(pVCpu, pVmxTransient)); + case VMX_EXIT_XCPT_OR_NMI: VMEXIT_CALL_RET(0, hmR0VmxExitXcptOrNmi(pVCpu, pVmxTransient)); + case VMX_EXIT_MOV_CRX: VMEXIT_CALL_RET(0, hmR0VmxExitMovCRx(pVCpu, pVmxTransient)); + case VMX_EXIT_EXT_INT: VMEXIT_CALL_RET(0, hmR0VmxExitExtInt(pVCpu, pVmxTransient)); + case VMX_EXIT_INT_WINDOW: VMEXIT_CALL_RET(0, hmR0VmxExitIntWindow(pVCpu, pVmxTransient)); + case VMX_EXIT_TPR_BELOW_THRESHOLD: VMEXIT_CALL_RET(0, hmR0VmxExitTprBelowThreshold(pVCpu, pVmxTransient)); + case VMX_EXIT_MWAIT: VMEXIT_CALL_RET(0, hmR0VmxExitMwait(pVCpu, pVmxTransient)); + case VMX_EXIT_MONITOR: VMEXIT_CALL_RET(0, hmR0VmxExitMonitor(pVCpu, pVmxTransient)); + case VMX_EXIT_TASK_SWITCH: VMEXIT_CALL_RET(0, hmR0VmxExitTaskSwitch(pVCpu, pVmxTransient)); + case VMX_EXIT_PREEMPT_TIMER: VMEXIT_CALL_RET(0, 
hmR0VmxExitPreemptTimer(pVCpu, pVmxTransient)); + case VMX_EXIT_RDMSR: VMEXIT_CALL_RET(0, hmR0VmxExitRdmsr(pVCpu, pVmxTransient)); + case VMX_EXIT_WRMSR: VMEXIT_CALL_RET(0, hmR0VmxExitWrmsr(pVCpu, pVmxTransient)); + case VMX_EXIT_VMCALL: VMEXIT_CALL_RET(0, hmR0VmxExitVmcall(pVCpu, pVmxTransient)); + case VMX_EXIT_MOV_DRX: VMEXIT_CALL_RET(0, hmR0VmxExitMovDRx(pVCpu, pVmxTransient)); + case VMX_EXIT_HLT: VMEXIT_CALL_RET(0, hmR0VmxExitHlt(pVCpu, pVmxTransient)); + case VMX_EXIT_INVD: VMEXIT_CALL_RET(0, hmR0VmxExitInvd(pVCpu, pVmxTransient)); + case VMX_EXIT_INVLPG: VMEXIT_CALL_RET(0, hmR0VmxExitInvlpg(pVCpu, pVmxTransient)); + case VMX_EXIT_RSM: VMEXIT_CALL_RET(0, hmR0VmxExitRsm(pVCpu, pVmxTransient)); + case VMX_EXIT_MTF: VMEXIT_CALL_RET(0, hmR0VmxExitMtf(pVCpu, pVmxTransient)); + case VMX_EXIT_PAUSE: VMEXIT_CALL_RET(0, hmR0VmxExitPause(pVCpu, pVmxTransient)); + case VMX_EXIT_GDTR_IDTR_ACCESS: VMEXIT_CALL_RET(0, hmR0VmxExitXdtrAccess(pVCpu, pVmxTransient)); + case VMX_EXIT_LDTR_TR_ACCESS: VMEXIT_CALL_RET(0, hmR0VmxExitXdtrAccess(pVCpu, pVmxTransient)); + case VMX_EXIT_WBINVD: VMEXIT_CALL_RET(0, hmR0VmxExitWbinvd(pVCpu, pVmxTransient)); + case VMX_EXIT_XSETBV: VMEXIT_CALL_RET(0, hmR0VmxExitXsetbv(pVCpu, pVmxTransient)); + case VMX_EXIT_RDRAND: VMEXIT_CALL_RET(0, hmR0VmxExitRdrand(pVCpu, pVmxTransient)); + case VMX_EXIT_INVPCID: VMEXIT_CALL_RET(0, hmR0VmxExitInvpcid(pVCpu, pVmxTransient)); + case VMX_EXIT_GETSEC: VMEXIT_CALL_RET(0, hmR0VmxExitGetsec(pVCpu, pVmxTransient)); + case VMX_EXIT_RDPMC: VMEXIT_CALL_RET(0, hmR0VmxExitRdpmc(pVCpu, pVmxTransient)); +#ifdef VBOX_WITH_NESTED_HWVIRT_VMX + case VMX_EXIT_VMCLEAR: VMEXIT_CALL_RET(0, hmR0VmxExitVmclear(pVCpu, pVmxTransient)); + case VMX_EXIT_VMLAUNCH: VMEXIT_CALL_RET(0, hmR0VmxExitVmlaunch(pVCpu, pVmxTransient)); + case VMX_EXIT_VMPTRLD: VMEXIT_CALL_RET(0, hmR0VmxExitVmptrld(pVCpu, pVmxTransient)); + case VMX_EXIT_VMPTRST: VMEXIT_CALL_RET(0, hmR0VmxExitVmptrst(pVCpu, pVmxTransient)); + case VMX_EXIT_VMREAD: 
VMEXIT_CALL_RET(0, hmR0VmxExitVmread(pVCpu, pVmxTransient)); + case VMX_EXIT_VMRESUME: VMEXIT_CALL_RET(0, hmR0VmxExitVmwrite(pVCpu, pVmxTransient)); + case VMX_EXIT_VMWRITE: VMEXIT_CALL_RET(0, hmR0VmxExitVmresume(pVCpu, pVmxTransient)); + case VMX_EXIT_VMXOFF: VMEXIT_CALL_RET(0, hmR0VmxExitVmxoff(pVCpu, pVmxTransient)); + case VMX_EXIT_VMXON: VMEXIT_CALL_RET(0, hmR0VmxExitVmxon(pVCpu, pVmxTransient)); +#else + case VMX_EXIT_VMCLEAR: + case VMX_EXIT_VMLAUNCH: + case VMX_EXIT_VMPTRLD: + case VMX_EXIT_VMPTRST: + case VMX_EXIT_VMREAD: + case VMX_EXIT_VMRESUME: + case VMX_EXIT_VMWRITE: + case VMX_EXIT_VMXOFF: + case VMX_EXIT_VMXON: + return hmR0VmxExitSetPendingXcptUD(pVCpu, pVmxTransient); +#endif + + case VMX_EXIT_TRIPLE_FAULT: return hmR0VmxExitTripleFault(pVCpu, pVmxTransient); + case VMX_EXIT_NMI_WINDOW: return hmR0VmxExitNmiWindow(pVCpu, pVmxTransient); + case VMX_EXIT_INIT_SIGNAL: return hmR0VmxExitInitSignal(pVCpu, pVmxTransient); + case VMX_EXIT_SIPI: return hmR0VmxExitSipi(pVCpu, pVmxTransient); + case VMX_EXIT_IO_SMI: return hmR0VmxExitIoSmi(pVCpu, pVmxTransient); + case VMX_EXIT_SMI: return hmR0VmxExitSmi(pVCpu, pVmxTransient); + case VMX_EXIT_ERR_MSR_LOAD: return hmR0VmxExitErrMsrLoad(pVCpu, pVmxTransient); + case VMX_EXIT_ERR_INVALID_GUEST_STATE: return hmR0VmxExitErrInvalidGuestState(pVCpu, pVmxTransient); + case VMX_EXIT_ERR_MACHINE_CHECK: return hmR0VmxExitErrMachineCheck(pVCpu, pVmxTransient); + + case VMX_EXIT_INVEPT: + case VMX_EXIT_INVVPID: + case VMX_EXIT_VMFUNC: + case VMX_EXIT_XSAVES: + case VMX_EXIT_XRSTORS: + return hmR0VmxExitSetPendingXcptUD(pVCpu, pVmxTransient); + + case VMX_EXIT_ENCLS: + case VMX_EXIT_RDSEED: /* only spurious VM-exits, so undefined */ + case VMX_EXIT_PML_FULL: + default: + return hmR0VmxExitErrUndefined(pVCpu, pVmxTransient); + } +#undef VMEXIT_CALL_RET +} +#endif /* !HMVMX_USE_FUNCTION_TABLE */ + + +#ifdef VBOX_STRICT +/* Is there some generic IPRT define for this that are not in Runtime/internal/\* ?? 
*/ +# define HMVMX_ASSERT_PREEMPT_CPUID_VAR() \ + RTCPUID const idAssertCpu = RTThreadPreemptIsEnabled(NIL_RTTHREAD) ? NIL_RTCPUID : RTMpCpuId() + +# define HMVMX_ASSERT_PREEMPT_CPUID() \ + do { \ + RTCPUID const idAssertCpuNow = RTThreadPreemptIsEnabled(NIL_RTTHREAD) ? NIL_RTCPUID : RTMpCpuId(); \ + AssertMsg(idAssertCpu == idAssertCpuNow, ("VMX %#x, %#x\n", idAssertCpu, idAssertCpuNow)); \ + } while (0) + +# define HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(a_pVCpu, a_pVmxTransient) \ + do { \ + AssertPtr((a_pVCpu)); \ + AssertPtr((a_pVmxTransient)); \ + Assert((a_pVmxTransient)->fVMEntryFailed == false); \ + Assert(ASMIntAreEnabled()); \ + HMVMX_ASSERT_PREEMPT_SAFE(a_pVCpu); \ + HMVMX_ASSERT_PREEMPT_CPUID_VAR(); \ + Log4Func(("vcpu[%RU32] -v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v-v\n", (a_pVCpu)->idCpu)); \ + HMVMX_ASSERT_PREEMPT_SAFE(a_pVCpu); \ + if (VMMR0IsLogFlushDisabled((a_pVCpu))) \ + HMVMX_ASSERT_PREEMPT_CPUID(); \ + HMVMX_STOP_EXIT_DISPATCH_PROF(); \ + } while (0) + +# define HMVMX_VALIDATE_EXIT_XCPT_HANDLER_PARAMS(a_pVCpu, a_pVmxTransient) \ + do { \ + Log4Func(("\n")); \ + } while (0) +#else +# define HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(a_pVCpu, a_pVmxTransient) \ + do { \ + HMVMX_STOP_EXIT_DISPATCH_PROF(); \ + NOREF((a_pVCpu)); NOREF((a_pVmxTransient)); \ + } while (0) +# define HMVMX_VALIDATE_EXIT_XCPT_HANDLER_PARAMS(a_pVCpu, a_pVmxTransient) do { } while (0) +#endif + + +/** + * Advances the guest RIP by the specified number of bytes. + * + * @param pVCpu The cross context virtual CPU structure. + * @param cbInstr Number of bytes to advance the RIP by. + * + * @remarks No-long-jump zone!!! + */ +DECLINLINE(void) hmR0VmxAdvanceGuestRipBy(PVMCPU pVCpu, uint32_t cbInstr) +{ + /* Advance the RIP. */ + pVCpu->cpum.GstCtx.rip += cbInstr; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP); + + /* Update interrupt inhibition. 
*/ + if ( VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS) + && pVCpu->cpum.GstCtx.rip != EMGetInhibitInterruptsPC(pVCpu)) + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS); +} + + +/** + * Advances the guest RIP after reading it from the VMCS. + * + * @returns VBox status code, no informational status codes. + * @param pVCpu The cross context virtual CPU structure. + * @param pVmxTransient Pointer to the VMX transient structure. + * + * @remarks No-long-jump zone!!! + */ +static int hmR0VmxAdvanceGuestRip(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + rc |= hmR0VmxImportGuestState(pVCpu, CPUMCTX_EXTRN_RIP | CPUMCTX_EXTRN_RFLAGS); + AssertRCReturn(rc, rc); + + hmR0VmxAdvanceGuestRipBy(pVCpu, pVmxTransient->cbInstr); + return VINF_SUCCESS; +} + + +/** + * Tries to determine what part of the guest-state VT-x has deemed as invalid + * and update error record fields accordingly. + * + * @return VMX_IGS_* return codes. + * @retval VMX_IGS_REASON_NOT_FOUND if this function could not find anything + * wrong with the guest state. + * + * @param pVCpu The cross context virtual CPU structure. + * + * @remarks This function assumes our cache of the VMCS controls + * are valid, i.e. hmR0VmxCheckVmcsCtls() succeeded. + */ +static uint32_t hmR0VmxCheckGuestState(PVMCPU pVCpu) +{ +#define HMVMX_ERROR_BREAK(err) { uError = (err); break; } +#define HMVMX_CHECK_BREAK(expr, err) if (!(expr)) { \ + uError = (err); \ + break; \ + } else do { } while (0) + + int rc; + PVM pVM = pVCpu->CTX_SUFF(pVM); + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + uint32_t uError = VMX_IGS_ERROR; + uint32_t u32Val; + bool const fUnrestrictedGuest = pVM->hm.s.vmx.fUnrestrictedGuest; + + do + { + /* + * CR0. 
+ */ + uint32_t fSetCr0 = (uint32_t)(pVM->hm.s.vmx.Msrs.u64Cr0Fixed0 & pVM->hm.s.vmx.Msrs.u64Cr0Fixed1); + uint32_t const fZapCr0 = (uint32_t)(pVM->hm.s.vmx.Msrs.u64Cr0Fixed0 | pVM->hm.s.vmx.Msrs.u64Cr0Fixed1); + /* Exceptions for unrestricted-guests for fixed CR0 bits (PE, PG). + See Intel spec. 26.3.1 "Checks on Guest Control Registers, Debug Registers and MSRs." */ + if (fUnrestrictedGuest) + fSetCr0 &= ~(X86_CR0_PE | X86_CR0_PG); + + uint32_t u32GuestCr0; + rc = VMXReadVmcs32(VMX_VMCS_GUEST_CR0, &u32GuestCr0); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK((u32GuestCr0 & fSetCr0) == fSetCr0, VMX_IGS_CR0_FIXED1); + HMVMX_CHECK_BREAK(!(u32GuestCr0 & ~fZapCr0), VMX_IGS_CR0_FIXED0); + if ( !fUnrestrictedGuest + && (u32GuestCr0 & X86_CR0_PG) + && !(u32GuestCr0 & X86_CR0_PE)) + { + HMVMX_ERROR_BREAK(VMX_IGS_CR0_PG_PE_COMBO); + } + + /* + * CR4. + */ + uint64_t const fSetCr4 = (pVM->hm.s.vmx.Msrs.u64Cr4Fixed0 & pVM->hm.s.vmx.Msrs.u64Cr4Fixed1); + uint64_t const fZapCr4 = (pVM->hm.s.vmx.Msrs.u64Cr4Fixed0 | pVM->hm.s.vmx.Msrs.u64Cr4Fixed1); + + uint32_t u32GuestCr4; + rc = VMXReadVmcs32(VMX_VMCS_GUEST_CR4, &u32GuestCr4); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK((u32GuestCr4 & fSetCr4) == fSetCr4, VMX_IGS_CR4_FIXED1); + HMVMX_CHECK_BREAK(!(u32GuestCr4 & ~fZapCr4), VMX_IGS_CR4_FIXED0); + + /* + * IA32_DEBUGCTL MSR. + */ + uint64_t u64Val; + rc = VMXReadVmcs64(VMX_VMCS64_GUEST_DEBUGCTL_FULL, &u64Val); + AssertRCBreak(rc); + if ( (pVCpu->hm.s.vmx.u32EntryCtls & VMX_ENTRY_CTLS_LOAD_DEBUG) + && (u64Val & 0xfffffe3c)) /* Bits 31:9, bits 5:2 MBZ. */ + { + HMVMX_ERROR_BREAK(VMX_IGS_DEBUGCTL_MSR_RESERVED); + } + uint64_t u64DebugCtlMsr = u64Val; + +#ifdef VBOX_STRICT + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_ENTRY, &u32Val); + AssertRCBreak(rc); + Assert(u32Val == pVCpu->hm.s.vmx.u32EntryCtls); +#endif + bool const fLongModeGuest = RT_BOOL(pVCpu->hm.s.vmx.u32EntryCtls & VMX_ENTRY_CTLS_IA32E_MODE_GUEST); + + /* + * RIP and RFLAGS. 
+ */ + uint32_t u32Eflags; +#if HC_ARCH_BITS == 64 + rc = VMXReadVmcs64(VMX_VMCS_GUEST_RIP, &u64Val); + AssertRCBreak(rc); + /* pCtx->rip can be different than the one in the VMCS (e.g. run guest code and VM-exits that don't update it). */ + if ( !fLongModeGuest + || !pCtx->cs.Attr.n.u1Long) + { + HMVMX_CHECK_BREAK(!(u64Val & UINT64_C(0xffffffff00000000)), VMX_IGS_LONGMODE_RIP_INVALID); + } + /** @todo If the processor supports N < 64 linear-address bits, bits 63:N + * must be identical if the "IA-32e mode guest" VM-entry + * control is 1 and CS.L is 1. No check applies if the + * CPU supports 64 linear-address bits. */ + + /* Flags in pCtx can be different (real-on-v86 for instance). We are only concerned about the VMCS contents here. */ + rc = VMXReadVmcs64(VMX_VMCS_GUEST_RFLAGS, &u64Val); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK(!(u64Val & UINT64_C(0xffffffffffc08028)), /* Bit 63:22, Bit 15, 5, 3 MBZ. */ + VMX_IGS_RFLAGS_RESERVED); + HMVMX_CHECK_BREAK((u64Val & X86_EFL_RA1_MASK), VMX_IGS_RFLAGS_RESERVED1); /* Bit 1 MB1. */ + u32Eflags = u64Val; +#else + rc = VMXReadVmcs32(VMX_VMCS_GUEST_RFLAGS, &u32Eflags); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK(!(u32Eflags & 0xffc08028), VMX_IGS_RFLAGS_RESERVED); /* Bit 31:22, Bit 15, 5, 3 MBZ. */ + HMVMX_CHECK_BREAK((u32Eflags & X86_EFL_RA1_MASK), VMX_IGS_RFLAGS_RESERVED1); /* Bit 1 MB1. */ +#endif + + if ( fLongModeGuest + || ( fUnrestrictedGuest + && !(u32GuestCr0 & X86_CR0_PE))) + { + HMVMX_CHECK_BREAK(!(u32Eflags & X86_EFL_VM), VMX_IGS_RFLAGS_VM_INVALID); + } + + uint32_t u32EntryInfo; + rc = VMXReadVmcs32(VMX_VMCS32_CTRL_ENTRY_INTERRUPTION_INFO, &u32EntryInfo); + AssertRCBreak(rc); + if ( VMX_ENTRY_INT_INFO_IS_VALID(u32EntryInfo) + && VMX_ENTRY_INT_INFO_TYPE(u32EntryInfo) == VMX_EXIT_INT_INFO_TYPE_EXT_INT) + { + HMVMX_CHECK_BREAK(u32Eflags & X86_EFL_IF, VMX_IGS_RFLAGS_IF_INVALID); + } + + /* + * 64-bit checks. 
+ */ +#if HC_ARCH_BITS == 64 + if (fLongModeGuest) + { + HMVMX_CHECK_BREAK(u32GuestCr0 & X86_CR0_PG, VMX_IGS_CR0_PG_LONGMODE); + HMVMX_CHECK_BREAK(u32GuestCr4 & X86_CR4_PAE, VMX_IGS_CR4_PAE_LONGMODE); + } + + if ( !fLongModeGuest + && (u32GuestCr4 & X86_CR4_PCIDE)) + { + HMVMX_ERROR_BREAK(VMX_IGS_CR4_PCIDE); + } + + /** @todo CR3 field must be such that bits 63:52 and bits in the range + * 51:32 beyond the processor's physical-address width are 0. */ + + if ( (pVCpu->hm.s.vmx.u32EntryCtls & VMX_ENTRY_CTLS_LOAD_DEBUG) + && (pCtx->dr[7] & X86_DR7_MBZ_MASK)) + { + HMVMX_ERROR_BREAK(VMX_IGS_DR7_RESERVED); + } + + rc = VMXReadVmcs64(VMX_VMCS_HOST_SYSENTER_ESP, &u64Val); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK(X86_IS_CANONICAL(u64Val), VMX_IGS_SYSENTER_ESP_NOT_CANONICAL); + + rc = VMXReadVmcs64(VMX_VMCS_HOST_SYSENTER_EIP, &u64Val); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK(X86_IS_CANONICAL(u64Val), VMX_IGS_SYSENTER_EIP_NOT_CANONICAL); +#endif + + /* + * PERF_GLOBAL MSR. + */ + if (pVCpu->hm.s.vmx.u32EntryCtls & VMX_ENTRY_CTLS_LOAD_PERF_MSR) + { + rc = VMXReadVmcs64(VMX_VMCS64_GUEST_PERF_GLOBAL_CTRL_FULL, &u64Val); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK(!(u64Val & UINT64_C(0xfffffff8fffffffc)), + VMX_IGS_PERF_GLOBAL_MSR_RESERVED); /* Bits 63:35, bits 31:2 MBZ. */ + } + + /* + * PAT MSR. + */ + if (pVCpu->hm.s.vmx.u32EntryCtls & VMX_ENTRY_CTLS_LOAD_PAT_MSR) + { + rc = VMXReadVmcs64(VMX_VMCS64_GUEST_PAT_FULL, &u64Val); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK(!(u64Val & UINT64_C(0x707070707070707)), VMX_IGS_PAT_MSR_RESERVED); + for (unsigned i = 0; i < 8; i++) + { + uint8_t u8Val = (u64Val & 0xff); + if ( u8Val != 0 /* UC */ + && u8Val != 1 /* WC */ + && u8Val != 4 /* WT */ + && u8Val != 5 /* WP */ + && u8Val != 6 /* WB */ + && u8Val != 7 /* UC- */) + { + HMVMX_ERROR_BREAK(VMX_IGS_PAT_MSR_INVALID); + } + u64Val >>= 8; + } + } + + /* + * EFER MSR. 
+ */ + if (pVCpu->hm.s.vmx.u32EntryCtls & VMX_ENTRY_CTLS_LOAD_EFER_MSR) + { + Assert(pVM->hm.s.vmx.fSupportsVmcsEfer); + rc = VMXReadVmcs64(VMX_VMCS64_GUEST_EFER_FULL, &u64Val); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK(!(u64Val & UINT64_C(0xfffffffffffff2fe)), + VMX_IGS_EFER_MSR_RESERVED); /* Bits 63:12, bit 9, bits 7:1 MBZ. */ + HMVMX_CHECK_BREAK(RT_BOOL(u64Val & MSR_K6_EFER_LMA) == RT_BOOL( pVCpu->hm.s.vmx.u32EntryCtls + & VMX_ENTRY_CTLS_IA32E_MODE_GUEST), + VMX_IGS_EFER_LMA_GUEST_MODE_MISMATCH); + /** @todo r=ramshankar: Unrestricted check here is probably wrong, see + * iemVmxVmentryCheckGuestState(). */ + HMVMX_CHECK_BREAK( fUnrestrictedGuest + || !(u32GuestCr0 & X86_CR0_PG) + || RT_BOOL(u64Val & MSR_K6_EFER_LMA) == RT_BOOL(u64Val & MSR_K6_EFER_LME), + VMX_IGS_EFER_LMA_LME_MISMATCH); + } + + /* + * Segment registers. + */ + HMVMX_CHECK_BREAK( (pCtx->ldtr.Attr.u & X86DESCATTR_UNUSABLE) + || !(pCtx->ldtr.Sel & X86_SEL_LDT), VMX_IGS_LDTR_TI_INVALID); + if (!(u32Eflags & X86_EFL_VM)) + { + /* CS */ + HMVMX_CHECK_BREAK(pCtx->cs.Attr.n.u1Present, VMX_IGS_CS_ATTR_P_INVALID); + HMVMX_CHECK_BREAK(!(pCtx->cs.Attr.u & 0xf00), VMX_IGS_CS_ATTR_RESERVED); + HMVMX_CHECK_BREAK(!(pCtx->cs.Attr.u & 0xfffe0000), VMX_IGS_CS_ATTR_RESERVED); + HMVMX_CHECK_BREAK( (pCtx->cs.u32Limit & 0xfff) == 0xfff + || !(pCtx->cs.Attr.n.u1Granularity), VMX_IGS_CS_ATTR_G_INVALID); + HMVMX_CHECK_BREAK( !(pCtx->cs.u32Limit & 0xfff00000) + || (pCtx->cs.Attr.n.u1Granularity), VMX_IGS_CS_ATTR_G_INVALID); + /* CS cannot be loaded with NULL in protected mode. 
*/ + HMVMX_CHECK_BREAK(pCtx->cs.Attr.u && !(pCtx->cs.Attr.u & X86DESCATTR_UNUSABLE), VMX_IGS_CS_ATTR_UNUSABLE); + HMVMX_CHECK_BREAK(pCtx->cs.Attr.n.u1DescType, VMX_IGS_CS_ATTR_S_INVALID); + if (pCtx->cs.Attr.n.u4Type == 9 || pCtx->cs.Attr.n.u4Type == 11) + HMVMX_CHECK_BREAK(pCtx->cs.Attr.n.u2Dpl == pCtx->ss.Attr.n.u2Dpl, VMX_IGS_CS_SS_ATTR_DPL_UNEQUAL); + else if (pCtx->cs.Attr.n.u4Type == 13 || pCtx->cs.Attr.n.u4Type == 15) + HMVMX_CHECK_BREAK(pCtx->cs.Attr.n.u2Dpl <= pCtx->ss.Attr.n.u2Dpl, VMX_IGS_CS_SS_ATTR_DPL_MISMATCH); + else if (pVM->hm.s.vmx.fUnrestrictedGuest && pCtx->cs.Attr.n.u4Type == 3) + HMVMX_CHECK_BREAK(pCtx->cs.Attr.n.u2Dpl == 0, VMX_IGS_CS_ATTR_DPL_INVALID); + else + HMVMX_ERROR_BREAK(VMX_IGS_CS_ATTR_TYPE_INVALID); + + /* SS */ + HMVMX_CHECK_BREAK( pVM->hm.s.vmx.fUnrestrictedGuest + || (pCtx->ss.Sel & X86_SEL_RPL) == (pCtx->cs.Sel & X86_SEL_RPL), VMX_IGS_SS_CS_RPL_UNEQUAL); + HMVMX_CHECK_BREAK(pCtx->ss.Attr.n.u2Dpl == (pCtx->ss.Sel & X86_SEL_RPL), VMX_IGS_SS_ATTR_DPL_RPL_UNEQUAL); + if ( !(pCtx->cr0 & X86_CR0_PE) + || pCtx->cs.Attr.n.u4Type == 3) + { + HMVMX_CHECK_BREAK(!pCtx->ss.Attr.n.u2Dpl, VMX_IGS_SS_ATTR_DPL_INVALID); + } + if (!(pCtx->ss.Attr.u & X86DESCATTR_UNUSABLE)) + { + HMVMX_CHECK_BREAK(pCtx->ss.Attr.n.u4Type == 3 || pCtx->ss.Attr.n.u4Type == 7, VMX_IGS_SS_ATTR_TYPE_INVALID); + HMVMX_CHECK_BREAK(pCtx->ss.Attr.n.u1Present, VMX_IGS_SS_ATTR_P_INVALID); + HMVMX_CHECK_BREAK(!(pCtx->ss.Attr.u & 0xf00), VMX_IGS_SS_ATTR_RESERVED); + HMVMX_CHECK_BREAK(!(pCtx->ss.Attr.u & 0xfffe0000), VMX_IGS_SS_ATTR_RESERVED); + HMVMX_CHECK_BREAK( (pCtx->ss.u32Limit & 0xfff) == 0xfff + || !(pCtx->ss.Attr.n.u1Granularity), VMX_IGS_SS_ATTR_G_INVALID); + HMVMX_CHECK_BREAK( !(pCtx->ss.u32Limit & 0xfff00000) + || (pCtx->ss.Attr.n.u1Granularity), VMX_IGS_SS_ATTR_G_INVALID); + } + + /* DS, ES, FS, GS - only check for usable selectors, see hmR0VmxExportGuestSegmenReg(). 
*/ + if (!(pCtx->ds.Attr.u & X86DESCATTR_UNUSABLE)) + { + HMVMX_CHECK_BREAK(pCtx->ds.Attr.n.u4Type & X86_SEL_TYPE_ACCESSED, VMX_IGS_DS_ATTR_A_INVALID); + HMVMX_CHECK_BREAK(pCtx->ds.Attr.n.u1Present, VMX_IGS_DS_ATTR_P_INVALID); + HMVMX_CHECK_BREAK( pVM->hm.s.vmx.fUnrestrictedGuest + || pCtx->ds.Attr.n.u4Type > 11 + || pCtx->ds.Attr.n.u2Dpl >= (pCtx->ds.Sel & X86_SEL_RPL), VMX_IGS_DS_ATTR_DPL_RPL_UNEQUAL); + HMVMX_CHECK_BREAK(!(pCtx->ds.Attr.u & 0xf00), VMX_IGS_DS_ATTR_RESERVED); + HMVMX_CHECK_BREAK(!(pCtx->ds.Attr.u & 0xfffe0000), VMX_IGS_DS_ATTR_RESERVED); + HMVMX_CHECK_BREAK( (pCtx->ds.u32Limit & 0xfff) == 0xfff + || !(pCtx->ds.Attr.n.u1Granularity), VMX_IGS_DS_ATTR_G_INVALID); + HMVMX_CHECK_BREAK( !(pCtx->ds.u32Limit & 0xfff00000) + || (pCtx->ds.Attr.n.u1Granularity), VMX_IGS_DS_ATTR_G_INVALID); + HMVMX_CHECK_BREAK( !(pCtx->ds.Attr.n.u4Type & X86_SEL_TYPE_CODE) + || (pCtx->ds.Attr.n.u4Type & X86_SEL_TYPE_READ), VMX_IGS_DS_ATTR_TYPE_INVALID); + } + if (!(pCtx->es.Attr.u & X86DESCATTR_UNUSABLE)) + { + HMVMX_CHECK_BREAK(pCtx->es.Attr.n.u4Type & X86_SEL_TYPE_ACCESSED, VMX_IGS_ES_ATTR_A_INVALID); + HMVMX_CHECK_BREAK(pCtx->es.Attr.n.u1Present, VMX_IGS_ES_ATTR_P_INVALID); + HMVMX_CHECK_BREAK( pVM->hm.s.vmx.fUnrestrictedGuest + || pCtx->es.Attr.n.u4Type > 11 + || pCtx->es.Attr.n.u2Dpl >= (pCtx->es.Sel & X86_SEL_RPL), VMX_IGS_DS_ATTR_DPL_RPL_UNEQUAL); + HMVMX_CHECK_BREAK(!(pCtx->es.Attr.u & 0xf00), VMX_IGS_ES_ATTR_RESERVED); + HMVMX_CHECK_BREAK(!(pCtx->es.Attr.u & 0xfffe0000), VMX_IGS_ES_ATTR_RESERVED); + HMVMX_CHECK_BREAK( (pCtx->es.u32Limit & 0xfff) == 0xfff + || !(pCtx->es.Attr.n.u1Granularity), VMX_IGS_ES_ATTR_G_INVALID); + HMVMX_CHECK_BREAK( !(pCtx->es.u32Limit & 0xfff00000) + || (pCtx->es.Attr.n.u1Granularity), VMX_IGS_ES_ATTR_G_INVALID); + HMVMX_CHECK_BREAK( !(pCtx->es.Attr.n.u4Type & X86_SEL_TYPE_CODE) + || (pCtx->es.Attr.n.u4Type & X86_SEL_TYPE_READ), VMX_IGS_ES_ATTR_TYPE_INVALID); + } + if (!(pCtx->fs.Attr.u & X86DESCATTR_UNUSABLE)) + { + 
HMVMX_CHECK_BREAK(pCtx->fs.Attr.n.u4Type & X86_SEL_TYPE_ACCESSED, VMX_IGS_FS_ATTR_A_INVALID); + HMVMX_CHECK_BREAK(pCtx->fs.Attr.n.u1Present, VMX_IGS_FS_ATTR_P_INVALID); + HMVMX_CHECK_BREAK( pVM->hm.s.vmx.fUnrestrictedGuest + || pCtx->fs.Attr.n.u4Type > 11 + || pCtx->fs.Attr.n.u2Dpl >= (pCtx->fs.Sel & X86_SEL_RPL), VMX_IGS_FS_ATTR_DPL_RPL_UNEQUAL); + HMVMX_CHECK_BREAK(!(pCtx->fs.Attr.u & 0xf00), VMX_IGS_FS_ATTR_RESERVED); + HMVMX_CHECK_BREAK(!(pCtx->fs.Attr.u & 0xfffe0000), VMX_IGS_FS_ATTR_RESERVED); + HMVMX_CHECK_BREAK( (pCtx->fs.u32Limit & 0xfff) == 0xfff + || !(pCtx->fs.Attr.n.u1Granularity), VMX_IGS_FS_ATTR_G_INVALID); + HMVMX_CHECK_BREAK( !(pCtx->fs.u32Limit & 0xfff00000) + || (pCtx->fs.Attr.n.u1Granularity), VMX_IGS_FS_ATTR_G_INVALID); + HMVMX_CHECK_BREAK( !(pCtx->fs.Attr.n.u4Type & X86_SEL_TYPE_CODE) + || (pCtx->fs.Attr.n.u4Type & X86_SEL_TYPE_READ), VMX_IGS_FS_ATTR_TYPE_INVALID); + } + if (!(pCtx->gs.Attr.u & X86DESCATTR_UNUSABLE)) + { + HMVMX_CHECK_BREAK(pCtx->gs.Attr.n.u4Type & X86_SEL_TYPE_ACCESSED, VMX_IGS_GS_ATTR_A_INVALID); + HMVMX_CHECK_BREAK(pCtx->gs.Attr.n.u1Present, VMX_IGS_GS_ATTR_P_INVALID); + HMVMX_CHECK_BREAK( pVM->hm.s.vmx.fUnrestrictedGuest + || pCtx->gs.Attr.n.u4Type > 11 + || pCtx->gs.Attr.n.u2Dpl >= (pCtx->gs.Sel & X86_SEL_RPL), VMX_IGS_GS_ATTR_DPL_RPL_UNEQUAL); + HMVMX_CHECK_BREAK(!(pCtx->gs.Attr.u & 0xf00), VMX_IGS_GS_ATTR_RESERVED); + HMVMX_CHECK_BREAK(!(pCtx->gs.Attr.u & 0xfffe0000), VMX_IGS_GS_ATTR_RESERVED); + HMVMX_CHECK_BREAK( (pCtx->gs.u32Limit & 0xfff) == 0xfff + || !(pCtx->gs.Attr.n.u1Granularity), VMX_IGS_GS_ATTR_G_INVALID); + HMVMX_CHECK_BREAK( !(pCtx->gs.u32Limit & 0xfff00000) + || (pCtx->gs.Attr.n.u1Granularity), VMX_IGS_GS_ATTR_G_INVALID); + HMVMX_CHECK_BREAK( !(pCtx->gs.Attr.n.u4Type & X86_SEL_TYPE_CODE) + || (pCtx->gs.Attr.n.u4Type & X86_SEL_TYPE_READ), VMX_IGS_GS_ATTR_TYPE_INVALID); + } + /* 64-bit capable CPUs. 
*/ +#if HC_ARCH_BITS == 64 + HMVMX_CHECK_BREAK(X86_IS_CANONICAL(pCtx->fs.u64Base), VMX_IGS_FS_BASE_NOT_CANONICAL); + HMVMX_CHECK_BREAK(X86_IS_CANONICAL(pCtx->gs.u64Base), VMX_IGS_GS_BASE_NOT_CANONICAL); + HMVMX_CHECK_BREAK( (pCtx->ldtr.Attr.u & X86DESCATTR_UNUSABLE) + || X86_IS_CANONICAL(pCtx->ldtr.u64Base), VMX_IGS_LDTR_BASE_NOT_CANONICAL); + HMVMX_CHECK_BREAK(!RT_HI_U32(pCtx->cs.u64Base), VMX_IGS_LONGMODE_CS_BASE_INVALID); + HMVMX_CHECK_BREAK((pCtx->ss.Attr.u & X86DESCATTR_UNUSABLE) || !RT_HI_U32(pCtx->ss.u64Base), + VMX_IGS_LONGMODE_SS_BASE_INVALID); + HMVMX_CHECK_BREAK((pCtx->ds.Attr.u & X86DESCATTR_UNUSABLE) || !RT_HI_U32(pCtx->ds.u64Base), + VMX_IGS_LONGMODE_DS_BASE_INVALID); + HMVMX_CHECK_BREAK((pCtx->es.Attr.u & X86DESCATTR_UNUSABLE) || !RT_HI_U32(pCtx->es.u64Base), + VMX_IGS_LONGMODE_ES_BASE_INVALID); +#endif + } + else + { + /* V86 mode checks. */ + uint32_t u32CSAttr, u32SSAttr, u32DSAttr, u32ESAttr, u32FSAttr, u32GSAttr; + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + { + u32CSAttr = 0xf3; u32SSAttr = 0xf3; + u32DSAttr = 0xf3; u32ESAttr = 0xf3; + u32FSAttr = 0xf3; u32GSAttr = 0xf3; + } + else + { + u32CSAttr = pCtx->cs.Attr.u; u32SSAttr = pCtx->ss.Attr.u; + u32DSAttr = pCtx->ds.Attr.u; u32ESAttr = pCtx->es.Attr.u; + u32FSAttr = pCtx->fs.Attr.u; u32GSAttr = pCtx->gs.Attr.u; + } + + /* CS */ + HMVMX_CHECK_BREAK((pCtx->cs.u64Base == (uint64_t)pCtx->cs.Sel << 4), VMX_IGS_V86_CS_BASE_INVALID); + HMVMX_CHECK_BREAK(pCtx->cs.u32Limit == 0xffff, VMX_IGS_V86_CS_LIMIT_INVALID); + HMVMX_CHECK_BREAK(u32CSAttr == 0xf3, VMX_IGS_V86_CS_ATTR_INVALID); + /* SS */ + HMVMX_CHECK_BREAK((pCtx->ss.u64Base == (uint64_t)pCtx->ss.Sel << 4), VMX_IGS_V86_SS_BASE_INVALID); + HMVMX_CHECK_BREAK(pCtx->ss.u32Limit == 0xffff, VMX_IGS_V86_SS_LIMIT_INVALID); + HMVMX_CHECK_BREAK(u32SSAttr == 0xf3, VMX_IGS_V86_SS_ATTR_INVALID); + /* DS */ + HMVMX_CHECK_BREAK((pCtx->ds.u64Base == (uint64_t)pCtx->ds.Sel << 4), VMX_IGS_V86_DS_BASE_INVALID); + HMVMX_CHECK_BREAK(pCtx->ds.u32Limit == 0xffff, 
VMX_IGS_V86_DS_LIMIT_INVALID); + HMVMX_CHECK_BREAK(u32DSAttr == 0xf3, VMX_IGS_V86_DS_ATTR_INVALID); + /* ES */ + HMVMX_CHECK_BREAK((pCtx->es.u64Base == (uint64_t)pCtx->es.Sel << 4), VMX_IGS_V86_ES_BASE_INVALID); + HMVMX_CHECK_BREAK(pCtx->es.u32Limit == 0xffff, VMX_IGS_V86_ES_LIMIT_INVALID); + HMVMX_CHECK_BREAK(u32ESAttr == 0xf3, VMX_IGS_V86_ES_ATTR_INVALID); + /* FS */ + HMVMX_CHECK_BREAK((pCtx->fs.u64Base == (uint64_t)pCtx->fs.Sel << 4), VMX_IGS_V86_FS_BASE_INVALID); + HMVMX_CHECK_BREAK(pCtx->fs.u32Limit == 0xffff, VMX_IGS_V86_FS_LIMIT_INVALID); + HMVMX_CHECK_BREAK(u32FSAttr == 0xf3, VMX_IGS_V86_FS_ATTR_INVALID); + /* GS */ + HMVMX_CHECK_BREAK((pCtx->gs.u64Base == (uint64_t)pCtx->gs.Sel << 4), VMX_IGS_V86_GS_BASE_INVALID); + HMVMX_CHECK_BREAK(pCtx->gs.u32Limit == 0xffff, VMX_IGS_V86_GS_LIMIT_INVALID); + HMVMX_CHECK_BREAK(u32GSAttr == 0xf3, VMX_IGS_V86_GS_ATTR_INVALID); + /* 64-bit capable CPUs. */ +#if HC_ARCH_BITS == 64 + HMVMX_CHECK_BREAK(X86_IS_CANONICAL(pCtx->fs.u64Base), VMX_IGS_FS_BASE_NOT_CANONICAL); + HMVMX_CHECK_BREAK(X86_IS_CANONICAL(pCtx->gs.u64Base), VMX_IGS_GS_BASE_NOT_CANONICAL); + HMVMX_CHECK_BREAK( (pCtx->ldtr.Attr.u & X86DESCATTR_UNUSABLE) + || X86_IS_CANONICAL(pCtx->ldtr.u64Base), VMX_IGS_LDTR_BASE_NOT_CANONICAL); + HMVMX_CHECK_BREAK(!RT_HI_U32(pCtx->cs.u64Base), VMX_IGS_LONGMODE_CS_BASE_INVALID); + HMVMX_CHECK_BREAK((pCtx->ss.Attr.u & X86DESCATTR_UNUSABLE) || !RT_HI_U32(pCtx->ss.u64Base), + VMX_IGS_LONGMODE_SS_BASE_INVALID); + HMVMX_CHECK_BREAK((pCtx->ds.Attr.u & X86DESCATTR_UNUSABLE) || !RT_HI_U32(pCtx->ds.u64Base), + VMX_IGS_LONGMODE_DS_BASE_INVALID); + HMVMX_CHECK_BREAK((pCtx->es.Attr.u & X86DESCATTR_UNUSABLE) || !RT_HI_U32(pCtx->es.u64Base), + VMX_IGS_LONGMODE_ES_BASE_INVALID); +#endif + } + + /* + * TR. + */ + HMVMX_CHECK_BREAK(!(pCtx->tr.Sel & X86_SEL_LDT), VMX_IGS_TR_TI_INVALID); + /* 64-bit capable CPUs. 
*/ +#if HC_ARCH_BITS == 64 + HMVMX_CHECK_BREAK(X86_IS_CANONICAL(pCtx->tr.u64Base), VMX_IGS_TR_BASE_NOT_CANONICAL); +#endif + if (fLongModeGuest) + { + HMVMX_CHECK_BREAK(pCtx->tr.Attr.n.u4Type == 11, /* 64-bit busy TSS. */ + VMX_IGS_LONGMODE_TR_ATTR_TYPE_INVALID); + } + else + { + HMVMX_CHECK_BREAK( pCtx->tr.Attr.n.u4Type == 3 /* 16-bit busy TSS. */ + || pCtx->tr.Attr.n.u4Type == 11, /* 32-bit busy TSS.*/ + VMX_IGS_TR_ATTR_TYPE_INVALID); + } + HMVMX_CHECK_BREAK(!pCtx->tr.Attr.n.u1DescType, VMX_IGS_TR_ATTR_S_INVALID); + HMVMX_CHECK_BREAK(pCtx->tr.Attr.n.u1Present, VMX_IGS_TR_ATTR_P_INVALID); + HMVMX_CHECK_BREAK(!(pCtx->tr.Attr.u & 0xf00), VMX_IGS_TR_ATTR_RESERVED); /* Bits 11:8 MBZ. */ + HMVMX_CHECK_BREAK( (pCtx->tr.u32Limit & 0xfff) == 0xfff + || !(pCtx->tr.Attr.n.u1Granularity), VMX_IGS_TR_ATTR_G_INVALID); + HMVMX_CHECK_BREAK( !(pCtx->tr.u32Limit & 0xfff00000) + || (pCtx->tr.Attr.n.u1Granularity), VMX_IGS_TR_ATTR_G_INVALID); + HMVMX_CHECK_BREAK(!(pCtx->tr.Attr.u & X86DESCATTR_UNUSABLE), VMX_IGS_TR_ATTR_UNUSABLE); + + /* + * GDTR and IDTR. + */ +#if HC_ARCH_BITS == 64 + rc = VMXReadVmcs64(VMX_VMCS_GUEST_GDTR_BASE, &u64Val); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK(X86_IS_CANONICAL(u64Val), VMX_IGS_GDTR_BASE_NOT_CANONICAL); + + rc = VMXReadVmcs64(VMX_VMCS_GUEST_IDTR_BASE, &u64Val); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK(X86_IS_CANONICAL(u64Val), VMX_IGS_IDTR_BASE_NOT_CANONICAL); +#endif + + rc = VMXReadVmcs32(VMX_VMCS32_GUEST_GDTR_LIMIT, &u32Val); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK(!(u32Val & 0xffff0000), VMX_IGS_GDTR_LIMIT_INVALID); /* Bits 31:16 MBZ. */ + + rc = VMXReadVmcs32(VMX_VMCS32_GUEST_IDTR_LIMIT, &u32Val); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK(!(u32Val & 0xffff0000), VMX_IGS_IDTR_LIMIT_INVALID); /* Bits 31:16 MBZ. */ + + /* + * Guest Non-Register State. + */ + /* Activity State. 
*/ + uint32_t u32ActivityState; + rc = VMXReadVmcs32(VMX_VMCS32_GUEST_ACTIVITY_STATE, &u32ActivityState); + AssertRCBreak(rc); + HMVMX_CHECK_BREAK( !u32ActivityState + || (u32ActivityState & RT_BF_GET(pVM->hm.s.vmx.Msrs.u64Misc, VMX_BF_MISC_ACTIVITY_STATES)), + VMX_IGS_ACTIVITY_STATE_INVALID); + HMVMX_CHECK_BREAK( !(pCtx->ss.Attr.n.u2Dpl) + || u32ActivityState != VMX_VMCS_GUEST_ACTIVITY_HLT, VMX_IGS_ACTIVITY_STATE_HLT_INVALID); + uint32_t u32IntrState; + rc = VMXReadVmcs32(VMX_VMCS32_GUEST_INT_STATE, &u32IntrState); + AssertRCBreak(rc); + if ( u32IntrState == VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS + || u32IntrState == VMX_VMCS_GUEST_INT_STATE_BLOCK_STI) + { + HMVMX_CHECK_BREAK(u32ActivityState == VMX_VMCS_GUEST_ACTIVITY_ACTIVE, VMX_IGS_ACTIVITY_STATE_ACTIVE_INVALID); + } + + /** @todo Activity state and injecting interrupts. Left as a todo since we + * currently don't use activity states but ACTIVE. */ + + HMVMX_CHECK_BREAK( !(pVCpu->hm.s.vmx.u32EntryCtls & VMX_ENTRY_CTLS_ENTRY_TO_SMM) + || u32ActivityState != VMX_VMCS_GUEST_ACTIVITY_SIPI_WAIT, VMX_IGS_ACTIVITY_STATE_SIPI_WAIT_INVALID); + + /* Guest interruptibility-state. 
*/ + HMVMX_CHECK_BREAK(!(u32IntrState & 0xffffffe0), VMX_IGS_INTERRUPTIBILITY_STATE_RESERVED); + HMVMX_CHECK_BREAK((u32IntrState & (VMX_VMCS_GUEST_INT_STATE_BLOCK_STI | VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS)) + != (VMX_VMCS_GUEST_INT_STATE_BLOCK_STI | VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS), + VMX_IGS_INTERRUPTIBILITY_STATE_STI_MOVSS_INVALID); + HMVMX_CHECK_BREAK( (u32Eflags & X86_EFL_IF) + || !(u32IntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_STI), + VMX_IGS_INTERRUPTIBILITY_STATE_STI_EFL_INVALID); + if (VMX_ENTRY_INT_INFO_IS_VALID(u32EntryInfo)) + { + if (VMX_ENTRY_INT_INFO_TYPE(u32EntryInfo) == VMX_EXIT_INT_INFO_TYPE_EXT_INT) + { + HMVMX_CHECK_BREAK( !(u32IntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_STI) + && !(u32IntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS), + VMX_IGS_INTERRUPTIBILITY_STATE_EXT_INT_INVALID); + } + else if (VMX_ENTRY_INT_INFO_TYPE(u32EntryInfo) == VMX_EXIT_INT_INFO_TYPE_NMI) + { + HMVMX_CHECK_BREAK(!(u32IntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS), + VMX_IGS_INTERRUPTIBILITY_STATE_MOVSS_INVALID); + HMVMX_CHECK_BREAK(!(u32IntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_STI), + VMX_IGS_INTERRUPTIBILITY_STATE_STI_INVALID); + } + } + /** @todo Assumes the processor is not in SMM. */ + HMVMX_CHECK_BREAK(!(u32IntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_SMI), + VMX_IGS_INTERRUPTIBILITY_STATE_SMI_INVALID); + HMVMX_CHECK_BREAK( !(pVCpu->hm.s.vmx.u32EntryCtls & VMX_ENTRY_CTLS_ENTRY_TO_SMM) + || (u32IntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_SMI), + VMX_IGS_INTERRUPTIBILITY_STATE_SMI_SMM_INVALID); + if ( (pVCpu->hm.s.vmx.u32PinCtls & VMX_PIN_CTLS_VIRT_NMI) + && VMX_ENTRY_INT_INFO_IS_VALID(u32EntryInfo) + && VMX_ENTRY_INT_INFO_TYPE(u32EntryInfo) == VMX_EXIT_INT_INFO_TYPE_NMI) + { + HMVMX_CHECK_BREAK(!(u32IntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_NMI), + VMX_IGS_INTERRUPTIBILITY_STATE_NMI_INVALID); + } + + /* Pending debug exceptions. 
*/ +#if HC_ARCH_BITS == 64 + rc = VMXReadVmcs64(VMX_VMCS_GUEST_PENDING_DEBUG_XCPTS, &u64Val); + AssertRCBreak(rc); + /* Bits 63:15, Bit 13, Bits 11:4 MBZ. */ + HMVMX_CHECK_BREAK(!(u64Val & UINT64_C(0xffffffffffffaff0)), VMX_IGS_LONGMODE_PENDING_DEBUG_RESERVED); + u32Val = u64Val; /* For pending debug exceptions checks below. */ +#else + rc = VMXReadVmcs32(VMX_VMCS_GUEST_PENDING_DEBUG_XCPTS, &u32Val); + AssertRCBreak(rc); + /* Bits 31:15, Bit 13, Bits 11:4 MBZ. */ + HMVMX_CHECK_BREAK(!(u32Val & 0xffffaff0), VMX_IGS_PENDING_DEBUG_RESERVED); +#endif + + if ( (u32IntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_STI) + || (u32IntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS) + || u32ActivityState == VMX_VMCS_GUEST_ACTIVITY_HLT) + { + if ( (u32Eflags & X86_EFL_TF) + && !(u64DebugCtlMsr & RT_BIT_64(1))) /* Bit 1 is IA32_DEBUGCTL.BTF. */ + { + /* Bit 14 is PendingDebug.BS. */ + HMVMX_CHECK_BREAK(u32Val & RT_BIT(14), VMX_IGS_PENDING_DEBUG_XCPT_BS_NOT_SET); + } + if ( !(u32Eflags & X86_EFL_TF) + || (u64DebugCtlMsr & RT_BIT_64(1))) /* Bit 1 is IA32_DEBUGCTL.BTF. */ + { + /* Bit 14 is PendingDebug.BS. */ + HMVMX_CHECK_BREAK(!(u32Val & RT_BIT(14)), VMX_IGS_PENDING_DEBUG_XCPT_BS_NOT_CLEAR); + } + } + + /* VMCS link pointer. */ + rc = VMXReadVmcs64(VMX_VMCS64_GUEST_VMCS_LINK_PTR_FULL, &u64Val); + AssertRCBreak(rc); + if (u64Val != UINT64_C(0xffffffffffffffff)) + { + HMVMX_CHECK_BREAK(!(u64Val & 0xfff), VMX_IGS_VMCS_LINK_PTR_RESERVED); + /** @todo Bits beyond the processor's physical-address width MBZ. */ + /** @todo 32-bit located in memory referenced by value of this field (as a + * physical address) must contain the processor's VMCS revision ID. */ + /** @todo SMM checks. */ + } + + /** @todo Checks on Guest Page-Directory-Pointer-Table Entries when guest is + * not using Nested Paging? 
*/
+        if (   pVM->hm.s.fNestedPaging
+            && !fLongModeGuest
+            && CPUMIsGuestInPAEModeEx(pCtx))
+        {
+            rc = VMXReadVmcs64(VMX_VMCS64_GUEST_PDPTE0_FULL, &u64Val);
+            AssertRCBreak(rc);
+            HMVMX_CHECK_BREAK(!(u64Val & X86_PDPE_PAE_MBZ_MASK), VMX_IGS_PAE_PDPTE_RESERVED);
+
+            rc = VMXReadVmcs64(VMX_VMCS64_GUEST_PDPTE1_FULL, &u64Val);
+            AssertRCBreak(rc);
+            HMVMX_CHECK_BREAK(!(u64Val & X86_PDPE_PAE_MBZ_MASK), VMX_IGS_PAE_PDPTE_RESERVED);
+
+            rc = VMXReadVmcs64(VMX_VMCS64_GUEST_PDPTE2_FULL, &u64Val);
+            AssertRCBreak(rc);
+            HMVMX_CHECK_BREAK(!(u64Val & X86_PDPE_PAE_MBZ_MASK), VMX_IGS_PAE_PDPTE_RESERVED);
+
+            rc = VMXReadVmcs64(VMX_VMCS64_GUEST_PDPTE3_FULL, &u64Val);
+            AssertRCBreak(rc);
+            HMVMX_CHECK_BREAK(!(u64Val & X86_PDPE_PAE_MBZ_MASK), VMX_IGS_PAE_PDPTE_RESERVED);
+        }
+
+        /* Shouldn't happen but distinguish it from AssertRCBreak() errors. */
+        if (uError == VMX_IGS_ERROR)
+            uError = VMX_IGS_REASON_NOT_FOUND;
+    } while (0);
+
+    pVCpu->hm.s.u32HMError = uError;
+    return uError;
+
+#undef HMVMX_ERROR_BREAK
+#undef HMVMX_CHECK_BREAK
+}
+
+
+/** @name VM-exit handlers.
+ * @{
+ */
+/* -=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= */
+/* -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- VM-exit handlers -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- */
+/* -=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= */
+
+/**
+ * VM-exit handler for external interrupts (VMX_EXIT_EXT_INT).
+ *
+ * Bumps the StatExitExtInt counter and picks the status from the thread-context
+ * hook state.
+ *
+ * @returns VINF_SUCCESS when VMMR0ThreadCtxHookIsEnabled() is true for @a pVCpu,
+ *          VINF_EM_RAW_INTERRUPT otherwise.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitExtInt(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    STAM_COUNTER_INC(&pVCpu->hm.s.StatExitExtInt);
+    /* Windows hosts (32-bit and 64-bit) have DPC latency issues. See @bugref{6853}. */
+    if (VMMR0ThreadCtxHookIsEnabled(pVCpu))
+        return VINF_SUCCESS;
+    return VINF_EM_RAW_INTERRUPT;
+}
+
+
+/**
+ * VM-exit handler for exceptions or NMIs (VMX_EXIT_XCPT_OR_NMI).
+ */ +HMVMX_EXIT_DECL hmR0VmxExitXcptOrNmi(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatExitXcptNmi, y3); + + int rc = hmR0VmxReadExitIntInfoVmcs(pVmxTransient); + AssertRCReturn(rc, rc); + + uint32_t uIntType = VMX_EXIT_INT_INFO_TYPE(pVmxTransient->uExitIntInfo); + Assert( !(pVCpu->hm.s.vmx.u32ExitCtls & VMX_EXIT_CTLS_ACK_EXT_INT) + && uIntType != VMX_EXIT_INT_INFO_TYPE_EXT_INT); + Assert(VMX_EXIT_INT_INFO_IS_VALID(pVmxTransient->uExitIntInfo)); + + if (uIntType == VMX_EXIT_INT_INFO_TYPE_NMI) + { + /* + * This cannot be a guest NMI as the only way for the guest to receive an NMI is if we + * injected it ourselves and anything we inject is not going to cause a VM-exit directly + * for the event being injected[1]. Go ahead and dispatch the NMI to the host[2]. + * + * [1] -- See Intel spec. 27.2.3 "Information for VM Exits During Event Delivery". + * [2] -- See Intel spec. 27.5.5 "Updating Non-Register State". + */ + VMXDispatchHostNmi(); + STAM_REL_COUNTER_INC(&pVCpu->hm.s.StatExitHostNmiInGC); + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExitXcptNmi, y3); + return VINF_SUCCESS; + } + + /* If this VM-exit occurred while delivering an event through the guest IDT, handle it accordingly. */ + VBOXSTRICTRC rcStrictRc1 = hmR0VmxCheckExitDueToEventDelivery(pVCpu, pVmxTransient); + if (RT_UNLIKELY(rcStrictRc1 == VINF_SUCCESS)) + { /* likely */ } + else + { + if (rcStrictRc1 == VINF_HM_DOUBLE_FAULT) + rcStrictRc1 = VINF_SUCCESS; + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExitXcptNmi, y3); + return rcStrictRc1; + } + + uint32_t uExitIntInfo = pVmxTransient->uExitIntInfo; + uint32_t uVector = VMX_EXIT_INT_INFO_VECTOR(uExitIntInfo); + switch (uIntType) + { + case VMX_EXIT_INT_INFO_TYPE_PRIV_SW_XCPT: /* Privileged software exception. (#DB from ICEBP) */ + Assert(uVector == X86_XCPT_DB); + RT_FALL_THRU(); + case VMX_EXIT_INT_INFO_TYPE_SW_XCPT: /* Software exception. 
(#BP or #OF) */ + Assert(uVector == X86_XCPT_BP || uVector == X86_XCPT_OF || uIntType == VMX_EXIT_INT_INFO_TYPE_PRIV_SW_XCPT); + RT_FALL_THRU(); + case VMX_EXIT_INT_INFO_TYPE_HW_XCPT: + { + /* + * If there's any exception caused as a result of event injection, the resulting + * secondary/final execption will be pending, we shall continue guest execution + * after injecting the event. The page-fault case is complicated and we manually + * handle any currently pending event in hmR0VmxExitXcptPF. + */ + if (!pVCpu->hm.s.Event.fPending) + { /* likely */ } + else if (uVector != X86_XCPT_PF) + { + rc = VINF_SUCCESS; + break; + } + + switch (uVector) + { + case X86_XCPT_PF: rc = hmR0VmxExitXcptPF(pVCpu, pVmxTransient); break; + case X86_XCPT_GP: rc = hmR0VmxExitXcptGP(pVCpu, pVmxTransient); break; + case X86_XCPT_MF: rc = hmR0VmxExitXcptMF(pVCpu, pVmxTransient); break; + case X86_XCPT_DB: rc = hmR0VmxExitXcptDB(pVCpu, pVmxTransient); break; + case X86_XCPT_BP: rc = hmR0VmxExitXcptBP(pVCpu, pVmxTransient); break; + case X86_XCPT_AC: rc = hmR0VmxExitXcptAC(pVCpu, pVmxTransient); break; + + case X86_XCPT_NM: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestNM); + rc = hmR0VmxExitXcptGeneric(pVCpu, pVmxTransient); break; + case X86_XCPT_XF: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestXF); + rc = hmR0VmxExitXcptGeneric(pVCpu, pVmxTransient); break; + case X86_XCPT_DE: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestDE); + rc = hmR0VmxExitXcptGeneric(pVCpu, pVmxTransient); break; + case X86_XCPT_UD: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestUD); + rc = hmR0VmxExitXcptGeneric(pVCpu, pVmxTransient); break; + case X86_XCPT_SS: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestSS); + rc = hmR0VmxExitXcptGeneric(pVCpu, pVmxTransient); break; + case X86_XCPT_NP: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestNP); + rc = hmR0VmxExitXcptGeneric(pVCpu, pVmxTransient); break; + case X86_XCPT_TS: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestTS); + rc = hmR0VmxExitXcptGeneric(pVCpu, pVmxTransient); break; + 
default: + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestXcpUnk); + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + { + Assert(pVCpu->CTX_SUFF(pVM)->hm.s.vmx.pRealModeTSS); + Assert(PDMVmmDevHeapIsEnabled(pVCpu->CTX_SUFF(pVM))); + Assert(CPUMIsGuestInRealModeEx(&pVCpu->cpum.GstCtx)); + + rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CR0); + rc |= hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + rc |= hmR0VmxReadExitIntErrorCodeVmcs(pVmxTransient); + AssertRCReturn(rc, rc); + hmR0VmxSetPendingEvent(pVCpu, VMX_ENTRY_INT_INFO_FROM_EXIT_INT_INFO(uExitIntInfo), + pVmxTransient->cbInstr, pVmxTransient->uExitIntErrorCode, + 0 /* GCPtrFaultAddress */); + } + else + { + AssertMsgFailed(("Unexpected VM-exit caused by exception %#x\n", uVector)); + pVCpu->hm.s.u32HMError = uVector; + rc = VERR_VMX_UNEXPECTED_EXCEPTION; + } + break; + } + } + break; + } + + default: + { + pVCpu->hm.s.u32HMError = uExitIntInfo; + rc = VERR_VMX_UNEXPECTED_INTERRUPTION_EXIT_TYPE; + AssertMsgFailed(("Unexpected interruption info %#x\n", VMX_EXIT_INT_INFO_TYPE(uExitIntInfo))); + break; + } + } + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExitXcptNmi, y3); + return rc; +} + + +/** + * VM-exit handler for interrupt-window exiting (VMX_EXIT_INT_WINDOW). + */ +HMVMX_EXIT_NSRC_DECL hmR0VmxExitIntWindow(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + + /* Indicate that we no longer need to VM-exit when the guest is ready to receive interrupts, it is now ready. */ + hmR0VmxClearIntWindowExitVmcs(pVCpu); + + /* Deliver the pending interrupts via hmR0VmxEvaluatePendingEvent() and resume guest execution. */ + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitIntWindow); + return VINF_SUCCESS; +} + + +/** + * VM-exit handler for NMI-window exiting (VMX_EXIT_NMI_WINDOW). 
+ *
+ * If the guest interruptibility state has block-by-STI set, the bit is cleared
+ * in the VMCS (and VMCPU_FF_INHIBIT_INTERRUPTS is cleared if set) before
+ * NMI-window exiting is turned off.
+ *
+ * @returns VINF_SUCCESS on the normal path; the failing status if the VMCS
+ *          read/write fails. An exit taken while VMX_PROC_CTLS_NMI_WINDOW_EXIT is
+ *          not set is handed to HMVMX_UNEXPECTED_EXIT_RET() (macro defined
+ *          outside this chunk).
+ */
+HMVMX_EXIT_NSRC_DECL hmR0VmxExitNmiWindow(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    if (RT_UNLIKELY(!(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_NMI_WINDOW_EXIT)))
+    {
+        AssertMsgFailed(("Unexpected NMI-window exit.\n"));
+        HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient);
+    }
+
+    Assert(!VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS));
+
+    /*
+     * If block-by-STI is set when we get this VM-exit, it means the CPU doesn't block NMIs following STI.
+     * It is therefore safe to unblock STI and deliver the NMI ourselves. See @bugref{7445}.
+     */
+    uint32_t fIntrState = 0;
+    int rc = VMXReadVmcs32(VMX_VMCS32_GUEST_INT_STATE, &fIntrState);
+    AssertRCReturn(rc, rc);
+    Assert(!(fIntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_MOVSS));
+    if (fIntrState & VMX_VMCS_GUEST_INT_STATE_BLOCK_STI)
+    {
+        if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS))
+            VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS);
+
+        fIntrState &= ~VMX_VMCS_GUEST_INT_STATE_BLOCK_STI;
+        rc = VMXWriteVmcs32(VMX_VMCS32_GUEST_INT_STATE, fIntrState); /* write the updated state back to the VMCS */
+        AssertRCReturn(rc, rc);
+    }
+
+    /* Indicate that we no longer need to VM-exit when the guest is ready to receive NMIs, it is now ready */
+    hmR0VmxClearNmiWindowExitVmcs(pVCpu);
+
+    /* Deliver the pending NMI via hmR0VmxEvaluatePendingEvent() and resume guest execution. */
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * VM-exit handler for WBINVD (VMX_EXIT_WBINVD). Conditional VM-exit.
+ *
+ * Nothing is emulated here; the handler only advances the guest RIP.
+ *
+ * @returns The status of hmR0VmxAdvanceGuestRip().
+ */
+HMVMX_EXIT_NSRC_DECL hmR0VmxExitWbinvd(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    return hmR0VmxAdvanceGuestRip(pVCpu, pVmxTransient);
+}
+
+
+/**
+ * VM-exit handler for INVD (VMX_EXIT_INVD). Unconditional VM-exit.
+ *
+ * Nothing is emulated here; the handler only advances the guest RIP.
+ *
+ * @returns The status of hmR0VmxAdvanceGuestRip().
+ */
+HMVMX_EXIT_NSRC_DECL hmR0VmxExitInvd(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    return hmR0VmxAdvanceGuestRip(pVCpu, pVmxTransient);
+}
+
+
+/**
+ * VM-exit handler for CPUID (VMX_EXIT_CPUID). Unconditional VM-exit.
+ *
+ * Records the exit in the EM exit history. When no exit record comes back the
+ * instruction is executed through IEMExecDecodedCpuid(); otherwise the whole
+ * guest state is imported and the record is handed to EMHistoryExec().
+ *
+ * @returns Strict status code from IEMExecDecodedCpuid() (with
+ *          VINF_IEM_RAISED_XCPT turned into VINF_SUCCESS) or from
+ *          EMHistoryExec(); the failing status if importing state fails.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitCpuid(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+
+    /*
+     * Get the state we need and update the exit history entry.
+     */
+    int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+    rc    |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK);
+    AssertRCReturn(rc, rc);
+
+    VBOXSTRICTRC rcStrict;
+    PCEMEXITREC pExitRec = EMHistoryUpdateFlagsAndTypeAndPC(pVCpu,
+                                                            EMEXIT_MAKE_FT(EMEXIT_F_KIND_EM | EMEXIT_F_HM, EMEXITTYPE_CPUID),
+                                                            pVCpu->cpum.GstCtx.rip + pVCpu->cpum.GstCtx.cs.u64Base);
+    if (!pExitRec)
+    {
+        /*
+         * Regular CPUID instruction execution.
+         */
+        rcStrict = IEMExecDecodedCpuid(pVCpu, pVmxTransient->cbInstr);
+        if (rcStrict == VINF_SUCCESS)
+            ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS);
+        else if (rcStrict == VINF_IEM_RAISED_XCPT)
+        {
+            ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK);
+            rcStrict = VINF_SUCCESS;
+        }
+    }
+    else
+    {
+        /*
+         * Frequent exit or something needing probing. Get state and call EMHistoryExec.
+         */
+        int rc2 = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL);
+        AssertRCReturn(rc2, rc2);
+
+        Log4(("CpuIdExit/%u: %04x:%08RX64: %#x/%#x -> EMHistoryExec\n",
+              pVCpu->idCpu, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, pVCpu->cpum.GstCtx.eax, pVCpu->cpum.GstCtx.ecx));
+
+        rcStrict = EMHistoryExec(pVCpu, pExitRec, 0);
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST);
+
+        Log4(("CpuIdExit/%u: %04x:%08RX64: EMHistoryExec -> %Rrc + %04x:%08RX64\n",
+              pVCpu->idCpu, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip,
+              VBOXSTRICTRC_VAL(rcStrict), pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip));
+    }
+    return rcStrict;
+}
+
+
+/**
+ * VM-exit handler for GETSEC (VMX_EXIT_GETSEC). Unconditional VM-exit.
+ *
+ * @returns VINF_EM_RAW_EMULATE_INSTR when the guest has CR4.SMXE set; otherwise
+ *          the exit is treated as unexpected and handed to
+ *          HMVMX_UNEXPECTED_EXIT_RET() (macro defined outside this chunk).
+ */
+HMVMX_EXIT_DECL hmR0VmxExitGetsec(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    int rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CR4);
+    AssertRCReturn(rc, rc);
+
+    if (pVCpu->cpum.GstCtx.cr4 & X86_CR4_SMXE)
+        return VINF_EM_RAW_EMULATE_INSTR;
+
+    AssertMsgFailed(("hmR0VmxExitGetsec: unexpected VM-exit when CR4.SMXE is 0.\n"));
+    HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient);
+}
+
+
+/**
+ * VM-exit handler for RDTSC (VMX_EXIT_RDTSC). Conditional VM-exit.
+ *
+ * @returns Strict status code from IEMExecDecodedRdtsc(), with
+ *          VINF_IEM_RAISED_XCPT turned into VINF_SUCCESS; the failing status if
+ *          importing state or reading the instruction length fails.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitRdtsc(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    int rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK);
+    rc    |= hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+    AssertRCReturn(rc, rc);
+
+    VBOXSTRICTRC rcStrict = IEMExecDecodedRdtsc(pVCpu, pVmxTransient->cbInstr);
+    if (RT_LIKELY(rcStrict == VINF_SUCCESS))
+    {
+        /* If we get a spurious VM-exit when offsetting is enabled,
+           we must reset offsetting on VM-reentry. See @bugref{6634}. */
+        if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_TSC_OFFSETTING)
+            pVmxTransient->fUpdateTscOffsettingAndPreemptTimer = true;
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS);
+    }
+    else if (rcStrict == VINF_IEM_RAISED_XCPT)
+    {
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK);
+        rcStrict = VINF_SUCCESS;
+    }
+    return rcStrict;
+}
+
+
+/**
+ * VM-exit handler for RDTSCP (VMX_EXIT_RDTSCP). Conditional VM-exit.
+ *
+ * Same flow as the RDTSC handler, but additionally imports
+ * CPUMCTX_EXTRN_TSC_AUX and executes through IEMExecDecodedRdtscp().
+ *
+ * @returns Strict status code from IEMExecDecodedRdtscp(), with
+ *          VINF_IEM_RAISED_XCPT turned into VINF_SUCCESS; the failing status if
+ *          importing state or reading the instruction length fails.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitRdtscp(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    int rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK | CPUMCTX_EXTRN_TSC_AUX);
+    rc    |= hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+    AssertRCReturn(rc, rc);
+
+    VBOXSTRICTRC rcStrict = IEMExecDecodedRdtscp(pVCpu, pVmxTransient->cbInstr);
+    if (RT_LIKELY(rcStrict == VINF_SUCCESS))
+    {
+        /* If we get a spurious VM-exit when offsetting is enabled,
+           we must reset offsetting on VM-reentry. See @bugref{6634}. */
+        if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_TSC_OFFSETTING)
+            pVmxTransient->fUpdateTscOffsettingAndPreemptTimer = true;
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS);
+    }
+    else if (rcStrict == VINF_IEM_RAISED_XCPT)
+    {
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK);
+        rcStrict = VINF_SUCCESS;
+    }
+    return rcStrict;
+}
+
+
+/**
+ * VM-exit handler for RDPMC (VMX_EXIT_RDPMC). Conditional VM-exit.
+ */ +HMVMX_EXIT_DECL hmR0VmxExitRdpmc(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + int rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CR4 | CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_RFLAGS | CPUMCTX_EXTRN_SS); + AssertRCReturn(rc, rc); + + PVM pVM = pVCpu->CTX_SUFF(pVM); + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + rc = EMInterpretRdpmc(pVM, pVCpu, CPUMCTX2CORE(pCtx)); + if (RT_LIKELY(rc == VINF_SUCCESS)) + { + rc = hmR0VmxAdvanceGuestRip(pVCpu, pVmxTransient); + Assert(pVmxTransient->cbInstr == 2); + } + else + { + AssertMsgFailed(("hmR0VmxExitRdpmc: EMInterpretRdpmc failed with %Rrc\n", rc)); + rc = VERR_EM_INTERPRETER; + } + return rc; +} + + +/** + * VM-exit handler for VMCALL (VMX_EXIT_VMCALL). Unconditional VM-exit. + */ +HMVMX_EXIT_DECL hmR0VmxExitVmcall(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + + VBOXSTRICTRC rcStrict = VERR_VMX_IPE_3; + if (EMAreHypercallInstructionsEnabled(pVCpu)) + { + int rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_RIP | CPUMCTX_EXTRN_RFLAGS | CPUMCTX_EXTRN_CR0 + | CPUMCTX_EXTRN_SS | CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_EFER); + AssertRCReturn(rc, rc); + + /* Perform the hypercall. */ + rcStrict = GIMHypercall(pVCpu, &pVCpu->cpum.GstCtx); + if (rcStrict == VINF_SUCCESS) + { + rc = hmR0VmxAdvanceGuestRip(pVCpu, pVmxTransient); + AssertRCReturn(rc, rc); + } + else + Assert( rcStrict == VINF_GIM_R3_HYPERCALL + || rcStrict == VINF_GIM_HYPERCALL_CONTINUING + || RT_FAILURE(rcStrict)); + + /* If the hypercall changes anything other than guest's general-purpose registers, + we would need to reload the guest changed bits here before VM-entry. */ + } + else + Log4Func(("Hypercalls not enabled\n")); + + /* If hypercalls are disabled or the hypercall failed for some reason, raise #UD and continue. 
*/ + if (RT_FAILURE(rcStrict)) + { + hmR0VmxSetPendingXcptUD(pVCpu); + rcStrict = VINF_SUCCESS; + } + + return rcStrict; +} + + +/** + * VM-exit handler for INVLPG (VMX_EXIT_INVLPG). Conditional VM-exit. + */ +HMVMX_EXIT_DECL hmR0VmxExitInvlpg(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + Assert(!pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging || pVCpu->hm.s.fUsingDebugLoop); + + int rc = hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient); + rc |= hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_MEM_MASK); + AssertRCReturn(rc, rc); + + VBOXSTRICTRC rcStrict = IEMExecDecodedInvlpg(pVCpu, pVmxTransient->cbInstr, pVmxTransient->uExitQual); + + if (rcStrict == VINF_SUCCESS || rcStrict == VINF_PGM_SYNC_CR3) + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS); + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + rcStrict = VINF_SUCCESS; + } + else + AssertMsgFailed(("Unexpected IEMExecDecodedInvlpg(%#RX64) sttus: %Rrc\n", pVmxTransient->uExitQual, + VBOXSTRICTRC_VAL(rcStrict))); + return rcStrict; +} + + +/** + * VM-exit handler for MONITOR (VMX_EXIT_MONITOR). Conditional VM-exit. 
+ *
+ * @returns The status of hmR0VmxAdvanceGuestRip() when EMInterpretMonitor()
+ *          succeeds, VERR_EM_INTERPRETER otherwise; the failing status if
+ *          importing state fails.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitMonitor(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    int rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_RFLAGS | CPUMCTX_EXTRN_SS);
+    AssertRCReturn(rc, rc);
+
+    PVM pVM = pVCpu->CTX_SUFF(pVM);
+    PCPUMCTX pCtx = &pVCpu->cpum.GstCtx;
+    rc = EMInterpretMonitor(pVM, pVCpu, CPUMCTX2CORE(pCtx));
+    if (RT_LIKELY(rc == VINF_SUCCESS))
+        rc = hmR0VmxAdvanceGuestRip(pVCpu, pVmxTransient);
+    else
+    {
+        AssertMsg(rc == VERR_EM_INTERPRETER, ("hmR0VmxExitMonitor: EMInterpretMonitor failed with %Rrc\n", rc));
+        rc = VERR_EM_INTERPRETER; /* every failure is reported as VERR_EM_INTERPRETER */
+    }
+    STAM_COUNTER_INC(&pVCpu->hm.s.StatExitMonitor);
+    return rc;
+}
+
+
+/**
+ * VM-exit handler for MWAIT (VMX_EXIT_MWAIT). Conditional VM-exit.
+ *
+ * After a successful EMInterpretMWait() the guest RIP is advanced. A
+ * VINF_EM_HALT result is downgraded to VINF_SUCCESS when
+ * EMMonitorWaitShouldContinue() says the guest should keep running.
+ *
+ * @returns VINF_SUCCESS, VINF_EM_HALT or VERR_EM_INTERPRETER; the failing status
+ *          if importing state or advancing RIP fails.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitMwait(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    int rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CR0 | CPUMCTX_EXTRN_RFLAGS | CPUMCTX_EXTRN_SS);
+    AssertRCReturn(rc, rc);
+
+    PVM pVM = pVCpu->CTX_SUFF(pVM);
+    PCPUMCTX pCtx = &pVCpu->cpum.GstCtx;
+    VBOXSTRICTRC rc2 = EMInterpretMWait(pVM, pVCpu, CPUMCTX2CORE(pCtx));
+    rc = VBOXSTRICTRC_VAL(rc2); /* continue with the plain int status */
+    if (RT_LIKELY(   rc == VINF_SUCCESS
+                  || rc == VINF_EM_HALT))
+    {
+        int rc3 = hmR0VmxAdvanceGuestRip(pVCpu, pVmxTransient);
+        AssertRCReturn(rc3, rc3);
+
+        if (   rc == VINF_EM_HALT
+            && EMMonitorWaitShouldContinue(pVCpu, pCtx))
+            rc = VINF_SUCCESS;
+    }
+    else
+    {
+        AssertMsg(rc == VERR_EM_INTERPRETER, ("hmR0VmxExitMwait: EMInterpretMWait failed with %Rrc\n", rc));
+        rc = VERR_EM_INTERPRETER; /* every failure is reported as VERR_EM_INTERPRETER */
+    }
+    AssertMsg(rc == VINF_SUCCESS || rc == VINF_EM_HALT || rc == VERR_EM_INTERPRETER,
+              ("hmR0VmxExitMwait: failed, invalid error code %Rrc\n", rc));
+    STAM_COUNTER_INC(&pVCpu->hm.s.StatExitMwait);
+    return rc;
+}
+
+
+/**
+ * VM-exit handler for RSM (VMX_EXIT_RSM). Unconditional VM-exit.
+ *
+ * Treated as an unexpected exit: asserts in strict builds and defers to
+ * HMVMX_UNEXPECTED_EXIT_RET() (macro defined outside this chunk) for the
+ * return.
+ */
+HMVMX_EXIT_NSRC_DECL hmR0VmxExitRsm(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    /*
+     * Execution of RSM outside of SMM mode causes #UD regardless of VMX root or VMX non-root
+     * mode. In theory, we should never get this VM-exit. This can happen only if dual-monitor
+     * treatment of SMI and VMX is enabled, which can (only?) be done by executing VMCALL in
+     * VMX root operation. If we get here, something funny is going on.
+     *
+     * See Intel spec. 33.15.5 "Enabling the Dual-Monitor Treatment".
+     */
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    AssertMsgFailed(("Unexpected RSM VM-exit\n"));
+    HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient);
+}
+
+
+/**
+ * VM-exit handler for SMI (VMX_EXIT_SMI). Unconditional VM-exit.
+ *
+ * Treated as an unexpected exit: asserts in strict builds and defers to
+ * HMVMX_UNEXPECTED_EXIT_RET() (macro defined outside this chunk) for the
+ * return.
+ */
+HMVMX_EXIT_NSRC_DECL hmR0VmxExitSmi(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    /*
+     * This can only happen if we support dual-monitor treatment of SMI, which can be activated
+     * by executing VMCALL in VMX root operation. Only an STM (SMM transfer monitor) would get
+     * this VM-exit when we (the executive monitor) execute a VMCALL in VMX root mode or receive
+     * an SMI. If we get here, something funny is going on.
+     *
+     * See Intel spec. 33.15.6 "Activating the Dual-Monitor Treatment"
+     * See Intel spec. 25.3 "Other Causes of VM-Exits"
+     */
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    AssertMsgFailed(("Unexpected SMI VM-exit\n"));
+    HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient);
+}
+
+
+/**
+ * VM-exit handler for IO SMI (VMX_EXIT_IO_SMI). Unconditional VM-exit.
+ *
+ * Treated as an unexpected exit, exactly like the SMI handler.
+ */
+HMVMX_EXIT_NSRC_DECL hmR0VmxExitIoSmi(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    /* Same treatment as VMX_EXIT_SMI. See comment in hmR0VmxExitSmi(). */
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    AssertMsgFailed(("Unexpected IO SMI VM-exit\n"));
+    HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient);
+}
+
+
+/**
+ * VM-exit handler for SIPI (VMX_EXIT_SIPI). Conditional VM-exit.
+ */ +HMVMX_EXIT_NSRC_DECL hmR0VmxExitSipi(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + /* + * SIPI exits can only occur in VMX non-root operation when the "wait-for-SIPI" guest activity state is used. + * We don't make use of it as our guests don't have direct access to the host LAPIC. + * See Intel spec. 25.3 "Other Causes of VM-exits". + */ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + AssertMsgFailed(("Unexpected SIPI VM-exit\n")); + HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient); +} + + +/** + * VM-exit handler for INIT signal (VMX_EXIT_INIT_SIGNAL). Unconditional + * VM-exit. + */ +HMVMX_EXIT_NSRC_DECL hmR0VmxExitInitSignal(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + /* + * INIT signals are blocked in VMX root operation by VMXON and by SMI in SMM. + * See Intel spec. 33.14.1 Default Treatment of SMI Delivery" and Intel spec. 29.3 "VMX Instructions" for "VMXON". + * + * It is -NOT- blocked in VMX non-root operation so we can, in theory, still get these VM-exits. + * See Intel spec. "23.8 Restrictions on VMX operation". + */ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + return VINF_SUCCESS; +} + + +/** + * VM-exit handler for triple faults (VMX_EXIT_TRIPLE_FAULT). Unconditional + * VM-exit. + */ +HMVMX_EXIT_DECL hmR0VmxExitTripleFault(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + return VINF_EM_RESET; +} + + +/** + * VM-exit handler for HLT (VMX_EXIT_HLT). Conditional VM-exit. + */ +HMVMX_EXIT_DECL hmR0VmxExitHlt(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + Assert(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_HLT_EXIT); + + int rc = hmR0VmxAdvanceGuestRip(pVCpu, pVmxTransient); + rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_RFLAGS); + AssertRCReturn(rc, rc); + + if (EMShouldContinueAfterHalt(pVCpu, &pVCpu->cpum.GstCtx)) /* Requires eflags. 
*/ + rc = VINF_SUCCESS; + else + rc = VINF_EM_HALT; + + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitHlt); + if (rc != VINF_SUCCESS) + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchHltToR3); + return rc; +} + + +/** + * VM-exit handler for instructions that result in a \#UD exception delivered to + * the guest. + */ +HMVMX_EXIT_NSRC_DECL hmR0VmxExitSetPendingXcptUD(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + hmR0VmxSetPendingXcptUD(pVCpu); + return VINF_SUCCESS; +} + + +/** + * VM-exit handler for expiry of the VMX preemption timer. + */ +HMVMX_EXIT_DECL hmR0VmxExitPreemptTimer(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + + /* If the preemption-timer has expired, reinitialize the preemption timer on next VM-entry. */ + pVmxTransient->fUpdateTscOffsettingAndPreemptTimer = true; + + /* If there are any timer events pending, fall back to ring-3, otherwise resume guest execution. */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + bool fTimersPending = TMTimerPollBool(pVM, pVCpu); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitPreemptTimer); + return fTimersPending ? VINF_EM_RAW_TIMER_PENDING : VINF_SUCCESS; +} + + +/** + * VM-exit handler for XSETBV (VMX_EXIT_XSETBV). Unconditional VM-exit. + */ +HMVMX_EXIT_DECL hmR0VmxExitXsetbv(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + + int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK | CPUMCTX_EXTRN_CR4); + AssertRCReturn(rc, rc); + + VBOXSTRICTRC rcStrict = IEMExecDecodedXsetbv(pVCpu, pVmxTransient->cbInstr); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, rcStrict != VINF_IEM_RAISED_XCPT ? 
HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS + : HM_CHANGED_RAISED_XCPT_MASK); + + PCCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + pVCpu->hm.s.fLoadSaveGuestXcr0 = (pCtx->cr4 & X86_CR4_OSXSAVE) && pCtx->aXcr[0] != ASMGetXcr0(); + + return rcStrict; +} + + +/** + * VM-exit handler for INVPCID (VMX_EXIT_INVPCID). Conditional VM-exit. + */ +HMVMX_EXIT_DECL hmR0VmxExitInvpcid(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + /** @todo Use VM-exit instruction information. */ + return VERR_EM_INTERPRETER; +} + + +/** + * VM-exit handler for invalid-guest-state (VMX_EXIT_ERR_INVALID_GUEST_STATE). + * Error VM-exit. + */ +HMVMX_EXIT_NSRC_DECL hmR0VmxExitErrInvalidGuestState(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + int rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL); + AssertRCReturn(rc, rc); + rc = hmR0VmxCheckVmcsCtls(pVCpu); + if (RT_FAILURE(rc)) + return rc; + + uint32_t uInvalidReason = hmR0VmxCheckGuestState(pVCpu); + NOREF(uInvalidReason); + +#ifdef VBOX_STRICT + uint32_t fIntrState; + RTHCUINTREG uHCReg; + uint64_t u64Val; + uint32_t u32Val; + + rc = hmR0VmxReadEntryIntInfoVmcs(pVmxTransient); + rc |= hmR0VmxReadEntryXcptErrorCodeVmcs(pVmxTransient); + rc |= hmR0VmxReadEntryInstrLenVmcs(pVmxTransient); + rc |= VMXReadVmcs32(VMX_VMCS32_GUEST_INT_STATE, &fIntrState); + AssertRCReturn(rc, rc); + + Log4(("uInvalidReason %u\n", uInvalidReason)); + Log4(("VMX_VMCS32_CTRL_ENTRY_INTERRUPTION_INFO %#RX32\n", pVmxTransient->uEntryIntInfo)); + Log4(("VMX_VMCS32_CTRL_ENTRY_EXCEPTION_ERRCODE %#RX32\n", pVmxTransient->uEntryXcptErrorCode)); + Log4(("VMX_VMCS32_CTRL_ENTRY_INSTR_LENGTH %#RX32\n", pVmxTransient->cbEntryInstr)); + Log4(("VMX_VMCS32_GUEST_INT_STATE %#RX32\n", fIntrState)); + + rc = VMXReadVmcs32(VMX_VMCS_GUEST_CR0, &u32Val); AssertRC(rc); + Log4(("VMX_VMCS_GUEST_CR0 %#RX32\n", u32Val)); + rc = VMXReadVmcsHstN(VMX_VMCS_CTRL_CR0_MASK, &uHCReg); AssertRC(rc); + Log4(("VMX_VMCS_CTRL_CR0_MASK 
%#RHr\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_CTRL_CR0_READ_SHADOW, &uHCReg); AssertRC(rc); + Log4(("VMX_VMCS_CTRL_CR4_READ_SHADOW %#RHr\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_CTRL_CR4_MASK, &uHCReg); AssertRC(rc); + Log4(("VMX_VMCS_CTRL_CR4_MASK %#RHr\n", uHCReg)); + rc = VMXReadVmcsHstN(VMX_VMCS_CTRL_CR4_READ_SHADOW, &uHCReg); AssertRC(rc); + Log4(("VMX_VMCS_CTRL_CR4_READ_SHADOW %#RHr\n", uHCReg)); + rc = VMXReadVmcs64(VMX_VMCS64_CTRL_EPTP_FULL, &u64Val); AssertRC(rc); + Log4(("VMX_VMCS64_CTRL_EPTP_FULL %#RX64\n", u64Val)); + + hmR0DumpRegs(pVCpu); +#else + NOREF(pVmxTransient); +#endif + + return VERR_VMX_INVALID_GUEST_STATE; +} + + +/** + * VM-exit handler for VM-entry failure due to an MSR-load + * (VMX_EXIT_ERR_MSR_LOAD). Error VM-exit. + */ +HMVMX_EXIT_NSRC_DECL hmR0VmxExitErrMsrLoad(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + AssertMsgFailed(("Unexpected MSR-load exit\n")); + HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient); +} + + +/** + * VM-exit handler for VM-entry failure due to a machine-check event + * (VMX_EXIT_ERR_MACHINE_CHECK). Error VM-exit. + */ +HMVMX_EXIT_NSRC_DECL hmR0VmxExitErrMachineCheck(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + AssertMsgFailed(("Unexpected machine-check event exit\n")); + HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient); +} + + +/** + * VM-exit handler for all undefined reasons. Should never ever happen.. in + * theory. + */ +HMVMX_EXIT_NSRC_DECL hmR0VmxExitErrUndefined(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + RT_NOREF2(pVCpu, pVmxTransient); + AssertMsgFailed(("Huh!? Undefined VM-exit reason %d\n", pVmxTransient->uExitReason)); + return VERR_VMX_UNDEFINED_EXIT_CODE; +} + + +/** + * VM-exit handler for XDTR (LGDT, SGDT, LIDT, SIDT) accesses + * (VMX_EXIT_GDTR_IDTR_ACCESS) and LDT and TR access (LLDT, LTR, SLDT, STR). + * Conditional VM-exit. 
+ *
+ * @returns VERR_EM_INTERPRETER when VMX_PROC_CTLS2_DESC_TABLE_EXIT is set in the
+ *          secondary processor controls; otherwise the exit is treated as
+ *          unexpected and handed to HMVMX_UNEXPECTED_EXIT_RET() (macro defined
+ *          outside this chunk).
+ */
+HMVMX_EXIT_DECL hmR0VmxExitXdtrAccess(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+
+    /* By default, we don't enable VMX_PROC_CTLS2_DESC_TABLE_EXIT. */
+    STAM_COUNTER_INC(&pVCpu->hm.s.StatExitXdtrAccess);
+    if (pVCpu->hm.s.vmx.u32ProcCtls2 & VMX_PROC_CTLS2_DESC_TABLE_EXIT)
+        return VERR_EM_INTERPRETER;
+    AssertMsgFailed(("Unexpected XDTR access\n"));
+    HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient);
+}
+
+
+/**
+ * VM-exit handler for RDRAND (VMX_EXIT_RDRAND). Conditional VM-exit.
+ *
+ * @returns VERR_EM_INTERPRETER when VMX_PROC_CTLS2_RDRAND_EXIT is set in the
+ *          secondary processor controls; otherwise the exit is treated as
+ *          unexpected and handed to HMVMX_UNEXPECTED_EXIT_RET() (macro defined
+ *          outside this chunk).
+ */
+HMVMX_EXIT_DECL hmR0VmxExitRdrand(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+
+    /* By default, we don't enable VMX_PROC_CTLS2_RDRAND_EXIT. */
+    if (pVCpu->hm.s.vmx.u32ProcCtls2 & VMX_PROC_CTLS2_RDRAND_EXIT)
+        return VERR_EM_INTERPRETER;
+    AssertMsgFailed(("Unexpected RDRAND exit\n"));
+    HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient);
+}
+
+
+/**
+ * VM-exit handler for RDMSR (VMX_EXIT_RDMSR).
+ *
+ * Imports the state IEM needs plus all MSRs (and the FS/GS segment for the
+ * FS/GS base MSRs), then executes the instruction through
+ * IEMExecDecodedRdmsr(). Strict builds first verify that an exit for this MSR
+ * is plausible when MSR bitmaps are in use.
+ *
+ * @returns Strict status code from IEMExecDecodedRdmsr(), with
+ *          VINF_IEM_RAISED_XCPT turned into VINF_SUCCESS (the assertion expects
+ *          VINF_CPUM_R3_MSR_READ as the only other status); the failing status
+ *          if reading the instruction length or importing state fails.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitRdmsr(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+
+    /** @todo Optimize this: We currently drag in the whole MSR state
+     * (CPUMCTX_EXTRN_ALL_MSRS) here. We should optimize this to only get
+     * MSRs required. That would require changes to IEM and possibly CPUM too.
+     * (Should probably do it lazy fashion from CPUMAllMsrs.cpp). */
+    uint32_t const idMsr = pVCpu->cpum.GstCtx.ecx;
+    int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+    rc    |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK | CPUMCTX_EXTRN_ALL_MSRS);
+    switch (idMsr)
+    {
+        /* The FS and GS base MSRs are not part of the above all-MSRs mask. */
+        case MSR_K8_FS_BASE: rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_FS); break;
+        case MSR_K8_GS_BASE: rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_GS); break;
+    }
+    AssertRCReturn(rc, rc);
+
+    Log4Func(("ecx=%#RX32\n", idMsr));
+
+#ifdef VBOX_STRICT
+    if (pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_MSR_BITMAPS)
+    {
+        if (   hmR0VmxIsAutoLoadStoreGuestMsr(pVCpu, idMsr)
+            && idMsr != MSR_K6_EFER)
+        {
+            AssertMsgFailed(("Unexpected RDMSR for an MSR in the auto-load/store area in the VMCS. ecx=%#RX32\n", idMsr));
+            HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient);
+        }
+        if (hmR0VmxIsLazyGuestMsr(pVCpu, idMsr))
+        {
+            VMXMSREXITREAD enmRead;
+            VMXMSREXITWRITE enmWrite;
+            int rc2 = HMGetVmxMsrPermission(pVCpu->hm.s.vmx.pvMsrBitmap, idMsr, &enmRead, &enmWrite);
+            AssertRCReturn(rc2, rc2);
+            if (enmRead == VMXMSREXIT_PASSTHRU_READ)
+            {
+                AssertMsgFailed(("Unexpected RDMSR for a passthru lazy-restore MSR. ecx=%#RX32\n", idMsr));
+                HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient);
+            }
+        }
+    }
+#endif
+
+    VBOXSTRICTRC rcStrict = IEMExecDecodedRdmsr(pVCpu, pVmxTransient->cbInstr);
+    STAM_COUNTER_INC(&pVCpu->hm.s.StatExitRdmsr);
+    if (rcStrict == VINF_SUCCESS)
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS
+                                                 | HM_CHANGED_GUEST_RAX | HM_CHANGED_GUEST_RDX);
+    else if (rcStrict == VINF_IEM_RAISED_XCPT)
+    {
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK);
+        rcStrict = VINF_SUCCESS;
+    }
+    else
+        AssertMsg(rcStrict == VINF_CPUM_R3_MSR_READ, ("Unexpected IEMExecDecodedRdmsr rc (%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict)));
+
+    return rcStrict;
+}
+
+
+/**
+ * VM-exit handler for WRMSR (VMX_EXIT_WRMSR).
+ */
+HMVMX_EXIT_DECL hmR0VmxExitWrmsr(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+
+    /** @todo Optimize this: We currently drag in the whole MSR state
+     * (CPUMCTX_EXTRN_ALL_MSRS) here.
We should optimize this to only get + * MSRs required. That would require changes to IEM and possibly CPUM too. + * (Should probably do it lazy fashion from CPUMAllMsrs.cpp). */ + uint32_t const idMsr = pVCpu->cpum.GstCtx.ecx; + int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK + | CPUMCTX_EXTRN_ALL_MSRS); + switch (idMsr) + { + /* + * The FS and GS base MSRs are not part of the above all-MSRs mask. + * + * Although we don't need to fetch the base as it will be overwritten shortly, while + * loading guest-state we would also load the entire segment register including limit + * and attributes and thus we need to load them here. + */ + case MSR_K8_FS_BASE: rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_FS); break; + case MSR_K8_GS_BASE: rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_GS); break; + } + AssertRCReturn(rc, rc); + + Log4Func(("ecx=%#RX32 edx:eax=%#RX32:%#RX32\n", idMsr, pVCpu->cpum.GstCtx.edx, pVCpu->cpum.GstCtx.eax)); + + VBOXSTRICTRC rcStrict = IEMExecDecodedWrmsr(pVCpu, pVmxTransient->cbInstr); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitWrmsr); + + if (rcStrict == VINF_SUCCESS) + { + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS); + + /* If this is an X2APIC WRMSR access, update the APIC state as well. */ + if ( idMsr == MSR_IA32_APICBASE + || ( idMsr >= MSR_IA32_X2APIC_START + && idMsr <= MSR_IA32_X2APIC_END)) + { + /* + * We've already saved the APIC related guest-state (TPR) in hmR0VmxPostRunGuest(). When full APIC register + * virtualization is implemented we'll have to make sure APIC state is saved from the VMCS before IEM changes it. + */ + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_APIC_TPR); + } + else if (idMsr == MSR_IA32_TSC) /* Windows 7 does this during bootup. See @bugref{6398}. 
*/ + pVmxTransient->fUpdateTscOffsettingAndPreemptTimer = true; + else if (idMsr == MSR_K6_EFER) + { + /* + * If the guest touches EFER we need to update the VM-Entry and VM-Exit controls as well, + * even if it is -not- touching bits that cause paging mode changes (LMA/LME). We care about + * the other bits as well, SCE and NXE. See @bugref{7368}. + */ + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_EFER_MSR | HM_CHANGED_VMX_ENTRY_CTLS + | HM_CHANGED_VMX_EXIT_CTLS); + } + + /* Update MSRs that are part of the VMCS and auto-load/store area when MSR-bitmaps are not supported. */ + if (!(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_MSR_BITMAPS)) + { + switch (idMsr) + { + case MSR_IA32_SYSENTER_CS: ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_SYSENTER_CS_MSR); break; + case MSR_IA32_SYSENTER_EIP: ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_SYSENTER_EIP_MSR); break; + case MSR_IA32_SYSENTER_ESP: ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_SYSENTER_ESP_MSR); break; + case MSR_K8_FS_BASE: ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_FS); break; + case MSR_K8_GS_BASE: ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_GS); break; + case MSR_K6_EFER: /* Nothing to do, already handled above. */ break; + default: + { + if (hmR0VmxIsAutoLoadStoreGuestMsr(pVCpu, idMsr)) + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_VMX_GUEST_AUTO_MSRS); + else if (hmR0VmxIsLazyGuestMsr(pVCpu, idMsr)) + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_VMX_GUEST_LAZY_MSRS); + break; + } + } + } +#ifdef VBOX_STRICT + else + { + /* Paranoia. Validate that MSRs in the MSR-bitmaps with write-passthru are not intercepted. */ + switch (idMsr) + { + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_EIP: + case MSR_IA32_SYSENTER_ESP: + case MSR_K8_FS_BASE: + case MSR_K8_GS_BASE: + { + AssertMsgFailed(("Unexpected WRMSR for an MSR in the VMCS. 
ecx=%#RX32\n", idMsr)); + HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient); + } + + /* Writes to MSRs in auto-load/store area/swapped MSRs, shouldn't cause VM-exits with MSR-bitmaps. */ + default: + { + if (hmR0VmxIsAutoLoadStoreGuestMsr(pVCpu, idMsr)) + { + /* EFER writes are always intercepted, see hmR0VmxExportGuestMsrs(). */ + if (idMsr != MSR_K6_EFER) + { + AssertMsgFailed(("Unexpected WRMSR for an MSR in the auto-load/store area in the VMCS. ecx=%#RX32\n", + idMsr)); + HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient); + } + } + + if (hmR0VmxIsLazyGuestMsr(pVCpu, idMsr)) + { + VMXMSREXITREAD enmRead; + VMXMSREXITWRITE enmWrite; + int rc2 = HMGetVmxMsrPermission(pVCpu->hm.s.vmx.pvMsrBitmap, idMsr, &enmRead, &enmWrite); + AssertRCReturn(rc2, rc2); + if (enmWrite == VMXMSREXIT_PASSTHRU_WRITE) + { + AssertMsgFailed(("Unexpected WRMSR for passthru, lazy-restore MSR. ecx=%#RX32\n", idMsr)); + HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient); + } + } + break; + } + } + } +#endif /* VBOX_STRICT */ + } + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + rcStrict = VINF_SUCCESS; + } + else + AssertMsg(rcStrict == VINF_CPUM_R3_MSR_WRITE, ("Unexpected IEMExecDecodedWrmsr rc (%Rrc)\n", VBOXSTRICTRC_VAL(rcStrict))); + + return rcStrict; +} + + +/** + * VM-exit handler for PAUSE (VMX_EXIT_PAUSE). Conditional VM-exit. + */ +HMVMX_EXIT_DECL hmR0VmxExitPause(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + /** @todo The guest has likely hit a contended spinlock. We might want to + * poke a schedule different guest VCPU. */ + return VINF_EM_RAW_INTERRUPT; +} + + +/** + * VM-exit handler for when the TPR value is lowered below the specified + * threshold (VMX_EXIT_TPR_BELOW_THRESHOLD). Conditional VM-exit. 
+ */ +HMVMX_EXIT_NSRC_DECL hmR0VmxExitTprBelowThreshold(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + Assert(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_TPR_SHADOW); + + /* + * The TPR shadow would've been synced with the APIC TPR in hmR0VmxPostRunGuest(). We'll re-evaluate + * pending interrupts and inject them before the next VM-entry so we can just continue execution here. + */ + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitTprBelowThreshold); + return VINF_SUCCESS; +} + + +/** + * VM-exit handler for control-register accesses (VMX_EXIT_MOV_CRX). Conditional + * VM-exit. + * + * @retval VINF_SUCCESS when guest execution can continue. + * @retval VINF_PGM_SYNC_CR3 CR3 sync is required, back to ring-3. + * @retval VERR_EM_INTERPRETER when something unexpected happened, fallback to + * interpreter. + */ +HMVMX_EXIT_DECL hmR0VmxExitMovCRx(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatExitMovCRx, y2); + + int rc = hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient); + rc |= hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + AssertRCReturn(rc, rc); + + VBOXSTRICTRC rcStrict; + PVM pVM = pVCpu->CTX_SUFF(pVM); + RTGCUINTPTR const uExitQual = pVmxTransient->uExitQual; + uint32_t const uAccessType = VMX_EXIT_QUAL_CRX_ACCESS(uExitQual); + switch (uAccessType) + { + case VMX_EXIT_QUAL_CRX_ACCESS_WRITE: /* MOV to CRx */ + { + uint32_t const uOldCr0 = pVCpu->cpum.GstCtx.cr0; + rcStrict = IEMExecDecodedMovCRxWrite(pVCpu, pVmxTransient->cbInstr, VMX_EXIT_QUAL_CRX_REGISTER(uExitQual), + VMX_EXIT_QUAL_CRX_GENREG(uExitQual)); + AssertMsg( rcStrict == VINF_SUCCESS + || rcStrict == VINF_IEM_RAISED_XCPT + || rcStrict == VINF_PGM_SYNC_CR3, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); + + switch (VMX_EXIT_QUAL_CRX_REGISTER(uExitQual)) + { + case 0: + { + 
ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, + HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | HM_CHANGED_GUEST_CR0); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR0Write); + Log4Func(("CR0 write rcStrict=%Rrc CR0=%#RX64\n", VBOXSTRICTRC_VAL(rcStrict), pVCpu->cpum.GstCtx.cr0)); + + /* + * This is a kludge for handling switches back to real mode when we try to use + * V86 mode to run real mode code directly. Problem is that V86 mode cannot + * deal with special selector values, so we have to return to ring-3 and run + * there till the selector values are V86 mode compatible. + * + * Note! Using VINF_EM_RESCHEDULE_REM here rather than VINF_EM_RESCHEDULE since the + * latter is an alias for VINF_IEM_RAISED_XCPT which is converted to VINF_SUCCESs + * at the end of this function. + */ + if ( rc == VINF_SUCCESS + && !pVCpu->CTX_SUFF(pVM)->hm.s.vmx.fUnrestrictedGuest + && CPUMIsGuestInRealModeEx(&pVCpu->cpum.GstCtx) + && (uOldCr0 & X86_CR0_PE) + && !(pVCpu->cpum.GstCtx.cr0 & X86_CR0_PE) ) + { + /** @todo check selectors rather than returning all the time. */ + Log4Func(("CR0 write, back to real mode -> VINF_EM_RESCHEDULE_REM\n")); + rcStrict = VINF_EM_RESCHEDULE_REM; + } + break; + } + + case 2: + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR2Write); + /* Nothing to do here, CR2 it's not part of the VMCS. 
*/ + break; + } + + case 3: + { + Assert( !pVM->hm.s.fNestedPaging + || !CPUMIsGuestPagingEnabledEx(&pVCpu->cpum.GstCtx) + || pVCpu->hm.s.fUsingDebugLoop); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR3Write); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, + HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | HM_CHANGED_GUEST_CR3); + Log4Func(("CR3 write rcStrict=%Rrc CR3=%#RX64\n", VBOXSTRICTRC_VAL(rcStrict), pVCpu->cpum.GstCtx.cr3)); + break; + } + + case 4: + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR4Write); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, + HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | HM_CHANGED_GUEST_CR4); + Log4Func(("CR4 write rc=%Rrc CR4=%#RX64 fLoadSaveGuestXcr0=%u\n", VBOXSTRICTRC_VAL(rcStrict), + pVCpu->cpum.GstCtx.cr4, pVCpu->hm.s.fLoadSaveGuestXcr0)); + break; + } + + case 8: + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR8Write); + Assert(!(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_TPR_SHADOW)); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, + HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | HM_CHANGED_GUEST_APIC_TPR); + break; + } + default: + AssertMsgFailed(("Invalid CRx register %#x\n", VMX_EXIT_QUAL_CRX_REGISTER(uExitQual))); + break; + } + break; + } + + case VMX_EXIT_QUAL_CRX_ACCESS_READ: /* MOV from CRx */ + { + Assert( !pVM->hm.s.fNestedPaging + || !CPUMIsGuestPagingEnabledEx(&pVCpu->cpum.GstCtx) + || pVCpu->hm.s.fUsingDebugLoop + || VMX_EXIT_QUAL_CRX_REGISTER(uExitQual) != 3); + /* CR8 reads only cause a VM-exit when the TPR shadow feature isn't enabled. 
*/ + Assert( VMX_EXIT_QUAL_CRX_REGISTER(uExitQual) != 8 + || !(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_TPR_SHADOW)); + + rcStrict = IEMExecDecodedMovCRxRead(pVCpu, pVmxTransient->cbInstr, VMX_EXIT_QUAL_CRX_GENREG(uExitQual), + VMX_EXIT_QUAL_CRX_REGISTER(uExitQual)); + AssertMsg( rcStrict == VINF_SUCCESS + || rcStrict == VINF_IEM_RAISED_XCPT, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); +#ifdef VBOX_WITH_STATISTICS + switch (VMX_EXIT_QUAL_CRX_REGISTER(uExitQual)) + { + case 0: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR0Read); break; + case 2: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR2Read); break; + case 3: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR3Read); break; + case 4: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR4Read); break; + case 8: STAM_COUNTER_INC(&pVCpu->hm.s.StatExitCR8Read); break; + } +#endif + Log4Func(("CR%d Read access rcStrict=%Rrc\n", VMX_EXIT_QUAL_CRX_REGISTER(uExitQual), + VBOXSTRICTRC_VAL(rcStrict))); + if (VMX_EXIT_QUAL_CRX_GENREG(uExitQual) == X86_GREG_xSP) + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | HM_CHANGED_GUEST_RSP); + else + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS); + break; + } + + case VMX_EXIT_QUAL_CRX_ACCESS_CLTS: /* CLTS (Clear Task-Switch Flag in CR0) */ + { + rcStrict = IEMExecDecodedClts(pVCpu, pVmxTransient->cbInstr); + AssertMsg( rcStrict == VINF_SUCCESS + || rcStrict == VINF_IEM_RAISED_XCPT, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); + + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | HM_CHANGED_GUEST_CR0); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitClts); + Log4Func(("CLTS rcStrict=%d\n", VBOXSTRICTRC_VAL(rcStrict))); + break; + } + + case VMX_EXIT_QUAL_CRX_ACCESS_LMSW: /* LMSW (Load Machine-Status Word into CR0) */ + { + /* Note! LMSW cannot clear CR0.PE, so no fRealOnV86Active kludge needed here. 
*/ + rc = hmR0VmxReadGuestLinearAddrVmcs(pVCpu, pVmxTransient); + AssertRCReturn(rc, rc); + rcStrict = IEMExecDecodedLmsw(pVCpu, pVmxTransient->cbInstr, VMX_EXIT_QUAL_CRX_LMSW_DATA(uExitQual), + pVmxTransient->uGuestLinearAddr); + AssertMsg( rcStrict == VINF_SUCCESS + || rcStrict == VINF_IEM_RAISED_XCPT + , ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); + + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | HM_CHANGED_GUEST_CR0); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitLmsw); + Log4Func(("LMSW rcStrict=%d\n", VBOXSTRICTRC_VAL(rcStrict))); + break; + } + + default: + AssertMsgFailedReturn(("Invalid access-type in Mov CRx VM-exit qualification %#x\n", uAccessType), + VERR_VMX_UNEXPECTED_EXCEPTION); + } + + Assert( (pVCpu->hm.s.fCtxChanged & (HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS)) + == (HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS)); + if (rcStrict == VINF_IEM_RAISED_XCPT) + { + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + rcStrict = VINF_SUCCESS; + } + + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExitMovCRx, y2); + NOREF(pVM); + return rcStrict; +} + + +/** + * VM-exit handler for I/O instructions (VMX_EXIT_IO_INSTR). Conditional + * VM-exit. + */ +HMVMX_EXIT_DECL hmR0VmxExitIoInstr(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + STAM_PROFILE_ADV_START(&pVCpu->hm.s.StatExitIO, y1); + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + int rc = hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient); + rc |= hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK | CPUMCTX_EXTRN_SREG_MASK | CPUMCTX_EXTRN_EFER); + /* EFER also required for longmode checks in EMInterpretDisasCurrent(), but it's always up-to-date. */ + AssertRCReturn(rc, rc); + + /* Refer Intel spec. 27-5. "Exit Qualifications for I/O Instructions" for the format. 
*/ + uint32_t uIOPort = VMX_EXIT_QUAL_IO_PORT(pVmxTransient->uExitQual); + uint8_t uIOWidth = VMX_EXIT_QUAL_IO_WIDTH(pVmxTransient->uExitQual); + bool fIOWrite = (VMX_EXIT_QUAL_IO_DIRECTION(pVmxTransient->uExitQual) == VMX_EXIT_QUAL_IO_DIRECTION_OUT); + bool fIOString = VMX_EXIT_QUAL_IO_IS_STRING(pVmxTransient->uExitQual); + bool fGstStepping = RT_BOOL(pCtx->eflags.Bits.u1TF); + bool fDbgStepping = pVCpu->hm.s.fSingleInstruction; + AssertReturn(uIOWidth <= 3 && uIOWidth != 2, VERR_VMX_IPE_1); + + /* + * Update exit history to see if this exit can be optimized. + */ + VBOXSTRICTRC rcStrict; + PCEMEXITREC pExitRec = NULL; + if ( !fGstStepping + && !fDbgStepping) + pExitRec = EMHistoryUpdateFlagsAndTypeAndPC(pVCpu, + !fIOString + ? !fIOWrite + ? EMEXIT_MAKE_FT(EMEXIT_F_KIND_EM | EMEXIT_F_HM, EMEXITTYPE_IO_PORT_READ) + : EMEXIT_MAKE_FT(EMEXIT_F_KIND_EM | EMEXIT_F_HM, EMEXITTYPE_IO_PORT_WRITE) + : !fIOWrite + ? EMEXIT_MAKE_FT(EMEXIT_F_KIND_EM | EMEXIT_F_HM, EMEXITTYPE_IO_PORT_STR_READ) + : EMEXIT_MAKE_FT(EMEXIT_F_KIND_EM | EMEXIT_F_HM, EMEXITTYPE_IO_PORT_STR_WRITE), + pVCpu->cpum.GstCtx.rip + pVCpu->cpum.GstCtx.cs.u64Base); + if (!pExitRec) + { + /* I/O operation lookup arrays. */ + static uint32_t const s_aIOSizes[4] = { 1, 2, 0, 4 }; /* Size of the I/O accesses. */ + static uint32_t const s_aIOOpAnd[4] = { 0xff, 0xffff, 0, 0xffffffff }; /* AND masks for saving result in AL/AX/EAX. */ + uint32_t const cbValue = s_aIOSizes[uIOWidth]; + uint32_t const cbInstr = pVmxTransient->cbInstr; + bool fUpdateRipAlready = false; /* ugly hack, should be temporary. */ + PVM pVM = pVCpu->CTX_SUFF(pVM); + if (fIOString) + { + /* + * INS/OUTS - I/O String instruction. + * + * Use instruction-information if available, otherwise fall back on + * interpreting the instruction. + */ + Log4Func(("CS:RIP=%04x:%08RX64 %#06x/%u %c str\n", pCtx->cs.Sel, pCtx->rip, uIOPort, cbValue, fIOWrite ? 
'w' : 'r')); + AssertReturn(pCtx->dx == uIOPort, VERR_VMX_IPE_2); + bool const fInsOutsInfo = RT_BF_GET(pVM->hm.s.vmx.Msrs.u64Basic, VMX_BF_BASIC_VMCS_INS_OUTS); + if (fInsOutsInfo) + { + int rc2 = hmR0VmxReadExitInstrInfoVmcs(pVmxTransient); + AssertRCReturn(rc2, rc2); + AssertReturn(pVmxTransient->ExitInstrInfo.StrIo.u3AddrSize <= 2, VERR_VMX_IPE_3); + AssertCompile(IEMMODE_16BIT == 0 && IEMMODE_32BIT == 1 && IEMMODE_64BIT == 2); + IEMMODE const enmAddrMode = (IEMMODE)pVmxTransient->ExitInstrInfo.StrIo.u3AddrSize; + bool const fRep = VMX_EXIT_QUAL_IO_IS_REP(pVmxTransient->uExitQual); + if (fIOWrite) + rcStrict = IEMExecStringIoWrite(pVCpu, cbValue, enmAddrMode, fRep, cbInstr, + pVmxTransient->ExitInstrInfo.StrIo.iSegReg, true /*fIoChecked*/); + else + { + /* + * The segment prefix for INS cannot be overridden and is always ES. We can safely assume X86_SREG_ES. + * Hence "iSegReg" field is undefined in the instruction-information field in VT-x for INS. + * See Intel Instruction spec. for "INS". + * See Intel spec. Table 27-8 "Format of the VM-Exit Instruction-Information Field as Used for INS and OUTS". + */ + rcStrict = IEMExecStringIoRead(pVCpu, cbValue, enmAddrMode, fRep, cbInstr, true /*fIoChecked*/); + } + } + else + rcStrict = IEMExecOne(pVCpu); + + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP); + fUpdateRipAlready = true; + } + else + { + /* + * IN/OUT - I/O instruction. + */ + Log4Func(("CS:RIP=%04x:%08RX64 %#06x/%u %c\n", pCtx->cs.Sel, pCtx->rip, uIOPort, cbValue, fIOWrite ? 
'w' : 'r')); + uint32_t const uAndVal = s_aIOOpAnd[uIOWidth]; + Assert(!VMX_EXIT_QUAL_IO_IS_REP(pVmxTransient->uExitQual)); + if (fIOWrite) + { + rcStrict = IOMIOPortWrite(pVM, pVCpu, uIOPort, pCtx->eax & uAndVal, cbValue); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitIOWrite); + if ( rcStrict == VINF_IOM_R3_IOPORT_WRITE + && !pCtx->eflags.Bits.u1TF) + rcStrict = EMRZSetPendingIoPortWrite(pVCpu, uIOPort, cbInstr, cbValue, pCtx->eax & uAndVal); + } + else + { + uint32_t u32Result = 0; + rcStrict = IOMIOPortRead(pVM, pVCpu, uIOPort, &u32Result, cbValue); + if (IOM_SUCCESS(rcStrict)) + { + /* Save result of I/O IN instr. in AL/AX/EAX. */ + pCtx->eax = (pCtx->eax & ~uAndVal) | (u32Result & uAndVal); + } + if ( rcStrict == VINF_IOM_R3_IOPORT_READ + && !pCtx->eflags.Bits.u1TF) + rcStrict = EMRZSetPendingIoPortRead(pVCpu, uIOPort, cbInstr, cbValue); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitIORead); + } + } + + if (IOM_SUCCESS(rcStrict)) + { + if (!fUpdateRipAlready) + { + hmR0VmxAdvanceGuestRipBy(pVCpu, cbInstr); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP); + } + + /* + * INS/OUTS with REP prefix updates RFLAGS, can be observed with triple-fault guru + * while booting Fedora 17 64-bit guest. + * + * See Intel Instruction reference for REP/REPE/REPZ/REPNE/REPNZ. + */ + if (fIOString) + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RFLAGS); + + /* + * If any I/O breakpoints are armed, we need to check if one triggered + * and take appropriate action. + * Note that the I/O breakpoint type is undefined if CR4.DE is 0. + */ + rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_DR7); + AssertRCReturn(rc, rc); + + /** @todo Optimize away the DBGFBpIsHwIoArmed call by having DBGF tell the + * execution engines about whether hyper BPs and such are pending. 
*/ + uint32_t const uDr7 = pCtx->dr[7]; + if (RT_UNLIKELY( ( (uDr7 & X86_DR7_ENABLED_MASK) + && X86_DR7_ANY_RW_IO(uDr7) + && (pCtx->cr4 & X86_CR4_DE)) + || DBGFBpIsHwIoArmed(pVM))) + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatDRxIoCheck); + + /* We're playing with the host CPU state here, make sure we don't preempt or longjmp. */ + VMMRZCallRing3Disable(pVCpu); + HM_DISABLE_PREEMPT(pVCpu); + + bool fIsGuestDbgActive = CPUMR0DebugStateMaybeSaveGuest(pVCpu, true /* fDr6 */); + + VBOXSTRICTRC rcStrict2 = DBGFBpCheckIo(pVM, pVCpu, pCtx, uIOPort, cbValue); + if (rcStrict2 == VINF_EM_RAW_GUEST_TRAP) + { + /* Raise #DB. */ + if (fIsGuestDbgActive) + ASMSetDR6(pCtx->dr[6]); + if (pCtx->dr[7] != uDr7) + pVCpu->hm.s.fCtxChanged |= HM_CHANGED_GUEST_DR7; + + hmR0VmxSetPendingXcptDB(pVCpu); + } + /* rcStrict is VINF_SUCCESS, VINF_IOM_R3_IOPORT_COMMIT_WRITE, or in [VINF_EM_FIRST..VINF_EM_LAST], + however we can ditch VINF_IOM_R3_IOPORT_COMMIT_WRITE as it has VMCPU_FF_IOM as backup. */ + else if ( rcStrict2 != VINF_SUCCESS + && (rcStrict == VINF_SUCCESS || rcStrict2 < rcStrict)) + rcStrict = rcStrict2; + AssertCompile(VINF_EM_LAST < VINF_IOM_R3_IOPORT_COMMIT_WRITE); + + HM_RESTORE_PREEMPT(); + VMMRZCallRing3Enable(pVCpu); + } + } + +#ifdef VBOX_STRICT + if ( rcStrict == VINF_IOM_R3_IOPORT_READ + || rcStrict == VINF_EM_PENDING_R3_IOPORT_READ) + Assert(!fIOWrite); + else if ( rcStrict == VINF_IOM_R3_IOPORT_WRITE + || rcStrict == VINF_IOM_R3_IOPORT_COMMIT_WRITE + || rcStrict == VINF_EM_PENDING_R3_IOPORT_WRITE) + Assert(fIOWrite); + else + { +# if 0 /** @todo r=bird: This is missing a bunch of VINF_EM_FIRST..VINF_EM_LAST + * statuses, that the VMM device and some others may return. See + * IOM_SUCCESS() for guidance. 
*/ + AssertMsg( RT_FAILURE(rcStrict) + || rcStrict == VINF_SUCCESS + || rcStrict == VINF_EM_RAW_EMULATE_INSTR + || rcStrict == VINF_EM_DBG_BREAKPOINT + || rcStrict == VINF_EM_RAW_GUEST_TRAP + || rcStrict == VINF_EM_RAW_TO_R3 + || rcStrict == VINF_TRPM_XCPT_DISPATCHED, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); +# endif + } +#endif + STAM_PROFILE_ADV_STOP(&pVCpu->hm.s.StatExitIO, y1); + } + else + { + /* + * Frequent exit or something needing probing. Get state and call EMHistoryExec. + */ + int rc2 = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL); + AssertRCReturn(rc2, rc2); + STAM_COUNTER_INC(!fIOString ? fIOWrite ? &pVCpu->hm.s.StatExitIOWrite : &pVCpu->hm.s.StatExitIORead + : fIOWrite ? &pVCpu->hm.s.StatExitIOStringWrite : &pVCpu->hm.s.StatExitIOStringRead); + Log4(("IOExit/%u: %04x:%08RX64: %s%s%s %#x LB %u -> EMHistoryExec\n", + pVCpu->idCpu, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, + VMX_EXIT_QUAL_IO_IS_REP(pVmxTransient->uExitQual) ? "REP " : "", + fIOWrite ? "OUT" : "IN", fIOString ? "S" : "", uIOPort, uIOWidth)); + + rcStrict = EMHistoryExec(pVCpu, pExitRec, 0); + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); + + Log4(("IOExit/%u: %04x:%08RX64: EMHistoryExec -> %Rrc + %04x:%08RX64\n", + pVCpu->idCpu, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, + VBOXSTRICTRC_VAL(rcStrict), pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip)); + } + return rcStrict; +} + + +/** + * VM-exit handler for task switches (VMX_EXIT_TASK_SWITCH). Unconditional + * VM-exit. + */ +HMVMX_EXIT_DECL hmR0VmxExitTaskSwitch(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + + /* Check if this task-switch occurred while delivery an event through the guest IDT. 
*/ + int rc = hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient); + AssertRCReturn(rc, rc); + if (VMX_EXIT_QUAL_TASK_SWITCH_TYPE(pVmxTransient->uExitQual) == VMX_EXIT_QUAL_TASK_SWITCH_TYPE_IDT) + { + rc = hmR0VmxReadIdtVectoringInfoVmcs(pVmxTransient); + AssertRCReturn(rc, rc); + if (VMX_IDT_VECTORING_INFO_IS_VALID(pVmxTransient->uIdtVectoringInfo)) + { + uint32_t uErrCode; + RTGCUINTPTR GCPtrFaultAddress; + uint32_t const uIntType = VMX_IDT_VECTORING_INFO_TYPE(pVmxTransient->uIdtVectoringInfo); + uint32_t const uVector = VMX_IDT_VECTORING_INFO_VECTOR(pVmxTransient->uIdtVectoringInfo); + bool const fErrorCodeValid = VMX_IDT_VECTORING_INFO_IS_ERROR_CODE_VALID(pVmxTransient->uIdtVectoringInfo); + if (fErrorCodeValid) + { + rc = hmR0VmxReadIdtVectoringErrorCodeVmcs(pVmxTransient); + AssertRCReturn(rc, rc); + uErrCode = pVmxTransient->uIdtVectoringErrorCode; + } + else + uErrCode = 0; + + if ( uIntType == VMX_IDT_VECTORING_INFO_TYPE_HW_XCPT + && uVector == X86_XCPT_PF) + GCPtrFaultAddress = pVCpu->cpum.GstCtx.cr2; + else + GCPtrFaultAddress = 0; + + rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + AssertRCReturn(rc, rc); + + hmR0VmxSetPendingEvent(pVCpu, VMX_ENTRY_INT_INFO_FROM_EXIT_IDT_INFO(pVmxTransient->uIdtVectoringInfo), + pVmxTransient->cbInstr, uErrCode, GCPtrFaultAddress); + + Log4Func(("Pending event. uIntType=%#x uVector=%#x\n", uIntType, uVector)); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitTaskSwitch); + return VINF_EM_RAW_INJECT_TRPM_EVENT; + } + } + + /* Fall back to the interpreter to emulate the task-switch. */ + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitTaskSwitch); + return VERR_EM_INTERPRETER; +} + + +/** + * VM-exit handler for monitor-trap-flag (VMX_EXIT_MTF). Conditional VM-exit. 
+ */ +HMVMX_EXIT_DECL hmR0VmxExitMtf(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + Assert(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_MONITOR_TRAP_FLAG); + pVCpu->hm.s.vmx.u32ProcCtls &= ~VMX_PROC_CTLS_MONITOR_TRAP_FLAG; + int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, pVCpu->hm.s.vmx.u32ProcCtls); + AssertRCReturn(rc, rc); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitMtf); + return VINF_EM_DBG_STEPPED; +} + + +/** + * VM-exit handler for APIC access (VMX_EXIT_APIC_ACCESS). Conditional VM-exit. + */ +HMVMX_EXIT_DECL hmR0VmxExitApicAccess(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); + + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitApicAccess); + + /* If this VM-exit occurred while delivering an event through the guest IDT, handle it accordingly. */ + VBOXSTRICTRC rcStrict1 = hmR0VmxCheckExitDueToEventDelivery(pVCpu, pVmxTransient); + if (RT_LIKELY(rcStrict1 == VINF_SUCCESS)) + { + /* For some crazy guest, if an event delivery causes an APIC-access VM-exit, go to instruction emulation. */ + if (RT_UNLIKELY(pVCpu->hm.s.Event.fPending)) + { + STAM_COUNTER_INC(&pVCpu->hm.s.StatInjectPendingInterpret); + return VINF_EM_RAW_INJECT_TRPM_EVENT; + } + } + else + { + if (rcStrict1 == VINF_HM_DOUBLE_FAULT) + rcStrict1 = VINF_SUCCESS; + return rcStrict1; + } + + /* IOMMIOPhysHandler() below may call into IEM, save the necessary state. */ + int rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK); + rc |= hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient); + AssertRCReturn(rc, rc); + + /* See Intel spec. 
27-6 "Exit Qualifications for APIC-access VM-exits from Linear Accesses & Guest-Phyiscal Addresses" */ + uint32_t uAccessType = VMX_EXIT_QUAL_APIC_ACCESS_TYPE(pVmxTransient->uExitQual); + VBOXSTRICTRC rcStrict2; + switch (uAccessType) + { + case VMX_APIC_ACCESS_TYPE_LINEAR_WRITE: + case VMX_APIC_ACCESS_TYPE_LINEAR_READ: + { + AssertMsg( !(pVCpu->hm.s.vmx.u32ProcCtls & VMX_PROC_CTLS_USE_TPR_SHADOW) + || VMX_EXIT_QUAL_APIC_ACCESS_OFFSET(pVmxTransient->uExitQual) != XAPIC_OFF_TPR, + ("hmR0VmxExitApicAccess: can't access TPR offset while using TPR shadowing.\n")); + + RTGCPHYS GCPhys = pVCpu->hm.s.vmx.u64MsrApicBase; /* Always up-to-date, u64MsrApicBase is not part of the VMCS. */ + GCPhys &= PAGE_BASE_GC_MASK; + GCPhys += VMX_EXIT_QUAL_APIC_ACCESS_OFFSET(pVmxTransient->uExitQual); + PVM pVM = pVCpu->CTX_SUFF(pVM); + Log4Func(("Linear access uAccessType=%#x GCPhys=%#RGp Off=%#x\n", uAccessType, GCPhys, + VMX_EXIT_QUAL_APIC_ACCESS_OFFSET(pVmxTransient->uExitQual))); + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + rcStrict2 = IOMMMIOPhysHandler(pVM, pVCpu, + uAccessType == VMX_APIC_ACCESS_TYPE_LINEAR_READ ? 0 : X86_TRAP_PF_RW, + CPUMCTX2CORE(pCtx), GCPhys); + Log4Func(("IOMMMIOPhysHandler returned %Rrc\n", VBOXSTRICTRC_VAL(rcStrict2))); + if ( rcStrict2 == VINF_SUCCESS + || rcStrict2 == VERR_PAGE_TABLE_NOT_PRESENT + || rcStrict2 == VERR_PAGE_NOT_PRESENT) + { + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RSP | HM_CHANGED_GUEST_RFLAGS + | HM_CHANGED_GUEST_APIC_TPR); + rcStrict2 = VINF_SUCCESS; + } + break; + } + + default: + Log4Func(("uAccessType=%#x\n", uAccessType)); + rcStrict2 = VINF_EM_RAW_EMULATE_INSTR; + break; + } + + if (rcStrict2 != VINF_SUCCESS) + STAM_COUNTER_INC(&pVCpu->hm.s.StatSwitchApicAccessToR3); + return rcStrict2; +} + + +/** + * VM-exit handler for debug-register accesses (VMX_EXIT_MOV_DRX). Conditional + * VM-exit. 
+ */
+HMVMX_EXIT_DECL hmR0VmxExitMovDRx(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+
+    /* We should -not- get this VM-exit if the guest's debug registers were active. */
+    if (pVmxTransient->fWasGuestDebugStateActive)
+    {
+        AssertMsgFailed(("Unexpected MOV DRx exit\n"));
+        HMVMX_UNEXPECTED_EXIT_RET(pVCpu, pVmxTransient);
+    }
+
+    /*
+     * Fast path: not single-stepping and the hypervisor debug state was not active.
+     * Stop intercepting MOV DRx, load the guest debug state and return VINF_SUCCESS
+     * without interpreting or skipping the instruction.
+     */
+    if (   !pVCpu->hm.s.fSingleInstruction
+        && !pVmxTransient->fWasHyperDebugStateActive)
+    {
+        Assert(!DBGFIsStepping(pVCpu));
+        Assert(pVCpu->hm.s.vmx.u32XcptBitmap & RT_BIT_32(X86_XCPT_DB));
+
+        /* Don't intercept MOV DRx any more. */
+        pVCpu->hm.s.vmx.u32ProcCtls &= ~VMX_PROC_CTLS_MOV_DR_EXIT;
+        int rc = VMXWriteVmcs32(VMX_VMCS32_CTRL_PROC_EXEC, pVCpu->hm.s.vmx.u32ProcCtls);
+        AssertRCReturn(rc, rc);
+
+        /* We're playing with the host CPU state here, make sure we can't preempt or longjmp. */
+        VMMRZCallRing3Disable(pVCpu);
+        HM_DISABLE_PREEMPT(pVCpu);
+
+        /* Save the host & load the guest debug state, restart execution of the MOV DRx instruction. */
+        CPUMR0LoadGuestDebugState(pVCpu, true /* include DR6 */);
+        Assert(CPUMIsGuestDebugStateActive(pVCpu) || HC_ARCH_BITS == 32);
+
+        HM_RESTORE_PREEMPT();
+        VMMRZCallRing3Enable(pVCpu);
+
+#ifdef VBOX_WITH_STATISTICS
+        /* The exit qualification is only read here to pick the read or write statistics counter. */
+        rc = hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient);
+        AssertRCReturn(rc, rc);
+        if (VMX_EXIT_QUAL_DRX_DIRECTION(pVmxTransient->uExitQual) == VMX_EXIT_QUAL_DRX_DIRECTION_WRITE)
+            STAM_COUNTER_INC(&pVCpu->hm.s.StatExitDRxWrite);
+        else
+            STAM_COUNTER_INC(&pVCpu->hm.s.StatExitDRxRead);
+#endif
+        STAM_COUNTER_INC(&pVCpu->hm.s.StatDRxContextSwitch);
+        return VINF_SUCCESS;
+    }
+
+    /*
+     * EMInterpretDRx[Write|Read]() calls CPUMIsGuestIn64BitCode() which requires EFER, CS. EFER is always up-to-date.
+     * Update the segment registers and DR7 from the CPU.
+     */
+    PCPUMCTX pCtx = &pVCpu->cpum.GstCtx;
+    int rc = hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient);
+    rc    |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_SREG_MASK | CPUMCTX_EXTRN_DR7);
+    AssertRCReturn(rc, rc);
+    Log4Func(("CS:RIP=%04x:%08RX64\n", pCtx->cs.Sel, pCtx->rip));
+
+    /* Slow path: interpret the access. Note the two interpreters take the DRx and general register
+       indices in opposite order. */
+    PVM pVM = pVCpu->CTX_SUFF(pVM);
+    if (VMX_EXIT_QUAL_DRX_DIRECTION(pVmxTransient->uExitQual) == VMX_EXIT_QUAL_DRX_DIRECTION_WRITE)
+    {
+        rc = EMInterpretDRxWrite(pVM, pVCpu, CPUMCTX2CORE(pCtx),
+                                 VMX_EXIT_QUAL_DRX_REGISTER(pVmxTransient->uExitQual),
+                                 VMX_EXIT_QUAL_DRX_GENREG(pVmxTransient->uExitQual));
+        /* A successful write marks DR7 as changed. */
+        if (RT_SUCCESS(rc))
+            ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_DR7);
+        STAM_COUNTER_INC(&pVCpu->hm.s.StatExitDRxWrite);
+    }
+    else
+    {
+        rc = EMInterpretDRxRead(pVM, pVCpu, CPUMCTX2CORE(pCtx),
+                                VMX_EXIT_QUAL_DRX_GENREG(pVmxTransient->uExitQual),
+                                VMX_EXIT_QUAL_DRX_REGISTER(pVmxTransient->uExitQual));
+        STAM_COUNTER_INC(&pVCpu->hm.s.StatExitDRxRead);
+    }
+
+    /* On success skip the interpreted instruction; otherwise hand the interpreter status to the caller. */
+    Assert(rc == VINF_SUCCESS || rc == VERR_EM_INTERPRETER);
+    if (RT_SUCCESS(rc))
+    {
+        int rc2 = hmR0VmxAdvanceGuestRip(pVCpu, pVmxTransient);
+        AssertRCReturn(rc2, rc2);
+        return VINF_SUCCESS;
+    }
+    return rc;
+}
+
+
+/**
+ * VM-exit handler for EPT misconfiguration (VMX_EXIT_EPT_MISCONFIG).
+ * Conditional VM-exit.
+ *
+ * Returns VINF_EM_RAW_INJECT_TRPM_EVENT when an event is still pending after the
+ * event-delivery check; otherwise the status of the PGM misconfig handler or of
+ * EMHistoryExec(), with the two "not present" PGM statuses mapped to VINF_SUCCESS.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitEptMisconfig(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    Assert(pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging);
+
+    /* If this VM-exit occurred while delivering an event through the guest IDT, handle it accordingly. */
+    VBOXSTRICTRC rcStrict1 = hmR0VmxCheckExitDueToEventDelivery(pVCpu, pVmxTransient);
+    if (RT_LIKELY(rcStrict1 == VINF_SUCCESS))
+    {
+        /* If event delivery causes an EPT misconfig (MMIO), go back to instruction emulation as otherwise
+           injecting the original pending event would most likely cause the same EPT misconfig VM-exit. */
+        if (RT_UNLIKELY(pVCpu->hm.s.Event.fPending))
+        {
+            STAM_COUNTER_INC(&pVCpu->hm.s.StatInjectPendingInterpret);
+            return VINF_EM_RAW_INJECT_TRPM_EVENT;
+        }
+    }
+    else
+    {
+        /* VINF_HM_DOUBLE_FAULT is reported to the caller as plain success; anything else is passed on as is. */
+        if (rcStrict1 == VINF_HM_DOUBLE_FAULT)
+            rcStrict1 = VINF_SUCCESS;
+        return rcStrict1;
+    }
+
+    /*
+     * Get sufficient state and update the exit history entry.
+     */
+    RTGCPHYS GCPhys;
+    int rc = VMXReadVmcs64(VMX_VMCS64_RO_GUEST_PHYS_ADDR_FULL, &GCPhys);
+    rc    |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK);
+    AssertRCReturn(rc, rc);
+
+    /* The exit history is keyed on the flat PC (RIP + CS base). */
+    VBOXSTRICTRC rcStrict;
+    PCEMEXITREC pExitRec = EMHistoryUpdateFlagsAndTypeAndPC(pVCpu,
+                                                            EMEXIT_MAKE_FT(EMEXIT_F_KIND_EM | EMEXIT_F_HM, EMEXITTYPE_MMIO),
+                                                            pVCpu->cpum.GstCtx.rip + pVCpu->cpum.GstCtx.cs.u64Base);
+    if (!pExitRec)
+    {
+        /*
+         * If we succeed, resume guest execution.
+         * If we fail in interpreting the instruction because we couldn't get the guest physical address
+         * of the page containing the instruction via the guest's page tables (we would invalidate the guest page
+         * in the host TLB), resume execution which would cause a guest page fault to let the guest handle this
+         * weird case. See @bugref{6043}.
+         */
+        PVM pVM = pVCpu->CTX_SUFF(pVM);
+        PCPUMCTX pCtx = &pVCpu->cpum.GstCtx;
+        rcStrict = PGMR0Trap0eHandlerNPMisconfig(pVM, pVCpu, PGMMODE_EPT, CPUMCTX2CORE(pCtx), GCPhys, UINT32_MAX);
+        Log4Func(("At %#RGp RIP=%#RX64 rc=%Rrc\n", GCPhys, pCtx->rip, VBOXSTRICTRC_VAL(rcStrict)));
+        if (   rcStrict == VINF_SUCCESS
+            || rcStrict == VERR_PAGE_TABLE_NOT_PRESENT
+            || rcStrict == VERR_PAGE_NOT_PRESENT)
+        {
+            /* Successfully handled MMIO operation. */
+            ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RSP | HM_CHANGED_GUEST_RFLAGS
+                                                     | HM_CHANGED_GUEST_APIC_TPR);
+            rcStrict = VINF_SUCCESS;
+        }
+    }
+    else
+    {
+        /*
+         * Frequent exit or something needing probing. Get state and call EMHistoryExec.
+         */
+        /* NOTE(review): the same IEM_CPUMCTX_EXTRN_MUST_MASK import was already done above; this looks redundant - confirm. */
+        int rc2 = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK);
+        AssertRCReturn(rc2, rc2);
+
+        Log4(("EptMisscfgExit/%u: %04x:%08RX64: %RGp -> EMHistoryExec\n",
+              pVCpu->idCpu, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip, GCPhys));
+
+        rcStrict = EMHistoryExec(pVCpu, pExitRec, 0);
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST);
+
+        /* NOTE(review): CS:RIP is passed twice below and both pairs are read after EMHistoryExec(). */
+        Log4(("EptMisscfgExit/%u: %04x:%08RX64: EMHistoryExec -> %Rrc + %04x:%08RX64\n",
+              pVCpu->idCpu, pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip,
+              VBOXSTRICTRC_VAL(rcStrict), pVCpu->cpum.GstCtx.cs.Sel, pVCpu->cpum.GstCtx.rip));
+    }
+    return VBOXSTRICTRC_TODO(rcStrict);
+}
+
+
+/**
+ * VM-exit handler for EPT violation (VMX_EXIT_EPT_VIOLATION). Conditional
+ * VM-exit.
+ *
+ * Builds a \#PF style error code from the exit qualification and hands the fault
+ * to PGMR0Trap0eHandlerNestedPaging(). Returns VINF_SUCCESS when that handler
+ * succeeds or reports a not-present page/page table, otherwise its status.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitEptViolation(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    Assert(pVCpu->CTX_SUFF(pVM)->hm.s.fNestedPaging);
+
+    /* If this VM-exit occurred while delivering an event through the guest IDT, handle it accordingly. */
+    VBOXSTRICTRC rcStrict1 = hmR0VmxCheckExitDueToEventDelivery(pVCpu, pVmxTransient);
+    if (RT_LIKELY(rcStrict1 == VINF_SUCCESS))
+    {
+        /* In the unlikely case that the EPT violation happened as a result of delivering an event, log it. */
+        if (RT_UNLIKELY(pVCpu->hm.s.Event.fPending))
+            Log4Func(("EPT violation with an event pending u64IntInfo=%#RX64\n", pVCpu->hm.s.Event.u64IntInfo));
+    }
+    else
+    {
+        /* VINF_HM_DOUBLE_FAULT is reported to the caller as plain success; anything else is passed on as is. */
+        if (rcStrict1 == VINF_HM_DOUBLE_FAULT)
+            rcStrict1 = VINF_SUCCESS;
+        return rcStrict1;
+    }
+
+    RTGCPHYS GCPhys;
+    int rc = VMXReadVmcs64(VMX_VMCS64_RO_GUEST_PHYS_ADDR_FULL, &GCPhys);
+    rc    |= hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient);
+    rc    |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_MUST_MASK);
+    AssertRCReturn(rc, rc);
+
+    /* Intel spec. Table 27-7 "Exit Qualifications for EPT violations". */
+    AssertMsg(((pVmxTransient->uExitQual >> 7) & 3) != 2, ("%#RX64", pVmxTransient->uExitQual));
+
+    /* Translate the exit qualification bits into the equivalent #PF error code bits. */
+    RTGCUINT uErrorCode = 0;
+    if (pVmxTransient->uExitQual & VMX_EXIT_QUAL_EPT_INSTR_FETCH)
+        uErrorCode |= X86_TRAP_PF_ID;
+    if (pVmxTransient->uExitQual & VMX_EXIT_QUAL_EPT_DATA_WRITE)
+        uErrorCode |= X86_TRAP_PF_RW;
+    if (pVmxTransient->uExitQual & VMX_EXIT_QUAL_EPT_ENTRY_PRESENT)
+        uErrorCode |= X86_TRAP_PF_P;
+
+    /* Assert the trap in TRPM around the PGM call; it is reset again right after the handler returns. */
+    TRPMAssertXcptPF(pVCpu, GCPhys, uErrorCode);
+
+
+    /* Handle the pagefault trap for the nested shadow table. */
+    PVM pVM = pVCpu->CTX_SUFF(pVM);
+    PCPUMCTX pCtx = &pVCpu->cpum.GstCtx;
+
+    Log4Func(("EPT violation %#x at %#RX64 ErrorCode %#x CS:RIP=%04x:%08RX64\n", pVmxTransient->uExitQual, GCPhys, uErrorCode,
+              pCtx->cs.Sel, pCtx->rip));
+
+    VBOXSTRICTRC rcStrict2 = PGMR0Trap0eHandlerNestedPaging(pVM, pVCpu, PGMMODE_EPT, uErrorCode, CPUMCTX2CORE(pCtx), GCPhys);
+    TRPMResetTrap(pVCpu);
+
+    /* Same case as PGMR0Trap0eHandlerNPMisconfig(). See comment above, @bugref{6043}. */
+    if (   rcStrict2 == VINF_SUCCESS
+        || rcStrict2 == VERR_PAGE_TABLE_NOT_PRESENT
+        || rcStrict2 == VERR_PAGE_NOT_PRESENT)
+    {
+        /* Successfully synced our nested page tables. */
+        STAM_COUNTER_INC(&pVCpu->hm.s.StatExitReasonNpf);
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RSP | HM_CHANGED_GUEST_RFLAGS);
+        return VINF_SUCCESS;
+    }
+
+    Log4Func(("EPT return to ring-3 rcStrict2=%Rrc\n", VBOXSTRICTRC_VAL(rcStrict2)));
+    return rcStrict2;
+}
+
+/** @} */
+
+/** @name VM-exit exception handlers.
+ * @{
+ */
+/* -=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= */
+/* -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= VM-exit exception handlers =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- */
+/* -=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= */
+
+/**
+ * VM-exit exception handler for \#MF (Math Fault: floating point exception).
+ */
+static int hmR0VmxExitXcptMF(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_XCPT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestMF);
+
+    /* Only CR0 is imported: its NE bit selects between the two delivery paths below. */
+    int rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CR0);
+    AssertRCReturn(rc, rc);
+
+    if (!(pVCpu->cpum.GstCtx.cr0 & X86_CR0_NE))
+    {
+        /* Convert a #MF into a FERR -> IRQ 13. See @bugref{6117}. */
+        rc = PDMIsaSetIrq(pVCpu->CTX_SUFF(pVM), 13, 1, 0 /* uTagSrc */);
+
+        /** @todo r=ramshankar: The Intel spec. does -not- specify that this VM-exit
+         *        provides VM-exit instruction length. If this causes problem later,
+         *        disassemble the instruction like it's done on AMD-V. */
+        int rc2 = hmR0VmxAdvanceGuestRip(pVCpu, pVmxTransient);
+        AssertRCReturn(rc2, rc2);
+        /* The caller gets the status of raising the IRQ, not of the RIP advance. */
+        return rc;
+    }
+
+    /*
+     * CR0.NE is set: queue the exception for re-injection using the values in the transient structure.
+     * NOTE(review): cbInstr and uExitIntErrorCode are not read from the VMCS in this function;
+     * presumably the caller has them up to date or they are irrelevant for this event - confirm.
+     */
+    hmR0VmxSetPendingEvent(pVCpu, VMX_ENTRY_INT_INFO_FROM_EXIT_INT_INFO(pVmxTransient->uExitIntInfo), pVmxTransient->cbInstr,
+                           pVmxTransient->uExitIntErrorCode, 0 /* GCPtrFaultAddress */);
+    /* rc still holds the status of the CR0 import above. */
+    return rc;
+}
+
+
+/**
+ * VM-exit exception handler for \#BP (Breakpoint exception).
+ */
+static int hmR0VmxExitXcptBP(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_XCPT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestBP);
+
+    int rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL);
+    AssertRCReturn(rc, rc);
+
+    /* Let DBGF look at the breakpoint first; only VINF_EM_RAW_GUEST_TRAP is reflected to the guest. */
+    PCPUMCTX pCtx = &pVCpu->cpum.GstCtx;
+    rc = DBGFRZTrap03Handler(pVCpu->CTX_SUFF(pVM), pVCpu, CPUMCTX2CORE(pCtx));
+    if (rc == VINF_EM_RAW_GUEST_TRAP)
+    {
+        /* From here on rc carries the combined status of the VMCS reads, replacing the DBGF status. */
+        rc = hmR0VmxReadExitIntInfoVmcs(pVmxTransient);
+        rc |= hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+        rc |= hmR0VmxReadExitIntErrorCodeVmcs(pVmxTransient);
+        AssertRCReturn(rc, rc);
+
+        hmR0VmxSetPendingEvent(pVCpu, VMX_ENTRY_INT_INFO_FROM_EXIT_INT_INFO(pVmxTransient->uExitIntInfo), pVmxTransient->cbInstr,
+                               pVmxTransient->uExitIntErrorCode, 0 /* GCPtrFaultAddress */);
+    }
+
+    Assert(rc == VINF_SUCCESS || rc == VINF_EM_RAW_GUEST_TRAP || rc == VINF_EM_DBG_BREAKPOINT);
+    return rc;
+}
+
+
+/**
+ * VM-exit exception handler for \#AC (alignment check exception).
+ *
+ * Always queues the exception for re-injection and returns VINF_SUCCESS, unless
+ * reading the VMCS fields fails.
+ */
+static int hmR0VmxExitXcptAC(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_XCPT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+
+    /*
+     * Re-inject it. We'll detect any nesting before getting here.
+     */
+    int rc = hmR0VmxReadExitIntErrorCodeVmcs(pVmxTransient);
+    rc    |= hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+    AssertRCReturn(rc, rc);
+    /* The interruption info itself is expected to have been read already (asserted, not re-read). */
+    Assert(ASMAtomicUoReadU32(&pVmxTransient->fVmcsFieldsRead) & HMVMX_READ_EXIT_INTERRUPTION_INFO);
+
+    hmR0VmxSetPendingEvent(pVCpu, VMX_ENTRY_INT_INFO_FROM_EXIT_INT_INFO(pVmxTransient->uExitIntInfo), pVmxTransient->cbInstr,
+                           pVmxTransient->uExitIntErrorCode, 0 /* GCPtrFaultAddress */);
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * VM-exit exception handler for \#DB (Debug exception).
+ */
+static int hmR0VmxExitXcptDB(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_XCPT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+    STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestDB);
+
+    /*
+     * Get the DR6-like values from the VM-exit qualification and pass it to DBGF
+     * for processing.
+     */
+    int rc = hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient);
+    /* uExitQual is consumed right below, so a failed read must not go unnoticed (rc is overwritten later). */
+    AssertRCReturn(rc, rc);
+
+    /* Refer Intel spec. Table 27-1. "Exit Qualifications for debug exceptions" for the format. */
+    uint64_t uDR6 = X86_DR6_INIT_VAL;
+    uDR6         |= (pVmxTransient->uExitQual & (X86_DR6_B0 | X86_DR6_B1 | X86_DR6_B2 | X86_DR6_B3 | X86_DR6_BD | X86_DR6_BS));
+
+    PCPUMCTX pCtx = &pVCpu->cpum.GstCtx;
+    rc = DBGFRZTrap01Handler(pVCpu->CTX_SUFF(pVM), pVCpu, CPUMCTX2CORE(pCtx), uDR6, pVCpu->hm.s.fSingleInstruction);
+    Log6Func(("rc=%Rrc\n", rc));
+    if (rc == VINF_EM_RAW_GUEST_TRAP)
+    {
+        /*
+         * The exception was for the guest. Update DR6, DR7.GD and
+         * IA32_DEBUGCTL.LBR before forwarding it.
+         * (See Intel spec. 27.1 "Architectural State before a VM-Exit".)
+         */
+        /* DR6 is updated in the context and, if the guest debug state is loaded, on the CPU as well;
+           this is done with preemption and ring-3 calls disabled. */
+        VMMRZCallRing3Disable(pVCpu);
+        HM_DISABLE_PREEMPT(pVCpu);
+
+        pCtx->dr[6] &= ~X86_DR6_B_MASK;
+        pCtx->dr[6] |= uDR6;
+        if (CPUMIsGuestDebugStateActive(pVCpu))
+            ASMSetDR6(pCtx->dr[6]);
+
+        HM_RESTORE_PREEMPT();
+        VMMRZCallRing3Enable(pVCpu);
+
+        rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_DR7);
+        AssertRCReturn(rc, rc);
+
+        /* X86_DR7_GD will be cleared if DRx accesses should be trapped inside the guest. */
+        pCtx->dr[7] &= ~X86_DR7_GD;
+
+        /* Paranoia. */
+        pCtx->dr[7] &= ~X86_DR7_RAZ_MASK;
+        pCtx->dr[7] |= X86_DR7_RA1_MASK;
+
+        rc = VMXWriteVmcs32(VMX_VMCS_GUEST_DR7, (uint32_t)pCtx->dr[7]);
+        AssertRCReturn(rc, rc);
+
+        /*
+         * Raise #DB in the guest.
+         *
+         * It is important to reflect exactly what the VM-exit gave us (preserving the
+         * interruption-type) rather than use hmR0VmxSetPendingXcptDB() as the #DB could've
+         * been raised while executing ICEBP (INT1) and not the regular #DB. Thus it may
+         * trigger different handling in the CPU (like skipping DPL checks), see @bugref{6398}.
+         *
+         * Intel re-documented ICEBP/INT1 on May 2018 previously documented as part of
+         * Intel 386, see Intel spec. 24.8.3 "VM-Entry Controls for Event Injection".
+         */
+        rc = hmR0VmxReadExitIntInfoVmcs(pVmxTransient);
+        rc |= hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+        rc |= hmR0VmxReadExitIntErrorCodeVmcs(pVmxTransient);
+        AssertRCReturn(rc, rc);
+        hmR0VmxSetPendingEvent(pVCpu, VMX_ENTRY_INT_INFO_FROM_EXIT_INT_INFO(pVmxTransient->uExitIntInfo), pVmxTransient->cbInstr,
+                               pVmxTransient->uExitIntErrorCode, 0 /* GCPtrFaultAddress */);
+        return VINF_SUCCESS;
+    }
+
+    /*
+     * Not a guest trap, must be a hypervisor related debug event then.
+     * Update DR6 in case someone is interested in it.
+     */
+    AssertMsg(rc == VINF_EM_DBG_STEPPED || rc == VINF_EM_DBG_BREAKPOINT, ("%Rrc\n", rc));
+    AssertReturn(pVmxTransient->fWasHyperDebugStateActive, VERR_HM_IPE_5);
+    CPUMSetHyperDR6(pVCpu, uDR6);
+
+    /* The DBGF status is returned unchanged. */
+    return rc;
+}
+
+
+/**
+ * Hacks its way around the lovely mesa driver's backdoor accesses.
+ *
+ * Currently this only logs the access and skips the instruction; the return
+ * value is the status of hmR0VmxAdvanceGuestRip().
+ *
+ * @sa hmR0SvmHandleMesaDrvGp
+ */
+static int hmR0VmxHandleMesaDrvGp(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient, PCPUMCTX pCtx)
+{
+    Log(("hmR0VmxHandleMesaDrvGp: at %04x:%08RX64 rcx=%RX64 rbx=%RX64\n", pCtx->cs.Sel, pCtx->rip, pCtx->rcx, pCtx->rbx));
+    /* pCtx is only referenced by the log statement above. */
+    RT_NOREF(pCtx);
+
+    /* For now we'll just skip the instruction. */
+    return hmR0VmxAdvanceGuestRip(pVCpu, pVmxTransient);
+}
+
+
+/**
+ * Checks if the \#GP'ing instruction is the mesa driver doing its lovely
+ * backdoor logging w/o checking what it is running inside.
+ *
+ * This recognizes an "IN EAX,DX" instruction executed in flat ring-3, with the
+ * backdoor port and magic numbers loaded in registers.
+ *
+ * @returns true if it is, false if it isn't.
+ * @sa hmR0SvmIsMesaDrvGp
+ */
+DECLINLINE(bool) hmR0VmxIsMesaDrvGp(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient, PCPUMCTX pCtx)
+{
+    /* The instruction we are looking for is the single byte 0xed: IN eAX,dx */
+    uint8_t abOpcode[1];
+
+    /* Must be a one byte instruction that raised #GP(0). */
+    if (   pVmxTransient->cbInstr != sizeof(abOpcode)
+        || pVmxTransient->uExitIntErrorCode != 0)
+        return false;
+
+    /* The magic value must be in RAX and the backdoor port number in DX. */
+    Assert(!(pCtx->fExtrn & (CPUMCTX_EXTRN_RAX | CPUMCTX_EXTRN_RDX | CPUMCTX_EXTRN_RCX)));
+    /*Log(("hmR0VmxIsMesaDrvGp: rax=%RX64 rdx=%RX64\n", pCtx->rax, pCtx->rdx));*/
+    if (   pCtx->rax != UINT32_C(0x564d5868)
+        || pCtx->dx != UINT32_C(0x5658))
+        return false;
+
+    /* Code segment must be DPL 3 with a zero base (flat ring-3). */
+    AssertCompile(HMVMX_CPUMCTX_EXTRN_ALL & CPUMCTX_EXTRN_CS);
+    Assert(!(pCtx->fExtrn & CPUMCTX_EXTRN_CS));
+    /*Log(("hmR0VmxIsMesaDrvGp: cs.Attr.n.u2Dpl=%d base=%Rx64\n", pCtx->cs.Attr.n.u2Dpl, pCtx->cs.u64Base));*/
+    if (   pCtx->cs.Attr.n.u2Dpl != 3
+        || pCtx->cs.u64Base != 0)
+        return false;
+
+    /* Finally fetch the opcode byte at RIP and compare it. */
+    AssertCompile(HMVMX_CPUMCTX_EXTRN_ALL & CPUMCTX_EXTRN_RIP);
+    Assert(!(pCtx->fExtrn & CPUMCTX_EXTRN_RIP));
+    int rcRead = PGMPhysSimpleReadGCPtr(pVCpu, abOpcode, pCtx->rip, sizeof(abOpcode));
+    /*Log(("hmR0VmxIsMesaDrvGp: PGMPhysSimpleReadGCPtr -> %Rrc %#x\n", rc, abInstr[0]));*/
+    return !RT_FAILURE(rcRead) && abOpcode[0] == 0xed;
+}
+
+
+/**
+ * VM-exit exception handler for \#GP (General-protection exception).
+ *
+ * @remarks Requires pVmxTransient->uExitIntInfo to be up-to-date.
+ */ +static int hmR0VmxExitXcptGP(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_XCPT_HANDLER_PARAMS(pVCpu, pVmxTransient); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestGP); + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + if (pVCpu->hm.s.vmx.RealMode.fRealOnV86Active) + { /* likely */ } + else + { +#ifndef HMVMX_ALWAYS_TRAP_ALL_XCPTS + Assert(pVCpu->hm.s.fUsingDebugLoop || pVCpu->hm.s.fTrapXcptGpForLovelyMesaDrv); +#endif + /* If the guest is not in real-mode or we have unrestricted execution support, reflect #GP to the guest. */ + int rc = hmR0VmxReadExitIntInfoVmcs(pVmxTransient); + rc |= hmR0VmxReadExitIntErrorCodeVmcs(pVmxTransient); + rc |= hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL); + AssertRCReturn(rc, rc); + Log4Func(("Gst: CS:RIP %04x:%08RX64 ErrorCode=%#x CR0=%#RX64 CPL=%u TR=%#04x\n", pCtx->cs.Sel, pCtx->rip, + pVmxTransient->uExitIntErrorCode, pCtx->cr0, CPUMGetGuestCPL(pVCpu), pCtx->tr.Sel)); + + if ( !pVCpu->hm.s.fTrapXcptGpForLovelyMesaDrv + || !hmR0VmxIsMesaDrvGp(pVCpu, pVmxTransient, pCtx)) + hmR0VmxSetPendingEvent(pVCpu, VMX_ENTRY_INT_INFO_FROM_EXIT_INT_INFO(pVmxTransient->uExitIntInfo), + pVmxTransient->cbInstr, pVmxTransient->uExitIntErrorCode, 0 /* GCPtrFaultAddress */); + else + rc = hmR0VmxHandleMesaDrvGp(pVCpu, pVmxTransient, pCtx); + return rc; + } + + Assert(CPUMIsGuestInRealModeEx(pCtx)); + Assert(!pVCpu->CTX_SUFF(pVM)->hm.s.vmx.fUnrestrictedGuest); + + int rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL); + AssertRCReturn(rc, rc); + + VBOXSTRICTRC rcStrict = IEMExecOne(pVCpu); + if (rcStrict == VINF_SUCCESS) + { + if (!CPUMIsGuestInRealModeEx(pCtx)) + { + /* + * The guest is no longer in real-mode, check if we can continue executing the + * guest using hardware-assisted VMX. Otherwise, fall back to emulation. 
+ */ + if (HMCanExecuteVmxGuest(pVCpu, pCtx)) + { + Log4Func(("Mode changed but guest still suitable for executing using VT-x\n")); + pVCpu->hm.s.vmx.RealMode.fRealOnV86Active = false; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); + } + else + { + Log4Func(("Mode changed -> VINF_EM_RESCHEDULE\n")); + rcStrict = VINF_EM_RESCHEDULE; + } + } + else + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); + } + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + rcStrict = VINF_SUCCESS; + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + } + return VBOXSTRICTRC_VAL(rcStrict); +} + + +/** + * VM-exit exception handler wrapper for generic exceptions. Simply re-injects + * the exception reported in the VMX transient structure back into the VM. + * + * @remarks Requires uExitIntInfo in the VMX transient structure to be + * up-to-date. + */ +static int hmR0VmxExitXcptGeneric(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_XCPT_HANDLER_PARAMS(pVCpu, pVmxTransient); +#ifndef HMVMX_ALWAYS_TRAP_ALL_XCPTS + AssertMsg(pVCpu->hm.s.fUsingDebugLoop || pVCpu->hm.s.vmx.RealMode.fRealOnV86Active, + ("uVector=%#x u32XcptBitmap=%#X32\n", + VMX_EXIT_INT_INFO_VECTOR(pVmxTransient->uExitIntInfo), pVCpu->hm.s.vmx.u32XcptBitmap)); +#endif + + /* Re-inject the exception into the guest. This cannot be a double-fault condition which would have been handled in + hmR0VmxCheckExitDueToEventDelivery(). */ + int rc = hmR0VmxReadExitIntErrorCodeVmcs(pVmxTransient); + rc |= hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + AssertRCReturn(rc, rc); + Assert(ASMAtomicUoReadU32(&pVmxTransient->fVmcsFieldsRead) & HMVMX_READ_EXIT_INTERRUPTION_INFO); + +#ifdef DEBUG_ramshankar + rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CS | CPUMCTX_EXTRN_RIP); + uint8_t uVector = VMX_EXIT_INT_INFO_VECTOR(pVmxTransient->uExitIntInfo); + Log(("hmR0VmxExitXcptGeneric: Reinjecting Xcpt. 
uVector=%#x cs:rip=%#04x:%#RX64\n", uVector, pCtx->cs.Sel, pCtx->rip)); +#endif + + hmR0VmxSetPendingEvent(pVCpu, VMX_ENTRY_INT_INFO_FROM_EXIT_INT_INFO(pVmxTransient->uExitIntInfo), pVmxTransient->cbInstr, + pVmxTransient->uExitIntErrorCode, 0 /* GCPtrFaultAddress */); + return VINF_SUCCESS; +} + + +/** + * VM-exit exception handler for \#PF (Page-fault exception). + */ +static int hmR0VmxExitXcptPF(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_XCPT_HANDLER_PARAMS(pVCpu, pVmxTransient); + PVM pVM = pVCpu->CTX_SUFF(pVM); + int rc = hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient); + rc |= hmR0VmxReadExitIntInfoVmcs(pVmxTransient); + rc |= hmR0VmxReadExitIntErrorCodeVmcs(pVmxTransient); + AssertRCReturn(rc, rc); + + if (!pVM->hm.s.fNestedPaging) + { /* likely */ } + else + { +#if !defined(HMVMX_ALWAYS_TRAP_ALL_XCPTS) && !defined(HMVMX_ALWAYS_TRAP_PF) + Assert(pVCpu->hm.s.fUsingDebugLoop); +#endif + pVCpu->hm.s.Event.fPending = false; /* In case it's a contributory or vectoring #PF. */ + if (RT_LIKELY(!pVmxTransient->fVectoringDoublePF)) + { + hmR0VmxSetPendingEvent(pVCpu, VMX_ENTRY_INT_INFO_FROM_EXIT_INT_INFO(pVmxTransient->uExitIntInfo), 0 /* cbInstr */, + pVmxTransient->uExitIntErrorCode, pVmxTransient->uExitQual); + } + else + { + /* A guest page-fault occurred during delivery of a page-fault. Inject #DF. */ + hmR0VmxSetPendingXcptDF(pVCpu); + Log4Func(("Pending #DF due to vectoring #PF w/ NestedPaging\n")); + } + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestPF); + return rc; + } + + /* If it's a vectoring #PF, emulate injecting the original event injection as PGMTrap0eHandler() is incapable + of differentiating between instruction emulation and event injection that caused a #PF. See @bugref{6607}. 
*/ + if (pVmxTransient->fVectoringPF) + { + Assert(pVCpu->hm.s.Event.fPending); + return VINF_EM_RAW_INJECT_TRPM_EVENT; + } + + PCPUMCTX pCtx = &pVCpu->cpum.GstCtx; + rc = HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, HMVMX_CPUMCTX_EXTRN_ALL); + AssertRCReturn(rc, rc); + + Log4Func(("#PF: cr2=%#RX64 cs:rip=%#04x:%#RX64 uErrCode %#RX32 cr3=%#RX64\n", pVmxTransient->uExitQual, pCtx->cs.Sel, + pCtx->rip, pVmxTransient->uExitIntErrorCode, pCtx->cr3)); + + TRPMAssertXcptPF(pVCpu, pVmxTransient->uExitQual, (RTGCUINT)pVmxTransient->uExitIntErrorCode); + rc = PGMTrap0eHandler(pVCpu, pVmxTransient->uExitIntErrorCode, CPUMCTX2CORE(pCtx), (RTGCPTR)pVmxTransient->uExitQual); + + Log4Func(("#PF: rc=%Rrc\n", rc)); + if (rc == VINF_SUCCESS) + { + /* + * This is typically a shadow page table sync or a MMIO instruction. But we may have + * emulated something like LTR or a far jump. Any part of the CPU context may have changed. + */ + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); + TRPMResetTrap(pVCpu); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitShadowPF); + return rc; + } + + if (rc == VINF_EM_RAW_GUEST_TRAP) + { + if (!pVmxTransient->fVectoringDoublePF) + { + /* It's a guest page fault and needs to be reflected to the guest. */ + uint32_t uGstErrorCode = TRPMGetErrorCode(pVCpu); + TRPMResetTrap(pVCpu); + pVCpu->hm.s.Event.fPending = false; /* In case it's a contributory #PF. */ + hmR0VmxSetPendingEvent(pVCpu, VMX_ENTRY_INT_INFO_FROM_EXIT_INT_INFO(pVmxTransient->uExitIntInfo), 0 /* cbInstr */, + uGstErrorCode, pVmxTransient->uExitQual); + } + else + { + /* A guest page-fault occurred during delivery of a page-fault. Inject #DF. */ + TRPMResetTrap(pVCpu); + pVCpu->hm.s.Event.fPending = false; /* Clear pending #PF to replace it with #DF. 
*/ + hmR0VmxSetPendingXcptDF(pVCpu); + Log4Func(("#PF: Pending #DF due to vectoring #PF\n")); + } + + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitGuestPF); + return VINF_SUCCESS; + } + + TRPMResetTrap(pVCpu); + STAM_COUNTER_INC(&pVCpu->hm.s.StatExitShadowPFEM); + return rc; +} + +/** @} */ + +#ifdef VBOX_WITH_NESTED_HWVIRT_VMX +/** @name Nested-guest VM-exit handlers. + * @{ + */ +/* -=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= */ +/* -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= Nested-guest VM-exit handlers =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= */ +/* -=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= */ + +/** + * VM-exit handler for VMCLEAR (VMX_EXIT_VMCLEAR). Unconditional VM-exit. + */ +HMVMX_EXIT_DECL hmR0VmxExitVmclear(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); +#ifndef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM + int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_RSP | CPUMCTX_EXTRN_SREG_MASK + | IEM_CPUMCTX_EXTRN_EXEC_DECODED_MEM_MASK); + rc |= hmR0VmxReadExitInstrInfoVmcs(pVmxTransient); + rc |= hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient); + AssertRCReturn(rc, rc); + + HMVMX_CHECK_EXIT_DUE_TO_VMX_INSTR(pVCpu, pVmxTransient->uExitReason); + + VMXVEXITINFO ExitInfo; + RT_ZERO(ExitInfo); + ExitInfo.uReason = pVmxTransient->uExitReason; + ExitInfo.u64Qual = pVmxTransient->uExitQual; + ExitInfo.InstrInfo.u = pVmxTransient->ExitInstrInfo.u; + ExitInfo.cbInstr = pVmxTransient->cbInstr; + HMVMX_DECODE_MEM_OPERAND(pVCpu, ExitInfo.InstrInfo.u, ExitInfo.u64Qual, VMXMEMACCESS_READ, &ExitInfo.GCPtrEffAddr); + + VBOXSTRICTRC rcStrict = IEMExecDecodedVmclear(pVCpu, &ExitInfo); + if (RT_LIKELY(rcStrict == VINF_SUCCESS)) + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | 
HM_CHANGED_GUEST_HWVIRT); + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + rcStrict = VINF_SUCCESS; + } + return rcStrict; +#else + HMVMX_IEM_EXEC_VMX_INSTR_RET(pVCpu); +#endif +} + + +/** + * VM-exit handler for VMLAUNCH (VMX_EXIT_VMLAUNCH). Unconditional VM-exit. + */ +HMVMX_EXIT_DECL hmR0VmxExitVmlaunch(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); +#ifndef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM + int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_VMX_VMENTRY_MASK); + AssertRCReturn(rc, rc); + + HMVMX_CHECK_EXIT_DUE_TO_VMX_INSTR(pVCpu, pVmxTransient->uExitReason); + + VBOXSTRICTRC rcStrict = IEMExecDecodedVmlaunchVmresume(pVCpu, pVmxTransient->cbInstr, VMXINSTRID_VMLAUNCH); + if (RT_LIKELY(rcStrict == VINF_SUCCESS)) + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST); + Assert(rcStrict != VINF_IEM_RAISED_XCPT); + return rcStrict; +#else + HMVMX_IEM_EXEC_VMX_INSTR_RET(pVCpu); +#endif +} + + +/** + * VM-exit handler for VMPTRLD (VMX_EXIT_VMPTRLD). Unconditional VM-exit. 
+ */
+HMVMX_EXIT_DECL hmR0VmxExitVmptrld(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+#ifndef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM
+    /* Gather the VMCS fields and guest state the decoded-instruction IEM call needs; the statuses are
+       OR-ed together and checked once. */
+    int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+    rc    |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_RSP | CPUMCTX_EXTRN_SREG_MASK
+                                             | IEM_CPUMCTX_EXTRN_EXEC_DECODED_MEM_MASK);
+    rc    |= hmR0VmxReadExitInstrInfoVmcs(pVmxTransient);
+    rc    |= hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient);
+    AssertRCReturn(rc, rc);
+
+    HMVMX_CHECK_EXIT_DUE_TO_VMX_INSTR(pVCpu, pVmxTransient->uExitReason);
+
+    /* Package the exit information for IEM; the memory operand is decoded as a read access. */
+    VMXVEXITINFO ExitInfo;
+    RT_ZERO(ExitInfo);
+    ExitInfo.uReason = pVmxTransient->uExitReason;
+    ExitInfo.u64Qual = pVmxTransient->uExitQual;
+    ExitInfo.InstrInfo.u = pVmxTransient->ExitInstrInfo.u;
+    ExitInfo.cbInstr = pVmxTransient->cbInstr;
+    HMVMX_DECODE_MEM_OPERAND(pVCpu, ExitInfo.InstrInfo.u, ExitInfo.u64Qual, VMXMEMACCESS_READ, &ExitInfo.GCPtrEffAddr);
+
+    VBOXSTRICTRC rcStrict = IEMExecDecodedVmptrld(pVCpu, &ExitInfo);
+    if (RT_LIKELY(rcStrict == VINF_SUCCESS))
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | HM_CHANGED_GUEST_HWVIRT);
+    else if (rcStrict == VINF_IEM_RAISED_XCPT)
+    {
+        /* IEM raised an exception: flag the affected context bits and report success. */
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK);
+        rcStrict = VINF_SUCCESS;
+    }
+    /* Any other status is returned to the caller unchanged. */
+    return rcStrict;
+#else
+    /* NOTE(review): presumably executes the instruction through IEM and returns its status - confirm against the macro. */
+    HMVMX_IEM_EXEC_VMX_INSTR_RET(pVCpu);
+#endif
+}
+
+
+/**
+ * VM-exit handler for VMPTRST (VMX_EXIT_VMPTRST). Unconditional VM-exit.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitVmptrst(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+#ifndef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM
+    /* Gather the VMCS fields and guest state the decoded-instruction IEM call needs; the statuses are
+       OR-ed together and checked once. */
+    int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+    rc    |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_RSP | CPUMCTX_EXTRN_SREG_MASK
+                                             | IEM_CPUMCTX_EXTRN_EXEC_DECODED_MEM_MASK);
+    rc    |= hmR0VmxReadExitInstrInfoVmcs(pVmxTransient);
+    rc    |= hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient);
+    AssertRCReturn(rc, rc);
+
+    HMVMX_CHECK_EXIT_DUE_TO_VMX_INSTR(pVCpu, pVmxTransient->uExitReason);
+
+    /* Package the exit information for IEM; unlike VMPTRLD the memory operand is decoded as a write access. */
+    VMXVEXITINFO ExitInfo;
+    RT_ZERO(ExitInfo);
+    ExitInfo.uReason = pVmxTransient->uExitReason;
+    ExitInfo.u64Qual = pVmxTransient->uExitQual;
+    ExitInfo.InstrInfo.u = pVmxTransient->ExitInstrInfo.u;
+    ExitInfo.cbInstr = pVmxTransient->cbInstr;
+    HMVMX_DECODE_MEM_OPERAND(pVCpu, ExitInfo.InstrInfo.u, ExitInfo.u64Qual, VMXMEMACCESS_WRITE, &ExitInfo.GCPtrEffAddr);
+
+    VBOXSTRICTRC rcStrict = IEMExecDecodedVmptrst(pVCpu, &ExitInfo);
+    if (RT_LIKELY(rcStrict == VINF_SUCCESS))
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | HM_CHANGED_GUEST_HWVIRT);
+    else if (rcStrict == VINF_IEM_RAISED_XCPT)
+    {
+        /* IEM raised an exception: flag the affected context bits and report success. */
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK);
+        rcStrict = VINF_SUCCESS;
+    }
+    /* Any other status is returned to the caller unchanged. */
+    return rcStrict;
+#else
+    /* NOTE(review): presumably executes the instruction through IEM and returns its status - confirm against the macro. */
+    HMVMX_IEM_EXEC_VMX_INSTR_RET(pVCpu);
+#endif
+}
+
+
+/**
+ * VM-exit handler for VMREAD (VMX_EXIT_VMREAD). Unconditional VM-exit.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitVmread(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+#ifndef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM
+    /* Gather the VMCS fields and guest state the decoded-instruction IEM call needs; the statuses are
+       OR-ed together and checked once. */
+    int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+    rc    |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_RSP | CPUMCTX_EXTRN_SREG_MASK
+                                             | IEM_CPUMCTX_EXTRN_EXEC_DECODED_MEM_MASK);
+    rc    |= hmR0VmxReadExitInstrInfoVmcs(pVmxTransient);
+    rc    |= hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient);
+    AssertRCReturn(rc, rc);
+
+    HMVMX_CHECK_EXIT_DUE_TO_VMX_INSTR(pVCpu, pVmxTransient->uExitReason);
+
+    /* Package the exit information for IEM. */
+    VMXVEXITINFO ExitInfo;
+    RT_ZERO(ExitInfo);
+    ExitInfo.uReason = pVmxTransient->uExitReason;
+    ExitInfo.u64Qual = pVmxTransient->uExitQual;
+    ExitInfo.InstrInfo.u = pVmxTransient->ExitInstrInfo.u;
+    ExitInfo.cbInstr = pVmxTransient->cbInstr;
+    /* An effective address is only decoded (as a write access) when the operand is not a register;
+       otherwise GCPtrEffAddr stays zeroed. */
+    if (!ExitInfo.InstrInfo.VmreadVmwrite.fIsRegOperand)
+        HMVMX_DECODE_MEM_OPERAND(pVCpu, ExitInfo.InstrInfo.u, ExitInfo.u64Qual, VMXMEMACCESS_WRITE, &ExitInfo.GCPtrEffAddr);
+
+    VBOXSTRICTRC rcStrict = IEMExecDecodedVmread(pVCpu, &ExitInfo);
+    if (RT_LIKELY(rcStrict == VINF_SUCCESS))
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | HM_CHANGED_GUEST_HWVIRT);
+    else if (rcStrict == VINF_IEM_RAISED_XCPT)
+    {
+        /* IEM raised an exception: flag the affected context bits and report success. */
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK);
+        rcStrict = VINF_SUCCESS;
+    }
+    /* Any other status is returned to the caller unchanged. */
+    return rcStrict;
+#else
+    /* NOTE(review): presumably executes the instruction through IEM and returns its status - confirm against the macro. */
+    HMVMX_IEM_EXEC_VMX_INSTR_RET(pVCpu);
+#endif
+}
+
+
+/**
+ * VM-exit handler for VMRESUME (VMX_EXIT_VMRESUME). Unconditional VM-exit.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitVmresume(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+#ifndef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM
+    int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+    rc    |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, IEM_CPUMCTX_EXTRN_VMX_VMENTRY_MASK);
+    AssertRCReturn(rc, rc);
+
+    HMVMX_CHECK_EXIT_DUE_TO_VMX_INSTR(pVCpu, pVmxTransient->uExitReason);
+
+    /* Shares the IEM worker with VMLAUNCH, selected by the instruction id. On success the whole guest
+       context is flagged as changed; VINF_IEM_RAISED_XCPT is not expected here. */
+    VBOXSTRICTRC rcStrict = IEMExecDecodedVmlaunchVmresume(pVCpu, pVmxTransient->cbInstr, VMXINSTRID_VMRESUME);
+    if (RT_LIKELY(rcStrict == VINF_SUCCESS))
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_ALL_GUEST);
+    Assert(rcStrict != VINF_IEM_RAISED_XCPT);
+    return rcStrict;
+#else
+    HMVMX_IEM_EXEC_VMX_INSTR_RET(pVCpu);
+#endif
+}
+
+
+/**
+ * VM-exit handler for VMWRITE (VMX_EXIT_VMWRITE). Unconditional VM-exit.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitVmwrite(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+#ifndef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM
+    /* Gather the VMCS fields and guest state the decoded-instruction IEM call needs; the statuses are
+       OR-ed together and checked once. */
+    int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+    rc    |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_RSP | CPUMCTX_EXTRN_SREG_MASK
+                                             | IEM_CPUMCTX_EXTRN_EXEC_DECODED_MEM_MASK);
+    rc    |= hmR0VmxReadExitInstrInfoVmcs(pVmxTransient);
+    rc    |= hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient);
+    AssertRCReturn(rc, rc);
+
+    HMVMX_CHECK_EXIT_DUE_TO_VMX_INSTR(pVCpu, pVmxTransient->uExitReason);
+
+    /* Package the exit information for IEM. */
+    VMXVEXITINFO ExitInfo;
+    RT_ZERO(ExitInfo);
+    ExitInfo.uReason = pVmxTransient->uExitReason;
+    ExitInfo.u64Qual = pVmxTransient->uExitQual;
+    ExitInfo.InstrInfo.u = pVmxTransient->ExitInstrInfo.u;
+    ExitInfo.cbInstr = pVmxTransient->cbInstr;
+    /* An effective address is only decoded (as a read access) when the operand is not a register. */
+    if (!ExitInfo.InstrInfo.VmreadVmwrite.fIsRegOperand)
+        HMVMX_DECODE_MEM_OPERAND(pVCpu, ExitInfo.InstrInfo.u, ExitInfo.u64Qual, VMXMEMACCESS_READ, &ExitInfo.GCPtrEffAddr);
+
+    VBOXSTRICTRC rcStrict = IEMExecDecodedVmwrite(pVCpu, &ExitInfo);
+    if (RT_LIKELY(rcStrict == VINF_SUCCESS))
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | HM_CHANGED_GUEST_HWVIRT);
+    else if (rcStrict == VINF_IEM_RAISED_XCPT)
+    {
+        /* IEM raised an exception: flag the affected context bits and report success. */
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK);
+        rcStrict = VINF_SUCCESS;
+    }
+    return rcStrict;
+#else
+    HMVMX_IEM_EXEC_VMX_INSTR_RET(pVCpu);
+#endif
+}
+
+
+/**
+ * VM-exit handler for VMXOFF (VMX_EXIT_VMXOFF). Unconditional VM-exit.
+ */
+HMVMX_EXIT_DECL hmR0VmxExitVmxoff(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient)
+{
+    HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient);
+#ifndef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM
+    /* No memory operand here: only the instruction length, CR4 and the no-memory IEM state are fetched. */
+    int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient);
+    rc    |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_CR4 | IEM_CPUMCTX_EXTRN_EXEC_DECODED_NO_MEM_MASK);
+    AssertRCReturn(rc, rc);
+
+    HMVMX_CHECK_EXIT_DUE_TO_VMX_INSTR(pVCpu, pVmxTransient->uExitReason);
+
+    VBOXSTRICTRC rcStrict = IEMExecDecodedVmxoff(pVCpu, pVmxTransient->cbInstr);
+    if (RT_LIKELY(rcStrict == VINF_SUCCESS))
+    {
+        /* VMXOFF changes the internal hwvirt. state but not anything that's visible to the guest other than RIP. */
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_HWVIRT);
+    }
+    else if (rcStrict == VINF_IEM_RAISED_XCPT)
+    {
+        /* IEM raised an exception: flag the affected context bits and report success. */
+        ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK);
+        rcStrict = VINF_SUCCESS;
+    }
+    return rcStrict;
+#else
+    HMVMX_IEM_EXEC_VMX_INSTR_RET(pVCpu);
+#endif
+}
+
+
+/**
+ * VM-exit handler for VMXON (VMX_EXIT_VMXON). Unconditional VM-exit.
+ */ +HMVMX_EXIT_DECL hmR0VmxExitVmxon(PVMCPU pVCpu, PVMXTRANSIENT pVmxTransient) +{ + HMVMX_VALIDATE_EXIT_HANDLER_PARAMS(pVCpu, pVmxTransient); +#ifndef VBOX_WITH_NESTED_HWVIRT_ONLY_IN_IEM + int rc = hmR0VmxReadExitInstrLenVmcs(pVmxTransient); + rc |= HMVMX_CPUMCTX_IMPORT_STATE(pVCpu, CPUMCTX_EXTRN_RSP | CPUMCTX_EXTRN_SREG_MASK + | IEM_CPUMCTX_EXTRN_EXEC_DECODED_MEM_MASK); + rc |= hmR0VmxReadExitInstrInfoVmcs(pVmxTransient); + rc |= hmR0VmxReadExitQualVmcs(pVCpu, pVmxTransient); + AssertRCReturn(rc, rc); + + HMVMX_CHECK_EXIT_DUE_TO_VMX_INSTR(pVCpu, pVmxTransient->uExitReason); + + VMXVEXITINFO ExitInfo; + RT_ZERO(ExitInfo); + ExitInfo.uReason = pVmxTransient->uExitReason; + ExitInfo.u64Qual = pVmxTransient->uExitQual; + ExitInfo.InstrInfo.u = pVmxTransient->ExitInstrInfo.u; + ExitInfo.cbInstr = pVmxTransient->cbInstr; + HMVMX_DECODE_MEM_OPERAND(pVCpu, ExitInfo.InstrInfo.u, ExitInfo.u64Qual, VMXMEMACCESS_READ, &ExitInfo.GCPtrEffAddr); + + VBOXSTRICTRC rcStrict = IEMExecDecodedVmxon(pVCpu, &ExitInfo); + if (RT_LIKELY(rcStrict == VINF_SUCCESS)) + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_GUEST_RIP | HM_CHANGED_GUEST_RFLAGS | HM_CHANGED_GUEST_HWVIRT); + else if (rcStrict == VINF_IEM_RAISED_XCPT) + { + ASMAtomicUoOrU64(&pVCpu->hm.s.fCtxChanged, HM_CHANGED_RAISED_XCPT_MASK); + rcStrict = VINF_SUCCESS; + } + return rcStrict; +#else + HMVMX_IEM_EXEC_VMX_INSTR_RET(pVCpu); +#endif +} + +/** @} */ +#endif /* VBOX_WITH_NESTED_HWVIRT_VMX */ + diff --git a/src/VBox/VMM/VMMR0/HMVMXR0.h b/src/VBox/VMM/VMMR0/HMVMXR0.h new file mode 100644 index 00000000..1094cceb --- /dev/null +++ b/src/VBox/VMM/VMMR0/HMVMXR0.h @@ -0,0 +1,85 @@ +/* $Id: HMVMXR0.h $ */ +/** @file + * HM VMX (VT-x) - Internal header file. + */ + +/* + * Copyright (C) 2006-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. 
This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + +#ifndef VMM_INCLUDED_SRC_VMMR0_HMVMXR0_h +#define VMM_INCLUDED_SRC_VMMR0_HMVMXR0_h +#ifndef RT_WITHOUT_PRAGMA_ONCE +# pragma once +#endif + +RT_C_DECLS_BEGIN + +/** @defgroup grp_vmx_int Internal + * @ingroup grp_vmx + * @internal + * @{ + */ + +#ifdef IN_RING0 + +VMMR0DECL(int) VMXR0Enter(PVMCPU pVCpu); +VMMR0DECL(void) VMXR0ThreadCtxCallback(RTTHREADCTXEVENT enmEvent, PVMCPU pVCpu, bool fGlobalInit); +VMMR0DECL(int) VMXR0EnableCpu(PHMPHYSCPU pHostCpu, PVM pVM, void *pvPageCpu, RTHCPHYS pPageCpuPhys, + bool fEnabledBySystem, PCSUPHWVIRTMSRS pHwvirtMsrs); +VMMR0DECL(int) VMXR0DisableCpu(void *pvPageCpu, RTHCPHYS pPageCpuPhys); +VMMR0DECL(int) VMXR0GlobalInit(void); +VMMR0DECL(void) VMXR0GlobalTerm(void); +VMMR0DECL(int) VMXR0InitVM(PVM pVM); +VMMR0DECL(int) VMXR0TermVM(PVM pVM); +VMMR0DECL(int) VMXR0SetupVM(PVM pVM); +VMMR0DECL(int) VMXR0ExportHostState(PVMCPU pVCpu); +VMMR0DECL(int) VMXR0InvalidatePage(PVMCPU pVCpu, RTGCPTR GCVirt); +VMMR0DECL(int) VMXR0ImportStateOnDemand(PVMCPU pVCpu, uint64_t fWhat); +VMMR0DECL(VBOXSTRICTRC) VMXR0RunGuestCode(PVMCPU pVCpu); +DECLASM(int) VMXR0StartVM32(RTHCUINT fResume, PCPUMCTX pCtx, PVMCSCACHE pCache, PVM pVM, PVMCPU pVCpu); +DECLASM(int) VMXR0StartVM64(RTHCUINT fResume, PCPUMCTX pCtx, PVMCSCACHE pCache, PVM pVM, PVMCPU pVCpu); + +# if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) +DECLASM(int) VMXR0SwitcherStartVM64(RTHCUINT fResume, PCPUMCTX pCtx, PVMCSCACHE pCache, PVM pVM, PVMCPU pVCpu); +VMMR0DECL(int) VMXR0Execute64BitsHandler(PVMCPU pVCpu, HM64ON32OP enmOp, uint32_t cbParam, uint32_t *paParam); +# endif + +/* Cached VMCS accesses 
-- defined only for 32-bit hosts (with 64-bit guest support). */ +# ifdef VMX_USE_CACHED_VMCS_ACCESSES +VMMR0DECL(int) VMXWriteCachedVmcsEx(PVMCPU pVCpu, uint32_t idxField, uint64_t u64Val); + +DECLINLINE(int) VMXReadCachedVmcsEx(PVMCPU pVCpu, uint32_t idxCache, RTGCUINTREG *pVal) +{ + Assert(idxCache <= VMX_VMCS_MAX_NESTED_PAGING_CACHE_IDX); + *pVal = pVCpu->hm.s.vmx.VMCSCache.Read.aFieldVal[idxCache]; + return VINF_SUCCESS; +} +# endif + +# if HC_ARCH_BITS == 32 +# define VMXReadVmcsHstN VMXReadVmcs32 +# define VMXReadVmcsGstN(idxField, pVal) VMXReadCachedVmcsEx(pVCpu, idxField##_CACHE_IDX, pVal) +# define VMXReadVmcsGstNByIdxVal(idxField, pVal) VMXReadCachedVmcsEx(pVCpu, idxField, pVal) +# else /* HC_ARCH_BITS == 64 */ +# define VMXReadVmcsHstN VMXReadVmcs64 +# define VMXReadVmcsGstN VMXReadVmcs64 +# define VMXReadVmcsGstNByIdxVal VMXReadVmcs64 +# endif + +#endif /* IN_RING0 */ + +/** @} */ + +RT_C_DECLS_END + +#endif /* !VMM_INCLUDED_SRC_VMMR0_HMVMXR0_h */ + diff --git a/src/VBox/VMM/VMMR0/Makefile.kup b/src/VBox/VMM/VMMR0/Makefile.kup new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/VBox/VMM/VMMR0/Makefile.kup diff --git a/src/VBox/VMM/VMMR0/NEMR0Native-win.cpp b/src/VBox/VMM/VMMR0/NEMR0Native-win.cpp new file mode 100644 index 00000000..796428a8 --- /dev/null +++ b/src/VBox/VMM/VMMR0/NEMR0Native-win.cpp @@ -0,0 +1,2628 @@ +/* $Id: NEMR0Native-win.cpp $ */ +/** @file + * NEM - Native execution manager, native ring-0 Windows backend. + */ + +/* + * Copyright (C) 2018-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. 
VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_NEM +#define VMCPU_INCL_CPUM_GST_CTX +#include <iprt/nt/nt.h> +#include <iprt/nt/hyperv.h> +#include <iprt/nt/vid.h> +#include <winerror.h> + +#include <VBox/vmm/nem.h> +#include <VBox/vmm/iem.h> +#include <VBox/vmm/em.h> +#include <VBox/vmm/apic.h> +#include <VBox/vmm/pdm.h> +#include <VBox/vmm/dbgftrace.h> +#include "NEMInternal.h" +#include <VBox/vmm/gvm.h> +#include <VBox/vmm/vm.h> +#include <VBox/vmm/gvmm.h> +#include <VBox/param.h> + +#include <iprt/dbg.h> +#include <iprt/memobj.h> +#include <iprt/string.h> +#include <iprt/time.h> + + +/* Assert compile context sanity. */ +#ifndef RT_OS_WINDOWS +# error "Windows only file!" +#endif +#ifndef RT_ARCH_AMD64 +# error "AMD64 only file!" 
+#endif + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +typedef uint32_t DWORD; /* for winerror.h constants */ + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +static uint64_t (*g_pfnHvlInvokeHypercall)(uint64_t uCallInfo, uint64_t HCPhysInput, uint64_t HCPhysOutput); + +/** + * WinHvr.sys!WinHvDepositMemory + * + * This API will try allocates cPages on IdealNode and deposit it to the + * hypervisor for use with the given partition. The memory will be freed when + * VID.SYS calls WinHvWithdrawAllMemory when the partition is cleanedup. + * + * Apparently node numbers above 64 has a different meaning. 
+ */ +static NTSTATUS (*g_pfnWinHvDepositMemory)(uintptr_t idPartition, size_t cPages, uintptr_t IdealNode, size_t *pcActuallyAdded); + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +NEM_TMPL_STATIC int nemR0WinMapPages(PGVM pGVM, PVM pVM, PGVMCPU pGVCpu, RTGCPHYS GCPhysSrc, RTGCPHYS GCPhysDst, + uint32_t cPages, uint32_t fFlags); +NEM_TMPL_STATIC int nemR0WinUnmapPages(PGVM pGVM, PGVMCPU pGVCpu, RTGCPHYS GCPhys, uint32_t cPages); +#if defined(NEM_WIN_WITH_RING0_RUNLOOP) || defined(NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS) +NEM_TMPL_STATIC int nemR0WinExportState(PGVM pGVM, PGVMCPU pGVCpu, PCPUMCTX pCtx); +NEM_TMPL_STATIC int nemR0WinImportState(PGVM pGVM, PGVMCPU pGVCpu, PCPUMCTX pCtx, uint64_t fWhat, bool fCanUpdateCr3); +NEM_TMPL_STATIC int nemR0WinQueryCpuTick(PGVM pGVM, PGVMCPU pGVCpu, uint64_t *pcTicks, uint32_t *pcAux); +NEM_TMPL_STATIC int nemR0WinResumeCpuTickOnAll(PGVM pGVM, PGVMCPU pGVCpu, uint64_t uPausedTscValue); +#endif +DECLINLINE(NTSTATUS) nemR0NtPerformIoControl(PGVM pGVM, uint32_t uFunction, void *pvInput, uint32_t cbInput, + void *pvOutput, uint32_t cbOutput); + + +/* + * Instantate the code we share with ring-0. + */ +#ifdef NEM_WIN_WITH_RING0_RUNLOOP +# define NEM_WIN_TEMPLATE_MODE_OWN_RUN_API +#else +# undef NEM_WIN_TEMPLATE_MODE_OWN_RUN_API +#endif +#include "../VMMAll/NEMAllNativeTemplate-win.cpp.h" + + + +/** + * Worker for NEMR0InitVM that allocates a hypercall page. + * + * @returns VBox status code. + * @param pHypercallData The hypercall data page to initialize. 
+ */ +static int nemR0InitHypercallData(PNEMR0HYPERCALLDATA pHypercallData) +{ + int rc = RTR0MemObjAllocPage(&pHypercallData->hMemObj, PAGE_SIZE, false /*fExecutable*/); + if (RT_SUCCESS(rc)) + { + pHypercallData->HCPhysPage = RTR0MemObjGetPagePhysAddr(pHypercallData->hMemObj, 0 /*iPage*/); + AssertStmt(pHypercallData->HCPhysPage != NIL_RTHCPHYS, rc = VERR_INTERNAL_ERROR_3); + pHypercallData->pbPage = (uint8_t *)RTR0MemObjAddress(pHypercallData->hMemObj); + AssertStmt(pHypercallData->pbPage, rc = VERR_INTERNAL_ERROR_3); + if (RT_SUCCESS(rc)) + return VINF_SUCCESS; + + /* bail out */ + RTR0MemObjFree(pHypercallData->hMemObj, true /*fFreeMappings*/); + } + pHypercallData->hMemObj = NIL_RTR0MEMOBJ; + pHypercallData->HCPhysPage = NIL_RTHCPHYS; + pHypercallData->pbPage = NULL; + return rc; +} + +/** + * Worker for NEMR0CleanupVM and NEMR0InitVM that cleans up a hypercall page. + * + * @param pHypercallData The hypercall data page to uninitialize. + */ +static void nemR0DeleteHypercallData(PNEMR0HYPERCALLDATA pHypercallData) +{ + /* Check pbPage here since it's NULL, whereas the hMemObj can be either + NIL_RTR0MEMOBJ or 0 (they aren't necessarily the same). */ + if (pHypercallData->pbPage != NULL) + { + RTR0MemObjFree(pHypercallData->hMemObj, true /*fFreeMappings*/); + pHypercallData->pbPage = NULL; + } + pHypercallData->hMemObj = NIL_RTR0MEMOBJ; + pHypercallData->HCPhysPage = NIL_RTHCPHYS; +} + + +/** + * Called by NEMR3Init to make sure we've got what we need. + * + * @returns VBox status code. + * @param pGVM The ring-0 VM handle. + * @param pVM The cross context VM handle. + * @thread EMT(0) + */ +VMMR0_INT_DECL(int) NEMR0InitVM(PGVM pGVM, PVM pVM) +{ + AssertCompile(sizeof(pGVM->nem.s) <= sizeof(pGVM->nem.padding)); + AssertCompile(sizeof(pGVM->aCpus[0].nem.s) <= sizeof(pGVM->aCpus[0].nem.padding)); + + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, 0); + AssertRCReturn(rc, rc); + + /* + * We want to perform hypercalls here. 
The NT kernel started to expose a very low + * level interface to do this thru somewhere between build 14271 and 16299. Since + * we need build 17134 to get anywhere at all, the exact build is not relevant here. + * + * We also need to deposit memory to the hypervisor for use with partition (page + * mapping structures, stuff). + */ + RTDBGKRNLINFO hKrnlInfo; + rc = RTR0DbgKrnlInfoOpen(&hKrnlInfo, 0); + if (RT_SUCCESS(rc)) + { + rc = RTR0DbgKrnlInfoQuerySymbol(hKrnlInfo, NULL, "HvlInvokeHypercall", (void **)&g_pfnHvlInvokeHypercall); + if (RT_SUCCESS(rc)) + rc = RTR0DbgKrnlInfoQuerySymbol(hKrnlInfo, "winhvr.sys", "WinHvDepositMemory", (void **)&g_pfnWinHvDepositMemory); + RTR0DbgKrnlInfoRelease(hKrnlInfo); + if (RT_SUCCESS(rc)) + { + /* + * Allocate a page for non-EMT threads to use for hypercalls (update + * statistics and such) and a critical section protecting it. + */ + rc = RTCritSectInit(&pGVM->nem.s.HypercallDataCritSect); + if (RT_SUCCESS(rc)) + { + rc = nemR0InitHypercallData(&pGVM->nem.s.HypercallData); + if (RT_SUCCESS(rc)) + { + /* + * Allocate a page for each VCPU to place hypercall data on. + */ + for (VMCPUID i = 0; i < pGVM->cCpus; i++) + { + rc = nemR0InitHypercallData(&pGVM->aCpus[i].nem.s.HypercallData); + if (RT_FAILURE(rc)) + { + while (i-- > 0) + nemR0DeleteHypercallData(&pGVM->aCpus[i].nem.s.HypercallData); + break; + } + } + if (RT_SUCCESS(rc)) + { + /* + * So far, so good. + */ + return rc; + } + + /* + * Bail out. + */ + nemR0DeleteHypercallData(&pGVM->nem.s.HypercallData); + } + RTCritSectDelete(&pGVM->nem.s.HypercallDataCritSect); + } + } + else + rc = VERR_NEM_MISSING_KERNEL_API; + } + + RT_NOREF(pVM); + return rc; +} + + +/** + * Perform an I/O control operation on the partition handle (VID.SYS). + * + * @returns NT status code. + * @param pGVM The ring-0 VM structure. + * @param uFunction The function to perform. + * @param pvInput The input buffer. 
This must point within the VM + * structure so we can easily convert to a ring-3 + * pointer if necessary. + * @param cbInput The size of the input. @a pvInput must be NULL when + * zero. + * @param pvOutput The output buffer. This must also point within the + * VM structure for ring-3 pointer magic. + * @param cbOutput The size of the output. @a pvOutput must be NULL + * when zero. + */ +DECLINLINE(NTSTATUS) nemR0NtPerformIoControl(PGVM pGVM, uint32_t uFunction, void *pvInput, uint32_t cbInput, + void *pvOutput, uint32_t cbOutput) +{ +#ifdef RT_STRICT + /* + * Input and output parameters are part of the VM CPU structure. + */ + PVM pVM = pGVM->pVM; + size_t const cbVM = RT_UOFFSETOF_DYN(VM, aCpus[pGVM->cCpus]); + if (pvInput) + AssertReturn(((uintptr_t)pvInput + cbInput) - (uintptr_t)pVM <= cbVM, VERR_INVALID_PARAMETER); + if (pvOutput) + AssertReturn(((uintptr_t)pvOutput + cbOutput) - (uintptr_t)pVM <= cbVM, VERR_INVALID_PARAMETER); +#endif + + int32_t rcNt = STATUS_UNSUCCESSFUL; + int rc = SUPR0IoCtlPerform(pGVM->nem.s.pIoCtlCtx, uFunction, + pvInput, + pvInput ? (uintptr_t)pvInput + pGVM->nem.s.offRing3ConversionDelta : NIL_RTR3PTR, + cbInput, + pvOutput, + pvOutput ? (uintptr_t)pvOutput + pGVM->nem.s.offRing3ConversionDelta : NIL_RTR3PTR, + cbOutput, + &rcNt); + if (RT_SUCCESS(rc) || !NT_SUCCESS((NTSTATUS)rcNt)) + return (NTSTATUS)rcNt; + return STATUS_UNSUCCESSFUL; +} + + +/** + * 2nd part of the initialization, after we've got a partition handle. + * + * @returns VBox status code. + * @param pGVM The ring-0 VM handle. + * @param pVM The cross context VM handle. + * @thread EMT(0) + */ +VMMR0_INT_DECL(int) NEMR0InitVMPart2(PGVM pGVM, PVM pVM) +{ + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, 0); + AssertRCReturn(rc, rc); + SUPR0Printf("NEMR0InitVMPart2\n"); LogRel(("2: NEMR0InitVMPart2\n")); + Assert(pGVM->nem.s.fMayUseRing0Runloop == false); + + /* + * Copy and validate the I/O control information from ring-3. 
+ */ + NEMWINIOCTL Copy = pVM->nem.s.IoCtlGetHvPartitionId; + AssertLogRelReturn(Copy.uFunction != 0, VERR_NEM_INIT_FAILED); + AssertLogRelReturn(Copy.cbInput == 0, VERR_NEM_INIT_FAILED); + AssertLogRelReturn(Copy.cbOutput == sizeof(HV_PARTITION_ID), VERR_NEM_INIT_FAILED); + pGVM->nem.s.IoCtlGetHvPartitionId = Copy; + + pGVM->nem.s.fMayUseRing0Runloop = pVM->nem.s.fUseRing0Runloop; + + Copy = pVM->nem.s.IoCtlStartVirtualProcessor; + AssertLogRelStmt(Copy.uFunction != 0, rc = VERR_NEM_INIT_FAILED); + AssertLogRelStmt(Copy.cbInput == sizeof(HV_VP_INDEX), rc = VERR_NEM_INIT_FAILED); + AssertLogRelStmt(Copy.cbOutput == 0, rc = VERR_NEM_INIT_FAILED); + AssertLogRelStmt(Copy.uFunction != pGVM->nem.s.IoCtlGetHvPartitionId.uFunction, rc = VERR_NEM_INIT_FAILED); + if (RT_SUCCESS(rc)) + pGVM->nem.s.IoCtlStartVirtualProcessor = Copy; + + Copy = pVM->nem.s.IoCtlStopVirtualProcessor; + AssertLogRelStmt(Copy.uFunction != 0, rc = VERR_NEM_INIT_FAILED); + AssertLogRelStmt(Copy.cbInput == sizeof(HV_VP_INDEX), rc = VERR_NEM_INIT_FAILED); + AssertLogRelStmt(Copy.cbOutput == 0, rc = VERR_NEM_INIT_FAILED); + AssertLogRelStmt(Copy.uFunction != pGVM->nem.s.IoCtlGetHvPartitionId.uFunction, rc = VERR_NEM_INIT_FAILED); + AssertLogRelStmt(Copy.uFunction != pGVM->nem.s.IoCtlStartVirtualProcessor.uFunction, rc = VERR_NEM_INIT_FAILED); + if (RT_SUCCESS(rc)) + pGVM->nem.s.IoCtlStopVirtualProcessor = Copy; + + Copy = pVM->nem.s.IoCtlMessageSlotHandleAndGetNext; + AssertLogRelStmt(Copy.uFunction != 0, rc = VERR_NEM_INIT_FAILED); + AssertLogRelStmt( Copy.cbInput == sizeof(VID_IOCTL_INPUT_MESSAGE_SLOT_HANDLE_AND_GET_NEXT) + || Copy.cbInput == RT_OFFSETOF(VID_IOCTL_INPUT_MESSAGE_SLOT_HANDLE_AND_GET_NEXT, cMillies), + rc = VERR_NEM_INIT_FAILED); + AssertLogRelStmt(Copy.cbOutput == 0, rc = VERR_NEM_INIT_FAILED); + AssertLogRelStmt(Copy.uFunction != pGVM->nem.s.IoCtlGetHvPartitionId.uFunction, rc = VERR_NEM_INIT_FAILED); + AssertLogRelStmt(Copy.uFunction != pGVM->nem.s.IoCtlStartVirtualProcessor.uFunction, 
rc = VERR_NEM_INIT_FAILED); + AssertLogRelStmt(Copy.uFunction != pGVM->nem.s.IoCtlStopVirtualProcessor.uFunction, rc = VERR_NEM_INIT_FAILED); + if (RT_SUCCESS(rc)) + pGVM->nem.s.IoCtlMessageSlotHandleAndGetNext = Copy; + + if ( RT_SUCCESS(rc) + || !pVM->nem.s.fUseRing0Runloop) + { + /* + * Setup of an I/O control context for the partition handle for later use. + */ + rc = SUPR0IoCtlSetupForHandle(pGVM->pSession, pVM->nem.s.hPartitionDevice, 0, &pGVM->nem.s.pIoCtlCtx); + AssertLogRelRCReturn(rc, rc); + pGVM->nem.s.offRing3ConversionDelta = (uintptr_t)pVM->pVMR3 - (uintptr_t)pGVM->pVM; + + /* + * Get the partition ID. + */ + PVMCPU pVCpu = &pGVM->pVM->aCpus[0]; + NTSTATUS rcNt = nemR0NtPerformIoControl(pGVM, pGVM->nem.s.IoCtlGetHvPartitionId.uFunction, NULL, 0, + &pVCpu->nem.s.uIoCtlBuf.idPartition, sizeof(pVCpu->nem.s.uIoCtlBuf.idPartition)); + AssertLogRelMsgReturn(NT_SUCCESS(rcNt), ("IoCtlGetHvPartitionId failed: %#x\n", rcNt), VERR_NEM_INIT_FAILED); + pGVM->nem.s.idHvPartition = pVCpu->nem.s.uIoCtlBuf.idPartition; + AssertLogRelMsgReturn(pGVM->nem.s.idHvPartition == pVM->nem.s.idHvPartition, + ("idHvPartition mismatch: r0=%#RX64, r3=%#RX64\n", pGVM->nem.s.idHvPartition, pVM->nem.s.idHvPartition), + VERR_NEM_INIT_FAILED); + } + + return rc; +} + + +/** + * Cleanup the NEM parts of the VM in ring-0. + * + * This is always called and must deal the state regardless of whether + * NEMR0InitVM() was called or not. So, take care here. + * + * @param pGVM The ring-0 VM handle. + */ +VMMR0_INT_DECL(void) NEMR0CleanupVM(PGVM pGVM) +{ + pGVM->nem.s.idHvPartition = HV_PARTITION_ID_INVALID; + + /* Clean up I/O control context. */ + if (pGVM->nem.s.pIoCtlCtx) + { + int rc = SUPR0IoCtlCleanup(pGVM->nem.s.pIoCtlCtx); + AssertRC(rc); + pGVM->nem.s.pIoCtlCtx = NULL; + } + + /* Free the hypercall pages. */ + VMCPUID i = pGVM->cCpus; + while (i-- > 0) + nemR0DeleteHypercallData(&pGVM->aCpus[i].nem.s.HypercallData); + + /* The non-EMT one too. 
*/ + if (RTCritSectIsInitialized(&pGVM->nem.s.HypercallDataCritSect)) + RTCritSectDelete(&pGVM->nem.s.HypercallDataCritSect); + nemR0DeleteHypercallData(&pGVM->nem.s.HypercallData); +} + + +#if 0 /* for debugging GPA unmapping. */ +static int nemR3WinDummyReadGpa(PGVM pGVM, PGVMCPU pGVCpu, RTGCPHYS GCPhys) +{ + PHV_INPUT_READ_GPA pIn = (PHV_INPUT_READ_GPA)pGVCpu->nem.s.pbHypercallData; + PHV_OUTPUT_READ_GPA pOut = (PHV_OUTPUT_READ_GPA)(pIn + 1); + pIn->PartitionId = pGVM->nem.s.idHvPartition; + pIn->VpIndex = pGVCpu->idCpu; + pIn->ByteCount = 0x10; + pIn->BaseGpa = GCPhys; + pIn->ControlFlags.AsUINT64 = 0; + pIn->ControlFlags.CacheType = HvCacheTypeX64WriteCombining; + memset(pOut, 0xfe, sizeof(*pOut)); + uint64_t volatile uResult = g_pfnHvlInvokeHypercall(HvCallReadGpa, pGVCpu->nem.s.HCPhysHypercallData, + pGVCpu->nem.s.HCPhysHypercallData + sizeof(*pIn)); + LogRel(("nemR3WinDummyReadGpa: %RGp -> %#RX64; code=%u rsvd=%u abData=%.16Rhxs\n", + GCPhys, uResult, pOut->AccessResult.ResultCode, pOut->AccessResult.Reserved, pOut->Data)); + __debugbreak(); + + return uResult != 0 ? VERR_READ_ERROR : VINF_SUCCESS; +} +#endif + + +/** + * Worker for NEMR0MapPages and others. + */ +NEM_TMPL_STATIC int nemR0WinMapPages(PGVM pGVM, PVM pVM, PGVMCPU pGVCpu, RTGCPHYS GCPhysSrc, RTGCPHYS GCPhysDst, + uint32_t cPages, uint32_t fFlags) +{ + /* + * Validate. 
+ */ + AssertReturn(g_pfnHvlInvokeHypercall, VERR_NEM_MISSING_KERNEL_API); + + AssertReturn(cPages > 0, VERR_OUT_OF_RANGE); + AssertReturn(cPages <= NEM_MAX_MAP_PAGES, VERR_OUT_OF_RANGE); + AssertReturn(!(fFlags & ~(HV_MAP_GPA_MAYBE_ACCESS_MASK & ~HV_MAP_GPA_DUNNO_ACCESS)), VERR_INVALID_FLAGS); + AssertMsgReturn(!(GCPhysDst & X86_PAGE_OFFSET_MASK), ("GCPhysDst=%RGp\n", GCPhysDst), VERR_OUT_OF_RANGE); + AssertReturn(GCPhysDst < _1E, VERR_OUT_OF_RANGE); + if (GCPhysSrc != GCPhysDst) + { + AssertMsgReturn(!(GCPhysSrc & X86_PAGE_OFFSET_MASK), ("GCPhysSrc=%RGp\n", GCPhysSrc), VERR_OUT_OF_RANGE); + AssertReturn(GCPhysSrc < _1E, VERR_OUT_OF_RANGE); + } + + /* + * Compose and make the hypercall. + * Ring-3 is not allowed to fill in the host physical addresses of the call. + */ + for (uint32_t iTries = 0;; iTries++) + { + /* Walk the source range with a per-attempt cursor so that a retry (after + * depositing more memory) starts at the first source page again. */ + RTGCPHYS GCPhysSrcCur = GCPhysSrc; + HV_INPUT_MAP_GPA_PAGES *pMapPages = (HV_INPUT_MAP_GPA_PAGES *)pGVCpu->nem.s.HypercallData.pbPage; + AssertPtrReturn(pMapPages, VERR_INTERNAL_ERROR_3); + pMapPages->TargetPartitionId = pGVM->nem.s.idHvPartition; + pMapPages->TargetGpaBase = GCPhysDst >> X86_PAGE_SHIFT; + pMapPages->MapFlags = fFlags; + pMapPages->u32ExplicitPadding = 0; + for (uint32_t iPage = 0; iPage < cPages; iPage++, GCPhysSrcCur += X86_PAGE_SIZE) + { + RTHCPHYS HCPhys = NIL_RTHCPHYS; + int rc = PGMPhysGCPhys2HCPhys(pVM, GCPhysSrcCur, &HCPhys); + AssertRCReturn(rc, rc); + pMapPages->PageList[iPage] = HCPhys >> X86_PAGE_SHIFT; + } + + uint64_t uResult = g_pfnHvlInvokeHypercall(HvCallMapGpaPages | ((uint64_t)cPages << 32), + pGVCpu->nem.s.HypercallData.HCPhysPage, 0); + Log6(("NEMR0MapPages: %RGp/%RGp L %u prot %#x -> %#RX64\n", + GCPhysDst, GCPhysSrc, cPages, fFlags, uResult)); + if (uResult == ((uint64_t)cPages << 32)) + return VINF_SUCCESS; + + /* + * If the partition is out of memory, try donate another 512 pages to + * it (2MB). VID.SYS does multiples of 512 pages, nothing smaller. 
+ */ + if ( uResult != HV_STATUS_INSUFFICIENT_MEMORY + || iTries > 16 + || g_pfnWinHvDepositMemory == NULL) + { + LogRel(("g_pfnHvlInvokeHypercall/MapGpaPages -> %#RX64\n", uResult)); + return VERR_NEM_MAP_PAGES_FAILED; + } + + size_t cPagesAdded = 0; + NTSTATUS rcNt = g_pfnWinHvDepositMemory(pGVM->nem.s.idHvPartition, 512, 0, &cPagesAdded); + if (!cPagesAdded) + { + LogRel(("g_pfnWinHvDepositMemory -> %#x / %#RX64\n", rcNt, uResult)); + return VERR_NEM_MAP_PAGES_FAILED; + } + } +} + + +/** + * Maps pages into the guest physical address space. + * + * Generally the caller will be under the PGM lock already, so no extra effort + * is needed to make sure all changes happens under it. + * + * @returns VBox status code. + * @param pGVM The ring-0 VM handle. + * @param pVM The cross context VM handle. + * @param idCpu The calling EMT. Necessary for getting the + * hypercall page and arguments. + * @thread EMT(idCpu) + */ +VMMR0_INT_DECL(int) NEMR0MapPages(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ + /* + * Unpack the call. + */ + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_SUCCESS(rc)) + { + PVMCPU pVCpu = &pVM->aCpus[idCpu]; + PGVMCPU pGVCpu = &pGVM->aCpus[idCpu]; + + RTGCPHYS const GCPhysSrc = pVCpu->nem.s.Hypercall.MapPages.GCPhysSrc; + RTGCPHYS const GCPhysDst = pVCpu->nem.s.Hypercall.MapPages.GCPhysDst; + uint32_t const cPages = pVCpu->nem.s.Hypercall.MapPages.cPages; + HV_MAP_GPA_FLAGS const fFlags = pVCpu->nem.s.Hypercall.MapPages.fFlags; + + /* + * Do the work. + */ + rc = nemR0WinMapPages(pGVM, pVM, pGVCpu, GCPhysSrc, GCPhysDst, cPages, fFlags); + } + return rc; +} + + +/** + * Worker for NEMR0UnmapPages and others. + */ +NEM_TMPL_STATIC int nemR0WinUnmapPages(PGVM pGVM, PGVMCPU pGVCpu, RTGCPHYS GCPhys, uint32_t cPages) +{ + /* + * Validate input. 
+ */ + AssertReturn(g_pfnHvlInvokeHypercall, VERR_NEM_MISSING_KERNEL_API); + + AssertReturn(cPages > 0, VERR_OUT_OF_RANGE); + AssertReturn(cPages <= NEM_MAX_UNMAP_PAGES, VERR_OUT_OF_RANGE); + AssertMsgReturn(!(GCPhys & X86_PAGE_OFFSET_MASK), ("%RGp\n", GCPhys), VERR_OUT_OF_RANGE); + AssertReturn(GCPhys < _1E, VERR_OUT_OF_RANGE); + + /* + * Compose and make the hypercall. + */ + HV_INPUT_UNMAP_GPA_PAGES *pUnmapPages = (HV_INPUT_UNMAP_GPA_PAGES *)pGVCpu->nem.s.HypercallData.pbPage; + AssertPtrReturn(pUnmapPages, VERR_INTERNAL_ERROR_3); + pUnmapPages->TargetPartitionId = pGVM->nem.s.idHvPartition; + pUnmapPages->TargetGpaBase = GCPhys >> X86_PAGE_SHIFT; + pUnmapPages->fFlags = 0; + + uint64_t uResult = g_pfnHvlInvokeHypercall(HvCallUnmapGpaPages | ((uint64_t)cPages << 32), + pGVCpu->nem.s.HypercallData.HCPhysPage, 0); + Log6(("NEMR0UnmapPages: %RGp L %u -> %#RX64\n", GCPhys, cPages, uResult)); + if (uResult == ((uint64_t)cPages << 32)) + { +#if 1 /* Do we need to do this? Hopefully not... */ + uint64_t volatile uR = g_pfnHvlInvokeHypercall(HvCallUncommitGpaPages | ((uint64_t)cPages << 32), + pGVCpu->nem.s.HypercallData.HCPhysPage, 0); + AssertMsg(uR == ((uint64_t)cPages << 32), ("uR=%#RX64\n", uR)); NOREF(uR); +#endif + return VINF_SUCCESS; + } + + LogRel(("g_pfnHvlInvokeHypercall/UnmapGpaPages -> %#RX64\n", uResult)); + return VERR_NEM_UNMAP_PAGES_FAILED; +} + + +/** + * Unmaps pages from the guest physical address space. + * + * Generally the caller will be under the PGM lock already, so no extra effort + * is needed to make sure all changes happens under it. + * + * @returns VBox status code. + * @param pGVM The ring-0 VM handle. + * @param pVM The cross context VM handle. + * @param idCpu The calling EMT. Necessary for getting the + * hypercall page and arguments. + * @thread EMT(idCpu) + */ +VMMR0_INT_DECL(int) NEMR0UnmapPages(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ + /* + * Unpack the call. 
+ */ + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_SUCCESS(rc)) + { + PVMCPU pVCpu = &pVM->aCpus[idCpu]; + PGVMCPU pGVCpu = &pGVM->aCpus[idCpu]; + + RTGCPHYS const GCPhys = pVCpu->nem.s.Hypercall.UnmapPages.GCPhys; + uint32_t const cPages = pVCpu->nem.s.Hypercall.UnmapPages.cPages; + + /* + * Do the work. + */ + rc = nemR0WinUnmapPages(pGVM, pGVCpu, GCPhys, cPages); + } + return rc; +} + + +#if defined(NEM_WIN_WITH_RING0_RUNLOOP) || defined(NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS) +/** + * Worker for NEMR0ExportState. + * + * Intention is to use it internally later. + * + * @returns VBox status code. + * @param pGVM The ring-0 VM handle. + * @param pGVCpu The ring-0 VCPU handle. + * @param pCtx The CPU context structure to import into. + */ +NEM_TMPL_STATIC int nemR0WinExportState(PGVM pGVM, PGVMCPU pGVCpu, PCPUMCTX pCtx) +{ + PVMCPU pVCpu = &pGVM->pVM->aCpus[pGVCpu->idCpu]; + HV_INPUT_SET_VP_REGISTERS *pInput = (HV_INPUT_SET_VP_REGISTERS *)pGVCpu->nem.s.HypercallData.pbPage; + AssertPtrReturn(pInput, VERR_INTERNAL_ERROR_3); + AssertReturn(g_pfnHvlInvokeHypercall, VERR_NEM_MISSING_KERNEL_API); + + pInput->PartitionId = pGVM->nem.s.idHvPartition; + pInput->VpIndex = pGVCpu->idCpu; + pInput->RsvdZ = 0; + + uint64_t const fWhat = ~pCtx->fExtrn & (CPUMCTX_EXTRN_ALL | CPUMCTX_EXTRN_NEM_WIN_MASK); + if ( !fWhat + && pVCpu->nem.s.fCurrentInterruptWindows == pVCpu->nem.s.fDesiredInterruptWindows) + return VINF_SUCCESS; + uintptr_t iReg = 0; + + /* GPRs */ + if (fWhat & CPUMCTX_EXTRN_GPRS_MASK) + { + if (fWhat & CPUMCTX_EXTRN_RAX) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterRax; + pInput->Elements[iReg].Value.Reg64 = pCtx->rax; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_RCX) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterRcx; + pInput->Elements[iReg].Value.Reg64 = pCtx->rcx; + iReg++; + } + if (fWhat & 
CPUMCTX_EXTRN_RDX) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterRdx; + pInput->Elements[iReg].Value.Reg64 = pCtx->rdx; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_RBX) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterRbx; + pInput->Elements[iReg].Value.Reg64 = pCtx->rbx; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_RSP) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterRsp; + pInput->Elements[iReg].Value.Reg64 = pCtx->rsp; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_RBP) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterRbp; + pInput->Elements[iReg].Value.Reg64 = pCtx->rbp; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_RSI) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterRsi; + pInput->Elements[iReg].Value.Reg64 = pCtx->rsi; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_RDI) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterRdi; + pInput->Elements[iReg].Value.Reg64 = pCtx->rdi; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_R8_R15) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterR8; + pInput->Elements[iReg].Value.Reg64 = pCtx->r8; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterR9; + pInput->Elements[iReg].Value.Reg64 = pCtx->r9; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterR10; + pInput->Elements[iReg].Value.Reg64 = pCtx->r10; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterR11; + 
pInput->Elements[iReg].Value.Reg64 = pCtx->r11; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterR12; + pInput->Elements[iReg].Value.Reg64 = pCtx->r12; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterR13; + pInput->Elements[iReg].Value.Reg64 = pCtx->r13; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterR14; + pInput->Elements[iReg].Value.Reg64 = pCtx->r14; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterR15; + pInput->Elements[iReg].Value.Reg64 = pCtx->r15; + iReg++; + } + } + + /* RIP & Flags */ + if (fWhat & CPUMCTX_EXTRN_RIP) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterRip; + pInput->Elements[iReg].Value.Reg64 = pCtx->rip; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_RFLAGS) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterRflags; + pInput->Elements[iReg].Value.Reg64 = pCtx->rflags.u; + iReg++; + } + + /* Segments */ +# define COPY_OUT_SEG(a_idx, a_enmName, a_SReg) \ + do { \ + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[a_idx]); \ + pInput->Elements[a_idx].Name = a_enmName; \ + pInput->Elements[a_idx].Value.Segment.Base = (a_SReg).u64Base; \ + pInput->Elements[a_idx].Value.Segment.Limit = (a_SReg).u32Limit; \ + pInput->Elements[a_idx].Value.Segment.Selector = (a_SReg).Sel; \ + pInput->Elements[a_idx].Value.Segment.Attributes = (a_SReg).Attr.u; \ + } while (0) + if (fWhat & CPUMCTX_EXTRN_SREG_MASK) + { + if (fWhat & CPUMCTX_EXTRN_CS) + { + COPY_OUT_SEG(iReg, HvX64RegisterCs, pCtx->cs); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_ES) + { + COPY_OUT_SEG(iReg, HvX64RegisterEs, pCtx->es); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_SS) + 
{ + COPY_OUT_SEG(iReg, HvX64RegisterSs, pCtx->ss); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_DS) + { + COPY_OUT_SEG(iReg, HvX64RegisterDs, pCtx->ds); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_FS) + { + COPY_OUT_SEG(iReg, HvX64RegisterFs, pCtx->fs); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_GS) + { + COPY_OUT_SEG(iReg, HvX64RegisterGs, pCtx->gs); + iReg++; + } + } + + /* Descriptor tables & task segment. */ + if (fWhat & CPUMCTX_EXTRN_TABLE_MASK) + { + if (fWhat & CPUMCTX_EXTRN_LDTR) + { + COPY_OUT_SEG(iReg, HvX64RegisterLdtr, pCtx->ldtr); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_TR) + { + COPY_OUT_SEG(iReg, HvX64RegisterTr, pCtx->tr); + iReg++; + } + + if (fWhat & CPUMCTX_EXTRN_IDTR) + { + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Value.Table.Pad[0] = 0; + pInput->Elements[iReg].Value.Table.Pad[1] = 0; + pInput->Elements[iReg].Value.Table.Pad[2] = 0; + pInput->Elements[iReg].Name = HvX64RegisterIdtr; + pInput->Elements[iReg].Value.Table.Limit = pCtx->idtr.cbIdt; + pInput->Elements[iReg].Value.Table.Base = pCtx->idtr.pIdt; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_GDTR) + { + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Value.Table.Pad[0] = 0; + pInput->Elements[iReg].Value.Table.Pad[1] = 0; + pInput->Elements[iReg].Value.Table.Pad[2] = 0; + pInput->Elements[iReg].Name = HvX64RegisterGdtr; + pInput->Elements[iReg].Value.Table.Limit = pCtx->gdtr.cbGdt; + pInput->Elements[iReg].Value.Table.Base = pCtx->gdtr.pGdt; + iReg++; + } + } + + /* Control registers. 
*/ + if (fWhat & CPUMCTX_EXTRN_CR_MASK) + { + if (fWhat & CPUMCTX_EXTRN_CR0) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterCr0; + pInput->Elements[iReg].Value.Reg64 = pCtx->cr0; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_CR2) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterCr2; + pInput->Elements[iReg].Value.Reg64 = pCtx->cr2; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_CR3) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterCr3; + pInput->Elements[iReg].Value.Reg64 = pCtx->cr3; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_CR4) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterCr4; + pInput->Elements[iReg].Value.Reg64 = pCtx->cr4; + iReg++; + } + } + if (fWhat & CPUMCTX_EXTRN_APIC_TPR) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterCr8; + pInput->Elements[iReg].Value.Reg64 = CPUMGetGuestCR8(pVCpu); + iReg++; + } + + /** @todo does HvX64RegisterXfem mean XCR0? What about the related MSR. */ + + /* Debug registers. */ +/** @todo fixme. Figure out what the hyper-v version of KVM_SET_GUEST_DEBUG would be. 
*/ + if (fWhat & CPUMCTX_EXTRN_DR0_DR3) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterDr0; + //pInput->Elements[iReg].Value.Reg64 = CPUMGetHyperDR0(pVCpu); + pInput->Elements[iReg].Value.Reg64 = pCtx->dr[0]; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterDr1; + //pInput->Elements[iReg].Value.Reg64 = CPUMGetHyperDR1(pVCpu); + pInput->Elements[iReg].Value.Reg64 = pCtx->dr[1]; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterDr2; + //pInput->Elements[iReg].Value.Reg64 = CPUMGetHyperDR2(pVCpu); + pInput->Elements[iReg].Value.Reg64 = pCtx->dr[2]; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterDr3; + //pInput->Elements[iReg].Value.Reg64 = CPUMGetHyperDR3(pVCpu); + pInput->Elements[iReg].Value.Reg64 = pCtx->dr[3]; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_DR6) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterDr6; + //pInput->Elements[iReg].Value.Reg64 = CPUMGetHyperDR6(pVCpu); + pInput->Elements[iReg].Value.Reg64 = pCtx->dr[6]; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_DR7) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterDr7; + //pInput->Elements[iReg].Value.Reg64 = CPUMGetHyperDR7(pVCpu); + pInput->Elements[iReg].Value.Reg64 = pCtx->dr[7]; + iReg++; + } + + /* Floating point state. 
*/ + if (fWhat & CPUMCTX_EXTRN_X87) + { + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterFpMmx0; + pInput->Elements[iReg].Value.Fp.AsUINT128.Low64 = pCtx->pXStateR0->x87.aRegs[0].au64[0]; + pInput->Elements[iReg].Value.Fp.AsUINT128.High64 = pCtx->pXStateR0->x87.aRegs[0].au64[1]; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterFpMmx1; + pInput->Elements[iReg].Value.Fp.AsUINT128.Low64 = pCtx->pXStateR0->x87.aRegs[1].au64[0]; + pInput->Elements[iReg].Value.Fp.AsUINT128.High64 = pCtx->pXStateR0->x87.aRegs[1].au64[1]; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterFpMmx2; + pInput->Elements[iReg].Value.Fp.AsUINT128.Low64 = pCtx->pXStateR0->x87.aRegs[2].au64[0]; + pInput->Elements[iReg].Value.Fp.AsUINT128.High64 = pCtx->pXStateR0->x87.aRegs[2].au64[1]; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterFpMmx3; + pInput->Elements[iReg].Value.Fp.AsUINT128.Low64 = pCtx->pXStateR0->x87.aRegs[3].au64[0]; + pInput->Elements[iReg].Value.Fp.AsUINT128.High64 = pCtx->pXStateR0->x87.aRegs[3].au64[1]; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterFpMmx4; + pInput->Elements[iReg].Value.Fp.AsUINT128.Low64 = pCtx->pXStateR0->x87.aRegs[4].au64[0]; + pInput->Elements[iReg].Value.Fp.AsUINT128.High64 = pCtx->pXStateR0->x87.aRegs[4].au64[1]; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterFpMmx5; + pInput->Elements[iReg].Value.Fp.AsUINT128.Low64 = pCtx->pXStateR0->x87.aRegs[5].au64[0]; + pInput->Elements[iReg].Value.Fp.AsUINT128.High64 = pCtx->pXStateR0->x87.aRegs[5].au64[1]; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterFpMmx6; + 
pInput->Elements[iReg].Value.Fp.AsUINT128.Low64 = pCtx->pXStateR0->x87.aRegs[6].au64[0]; + pInput->Elements[iReg].Value.Fp.AsUINT128.High64 = pCtx->pXStateR0->x87.aRegs[6].au64[1]; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterFpMmx7; + pInput->Elements[iReg].Value.Fp.AsUINT128.Low64 = pCtx->pXStateR0->x87.aRegs[7].au64[0]; + pInput->Elements[iReg].Value.Fp.AsUINT128.High64 = pCtx->pXStateR0->x87.aRegs[7].au64[1]; + iReg++; + + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterFpControlStatus; + pInput->Elements[iReg].Value.FpControlStatus.FpControl = pCtx->pXStateR0->x87.FCW; + pInput->Elements[iReg].Value.FpControlStatus.FpStatus = pCtx->pXStateR0->x87.FSW; + pInput->Elements[iReg].Value.FpControlStatus.FpTag = pCtx->pXStateR0->x87.FTW; + pInput->Elements[iReg].Value.FpControlStatus.Reserved = pCtx->pXStateR0->x87.FTW >> 8; + pInput->Elements[iReg].Value.FpControlStatus.LastFpOp = pCtx->pXStateR0->x87.FOP; + pInput->Elements[iReg].Value.FpControlStatus.LastFpRip = (pCtx->pXStateR0->x87.FPUIP) + | ((uint64_t)pCtx->pXStateR0->x87.CS << 32) + | ((uint64_t)pCtx->pXStateR0->x87.Rsrvd1 << 48); + iReg++; +/** @todo we've got trouble if if we try write just SSE w/o X87. */ + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmmControlStatus; + pInput->Elements[iReg].Value.XmmControlStatus.LastFpRdp = (pCtx->pXStateR0->x87.FPUDP) + | ((uint64_t)pCtx->pXStateR0->x87.DS << 32) + | ((uint64_t)pCtx->pXStateR0->x87.Rsrvd2 << 48); + pInput->Elements[iReg].Value.XmmControlStatus.XmmStatusControl = pCtx->pXStateR0->x87.MXCSR; + pInput->Elements[iReg].Value.XmmControlStatus.XmmStatusControlMask = pCtx->pXStateR0->x87.MXCSR_MASK; /** @todo ??? (Isn't this an output field?) */ + iReg++; + } + + /* Vector state. 
*/ + if (fWhat & CPUMCTX_EXTRN_SSE_AVX) + { + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm0; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[0].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[0].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm1; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[1].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[1].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm2; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[2].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[2].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm3; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[3].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[3].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm4; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[4].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[4].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm5; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[5].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[5].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm6; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[6].uXmm.s.Lo; + 
pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[6].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm7; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[7].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[7].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm8; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[8].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[8].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm9; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[9].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[9].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm10; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[10].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[10].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm11; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[11].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[11].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm12; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[12].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[12].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm13; + 
pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[13].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[13].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm14; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[14].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[14].uXmm.s.Hi; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterXmm15; + pInput->Elements[iReg].Value.Reg128.Low64 = pCtx->pXStateR0->x87.aXMM[15].uXmm.s.Lo; + pInput->Elements[iReg].Value.Reg128.High64 = pCtx->pXStateR0->x87.aXMM[15].uXmm.s.Hi; + iReg++; + } + + /* MSRs */ + // HvX64RegisterTsc - don't touch + if (fWhat & CPUMCTX_EXTRN_EFER) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterEfer; + pInput->Elements[iReg].Value.Reg64 = pCtx->msrEFER; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_KERNEL_GS_BASE) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterKernelGsBase; + pInput->Elements[iReg].Value.Reg64 = pCtx->msrKERNELGSBASE; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_SYSENTER_MSRS) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterSysenterCs; + pInput->Elements[iReg].Value.Reg64 = pCtx->SysEnter.cs; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterSysenterEip; + pInput->Elements[iReg].Value.Reg64 = pCtx->SysEnter.eip; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterSysenterEsp; + pInput->Elements[iReg].Value.Reg64 = pCtx->SysEnter.esp; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_SYSCALL_MSRS) + { + 
HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterStar; + pInput->Elements[iReg].Value.Reg64 = pCtx->msrSTAR; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterLstar; + pInput->Elements[iReg].Value.Reg64 = pCtx->msrLSTAR; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterCstar; + pInput->Elements[iReg].Value.Reg64 = pCtx->msrCSTAR; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterSfmask; + pInput->Elements[iReg].Value.Reg64 = pCtx->msrSFMASK; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_OTHER_MSRS) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterApicBase; + pInput->Elements[iReg].Value.Reg64 = APICGetBaseMsrNoCheck(pVCpu); + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterPat; + pInput->Elements[iReg].Value.Reg64 = pCtx->msrPAT; + iReg++; +# if 0 /** @todo HvX64RegisterMtrrCap is read only? Seems it's not even readable. 
*/ + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrCap; + pInput->Elements[iReg].Value.Reg64 = CPUMGetGuestIa32MtrrCap(pVCpu); + iReg++; +# endif + + PCPUMCTXMSRS pCtxMsrs = CPUMQueryGuestCtxMsrsPtr(pVCpu); + + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrDefType; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MtrrDefType; + iReg++; + + /** @todo we dont keep state for HvX64RegisterMtrrPhysBaseX and HvX64RegisterMtrrPhysMaskX */ + + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrFix64k00000; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MtrrFix64K_00000; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrFix16k80000; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MtrrFix16K_80000; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrFix16kA0000; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MtrrFix16K_A0000; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrFix4kC0000; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MtrrFix4K_C0000; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrFix4kC8000; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MtrrFix4K_C8000; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrFix4kD0000; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MtrrFix4K_D0000; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrFix4kD8000; + 
pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MtrrFix4K_D8000; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrFix4kE0000; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MtrrFix4K_E0000; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrFix4kE8000; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MtrrFix4K_E8000; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrFix4kF0000; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MtrrFix4K_F0000; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterMtrrFix4kF8000; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MtrrFix4K_F8000; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterTscAux; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.TscAux; + iReg++; + +# if 0 /** @todo Why can't we write these on Intel systems? Not that we really care... */ + const CPUMCPUVENDOR enmCpuVendor = CPUMGetHostCpuVendor(pGVM->pVM); + if (enmCpuVendor != CPUMCPUVENDOR_AMD) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterIa32MiscEnable; + pInput->Elements[iReg].Value.Reg64 = pCtxMsrs->msr.MiscEnable; + iReg++; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterIa32FeatureControl; + pInput->Elements[iReg].Value.Reg64 = CPUMGetGuestIa32FeatureControl(pVCpu); + iReg++; + } +# endif + } + + /* event injection (clear it). 
*/ + if (fWhat & CPUMCTX_EXTRN_NEM_WIN_EVENT_INJECT) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvRegisterPendingInterruption; + pInput->Elements[iReg].Value.Reg64 = 0; + iReg++; + } + + /* Interruptibility state. This can get a little complicated since we get + half of the state via HV_X64_VP_EXECUTION_STATE. */ + if ( (fWhat & (CPUMCTX_EXTRN_NEM_WIN_INHIBIT_INT | CPUMCTX_EXTRN_NEM_WIN_INHIBIT_NMI)) + == (CPUMCTX_EXTRN_NEM_WIN_INHIBIT_INT | CPUMCTX_EXTRN_NEM_WIN_INHIBIT_NMI) ) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvRegisterInterruptState; + pInput->Elements[iReg].Value.Reg64 = 0; + if ( VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS) + && EMGetInhibitInterruptsPC(pVCpu) == pCtx->rip) + pInput->Elements[iReg].Value.InterruptState.InterruptShadow = 1; + if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_BLOCK_NMIS)) + pInput->Elements[iReg].Value.InterruptState.NmiMasked = 1; + iReg++; + } + else if (fWhat & CPUMCTX_EXTRN_NEM_WIN_INHIBIT_INT) + { + if ( pVCpu->nem.s.fLastInterruptShadow + || ( VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS) + && EMGetInhibitInterruptsPC(pVCpu) == pCtx->rip)) + { + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvRegisterInterruptState; + pInput->Elements[iReg].Value.Reg64 = 0; + if ( VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS) + && EMGetInhibitInterruptsPC(pVCpu) == pCtx->rip) + pInput->Elements[iReg].Value.InterruptState.InterruptShadow = 1; + /** @todo Retrieve NMI state, currently assuming it's zero. (yes this may happen on I/O) */ + //if (VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_BLOCK_NMIS)) + // pInput->Elements[iReg].Value.InterruptState.NmiMasked = 1; + iReg++; + } + } + else + Assert(!(fWhat & CPUMCTX_EXTRN_NEM_WIN_INHIBIT_NMI)); + + /* Interrupt windows. Always set if active as Hyper-V seems to be forgetful. 
*/ + uint8_t const fDesiredIntWin = pVCpu->nem.s.fDesiredInterruptWindows; + if ( fDesiredIntWin + || pVCpu->nem.s.fCurrentInterruptWindows != fDesiredIntWin) + { + pVCpu->nem.s.fCurrentInterruptWindows = pVCpu->nem.s.fDesiredInterruptWindows; + HV_REGISTER_ASSOC_ZERO_PADDING_AND_HI64(&pInput->Elements[iReg]); + pInput->Elements[iReg].Name = HvX64RegisterDeliverabilityNotifications; + pInput->Elements[iReg].Value.DeliverabilityNotifications.AsUINT64 = fDesiredIntWin; + Assert(pInput->Elements[iReg].Value.DeliverabilityNotifications.NmiNotification == RT_BOOL(fDesiredIntWin & NEM_WIN_INTW_F_NMI)); + Assert(pInput->Elements[iReg].Value.DeliverabilityNotifications.InterruptNotification == RT_BOOL(fDesiredIntWin & NEM_WIN_INTW_F_REGULAR)); + Assert(pInput->Elements[iReg].Value.DeliverabilityNotifications.InterruptPriority == (fDesiredIntWin & NEM_WIN_INTW_F_PRIO_MASK) >> NEM_WIN_INTW_F_PRIO_SHIFT); + iReg++; + } + + /// @todo HvRegisterPendingEvent0 + /// @todo HvRegisterPendingEvent1 + + /* + * Set the registers. + */ + Assert((uintptr_t)&pInput->Elements[iReg] - (uintptr_t)pGVCpu->nem.s.HypercallData.pbPage < PAGE_SIZE); /* max is 127 */ + + /* + * Make the hypercall. + */ + uint64_t uResult = g_pfnHvlInvokeHypercall(HV_MAKE_CALL_INFO(HvCallSetVpRegisters, iReg), + pGVCpu->nem.s.HypercallData.HCPhysPage, 0 /*GCPhysOutput*/); + AssertLogRelMsgReturn(uResult == HV_MAKE_CALL_REP_RET(iReg), + ("uResult=%RX64 iRegs=%#x\n", uResult, iReg), + VERR_NEM_SET_REGISTERS_FAILED); + //LogFlow(("nemR0WinExportState: uResult=%#RX64 iReg=%zu fWhat=%#018RX64 fExtrn=%#018RX64 -> %#018RX64\n", uResult, iReg, fWhat, pCtx->fExtrn, + // pCtx->fExtrn | CPUMCTX_EXTRN_ALL | CPUMCTX_EXTRN_NEM_WIN_MASK | CPUMCTX_EXTRN_KEEPER_NEM )); + pCtx->fExtrn |= CPUMCTX_EXTRN_ALL | CPUMCTX_EXTRN_NEM_WIN_MASK | CPUMCTX_EXTRN_KEEPER_NEM; + return VINF_SUCCESS; +} +#endif /* NEM_WIN_WITH_RING0_RUNLOOP || NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS */ + + +/** + * Export the state to the native API (out of CPUMCTX). 
+ * + * @returns VBox status code + * @param pGVM The ring-0 VM handle. + * @param pVM The cross context VM handle. + * @param idCpu The calling EMT. Necessary for getting the + * hypercall page and arguments. + */ +VMMR0_INT_DECL(int) NEMR0ExportState(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ +#if defined(NEM_WIN_WITH_RING0_RUNLOOP) || defined(NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS) + /* + * Validate the call. The status from GVMMR0ValidateGVMandVMandEMT() gates + * everything below and is returned unchanged when it signals failure. + */ + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_SUCCESS(rc)) + { + PVMCPU pVCpu = &pVM->aCpus[idCpu]; + PGVMCPU pGVCpu = &pGVM->aCpus[idCpu]; + /* The worker issues its hypercall through g_pfnHvlInvokeHypercall, so + refuse to continue (VERR_NEM_MISSING_KERNEL_API) if it is not set. */ + AssertReturn(g_pfnHvlInvokeHypercall, VERR_NEM_MISSING_KERNEL_API); + + /* + * Call worker on the guest context (cpum.GstCtx) of the VCPU selected + * by idCpu; its status becomes the return value. + */ + rc = nemR0WinExportState(pGVM, pGVCpu, &pVCpu->cpum.GstCtx); + } + return rc; +#else + /* Neither NEM_WIN_WITH_RING0_RUNLOOP nor NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS + is defined, so the worker is compiled out: report not implemented. */ + RT_NOREF(pGVM, pVM, idCpu); + return VERR_NOT_IMPLEMENTED; +#endif +} + + +#if defined(NEM_WIN_WITH_RING0_RUNLOOP) || defined(NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS) +/** + * Worker for NEMR0ImportState. + * + * Intention is to use it internally later. + * + * @returns VBox status code. + * @param pGVM The ring-0 VM handle. + * @param pGVCpu The ring-0 VCPU handle. + * @param pCtx The CPU context structure to import into. + * @param fWhat What to import, CPUMCTX_EXTRN_XXX. + * @param fCanUpdateCr3 Whether it's safe to update CR3 or not. 
+ */ +NEM_TMPL_STATIC int nemR0WinImportState(PGVM pGVM, PGVMCPU pGVCpu, PCPUMCTX pCtx, uint64_t fWhat, bool fCanUpdateCr3) +{ + HV_INPUT_GET_VP_REGISTERS *pInput = (HV_INPUT_GET_VP_REGISTERS *)pGVCpu->nem.s.HypercallData.pbPage; + AssertPtrReturn(pInput, VERR_INTERNAL_ERROR_3); + AssertReturn(g_pfnHvlInvokeHypercall, VERR_NEM_MISSING_KERNEL_API); + Assert(pCtx == &pGVCpu->pVCpu->cpum.GstCtx); + + fWhat &= pCtx->fExtrn; + + pInput->PartitionId = pGVM->nem.s.idHvPartition; + pInput->VpIndex = pGVCpu->idCpu; + pInput->fFlags = 0; + + /* GPRs */ + uintptr_t iReg = 0; + if (fWhat & CPUMCTX_EXTRN_GPRS_MASK) + { + if (fWhat & CPUMCTX_EXTRN_RAX) + pInput->Names[iReg++] = HvX64RegisterRax; + if (fWhat & CPUMCTX_EXTRN_RCX) + pInput->Names[iReg++] = HvX64RegisterRcx; + if (fWhat & CPUMCTX_EXTRN_RDX) + pInput->Names[iReg++] = HvX64RegisterRdx; + if (fWhat & CPUMCTX_EXTRN_RBX) + pInput->Names[iReg++] = HvX64RegisterRbx; + if (fWhat & CPUMCTX_EXTRN_RSP) + pInput->Names[iReg++] = HvX64RegisterRsp; + if (fWhat & CPUMCTX_EXTRN_RBP) + pInput->Names[iReg++] = HvX64RegisterRbp; + if (fWhat & CPUMCTX_EXTRN_RSI) + pInput->Names[iReg++] = HvX64RegisterRsi; + if (fWhat & CPUMCTX_EXTRN_RDI) + pInput->Names[iReg++] = HvX64RegisterRdi; + if (fWhat & CPUMCTX_EXTRN_R8_R15) + { + pInput->Names[iReg++] = HvX64RegisterR8; + pInput->Names[iReg++] = HvX64RegisterR9; + pInput->Names[iReg++] = HvX64RegisterR10; + pInput->Names[iReg++] = HvX64RegisterR11; + pInput->Names[iReg++] = HvX64RegisterR12; + pInput->Names[iReg++] = HvX64RegisterR13; + pInput->Names[iReg++] = HvX64RegisterR14; + pInput->Names[iReg++] = HvX64RegisterR15; + } + } + + /* RIP & Flags */ + if (fWhat & CPUMCTX_EXTRN_RIP) + pInput->Names[iReg++] = HvX64RegisterRip; + if (fWhat & CPUMCTX_EXTRN_RFLAGS) + pInput->Names[iReg++] = HvX64RegisterRflags; + + /* Segments */ + if (fWhat & CPUMCTX_EXTRN_SREG_MASK) + { + if (fWhat & CPUMCTX_EXTRN_CS) + pInput->Names[iReg++] = HvX64RegisterCs; + if (fWhat & CPUMCTX_EXTRN_ES) + 
pInput->Names[iReg++] = HvX64RegisterEs; + if (fWhat & CPUMCTX_EXTRN_SS) + pInput->Names[iReg++] = HvX64RegisterSs; + if (fWhat & CPUMCTX_EXTRN_DS) + pInput->Names[iReg++] = HvX64RegisterDs; + if (fWhat & CPUMCTX_EXTRN_FS) + pInput->Names[iReg++] = HvX64RegisterFs; + if (fWhat & CPUMCTX_EXTRN_GS) + pInput->Names[iReg++] = HvX64RegisterGs; + } + + /* Descriptor tables and the task segment. */ + if (fWhat & CPUMCTX_EXTRN_TABLE_MASK) + { + if (fWhat & CPUMCTX_EXTRN_LDTR) + pInput->Names[iReg++] = HvX64RegisterLdtr; + if (fWhat & CPUMCTX_EXTRN_TR) + pInput->Names[iReg++] = HvX64RegisterTr; + if (fWhat & CPUMCTX_EXTRN_IDTR) + pInput->Names[iReg++] = HvX64RegisterIdtr; + if (fWhat & CPUMCTX_EXTRN_GDTR) + pInput->Names[iReg++] = HvX64RegisterGdtr; + } + + /* Control registers. */ + if (fWhat & CPUMCTX_EXTRN_CR_MASK) + { + if (fWhat & CPUMCTX_EXTRN_CR0) + pInput->Names[iReg++] = HvX64RegisterCr0; + if (fWhat & CPUMCTX_EXTRN_CR2) + pInput->Names[iReg++] = HvX64RegisterCr2; + if (fWhat & CPUMCTX_EXTRN_CR3) + pInput->Names[iReg++] = HvX64RegisterCr3; + if (fWhat & CPUMCTX_EXTRN_CR4) + pInput->Names[iReg++] = HvX64RegisterCr4; + } + if (fWhat & CPUMCTX_EXTRN_APIC_TPR) + pInput->Names[iReg++] = HvX64RegisterCr8; + + /* Debug registers. */ + if (fWhat & CPUMCTX_EXTRN_DR7) + pInput->Names[iReg++] = HvX64RegisterDr7; + if (fWhat & CPUMCTX_EXTRN_DR0_DR3) + { + if (!(fWhat & CPUMCTX_EXTRN_DR7) && (pCtx->fExtrn & CPUMCTX_EXTRN_DR7)) + { + fWhat |= CPUMCTX_EXTRN_DR7; + pInput->Names[iReg++] = HvX64RegisterDr7; + } + pInput->Names[iReg++] = HvX64RegisterDr0; + pInput->Names[iReg++] = HvX64RegisterDr1; + pInput->Names[iReg++] = HvX64RegisterDr2; + pInput->Names[iReg++] = HvX64RegisterDr3; + } + if (fWhat & CPUMCTX_EXTRN_DR6) + pInput->Names[iReg++] = HvX64RegisterDr6; + + /* Floating point state. 
*/ + if (fWhat & CPUMCTX_EXTRN_X87) + { + pInput->Names[iReg++] = HvX64RegisterFpMmx0; + pInput->Names[iReg++] = HvX64RegisterFpMmx1; + pInput->Names[iReg++] = HvX64RegisterFpMmx2; + pInput->Names[iReg++] = HvX64RegisterFpMmx3; + pInput->Names[iReg++] = HvX64RegisterFpMmx4; + pInput->Names[iReg++] = HvX64RegisterFpMmx5; + pInput->Names[iReg++] = HvX64RegisterFpMmx6; + pInput->Names[iReg++] = HvX64RegisterFpMmx7; + pInput->Names[iReg++] = HvX64RegisterFpControlStatus; + } + if (fWhat & (CPUMCTX_EXTRN_X87 | CPUMCTX_EXTRN_SSE_AVX)) + pInput->Names[iReg++] = HvX64RegisterXmmControlStatus; + + /* Vector state. */ + if (fWhat & CPUMCTX_EXTRN_SSE_AVX) + { + pInput->Names[iReg++] = HvX64RegisterXmm0; + pInput->Names[iReg++] = HvX64RegisterXmm1; + pInput->Names[iReg++] = HvX64RegisterXmm2; + pInput->Names[iReg++] = HvX64RegisterXmm3; + pInput->Names[iReg++] = HvX64RegisterXmm4; + pInput->Names[iReg++] = HvX64RegisterXmm5; + pInput->Names[iReg++] = HvX64RegisterXmm6; + pInput->Names[iReg++] = HvX64RegisterXmm7; + pInput->Names[iReg++] = HvX64RegisterXmm8; + pInput->Names[iReg++] = HvX64RegisterXmm9; + pInput->Names[iReg++] = HvX64RegisterXmm10; + pInput->Names[iReg++] = HvX64RegisterXmm11; + pInput->Names[iReg++] = HvX64RegisterXmm12; + pInput->Names[iReg++] = HvX64RegisterXmm13; + pInput->Names[iReg++] = HvX64RegisterXmm14; + pInput->Names[iReg++] = HvX64RegisterXmm15; + } + + /* MSRs */ + // HvX64RegisterTsc - don't touch + if (fWhat & CPUMCTX_EXTRN_EFER) + pInput->Names[iReg++] = HvX64RegisterEfer; + if (fWhat & CPUMCTX_EXTRN_KERNEL_GS_BASE) + pInput->Names[iReg++] = HvX64RegisterKernelGsBase; + if (fWhat & CPUMCTX_EXTRN_SYSENTER_MSRS) + { + pInput->Names[iReg++] = HvX64RegisterSysenterCs; + pInput->Names[iReg++] = HvX64RegisterSysenterEip; + pInput->Names[iReg++] = HvX64RegisterSysenterEsp; + } + if (fWhat & CPUMCTX_EXTRN_SYSCALL_MSRS) + { + pInput->Names[iReg++] = HvX64RegisterStar; + pInput->Names[iReg++] = HvX64RegisterLstar; + pInput->Names[iReg++] = 
HvX64RegisterCstar; + pInput->Names[iReg++] = HvX64RegisterSfmask; + } + +# ifdef LOG_ENABLED + const CPUMCPUVENDOR enmCpuVendor = CPUMGetHostCpuVendor(pGVM->pVM); +# endif + if (fWhat & CPUMCTX_EXTRN_OTHER_MSRS) + { + pInput->Names[iReg++] = HvX64RegisterApicBase; /// @todo APIC BASE + pInput->Names[iReg++] = HvX64RegisterPat; +# if 0 /*def LOG_ENABLED*/ /** @todo something's wrong with HvX64RegisterMtrrCap? (AMD) */ + pInput->Names[iReg++] = HvX64RegisterMtrrCap; +# endif + pInput->Names[iReg++] = HvX64RegisterMtrrDefType; + pInput->Names[iReg++] = HvX64RegisterMtrrFix64k00000; + pInput->Names[iReg++] = HvX64RegisterMtrrFix16k80000; + pInput->Names[iReg++] = HvX64RegisterMtrrFix16kA0000; + pInput->Names[iReg++] = HvX64RegisterMtrrFix4kC0000; + pInput->Names[iReg++] = HvX64RegisterMtrrFix4kC8000; + pInput->Names[iReg++] = HvX64RegisterMtrrFix4kD0000; + pInput->Names[iReg++] = HvX64RegisterMtrrFix4kD8000; + pInput->Names[iReg++] = HvX64RegisterMtrrFix4kE0000; + pInput->Names[iReg++] = HvX64RegisterMtrrFix4kE8000; + pInput->Names[iReg++] = HvX64RegisterMtrrFix4kF0000; + pInput->Names[iReg++] = HvX64RegisterMtrrFix4kF8000; + pInput->Names[iReg++] = HvX64RegisterTscAux; +# if 0 /** @todo why can't we read HvX64RegisterIa32MiscEnable? */ + if (enmCpuVendor != CPUMCPUVENDOR_AMD) + pInput->Names[iReg++] = HvX64RegisterIa32MiscEnable; +# endif +# ifdef LOG_ENABLED + if (enmCpuVendor != CPUMCPUVENDOR_AMD) + pInput->Names[iReg++] = HvX64RegisterIa32FeatureControl; +# endif + } + + /* Interruptibility. 
*/ + if (fWhat & (CPUMCTX_EXTRN_NEM_WIN_INHIBIT_INT | CPUMCTX_EXTRN_NEM_WIN_INHIBIT_NMI)) + { + pInput->Names[iReg++] = HvRegisterInterruptState; + pInput->Names[iReg++] = HvX64RegisterRip; + } + + /* event injection */ + pInput->Names[iReg++] = HvRegisterPendingInterruption; + pInput->Names[iReg++] = HvRegisterPendingEvent0; + pInput->Names[iReg++] = HvRegisterPendingEvent1; + size_t const cRegs = iReg; + size_t const cbInput = RT_ALIGN_Z(RT_UOFFSETOF_DYN(HV_INPUT_GET_VP_REGISTERS, Names[cRegs]), 32); + + HV_REGISTER_VALUE *paValues = (HV_REGISTER_VALUE *)((uint8_t *)pInput + cbInput); + Assert((uintptr_t)&paValues[cRegs] - (uintptr_t)pGVCpu->nem.s.HypercallData.pbPage < PAGE_SIZE); /* (max is around 168 registers) */ + RT_BZERO(paValues, cRegs * sizeof(paValues[0])); + + /* + * Make the hypercall. + */ + uint64_t uResult = g_pfnHvlInvokeHypercall(HV_MAKE_CALL_INFO(HvCallGetVpRegisters, cRegs), + pGVCpu->nem.s.HypercallData.HCPhysPage, + pGVCpu->nem.s.HypercallData.HCPhysPage + cbInput); + AssertLogRelMsgReturn(uResult == HV_MAKE_CALL_REP_RET(cRegs), + ("uResult=%RX64 cRegs=%#x\n", uResult, cRegs), + VERR_NEM_GET_REGISTERS_FAILED); + //LogFlow(("nemR0WinImportState: uResult=%#RX64 iReg=%zu fWhat=%#018RX64 fExtr=%#018RX64\n", uResult, cRegs, fWhat, pCtx->fExtrn)); + + /* + * Copy information to the CPUM context. 
+ */ + PVMCPU pVCpu = &pGVM->pVM->aCpus[pGVCpu->idCpu]; + iReg = 0; + + /* GPRs */ + if (fWhat & CPUMCTX_EXTRN_GPRS_MASK) + { + if (fWhat & CPUMCTX_EXTRN_RAX) + { + Assert(pInput->Names[iReg] == HvX64RegisterRax); + pCtx->rax = paValues[iReg++].Reg64; + } + if (fWhat & CPUMCTX_EXTRN_RCX) + { + Assert(pInput->Names[iReg] == HvX64RegisterRcx); + pCtx->rcx = paValues[iReg++].Reg64; + } + if (fWhat & CPUMCTX_EXTRN_RDX) + { + Assert(pInput->Names[iReg] == HvX64RegisterRdx); + pCtx->rdx = paValues[iReg++].Reg64; + } + if (fWhat & CPUMCTX_EXTRN_RBX) + { + Assert(pInput->Names[iReg] == HvX64RegisterRbx); + pCtx->rbx = paValues[iReg++].Reg64; + } + if (fWhat & CPUMCTX_EXTRN_RSP) + { + Assert(pInput->Names[iReg] == HvX64RegisterRsp); + pCtx->rsp = paValues[iReg++].Reg64; + } + if (fWhat & CPUMCTX_EXTRN_RBP) + { + Assert(pInput->Names[iReg] == HvX64RegisterRbp); + pCtx->rbp = paValues[iReg++].Reg64; + } + if (fWhat & CPUMCTX_EXTRN_RSI) + { + Assert(pInput->Names[iReg] == HvX64RegisterRsi); + pCtx->rsi = paValues[iReg++].Reg64; + } + if (fWhat & CPUMCTX_EXTRN_RDI) + { + Assert(pInput->Names[iReg] == HvX64RegisterRdi); + pCtx->rdi = paValues[iReg++].Reg64; + } + if (fWhat & CPUMCTX_EXTRN_R8_R15) + { + Assert(pInput->Names[iReg] == HvX64RegisterR8); + Assert(pInput->Names[iReg + 7] == HvX64RegisterR15); + pCtx->r8 = paValues[iReg++].Reg64; + pCtx->r9 = paValues[iReg++].Reg64; + pCtx->r10 = paValues[iReg++].Reg64; + pCtx->r11 = paValues[iReg++].Reg64; + pCtx->r12 = paValues[iReg++].Reg64; + pCtx->r13 = paValues[iReg++].Reg64; + pCtx->r14 = paValues[iReg++].Reg64; + pCtx->r15 = paValues[iReg++].Reg64; + } + } + + /* RIP & Flags */ + if (fWhat & CPUMCTX_EXTRN_RIP) + { + Assert(pInput->Names[iReg] == HvX64RegisterRip); + pCtx->rip = paValues[iReg++].Reg64; + } + if (fWhat & CPUMCTX_EXTRN_RFLAGS) + { + Assert(pInput->Names[iReg] == HvX64RegisterRflags); + pCtx->rflags.u = paValues[iReg++].Reg64; + } + + /* Segments */ +# define COPY_BACK_SEG(a_idx, a_enmName, a_SReg) \ + do { \ + 
Assert(pInput->Names[a_idx] == a_enmName); \ + (a_SReg).u64Base = paValues[a_idx].Segment.Base; \ + (a_SReg).u32Limit = paValues[a_idx].Segment.Limit; \ + (a_SReg).ValidSel = (a_SReg).Sel = paValues[a_idx].Segment.Selector; \ + (a_SReg).Attr.u = paValues[a_idx].Segment.Attributes; \ + (a_SReg).fFlags = CPUMSELREG_FLAGS_VALID; \ + } while (0) + if (fWhat & CPUMCTX_EXTRN_SREG_MASK) + { + if (fWhat & CPUMCTX_EXTRN_CS) + { + COPY_BACK_SEG(iReg, HvX64RegisterCs, pCtx->cs); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_ES) + { + COPY_BACK_SEG(iReg, HvX64RegisterEs, pCtx->es); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_SS) + { + COPY_BACK_SEG(iReg, HvX64RegisterSs, pCtx->ss); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_DS) + { + COPY_BACK_SEG(iReg, HvX64RegisterDs, pCtx->ds); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_FS) + { + COPY_BACK_SEG(iReg, HvX64RegisterFs, pCtx->fs); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_GS) + { + COPY_BACK_SEG(iReg, HvX64RegisterGs, pCtx->gs); + iReg++; + } + } + /* Descriptor tables and the task segment. */ + if (fWhat & CPUMCTX_EXTRN_TABLE_MASK) + { + if (fWhat & CPUMCTX_EXTRN_LDTR) + { + COPY_BACK_SEG(iReg, HvX64RegisterLdtr, pCtx->ldtr); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_TR) + { + /* AMD-V likes loading TR with in AVAIL state, whereas intel insists on BUSY. So, + avoid to trigger sanity assertions around the code, always fix this. 
*/ + COPY_BACK_SEG(iReg, HvX64RegisterTr, pCtx->tr); + switch (pCtx->tr.Attr.n.u4Type) + { + case X86_SEL_TYPE_SYS_386_TSS_BUSY: + case X86_SEL_TYPE_SYS_286_TSS_BUSY: + break; + case X86_SEL_TYPE_SYS_386_TSS_AVAIL: + pCtx->tr.Attr.n.u4Type = X86_SEL_TYPE_SYS_386_TSS_BUSY; + break; + case X86_SEL_TYPE_SYS_286_TSS_AVAIL: + pCtx->tr.Attr.n.u4Type = X86_SEL_TYPE_SYS_286_TSS_BUSY; + break; + } + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_IDTR) + { + Assert(pInput->Names[iReg] == HvX64RegisterIdtr); + pCtx->idtr.cbIdt = paValues[iReg].Table.Limit; + pCtx->idtr.pIdt = paValues[iReg].Table.Base; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_GDTR) + { + Assert(pInput->Names[iReg] == HvX64RegisterGdtr); + pCtx->gdtr.cbGdt = paValues[iReg].Table.Limit; + pCtx->gdtr.pGdt = paValues[iReg].Table.Base; + iReg++; + } + } + + /* Control registers. */ + bool fMaybeChangedMode = false; + bool fUpdateCr3 = false; + if (fWhat & CPUMCTX_EXTRN_CR_MASK) + { + if (fWhat & CPUMCTX_EXTRN_CR0) + { + Assert(pInput->Names[iReg] == HvX64RegisterCr0); + if (pCtx->cr0 != paValues[iReg].Reg64) + { + CPUMSetGuestCR0(pVCpu, paValues[iReg].Reg64); + fMaybeChangedMode = true; + } + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_CR2) + { + Assert(pInput->Names[iReg] == HvX64RegisterCr2); + pCtx->cr2 = paValues[iReg].Reg64; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_CR3) + { + Assert(pInput->Names[iReg] == HvX64RegisterCr3); + if (pCtx->cr3 != paValues[iReg].Reg64) + { + CPUMSetGuestCR3(pVCpu, paValues[iReg].Reg64); + fUpdateCr3 = true; + } + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_CR4) + { + Assert(pInput->Names[iReg] == HvX64RegisterCr4); + if (pCtx->cr4 != paValues[iReg].Reg64) + { + CPUMSetGuestCR4(pVCpu, paValues[iReg].Reg64); + fMaybeChangedMode = true; + } + iReg++; + } + } + if (fWhat & CPUMCTX_EXTRN_APIC_TPR) + { + Assert(pInput->Names[iReg] == HvX64RegisterCr8); + APICSetTpr(pVCpu, (uint8_t)paValues[iReg].Reg64 << 4); + iReg++; + } + + /* Debug registers. 
*/ + if (fWhat & CPUMCTX_EXTRN_DR7) + { + Assert(pInput->Names[iReg] == HvX64RegisterDr7); + if (pCtx->dr[7] != paValues[iReg].Reg64) + CPUMSetGuestDR7(pVCpu, paValues[iReg].Reg64); + pCtx->fExtrn &= ~CPUMCTX_EXTRN_DR7; /* Hack alert! Avoids asserting when processing CPUMCTX_EXTRN_DR0_DR3. */ + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_DR0_DR3) + { + Assert(pInput->Names[iReg] == HvX64RegisterDr0); + Assert(pInput->Names[iReg+3] == HvX64RegisterDr3); + if (pCtx->dr[0] != paValues[iReg].Reg64) + CPUMSetGuestDR0(pVCpu, paValues[iReg].Reg64); + iReg++; + if (pCtx->dr[1] != paValues[iReg].Reg64) + CPUMSetGuestDR1(pVCpu, paValues[iReg].Reg64); + iReg++; + if (pCtx->dr[2] != paValues[iReg].Reg64) + CPUMSetGuestDR2(pVCpu, paValues[iReg].Reg64); + iReg++; + if (pCtx->dr[3] != paValues[iReg].Reg64) + CPUMSetGuestDR3(pVCpu, paValues[iReg].Reg64); + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_DR6) + { + Assert(pInput->Names[iReg] == HvX64RegisterDr6); + if (pCtx->dr[6] != paValues[iReg].Reg64) + CPUMSetGuestDR6(pVCpu, paValues[iReg].Reg64); + iReg++; + } + + /* Floating point state. 
*/ + if (fWhat & CPUMCTX_EXTRN_X87) + { + Assert(pInput->Names[iReg] == HvX64RegisterFpMmx0); + Assert(pInput->Names[iReg + 7] == HvX64RegisterFpMmx7); + pCtx->pXStateR0->x87.aRegs[0].au64[0] = paValues[iReg].Fp.AsUINT128.Low64; + pCtx->pXStateR0->x87.aRegs[0].au64[1] = paValues[iReg].Fp.AsUINT128.High64; + iReg++; + pCtx->pXStateR0->x87.aRegs[1].au64[0] = paValues[iReg].Fp.AsUINT128.Low64; + pCtx->pXStateR0->x87.aRegs[1].au64[1] = paValues[iReg].Fp.AsUINT128.High64; + iReg++; + pCtx->pXStateR0->x87.aRegs[2].au64[0] = paValues[iReg].Fp.AsUINT128.Low64; + pCtx->pXStateR0->x87.aRegs[2].au64[1] = paValues[iReg].Fp.AsUINT128.High64; + iReg++; + pCtx->pXStateR0->x87.aRegs[3].au64[0] = paValues[iReg].Fp.AsUINT128.Low64; + pCtx->pXStateR0->x87.aRegs[3].au64[1] = paValues[iReg].Fp.AsUINT128.High64; + iReg++; + pCtx->pXStateR0->x87.aRegs[4].au64[0] = paValues[iReg].Fp.AsUINT128.Low64; + pCtx->pXStateR0->x87.aRegs[4].au64[1] = paValues[iReg].Fp.AsUINT128.High64; + iReg++; + pCtx->pXStateR0->x87.aRegs[5].au64[0] = paValues[iReg].Fp.AsUINT128.Low64; + pCtx->pXStateR0->x87.aRegs[5].au64[1] = paValues[iReg].Fp.AsUINT128.High64; + iReg++; + pCtx->pXStateR0->x87.aRegs[6].au64[0] = paValues[iReg].Fp.AsUINT128.Low64; + pCtx->pXStateR0->x87.aRegs[6].au64[1] = paValues[iReg].Fp.AsUINT128.High64; + iReg++; + pCtx->pXStateR0->x87.aRegs[7].au64[0] = paValues[iReg].Fp.AsUINT128.Low64; + pCtx->pXStateR0->x87.aRegs[7].au64[1] = paValues[iReg].Fp.AsUINT128.High64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterFpControlStatus); + pCtx->pXStateR0->x87.FCW = paValues[iReg].FpControlStatus.FpControl; + pCtx->pXStateR0->x87.FSW = paValues[iReg].FpControlStatus.FpStatus; + pCtx->pXStateR0->x87.FTW = paValues[iReg].FpControlStatus.FpTag + /*| (paValues[iReg].FpControlStatus.Reserved << 8)*/; + pCtx->pXStateR0->x87.FOP = paValues[iReg].FpControlStatus.LastFpOp; + pCtx->pXStateR0->x87.FPUIP = (uint32_t)paValues[iReg].FpControlStatus.LastFpRip; + pCtx->pXStateR0->x87.CS = 
(uint16_t)(paValues[iReg].FpControlStatus.LastFpRip >> 32); + pCtx->pXStateR0->x87.Rsrvd1 = (uint16_t)(paValues[iReg].FpControlStatus.LastFpRip >> 48); + iReg++; + } + + if (fWhat & (CPUMCTX_EXTRN_X87 | CPUMCTX_EXTRN_SSE_AVX)) + { + Assert(pInput->Names[iReg] == HvX64RegisterXmmControlStatus); + if (fWhat & CPUMCTX_EXTRN_X87) + { + pCtx->pXStateR0->x87.FPUDP = (uint32_t)paValues[iReg].XmmControlStatus.LastFpRdp; + pCtx->pXStateR0->x87.DS = (uint16_t)(paValues[iReg].XmmControlStatus.LastFpRdp >> 32); + pCtx->pXStateR0->x87.Rsrvd2 = (uint16_t)(paValues[iReg].XmmControlStatus.LastFpRdp >> 48); + } + pCtx->pXStateR0->x87.MXCSR = paValues[iReg].XmmControlStatus.XmmStatusControl; + pCtx->pXStateR0->x87.MXCSR_MASK = paValues[iReg].XmmControlStatus.XmmStatusControlMask; /** @todo ??? (Isn't this an output field?) */ + iReg++; + } + + /* Vector state. */ + if (fWhat & CPUMCTX_EXTRN_SSE_AVX) + { + Assert(pInput->Names[iReg] == HvX64RegisterXmm0); + Assert(pInput->Names[iReg+15] == HvX64RegisterXmm15); + pCtx->pXStateR0->x87.aXMM[0].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[0].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[1].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[1].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[2].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[2].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[3].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[3].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[4].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[4].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[5].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[5].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[6].uXmm.s.Lo = 
paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[6].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[7].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[7].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[8].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[8].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[9].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[9].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[10].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[10].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[11].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[11].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[12].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[12].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[13].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[13].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[14].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[14].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + pCtx->pXStateR0->x87.aXMM[15].uXmm.s.Lo = paValues[iReg].Reg128.Low64; + pCtx->pXStateR0->x87.aXMM[15].uXmm.s.Hi = paValues[iReg].Reg128.High64; + iReg++; + } + + + /* MSRs */ + // HvX64RegisterTsc - don't touch + if (fWhat & CPUMCTX_EXTRN_EFER) + { + Assert(pInput->Names[iReg] == HvX64RegisterEfer); + if (paValues[iReg].Reg64 != pCtx->msrEFER) + { + Log7(("NEM/%u: MSR EFER changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtx->msrEFER, paValues[iReg].Reg64)); + if ((paValues[iReg].Reg64 ^ pCtx->msrEFER) & MSR_K6_EFER_NXE) + PGMNotifyNxeChanged(pVCpu, RT_BOOL(paValues[iReg].Reg64 & MSR_K6_EFER_NXE)); + pCtx->msrEFER = 
paValues[iReg].Reg64; + fMaybeChangedMode = true; + } + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_KERNEL_GS_BASE) + { + Assert(pInput->Names[iReg] == HvX64RegisterKernelGsBase); + if (pCtx->msrKERNELGSBASE != paValues[iReg].Reg64) + Log7(("NEM/%u: MSR KERNELGSBASE changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtx->msrKERNELGSBASE, paValues[iReg].Reg64)); + pCtx->msrKERNELGSBASE = paValues[iReg].Reg64; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_SYSENTER_MSRS) + { + Assert(pInput->Names[iReg] == HvX64RegisterSysenterCs); + if (pCtx->SysEnter.cs != paValues[iReg].Reg64) + Log7(("NEM/%u: MSR SYSENTER.CS changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtx->SysEnter.cs, paValues[iReg].Reg64)); + pCtx->SysEnter.cs = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterSysenterEip); + if (pCtx->SysEnter.eip != paValues[iReg].Reg64) + Log7(("NEM/%u: MSR SYSENTER.EIP changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtx->SysEnter.eip, paValues[iReg].Reg64)); + pCtx->SysEnter.eip = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterSysenterEsp); + if (pCtx->SysEnter.esp != paValues[iReg].Reg64) + Log7(("NEM/%u: MSR SYSENTER.ESP changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtx->SysEnter.esp, paValues[iReg].Reg64)); + pCtx->SysEnter.esp = paValues[iReg].Reg64; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_SYSCALL_MSRS) + { + Assert(pInput->Names[iReg] == HvX64RegisterStar); + if (pCtx->msrSTAR != paValues[iReg].Reg64) + Log7(("NEM/%u: MSR STAR changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtx->msrSTAR, paValues[iReg].Reg64)); + pCtx->msrSTAR = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterLstar); + if (pCtx->msrLSTAR != paValues[iReg].Reg64) + Log7(("NEM/%u: MSR LSTAR changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtx->msrLSTAR, paValues[iReg].Reg64)); + pCtx->msrLSTAR = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterCstar); + if (pCtx->msrCSTAR != paValues[iReg].Reg64) + Log7(("NEM/%u: MSR CSTAR 
changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtx->msrCSTAR, paValues[iReg].Reg64)); + pCtx->msrCSTAR = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterSfmask); + if (pCtx->msrSFMASK != paValues[iReg].Reg64) + Log7(("NEM/%u: MSR SFMASK changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtx->msrSFMASK, paValues[iReg].Reg64)); + pCtx->msrSFMASK = paValues[iReg].Reg64; + iReg++; + } + if (fWhat & CPUMCTX_EXTRN_OTHER_MSRS) + { + Assert(pInput->Names[iReg] == HvX64RegisterApicBase); + const uint64_t uOldBase = APICGetBaseMsrNoCheck(pVCpu); + if (paValues[iReg].Reg64 != uOldBase) + { + Log7(("NEM/%u: MSR APICBase changed %RX64 -> %RX64 (%RX64)\n", + pVCpu->idCpu, uOldBase, paValues[iReg].Reg64, paValues[iReg].Reg64 ^ uOldBase)); + int rc2 = APICSetBaseMsr(pVCpu, paValues[iReg].Reg64); + AssertLogRelMsg(rc2 == VINF_SUCCESS, ("rc2=%Rrc [%#RX64]\n", rc2, paValues[iReg].Reg64)); + } + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterPat); + if (pCtx->msrPAT != paValues[iReg].Reg64) + Log7(("NEM/%u: MSR PAT changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtx->msrPAT, paValues[iReg].Reg64)); + pCtx->msrPAT = paValues[iReg].Reg64; + iReg++; + +# if 0 /*def LOG_ENABLED*/ /** @todo something's wrong with HvX64RegisterMtrrCap? 
(AMD) */ + Assert(pInput->Names[iReg] == HvX64RegisterMtrrCap); + if (paValues[iReg].Reg64 != CPUMGetGuestIa32MtrrCap(pVCpu)) + Log7(("NEM/%u: MSR MTRR_CAP changed %RX64 -> %RX64 (!!)\n", pVCpu->idCpu, CPUMGetGuestIa32MtrrCap(pVCpu), paValues[iReg].Reg64)); + iReg++; +# endif + + PCPUMCTXMSRS pCtxMsrs = CPUMQueryGuestCtxMsrsPtr(pVCpu); + Assert(pInput->Names[iReg] == HvX64RegisterMtrrDefType); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MtrrDefType ) + Log7(("NEM/%u: MSR MTRR_DEF_TYPE changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MtrrDefType, paValues[iReg].Reg64)); + pCtxMsrs->msr.MtrrDefType = paValues[iReg].Reg64; + iReg++; + + /** @todo we dont keep state for HvX64RegisterMtrrPhysBaseX and HvX64RegisterMtrrPhysMaskX */ + + Assert(pInput->Names[iReg] == HvX64RegisterMtrrFix64k00000); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MtrrFix64K_00000 ) + Log7(("NEM/%u: MSR MTRR_FIX16K_00000 changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MtrrFix64K_00000, paValues[iReg].Reg64)); + pCtxMsrs->msr.MtrrFix64K_00000 = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterMtrrFix16k80000); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MtrrFix16K_80000 ) + Log7(("NEM/%u: MSR MTRR_FIX16K_80000 changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MtrrFix16K_80000, paValues[iReg].Reg64)); + pCtxMsrs->msr.MtrrFix16K_80000 = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterMtrrFix16kA0000); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MtrrFix16K_A0000 ) + Log7(("NEM/%u: MSR MTRR_FIX16K_A0000 changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MtrrFix16K_A0000, paValues[iReg].Reg64)); + pCtxMsrs->msr.MtrrFix16K_A0000 = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterMtrrFix4kC0000); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MtrrFix4K_C0000 ) + Log7(("NEM/%u: MSR MTRR_FIX16K_C0000 changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MtrrFix4K_C0000, paValues[iReg].Reg64)); + 
pCtxMsrs->msr.MtrrFix4K_C0000 = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterMtrrFix4kC8000); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MtrrFix4K_C8000 ) + Log7(("NEM/%u: MSR MTRR_FIX16K_C8000 changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MtrrFix4K_C8000, paValues[iReg].Reg64)); + pCtxMsrs->msr.MtrrFix4K_C8000 = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterMtrrFix4kD0000); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MtrrFix4K_D0000 ) + Log7(("NEM/%u: MSR MTRR_FIX16K_D0000 changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MtrrFix4K_D0000, paValues[iReg].Reg64)); + pCtxMsrs->msr.MtrrFix4K_D0000 = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterMtrrFix4kD8000); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MtrrFix4K_D8000 ) + Log7(("NEM/%u: MSR MTRR_FIX16K_D8000 changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MtrrFix4K_D8000, paValues[iReg].Reg64)); + pCtxMsrs->msr.MtrrFix4K_D8000 = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterMtrrFix4kE0000); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MtrrFix4K_E0000 ) + Log7(("NEM/%u: MSR MTRR_FIX16K_E0000 changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MtrrFix4K_E0000, paValues[iReg].Reg64)); + pCtxMsrs->msr.MtrrFix4K_E0000 = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterMtrrFix4kE8000); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MtrrFix4K_E8000 ) + Log7(("NEM/%u: MSR MTRR_FIX16K_E8000 changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MtrrFix4K_E8000, paValues[iReg].Reg64)); + pCtxMsrs->msr.MtrrFix4K_E8000 = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterMtrrFix4kF0000); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MtrrFix4K_F0000 ) + Log7(("NEM/%u: MSR MTRR_FIX16K_F0000 changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MtrrFix4K_F0000, paValues[iReg].Reg64)); + pCtxMsrs->msr.MtrrFix4K_F0000 = 
paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterMtrrFix4kF8000); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MtrrFix4K_F8000 ) + Log7(("NEM/%u: MSR MTRR_FIX16K_F8000 changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MtrrFix4K_F8000, paValues[iReg].Reg64)); + pCtxMsrs->msr.MtrrFix4K_F8000 = paValues[iReg].Reg64; + iReg++; + + Assert(pInput->Names[iReg] == HvX64RegisterTscAux); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.TscAux ) + Log7(("NEM/%u: MSR TSC_AUX changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.TscAux, paValues[iReg].Reg64)); + pCtxMsrs->msr.TscAux = paValues[iReg].Reg64; + iReg++; + +# if 0 /** @todo why can't we even read HvX64RegisterIa32MiscEnable? */ + if (enmCpuVendor != CPUMCPUVENDOR_AMD) + { + Assert(pInput->Names[iReg] == HvX64RegisterIa32MiscEnable); + if (paValues[iReg].Reg64 != pCtxMsrs->msr.MiscEnable) + Log7(("NEM/%u: MSR MISC_ENABLE changed %RX64 -> %RX64\n", pVCpu->idCpu, pCtxMsrs->msr.MiscEnable, paValues[iReg].Reg64)); + pCtxMsrs->msr.MiscEnable = paValues[iReg].Reg64; + iReg++; + } +# endif +# ifdef LOG_ENABLED + if (enmCpuVendor != CPUMCPUVENDOR_AMD) + { + Assert(pInput->Names[iReg] == HvX64RegisterIa32FeatureControl); + if (paValues[iReg].Reg64 != pCtx->hwvirt.vmx.Msrs.u64FeatCtrl) + Log7(("NEM/%u: MSR FEATURE_CONTROL changed %RX64 -> %RX64 (!!)\n", pVCpu->idCpu, pCtx->hwvirt.vmx.Msrs.u64FeatCtrl, paValues[iReg].Reg64)); + iReg++; + } +# endif + } + + /* Interruptibility. 
*/ + if (fWhat & (CPUMCTX_EXTRN_NEM_WIN_INHIBIT_INT | CPUMCTX_EXTRN_NEM_WIN_INHIBIT_NMI)) + { + Assert(pInput->Names[iReg] == HvRegisterInterruptState); + Assert(pInput->Names[iReg + 1] == HvX64RegisterRip); + + if (!(pCtx->fExtrn & CPUMCTX_EXTRN_NEM_WIN_INHIBIT_INT)) + { + pVCpu->nem.s.fLastInterruptShadow = paValues[iReg].InterruptState.InterruptShadow; + if (paValues[iReg].InterruptState.InterruptShadow) + EMSetInhibitInterruptsPC(pVCpu, paValues[iReg + 1].Reg64); + else + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_INHIBIT_INTERRUPTS); + } + + if (!(pCtx->fExtrn & CPUMCTX_EXTRN_NEM_WIN_INHIBIT_NMI)) + { + if (paValues[iReg].InterruptState.NmiMasked) + VMCPU_FF_SET(pVCpu, VMCPU_FF_BLOCK_NMIS); + else + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_BLOCK_NMIS); + } + + fWhat |= CPUMCTX_EXTRN_NEM_WIN_INHIBIT_INT | CPUMCTX_EXTRN_NEM_WIN_INHIBIT_NMI; + iReg += 2; + } + + /* Event injection. */ + /// @todo HvRegisterPendingInterruption + Assert(pInput->Names[iReg] == HvRegisterPendingInterruption); + if (paValues[iReg].PendingInterruption.InterruptionPending) + { + Log7(("PendingInterruption: type=%u vector=%#x errcd=%RTbool/%#x instr-len=%u nested=%u\n", + paValues[iReg].PendingInterruption.InterruptionType, paValues[iReg].PendingInterruption.InterruptionVector, + paValues[iReg].PendingInterruption.DeliverErrorCode, paValues[iReg].PendingInterruption.ErrorCode, + paValues[iReg].PendingInterruption.InstructionLength, paValues[iReg].PendingInterruption.NestedEvent)); + AssertMsg((paValues[iReg].PendingInterruption.AsUINT64 & UINT64_C(0xfc00)) == 0, + ("%#RX64\n", paValues[iReg].PendingInterruption.AsUINT64)); + } + + /// @todo HvRegisterPendingEvent0 + /// @todo HvRegisterPendingEvent1 + + /* Almost done, just update extrn flags and maybe change PGM mode. */ + pCtx->fExtrn &= ~fWhat; + if (!(pCtx->fExtrn & (CPUMCTX_EXTRN_ALL | (CPUMCTX_EXTRN_NEM_WIN_MASK & ~CPUMCTX_EXTRN_NEM_WIN_EVENT_INJECT)))) + pCtx->fExtrn = 0; + + /* Typical. 
*/ + if (!fMaybeChangedMode && !fUpdateCr3) + return VINF_SUCCESS; + + /* + * Slow. + */ + int rc = VINF_SUCCESS; + if (fMaybeChangedMode) + { + rc = PGMChangeMode(pVCpu, pCtx->cr0, pCtx->cr4, pCtx->msrEFER); + AssertMsgReturn(rc == VINF_SUCCESS, ("rc=%Rrc\n", rc), RT_FAILURE_NP(rc) ? rc : VERR_NEM_IPE_1); + } + + if (fUpdateCr3) + { + if (fCanUpdateCr3) + { + LogFlow(("nemR0WinImportState: -> PGMUpdateCR3!\n")); + rc = PGMUpdateCR3(pVCpu, pCtx->cr3); + AssertMsgReturn(rc == VINF_SUCCESS, ("rc=%Rrc\n", rc), RT_FAILURE_NP(rc) ? rc : VERR_NEM_IPE_2); + } + else + { + LogFlow(("nemR0WinImportState: -> VERR_NEM_FLUSH_TLB!\n")); + rc = VERR_NEM_FLUSH_TLB; /* Calling PGMFlushTLB w/o long jump setup doesn't work, ring-3 does it. */ + } + } + + return rc; +} +#endif /* NEM_WIN_WITH_RING0_RUNLOOP || NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS */ + + +/** + * Import the state from the native API (back to CPUMCTX). + * + * @returns VBox status code + * @param pGVM The ring-0 VM handle. + * @param pVM The cross context VM handle. + * @param idCpu The calling EMT. Necessary for getting the + * hypercall page and arguments. + * @param fWhat What to import, CPUMCTX_EXTRN_XXX. Set + * CPUMCTX_EXTERN_ALL for everything. + */ +VMMR0_INT_DECL(int) NEMR0ImportState(PGVM pGVM, PVM pVM, VMCPUID idCpu, uint64_t fWhat) +{ +#if defined(NEM_WIN_WITH_RING0_RUNLOOP) || defined(NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS) + /* + * Validate the call. + */ + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_SUCCESS(rc)) + { + PVMCPU pVCpu = &pVM->aCpus[idCpu]; + PGVMCPU pGVCpu = &pGVM->aCpus[idCpu]; + AssertReturn(g_pfnHvlInvokeHypercall, VERR_NEM_MISSING_KERNEL_API); + + /* + * Call worker. 
+ */ + rc = nemR0WinImportState(pGVM, pGVCpu, &pVCpu->cpum.GstCtx, fWhat, false /*fCanUpdateCr3*/); + } + return rc; +#else + RT_NOREF(pGVM, pVM, idCpu, fWhat); + return VERR_NOT_IMPLEMENTED; +#endif +} + + +#if defined(NEM_WIN_WITH_RING0_RUNLOOP) || defined(NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS) +/** + * Worker for NEMR0QueryCpuTick and the ring-0 NEMHCQueryCpuTick. + * + * @returns VBox status code. + * @param pGVM The ring-0 VM handle. + * @param pGVCpu The ring-0 VCPU handle. + * @param pcTicks Where to return the current CPU tick count. + * @param pcAux Where to return the hyper-V TSC_AUX value. Optional. + */ +NEM_TMPL_STATIC int nemR0WinQueryCpuTick(PGVM pGVM, PGVMCPU pGVCpu, uint64_t *pcTicks, uint32_t *pcAux) +{ + /* + * Hypercall parameters. + */ + HV_INPUT_GET_VP_REGISTERS *pInput = (HV_INPUT_GET_VP_REGISTERS *)pGVCpu->nem.s.HypercallData.pbPage; + AssertPtrReturn(pInput, VERR_INTERNAL_ERROR_3); + AssertReturn(g_pfnHvlInvokeHypercall, VERR_NEM_MISSING_KERNEL_API); + + pInput->PartitionId = pGVM->nem.s.idHvPartition; + pInput->VpIndex = pGVCpu->idCpu; + pInput->fFlags = 0; + pInput->Names[0] = HvX64RegisterTsc; + pInput->Names[1] = HvX64RegisterTscAux; + + size_t const cbInput = RT_ALIGN_Z(RT_UOFFSETOF(HV_INPUT_GET_VP_REGISTERS, Names[2]), 32); + HV_REGISTER_VALUE *paValues = (HV_REGISTER_VALUE *)((uint8_t *)pInput + cbInput); + RT_BZERO(paValues, sizeof(paValues[0]) * 2); + + /* + * Make the hypercall. + */ + uint64_t uResult = g_pfnHvlInvokeHypercall(HV_MAKE_CALL_INFO(HvCallGetVpRegisters, 2), + pGVCpu->nem.s.HypercallData.HCPhysPage, + pGVCpu->nem.s.HypercallData.HCPhysPage + cbInput); + AssertLogRelMsgReturn(uResult == HV_MAKE_CALL_REP_RET(2), ("uResult=%RX64 cRegs=%#x\n", uResult, 2), + VERR_NEM_GET_REGISTERS_FAILED); + + /* + * Get results. 
+ */ + *pcTicks = paValues[0].Reg64; + if (pcAux) + *pcAux = paValues[0].Reg32; + return VINF_SUCCESS; +} +#endif /* NEM_WIN_WITH_RING0_RUNLOOP || NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS */ + + +/** + * Queries the TSC and TSC_AUX values, putting the results in . + * + * @returns VBox status code + * @param pGVM The ring-0 VM handle. + * @param pVM The cross context VM handle. + * @param idCpu The calling EMT. Necessary for getting the + * hypercall page and arguments. + */ +VMMR0_INT_DECL(int) NEMR0QueryCpuTick(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ +#if defined(NEM_WIN_WITH_RING0_RUNLOOP) || defined(NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS) + /* + * Validate the call. + */ + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_SUCCESS(rc)) + { + PVMCPU pVCpu = &pVM->aCpus[idCpu]; + PGVMCPU pGVCpu = &pGVM->aCpus[idCpu]; + AssertReturn(g_pfnHvlInvokeHypercall, VERR_NEM_MISSING_KERNEL_API); + + /* + * Call worker. + */ + pVCpu->nem.s.Hypercall.QueryCpuTick.cTicks = 0; + pVCpu->nem.s.Hypercall.QueryCpuTick.uAux = 0; + rc = nemR0WinQueryCpuTick(pGVM, pGVCpu, &pVCpu->nem.s.Hypercall.QueryCpuTick.cTicks, + &pVCpu->nem.s.Hypercall.QueryCpuTick.uAux); + } + return rc; +#else + RT_NOREF(pGVM, pVM, idCpu); + return VERR_NOT_IMPLEMENTED; +#endif +} + + +#if defined(NEM_WIN_WITH_RING0_RUNLOOP) || defined(NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS) +/** + * Worker for NEMR0ResumeCpuTickOnAll and the ring-0 NEMHCResumeCpuTickOnAll. + * + * @returns VBox status code. + * @param pGVM The ring-0 VM handle. + * @param pGVCpu The ring-0 VCPU handle. + * @param uPausedTscValue The TSC value at the time of pausing. + */ +NEM_TMPL_STATIC int nemR0WinResumeCpuTickOnAll(PGVM pGVM, PGVMCPU pGVCpu, uint64_t uPausedTscValue) +{ + AssertReturn(g_pfnHvlInvokeHypercall, VERR_NEM_MISSING_KERNEL_API); + + /* + * Set up the hypercall parameters. 
+ */ + HV_INPUT_SET_VP_REGISTERS *pInput = (HV_INPUT_SET_VP_REGISTERS *)pGVCpu->nem.s.HypercallData.pbPage; + AssertPtrReturn(pInput, VERR_INTERNAL_ERROR_3); + + pInput->PartitionId = pGVM->nem.s.idHvPartition; + pInput->VpIndex = 0; + pInput->RsvdZ = 0; + pInput->Elements[0].Name = HvX64RegisterTsc; + pInput->Elements[0].Pad0 = 0; + pInput->Elements[0].Pad1 = 0; + pInput->Elements[0].Value.Reg128.High64 = 0; + pInput->Elements[0].Value.Reg64 = uPausedTscValue; + + /* + * Disable interrupts and do the first virtual CPU. + */ + RTCCINTREG const fSavedFlags = ASMIntDisableFlags(); + uint64_t const uFirstTsc = ASMReadTSC(); + uint64_t uResult = g_pfnHvlInvokeHypercall(HV_MAKE_CALL_INFO(HvCallSetVpRegisters, 1), + pGVCpu->nem.s.HypercallData.HCPhysPage, 0 /* no output */); + AssertLogRelMsgReturnStmt(uResult == HV_MAKE_CALL_REP_RET(1), ("uResult=%RX64 uTsc=%#RX64\n", uResult, uPausedTscValue), + ASMSetFlags(fSavedFlags), VERR_NEM_SET_TSC); + + /* + * Do secondary processors, adjusting for elapsed TSC and keeping finger crossed + * that we don't introduce too much drift here. + */ + for (VMCPUID iCpu = 1; iCpu < pGVM->cCpus; iCpu++) + { + Assert(pInput->PartitionId == pGVM->nem.s.idHvPartition); + Assert(pInput->RsvdZ == 0); + Assert(pInput->Elements[0].Name == HvX64RegisterTsc); + Assert(pInput->Elements[0].Pad0 == 0); + Assert(pInput->Elements[0].Pad1 == 0); + Assert(pInput->Elements[0].Value.Reg128.High64 == 0); + + pInput->VpIndex = iCpu; + const uint64_t offDelta = (ASMReadTSC() - uFirstTsc); + pInput->Elements[0].Value.Reg64 = uPausedTscValue + offDelta; + + uResult = g_pfnHvlInvokeHypercall(HV_MAKE_CALL_INFO(HvCallSetVpRegisters, 1), + pGVCpu->nem.s.HypercallData.HCPhysPage, 0 /* no output */); + AssertLogRelMsgReturnStmt(uResult == HV_MAKE_CALL_REP_RET(1), + ("uResult=%RX64 uTsc=%#RX64 + %#RX64\n", uResult, uPausedTscValue, offDelta), + ASMSetFlags(fSavedFlags), VERR_NEM_SET_TSC); + } + + /* + * Done. 
+ */ + ASMSetFlags(fSavedFlags); + return VINF_SUCCESS; +} +#endif /* NEM_WIN_WITH_RING0_RUNLOOP || NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS */ + + +/** + * Sets the TSC register to @a uPausedTscValue on all CPUs. + * + * @returns VBox status code + * @param pGVM The ring-0 VM handle. + * @param pVM The cross context VM handle. + * @param idCpu The calling EMT. Necessary for getting the + * hypercall page and arguments. + * @param uPausedTscValue The TSC value at the time of pausing. + */ +VMMR0_INT_DECL(int) NEMR0ResumeCpuTickOnAll(PGVM pGVM, PVM pVM, VMCPUID idCpu, uint64_t uPausedTscValue) +{ +#if defined(NEM_WIN_WITH_RING0_RUNLOOP) || defined(NEM_WIN_USE_HYPERCALLS_FOR_REGISTERS) + /* + * Validate the call. + */ + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_SUCCESS(rc)) + { + PVMCPU pVCpu = &pVM->aCpus[idCpu]; + PGVMCPU pGVCpu = &pGVM->aCpus[idCpu]; + AssertReturn(g_pfnHvlInvokeHypercall, VERR_NEM_MISSING_KERNEL_API); + + /* + * Call worker. + */ + pVCpu->nem.s.Hypercall.QueryCpuTick.cTicks = 0; + pVCpu->nem.s.Hypercall.QueryCpuTick.uAux = 0; + rc = nemR0WinResumeCpuTickOnAll(pGVM, pGVCpu, uPausedTscValue); + } + return rc; +#else + RT_NOREF(pGVM, pVM, idCpu, uPausedTscValue); + return VERR_NOT_IMPLEMENTED; +#endif +} + + +VMMR0_INT_DECL(VBOXSTRICTRC) NEMR0RunGuestCode(PGVM pGVM, VMCPUID idCpu) +{ +#ifdef NEM_WIN_WITH_RING0_RUNLOOP + if (pGVM->nem.s.fMayUseRing0Runloop) + { + PVM pVM = pGVM->pVM; + return nemHCWinRunGC(pVM, &pVM->aCpus[idCpu], pGVM, &pGVM->aCpus[idCpu]); + } + return VERR_NEM_RING3_ONLY; +#else + RT_NOREF(pGVM, idCpu); + return VERR_NOT_IMPLEMENTED; +#endif +} + + +/** + * Updates statistics in the VM structure. + * + * @returns VBox status code. + * @param pGVM The ring-0 VM handle. + * @param pVM The cross context VM handle. + * @param idCpu The calling EMT, or NIL. Necessary for getting the hypercall + * page and arguments. 
+ */ +VMMR0_INT_DECL(int) NEMR0UpdateStatistics(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ + /* + * Validate the call. + */ + int rc; + if (idCpu == NIL_VMCPUID) + rc = GVMMR0ValidateGVMandVM(pGVM, pVM); + else + rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_SUCCESS(rc)) + { + AssertReturn(g_pfnHvlInvokeHypercall, VERR_NEM_MISSING_KERNEL_API); + + PNEMR0HYPERCALLDATA pHypercallData = idCpu != NIL_VMCPUID + ? &pGVM->aCpus[idCpu].nem.s.HypercallData + : &pGVM->nem.s.HypercallData; + if ( RT_VALID_PTR(pHypercallData->pbPage) + && pHypercallData->HCPhysPage != NIL_RTHCPHYS) + { + if (idCpu == NIL_VMCPUID) + rc = RTCritSectEnter(&pGVM->nem.s.HypercallDataCritSect); + if (RT_SUCCESS(rc)) + { + /* + * Query the memory statistics for the partition. + */ + HV_INPUT_GET_MEMORY_BALANCE *pInput = (HV_INPUT_GET_MEMORY_BALANCE *)pHypercallData->pbPage; + pInput->TargetPartitionId = pGVM->nem.s.idHvPartition; + pInput->ProximityDomainInfo.Flags.ProximityPreferred = 0; + pInput->ProximityDomainInfo.Flags.ProxyimityInfoValid = 0; + pInput->ProximityDomainInfo.Flags.Reserved = 0; + pInput->ProximityDomainInfo.Id = 0; + + HV_OUTPUT_GET_MEMORY_BALANCE *pOutput = (HV_OUTPUT_GET_MEMORY_BALANCE *)(pInput + 1); + RT_ZERO(*pOutput); + + uint64_t uResult = g_pfnHvlInvokeHypercall(HvCallGetMemoryBalance, + pHypercallData->HCPhysPage, + pHypercallData->HCPhysPage + sizeof(*pInput)); + if (uResult == HV_STATUS_SUCCESS) + { + pVM->nem.s.R0Stats.cPagesAvailable = pOutput->PagesAvailable; + pVM->nem.s.R0Stats.cPagesInUse = pOutput->PagesInUse; + rc = VINF_SUCCESS; + } + else + { + LogRel(("HvCallGetMemoryBalance -> %#RX64 (%#RX64 %#RX64)!!\n", + uResult, pOutput->PagesAvailable, pOutput->PagesInUse)); + rc = VERR_NEM_IPE_0; + } + + if (idCpu == NIL_VMCPUID) + RTCritSectLeave(&pGVM->nem.s.HypercallDataCritSect); + } + } + else + rc = VERR_WRONG_ORDER; + } + return rc; +} + + +#if 1 && defined(DEBUG_bird) +/** + * Debug only interface for poking around and exploring Hyper-V stuff. 
+ * + * @param pGVM The ring-0 VM handle. + * @param pVM The cross context VM handle. + * @param idCpu The calling EMT. + * @param u64Arg What to query. 0 == registers. + */ +VMMR0_INT_DECL(int) NEMR0DoExperiment(PGVM pGVM, PVM pVM, VMCPUID idCpu, uint64_t u64Arg) +{ + /* + * Resolve CPU structures. + */ + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_SUCCESS(rc)) + { + AssertReturn(g_pfnHvlInvokeHypercall, VERR_NEM_MISSING_KERNEL_API); + + PGVMCPU pGVCpu = &pGVM->aCpus[idCpu]; + PVMCPU pVCpu = &pVM->aCpus[idCpu]; + if (u64Arg == 0) + { + /* + * Query register. + */ + HV_INPUT_GET_VP_REGISTERS *pInput = (HV_INPUT_GET_VP_REGISTERS *)pGVCpu->nem.s.HypercallData.pbPage; + AssertPtrReturn(pInput, VERR_INTERNAL_ERROR_3); + + size_t const cbInput = RT_ALIGN_Z(RT_UOFFSETOF(HV_INPUT_GET_VP_REGISTERS, Names[1]), 32); + HV_REGISTER_VALUE *paValues = (HV_REGISTER_VALUE *)((uint8_t *)pInput + cbInput); + RT_BZERO(paValues, sizeof(paValues[0]) * 1); + + pInput->PartitionId = pGVM->nem.s.idHvPartition; + pInput->VpIndex = pGVCpu->idCpu; + pInput->fFlags = 0; + pInput->Names[0] = (HV_REGISTER_NAME)pVCpu->nem.s.Hypercall.Experiment.uItem; + + uint64_t uResult = g_pfnHvlInvokeHypercall(HV_MAKE_CALL_INFO(HvCallGetVpRegisters, 1), + pGVCpu->nem.s.HypercallData.HCPhysPage, + pGVCpu->nem.s.HypercallData.HCPhysPage + cbInput); + pVCpu->nem.s.Hypercall.Experiment.fSuccess = uResult == HV_MAKE_CALL_REP_RET(1); + pVCpu->nem.s.Hypercall.Experiment.uStatus = uResult; + pVCpu->nem.s.Hypercall.Experiment.uLoValue = paValues[0].Reg128.Low64; + pVCpu->nem.s.Hypercall.Experiment.uHiValue = paValues[0].Reg128.High64; + rc = VINF_SUCCESS; + } + else if (u64Arg == 1) + { + /* + * Query partition property. 
+ */ + HV_INPUT_GET_PARTITION_PROPERTY *pInput = (HV_INPUT_GET_PARTITION_PROPERTY *)pGVCpu->nem.s.HypercallData.pbPage; + AssertPtrReturn(pInput, VERR_INTERNAL_ERROR_3); + + size_t const cbInput = RT_ALIGN_Z(sizeof(*pInput), 32); + HV_OUTPUT_GET_PARTITION_PROPERTY *pOutput = (HV_OUTPUT_GET_PARTITION_PROPERTY *)((uint8_t *)pInput + cbInput); + pOutput->PropertyValue = 0; + + pInput->PartitionId = pGVM->nem.s.idHvPartition; + pInput->PropertyCode = (HV_PARTITION_PROPERTY_CODE)pVCpu->nem.s.Hypercall.Experiment.uItem; + pInput->uPadding = 0; + + uint64_t uResult = g_pfnHvlInvokeHypercall(HvCallGetPartitionProperty, + pGVCpu->nem.s.HypercallData.HCPhysPage, + pGVCpu->nem.s.HypercallData.HCPhysPage + cbInput); + pVCpu->nem.s.Hypercall.Experiment.fSuccess = uResult == HV_STATUS_SUCCESS; + pVCpu->nem.s.Hypercall.Experiment.uStatus = uResult; + pVCpu->nem.s.Hypercall.Experiment.uLoValue = pOutput->PropertyValue; + pVCpu->nem.s.Hypercall.Experiment.uHiValue = 0; + rc = VINF_SUCCESS; + } + else if (u64Arg == 2) + { + /* + * Set register. 
+ */ + HV_INPUT_SET_VP_REGISTERS *pInput = (HV_INPUT_SET_VP_REGISTERS *)pGVCpu->nem.s.HypercallData.pbPage; + AssertPtrReturn(pInput, VERR_INTERNAL_ERROR_3); + RT_BZERO(pInput, RT_UOFFSETOF(HV_INPUT_SET_VP_REGISTERS, Elements[1])); + + pInput->PartitionId = pGVM->nem.s.idHvPartition; + pInput->VpIndex = pGVCpu->idCpu; + pInput->RsvdZ = 0; + pInput->Elements[0].Name = (HV_REGISTER_NAME)pVCpu->nem.s.Hypercall.Experiment.uItem; + pInput->Elements[0].Value.Reg128.High64 = pVCpu->nem.s.Hypercall.Experiment.uHiValue; + pInput->Elements[0].Value.Reg128.Low64 = pVCpu->nem.s.Hypercall.Experiment.uLoValue; + + uint64_t uResult = g_pfnHvlInvokeHypercall(HV_MAKE_CALL_INFO(HvCallSetVpRegisters, 1), + pGVCpu->nem.s.HypercallData.HCPhysPage, 0); + pVCpu->nem.s.Hypercall.Experiment.fSuccess = uResult == HV_MAKE_CALL_REP_RET(1); + pVCpu->nem.s.Hypercall.Experiment.uStatus = uResult; + rc = VINF_SUCCESS; + } + else + rc = VERR_INVALID_FUNCTION; + } + return rc; +} +#endif /* DEBUG_bird */ + diff --git a/src/VBox/VMM/VMMR0/PDMR0Device.cpp b/src/VBox/VMM/VMMR0/PDMR0Device.cpp new file mode 100644 index 00000000..e905f1d6 --- /dev/null +++ b/src/VBox/VMM/VMMR0/PDMR0Device.cpp @@ -0,0 +1,861 @@ +/* $Id: PDMR0Device.cpp $ */ +/** @file + * PDM - Pluggable Device and Driver Manager, R0 Device parts. + */ + +/* + * Copyright (C) 2006-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_PDM_DEVICE +#define PDMPCIDEV_INCLUDE_PRIVATE /* Hack to get pdmpcidevint.h included at the right point. */ +#include "PDMInternal.h" +#include <VBox/vmm/pdm.h> +#include <VBox/vmm/pgm.h> +#include <VBox/vmm/mm.h> +#include <VBox/vmm/vm.h> +#include <VBox/vmm/vmm.h> +#include <VBox/vmm/patm.h> +#include <VBox/vmm/hm.h> +#include <VBox/vmm/apic.h> + +#include <VBox/log.h> +#include <VBox/err.h> +#include <VBox/vmm/gvmm.h> +#include <iprt/asm.h> +#include <iprt/assert.h> +#include <iprt/string.h> + +#include "dtrace/VBoxVMM.h" +#include "PDMInline.h" + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +RT_C_DECLS_BEGIN +extern DECLEXPORT(const PDMDEVHLPR0) g_pdmR0DevHlp; +extern DECLEXPORT(const PDMPICHLPR0) g_pdmR0PicHlp; +extern DECLEXPORT(const PDMIOAPICHLPR0) g_pdmR0IoApicHlp; +extern DECLEXPORT(const PDMPCIHLPR0) g_pdmR0PciHlp; +extern DECLEXPORT(const PDMHPETHLPR0) g_pdmR0HpetHlp; +extern DECLEXPORT(const PDMPCIRAWHLPR0) g_pdmR0PciRawHlp; +extern DECLEXPORT(const PDMDRVHLPR0) g_pdmR0DrvHlp; +RT_C_DECLS_END + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +static bool pdmR0IsaSetIrq(PVM pVM, int iIrq, int iLevel, uint32_t uTagSrc); + + + +/** @name Ring-0 Device Helpers + * @{ 
+ */ + +/** @interface_method_impl{PDMDEVHLPR0,pfnPCIPhysRead} */ +static DECLCALLBACK(int) pdmR0DevHlp_PCIPhysRead(PPDMDEVINS pDevIns, PPDMPCIDEV pPciDev, RTGCPHYS GCPhys, + void *pvBuf, size_t cbRead) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + if (!pPciDev) /* NULL is an alias for the default PCI device. */ + pPciDev = pDevIns->Internal.s.pHeadPciDevR0; + AssertReturn(pPciDev, VERR_PDM_NOT_PCI_DEVICE); + +#ifndef PDM_DO_NOT_RESPECT_PCI_BM_BIT + /* + * Just check the busmaster setting here and forward the request to the generic read helper. + */ + if (PCIDevIsBusmaster(pPciDev)) + { /* likely */ } + else + { + Log(("pdmRCDevHlp_PCIPhysRead: caller=%p/%d: returns %Rrc - Not bus master! GCPhys=%RGp cbRead=%#zx\n", + pDevIns, pDevIns->iInstance, VERR_PDM_NOT_PCI_BUS_MASTER, GCPhys, cbRead)); + memset(pvBuf, 0xff, cbRead); + return VERR_PDM_NOT_PCI_BUS_MASTER; + } +#endif + + return pDevIns->pHlpR0->pfnPhysRead(pDevIns, GCPhys, pvBuf, cbRead); +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnPCIPhysWrite} */ +static DECLCALLBACK(int) pdmR0DevHlp_PCIPhysWrite(PPDMDEVINS pDevIns, PPDMPCIDEV pPciDev, RTGCPHYS GCPhys, + const void *pvBuf, size_t cbWrite) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + if (!pPciDev) /* NULL is an alias for the default PCI device. */ + pPciDev = pDevIns->Internal.s.pHeadPciDevR0; + AssertReturn(pPciDev, VERR_PDM_NOT_PCI_DEVICE); + +#ifndef PDM_DO_NOT_RESPECT_PCI_BM_BIT + /* + * Just check the busmaster setting here and forward the request to the generic read helper. + */ + if (PCIDevIsBusmaster(pPciDev)) + { /* likely */ } + else + { + Log(("pdmRCDevHlp_PCIPhysWrite: caller=%p/%d: returns %Rrc - Not bus master! 
GCPhys=%RGp cbWrite=%#zx\n", + pDevIns, pDevIns->iInstance, VERR_PDM_NOT_PCI_BUS_MASTER, GCPhys, cbWrite)); + return VERR_PDM_NOT_PCI_BUS_MASTER; + } +#endif + + return pDevIns->pHlpR0->pfnPhysWrite(pDevIns, GCPhys, pvBuf, cbWrite); +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnPCISetIrq} */ +static DECLCALLBACK(void) pdmR0DevHlp_PCISetIrq(PPDMDEVINS pDevIns, PPDMPCIDEV pPciDev, int iIrq, int iLevel) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + if (!pPciDev) /* NULL is an alias for the default PCI device. */ + pPciDev = pDevIns->Internal.s.pHeadPciDevR0; + AssertReturnVoid(pPciDev); + LogFlow(("pdmR0DevHlp_PCISetIrq: caller=%p/%d: pPciDev=%p:{%#x} iIrq=%d iLevel=%d\n", + pDevIns, pDevIns->iInstance, pPciDev, pPciDev->uDevFn, iIrq, iLevel)); + PVM pVM = pDevIns->Internal.s.pVMR0; + PPDMPCIBUS pPciBus = pPciDev->Int.s.pPdmBusR0; + + pdmLock(pVM); + uint32_t uTagSrc; + if (iLevel & PDM_IRQ_LEVEL_HIGH) + { + pDevIns->Internal.s.uLastIrqTag = uTagSrc = pdmCalcIrqTag(pVM, pDevIns->idTracing); + if (iLevel == PDM_IRQ_LEVEL_HIGH) + VBOXVMM_PDM_IRQ_HIGH(VMMGetCpu(pVM), RT_LOWORD(uTagSrc), RT_HIWORD(uTagSrc)); + else + VBOXVMM_PDM_IRQ_HILO(VMMGetCpu(pVM), RT_LOWORD(uTagSrc), RT_HIWORD(uTagSrc)); + } + else + uTagSrc = pDevIns->Internal.s.uLastIrqTag; + + if ( pPciBus + && pPciBus->pDevInsR0) + { + pPciBus->pfnSetIrqR0(pPciBus->pDevInsR0, pPciDev, iIrq, iLevel, uTagSrc); + + pdmUnlock(pVM); + + if (iLevel == PDM_IRQ_LEVEL_LOW) + VBOXVMM_PDM_IRQ_LOW(VMMGetCpu(pVM), RT_LOWORD(uTagSrc), RT_HIWORD(uTagSrc)); + } + else + { + pdmUnlock(pVM); + + /* queue for ring-3 execution. 
*/ + PPDMDEVHLPTASK pTask = (PPDMDEVHLPTASK)PDMQueueAlloc(pVM->pdm.s.pDevHlpQueueR0); + AssertReturnVoid(pTask); + + pTask->enmOp = PDMDEVHLPTASKOP_PCI_SET_IRQ; + pTask->pDevInsR3 = PDMDEVINS_2_R3PTR(pDevIns); + pTask->u.PciSetIRQ.iIrq = iIrq; + pTask->u.PciSetIRQ.iLevel = iLevel; + pTask->u.PciSetIRQ.uTagSrc = uTagSrc; + pTask->u.PciSetIRQ.pPciDevR3 = MMHyperR0ToR3(pVM, pPciDev); + + PDMQueueInsertEx(pVM->pdm.s.pDevHlpQueueR0, &pTask->Core, 0); + } + + LogFlow(("pdmR0DevHlp_PCISetIrq: caller=%p/%d: returns void; uTagSrc=%#x\n", pDevIns, pDevIns->iInstance, uTagSrc)); +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnISASetIrq} */ +static DECLCALLBACK(void) pdmR0DevHlp_ISASetIrq(PPDMDEVINS pDevIns, int iIrq, int iLevel) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + LogFlow(("pdmR0DevHlp_ISASetIrq: caller=%p/%d: iIrq=%d iLevel=%d\n", pDevIns, pDevIns->iInstance, iIrq, iLevel)); + PVM pVM = pDevIns->Internal.s.pVMR0; + + pdmLock(pVM); + uint32_t uTagSrc; + if (iLevel & PDM_IRQ_LEVEL_HIGH) + { + pDevIns->Internal.s.uLastIrqTag = uTagSrc = pdmCalcIrqTag(pVM, pDevIns->idTracing); + if (iLevel == PDM_IRQ_LEVEL_HIGH) + VBOXVMM_PDM_IRQ_HIGH(VMMGetCpu(pVM), RT_LOWORD(uTagSrc), RT_HIWORD(uTagSrc)); + else + VBOXVMM_PDM_IRQ_HILO(VMMGetCpu(pVM), RT_LOWORD(uTagSrc), RT_HIWORD(uTagSrc)); + } + else + uTagSrc = pDevIns->Internal.s.uLastIrqTag; + + bool fRc = pdmR0IsaSetIrq(pVM, iIrq, iLevel, uTagSrc); + + if (iLevel == PDM_IRQ_LEVEL_LOW && fRc) + VBOXVMM_PDM_IRQ_LOW(VMMGetCpu(pVM), RT_LOWORD(uTagSrc), RT_HIWORD(uTagSrc)); + pdmUnlock(pVM); + LogFlow(("pdmR0DevHlp_ISASetIrq: caller=%p/%d: returns void; uTagSrc=%#x\n", pDevIns, pDevIns->iInstance, uTagSrc)); +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnIoApicSendMsi} */ +static DECLCALLBACK(void) pdmR0DevHlp_IoApicSendMsi(PPDMDEVINS pDevIns, RTGCPHYS GCPhys, uint32_t uValue) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + LogFlow(("pdmR0DevHlp_IoApicSendMsi: caller=%p/%d: GCPhys=%RGp uValue=%#x\n", pDevIns, pDevIns->iInstance, GCPhys, uValue)); 
+ PVM pVM = pDevIns->Internal.s.pVMR0; + + uint32_t uTagSrc; + pDevIns->Internal.s.uLastIrqTag = uTagSrc = pdmCalcIrqTag(pVM, pDevIns->idTracing); + VBOXVMM_PDM_IRQ_HILO(VMMGetCpu(pVM), RT_LOWORD(uTagSrc), RT_HIWORD(uTagSrc)); + + if (pVM->pdm.s.IoApic.pDevInsR0) + pVM->pdm.s.IoApic.pfnSendMsiR0(pVM->pdm.s.IoApic.pDevInsR0, GCPhys, uValue, uTagSrc); + else + AssertFatalMsgFailed(("Lazy bastards!")); + + LogFlow(("pdmR0DevHlp_IoApicSendMsi: caller=%p/%d: returns void; uTagSrc=%#x\n", pDevIns, pDevIns->iInstance, uTagSrc)); +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnPhysRead} */ +static DECLCALLBACK(int) pdmR0DevHlp_PhysRead(PPDMDEVINS pDevIns, RTGCPHYS GCPhys, void *pvBuf, size_t cbRead) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + LogFlow(("pdmR0DevHlp_PhysRead: caller=%p/%d: GCPhys=%RGp pvBuf=%p cbRead=%#x\n", + pDevIns, pDevIns->iInstance, GCPhys, pvBuf, cbRead)); + + VBOXSTRICTRC rcStrict = PGMPhysRead(pDevIns->Internal.s.pVMR0, GCPhys, pvBuf, cbRead, PGMACCESSORIGIN_DEVICE); + AssertMsg(rcStrict == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); /** @todo track down the users for this bugger. */ + + Log(("pdmR0DevHlp_PhysRead: caller=%p/%d: returns %Rrc\n", pDevIns, pDevIns->iInstance, VBOXSTRICTRC_VAL(rcStrict) )); + return VBOXSTRICTRC_VAL(rcStrict); +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnPhysWrite} */ +static DECLCALLBACK(int) pdmR0DevHlp_PhysWrite(PPDMDEVINS pDevIns, RTGCPHYS GCPhys, const void *pvBuf, size_t cbWrite) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + LogFlow(("pdmR0DevHlp_PhysWrite: caller=%p/%d: GCPhys=%RGp pvBuf=%p cbWrite=%#x\n", + pDevIns, pDevIns->iInstance, GCPhys, pvBuf, cbWrite)); + + VBOXSTRICTRC rcStrict = PGMPhysWrite(pDevIns->Internal.s.pVMR0, GCPhys, pvBuf, cbWrite, PGMACCESSORIGIN_DEVICE); + AssertMsg(rcStrict == VINF_SUCCESS, ("%Rrc\n", VBOXSTRICTRC_VAL(rcStrict))); /** @todo track down the users for this bugger. 
*/ + + Log(("pdmR0DevHlp_PhysWrite: caller=%p/%d: returns %Rrc\n", pDevIns, pDevIns->iInstance, VBOXSTRICTRC_VAL(rcStrict) )); + return VBOXSTRICTRC_VAL(rcStrict); +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnA20IsEnabled} */ +static DECLCALLBACK(bool) pdmR0DevHlp_A20IsEnabled(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + LogFlow(("pdmR0DevHlp_A20IsEnabled: caller=%p/%d:\n", pDevIns, pDevIns->iInstance)); + + bool fEnabled = PGMPhysIsA20Enabled(VMMGetCpu(pDevIns->Internal.s.pVMR0)); + + Log(("pdmR0DevHlp_A20IsEnabled: caller=%p/%d: returns %RTbool\n", pDevIns, pDevIns->iInstance, fEnabled)); + return fEnabled; +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnVMState} */ +static DECLCALLBACK(VMSTATE) pdmR0DevHlp_VMState(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + + VMSTATE enmVMState = pDevIns->Internal.s.pVMR0->enmVMState; + + LogFlow(("pdmR0DevHlp_VMState: caller=%p/%d: returns %d\n", pDevIns, pDevIns->iInstance, enmVMState)); + return enmVMState; +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnVMSetError} */ +static DECLCALLBACK(int) pdmR0DevHlp_VMSetError(PPDMDEVINS pDevIns, int rc, RT_SRC_POS_DECL, const char *pszFormat, ...) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + va_list args; + va_start(args, pszFormat); + int rc2 = VMSetErrorV(pDevIns->Internal.s.pVMR0, rc, RT_SRC_POS_ARGS, pszFormat, args); Assert(rc2 == rc); NOREF(rc2); + va_end(args); + return rc; +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnVMSetErrorV} */ +static DECLCALLBACK(int) pdmR0DevHlp_VMSetErrorV(PPDMDEVINS pDevIns, int rc, RT_SRC_POS_DECL, const char *pszFormat, va_list va) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + int rc2 = VMSetErrorV(pDevIns->Internal.s.pVMR0, rc, RT_SRC_POS_ARGS, pszFormat, va); Assert(rc2 == rc); NOREF(rc2); + return rc; +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnVMSetRuntimeError} */ +static DECLCALLBACK(int) pdmR0DevHlp_VMSetRuntimeError(PPDMDEVINS pDevIns, uint32_t fFlags, const char *pszErrorId, const char *pszFormat, ...) 
+{ + PDMDEV_ASSERT_DEVINS(pDevIns); + va_list va; + va_start(va, pszFormat); + int rc = VMSetRuntimeErrorV(pDevIns->Internal.s.pVMR0, fFlags, pszErrorId, pszFormat, va); + va_end(va); + return rc; +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnVMSetRuntimeErrorV} */ +static DECLCALLBACK(int) pdmR0DevHlp_VMSetRuntimeErrorV(PPDMDEVINS pDevIns, uint32_t fFlags, const char *pszErrorId, const char *pszFormat, va_list va) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + int rc = VMSetRuntimeErrorV(pDevIns->Internal.s.pVMR0, fFlags, pszErrorId, pszFormat, va); + return rc; +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnPATMSetMMIOPatchInfo} */ +static DECLCALLBACK(int) pdmR0DevHlp_PATMSetMMIOPatchInfo(PPDMDEVINS pDevIns, RTGCPHYS GCPhys, RTGCPTR pCachedData) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + LogFlow(("pdmR0DevHlp_PATMSetMMIOPatchInfo: caller=%p/%d:\n", pDevIns, pDevIns->iInstance)); + + AssertFailed(); + NOREF(GCPhys); NOREF(pCachedData); NOREF(pDevIns); + +/* return PATMSetMMIOPatchInfo(pDevIns->Internal.s.pVMR0, GCPhys, pCachedData); */ + return VINF_SUCCESS; +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnGetVM} */ +static DECLCALLBACK(PVM) pdmR0DevHlp_GetVM(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + LogFlow(("pdmR0DevHlp_GetVM: caller='%p'/%d\n", pDevIns, pDevIns->iInstance)); + return pDevIns->Internal.s.pVMR0; +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnGetVMCPU} */ +static DECLCALLBACK(PVMCPU) pdmR0DevHlp_GetVMCPU(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + LogFlow(("pdmR0DevHlp_GetVMCPU: caller='%p'/%d\n", pDevIns, pDevIns->iInstance)); + return VMMGetCpu(pDevIns->Internal.s.pVMR0); +} + + +/** @interface_method_impl{PDMDEVHLPRC,pfnGetCurrentCpuId} */ +static DECLCALLBACK(VMCPUID) pdmR0DevHlp_GetCurrentCpuId(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + VMCPUID idCpu = VMMGetCpuId(pDevIns->Internal.s.pVMR0); + LogFlow(("pdmR0DevHlp_GetCurrentCpuId: caller='%p'/%d for CPU %u\n", pDevIns, pDevIns->iInstance, idCpu)); + 
return idCpu; +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnTMTimeVirtGet} */ +static DECLCALLBACK(uint64_t) pdmR0DevHlp_TMTimeVirtGet(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + LogFlow(("pdmR0DevHlp_TMTimeVirtGet: caller='%p'/%d\n", pDevIns, pDevIns->iInstance)); + return TMVirtualGet(pDevIns->Internal.s.pVMR0); +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnTMTimeVirtGetFreq} */ +static DECLCALLBACK(uint64_t) pdmR0DevHlp_TMTimeVirtGetFreq(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + LogFlow(("pdmR0DevHlp_TMTimeVirtGetFreq: caller='%p'/%d\n", pDevIns, pDevIns->iInstance)); + return TMVirtualGetFreq(pDevIns->Internal.s.pVMR0); +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnTMTimeVirtGetNano} */ +static DECLCALLBACK(uint64_t) pdmR0DevHlp_TMTimeVirtGetNano(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + LogFlow(("pdmR0DevHlp_TMTimeVirtGetNano: caller='%p'/%d\n", pDevIns, pDevIns->iInstance)); + return TMVirtualToNano(pDevIns->Internal.s.pVMR0, TMVirtualGet(pDevIns->Internal.s.pVMR0)); +} + + +/** @interface_method_impl{PDMDEVHLPR0,pfnDBGFTraceBuf} */ +static DECLCALLBACK(RTTRACEBUF) pdmR0DevHlp_DBGFTraceBuf(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + RTTRACEBUF hTraceBuf = pDevIns->Internal.s.pVMR0->hTraceBufR0; + LogFlow(("pdmR3DevHlp_DBGFTraceBuf: caller='%p'/%d: returns %p\n", pDevIns, pDevIns->iInstance, hTraceBuf)); + return hTraceBuf; +} + + +/** + * The Ring-0 Device Helper Callbacks. 
+ */ +extern DECLEXPORT(const PDMDEVHLPR0) g_pdmR0DevHlp = +{ + PDM_DEVHLPR0_VERSION, + pdmR0DevHlp_PCIPhysRead, + pdmR0DevHlp_PCIPhysWrite, + pdmR0DevHlp_PCISetIrq, + pdmR0DevHlp_ISASetIrq, + pdmR0DevHlp_IoApicSendMsi, + pdmR0DevHlp_PhysRead, + pdmR0DevHlp_PhysWrite, + pdmR0DevHlp_A20IsEnabled, + pdmR0DevHlp_VMState, + pdmR0DevHlp_VMSetError, + pdmR0DevHlp_VMSetErrorV, + pdmR0DevHlp_VMSetRuntimeError, + pdmR0DevHlp_VMSetRuntimeErrorV, + pdmR0DevHlp_PATMSetMMIOPatchInfo, + pdmR0DevHlp_GetVM, + pdmR0DevHlp_GetVMCPU, + pdmR0DevHlp_GetCurrentCpuId, + pdmR0DevHlp_TMTimeVirtGet, + pdmR0DevHlp_TMTimeVirtGetFreq, + pdmR0DevHlp_TMTimeVirtGetNano, + pdmR0DevHlp_DBGFTraceBuf, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + PDM_DEVHLPR0_VERSION +}; + +/** @} */ + + + + +/** @name PIC Ring-0 Helpers + * @{ + */ + +/** @interface_method_impl{PDMPICHLPR0,pfnSetInterruptFF} */ +static DECLCALLBACK(void) pdmR0PicHlp_SetInterruptFF(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + PVM pVM = pDevIns->Internal.s.pVMR0; + PVMCPU pVCpu = &pVM->aCpus[0]; /* for PIC we always deliver to CPU 0, MP use APIC */ + /** @todo r=ramshankar: Propagating rcRZ and make all callers handle it? */ + APICLocalInterrupt(pVCpu, 0 /* u8Pin */, 1 /* u8Level */, VINF_SUCCESS /* rcRZ */); +} + + +/** @interface_method_impl{PDMPICHLPR0,pfnClearInterruptFF} */ +static DECLCALLBACK(void) pdmR0PicHlp_ClearInterruptFF(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + PVM pVM = pDevIns->Internal.s.pVMR0; + PVMCPU pVCpu = &pVM->aCpus[0]; /* for PIC we always deliver to CPU 0, MP use APIC */ + /** @todo r=ramshankar: Propagating rcRZ and make all callers handle it? 
*/ + APICLocalInterrupt(pVCpu, 0 /* u8Pin */, 0 /* u8Level */, VINF_SUCCESS /* rcRZ */); +} + + +/** @interface_method_impl{PDMPICHLPR0,pfnLock} */ +static DECLCALLBACK(int) pdmR0PicHlp_Lock(PPDMDEVINS pDevIns, int rc) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + return pdmLockEx(pDevIns->Internal.s.pVMR0, rc); +} + + +/** @interface_method_impl{PDMPICHLPR0,pfnUnlock} */ +static DECLCALLBACK(void) pdmR0PicHlp_Unlock(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + pdmUnlock(pDevIns->Internal.s.pVMR0); +} + + +/** + * The Ring-0 PIC Helper Callbacks. + */ +extern DECLEXPORT(const PDMPICHLPR0) g_pdmR0PicHlp = +{ + PDM_PICHLPR0_VERSION, + pdmR0PicHlp_SetInterruptFF, + pdmR0PicHlp_ClearInterruptFF, + pdmR0PicHlp_Lock, + pdmR0PicHlp_Unlock, + PDM_PICHLPR0_VERSION +}; + +/** @} */ + + +/** @name I/O APIC Ring-0 Helpers + * @{ + */ + +/** @interface_method_impl{PDMIOAPICHLPR0,pfnApicBusDeliver} */ +static DECLCALLBACK(int) pdmR0IoApicHlp_ApicBusDeliver(PPDMDEVINS pDevIns, uint8_t u8Dest, uint8_t u8DestMode, + uint8_t u8DeliveryMode, uint8_t uVector, uint8_t u8Polarity, + uint8_t u8TriggerMode, uint32_t uTagSrc) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + PVM pVM = pDevIns->Internal.s.pVMR0; + LogFlow(("pdmR0IoApicHlp_ApicBusDeliver: caller=%p/%d: u8Dest=%RX8 u8DestMode=%RX8 u8DeliveryMode=%RX8 uVector=%RX8 u8Polarity=%RX8 u8TriggerMode=%RX8 uTagSrc=%#x\n", + pDevIns, pDevIns->iInstance, u8Dest, u8DestMode, u8DeliveryMode, uVector, u8Polarity, u8TriggerMode, uTagSrc)); + return APICBusDeliver(pVM, u8Dest, u8DestMode, u8DeliveryMode, uVector, u8Polarity, u8TriggerMode, uTagSrc); +} + + +/** @interface_method_impl{PDMIOAPICHLPR0,pfnLock} */ +static DECLCALLBACK(int) pdmR0IoApicHlp_Lock(PPDMDEVINS pDevIns, int rc) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + return pdmLockEx(pDevIns->Internal.s.pVMR0, rc); +} + + +/** @interface_method_impl{PDMIOAPICHLPR0,pfnUnlock} */ +static DECLCALLBACK(void) pdmR0IoApicHlp_Unlock(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + 
pdmUnlock(pDevIns->Internal.s.pVMR0); +} + + +/** + * The Ring-0 I/O APIC Helper Callbacks. + */ +extern DECLEXPORT(const PDMIOAPICHLPR0) g_pdmR0IoApicHlp = +{ + PDM_IOAPICHLPR0_VERSION, + pdmR0IoApicHlp_ApicBusDeliver, + pdmR0IoApicHlp_Lock, + pdmR0IoApicHlp_Unlock, + PDM_IOAPICHLPR0_VERSION +}; + +/** @} */ + + + + +/** @name PCI Bus Ring-0 Helpers + * @{ + */ + +/** @interface_method_impl{PDMPCIHLPR0,pfnIsaSetIrq} */ +static DECLCALLBACK(void) pdmR0PciHlp_IsaSetIrq(PPDMDEVINS pDevIns, int iIrq, int iLevel, uint32_t uTagSrc) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + Log4(("pdmR0PciHlp_IsaSetIrq: iIrq=%d iLevel=%d uTagSrc=%#x\n", iIrq, iLevel, uTagSrc)); + PVM pVM = pDevIns->Internal.s.pVMR0; + + pdmLock(pVM); + pdmR0IsaSetIrq(pVM, iIrq, iLevel, uTagSrc); + pdmUnlock(pVM); +} + + +/** @interface_method_impl{PDMPCIHLPR0,pfnIoApicSetIrq} */ +static DECLCALLBACK(void) pdmR0PciHlp_IoApicSetIrq(PPDMDEVINS pDevIns, int iIrq, int iLevel, uint32_t uTagSrc) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + Log4(("pdmR0PciHlp_IoApicSetIrq: iIrq=%d iLevel=%d uTagSrc=%#x\n", iIrq, iLevel, uTagSrc)); + PVM pVM = pDevIns->Internal.s.pVMR0; + + if (pVM->pdm.s.IoApic.pDevInsR0) + pVM->pdm.s.IoApic.pfnSetIrqR0(pVM->pdm.s.IoApic.pDevInsR0, iIrq, iLevel, uTagSrc); + else if (pVM->pdm.s.IoApic.pDevInsR3) + { + /* queue for ring-3 execution. 
*/ + PPDMDEVHLPTASK pTask = (PPDMDEVHLPTASK)PDMQueueAlloc(pVM->pdm.s.pDevHlpQueueR0); + if (pTask) + { + pTask->enmOp = PDMDEVHLPTASKOP_IOAPIC_SET_IRQ; + pTask->pDevInsR3 = NIL_RTR3PTR; /* not required */ + pTask->u.IoApicSetIRQ.iIrq = iIrq; + pTask->u.IoApicSetIRQ.iLevel = iLevel; + pTask->u.IoApicSetIRQ.uTagSrc = uTagSrc; + + PDMQueueInsertEx(pVM->pdm.s.pDevHlpQueueR0, &pTask->Core, 0); + } + else + AssertMsgFailed(("We're out of devhlp queue items!!!\n")); + } +} + + +/** @interface_method_impl{PDMPCIHLPR0,pfnIoApicSendMsi} */ +static DECLCALLBACK(void) pdmR0PciHlp_IoApicSendMsi(PPDMDEVINS pDevIns, RTGCPHYS GCPhys, uint32_t uValue, uint32_t uTagSrc) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + Log4(("pdmR0PciHlp_IoApicSendMsi: GCPhys=%p uValue=%d uTagSrc=%#x\n", GCPhys, uValue, uTagSrc)); + PVM pVM = pDevIns->Internal.s.pVMR0; + if (pVM->pdm.s.IoApic.pDevInsR0) + pVM->pdm.s.IoApic.pfnSendMsiR0(pVM->pdm.s.IoApic.pDevInsR0, GCPhys, uValue, uTagSrc); + else + AssertFatalMsgFailed(("Lazy bastards!")); +} + + +/** @interface_method_impl{PDMPCIHLPR0,pfnLock} */ +static DECLCALLBACK(int) pdmR0PciHlp_Lock(PPDMDEVINS pDevIns, int rc) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + return pdmLockEx(pDevIns->Internal.s.pVMR0, rc); +} + + +/** @interface_method_impl{PDMPCIHLPR0,pfnUnlock} */ +static DECLCALLBACK(void) pdmR0PciHlp_Unlock(PPDMDEVINS pDevIns) +{ + PDMDEV_ASSERT_DEVINS(pDevIns); + pdmUnlock(pDevIns->Internal.s.pVMR0); +} + + +/** + * The Ring-0 PCI Bus Helper Callbacks. + */ +extern DECLEXPORT(const PDMPCIHLPR0) g_pdmR0PciHlp = +{ + PDM_PCIHLPR0_VERSION, + pdmR0PciHlp_IsaSetIrq, + pdmR0PciHlp_IoApicSetIrq, + pdmR0PciHlp_IoApicSendMsi, + pdmR0PciHlp_Lock, + pdmR0PciHlp_Unlock, + PDM_PCIHLPR0_VERSION, /* the end */ +}; + +/** @} */ + + + + +/** @name HPET Ring-0 Helpers + * @{ + */ +/* none */ + +/** + * The Ring-0 HPET Helper Callbacks. 
+ */ +extern DECLEXPORT(const PDMHPETHLPR0) g_pdmR0HpetHlp = +{ + PDM_HPETHLPR0_VERSION, + PDM_HPETHLPR0_VERSION, /* the end */ +}; + +/** @} */ + + +/** @name Raw PCI Ring-0 Helpers + * @{ + */ +/* none */ + +/** + * The Ring-0 PCI raw Helper Callbacks. + */ +extern DECLEXPORT(const PDMPCIRAWHLPR0) g_pdmR0PciRawHlp = +{ + PDM_PCIRAWHLPR0_VERSION, + PDM_PCIRAWHLPR0_VERSION, /* the end */ +}; + +/** @} */ + + +/** @name Ring-0 Context Driver Helpers + * @{ + */ + +/** @interface_method_impl{PDMDRVHLPR0,pfnVMSetError} */ +static DECLCALLBACK(int) pdmR0DrvHlp_VMSetError(PPDMDRVINS pDrvIns, int rc, RT_SRC_POS_DECL, const char *pszFormat, ...) +{ + PDMDRV_ASSERT_DRVINS(pDrvIns); + va_list args; + va_start(args, pszFormat); + int rc2 = VMSetErrorV(pDrvIns->Internal.s.pVMR0, rc, RT_SRC_POS_ARGS, pszFormat, args); Assert(rc2 == rc); NOREF(rc2); + va_end(args); + return rc; +} + + +/** @interface_method_impl{PDMDRVHLPR0,pfnVMSetErrorV} */ +static DECLCALLBACK(int) pdmR0DrvHlp_VMSetErrorV(PPDMDRVINS pDrvIns, int rc, RT_SRC_POS_DECL, const char *pszFormat, va_list va) +{ + PDMDRV_ASSERT_DRVINS(pDrvIns); + int rc2 = VMSetErrorV(pDrvIns->Internal.s.pVMR0, rc, RT_SRC_POS_ARGS, pszFormat, va); Assert(rc2 == rc); NOREF(rc2); + return rc; +} + + +/** @interface_method_impl{PDMDRVHLPR0,pfnVMSetRuntimeError} */ +static DECLCALLBACK(int) pdmR0DrvHlp_VMSetRuntimeError(PPDMDRVINS pDrvIns, uint32_t fFlags, const char *pszErrorId, + const char *pszFormat, ...) 
+{ + PDMDRV_ASSERT_DRVINS(pDrvIns); + va_list va; + va_start(va, pszFormat); + int rc = VMSetRuntimeErrorV(pDrvIns->Internal.s.pVMR0, fFlags, pszErrorId, pszFormat, va); + va_end(va); + return rc; +} + + +/** @interface_method_impl{PDMDRVHLPR0,pfnVMSetRuntimeErrorV} */ +static DECLCALLBACK(int) pdmR0DrvHlp_VMSetRuntimeErrorV(PPDMDRVINS pDrvIns, uint32_t fFlags, const char *pszErrorId, + const char *pszFormat, va_list va) +{ + PDMDRV_ASSERT_DRVINS(pDrvIns); + int rc = VMSetRuntimeErrorV(pDrvIns->Internal.s.pVMR0, fFlags, pszErrorId, pszFormat, va); + return rc; +} + + +/** @interface_method_impl{PDMDRVHLPR0,pfnAssertEMT} */ +static DECLCALLBACK(bool) pdmR0DrvHlp_AssertEMT(PPDMDRVINS pDrvIns, const char *pszFile, unsigned iLine, const char *pszFunction) +{ + PDMDRV_ASSERT_DRVINS(pDrvIns); + if (VM_IS_EMT(pDrvIns->Internal.s.pVMR0)) + return true; + + RTAssertMsg1Weak("AssertEMT", iLine, pszFile, pszFunction); + RTAssertPanic(); + return false; +} + + +/** @interface_method_impl{PDMDRVHLPR0,pfnAssertOther} */ +static DECLCALLBACK(bool) pdmR0DrvHlp_AssertOther(PPDMDRVINS pDrvIns, const char *pszFile, unsigned iLine, const char *pszFunction) +{ + PDMDRV_ASSERT_DRVINS(pDrvIns); + if (!VM_IS_EMT(pDrvIns->Internal.s.pVMR0)) + return true; + + RTAssertMsg1Weak("AssertOther", iLine, pszFile, pszFunction); + RTAssertPanic(); + return false; +} + + +/** @interface_method_impl{PDMDRVHLPR0,pfnFTSetCheckpoint} */ +static DECLCALLBACK(int) pdmR0DrvHlp_FTSetCheckpoint(PPDMDRVINS pDrvIns, FTMCHECKPOINTTYPE enmType) +{ + PDMDRV_ASSERT_DRVINS(pDrvIns); + return FTMSetCheckpoint(pDrvIns->Internal.s.pVMR0, enmType); +} + + +/** + * The Ring-0 Context Driver Helper Callbacks. 
+ *
+ * The table starts and ends with the same version constant.
+ * NOTE(review): the constant used is PDM_DRVHLPRC_VERSION although this is
+ * the ring-0 table - confirm that no distinct R0 constant is expected here.
+ */
+extern DECLEXPORT(const PDMDRVHLPR0) g_pdmR0DrvHlp =
+{
+    PDM_DRVHLPRC_VERSION,
+    pdmR0DrvHlp_VMSetError,
+    pdmR0DrvHlp_VMSetErrorV,
+    pdmR0DrvHlp_VMSetRuntimeError,
+    pdmR0DrvHlp_VMSetRuntimeErrorV,
+    pdmR0DrvHlp_AssertEMT,
+    pdmR0DrvHlp_AssertOther,
+    pdmR0DrvHlp_FTSetCheckpoint,
+    PDM_DRVHLPRC_VERSION
+};
+
+/** @} */
+
+
+
+
+/**
+ * Sets an irq on the PIC and I/O APIC.
+ *
+ * @returns true if delivered, false if postponed.
+ * @param   pVM         The cross context VM structure.
+ * @param   iIrq        The irq.
+ * @param   iLevel      The new level.
+ * @param   uTagSrc     The IRQ tag and source.
+ *
+ * @remarks The caller holds the PDM lock.
+ */
+static bool pdmR0IsaSetIrq(PVM pVM, int iIrq, int iLevel, uint32_t uTagSrc)
+{
+    /*
+     * Deliver directly only when each controller that has a ring-3 instance
+     * also has a ring-0 instance; a controller with neither is simply skipped.
+     */
+    if (RT_LIKELY(    (   pVM->pdm.s.IoApic.pDevInsR0
+                       || !pVM->pdm.s.IoApic.pDevInsR3)
+                  &&  (   pVM->pdm.s.Pic.pDevInsR0
+                       || !pVM->pdm.s.Pic.pDevInsR3)))
+    {
+        if (pVM->pdm.s.Pic.pDevInsR0)
+            pVM->pdm.s.Pic.pfnSetIrqR0(pVM->pdm.s.Pic.pDevInsR0, iIrq, iLevel, uTagSrc);
+        if (pVM->pdm.s.IoApic.pDevInsR0)
+            pVM->pdm.s.IoApic.pfnSetIrqR0(pVM->pdm.s.IoApic.pDevInsR0, iIrq, iLevel, uTagSrc);
+        return true;
+    }
+
+    /* queue for ring-3 execution. */
+    PPDMDEVHLPTASK pTask = (PPDMDEVHLPTASK)PDMQueueAlloc(pVM->pdm.s.pDevHlpQueueR0);
+    AssertReturn(pTask, false);
+
+    pTask->enmOp = PDMDEVHLPTASKOP_ISA_SET_IRQ;
+    pTask->pDevInsR3 = NIL_RTR3PTR; /* not required */
+    pTask->u.IsaSetIRQ.iIrq = iIrq;
+    pTask->u.IsaSetIRQ.iLevel = iLevel;
+    pTask->u.IsaSetIRQ.uTagSrc = uTagSrc;
+
+    PDMQueueInsertEx(pVM->pdm.s.pDevHlpQueueR0, &pTask->Core, 0);
+    return false;
+}
+
+
+/**
+ * PDMDevHlpCallR0 helper.
+ *
+ * Validates the VM handles and the request, then calls the request handler
+ * with the device instance, operation and argument taken from the request.
+ *
+ * @returns See PFNPDMDEVREQHANDLERR0.  VERR_INVALID_POINTER or
+ *          VERR_INVALID_PARAMETER if request validation fails, or the failure
+ *          status of GVMMR0ValidateGVMandVM.
+ * @param   pGVM    The global (ring-0) VM structure. (For validation.)
+ * @param   pVM     The cross context VM structure. (For validation.)
+ * @param   pReq    Pointer to the request buffer.
+ */
+VMMR0_INT_DECL(int) PDMR0DeviceCallReqHandler(PGVM pGVM, PVM pVM, PPDMDEVICECALLREQHANDLERREQ pReq)
+{
+    /*
+     * Validate input and make the call.
+     */
+    int rc = GVMMR0ValidateGVMandVM(pGVM, pVM);
+    if (RT_SUCCESS(rc))
+    {
+        AssertPtrReturn(pReq, VERR_INVALID_POINTER);
+        /* sizeof yields size_t, so it is formatted with the 'z' size modifier. */
+        AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#zx\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER);
+
+        PPDMDEVINS pDevIns = pReq->pDevInsR0;
+        AssertPtrReturn(pDevIns, VERR_INVALID_POINTER);
+        AssertReturn(pDevIns->Internal.s.pVMR0 == pVM, VERR_INVALID_PARAMETER);
+
+        /* NOTE(review): the handler pointer is read from the request buffer and only
+           pointer-checked, whereas PDMR0DriverCallReqHandler reads it from the
+           instance data - confirm the request contents can be trusted here. */
+        PFNPDMDEVREQHANDLERR0 pfnReqHandlerR0 = pReq->pfnReqHandlerR0;
+        AssertPtrReturn(pfnReqHandlerR0, VERR_INVALID_POINTER);
+
+        rc = pfnReqHandlerR0(pDevIns, pReq->uOperation, pReq->u64Arg);
+    }
+    return rc;
+}
+
diff --git a/src/VBox/VMM/VMMR0/PDMR0Driver.cpp b/src/VBox/VMM/VMMR0/PDMR0Driver.cpp
new file mode 100644
index 00000000..bd6a528e
--- /dev/null
+++ b/src/VBox/VMM/VMMR0/PDMR0Driver.cpp
@@ -0,0 +1,64 @@
+/* $Id: PDMR0Driver.cpp $ */
+/** @file
+ * PDM - Pluggable Device and Driver Manager, R0 Driver parts.
+ */
+
+/*
+ * Copyright (C) 2010-2019 Oracle Corporation
+ *
+ * This file is part of VirtualBox Open Source Edition (OSE), as
+ * available from http://www.virtualbox.org. This file is free software;
+ * you can redistribute it and/or modify it under the terms of the GNU
+ * General Public License (GPL) as published by the Free Software
+ * Foundation, in version 2 as it comes in the "COPYING" file of the
+ * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+ * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ */
+
+
+/*********************************************************************************************************************************
+*   Header Files                                                                                                                 *
+*********************************************************************************************************************************/
+#define LOG_GROUP LOG_GROUP_PDM_DRIVER
+#include "PDMInternal.h"
+#include <VBox/vmm/pdm.h>
+#include <VBox/vmm/vm.h>
+#include <VBox/vmm/gvmm.h>
+
+#include <VBox/log.h>
+#include <iprt/errcore.h>
+#include <iprt/assert.h>
+
+
+
+/**
+ * PDMDrvHlpCallR0 helper.
+ *
+ * Validates the VM handles and the request, then calls the request handler
+ * registered in the driver instance data with the operation and argument
+ * taken from the request.
+ *
+ * @returns See PFNPDMDRVREQHANDLERR0.  VERR_INVALID_POINTER or
+ *          VERR_INVALID_PARAMETER if request validation fails, or the failure
+ *          status of GVMMR0ValidateGVMandVM.
+ * @param   pGVM    The global (ring-0) VM structure. (For validation.)
+ * @param   pVM     The cross context VM structure. (For validation.)
+ * @param   pReq    Pointer to the request buffer.
+ */
+VMMR0_INT_DECL(int) PDMR0DriverCallReqHandler(PGVM pGVM, PVM pVM, PPDMDRIVERCALLREQHANDLERREQ pReq)
+{
+    /*
+     * Validate input and make the call.
+     */
+    int rc = GVMMR0ValidateGVMandVM(pGVM, pVM);
+    if (RT_SUCCESS(rc))
+    {
+        AssertPtrReturn(pReq, VERR_INVALID_POINTER);
+        /* sizeof yields size_t, so it is formatted with the 'z' size modifier. */
+        AssertMsgReturn(pReq->Hdr.cbReq == sizeof(*pReq), ("%#x != %#zx\n", pReq->Hdr.cbReq, sizeof(*pReq)), VERR_INVALID_PARAMETER);
+
+        /* The instance must belong to the VM that was just validated. */
+        PPDMDRVINS pDrvIns = pReq->pDrvInsR0;
+        AssertPtrReturn(pDrvIns, VERR_INVALID_POINTER);
+        AssertReturn(pDrvIns->Internal.s.pVMR0 == pVM, VERR_INVALID_PARAMETER);
+
+        /* The handler comes from the instance data, not from the request. */
+        PFNPDMDRVREQHANDLERR0 pfnReqHandlerR0 = pDrvIns->Internal.s.pfnReqHandlerR0;
+        AssertPtrReturn(pfnReqHandlerR0, VERR_INVALID_POINTER);
+
+        rc = pfnReqHandlerR0(pDrvIns, pReq->uOperation, pReq->u64Arg);
+    }
+    return rc;
+}
+
diff --git a/src/VBox/VMM/VMMR0/PGMR0.cpp b/src/VBox/VMM/VMMR0/PGMR0.cpp
new file mode 100644
index 00000000..1cf8c74c
--- /dev/null
+++ b/src/VBox/VMM/VMMR0/PGMR0.cpp
@@ -0,0 +1,660 @@
+/* $Id: PGMR0.cpp $ */
+/** @file
+ * PGM - Page Manager and Monitor, Ring-0.
+ */ + +/* + * Copyright (C) 2007-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_PGM +#include <VBox/rawpci.h> +#include <VBox/vmm/pgm.h> +#include <VBox/vmm/gmm.h> +#include <VBox/vmm/gvm.h> +#include "PGMInternal.h" +#include <VBox/vmm/vm.h> +#include "PGMInline.h" +#include <VBox/log.h> +#include <VBox/err.h> +#include <iprt/assert.h> +#include <iprt/mem.h> + + +/* + * Instantiate the ring-0 header/code templates. + */ +/** @todo r=bird: Gotta love this nested paging hacking we're still carrying with us... (Split PGM_TYPE_NESTED.) */ +#define PGM_BTH_NAME(name) PGM_BTH_NAME_32BIT_PROT(name) +#include "PGMR0Bth.h" +#undef PGM_BTH_NAME + +#define PGM_BTH_NAME(name) PGM_BTH_NAME_PAE_PROT(name) +#include "PGMR0Bth.h" +#undef PGM_BTH_NAME + +#define PGM_BTH_NAME(name) PGM_BTH_NAME_AMD64_PROT(name) +#include "PGMR0Bth.h" +#undef PGM_BTH_NAME + +#define PGM_BTH_NAME(name) PGM_BTH_NAME_EPT_PROT(name) +#include "PGMR0Bth.h" +#undef PGM_BTH_NAME + + +/** + * Worker function for PGMR3PhysAllocateHandyPages and pgmPhysEnsureHandyPage. + * + * @returns The following VBox status codes. + * @retval VINF_SUCCESS on success. FF cleared. + * @retval VINF_EM_NO_MEMORY if we're out of memory. 
The FF is set in this case.
+ *          NOTE(review): the code below never produces VINF_EM_NO_MEMORY
+ *          itself; it returns VERR_NO_MEMORY on error injection and otherwise
+ *          whatever GMMR0AllocateHandyPages returned - confirm which status
+ *          callers really see.
+ *
+ * @param   pGVM        The global (ring-0) VM structure.
+ * @param   pVM         The cross context VM structure.
+ * @param   idCpu       The ID of the calling EMT.
+ *
+ * @thread  EMT(idCpu)
+ *
+ * @remarks Must be called from within the PGM critical section. The caller
+ *          must clear the new pages.
+ */
+VMMR0_INT_DECL(int) PGMR0PhysAllocateHandyPages(PGVM pGVM, PVM pVM, VMCPUID idCpu)
+{
+    /*
+     * Validate inputs.
+     */
+    AssertReturn(idCpu < pGVM->cCpus, VERR_INVALID_CPU_ID); /* caller already checked this, but just to be sure. */
+    AssertReturn(pGVM->aCpus[idCpu].hEMT == RTThreadNativeSelf(), VERR_NOT_OWNER);
+    PGM_LOCK_ASSERT_OWNER_EX(pVM, &pVM->aCpus[idCpu]);
+
+    /*
+     * Check for error injection.
+     */
+    if (RT_UNLIKELY(pVM->pgm.s.fErrInjHandyPages))
+        return VERR_NO_MEMORY;
+
+    /*
+     * Try allocate a full set of handy pages.
+     *
+     * iFirst is the number of entries already in use; the request covers the
+     * remaining tail of the array.  Nothing to do if the array is full.
+     */
+    uint32_t iFirst = pVM->pgm.s.cHandyPages;
+    AssertReturn(iFirst <= RT_ELEMENTS(pVM->pgm.s.aHandyPages), VERR_PGM_HANDY_PAGE_IPE);
+    uint32_t cPages = RT_ELEMENTS(pVM->pgm.s.aHandyPages) - iFirst;
+    if (!cPages)
+        return VINF_SUCCESS;
+    int rc = GMMR0AllocateHandyPages(pGVM, pVM, idCpu, cPages, cPages, &pVM->pgm.s.aHandyPages[iFirst]);
+    if (RT_SUCCESS(rc))
+    {
+#ifdef VBOX_STRICT
+        /* Strict builds: every entry of the array must now describe a valid private page. */
+        for (uint32_t i = 0; i < RT_ELEMENTS(pVM->pgm.s.aHandyPages); i++)
+        {
+            Assert(pVM->pgm.s.aHandyPages[i].idPage != NIL_GMM_PAGEID);
+            Assert(pVM->pgm.s.aHandyPages[i].idPage <= GMM_PAGEID_LAST);
+            Assert(pVM->pgm.s.aHandyPages[i].idSharedPage == NIL_GMM_PAGEID);
+            Assert(pVM->pgm.s.aHandyPages[i].HCPhysGCPhys != NIL_RTHCPHYS);
+            Assert(!(pVM->pgm.s.aHandyPages[i].HCPhysGCPhys & ~X86_PTE_PAE_PG_MASK));
+        }
+#endif
+
+        pVM->pgm.s.cHandyPages = RT_ELEMENTS(pVM->pgm.s.aHandyPages);
+    }
+    else if (rc != VERR_GMM_SEED_ME)
+    {
+        /* Only retry with fewer pages when a limit was hit AND we are below the minimum. */
+        if (    (   rc == VERR_GMM_HIT_GLOBAL_LIMIT
+                 || rc == VERR_GMM_HIT_VM_ACCOUNT_LIMIT)
+            &&  iFirst < PGM_HANDY_PAGES_MIN)
+        {
+
+#ifdef VBOX_STRICT
+            /* We're ASSUMING that GMM has updated all the entries before failing us. */
+            uint32_t i;
+            for (i = iFirst; i < RT_ELEMENTS(pVM->pgm.s.aHandyPages); i++)
+            {
+                Assert(pVM->pgm.s.aHandyPages[i].idPage == NIL_GMM_PAGEID);
+                Assert(pVM->pgm.s.aHandyPages[i].idSharedPage == NIL_GMM_PAGEID);
+                Assert(pVM->pgm.s.aHandyPages[i].HCPhysGCPhys == NIL_RTHCPHYS);
+            }
+#endif
+
+            /*
+             * Reduce the number of pages until we hit the minimum limit.
+             *
+             * The request is halved each round but clamped so that
+             * iFirst + cPages never drops below PGM_HANDY_PAGES_MIN.
+             */
+            do
+            {
+                cPages >>= 1;
+                if (cPages + iFirst < PGM_HANDY_PAGES_MIN)
+                    cPages = PGM_HANDY_PAGES_MIN - iFirst;
+                rc = GMMR0AllocateHandyPages(pGVM, pVM, idCpu, 0, cPages, &pVM->pgm.s.aHandyPages[iFirst]);
+            } while (   (   rc == VERR_GMM_HIT_GLOBAL_LIMIT
+                         || rc == VERR_GMM_HIT_VM_ACCOUNT_LIMIT)
+                     && cPages + iFirst > PGM_HANDY_PAGES_MIN);
+            if (RT_SUCCESS(rc))
+            {
+#ifdef VBOX_STRICT
+                /* Entries [0, iFirst + cPages) must be valid, the rest still NIL. */
+                i = iFirst + cPages;
+                while (i-- > 0)
+                {
+                    Assert(pVM->pgm.s.aHandyPages[i].idPage != NIL_GMM_PAGEID);
+                    Assert(pVM->pgm.s.aHandyPages[i].idPage <= GMM_PAGEID_LAST);
+                    Assert(pVM->pgm.s.aHandyPages[i].idSharedPage == NIL_GMM_PAGEID);
+                    Assert(pVM->pgm.s.aHandyPages[i].HCPhysGCPhys != NIL_RTHCPHYS);
+                    Assert(!(pVM->pgm.s.aHandyPages[i].HCPhysGCPhys & ~X86_PTE_PAE_PG_MASK));
+                }
+
+                for (i = cPages + iFirst; i < RT_ELEMENTS(pVM->pgm.s.aHandyPages); i++)
+                {
+                    Assert(pVM->pgm.s.aHandyPages[i].idPage == NIL_GMM_PAGEID);
+                    Assert(pVM->pgm.s.aHandyPages[i].idSharedPage == NIL_GMM_PAGEID);
+                    Assert(pVM->pgm.s.aHandyPages[i].HCPhysGCPhys == NIL_RTHCPHYS);
+                }
+#endif
+
+                pVM->pgm.s.cHandyPages = iFirst + cPages;
+            }
+        }
+
+        /* Still failing (and not merely asking to be seeded): log it and raise the no-memory force flag. */
+        if (RT_FAILURE(rc) && rc != VERR_GMM_SEED_ME)
+        {
+            LogRel(("PGMR0PhysAllocateHandyPages: rc=%Rrc iFirst=%d cPages=%d\n", rc, iFirst, cPages));
+            VM_FF_SET(pVM, VM_FF_PGM_NO_MEMORY);
+        }
+    }
+
+
+    LogFlow(("PGMR0PhysAllocateHandyPages: cPages=%d rc=%Rrc\n", cPages, rc));
+    return rc;
+}
+
+
+/**
+ * Flushes any changes pending in the handy page array.
+ *
+ * It is very important that this gets done when page sharing is enabled.
+ *
+ * @returns The following VBox status codes.
+ * @retval  VINF_SUCCESS on success. FF cleared.
+ *
+ * @param   pGVM        The global (ring-0) VM structure.
+ * @param   pVM         The cross context VM structure.
+ * @param   idCpu       The ID of the calling EMT.
+ *
+ * @thread  EMT(idCpu)
+ *
+ * @remarks Must be called from within the PGM critical section.
+ */
+VMMR0_INT_DECL(int) PGMR0PhysFlushHandyPages(PGVM pGVM, PVM pVM, VMCPUID idCpu)
+{
+    /*
+     * Validate inputs.
+     */
+    AssertReturn(idCpu < pGVM->cCpus, VERR_INVALID_CPU_ID); /* caller already checked this, but just to be sure. */
+    AssertReturn(pGVM->aCpus[idCpu].hEMT == RTThreadNativeSelf(), VERR_NOT_OWNER);
+    PGM_LOCK_ASSERT_OWNER_EX(pVM, &pVM->aCpus[idCpu]);
+
+    /*
+     * Hand the unused tail of the handy page array (entries from
+     * cHandyPages onwards) to GMM.  Unlike PGMR0PhysAllocateHandyPages the
+     * fifth argument is 0 here; presumably that means "update only, allocate
+     * nothing" - TODO confirm against GMMR0AllocateHandyPages.
+     */
+    uint32_t iFirst = pVM->pgm.s.cHandyPages;
+    AssertReturn(iFirst <= RT_ELEMENTS(pVM->pgm.s.aHandyPages), VERR_PGM_HANDY_PAGE_IPE);
+    uint32_t cPages = RT_ELEMENTS(pVM->pgm.s.aHandyPages) - iFirst;
+    if (!cPages)
+        return VINF_SUCCESS;
+    int rc = GMMR0AllocateHandyPages(pGVM, pVM, idCpu, cPages, 0, &pVM->pgm.s.aHandyPages[iFirst]);
+
+    LogFlow(("PGMR0PhysFlushHandyPages: cPages=%d rc=%Rrc\n", cPages, rc));
+    return rc;
+}
+
+
+/**
+ * Worker function for PGMR3PhysAllocateLargeHandyPage
+ *
+ * Asks GMM for one 2 MB page and, on success, records it as the single large
+ * handy page (cLargeHandyPages = 1).
+ *
+ * @returns The following VBox status codes.
+ * @retval  VINF_SUCCESS on success.
+ * @retval  VINF_EM_NO_MEMORY if we're out of memory.
+ *
+ * @param   pGVM        The global (ring-0) VM structure.
+ * @param   pVM         The cross context VM structure.
+ * @param   idCpu       The ID of the calling EMT.
+ *
+ * @thread  EMT(idCpu)
+ *
+ * @remarks Must be called from within the PGM critical section. The caller
+ *          must clear the new pages.
+ */
+VMMR0_INT_DECL(int) PGMR0PhysAllocateLargeHandyPage(PGVM pGVM, PVM pVM, VMCPUID idCpu)
+{
+    /*
+     * Validate inputs.
+     */
+    AssertReturn(idCpu < pGVM->cCpus, VERR_INVALID_CPU_ID); /* caller already checked this, but just to be sure. */
+    AssertReturn(pGVM->aCpus[idCpu].hEMT == RTThreadNativeSelf(), VERR_NOT_OWNER);
+    PGM_LOCK_ASSERT_OWNER_EX(pVM, &pVM->aCpus[idCpu]);
+    Assert(!pVM->pgm.s.cLargeHandyPages); /* the previous large page must have been consumed */
+
+    /*
+     * Do the job.
+     */
+    int rc = GMMR0AllocateLargePage(pGVM, pVM, idCpu, _2M,
+                                    &pVM->pgm.s.aLargeHandyPage[0].idPage,
+                                    &pVM->pgm.s.aLargeHandyPage[0].HCPhysGCPhys);
+    if (RT_SUCCESS(rc))
+        pVM->pgm.s.cLargeHandyPages = 1;
+
+    return rc;
+}
+
+
+#ifdef VBOX_WITH_PCI_PASSTHROUGH
+/* Interface sketch.  The interface belongs to a global PCI pass-through
+   manager.  It shall use the global VM handle, not the user VM handle to
+   store the per-VM info (domain) since that is all ring-0 stuff, thus
+   passing pGVM here.  I've tentatively prefixed the functions 'GPciRawR0',
+   we can discuss the PciRaw code re-organization when I'm back from
+   vacation.
+
+   I've implemented the initial IOMMU set up below.  For things to work
+   reliably, we will probably need to add a whole bunch of checks and
+   GPciRawR0GuestPageUpdate calls to the PGM code.  For the present,
+   assuming nested paging (enforced) and prealloc (enforced), no
+   ballooning (check missing), page sharing (check missing) or live
+   migration (check missing), it might work fine.  At least if some
+   VM power-off hook is present and can tear down the IOMMU page tables. */
+
+/**
+ * Tells the global PCI pass-through manager that we are about to set up the
+ * guest page to host page mappings for the specified VM.
+ *
+ * Currently a stub that always succeeds.
+ *
+ * @returns VBox status code.
+ *
+ * @param   pGVM        The ring-0 VM structure.
+ */
+VMMR0_INT_DECL(int) GPciRawR0GuestPageBeginAssignments(PGVM pGVM)
+{
+    NOREF(pGVM);
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Assigns a host page mapping for a guest page.
+ *
+ * This is only used when setting up the mappings, i.e. between
+ * GPciRawR0GuestPageBeginAssignments and GPciRawR0GuestPageEndAssignments.
+ *
+ * @returns VBox status code.
+ * @param   pGVM        The ring-0 VM structure.
+ * @param   GCPhys      The address of the guest page (page aligned).
+ * @param   HCPhys      The address of the host page (page aligned).
+ */
+VMMR0_INT_DECL(int) GPciRawR0GuestPageAssign(PGVM pGVM, RTGCPHYS GCPhys, RTHCPHYS HCPhys)
+{
+    AssertReturn(!(GCPhys & PAGE_OFFSET_MASK), VERR_INTERNAL_ERROR_3);
+    AssertReturn(!(HCPhys & PAGE_OFFSET_MASK), VERR_INTERNAL_ERROR_3);
+
+    /* The callback is optional; its return value is currently ignored. */
+    if (pGVM->rawpci.s.pfnContigMemInfo)
+        /** @todo what do we do on failure? */
+        pGVM->rawpci.s.pfnContigMemInfo(&pGVM->rawpci.s, HCPhys, GCPhys, PAGE_SIZE, PCIRAW_MEMINFO_MAP);
+
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Indicates that the specified guest page either doesn't exist or doesn't
+ * have a host page mapping we trust PCI pass-through with.
+ *
+ * This is only used when setting up the mappings, i.e. between
+ * GPciRawR0GuestPageBeginAssignments and GPciRawR0GuestPageEndAssignments.
+ *
+ * @returns VBox status code.
+ * @param   pGVM        The ring-0 VM structure.
+ * @param   GCPhys      The address of the guest page (page aligned).
+ */
+VMMR0_INT_DECL(int) GPciRawR0GuestPageUnassign(PGVM pGVM, RTGCPHYS GCPhys)
+{
+    AssertReturn(!(GCPhys & PAGE_OFFSET_MASK), VERR_INTERNAL_ERROR_3);
+
+    /* Same optional callback as for assignment, with a zero host address and the UNMAP operation. */
+    if (pGVM->rawpci.s.pfnContigMemInfo)
+        /** @todo what do we do on failure? */
+        pGVM->rawpci.s.pfnContigMemInfo(&pGVM->rawpci.s, 0, GCPhys, PAGE_SIZE, PCIRAW_MEMINFO_UNMAP);
+
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Tells the global PCI pass-through manager that we have completed setting up
+ * the guest page to host page mappings for the specified VM.
+ *
+ * This complements GPciRawR0GuestPageBeginAssignments and will be called even
+ * if some page assignment failed.
+ *
+ * Currently a stub that always succeeds.
+ *
+ * @returns VBox status code.
+ *
+ * @param   pGVM        The ring-0 VM structure.
+ */
+VMMR0_INT_DECL(int) GPciRawR0GuestPageEndAssignments(PGVM pGVM)
+{
+    NOREF(pGVM);
+    return VINF_SUCCESS;
+}
+
+
+/**
+ * Tells the global PCI pass-through manager that a guest page mapping has
+ * changed after the initial setup.
+ *
+ * @returns VBox status code.
+ * @param   pGVM        The ring-0 VM structure.
+ * @param   GCPhys      The address of the guest page (page aligned).
+ * @param   HCPhys      The new host page address or NIL_RTHCPHYS if
+ *                      now unassigned.
+ */
+VMMR0_INT_DECL(int) GPciRawR0GuestPageUpdate(PGVM pGVM, RTGCPHYS GCPhys, RTHCPHYS HCPhys)
+{
+    /* Only the alignment of the arguments is checked; no update is performed yet. */
+    AssertReturn(!(GCPhys & PAGE_OFFSET_MASK), VERR_INTERNAL_ERROR_4);
+    AssertReturn(!(HCPhys & PAGE_OFFSET_MASK) || HCPhys == NIL_RTHCPHYS, VERR_INTERNAL_ERROR_4);
+    NOREF(pGVM);
+    return VINF_SUCCESS;
+}
+
+#endif /* VBOX_WITH_PCI_PASSTHROUGH */
+
+
+/**
+ * Sets up the IOMMU when raw PCI device is enabled.
+ *
+ * @note    This is a hack that will probably be remodelled and refined later!
+ *
+ * @returns VBox status code.  VERR_NOT_SUPPORTED when PCI pass-through is not
+ *          compiled in or not enabled for the VM.  The first failure reported
+ *          while assigning pages stops the enumeration and is returned.
+ *
+ * @param   pGVM        The global (ring-0) VM structure.
+ * @param   pVM         The cross context VM structure.
+ */
+VMMR0_INT_DECL(int) PGMR0PhysSetupIoMmu(PGVM pGVM, PVM pVM)
+{
+    int rc = GVMMR0ValidateGVMandVM(pGVM, pVM);
+    if (RT_FAILURE(rc))
+        return rc;
+
+#ifdef VBOX_WITH_PCI_PASSTHROUGH
+    if (pVM->pgm.s.fPciPassthrough)
+    {
+        /*
+         * The Simplistic Approach - Enumerate all the pages and tell the
+         * IOMMU about each of them.
+         */
+        pgmLock(pVM);
+        rc = GPciRawR0GuestPageBeginAssignments(pGVM);
+        if (RT_SUCCESS(rc))
+        {
+            for (PPGMRAMRANGE pRam = pVM->pgm.s.pRamRangesXR0; RT_SUCCESS(rc) && pRam; pRam = pRam->pNextR0)
+            {
+                PPGMPAGE    pPage  = &pRam->aPages[0];
+                RTGCPHYS    GCPhys = pRam->GCPhys;
+                uint32_t    cLeft  = pRam->cb >> PAGE_SHIFT;
+                /* Stop at the first failure so a later page cannot overwrite
+                   the error status before the outer loop gets to see it. */
+                while (RT_SUCCESS(rc) && cLeft-- > 0)
+                {
+                    /* Only expose pages that are 100% safe for now. */
+                    if (   PGM_PAGE_GET_TYPE(pPage) == PGMPAGETYPE_RAM
+                        && PGM_PAGE_GET_STATE(pPage) == PGM_PAGE_STATE_ALLOCATED
+                        && !PGM_PAGE_HAS_ANY_HANDLERS(pPage))
+                        rc = GPciRawR0GuestPageAssign(pGVM, GCPhys, PGM_PAGE_GET_HCPHYS(pPage));
+                    else
+                        rc = GPciRawR0GuestPageUnassign(pGVM, GCPhys);
+
+                    /* next */
+                    pPage++;
+                    GCPhys += PAGE_SIZE;
+                }
+            }
+
+            /* Always end the assignment phase; keep the first error if there was one. */
+            int rc2 = GPciRawR0GuestPageEndAssignments(pGVM);
+            if (RT_FAILURE(rc2) && RT_SUCCESS(rc))
+                rc = rc2;
+        }
+        pgmUnlock(pVM);
+    }
+    else
+#endif
+        rc = VERR_NOT_SUPPORTED;
+    return rc;
+}
+
+
+/**
+ * \#PF Handler for nested paging.
+ *
+ * @returns VBox status code (appropriate for trap handling and GC return).
+ * @param   pVM                 The cross context VM structure.
+ * @param   pVCpu               The cross context virtual CPU structure.
+ * @param   enmShwPagingMode    Paging mode for the nested page tables.
+ * @param   uErr                The trap error code.
+ * @param   pRegFrame           Trap register frame.
+ * @param   GCPhysFault         The fault address.
+ */
+VMMR0DECL(int) PGMR0Trap0eHandlerNestedPaging(PVM pVM, PVMCPU pVCpu, PGMMODE enmShwPagingMode, RTGCUINT uErr,
+                                              PCPUMCTXCORE pRegFrame, RTGCPHYS GCPhysFault)
+{
+    int rc;
+
+    LogFlow(("PGMTrap0eHandler: uErr=%RGx GCPhysFault=%RGp eip=%RGv\n", uErr, GCPhysFault, (RTGCPTR)pRegFrame->rip));
+    /* Names the same profile sample as the STAM_PROFILE_STOP_EX at the end of this function. */
+    STAM_PROFILE_START(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0e, a);
+    STAM_STATS({ pVCpu->pgm.s.CTX_SUFF(pStatTrap0eAttribution) = NULL; } );
+
+    /* AMD uses the host's paging mode; Intel has a single mode (EPT). */
+    AssertMsg(   enmShwPagingMode == PGMMODE_32_BIT || enmShwPagingMode == PGMMODE_PAE || enmShwPagingMode == PGMMODE_PAE_NX
+              || enmShwPagingMode == PGMMODE_AMD64 || enmShwPagingMode == PGMMODE_AMD64_NX || enmShwPagingMode == PGMMODE_EPT,
+              ("enmShwPagingMode=%d\n", enmShwPagingMode));
+
+    /* Reserved shouldn't end up here. */
+    Assert(!(uErr & X86_TRAP_PF_RSVD));
+
+#ifdef VBOX_WITH_STATISTICS
+    /*
+     * Error code stats.
+     *
+     * One counter is bumped per fault, selected by the user/supervisor,
+     * present, write, reserved and instruction-fetch bits of uErr.
+     */
+    if (uErr & X86_TRAP_PF_US)
+    {
+        if (!(uErr & X86_TRAP_PF_P))
+        {
+            if (uErr & X86_TRAP_PF_RW)
+                STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0eUSNotPresentWrite);
+            else
+                STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0eUSNotPresentRead);
+        }
+        else if (uErr & X86_TRAP_PF_RW)
+            STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0eUSWrite);
+        else if (uErr & X86_TRAP_PF_RSVD)
+            STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0eUSReserved);
+        else if (uErr & X86_TRAP_PF_ID)
+            STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0eUSNXE);
+        else
+            STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0eUSRead);
+    }
+    else
+    {   /* Supervisor */
+        /* NOTE(review): unlike the user branch there is no final "read" counter
+           here, and ID is tested before RSVD - confirm both are intentional. */
+        if (!(uErr & X86_TRAP_PF_P))
+        {
+            if (uErr & X86_TRAP_PF_RW)
+                STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0eSVNotPresentWrite);
+            else
+                STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0eSVNotPresentRead);
+        }
+        else if (uErr & X86_TRAP_PF_RW)
+            STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0eSVWrite);
+        else if (uErr & X86_TRAP_PF_ID)
+            STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0eSNXE);
+        else if (uErr & X86_TRAP_PF_RSVD)
+            STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0eSVReserved);
+    }
+#endif
+
+    /*
+     * Call the worker.
+     *
+     * Note! We pretend the guest is in protected mode without paging, so we
+     *       can use existing code to build the nested page tables.
+     */
+/** @todo r=bird: Gotta love this nested paging hacking we're still carrying with us... (Split PGM_TYPE_NESTED.) */
+    bool fLockTaken = false; /* set by the worker when it returns with the PGM lock held */
+    switch (enmShwPagingMode)
+    {
+        case PGMMODE_32_BIT:
+            rc = PGM_BTH_NAME_32BIT_PROT(Trap0eHandler)(pVCpu, uErr, pRegFrame, GCPhysFault, &fLockTaken);
+            break;
+        case PGMMODE_PAE:
+        case PGMMODE_PAE_NX:
+            rc = PGM_BTH_NAME_PAE_PROT(Trap0eHandler)(pVCpu, uErr, pRegFrame, GCPhysFault, &fLockTaken);
+            break;
+        case PGMMODE_AMD64:
+        case PGMMODE_AMD64_NX:
+            rc = PGM_BTH_NAME_AMD64_PROT(Trap0eHandler)(pVCpu, uErr, pRegFrame, GCPhysFault, &fLockTaken);
+            break;
+        case PGMMODE_EPT:
+            rc = PGM_BTH_NAME_EPT_PROT(Trap0eHandler)(pVCpu, uErr, pRegFrame, GCPhysFault, &fLockTaken);
+            break;
+        default:
+            AssertFailed();
+            rc = VERR_INVALID_PARAMETER;
+            break;
+    }
+    /* Release the lock on the worker's behalf if it left it held. */
+    if (fLockTaken)
+    {
+        PGM_LOCK_ASSERT_OWNER(pVM);
+        pgmUnlock(pVM);
+    }
+
+    /* A modified PDE just means the instruction should be restarted. */
+    if (rc == VINF_PGM_SYNCPAGE_MODIFIED_PDE)
+        rc = VINF_SUCCESS;
+    /*
+     * Handle the case where we cannot interpret the instruction because we cannot get the guest physical address
+     * via its page tables, see @bugref{6043}.
+     */
+    else if (   rc == VERR_PAGE_NOT_PRESENT                 /* SMP only ; disassembly might fail. */
+             || rc == VERR_PAGE_TABLE_NOT_PRESENT           /* seen with UNI & SMP */
+             || rc == VERR_PAGE_DIRECTORY_PTR_NOT_PRESENT   /* seen with SMP */
+             || rc == VERR_PAGE_MAP_LEVEL4_NOT_PRESENT)     /* precaution */
+    {
+        Log(("WARNING: Unexpected VERR_PAGE_TABLE_NOT_PRESENT (%d) for page fault at %RGp error code %x (rip=%RGv)\n", rc, GCPhysFault, uErr, pRegFrame->rip));
+        /* Some kind of inconsistency in the SMP case; it's safe to just execute the instruction again; not sure about
+           single VCPU VMs though. 
*/
+        rc = VINF_SUCCESS;
+    }
+
+    /* Attribute the time to the "misc" bucket unless the worker already picked one. */
+    STAM_STATS({ if (!pVCpu->pgm.s.CTX_SUFF(pStatTrap0eAttribution))
+                    pVCpu->pgm.s.CTX_SUFF(pStatTrap0eAttribution) = &pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0eTime2Misc; });
+    STAM_PROFILE_STOP_EX(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatRZTrap0e, pVCpu->pgm.s.CTX_SUFF(pStatTrap0eAttribution), a);
+    return rc;
+}
+
+
+/**
+ * \#PF Handler for deliberate nested paging misconfiguration (/reserved bit)
+ * employed for MMIO pages.
+ *
+ * Looks up the physical access handler covering the fault address.  If one
+ * exists and is not a write-only handler, its \#PF handler is called (or
+ * VINF_EM_RAW_EMULATE_INSTR is returned when the handler type has none for
+ * this context); aliased / temporarily disabled pages and faults without a
+ * suitable handler are resolved by resyncing the nested page instead.
+ *
+ * @returns VBox status code (appropriate for trap handling and GC return).
+ *          VERR_PGM_NOT_USED_IN_MODE when built without
+ *          PGM_WITH_MMIO_OPTIMIZATIONS.
+ * @param   pVM                 The cross context VM structure.
+ * @param   pVCpu               The cross context virtual CPU structure.
+ * @param   enmShwPagingMode    Paging mode for the nested page tables.
+ * @param   pRegFrame           Trap register frame.
+ * @param   GCPhysFault         The fault address.
+ * @param   uErr                The error code, UINT32_MAX if not available
+ *                              (VT-x).
+ */
+VMMR0DECL(VBOXSTRICTRC) PGMR0Trap0eHandlerNPMisconfig(PVM pVM, PVMCPU pVCpu, PGMMODE enmShwPagingMode,
+                                                      PCPUMCTXCORE pRegFrame, RTGCPHYS GCPhysFault, uint32_t uErr)
+{
+#ifdef PGM_WITH_MMIO_OPTIMIZATIONS
+    /* Names the same profile sample as the STAM_PROFILE_STOP before the return below. */
+    STAM_PROFILE_START(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatR0NpMiscfg, a);
+    VBOXSTRICTRC rc;
+
+    /*
+     * Try lookup the all access physical handler for the address.
+     */
+    pgmLock(pVM);
+    PPGMPHYSHANDLER         pHandler     = pgmHandlerPhysicalLookup(pVM, GCPhysFault);
+    PPGMPHYSHANDLERTYPEINT  pHandlerType = RT_LIKELY(pHandler) ? PGMPHYSHANDLER_GET_TYPE(pVM, pHandler) : NULL;
+    if (RT_LIKELY(pHandler && pHandlerType->enmKind != PGMPHYSHANDLERKIND_WRITE))
+    {
+        /*
+         * If the handler has aliased pages or pages that have been temporarily
+         * disabled, we'll have to take a detour to make sure we resync them
+         * to avoid lots of unnecessary exits.
+         */
+        PPGMPAGE pPage;
+        if (    (   pHandler->cAliasedPages
+                 || pHandler->cTmpOffPages)
+            &&  (   (pPage = pgmPhysGetPage(pVM, GCPhysFault)) == NULL
+                 || PGM_PAGE_GET_HNDL_PHYS_STATE(pPage) == PGM_PAGE_HNDL_PHYS_STATE_DISABLED)
+           )
+        {
+            Log(("PGMR0Trap0eHandlerNPMisconfig: Resyncing aliases / tmp-off page at %RGp (uErr=%#x) %R[pgmpage]\n", GCPhysFault, uErr, pPage));
+            STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatR0NpMiscfgSyncPage);
+            rc = pgmShwSyncNestedPageLocked(pVCpu, GCPhysFault, 1 /*cPages*/, enmShwPagingMode);
+            pgmUnlock(pVM);
+        }
+        else
+        {
+            if (pHandlerType->CTX_SUFF(pfnPfHandler))
+            {
+                /* Fetch what we need from the handler, then drop the lock before calling out. */
+                void *pvUser = pHandler->CTX_SUFF(pvUser);
+                STAM_PROFILE_START(&pHandler->Stat, h);
+                pgmUnlock(pVM);
+
+                Log6(("PGMR0Trap0eHandlerNPMisconfig: calling %p(,%#x,,%RGp,%p)\n", pHandlerType->CTX_SUFF(pfnPfHandler), uErr, GCPhysFault, pvUser));
+                rc = pHandlerType->CTX_SUFF(pfnPfHandler)(pVM, pVCpu, uErr == UINT32_MAX ? RTGCPTR_MAX : uErr, pRegFrame,
+                                                          GCPhysFault, GCPhysFault, pvUser);
+
+#ifdef VBOX_WITH_STATISTICS
+                /* The handler is looked up again under the lock before its stats are
+                   touched, since the lock was released during the call. */
+                pgmLock(pVM);
+                pHandler = pgmHandlerPhysicalLookup(pVM, GCPhysFault);
+                if (pHandler)
+                    STAM_PROFILE_STOP(&pHandler->Stat, h);
+                pgmUnlock(pVM);
+#endif
+            }
+            else
+            {
+                pgmUnlock(pVM);
+                Log(("PGMR0Trap0eHandlerNPMisconfig: %RGp (uErr=%#x) -> R3\n", GCPhysFault, uErr));
+                rc = VINF_EM_RAW_EMULATE_INSTR;
+            }
+        }
+    }
+    else
+    {
+        /*
+         * Must be out of sync, so do a SyncPage and restart the instruction.
+         *
+         * ASSUMES that ALL handlers are page aligned and covers whole pages
+         * (assumption asserted in PGMHandlerPhysicalRegisterEx).
+         */
+        Log(("PGMR0Trap0eHandlerNPMisconfig: Out of sync page at %RGp (uErr=%#x)\n", GCPhysFault, uErr));
+        STAM_COUNTER_INC(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatR0NpMiscfgSyncPage);
+        rc = pgmShwSyncNestedPageLocked(pVCpu, GCPhysFault, 1 /*cPages*/, enmShwPagingMode);
+        pgmUnlock(pVM);
+    }
+
+    STAM_PROFILE_STOP(&pVCpu->pgm.s.CTX_SUFF(pStats)->StatR0NpMiscfg, a);
+    return rc;
+
+#else
+    AssertLogRelFailed();
+    return VERR_PGM_NOT_USED_IN_MODE;
+#endif
+}
+
diff --git a/src/VBox/VMM/VMMR0/PGMR0Bth.h b/src/VBox/VMM/VMMR0/PGMR0Bth.h
new file mode 100644
index 00000000..e67cf6f4
--- /dev/null
+++ b/src/VBox/VMM/VMMR0/PGMR0Bth.h
@@ -0,0 +1,25 @@
+/* $Id: PGMR0Bth.h $ */
+/** @file
+ * VBox - Page Manager / Monitor, Shadow+Guest Paging Template.
+ */
+
+/*
+ * Copyright (C) 2006-2019 Oracle Corporation
+ *
+ * This file is part of VirtualBox Open Source Edition (OSE), as
+ * available from http://www.virtualbox.org. This file is free software;
+ * you can redistribute it and/or modify it under the terms of the GNU
+ * General Public License (GPL) as published by the Free Software
+ * Foundation, in version 2 as it comes in the "COPYING" file of the
+ * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+ * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ */
+
+
+/*******************************************************************************
+*   Internal Functions                                                         *
+*******************************************************************************/
+RT_C_DECLS_BEGIN
+PGM_BTH_DECL(int, Trap0eHandler)(PVMCPU pVCpu, RTGCUINT uErr, PCPUMCTXCORE pRegFrame, RTGCPTR pvFault, bool *pfLockTaken);
+RT_C_DECLS_END
+
diff --git a/src/VBox/VMM/VMMR0/PGMR0SharedPage.cpp b/src/VBox/VMM/VMMR0/PGMR0SharedPage.cpp
new file mode 100644
index 00000000..de94eec3
--- /dev/null
+++ b/src/VBox/VMM/VMMR0/PGMR0SharedPage.cpp
@@ -0,0 +1,170 @@
+/* $Id: PGMR0SharedPage.cpp $ */
+/** @file
+ * PGM - Page Manager and Monitor, Page Sharing, Ring-0.
+ */ + +/* + * Copyright (C) 2010-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. + */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_PGM_SHARED +#include <VBox/vmm/pgm.h> +#include <VBox/vmm/gmm.h> +#include "PGMInternal.h" +#include <VBox/vmm/vm.h> +#include "PGMInline.h" +#include <VBox/log.h> +#include <VBox/err.h> +#include <iprt/assert.h> +#include <iprt/mem.h> + + +#ifdef VBOX_WITH_PAGE_SHARING +/** + * Check a registered module for shared page changes. + * + * The PGM lock shall be taken prior to calling this method. + * + * @returns The following VBox status codes. + * + * @param pVM The cross context VM structure. + * @param pGVM Pointer to the GVM instance data. + * @param idCpu The ID of the calling virtual CPU. + * @param pModule Global module description. + * @param paRegionsGCPtrs Array parallel to pModules->aRegions with the + * addresses of the regions in the calling + * process. 
+ */
+VMMR0DECL(int) PGMR0SharedModuleCheck(PVM pVM, PGVM pGVM, VMCPUID idCpu, PGMMSHAREDMODULE pModule, PCRTGCPTR64 paRegionsGCPtrs)
+{
+    PVMCPU              pVCpu         = &pVM->aCpus[idCpu];
+    int                 rc            = VINF_SUCCESS;
+    bool                fFlushTLBs    = false;  /* set when the pool tracker asked for a flush */
+    bool                fFlushRemTLBs = false;  /* set as soon as any page was converted */
+    GMMSHAREDPAGEDESC   PageDesc;
+
+    Log(("PGMR0SharedModuleCheck: check %s %s base=%RGv size=%x\n", pModule->szName, pModule->szVersion, pModule->Core.Key, pModule->cbModule));
+
+    PGM_LOCK_ASSERT_OWNER(pVM);     /* This cannot fail as we grab the lock in pgmR3SharedModuleRegRendezvous before calling into ring-0. */
+
+    /*
+     * Check every region of the shared module.
+     */
+    for (uint32_t idxRegion = 0; idxRegion < pModule->cRegions; idxRegion++)
+    {
+        RTGCPTR  GCPtrPage  = paRegionsGCPtrs[idxRegion] & ~(RTGCPTR)PAGE_OFFSET_MASK;
+        uint32_t cbLeft     = pModule->aRegions[idxRegion].cb; Assert(!(cbLeft & PAGE_OFFSET_MASK));
+        uint32_t idxPage    = 0;
+
+        /* Walk the region one guest page at a time. */
+        while (cbLeft)
+        {
+            /** @todo inefficient to fetch each guest page like this... */
+            RTGCPHYS GCPhys;
+            uint64_t fFlags;
+            rc = PGMGstGetPage(pVCpu, GCPtrPage, &fFlags, &GCPhys);
+            if (    rc == VINF_SUCCESS
+                &&  !(fFlags & X86_PTE_RW)) /* important as we make assumptions about this below! */
+            {
+                /* Only private, allocated pages without outstanding read/write locks are candidates. */
+                PPGMPAGE pPage = pgmPhysGetPage(pVM, GCPhys);
+                Assert(!pPage || !PGM_PAGE_IS_BALLOONED(pPage));
+                if (    pPage
+                    &&  PGM_PAGE_GET_STATE(pPage) == PGM_PAGE_STATE_ALLOCATED
+                    &&  PGM_PAGE_GET_READ_LOCKS(pPage) == 0
+                    &&  PGM_PAGE_GET_WRITE_LOCKS(pPage) == 0 )
+                {
+                    PageDesc.idPage = PGM_PAGE_GET_PAGEID(pPage);
+                    PageDesc.HCPhys = PGM_PAGE_GET_HCPHYS(pPage);
+                    PageDesc.GCPhys = GCPhys;
+
+                    /* NOTE(review): this break only leaves the inner page loop; the outer
+                       region loop carries on and rc is overwritten by the next
+                       PGMGstGetPage call - confirm that dropping the failure is intended. */
+                    rc = GMMR0SharedModuleCheckPage(pGVM, pModule, idxRegion, idxPage, &PageDesc);
+                    if (RT_FAILURE(rc))
+                        break;
+
+                    /*
+                     * Any change for this page?
+                     */
+                    if (PageDesc.idPage != NIL_GMM_PAGEID)
+                    {
+                        Assert(PGM_PAGE_GET_STATE(pPage) == PGM_PAGE_STATE_ALLOCATED);
+
+                        Log(("PGMR0SharedModuleCheck: shared page gst virt=%RGv phys=%RGp host %RHp->%RHp\n",
+                             GCPtrPage, PageDesc.GCPhys, PGM_PAGE_GET_HCPHYS(pPage), PageDesc.HCPhys));
+
+                        /* Page was either replaced by an existing shared
+                           version of it or converted into a read-only shared
+                           page, so, clear all references. */
+                        bool fFlush = false;
+                        rc = pgmPoolTrackUpdateGCPhys(pVM, PageDesc.GCPhys, pPage, true /* clear the entries */, &fFlush);
+                        Assert(   rc == VINF_SUCCESS
+                               || (   VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_PGM_SYNC_CR3)
+                                   && (pVCpu->pgm.s.fSyncFlags & PGM_SYNC_CLEAR_PGM_POOL)));
+                        if (rc == VINF_SUCCESS)
+                            fFlushTLBs |= fFlush;
+                        fFlushRemTLBs = true;
+
+                        if (PageDesc.HCPhys != PGM_PAGE_GET_HCPHYS(pPage))
+                        {
+                            /* Update the physical address and page id now. */
+                            PGM_PAGE_SET_HCPHYS(pVM, pPage, PageDesc.HCPhys);
+                            PGM_PAGE_SET_PAGEID(pVM, pPage, PageDesc.idPage);
+
+                            /* Invalidate page map TLB entry for this page too. */
+                            pgmPhysInvalidatePageMapTLBEntry(pVM, PageDesc.GCPhys);
+                            pVM->pgm.s.cReusedSharedPages++;
+                        }
+                        /* else: nothing changed (== this page is now a shared
+                           page), so no need to flush anything. */
+
+                        /* Account for the private -> shared transition. */
+                        pVM->pgm.s.cSharedPages++;
+                        pVM->pgm.s.cPrivatePages--;
+                        PGM_PAGE_SET_STATE(pVM, pPage, PGM_PAGE_STATE_SHARED);
+
+# ifdef VBOX_STRICT /* check sum hack */
+                        pPage->s.u2Unused0 = PageDesc.u32StrictChecksum & 3;
+                        //pPage->s.u2Unused1 = (PageDesc.u32StrictChecksum >> 8) & 3;
+# endif
+                    }
+                }
+            }
+            else
+            {
+                /* Writable or unmapped guest pages are skipped; the lookup status is discarded. */
+                Assert(    rc == VINF_SUCCESS
+                       ||  rc == VERR_PAGE_NOT_PRESENT
+                       ||  rc == VERR_PAGE_MAP_LEVEL4_NOT_PRESENT
+                       ||  rc == VERR_PAGE_DIRECTORY_PTR_NOT_PRESENT
+                       ||  rc == VERR_PAGE_TABLE_NOT_PRESENT);
+                rc = VINF_SUCCESS;  /* ignore error */
+            }
+
+            idxPage++;
+            GCPtrPage += PAGE_SIZE;
+            cbLeft    -= PAGE_SIZE;
+        }
+    }
+
+    /*
+     * Do TLB flushing if necessary.
+     */
+    if (fFlushTLBs)
+        PGM_INVL_ALL_VCPU_TLBS(pVM);
+
+    /* Flag a global TLB flush on every virtual CPU if any page was converted. */
+    if (fFlushRemTLBs)
+        for (VMCPUID idCurCpu = 0; idCurCpu < pVM->cCpus; idCurCpu++)
+            CPUMSetChangedFlags(&pVM->aCpus[idCurCpu], CPUM_CHANGED_GLOBAL_TLB_FLUSH);
+
+    return rc;
+}
+#endif /* VBOX_WITH_PAGE_SHARING */
+
diff --git a/src/VBox/VMM/VMMR0/TRPMR0.cpp b/src/VBox/VMM/VMMR0/TRPMR0.cpp
new file mode 100644
index 00000000..f9ca7939
--- /dev/null
+++ b/src/VBox/VMM/VMMR0/TRPMR0.cpp
@@ -0,0 +1,107 @@
+/* $Id: TRPMR0.cpp $ */
+/** @file
+ * TRPM - The Trap Monitor - HC Ring 0
+ */
+
+/*
+ * Copyright (C) 2006-2019 Oracle Corporation
+ *
+ * This file is part of VirtualBox Open Source Edition (OSE), as
+ * available from http://www.virtualbox.org. This file is free software;
+ * you can redistribute it and/or modify it under the terms of the GNU
+ * General Public License (GPL) as published by the Free Software
+ * Foundation, in version 2 as it comes in the "COPYING" file of the
+ * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+ * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ */
+
+
+/*********************************************************************************************************************************
+*   Header Files                                                                                                                 *
+*********************************************************************************************************************************/
+#define LOG_GROUP LOG_GROUP_TRPM
+#include <VBox/vmm/trpm.h>
+#include "TRPMInternal.h"
+#include <VBox/vmm/vm.h>
+#include <VBox/vmm/vmm.h>
+#include <iprt/errcore.h>
+#include <VBox/log.h>
+#include <iprt/assert.h>
+#include <iprt/asm-amd64-x86.h>
+
+
+#if defined(RT_OS_DARWIN) && ARCH_BITS == 32
+# error "32-bit darwin is no longer supported. Go back to 4.3 or earlier!"
+#endif
+
+
+/**
+ * Dispatches an interrupt that arrived while we were in the guest context.
+ *
+ * @param   pVM     The cross context VM structure.
+ * @remark  Must be called with interrupts disabled.
+ */ +VMMR0DECL(void) TRPMR0DispatchHostInterrupt(PVM pVM) +{ + /* + * Get the active interrupt vector number. + */ + PVMCPU pVCpu = VMMGetCpu0(pVM); + RTUINT uActiveVector = pVCpu->trpm.s.uActiveVector; + pVCpu->trpm.s.uActiveVector = UINT32_MAX; + AssertMsgReturnVoid(uActiveVector < 256, ("uActiveVector=%#x is invalid! (More assertions to come, please enjoy!)\n", uActiveVector)); + +#if HC_ARCH_BITS == 64 && defined(RT_OS_DARWIN) + /* + * Do it the simple and safe way. + * + * This is a workaround for an optimization bug in the code below + * or a gcc 4.2 on mac (snow leopard seed 314). + */ + trpmR0DispatchHostInterruptSimple(uActiveVector); + +#else /* The complicated way: */ + + /* + * Get the handler pointer (16:32 ptr) / (16:48 ptr). + */ + RTIDTR Idtr; + ASMGetIDTR(&Idtr); +# if HC_ARCH_BITS == 32 + PVBOXIDTE pIdte = &((PVBOXIDTE)Idtr.pIdt)[uActiveVector]; +# else + PVBOXIDTE64 pIdte = &((PVBOXIDTE64)Idtr.pIdt)[uActiveVector]; +# endif + AssertMsgReturnVoid(pIdte->Gen.u1Present, ("The IDT entry (%d) is not present!\n", uActiveVector)); + AssertMsgReturnVoid( pIdte->Gen.u3Type1 == VBOX_IDTE_TYPE1 + || pIdte->Gen.u5Type2 == VBOX_IDTE_TYPE2_INT_32, + ("The IDT entry (%d) is not 32-bit int gate! type1=%#x type2=%#x\n", + uActiveVector, pIdte->Gen.u3Type1, pIdte->Gen.u5Type2)); +# if HC_ARCH_BITS == 32 + RTFAR32 pfnHandler; + pfnHandler.off = VBOXIDTE_OFFSET(*pIdte); + pfnHandler.sel = pIdte->Gen.u16SegSel; + + const RTR0UINTREG uRSP = ~(RTR0UINTREG)0; + +# else /* 64-bit: */ + RTFAR64 pfnHandler; + pfnHandler.off = VBOXIDTE64_OFFSET(*pIdte); + pfnHandler.sel = pIdte->Gen.u16SegSel; + + const RTR0UINTREG uRSP = ~(RTR0UINTREG)0; + if (pIdte->Gen.u3Ist) + { + trpmR0DispatchHostInterruptSimple(uActiveVector); + return; + } + +# endif + + /* + * Dispatch it. 
+ */ + trpmR0DispatchHostInterrupt(pfnHandler.off, pfnHandler.sel, uRSP); +#endif +} + diff --git a/src/VBox/VMM/VMMR0/TRPMR0A.asm b/src/VBox/VMM/VMMR0/TRPMR0A.asm new file mode 100644 index 00000000..8eee50f3 --- /dev/null +++ b/src/VBox/VMM/VMMR0/TRPMR0A.asm @@ -0,0 +1,155 @@ +; $Id: TRPMR0A.asm $ +;; @file +; TRPM - Host Context Ring-0 +; + +; +; Copyright (C) 2006-2019 Oracle Corporation +; +; This file is part of VirtualBox Open Source Edition (OSE), as +; available from http://www.virtualbox.org. This file is free software; +; you can redistribute it and/or modify it under the terms of the GNU +; General Public License (GPL) as published by the Free Software +; Foundation, in version 2 as it comes in the "COPYING" file of the +; VirtualBox OSE distribution. VirtualBox OSE is distributed in the +; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. +; + +;******************************************************************************* +;* Header Files * +;******************************************************************************* +%include "VBox/asmdefs.mac" +%include "iprt/x86.mac" + + +BEGINCODE + +;; +; Calls the interrupt gate as if we received an interrupt while in Ring-0. +; +; @param uIP x86:[ebp+8] msc:rcx gcc:rdi The interrupt gate IP. +; @param SelCS x86:[ebp+12] msc:dx gcc:si The interrupt gate CS. +; @param RSP msc:r8 gcc:rdx The interrupt gate RSP. ~0 if no stack switch should take place. (only AMD64) +;DECLASM(void) trpmR0DispatchHostInterrupt(RTR0UINTPTR uIP, RTSEL SelCS, RTR0UINTPTR RSP); +ALIGNCODE(16) +BEGINPROC trpmR0DispatchHostInterrupt + push xBP + mov xBP, xSP + +%ifdef RT_ARCH_AMD64 + mov r11, rsp ; save the RSP for the iret frame. + and rsp, 0fffffffffffffff0h ; align the stack. (do it unconditionally saves some jump mess) + + ; switch stack? 
+ %ifdef ASM_CALL64_MSC + cmp r8, 0ffffffffffffffffh + je .no_stack_switch + mov rsp, r8 + %else + cmp rdx, 0ffffffffffffffffh + je .no_stack_switch + mov rsp, rdx + %endif +.no_stack_switch: + + ; create the iret frame + push 0 ; SS + push r11 ; RSP + pushfq ; RFLAGS + and dword [rsp], ~X86_EFL_IF + mov ax, cs + push rax ; CS + lea r10, [.return wrt rip] ; RIP + push r10 + + ; create the retf frame + %ifdef ASM_CALL64_MSC + movzx rdx, dx + cmp rdx, r11 + je .dir_jump + push rdx + push rcx + %else + movzx rsi, si + cmp rsi, r11 + je .dir_jump + push rsi + push rdi + %endif + + ; dispatch it + db 048h + retf + + ; dispatch it by a jmp (don't mess up the IST stack) +.dir_jump: + %ifdef ASM_CALL64_MSC + jmp rcx + %else + jmp rdi + %endif + +%else ; 32-bit: + mov ecx, [ebp + 8] ; uIP + movzx edx, word [ebp + 12] ; SelCS + + ; create the iret frame + pushfd ; EFLAGS + and dword [esp], ~X86_EFL_IF + push cs ; CS + push .return ; EIP + + ; create the retf frame + push edx + push ecx + + ; dispatch it! + retf +%endif +.return: + cli + + leave + ret +ENDPROC trpmR0DispatchHostInterrupt + + +;; +; Issues a software interrupt to the specified interrupt vector. +; +; @param uActiveVector x86:[esp+4] msc:rcx gcc:rdi The vector number. 
+; +;DECLASM(void) trpmR0DispatchHostInterruptSimple(RTUINT uActiveVector); +ALIGNCODE(16) +BEGINPROC trpmR0DispatchHostInterruptSimple +%ifdef RT_ARCH_X86 + mov eax, [esp + 4] + jmp dword [.jmp_table + eax * 4] +%else + lea r9, [.jmp_table wrt rip] + %ifdef ASM_CALL64_MSC + jmp qword [r9 + rcx * 8] + %else + jmp qword [r9 + rdi * 8] + %endif +%endif + +ALIGNCODE(4) +.jmp_table: +%assign i 0 +%rep 256 +RTCCPTR_DEF .int_ %+ i +%assign i i+1 +%endrep + +%assign i 0 +%rep 256 + ALIGNCODE(4) +.int_ %+ i: + int i + ret +%assign i i+1 +%endrep + +ENDPROC trpmR0DispatchHostInterruptSimple + diff --git a/src/VBox/VMM/VMMR0/VMMR0.cpp b/src/VBox/VMM/VMMR0/VMMR0.cpp new file mode 100644 index 00000000..4f5d1c2b --- /dev/null +++ b/src/VBox/VMM/VMMR0/VMMR0.cpp @@ -0,0 +1,2861 @@ +/* $Id: VMMR0.cpp $ */ +/** @file + * VMM - Host Context Ring 0. + */ + +/* + * Copyright (C) 2006-2019 Oracle Corporation + * + * This file is part of VirtualBox Open Source Edition (OSE), as + * available from http://www.virtualbox.org. This file is free software; + * you can redistribute it and/or modify it under the terms of the GNU + * General Public License (GPL) as published by the Free Software + * Foundation, in version 2 as it comes in the "COPYING" file of the + * VirtualBox OSE distribution. VirtualBox OSE is distributed in the + * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_VMM +#include <VBox/vmm/vmm.h> +#include <VBox/sup.h> +#include <VBox/vmm/trpm.h> +#include <VBox/vmm/cpum.h> +#include <VBox/vmm/pdmapi.h> +#include <VBox/vmm/pgm.h> +#ifdef VBOX_WITH_NEM_R0 +# include <VBox/vmm/nem.h> +#endif +#include <VBox/vmm/em.h> +#include <VBox/vmm/stam.h> +#include <VBox/vmm/tm.h> +#include "VMMInternal.h" +#include <VBox/vmm/vm.h> +#include <VBox/vmm/gvm.h> +#ifdef VBOX_WITH_PCI_PASSTHROUGH +# include <VBox/vmm/pdmpci.h> +#endif +#include <VBox/vmm/apic.h> + +#include <VBox/vmm/gvmm.h> +#include <VBox/vmm/gmm.h> +#include <VBox/vmm/gim.h> +#include <VBox/intnet.h> +#include <VBox/vmm/hm.h> +#include <VBox/param.h> +#include <VBox/err.h> +#include <VBox/version.h> +#include <VBox/log.h> + +#include <iprt/asm-amd64-x86.h> +#include <iprt/assert.h> +#include <iprt/crc.h> +#include <iprt/mp.h> +#include <iprt/once.h> +#include <iprt/stdarg.h> +#include <iprt/string.h> +#include <iprt/thread.h> +#include <iprt/timer.h> +#include <iprt/time.h> + +#include "dtrace/VBoxVMM.h" + + +#if defined(_MSC_VER) && defined(RT_ARCH_AMD64) /** @todo check this with with VC7! */ +# pragma intrinsic(_AddressOfReturnAddress) +#endif + +#if defined(RT_OS_DARWIN) && ARCH_BITS == 32 +# error "32-bit darwin is no longer supported. Go back to 4.3 or earlier!" +#endif + + + +/********************************************************************************************************************************* +* Defined Constants And Macros * +*********************************************************************************************************************************/ +/** @def VMM_CHECK_SMAP_SETUP + * SMAP check setup. 
*/ +/** @def VMM_CHECK_SMAP_CHECK + * Checks that the AC flag is set if SMAP is enabled. If AC is not set, + * it will be logged and @a a_BadExpr is executed. */ +/** @def VMM_CHECK_SMAP_CHECK2 + * Checks that the AC flag is set if SMAP is enabled. If AC is not set, it will + * be logged, written to the VMs assertion text buffer, and @a a_BadExpr is + * executed. */ +#if defined(VBOX_STRICT) || 1 +# define VMM_CHECK_SMAP_SETUP() uint32_t const fKernelFeatures = SUPR0GetKernelFeatures() +# define VMM_CHECK_SMAP_CHECK(a_BadExpr) \ + do { \ + if (fKernelFeatures & SUPKERNELFEATURES_SMAP) \ + { \ + RTCCUINTREG fEflCheck = ASMGetFlags(); \ + if (RT_LIKELY(fEflCheck & X86_EFL_AC)) \ + { /* likely */ } \ + else \ + { \ + SUPR0Printf("%s, line %d: EFLAGS.AC is clear! (%#x)\n", __FUNCTION__, __LINE__, (uint32_t)fEflCheck); \ + a_BadExpr; \ + } \ + } \ + } while (0) +# define VMM_CHECK_SMAP_CHECK2(a_pVM, a_BadExpr) \ + do { \ + if (fKernelFeatures & SUPKERNELFEATURES_SMAP) \ + { \ + RTCCUINTREG fEflCheck = ASMGetFlags(); \ + if (RT_LIKELY(fEflCheck & X86_EFL_AC)) \ + { /* likely */ } \ + else \ + { \ + SUPR0BadContext((a_pVM) ? (a_pVM)->pSession : NULL, __FILE__, __LINE__, "EFLAGS.AC is zero!"); \ + RTStrPrintf(pVM->vmm.s.szRing0AssertMsg1, sizeof(pVM->vmm.s.szRing0AssertMsg1), \ + "%s, line %d: EFLAGS.AC is clear! 
(%#x)\n", __FUNCTION__, __LINE__, (uint32_t)fEflCheck); \ + a_BadExpr; \ + } \ + } \ + } while (0) +#else +# define VMM_CHECK_SMAP_SETUP() uint32_t const fKernelFeatures = 0 +# define VMM_CHECK_SMAP_CHECK(a_BadExpr) NOREF(fKernelFeatures) +# define VMM_CHECK_SMAP_CHECK2(a_pVM, a_BadExpr) NOREF(fKernelFeatures) +#endif + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +RT_C_DECLS_BEGIN +#if defined(RT_ARCH_X86) && (defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)) +extern uint64_t __udivdi3(uint64_t, uint64_t); +extern uint64_t __umoddi3(uint64_t, uint64_t); +#endif +RT_C_DECLS_END + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +/** Drag in necessary library bits. + * The runtime lives here (in VMMR0.r0) and VBoxDD*R0.r0 links against us. */ +PFNRT g_VMMR0Deps[] = +{ + (PFNRT)RTCrc32, + (PFNRT)RTOnce, +#if defined(RT_ARCH_X86) && (defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)) + (PFNRT)__udivdi3, + (PFNRT)__umoddi3, +#endif + NULL +}; + +#ifdef RT_OS_SOLARIS +/* Dependency information for the native solaris loader. */ +extern "C" { char _depends_on[] = "vboxdrv"; } +#endif + +/** The result of SUPR0GetRawModeUsability(), set by ModuleInit(). */ +int g_rcRawModeUsability = VINF_SUCCESS; + + +/** + * Initialize the module. + * This is called when we're first loaded. + * + * @returns 0 on success. + * @returns VBox status on failure. + * @param hMod Image handle for use in APIs. 
+ */ +DECLEXPORT(int) ModuleInit(void *hMod) +{ + VMM_CHECK_SMAP_SETUP(); + VMM_CHECK_SMAP_CHECK(RT_NOTHING); + +#ifdef VBOX_WITH_DTRACE_R0 + /* + * The first thing to do is register the static tracepoints. + * (Deregistration is automatic.) + */ + int rc2 = SUPR0TracerRegisterModule(hMod, &g_VTGObjHeader); + if (RT_FAILURE(rc2)) + return rc2; +#endif + LogFlow(("ModuleInit:\n")); + +#ifdef VBOX_WITH_64ON32_CMOS_DEBUG + /* + * Display the CMOS debug code. + */ + ASMOutU8(0x72, 0x03); + uint8_t bDebugCode = ASMInU8(0x73); + LogRel(("CMOS Debug Code: %#x (%d)\n", bDebugCode, bDebugCode)); + RTLogComPrintf("CMOS Debug Code: %#x (%d)\n", bDebugCode, bDebugCode); +#endif + + /* + * Initialize the VMM, GVMM, GMM, HM, PGM (Darwin) and INTNET. + */ + int rc = vmmInitFormatTypes(); + if (RT_SUCCESS(rc)) + { + VMM_CHECK_SMAP_CHECK(RT_NOTHING); + rc = GVMMR0Init(); + if (RT_SUCCESS(rc)) + { + VMM_CHECK_SMAP_CHECK(RT_NOTHING); + rc = GMMR0Init(); + if (RT_SUCCESS(rc)) + { + VMM_CHECK_SMAP_CHECK(RT_NOTHING); + rc = HMR0Init(); + if (RT_SUCCESS(rc)) + { + VMM_CHECK_SMAP_CHECK(RT_NOTHING); + rc = PGMRegisterStringFormatTypes(); + if (RT_SUCCESS(rc)) + { + VMM_CHECK_SMAP_CHECK(RT_NOTHING); +#ifdef VBOX_WITH_2X_4GB_ADDR_SPACE + rc = PGMR0DynMapInit(); +#endif + if (RT_SUCCESS(rc)) + { + VMM_CHECK_SMAP_CHECK(RT_NOTHING); + rc = IntNetR0Init(); + if (RT_SUCCESS(rc)) + { +#ifdef VBOX_WITH_PCI_PASSTHROUGH + VMM_CHECK_SMAP_CHECK(RT_NOTHING); + rc = PciRawR0Init(); +#endif + if (RT_SUCCESS(rc)) + { + VMM_CHECK_SMAP_CHECK(RT_NOTHING); + rc = CPUMR0ModuleInit(); + if (RT_SUCCESS(rc)) + { +#ifdef VBOX_WITH_TRIPLE_FAULT_HACK + VMM_CHECK_SMAP_CHECK(RT_NOTHING); + rc = vmmR0TripleFaultHackInit(); + if (RT_SUCCESS(rc)) +#endif + { + VMM_CHECK_SMAP_CHECK(rc = VERR_VMM_SMAP_BUT_AC_CLEAR); + if (RT_SUCCESS(rc)) + { + g_rcRawModeUsability = SUPR0GetRawModeUsability(); + if (g_rcRawModeUsability != VINF_SUCCESS) + SUPR0Printf("VMMR0!ModuleInit: SUPR0GetRawModeUsability -> %Rrc\n", + 
g_rcRawModeUsability); + LogFlow(("ModuleInit: returns success\n")); + return VINF_SUCCESS; + } + } + + /* + * Bail out. + */ +#ifdef VBOX_WITH_TRIPLE_FAULT_HACK + vmmR0TripleFaultHackTerm(); +#endif + } + else + LogRel(("ModuleInit: CPUMR0ModuleInit -> %Rrc\n", rc)); +#ifdef VBOX_WITH_PCI_PASSTHROUGH + PciRawR0Term(); +#endif + } + else + LogRel(("ModuleInit: PciRawR0Init -> %Rrc\n", rc)); + IntNetR0Term(); + } + else + LogRel(("ModuleInit: IntNetR0Init -> %Rrc\n", rc)); +#ifdef VBOX_WITH_2X_4GB_ADDR_SPACE + PGMR0DynMapTerm(); +#endif + } + else + LogRel(("ModuleInit: PGMR0DynMapInit -> %Rrc\n", rc)); + PGMDeregisterStringFormatTypes(); + } + else + LogRel(("ModuleInit: PGMRegisterStringFormatTypes -> %Rrc\n", rc)); + HMR0Term(); + } + else + LogRel(("ModuleInit: HMR0Init -> %Rrc\n", rc)); + GMMR0Term(); + } + else + LogRel(("ModuleInit: GMMR0Init -> %Rrc\n", rc)); + GVMMR0Term(); + } + else + LogRel(("ModuleInit: GVMMR0Init -> %Rrc\n", rc)); + vmmTermFormatTypes(); + } + else + LogRel(("ModuleInit: vmmInitFormatTypes -> %Rrc\n", rc)); + + LogFlow(("ModuleInit: failed %Rrc\n", rc)); + return rc; +} + + +/** + * Terminate the module. + * This is called when we're finally unloaded. + * + * @param hMod Image handle for use in APIs. + */ +DECLEXPORT(void) ModuleTerm(void *hMod) +{ + NOREF(hMod); + LogFlow(("ModuleTerm:\n")); + + /* + * Terminate the CPUM module (Local APIC cleanup). + */ + CPUMR0ModuleTerm(); + + /* + * Terminate the internal network service. + */ + IntNetR0Term(); + + /* + * PGM (Darwin), HM and PciRaw global cleanup. + */ +#ifdef VBOX_WITH_2X_4GB_ADDR_SPACE + PGMR0DynMapTerm(); +#endif +#ifdef VBOX_WITH_PCI_PASSTHROUGH + PciRawR0Term(); +#endif + PGMDeregisterStringFormatTypes(); + HMR0Term(); +#ifdef VBOX_WITH_TRIPLE_FAULT_HACK + vmmR0TripleFaultHackTerm(); +#endif + + /* + * Destroy the GMM and GVMM instances. 
+ */ + GMMR0Term(); + GVMMR0Term(); + + vmmTermFormatTypes(); + + LogFlow(("ModuleTerm: returns\n")); +} + + +/** + * Initiates the R0 driver for a particular VM instance. + * + * @returns VBox status code. + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param uSvnRev The SVN revision of the ring-3 part. + * @param uBuildType Build type indicator. + * @thread EMT(0) + */ +static int vmmR0InitVM(PGVM pGVM, PVM pVM, uint32_t uSvnRev, uint32_t uBuildType) +{ + VMM_CHECK_SMAP_SETUP(); + VMM_CHECK_SMAP_CHECK(return VERR_VMM_SMAP_BUT_AC_CLEAR); + + /* + * Match the SVN revisions and build type. + */ + if (uSvnRev != VMMGetSvnRev()) + { + LogRel(("VMMR0InitVM: Revision mismatch, r3=%d r0=%d\n", uSvnRev, VMMGetSvnRev())); + SUPR0Printf("VMMR0InitVM: Revision mismatch, r3=%d r0=%d\n", uSvnRev, VMMGetSvnRev()); + return VERR_VMM_R0_VERSION_MISMATCH; + } + if (uBuildType != vmmGetBuildType()) + { + LogRel(("VMMR0InitVM: Build type mismatch, r3=%#x r0=%#x\n", uBuildType, vmmGetBuildType())); + SUPR0Printf("VMMR0InitVM: Build type mismatch, r3=%#x r0=%#x\n", uBuildType, vmmGetBuildType()); + return VERR_VMM_R0_VERSION_MISMATCH; + } + + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, 0 /*idCpu*/); + if (RT_FAILURE(rc)) + return rc; + +#ifdef LOG_ENABLED + /* + * Register the EMT R0 logger instance for VCPU 0. + */ + PVMCPU pVCpu = &pVM->aCpus[0]; + + PVMMR0LOGGER pR0Logger = pVCpu->vmm.s.pR0LoggerR0; + if (pR0Logger) + { +# if 0 /* testing of the logger. 
*/ + LogCom(("vmmR0InitVM: before %p\n", RTLogDefaultInstance())); + LogCom(("vmmR0InitVM: pfnFlush=%p actual=%p\n", pR0Logger->Logger.pfnFlush, vmmR0LoggerFlush)); + LogCom(("vmmR0InitVM: pfnLogger=%p actual=%p\n", pR0Logger->Logger.pfnLogger, vmmR0LoggerWrapper)); + LogCom(("vmmR0InitVM: offScratch=%d fFlags=%#x fDestFlags=%#x\n", pR0Logger->Logger.offScratch, pR0Logger->Logger.fFlags, pR0Logger->Logger.fDestFlags)); + + RTLogSetDefaultInstanceThread(&pR0Logger->Logger, (uintptr_t)pVM->pSession); + LogCom(("vmmR0InitVM: after %p reg\n", RTLogDefaultInstance())); + RTLogSetDefaultInstanceThread(NULL, pVM->pSession); + LogCom(("vmmR0InitVM: after %p dereg\n", RTLogDefaultInstance())); + + pR0Logger->Logger.pfnLogger("hello ring-0 logger\n"); + LogCom(("vmmR0InitVM: returned successfully from direct logger call.\n")); + pR0Logger->Logger.pfnFlush(&pR0Logger->Logger); + LogCom(("vmmR0InitVM: returned successfully from direct flush call.\n")); + + RTLogSetDefaultInstanceThread(&pR0Logger->Logger, (uintptr_t)pVM->pSession); + LogCom(("vmmR0InitVM: after %p reg2\n", RTLogDefaultInstance())); + pR0Logger->Logger.pfnLogger("hello ring-0 logger\n"); + LogCom(("vmmR0InitVM: returned successfully from direct logger call (2). 
offScratch=%d\n", pR0Logger->Logger.offScratch)); + RTLogSetDefaultInstanceThread(NULL, pVM->pSession); + LogCom(("vmmR0InitVM: after %p dereg2\n", RTLogDefaultInstance())); + + RTLogLoggerEx(&pR0Logger->Logger, 0, ~0U, "hello ring-0 logger (RTLogLoggerEx)\n"); + LogCom(("vmmR0InitVM: RTLogLoggerEx returned fine offScratch=%d\n", pR0Logger->Logger.offScratch)); + + RTLogSetDefaultInstanceThread(&pR0Logger->Logger, (uintptr_t)pVM->pSession); + RTLogPrintf("hello ring-0 logger (RTLogPrintf)\n"); + LogCom(("vmmR0InitVM: RTLogPrintf returned fine offScratch=%d\n", pR0Logger->Logger.offScratch)); +# endif + Log(("Switching to per-thread logging instance %p (key=%p)\n", &pR0Logger->Logger, pVM->pSession)); + RTLogSetDefaultInstanceThread(&pR0Logger->Logger, (uintptr_t)pVM->pSession); + pR0Logger->fRegistered = true; + } +#endif /* LOG_ENABLED */ + + /* + * Check if the host supports high resolution timers or not. + */ + if ( pVM->vmm.s.fUsePeriodicPreemptionTimers + && !RTTimerCanDoHighResolution()) + pVM->vmm.s.fUsePeriodicPreemptionTimers = false; + + /* + * Initialize the per VM data for GVMM and GMM. + */ + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + rc = GVMMR0InitVM(pGVM); +// if (RT_SUCCESS(rc)) +// rc = GMMR0InitPerVMData(pVM); + if (RT_SUCCESS(rc)) + { + /* + * Init HM, CPUM and PGM (Darwin only). 
+ */ + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + rc = HMR0InitVM(pVM); + if (RT_SUCCESS(rc)) + VMM_CHECK_SMAP_CHECK2(pVM, rc = VERR_VMM_RING0_ASSERTION); /* CPUMR0InitVM will otherwise panic the host */ + if (RT_SUCCESS(rc)) + { + rc = CPUMR0InitVM(pVM); + if (RT_SUCCESS(rc)) + { + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); +#ifdef VBOX_WITH_2X_4GB_ADDR_SPACE + rc = PGMR0DynMapInitVM(pVM); +#endif + if (RT_SUCCESS(rc)) + { + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + rc = EMR0InitVM(pGVM, pVM); + if (RT_SUCCESS(rc)) + { + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); +#ifdef VBOX_WITH_PCI_PASSTHROUGH + rc = PciRawR0InitVM(pGVM, pVM); +#endif + if (RT_SUCCESS(rc)) + { + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + rc = GIMR0InitVM(pVM); + if (RT_SUCCESS(rc)) + { + VMM_CHECK_SMAP_CHECK2(pVM, rc = VERR_VMM_RING0_ASSERTION); + if (RT_SUCCESS(rc)) + { + GVMMR0DoneInitVM(pGVM); + + /* + * Collect a bit of info for the VM release log. + */ + pVM->vmm.s.fIsPreemptPendingApiTrusty = RTThreadPreemptIsPendingTrusty(); + pVM->vmm.s.fIsPreemptPossible = RTThreadPreemptIsPossible();; + + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + return rc; + } + + /* Bail out: the GIM, PCI raw and HM per-VM init is undone below in reverse order. NOTE(review): nothing on this path undoes EMR0InitVM() or PGMR0DynMapInitVM() - confirm that is intended. */ + GIMR0TermVM(pVM); + } +#ifdef VBOX_WITH_PCI_PASSTHROUGH + PciRawR0TermVM(pGVM, pVM); +#endif + } + } + } + } + HMR0TermVM(pVM); + } + } + + RTLogSetDefaultInstanceThread(NULL, (uintptr_t)pVM->pSession); + return rc; +} + + +/** + * Does EMT specific VM initialization. + * + * @returns VBox status code. + * @param pGVM The ring-0 VM structure. + * @param pVM The cross context VM structure. + * @param idCpu The EMT that's calling. + */ +static int vmmR0InitVMEmt(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ + /* Paranoia (caller checked these already). */ + AssertReturn(idCpu < pGVM->cCpus, VERR_INVALID_CPU_ID); + AssertReturn(pGVM->aCpus[idCpu].hEMT == RTThreadNativeSelf(), VERR_INVALID_CPU_ID); + +#ifdef LOG_ENABLED + /* + * Registration of ring 0 loggers. 
+ */ + PVMCPU pVCpu = &pVM->aCpus[idCpu]; + PVMMR0LOGGER pR0Logger = pVCpu->vmm.s.pR0LoggerR0; + if ( pR0Logger + && !pR0Logger->fRegistered) + { + RTLogSetDefaultInstanceThread(&pR0Logger->Logger, (uintptr_t)pVM->pSession); + pR0Logger->fRegistered = true; + } +#endif + RT_NOREF(pVM); + + return VINF_SUCCESS; +} + + + +/** + * Terminates the R0 bits for a particular VM instance. + * + * This is normally called by ring-3 as part of the VM termination process, but + * may alternatively be called during the support driver session cleanup when + * the VM object is destroyed (see GVMM). + * + * @returns VBox status code. + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu Set to 0 if EMT(0) or NIL_VMCPUID if session cleanup + * thread. + * @thread EMT(0) or session cleanup thread. + */ +VMMR0_INT_DECL(int) VMMR0TermVM(PGVM pGVM, PVM pVM, VMCPUID idCpu) +{ + /* + * Check EMT(0) claim if we're called from userland. + */ + if (idCpu != NIL_VMCPUID) + { + AssertReturn(idCpu == 0, VERR_INVALID_CPU_ID); + int rc = GVMMR0ValidateGVMandVMandEMT(pGVM, pVM, idCpu); + if (RT_FAILURE(rc)) + return rc; + } + +#ifdef VBOX_WITH_PCI_PASSTHROUGH + PciRawR0TermVM(pGVM, pVM); +#endif + + /* + * Tell GVMM what we're up to and check that we only do this once. + */ + if (GVMMR0DoingTermVM(pGVM)) + { + GIMR0TermVM(pVM); + + /** @todo I wish to call PGMR0PhysFlushHandyPages(pVM, &pVM->aCpus[idCpu]) + * here to make sure we don't leak any shared pages if we crash... */ +#ifdef VBOX_WITH_2X_4GB_ADDR_SPACE + PGMR0DynMapTermVM(pVM); +#endif + HMR0TermVM(pVM); + } + + /* + * Deregister the logger. + */ + RTLogSetDefaultInstanceThread(NULL, (uintptr_t)pVM->pSession); + return VINF_SUCCESS; +} + + +/** + * An interrupt or unhalt force flag is set, deal with it. + * + * @returns VINF_SUCCESS or VINF_EM_HALT; when a pending interrupt is fetched + * successfully, the status of TRPMAssertTrap() is returned instead. + * @param pVCpu The cross context virtual CPU structure. + * @param uMWait Result from EMMonitorWaitIsActive(). 
+ * @param enmInterruptibility Guest CPU interruptibility level. + */ +static int vmmR0DoHaltInterrupt(PVMCPU pVCpu, unsigned uMWait, CPUMINTERRUPTIBILITY enmInterruptibility) +{ + Assert(!TRPMHasTrap(pVCpu)); + Assert( enmInterruptibility > CPUMINTERRUPTIBILITY_INVALID + && enmInterruptibility < CPUMINTERRUPTIBILITY_END); + + /* + * Pending interrupts w/o any SMIs or NMIs? That's the usual case. + */ + if ( VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC) + && !VMCPU_FF_IS_ANY_SET(pVCpu, VMCPU_FF_INTERRUPT_SMI | VMCPU_FF_INTERRUPT_NMI)) + { + if (enmInterruptibility <= CPUMINTERRUPTIBILITY_UNRESTRAINED) + { + uint8_t u8Interrupt = 0; + int rc = PDMGetInterrupt(pVCpu, &u8Interrupt); + Log(("vmmR0DoHaltInterrupt: CPU%d u8Interrupt=%d (%#x) rc=%Rrc\n", pVCpu->idCpu, u8Interrupt, u8Interrupt, rc)); + if (RT_SUCCESS(rc)) + { + VMCPU_FF_CLEAR(pVCpu, VMCPU_FF_UNHALT); + + rc = TRPMAssertTrap(pVCpu, u8Interrupt, TRPM_HARDWARE_INT); + AssertRCSuccess(rc); + STAM_REL_COUNTER_INC(&pVCpu->vmm.s.StatR0HaltExec); + return rc; + } + } + } + /* + * SMI is not implemented yet, at least not here. + */ + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INTERRUPT_SMI)) + { + return VINF_EM_HALT; + } + /* + * NMI. + */ + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INTERRUPT_NMI)) + { + if (enmInterruptibility < CPUMINTERRUPTIBILITY_NMI_INHIBIT) + { + /** @todo later. */ + return VINF_EM_HALT; + } + } + /* + * Nested-guest virtual interrupt. + */ + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_INTERRUPT_NESTED_GUEST)) + { + if (enmInterruptibility < CPUMINTERRUPTIBILITY_VIRT_INT_DISABLED) + { + /** @todo NSTVMX: NSTSVM: Remember, we might have to check and perform VM-exits + * here before injecting the virtual interrupt. See emR3ForcedActions + * for details. 
*/ + return VINF_EM_HALT; + } + } + + if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_UNHALT)) + { + STAM_REL_COUNTER_INC(&pVCpu->vmm.s.StatR0HaltExec); + return VINF_SUCCESS; + } + if (uMWait > 1) + { + STAM_REL_COUNTER_INC(&pVCpu->vmm.s.StatR0HaltExec); + return VINF_SUCCESS; + } + + return VINF_EM_HALT; +} + + +/** + * This does one round of vmR3HaltGlobal1Halt(). + * + * The rationale here is that we'll reduce latency in interrupt situations if we + * don't go to ring-3 immediately on a VINF_EM_HALT (guest executed HLT or + * MWAIT), but do one round of blocking here instead and hope the interrupt is + * raised in the meantime. + * + * If we go to ring-3 we'll quit the inner HM/NEM loop in EM and end up in the + * outer loop, which will then call VMR3WaitHalted() and that in turn will do a + * ring-0 call (unless we're too close to a timer event). When the interrupt + * wakes us up, we'll return from ring-0 and EM will by instinct do a + * rescheduling (because of raw-mode) before it resumes the HM/NEM loop and gets + * back to VMMR0EntryFast(). + * + * @returns VINF_SUCCESS or VINF_EM_HALT. + * @param pGVM The ring-0 VM structure. + * @param pVM The cross context VM structure. + * @param pGVCpu The ring-0 virtual CPU structure. + * @param pVCpu The cross context virtual CPU structure. + * + * @todo r=bird: All the blocking/waiting and EMT management should move out of + * the VM module, probably to VMM. Then this would be more weird wrt + * parameters and statistics. + */ +static int vmmR0DoHalt(PGVM pGVM, PVM pVM, PGVMCPU pGVCpu, PVMCPU pVCpu) +{ + Assert(pVCpu == pGVCpu->pVCpu); + + /* + * Do spin stat historization. + */ + if (++pVCpu->vmm.s.cR0Halts & 0xff) + { /* likely */ } + else if (pVCpu->vmm.s.cR0HaltsSucceeded > pVCpu->vmm.s.cR0HaltsToRing3) + { + pVCpu->vmm.s.cR0HaltsSucceeded = 2; + pVCpu->vmm.s.cR0HaltsToRing3 = 0; + } + else + { + pVCpu->vmm.s.cR0HaltsSucceeded = 0; + pVCpu->vmm.s.cR0HaltsToRing3 = 2; + } + + /* + * Flags that make us go to ring-3. 
+ */ + uint32_t const fVmFFs = VM_FF_TM_VIRTUAL_SYNC | VM_FF_PDM_QUEUES | VM_FF_PDM_DMA + | VM_FF_DBGF | VM_FF_REQUEST | VM_FF_CHECK_VM_STATE + | VM_FF_RESET | VM_FF_EMT_RENDEZVOUS | VM_FF_PGM_NEED_HANDY_PAGES + | VM_FF_PGM_NO_MEMORY | VM_FF_REM_HANDLER_NOTIFY | VM_FF_DEBUG_SUSPEND; + uint64_t const fCpuFFs = VMCPU_FF_TIMER | VMCPU_FF_PDM_CRITSECT | VMCPU_FF_IEM + | VMCPU_FF_REQUEST | VMCPU_FF_DBGF | VMCPU_FF_HM_UPDATE_CR3 + | VMCPU_FF_HM_UPDATE_PAE_PDPES | VMCPU_FF_PGM_SYNC_CR3 | VMCPU_FF_PGM_SYNC_CR3_NON_GLOBAL + | VMCPU_FF_TO_R3 | VMCPU_FF_IOM +#ifdef VBOX_WITH_RAW_MODE + | VMCPU_FF_TRPM_SYNC_IDT | VMCPU_FF_SELM_SYNC_TSS | VMCPU_FF_SELM_SYNC_GDT + | VMCPU_FF_SELM_SYNC_LDT | VMCPU_FF_CSAM_SCAN_PAGE | VMCPU_FF_CSAM_PENDING_ACTION + | VMCPU_FF_CPUM +#endif + ; + + /* + * Check preconditions. + */ + unsigned const uMWait = EMMonitorWaitIsActive(pVCpu); + CPUMINTERRUPTIBILITY const enmInterruptibility = CPUMGetGuestInterruptibility(pVCpu); + if ( pVCpu->vmm.s.fMayHaltInRing0 + && !TRPMHasTrap(pVCpu) + && ( enmInterruptibility == CPUMINTERRUPTIBILITY_UNRESTRAINED + || uMWait > 1)) + { + if ( !VM_FF_IS_ANY_SET(pVM, fVmFFs) + && !VMCPU_FF_IS_ANY_SET(pVCpu, fCpuFFs)) + { + /* + * Interrupts pending already? + */ + if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_UPDATE_APIC)) + APICUpdatePendingInterrupts(pVCpu); + + /* + * Flags that wake up from the halted state. + */ + uint64_t const fIntMask = VMCPU_FF_INTERRUPT_APIC | VMCPU_FF_INTERRUPT_PIC | VMCPU_FF_INTERRUPT_NESTED_GUEST + | VMCPU_FF_INTERRUPT_NMI | VMCPU_FF_INTERRUPT_SMI | VMCPU_FF_UNHALT; + + if (VMCPU_FF_IS_ANY_SET(pVCpu, fIntMask)) + return vmmR0DoHaltInterrupt(pVCpu, uMWait, enmInterruptibility); + ASMNopPause(); + + /* + * Check out how long till the next timer event. 
+ */ + uint64_t u64Delta; + uint64_t u64GipTime = TMTimerPollGIP(pVM, pVCpu, &u64Delta); + + if ( !VM_FF_IS_ANY_SET(pVM, fVmFFs) + && !VMCPU_FF_IS_ANY_SET(pVCpu, fCpuFFs)) + { + if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_UPDATE_APIC)) + APICUpdatePendingInterrupts(pVCpu); + + if (VMCPU_FF_IS_ANY_SET(pVCpu, fIntMask)) + return vmmR0DoHaltInterrupt(pVCpu, uMWait, enmInterruptibility); + + /* + * Wait if there is enough time to the next timer event. + */ + if (u64Delta >= pVCpu->vmm.s.cNsSpinBlockThreshold) + { + /* If there are a few other CPU cores around, we will procrastinate a + little before going to sleep, hoping for some device raising an + interrupt or similar. Though, the best thing here would be to + dynamically adjust the spin count according to its usefulness or + something... */ + if ( pVCpu->vmm.s.cR0HaltsSucceeded > pVCpu->vmm.s.cR0HaltsToRing3 + && RTMpGetOnlineCount() >= 4) + { + /** @todo Figure out how we can skip this if it hasn't helped recently... + * @bugref{9172#c12} */ + uint32_t cSpinLoops = 42; + while (cSpinLoops-- > 0) + { + ASMNopPause(); + if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_UPDATE_APIC)) + APICUpdatePendingInterrupts(pVCpu); + ASMNopPause(); + if (VM_FF_IS_ANY_SET(pVM, fVmFFs)) + { + STAM_REL_COUNTER_INC(&pVCpu->vmm.s.StatR0HaltToR3FromSpin); + return VINF_EM_HALT; + } + ASMNopPause(); + if (VMCPU_FF_IS_ANY_SET(pVCpu, fCpuFFs)) + { + STAM_REL_COUNTER_INC(&pVCpu->vmm.s.StatR0HaltToR3FromSpin); + return VINF_EM_HALT; + } + ASMNopPause(); + if (VMCPU_FF_IS_ANY_SET(pVCpu, fIntMask)) + { + STAM_REL_COUNTER_INC(&pVCpu->vmm.s.StatR0HaltExecFromSpin); + return vmmR0DoHaltInterrupt(pVCpu, uMWait, enmInterruptibility); + } + ASMNopPause(); + } + } + + /* Block. We have to set the state to VMCPUSTATE_STARTED_HALTED here so ring-3 + knows when to notify us (cannot access VMINTUSERPERVMCPU::fWait from here). 
*/ + VMCPU_CMPXCHG_STATE(pVCpu, VMCPUSTATE_STARTED_HALTED, VMCPUSTATE_STARTED); + uint64_t const u64StartSchedHalt = RTTimeNanoTS(); + int rc = GVMMR0SchedHalt(pGVM, pVM, pGVCpu, u64GipTime); + uint64_t const u64EndSchedHalt = RTTimeNanoTS(); + uint64_t const cNsElapsedSchedHalt = u64EndSchedHalt - u64StartSchedHalt; + VMCPU_CMPXCHG_STATE(pVCpu, VMCPUSTATE_STARTED, VMCPUSTATE_STARTED_HALTED); + STAM_REL_PROFILE_ADD_PERIOD(&pVCpu->vmm.s.StatR0HaltBlock, cNsElapsedSchedHalt); + if ( rc == VINF_SUCCESS + || rc == VERR_INTERRUPTED) + + { + /* Keep some stats like ring-3 does. */ + int64_t const cNsOverslept = u64EndSchedHalt - u64GipTime; + if (cNsOverslept > 50000) + STAM_REL_PROFILE_ADD_PERIOD(&pVCpu->vmm.s.StatR0HaltBlockOverslept, cNsOverslept); + else if (cNsOverslept < -50000) + STAM_REL_PROFILE_ADD_PERIOD(&pVCpu->vmm.s.StatR0HaltBlockInsomnia, cNsElapsedSchedHalt); + else + STAM_REL_PROFILE_ADD_PERIOD(&pVCpu->vmm.s.StatR0HaltBlockOnTime, cNsElapsedSchedHalt); + + /* + * Recheck whether we can resume execution or have to go to ring-3. + */ + if ( !VM_FF_IS_ANY_SET(pVM, fVmFFs) + && !VMCPU_FF_IS_ANY_SET(pVCpu, fCpuFFs)) + { + if (VMCPU_FF_TEST_AND_CLEAR(pVCpu, VMCPU_FF_UPDATE_APIC)) + APICUpdatePendingInterrupts(pVCpu); + if (VMCPU_FF_IS_ANY_SET(pVCpu, fIntMask)) + { + STAM_REL_COUNTER_INC(&pVCpu->vmm.s.StatR0HaltExecFromBlock); + return vmmR0DoHaltInterrupt(pVCpu, uMWait, enmInterruptibility); + } + } + } + } + } + } + } + return VINF_EM_HALT; +} + + +/** + * VMM ring-0 thread-context callback. + * + * This does common HM state updating and calls the HM-specific thread-context + * callback. + * + * @param enmEvent The thread-context event. + * @param pvUser Opaque pointer to the VMCPU. + * + * @thread EMT(pvUser) + */ +static DECLCALLBACK(void) vmmR0ThreadCtxCallback(RTTHREADCTXEVENT enmEvent, void *pvUser) +{ + PVMCPU pVCpu = (PVMCPU)pvUser; + + switch (enmEvent) + { + case RTTHREADCTXEVENT_IN: + { + /* + * Linux may call us with preemption enabled (really!) 
but technically we + * cannot get preempted here, otherwise we end up in an infinite recursion + * scenario (i.e. preempted in resume hook -> preempt hook -> resume hook... + * ad infinitum). Let's just disable preemption for now... + */ + /** @todo r=bird: I don't believe the above. The linux code is clearly enabling + * preemption after doing the callout (one or two functions up the + * call chain). */ + /** @todo r=ramshankar: See @bugref{5313#c30}. */ + RTTHREADPREEMPTSTATE ParanoidPreemptState = RTTHREADPREEMPTSTATE_INITIALIZER; + RTThreadPreemptDisable(&ParanoidPreemptState); + + /* We need to update the VCPU <-> host CPU mapping. */ + RTCPUID idHostCpu; + uint32_t iHostCpuSet = RTMpCurSetIndexAndId(&idHostCpu); + pVCpu->iHostCpuSet = iHostCpuSet; + ASMAtomicWriteU32(&pVCpu->idHostCpu, idHostCpu); + + /* In the very unlikely event that the GIP delta for the CPU we're + rescheduled needs calculating, try force a return to ring-3. + We unfortunately cannot do the measurements right here. + NOTE(review): the test below is not negated, so VMCPU_FF_TO_R3 gets set when + SUPIsTscDeltaAvailableForCpuSetIndex() returns true. VMMR0EntryFast treats a + true return as the normal case and a false one as 'TSC delta in need of + measuring', so this condition looks inverted relative to the text above - + confirm before changing. */ + if (RT_UNLIKELY(SUPIsTscDeltaAvailableForCpuSetIndex(iHostCpuSet))) + VMCPU_FF_SET(pVCpu, VMCPU_FF_TO_R3); + + /* Invoke the HM-specific thread-context callback. */ + HMR0ThreadCtxCallback(enmEvent, pvUser); + + /* Restore preemption. */ + RTThreadPreemptRestore(&ParanoidPreemptState); + break; + } + + case RTTHREADCTXEVENT_OUT: + { + /* Invoke the HM-specific thread-context callback. */ + HMR0ThreadCtxCallback(enmEvent, pvUser); + + /* + * Sigh. See VMMGetCpu() used by VMCPU_ASSERT_EMT(). We cannot let several VCPUs + * have the same host CPU associated with it. + */ + pVCpu->iHostCpuSet = UINT32_MAX; + ASMAtomicWriteU32(&pVCpu->idHostCpu, NIL_RTCPUID); + break; + } + + default: + /* Invoke the HM-specific thread-context callback. */ + HMR0ThreadCtxCallback(enmEvent, pvUser); + break; + } +} + + +/** + * Creates thread switching hook for the current EMT thread. + * + * This is called by GVMMR0CreateVM and GVMMR0RegisterVCpu.
If the host + * platform does not implement switcher hooks, no hooks will be create and the + * member set to NIL_RTTHREADCTXHOOK. + * + * @returns VBox status code. + * @param pVCpu The cross context virtual CPU structure. + * @thread EMT(pVCpu) + */ +VMMR0_INT_DECL(int) VMMR0ThreadCtxHookCreateForEmt(PVMCPU pVCpu) +{ + VMCPU_ASSERT_EMT(pVCpu); + Assert(pVCpu->vmm.s.hCtxHook == NIL_RTTHREADCTXHOOK); + +#if 1 /* To disable this stuff change to zero. */ + int rc = RTThreadCtxHookCreate(&pVCpu->vmm.s.hCtxHook, 0, vmmR0ThreadCtxCallback, pVCpu); + if (RT_SUCCESS(rc)) + return rc; +#else + RT_NOREF(vmmR0ThreadCtxCallback); + int rc = VERR_NOT_SUPPORTED; +#endif + + /* No hook (creation failed or compiled out): reset the handle to NIL so the later hCtxHook != NIL_RTTHREADCTXHOOK checks skip it. */ + pVCpu->vmm.s.hCtxHook = NIL_RTTHREADCTXHOOK; + if (rc == VERR_NOT_SUPPORTED) + return VINF_SUCCESS; + + LogRelMax(32, ("RTThreadCtxHookCreate failed! rc=%Rrc pVCpu=%p idCpu=%RU32\n", rc, pVCpu, pVCpu->idCpu)); + return VINF_SUCCESS; /* Just ignore it, we can live without context hooks. */ +} + + +/** + * Destroys the thread switching hook for the specified VCPU. + * + * @param pVCpu The cross context virtual CPU structure. + * @remarks Can be called from any thread. + */ +VMMR0_INT_DECL(void) VMMR0ThreadCtxHookDestroyForEmt(PVMCPU pVCpu) +{ + int rc = RTThreadCtxHookDestroy(pVCpu->vmm.s.hCtxHook); + AssertRC(rc); + pVCpu->vmm.s.hCtxHook = NIL_RTTHREADCTXHOOK; +} + + +/** + * Disables the thread switching hook for this VCPU (if we got one). + * + * @param pVCpu The cross context virtual CPU structure. + * @thread EMT(pVCpu) + * + * @remarks This also clears VMCPU::idHostCpu, so the mapping is invalid after + * this call. This means you have to be careful with what you do! + */ +VMMR0_INT_DECL(void) VMMR0ThreadCtxHookDisable(PVMCPU pVCpu) +{ + /* + * Clear the VCPU <-> host CPU mapping as we've left HM context.
+ * @bugref{7726#c19} explains the need for this trick: + * + * hmR0VmxCallRing3Callback/hmR0SvmCallRing3Callback & + * hmR0VmxLeaveSession/hmR0SvmLeaveSession disable context hooks during + * longjmp & normal return to ring-3, which opens a window where we may be + * rescheduled without changing VMCPU::idHostCpu and cause confusion if + * the CPU starts executing a different EMT. Both functions first disable + * preemption and then call HMR0LeaveCpu which invalidates idHostCpu, leaving + * an opening for getting preempted. + */ + /** @todo Make HM not need this API! Then we could leave the hooks enabled + * all the time. */ + /** @todo move this into the context hook disabling if(). */ + ASMAtomicWriteU32(&pVCpu->idHostCpu, NIL_RTCPUID); + + /* + * Disable the context hook, if we got one. + */ + if (pVCpu->vmm.s.hCtxHook != NIL_RTTHREADCTXHOOK) + { + Assert(!RTThreadPreemptIsEnabled(NIL_RTTHREAD)); + int rc = RTThreadCtxHookDisable(pVCpu->vmm.s.hCtxHook); + AssertRC(rc); + } +} + + +/** + * Internal version of VMMR0ThreadCtxHookIsEnabled. + * + * @returns true if the hook is enabled (as reported by RTThreadCtxHookIsEnabled), false otherwise. + * @param pVCpu The cross context virtual CPU structure. + */ +DECLINLINE(bool) vmmR0ThreadCtxHookIsEnabled(PVMCPU pVCpu) +{ + return RTThreadCtxHookIsEnabled(pVCpu->vmm.s.hCtxHook); +} + + +/** + * Whether the thread-context hook is currently enabled for this VCPU. + * + * @returns true if enabled, false otherwise. + * @param pVCpu The cross context virtual CPU structure. + */ +VMMR0_INT_DECL(bool) VMMR0ThreadCtxHookIsEnabled(PVMCPU pVCpu) +{ + return vmmR0ThreadCtxHookIsEnabled(pVCpu); +} + + +#ifdef VBOX_WITH_STATISTICS +/** + * Record return code statistics + * @param pVM The cross context VM structure. + * @param pVCpu The cross context virtual CPU structure. + * @param rc The status code. + */ +static void vmmR0RecordRC(PVM pVM, PVMCPU pVCpu, int rc) +{ + /* + * Collect statistics.
+ */ + switch (rc) + { + case VINF_SUCCESS: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetNormal); + break; + case VINF_EM_RAW_INTERRUPT: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetInterrupt); + break; + case VINF_EM_RAW_INTERRUPT_HYPER: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetInterruptHyper); + break; + case VINF_EM_RAW_GUEST_TRAP: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetGuestTrap); + break; + case VINF_EM_RAW_RING_SWITCH: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetRingSwitch); + break; + case VINF_EM_RAW_RING_SWITCH_INT: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetRingSwitchInt); + break; + case VINF_EM_RAW_STALE_SELECTOR: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetStaleSelector); + break; + case VINF_EM_RAW_IRET_TRAP: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetIRETTrap); + break; + case VINF_IOM_R3_IOPORT_READ: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetIORead); + break; + case VINF_IOM_R3_IOPORT_WRITE: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetIOWrite); + break; + case VINF_IOM_R3_IOPORT_COMMIT_WRITE: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetIOCommitWrite); + break; + case VINF_IOM_R3_MMIO_READ: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetMMIORead); + break; + case VINF_IOM_R3_MMIO_WRITE: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetMMIOWrite); + break; + case VINF_IOM_R3_MMIO_COMMIT_WRITE: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetMMIOCommitWrite); + break; + case VINF_IOM_R3_MMIO_READ_WRITE: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetMMIOReadWrite); + break; + case VINF_PATM_HC_MMIO_PATCH_READ: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetMMIOPatchRead); + break; + case VINF_PATM_HC_MMIO_PATCH_WRITE: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetMMIOPatchWrite); + break; + case VINF_CPUM_R3_MSR_READ: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetMSRRead); + break; + case VINF_CPUM_R3_MSR_WRITE: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetMSRWrite); + break; + case VINF_EM_RAW_EMULATE_INSTR: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetEmulate); + break; + case VINF_PATCH_EMULATE_INSTR: + 
STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetPatchEmulate); + break; + case VINF_EM_RAW_EMULATE_INSTR_LDT_FAULT: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetLDTFault); + break; + case VINF_EM_RAW_EMULATE_INSTR_GDT_FAULT: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetGDTFault); + break; + case VINF_EM_RAW_EMULATE_INSTR_IDT_FAULT: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetIDTFault); + break; + case VINF_EM_RAW_EMULATE_INSTR_TSS_FAULT: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetTSSFault); + break; + case VINF_CSAM_PENDING_ACTION: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetCSAMTask); + break; + case VINF_PGM_SYNC_CR3: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetSyncCR3); + break; + case VINF_PATM_PATCH_INT3: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetPatchInt3); + break; + case VINF_PATM_PATCH_TRAP_PF: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetPatchPF); + break; + case VINF_PATM_PATCH_TRAP_GP: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetPatchGP); + break; + case VINF_PATM_PENDING_IRQ_AFTER_IRET: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetPatchIretIRQ); + break; + case VINF_EM_RESCHEDULE_REM: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetRescheduleREM); + break; + case VINF_EM_RAW_TO_R3: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetToR3Total); + if (VM_FF_IS_SET(pVM, VM_FF_TM_VIRTUAL_SYNC)) + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetToR3TMVirt); + else if (VM_FF_IS_SET(pVM, VM_FF_PGM_NEED_HANDY_PAGES)) + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetToR3HandyPages); + else if (VM_FF_IS_SET(pVM, VM_FF_PDM_QUEUES)) + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetToR3PDMQueues); + else if (VM_FF_IS_SET(pVM, VM_FF_EMT_RENDEZVOUS)) + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetToR3Rendezvous); + else if (VM_FF_IS_SET(pVM, VM_FF_PDM_DMA)) + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetToR3DMA); + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_TIMER)) + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetToR3Timer); + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_PDM_CRITSECT)) + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetToR3CritSect); + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_TO_R3)) + 
STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetToR3FF); + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_IEM)) + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetToR3Iem); + else if (VMCPU_FF_IS_SET(pVCpu, VMCPU_FF_IOM)) + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetToR3Iom); + else + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetToR3Unknown); + break; + + case VINF_EM_RAW_TIMER_PENDING: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetTimerPending); + break; + case VINF_EM_RAW_INTERRUPT_PENDING: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetInterruptPending); + break; + case VINF_VMM_CALL_HOST: + switch (pVCpu->vmm.s.enmCallRing3Operation) + { + case VMMCALLRING3_PDM_CRIT_SECT_ENTER: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZCallPDMCritSectEnter); + break; + case VMMCALLRING3_PDM_LOCK: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZCallPDMLock); + break; + case VMMCALLRING3_PGM_POOL_GROW: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZCallPGMPoolGrow); + break; + case VMMCALLRING3_PGM_LOCK: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZCallPGMLock); + break; + case VMMCALLRING3_PGM_MAP_CHUNK: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZCallPGMMapChunk); + break; + case VMMCALLRING3_PGM_ALLOCATE_HANDY_PAGES: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZCallPGMAllocHandy); + break; + case VMMCALLRING3_REM_REPLAY_HANDLER_NOTIFICATIONS: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZCallRemReplay); + break; + case VMMCALLRING3_VMM_LOGGER_FLUSH: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZCallLogFlush); + break; + case VMMCALLRING3_VM_SET_ERROR: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZCallVMSetError); + break; + case VMMCALLRING3_VM_SET_RUNTIME_ERROR: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZCallVMSetRuntimeError); + break; + case VMMCALLRING3_VM_R0_ASSERTION: + default: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetCallRing3); + break; + } + break; + case VINF_PATM_DUPLICATE_FUNCTION: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetPATMDuplicateFn); + break; + case VINF_PGM_CHANGE_MODE: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetPGMChangeMode); + break; + case VINF_PGM_POOL_FLUSH_PENDING: + 
STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetPGMFlushPending); + break; + case VINF_EM_PENDING_REQUEST: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetPendingRequest); + break; + case VINF_EM_HM_PATCH_TPR_INSTR: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetPatchTPR); + break; + default: + STAM_COUNTER_INC(&pVM->vmm.s.StatRZRetMisc); + break; + } +} +#endif /* VBOX_WITH_STATISTICS */ + + +/** + * The Ring 0 entry point, called by the fast-ioctl path. + * + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * The return code is stored in pVM->vmm.s.iLastGZRc. + * @param idCpu The Virtual CPU ID of the calling EMT. + * @param enmOperation Which operation to execute. + * @remarks Assume called with interrupts _enabled_. + */ +VMMR0DECL(void) VMMR0EntryFast(PGVM pGVM, PVM pVM, VMCPUID idCpu, VMMR0OPERATION enmOperation) +{ + /* + * Validation. + */ + if ( idCpu < pGVM->cCpus + && pGVM->cCpus == pVM->cCpus) + { /*likely*/ } + else + { + SUPR0Printf("VMMR0EntryFast: Bad idCpu=%#x cCpus=%#x/%#x\n", idCpu, pGVM->cCpus, pVM->cCpus); + return; + } + + PGVMCPU pGVCpu = &pGVM->aCpus[idCpu]; + PVMCPU pVCpu = &pVM->aCpus[idCpu]; + RTNATIVETHREAD const hNativeThread = RTThreadNativeSelf(); + if (RT_LIKELY( pGVCpu->hEMT == hNativeThread + && pVCpu->hNativeThreadR0 == hNativeThread)) + { /* likely */ } + else + { + SUPR0Printf("VMMR0EntryFast: Bad thread idCpu=%#x hNativeSelf=%p pGVCpu->hEmt=%p pVCpu->hNativeThreadR0=%p\n", + idCpu, hNativeThread, pGVCpu->hEMT, pVCpu->hNativeThreadR0); + return; + } + + /* + * SMAP fun. + */ + VMM_CHECK_SMAP_SETUP(); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + + /* + * Perform requested operation. + */ + switch (enmOperation) + { + /* + * Switch to GC and run guest raw mode code. + * Disable interrupts before doing the world switch. + */ + case VMMR0_DO_RAW_RUN: + { +#ifdef VBOX_WITH_RAW_MODE +# ifndef VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0 + /* Some safety precautions first. 
*/ + if (RT_UNLIKELY(!PGMGetHyperCR3(pVCpu))) + { + pVCpu->vmm.s.iLastGZRc = VERR_PGM_NO_CR3_SHADOW_ROOT; + break; + } +# endif + if (RT_SUCCESS(g_rcRawModeUsability)) + { /* likely */ } + else + { + pVCpu->vmm.s.iLastGZRc = g_rcRawModeUsability; + break; + } + + /* + * Disable preemption. + */ + RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER; + RTThreadPreemptDisable(&PreemptState); + + /* + * Get the host CPU identifiers, make sure they are valid and that + * we've got a TSC delta for the CPU. + */ + RTCPUID idHostCpu; + uint32_t iHostCpuSet = RTMpCurSetIndexAndId(&idHostCpu); + if (RT_LIKELY( iHostCpuSet < RTCPUSET_MAX_CPUS + && SUPIsTscDeltaAvailableForCpuSetIndex(iHostCpuSet))) + { + /* + * Commit the CPU identifiers and update the periodic preemption timer if it's active. + */ +# ifdef VBOX_WITH_VMMR0_DISABLE_LAPIC_NMI + CPUMR0SetLApic(pVCpu, iHostCpuSet); +# endif + pVCpu->iHostCpuSet = iHostCpuSet; + ASMAtomicWriteU32(&pVCpu->idHostCpu, idHostCpu); + + if (pVM->vmm.s.fUsePeriodicPreemptionTimers) + GVMMR0SchedUpdatePeriodicPreemptionTimer(pVM, pVCpu->idHostCpu, TMCalcHostTimerFrequency(pVM, pVCpu)); + + /* + * We might need to disable VT-x if the active switcher turns off paging. + */ + bool fVTxDisabled; + int rc = HMR0EnterSwitcher(pVM, pVM->vmm.s.enmSwitcher, &fVTxDisabled); + if (RT_SUCCESS(rc)) + { + /* + * Disable interrupts and run raw-mode code. The loop is for efficiently + * dispatching tracepoints that fired in raw-mode context.
+ */ + RTCCUINTREG uFlags = ASMIntDisableFlags(); + + for (;;) + { + VMCPU_SET_STATE(pVCpu, VMCPUSTATE_STARTED_EXEC); + TMNotifyStartOfExecution(pVCpu); + + rc = pVM->vmm.s.pfnR0ToRawMode(pVM); + pVCpu->vmm.s.iLastGZRc = rc; + + TMNotifyEndOfExecution(pVCpu); + VMCPU_SET_STATE(pVCpu, VMCPUSTATE_STARTED); + + if (rc != VINF_VMM_CALL_TRACER) + break; + SUPR0TracerUmodProbeFire(pVM->pSession, &pVCpu->vmm.s.TracerCtx); + } + + /* + * Re-enable VT-x before we dispatch any pending host interrupts and + * re-enables interrupts. + */ + HMR0LeaveSwitcher(pVM, fVTxDisabled); + + if ( rc == VINF_EM_RAW_INTERRUPT + || rc == VINF_EM_RAW_INTERRUPT_HYPER) + TRPMR0DispatchHostInterrupt(pVM); + + ASMSetFlags(uFlags); + + /* Fire dtrace probe and collect statistics. */ + VBOXVMM_R0_VMM_RETURN_TO_RING3_RC(pVCpu, CPUMQueryGuestCtxPtr(pVCpu), rc); +# ifdef VBOX_WITH_STATISTICS + STAM_COUNTER_INC(&pVM->vmm.s.StatRunRC); + vmmR0RecordRC(pVM, pVCpu, rc); +# endif + } + else + pVCpu->vmm.s.iLastGZRc = rc; + + /* + * Invalidate the host CPU identifiers as we restore preemption. + */ + pVCpu->iHostCpuSet = UINT32_MAX; + ASMAtomicWriteU32(&pVCpu->idHostCpu, NIL_RTCPUID); + + RTThreadPreemptRestore(&PreemptState); + } + /* + * Invalid CPU set index or TSC delta in need of measuring. + */ + else + { + RTThreadPreemptRestore(&PreemptState); + if (iHostCpuSet < RTCPUSET_MAX_CPUS) + { + int rc = SUPR0TscDeltaMeasureBySetIndex(pVM->pSession, iHostCpuSet, 0 /*fFlags*/, + 2 /*cMsWaitRetry*/, 5*RT_MS_1SEC /*cMsWaitThread*/, + 0 /*default cTries*/); + if (RT_SUCCESS(rc) || rc == VERR_CPU_OFFLINE) + pVCpu->vmm.s.iLastGZRc = VINF_EM_RAW_TO_R3; + else + pVCpu->vmm.s.iLastGZRc = rc; + } + else + pVCpu->vmm.s.iLastGZRc = VERR_INVALID_CPU_INDEX; + } + +#else /* !VBOX_WITH_RAW_MODE */ + pVCpu->vmm.s.iLastGZRc = VERR_RAW_MODE_NOT_SUPPORTED; +#endif + break; + } + + /* + * Run guest code using the available hardware acceleration technology. 
+ */ + case VMMR0_DO_HM_RUN: + { + for (;;) /* hlt loop */ + { + /* + * Disable preemption. + */ + Assert(!vmmR0ThreadCtxHookIsEnabled(pVCpu)); + RTTHREADPREEMPTSTATE PreemptState = RTTHREADPREEMPTSTATE_INITIALIZER; + RTThreadPreemptDisable(&PreemptState); + + /* + * Get the host CPU identifiers, make sure they are valid and that + * we've got a TSC delta for the CPU. + */ + RTCPUID idHostCpu; + uint32_t iHostCpuSet = RTMpCurSetIndexAndId(&idHostCpu); + if (RT_LIKELY( iHostCpuSet < RTCPUSET_MAX_CPUS + && SUPIsTscDeltaAvailableForCpuSetIndex(iHostCpuSet))) + { + pVCpu->iHostCpuSet = iHostCpuSet; + ASMAtomicWriteU32(&pVCpu->idHostCpu, idHostCpu); + + /* + * Update the periodic preemption timer if it's active. + */ + if (pVM->vmm.s.fUsePeriodicPreemptionTimers) + GVMMR0SchedUpdatePeriodicPreemptionTimer(pVM, pVCpu->idHostCpu, TMCalcHostTimerFrequency(pVM, pVCpu)); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + +#ifdef VMM_R0_TOUCH_FPU + /* + * Make sure we've got the FPU state loaded so that we don't need to clear + * CR0.TS and get out of sync with the host kernel when loading the guest + * FPU state. @ref sec_cpum_fpu (CPUM.cpp) and @bugref{4053}. + */ + CPUMR0TouchHostFpu(); +#endif + int rc; + bool fPreemptRestored = false; + if (!HMR0SuspendPending()) + { + /* + * Enable the context switching hook. + */ + if (pVCpu->vmm.s.hCtxHook != NIL_RTTHREADCTXHOOK) + { + Assert(!RTThreadCtxHookIsEnabled(pVCpu->vmm.s.hCtxHook)); + int rc2 = RTThreadCtxHookEnable(pVCpu->vmm.s.hCtxHook); AssertRC(rc2); + } + + /* + * Enter HM context. + */ + rc = HMR0Enter(pVCpu); + if (RT_SUCCESS(rc)) + { + VMCPU_SET_STATE(pVCpu, VMCPUSTATE_STARTED_HM); + + /* + * When preemption hooks are in place, enable preemption now that + * we're in HM context. + */ + if (vmmR0ThreadCtxHookIsEnabled(pVCpu)) + { + fPreemptRestored = true; + RTThreadPreemptRestore(&PreemptState); + } + + /* + * Setup the longjmp machinery and execute guest code (calls HMR0RunGuestCode).
+ */ + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + rc = vmmR0CallRing3SetJmp(&pVCpu->vmm.s.CallRing3JmpBufR0, HMR0RunGuestCode, pVM, pVCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + + /* + * Assert sanity on the way out. Using manual assertions code here as normal + * assertions are going to panic the host since we're outside the setjmp/longjmp zone. + */ + if (RT_UNLIKELY( VMCPU_GET_STATE(pVCpu) != VMCPUSTATE_STARTED_HM + && RT_SUCCESS_NP(rc) && rc != VINF_VMM_CALL_HOST )) + { + pVM->vmm.s.szRing0AssertMsg1[0] = '\0'; + RTStrPrintf(pVM->vmm.s.szRing0AssertMsg2, sizeof(pVM->vmm.s.szRing0AssertMsg2), + "Got VMCPU state %d expected %d.\n", VMCPU_GET_STATE(pVCpu), VMCPUSTATE_STARTED_HM); + rc = VERR_VMM_WRONG_HM_VMCPU_STATE; + } + /** @todo Get rid of this. HM shouldn't disable the context hook. */ + else if (RT_UNLIKELY(vmmR0ThreadCtxHookIsEnabled(pVCpu))) + { + pVM->vmm.s.szRing0AssertMsg1[0] = '\0'; + RTStrPrintf(pVM->vmm.s.szRing0AssertMsg2, sizeof(pVM->vmm.s.szRing0AssertMsg2), + "Thread-context hooks still enabled! VCPU=%p Id=%u rc=%d.\n", pVCpu, pVCpu->idCpu, rc); + rc = VERR_INVALID_STATE; + } + + VMCPU_SET_STATE(pVCpu, VMCPUSTATE_STARTED); + } + STAM_COUNTER_INC(&pVM->vmm.s.StatRunRC); + + /* + * Invalidate the host CPU identifiers before we disable the context + * hook / restore preemption. + */ + pVCpu->iHostCpuSet = UINT32_MAX; + ASMAtomicWriteU32(&pVCpu->idHostCpu, NIL_RTCPUID); + + /* + * Disable context hooks. Due to unresolved cleanup issues, we + * cannot leave the hooks enabled when we return to ring-3. + * + * Note! At the moment HM may also have disabled the hook + * when we get here, but the IPRT API handles that. + * + * NOTE(review): idHostCpu was already set to NIL_RTCPUID a few lines + * above, so the ASMAtomicWriteU32 inside the if below repeats that + * write; the status returned by RTThreadCtxHookDisable is not checked here. + */ + if (pVCpu->vmm.s.hCtxHook != NIL_RTTHREADCTXHOOK) + { + ASMAtomicWriteU32(&pVCpu->idHostCpu, NIL_RTCPUID); + RTThreadCtxHookDisable(pVCpu->vmm.s.hCtxHook); + } + } + /* + * The system is about to go into suspend mode; go back to ring 3.
+ */ + else + { + rc = VINF_EM_RAW_INTERRUPT; + pVCpu->iHostCpuSet = UINT32_MAX; + ASMAtomicWriteU32(&pVCpu->idHostCpu, NIL_RTCPUID); + } + + /** @todo When HM stops messing with the context hook state, we'll disable + * preemption again before the RTThreadCtxHookDisable call. */ + if (!fPreemptRestored) + RTThreadPreemptRestore(&PreemptState); + + pVCpu->vmm.s.iLastGZRc = rc; + + /* Fire dtrace probe and collect statistics. */ + VBOXVMM_R0_VMM_RETURN_TO_RING3_HM(pVCpu, CPUMQueryGuestCtxPtr(pVCpu), rc); +#ifdef VBOX_WITH_STATISTICS + vmmR0RecordRC(pVM, pVCpu, rc); +#endif +#if 1 + /* + * If this is a halt. + */ + if (rc != VINF_EM_HALT) + { /* we're not in a hurry for a HLT, so prefer this path */ } + else + { + pVCpu->vmm.s.iLastGZRc = rc = vmmR0DoHalt(pGVM, pVM, pGVCpu, pVCpu); + if (rc == VINF_SUCCESS) + { + pVCpu->vmm.s.cR0HaltsSucceeded++; + continue; + } + pVCpu->vmm.s.cR0HaltsToRing3++; + } +#endif + } + /* + * Invalid CPU set index or TSC delta in need of measuring. + */ + else + { + pVCpu->iHostCpuSet = UINT32_MAX; + ASMAtomicWriteU32(&pVCpu->idHostCpu, NIL_RTCPUID); + RTThreadPreemptRestore(&PreemptState); + if (iHostCpuSet < RTCPUSET_MAX_CPUS) + { + int rc = SUPR0TscDeltaMeasureBySetIndex(pVM->pSession, iHostCpuSet, 0 /*fFlags*/, + 2 /*cMsWaitRetry*/, 5*RT_MS_1SEC /*cMsWaitThread*/, + 0 /*default cTries*/); + if (RT_SUCCESS(rc) || rc == VERR_CPU_OFFLINE) + pVCpu->vmm.s.iLastGZRc = VINF_EM_RAW_TO_R3; + else + pVCpu->vmm.s.iLastGZRc = rc; + } + else + pVCpu->vmm.s.iLastGZRc = VERR_INVALID_CPU_INDEX; + } + break; + + } /* halt loop. */ + break; + } + +#ifdef VBOX_WITH_NEM_R0 +# if defined(RT_ARCH_AMD64) && defined(RT_OS_WINDOWS) + case VMMR0_DO_NEM_RUN: + { + /* + * Setup the longjmp machinery and execute guest code (calls NEMR0RunGuestCode). 
+ */ + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + int rc = vmmR0CallRing3SetJmp2(&pVCpu->vmm.s.CallRing3JmpBufR0, NEMR0RunGuestCode, pGVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + STAM_COUNTER_INC(&pVM->vmm.s.StatRunRC); + + pVCpu->vmm.s.iLastGZRc = rc; + + /* + * Fire dtrace probe and collect statistics. + */ + VBOXVMM_R0_VMM_RETURN_TO_RING3_NEM(pVCpu, CPUMQueryGuestCtxPtr(pVCpu), rc); +# ifdef VBOX_WITH_STATISTICS + vmmR0RecordRC(pVM, pVCpu, rc); +# endif + break; + } +# endif +#endif + + + /* + * For profiling. + */ + case VMMR0_DO_NOP: + pVCpu->vmm.s.iLastGZRc = VINF_SUCCESS; + break; + + /* + * Shouldn't happen. + */ + default: + AssertMsgFailed(("%#x\n", enmOperation)); + pVCpu->vmm.s.iLastGZRc = VERR_NOT_SUPPORTED; + break; + } + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); +} + + +/** + * Validates a session or VM session argument. + * + * @returns true / false accordingly. + * @param pVM The cross context VM structure. + * @param pClaimedSession The session claim to validate. + * @param pSession The session argument. + */ +DECLINLINE(bool) vmmR0IsValidSession(PVM pVM, PSUPDRVSESSION pClaimedSession, PSUPDRVSESSION pSession) +{ + /* This must be set! */ + if (!pSession) + return false; + + /* Only one out of the two. */ + if (pVM && pClaimedSession) + return false; + if (pVM) + pClaimedSession = pVM->pSession; + return pClaimedSession == pSession; +} + + +/** + * VMMR0EntryEx worker function, either called directly or when ever possible + * called thru a longjmp so we can exit safely on failure. + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu Virtual CPU ID argument. Must be NIL_VMCPUID if pVM + * is NIL_RTR0PTR, and may be NIL_VMCPUID if it isn't + * @param enmOperation Which operation to execute. + * @param pReqHdr This points to a SUPVMMR0REQHDR packet. Optional. + * The support driver validates this if it's present. 
+ * @param u64Arg Some simple constant argument. + * @param pSession The session of the caller. + * + * @remarks Assume called with interrupts _enabled_. + */ +static int vmmR0EntryExWorker(PGVM pGVM, PVM pVM, VMCPUID idCpu, VMMR0OPERATION enmOperation, + PSUPVMMR0REQHDR pReqHdr, uint64_t u64Arg, PSUPDRVSESSION pSession) +{ + /* + * Validate pGVM, pVM and idCpu for consistency and validity. + */ + if ( pGVM != NULL + || pVM != NULL) + { + if (RT_LIKELY( RT_VALID_PTR(pGVM) + && RT_VALID_PTR(pVM) + && ((uintptr_t)pVM & PAGE_OFFSET_MASK) == 0)) + { /* likely */ } + else + { + SUPR0Printf("vmmR0EntryExWorker: Invalid pGVM=%p and/or pVM=%p! (op=%d)\n", pGVM, pVM, enmOperation); + return VERR_INVALID_POINTER; + } + + if (RT_LIKELY(pGVM->pVM == pVM)) + { /* likely */ } + else + { + SUPR0Printf("vmmR0EntryExWorker: pVM mismatch: got %p, pGVM->pVM=%p\n", pVM, pGVM->pVM); + return VERR_INVALID_PARAMETER; + } + + if (RT_LIKELY(idCpu == NIL_VMCPUID || idCpu < pGVM->cCpus)) + { /* likely */ } + else + { + SUPR0Printf("vmmR0EntryExWorker: Invalid idCpu %#x (cCpus=%#x)\n", idCpu, pGVM->cCpus); + return VERR_INVALID_PARAMETER; + } + + if (RT_LIKELY( pVM->enmVMState >= VMSTATE_CREATING + && pVM->enmVMState <= VMSTATE_TERMINATED + && pVM->cCpus == pGVM->cCpus + && pVM->pSession == pSession + && pVM->pVMR0 == pVM)) + { /* likely */ } + else + { + SUPR0Printf("vmmR0EntryExWorker: Invalid pVM=%p:{.enmVMState=%d, .cCpus=%#x(==%#x), .pSession=%p(==%p), .pVMR0=%p(==%p)}! (op=%d)\n", + pVM, pVM->enmVMState, pVM->cCpus, pGVM->cCpus, pVM->pSession, pSession, pVM->pVMR0, pVM, enmOperation); + return VERR_INVALID_POINTER; + } + } + else if (RT_LIKELY(idCpu == NIL_VMCPUID)) + { /* likely */ } + else + { + SUPR0Printf("vmmR0EntryExWorker: Invalid idCpu=%u\n", idCpu); + return VERR_INVALID_PARAMETER; + } + + /* + * SMAP fun. + */ + VMM_CHECK_SMAP_SETUP(); + VMM_CHECK_SMAP_CHECK(RT_NOTHING); + + /* + * Process the request. 
+ */ + int rc; + switch (enmOperation) + { + /* + * GVM requests + */ + case VMMR0_DO_GVMM_CREATE_VM: + if (pGVM == NULL && pVM == NULL && u64Arg == 0 && idCpu == NIL_VMCPUID) + rc = GVMMR0CreateVMReq((PGVMMCREATEVMREQ)pReqHdr, pSession); + else + rc = VERR_INVALID_PARAMETER; + VMM_CHECK_SMAP_CHECK(RT_NOTHING); + break; + + case VMMR0_DO_GVMM_DESTROY_VM: + if (pReqHdr == NULL && u64Arg == 0) + rc = GVMMR0DestroyVM(pGVM, pVM); + else + rc = VERR_INVALID_PARAMETER; + VMM_CHECK_SMAP_CHECK(RT_NOTHING); + break; + + case VMMR0_DO_GVMM_REGISTER_VMCPU: + if (pGVM != NULL && pVM != NULL) + rc = GVMMR0RegisterVCpu(pGVM, pVM, idCpu); + else + rc = VERR_INVALID_PARAMETER; + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GVMM_DEREGISTER_VMCPU: + if (pGVM != NULL && pVM != NULL) + rc = GVMMR0DeregisterVCpu(pGVM, pVM, idCpu); + else + rc = VERR_INVALID_PARAMETER; + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GVMM_SCHED_HALT: + if (pReqHdr) + return VERR_INVALID_PARAMETER; + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + rc = GVMMR0SchedHaltReq(pGVM, pVM, idCpu, u64Arg); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GVMM_SCHED_WAKE_UP: + if (pReqHdr || u64Arg) + return VERR_INVALID_PARAMETER; + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + rc = GVMMR0SchedWakeUp(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GVMM_SCHED_POKE: + if (pReqHdr || u64Arg) + return VERR_INVALID_PARAMETER; + rc = GVMMR0SchedPoke(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GVMM_SCHED_WAKE_UP_AND_POKE_CPUS: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GVMMR0SchedWakeUpAndPokeCpusReq(pGVM, pVM, (PGVMMSCHEDWAKEUPANDPOKECPUSREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GVMM_SCHED_POLL: + if (pReqHdr || u64Arg > 1) + return VERR_INVALID_PARAMETER; + rc = GVMMR0SchedPoll(pGVM, pVM, idCpu, !!u64Arg); + 
VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GVMM_QUERY_STATISTICS: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GVMMR0QueryStatisticsReq(pGVM, pVM, (PGVMMQUERYSTATISTICSSREQ)pReqHdr, pSession); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GVMM_RESET_STATISTICS: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GVMMR0ResetStatisticsReq(pGVM, pVM, (PGVMMRESETSTATISTICSSREQ)pReqHdr, pSession); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + /* + * Initialize the R0 part of a VM instance. + */ + case VMMR0_DO_VMMR0_INIT: + rc = vmmR0InitVM(pGVM, pVM, RT_LODWORD(u64Arg), RT_HIDWORD(u64Arg)); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + /* + * Does EMT specific ring-0 init. + */ + case VMMR0_DO_VMMR0_INIT_EMT: + rc = vmmR0InitVMEmt(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + /* + * Terminate the R0 part of a VM instance. + */ + case VMMR0_DO_VMMR0_TERM: + rc = VMMR0TermVM(pGVM, pVM, 0 /*idCpu*/); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + /* + * Attempt to enable hm mode and check the current setting. + */ + case VMMR0_DO_HM_ENABLE: + rc = HMR0EnableAllCpus(pVM); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + /* + * Setup the hardware accelerated session. + */ + case VMMR0_DO_HM_SETUP_VM: + rc = HMR0SetupVM(pVM); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + /* + * Switch to RC to execute Hypervisor function. + */ + case VMMR0_DO_CALL_HYPERVISOR: + { +#ifdef VBOX_WITH_RAW_MODE + /* + * Validate input / context. + */ + if (RT_UNLIKELY(idCpu != 0)) + return VERR_INVALID_CPU_ID; + if (RT_UNLIKELY(pVM->cCpus != 1)) + return VERR_INVALID_PARAMETER; + PVMCPU pVCpu = &pVM->aCpus[idCpu]; +# ifndef VBOX_WITH_2X_4GB_ADDR_SPACE_IN_R0 + if (RT_UNLIKELY(!PGMGetHyperCR3(pVCpu))) + return VERR_PGM_NO_CR3_SHADOW_ROOT; +# endif + if (RT_FAILURE(g_rcRawModeUsability)) + return g_rcRawModeUsability; + + /* + * Disable interrupts. 
+ */ + RTCCUINTREG fFlags = ASMIntDisableFlags(); + + /* + * Get the host CPU identifiers, make sure they are valid and that + * we've got a TSC delta for the CPU. + */ + RTCPUID idHostCpu; + uint32_t iHostCpuSet = RTMpCurSetIndexAndId(&idHostCpu); + if (RT_UNLIKELY(iHostCpuSet >= RTCPUSET_MAX_CPUS)) + { + ASMSetFlags(fFlags); + return VERR_INVALID_CPU_INDEX; + } + if (RT_UNLIKELY(!SUPIsTscDeltaAvailableForCpuSetIndex(iHostCpuSet))) + { + ASMSetFlags(fFlags); + rc = SUPR0TscDeltaMeasureBySetIndex(pVM->pSession, iHostCpuSet, 0 /*fFlags*/, + 2 /*cMsWaitRetry*/, 5*RT_MS_1SEC /*cMsWaitThread*/, + 0 /*default cTries*/); + if (RT_FAILURE(rc) && rc != VERR_CPU_OFFLINE) + { + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + return rc; + } + } + + /* + * Commit the CPU identifiers. + */ +# ifdef VBOX_WITH_VMMR0_DISABLE_LAPIC_NMI + CPUMR0SetLApic(pVCpu, iHostCpuSet); +# endif + pVCpu->iHostCpuSet = iHostCpuSet; + ASMAtomicWriteU32(&pVCpu->idHostCpu, idHostCpu); + + /* + * We might need to disable VT-x if the active switcher turns off paging. + */ + bool fVTxDisabled; + rc = HMR0EnterSwitcher(pVM, pVM->vmm.s.enmSwitcher, &fVTxDisabled); + if (RT_SUCCESS(rc)) + { + /* + * Go through the wormhole... + */ + rc = pVM->vmm.s.pfnR0ToRawMode(pVM); + + /* + * Re-enable VT-x before we dispatch any pending host interrupts. + */ + HMR0LeaveSwitcher(pVM, fVTxDisabled); + + if ( rc == VINF_EM_RAW_INTERRUPT + || rc == VINF_EM_RAW_INTERRUPT_HYPER) + TRPMR0DispatchHostInterrupt(pVM); + } + + /* + * Invalidate the host CPU identifiers as we restore interrupts. + */ + pVCpu->iHostCpuSet = UINT32_MAX; + ASMAtomicWriteU32(&pVCpu->idHostCpu, NIL_RTCPUID); + ASMSetFlags(fFlags); + +#else /* !VBOX_WITH_RAW_MODE */ + rc = VERR_RAW_MODE_NOT_SUPPORTED; +#endif + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + } + + /* + * PGM wrappers. 
+ */ + case VMMR0_DO_PGM_ALLOCATE_HANDY_PAGES: + if (idCpu == NIL_VMCPUID) + return VERR_INVALID_CPU_ID; + rc = PGMR0PhysAllocateHandyPages(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_PGM_FLUSH_HANDY_PAGES: + if (idCpu == NIL_VMCPUID) + return VERR_INVALID_CPU_ID; + rc = PGMR0PhysFlushHandyPages(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_PGM_ALLOCATE_LARGE_HANDY_PAGE: + if (idCpu == NIL_VMCPUID) + return VERR_INVALID_CPU_ID; + rc = PGMR0PhysAllocateLargeHandyPage(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_PGM_PHYS_SETUP_IOMMU: + if (idCpu != 0) + return VERR_INVALID_CPU_ID; + rc = PGMR0PhysSetupIoMmu(pGVM, pVM); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + /* + * GMM wrappers. + */ + case VMMR0_DO_GMM_INITIAL_RESERVATION: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0InitialReservationReq(pGVM, pVM, idCpu, (PGMMINITIALRESERVATIONREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_UPDATE_RESERVATION: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0UpdateReservationReq(pGVM, pVM, idCpu, (PGMMUPDATERESERVATIONREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_ALLOCATE_PAGES: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0AllocatePagesReq(pGVM, pVM, idCpu, (PGMMALLOCATEPAGESREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_FREE_PAGES: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0FreePagesReq(pGVM, pVM, idCpu, (PGMMFREEPAGESREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_FREE_LARGE_PAGE: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0FreeLargePageReq(pGVM, pVM, idCpu, (PGMMFREELARGEPAGEREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_QUERY_HYPERVISOR_MEM_STATS: + if (u64Arg) + return 
VERR_INVALID_PARAMETER; + rc = GMMR0QueryHypervisorMemoryStatsReq((PGMMMEMSTATSREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_QUERY_MEM_STATS: + if (idCpu == NIL_VMCPUID) + return VERR_INVALID_CPU_ID; + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0QueryMemoryStatsReq(pGVM, pVM, idCpu, (PGMMMEMSTATSREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_BALLOONED_PAGES: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0BalloonedPagesReq(pGVM, pVM, idCpu, (PGMMBALLOONEDPAGESREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_MAP_UNMAP_CHUNK: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0MapUnmapChunkReq(pGVM, pVM, (PGMMMAPUNMAPCHUNKREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_SEED_CHUNK: + if (pReqHdr) + return VERR_INVALID_PARAMETER; + rc = GMMR0SeedChunk(pGVM, pVM, idCpu, (RTR3PTR)u64Arg); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_REGISTER_SHARED_MODULE: + if (idCpu == NIL_VMCPUID) + return VERR_INVALID_CPU_ID; + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0RegisterSharedModuleReq(pGVM, pVM, idCpu, (PGMMREGISTERSHAREDMODULEREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_UNREGISTER_SHARED_MODULE: + if (idCpu == NIL_VMCPUID) + return VERR_INVALID_CPU_ID; + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0UnregisterSharedModuleReq(pGVM, pVM, idCpu, (PGMMUNREGISTERSHAREDMODULEREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_RESET_SHARED_MODULES: + if (idCpu == NIL_VMCPUID) + return VERR_INVALID_CPU_ID; + if ( u64Arg + || pReqHdr) + return VERR_INVALID_PARAMETER; + rc = GMMR0ResetSharedModules(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + +#ifdef VBOX_WITH_PAGE_SHARING + case VMMR0_DO_GMM_CHECK_SHARED_MODULES: + { + if (idCpu == 
NIL_VMCPUID) + return VERR_INVALID_CPU_ID; + if ( u64Arg + || pReqHdr) + return VERR_INVALID_PARAMETER; + rc = GMMR0CheckSharedModules(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + } +#endif + +#if defined(VBOX_STRICT) && HC_ARCH_BITS == 64 + case VMMR0_DO_GMM_FIND_DUPLICATE_PAGE: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0FindDuplicatePageReq(pGVM, pVM, (PGMMFINDDUPLICATEPAGEREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; +#endif + + case VMMR0_DO_GMM_QUERY_STATISTICS: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0QueryStatisticsReq(pGVM, pVM, (PGMMQUERYSTATISTICSSREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_GMM_RESET_STATISTICS: + if (u64Arg) + return VERR_INVALID_PARAMETER; + rc = GMMR0ResetStatisticsReq(pGVM, pVM, (PGMMRESETSTATISTICSSREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + /* + * A quick GCFGM mock-up. + */ + /** @todo GCFGM with proper access control, ring-3 management interface and all that. */ + case VMMR0_DO_GCFGM_SET_VALUE: + case VMMR0_DO_GCFGM_QUERY_VALUE: + { + if (pGVM || pVM || !pReqHdr || u64Arg || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + PGCFGMVALUEREQ pReq = (PGCFGMVALUEREQ)pReqHdr; + if (pReq->Hdr.cbReq != sizeof(*pReq)) + return VERR_INVALID_PARAMETER; + if (enmOperation == VMMR0_DO_GCFGM_SET_VALUE) + { + rc = GVMMR0SetConfig(pReq->pSession, &pReq->szName[0], pReq->u64Value); + //if (rc == VERR_CFGM_VALUE_NOT_FOUND) + // rc = GMMR0SetConfig(pReq->pSession, &pReq->szName[0], pReq->u64Value); + } + else + { + rc = GVMMR0QueryConfig(pReq->pSession, &pReq->szName[0], &pReq->u64Value); + //if (rc == VERR_CFGM_VALUE_NOT_FOUND) + // rc = GMMR0QueryConfig(pReq->pSession, &pReq->szName[0], &pReq->u64Value); + } + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + } + + /* + * PDM Wrappers. 
+ */ + case VMMR0_DO_PDM_DRIVER_CALL_REQ_HANDLER: + { + if (!pReqHdr || u64Arg || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = PDMR0DriverCallReqHandler(pGVM, pVM, (PPDMDRIVERCALLREQHANDLERREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + } + + case VMMR0_DO_PDM_DEVICE_CALL_REQ_HANDLER: + { + if (!pReqHdr || u64Arg || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = PDMR0DeviceCallReqHandler(pGVM, pVM, (PPDMDEVICECALLREQHANDLERREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + } + + /* + * Requests to the internal networking service. + */ + case VMMR0_DO_INTNET_OPEN: + { + PINTNETOPENREQ pReq = (PINTNETOPENREQ)pReqHdr; + if (u64Arg || !pReq || !vmmR0IsValidSession(pVM, pReq->pSession, pSession) || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = IntNetR0OpenReq(pSession, pReq); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + } + + case VMMR0_DO_INTNET_IF_CLOSE: + if (u64Arg || !pReqHdr || !vmmR0IsValidSession(pVM, ((PINTNETIFCLOSEREQ)pReqHdr)->pSession, pSession) || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = IntNetR0IfCloseReq(pSession, (PINTNETIFCLOSEREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + + case VMMR0_DO_INTNET_IF_GET_BUFFER_PTRS: + if (u64Arg || !pReqHdr || !vmmR0IsValidSession(pVM, ((PINTNETIFGETBUFFERPTRSREQ)pReqHdr)->pSession, pSession) || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = IntNetR0IfGetBufferPtrsReq(pSession, (PINTNETIFGETBUFFERPTRSREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_INTNET_IF_SET_PROMISCUOUS_MODE: + if (u64Arg || !pReqHdr || !vmmR0IsValidSession(pVM, ((PINTNETIFSETPROMISCUOUSMODEREQ)pReqHdr)->pSession, pSession) || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = IntNetR0IfSetPromiscuousModeReq(pSession, (PINTNETIFSETPROMISCUOUSMODEREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_INTNET_IF_SET_MAC_ADDRESS: + if 
(u64Arg || !pReqHdr || !vmmR0IsValidSession(pVM, ((PINTNETIFSETMACADDRESSREQ)pReqHdr)->pSession, pSession) || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = IntNetR0IfSetMacAddressReq(pSession, (PINTNETIFSETMACADDRESSREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_INTNET_IF_SET_ACTIVE: + if (u64Arg || !pReqHdr || !vmmR0IsValidSession(pVM, ((PINTNETIFSETACTIVEREQ)pReqHdr)->pSession, pSession) || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = IntNetR0IfSetActiveReq(pSession, (PINTNETIFSETACTIVEREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_INTNET_IF_SEND: + if (u64Arg || !pReqHdr || !vmmR0IsValidSession(pVM, ((PINTNETIFSENDREQ)pReqHdr)->pSession, pSession) || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = IntNetR0IfSendReq(pSession, (PINTNETIFSENDREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_INTNET_IF_WAIT: + if (u64Arg || !pReqHdr || !vmmR0IsValidSession(pVM, ((PINTNETIFWAITREQ)pReqHdr)->pSession, pSession) || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = IntNetR0IfWaitReq(pSession, (PINTNETIFWAITREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_INTNET_IF_ABORT_WAIT: + if (u64Arg || !pReqHdr || !vmmR0IsValidSession(pVM, ((PINTNETIFWAITREQ)pReqHdr)->pSession, pSession) || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = IntNetR0IfAbortWaitReq(pSession, (PINTNETIFABORTWAITREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + +#ifdef VBOX_WITH_PCI_PASSTHROUGH + /* + * Requests to host PCI driver service. 
+ */ + case VMMR0_DO_PCIRAW_REQ: + if (u64Arg || !pReqHdr || !vmmR0IsValidSession(pVM, ((PPCIRAWSENDREQ)pReqHdr)->pSession, pSession) || idCpu != NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = PciRawR0ProcessReq(pGVM, pVM, pSession, (PPCIRAWSENDREQ)pReqHdr); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; +#endif + + /* + * NEM requests. + */ +#ifdef VBOX_WITH_NEM_R0 +# if defined(RT_ARCH_AMD64) && defined(RT_OS_WINDOWS) + case VMMR0_DO_NEM_INIT_VM: + if (u64Arg || pReqHdr || idCpu != 0) + return VERR_INVALID_PARAMETER; + rc = NEMR0InitVM(pGVM, pVM); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_NEM_INIT_VM_PART_2: + if (u64Arg || pReqHdr || idCpu != 0) + return VERR_INVALID_PARAMETER; + rc = NEMR0InitVMPart2(pGVM, pVM); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_NEM_MAP_PAGES: + if (u64Arg || pReqHdr || idCpu == NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = NEMR0MapPages(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_NEM_UNMAP_PAGES: + if (u64Arg || pReqHdr || idCpu == NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = NEMR0UnmapPages(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_NEM_EXPORT_STATE: + if (u64Arg || pReqHdr || idCpu == NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = NEMR0ExportState(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_NEM_IMPORT_STATE: + if (pReqHdr || idCpu == NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = NEMR0ImportState(pGVM, pVM, idCpu, u64Arg); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_NEM_QUERY_CPU_TICK: + if (u64Arg || pReqHdr || idCpu == NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = NEMR0QueryCpuTick(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_NEM_RESUME_CPU_TICK_ON_ALL: + if (pReqHdr || idCpu == NIL_VMCPUID) + return VERR_INVALID_PARAMETER; + rc = 
NEMR0ResumeCpuTickOnAll(pGVM, pVM, idCpu, u64Arg); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + + case VMMR0_DO_NEM_UPDATE_STATISTICS: + if (u64Arg || pReqHdr) + return VERR_INVALID_PARAMETER; + rc = NEMR0UpdateStatistics(pGVM, pVM, idCpu); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; + +# if 1 && defined(DEBUG_bird) + case VMMR0_DO_NEM_EXPERIMENT: + if (pReqHdr) + return VERR_INVALID_PARAMETER; + rc = NEMR0DoExperiment(pGVM, pVM, idCpu, u64Arg); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; +# endif +# endif +#endif + + /* + * For profiling. + */ + case VMMR0_DO_NOP: + case VMMR0_DO_SLOW_NOP: + return VINF_SUCCESS; + + /* + * For testing Ring-0 APIs invoked in this environment. + */ + case VMMR0_DO_TESTS: + /** @todo make new test */ + return VINF_SUCCESS; + + +#if HC_ARCH_BITS == 32 && defined(VBOX_WITH_64_BITS_GUESTS) + case VMMR0_DO_TEST_SWITCHER3264: + if (idCpu == NIL_VMCPUID) + return VERR_INVALID_CPU_ID; + rc = HMR0TestSwitcher3264(pVM); + VMM_CHECK_SMAP_CHECK2(pVM, RT_NOTHING); + break; +#endif + default: + /* + * We're returning VERR_NOT_SUPPORT here so we've got something else + * than -1 which the interrupt gate glue code might return. + */ + Log(("operation %#x is not supported\n", enmOperation)); + return VERR_NOT_SUPPORTED; + } + return rc; +} + + +/** + * Argument for vmmR0EntryExWrapper containing the arguments for VMMR0EntryEx. + */ +typedef struct VMMR0ENTRYEXARGS +{ + PGVM pGVM; + PVM pVM; + VMCPUID idCpu; + VMMR0OPERATION enmOperation; + PSUPVMMR0REQHDR pReq; + uint64_t u64Arg; + PSUPDRVSESSION pSession; +} VMMR0ENTRYEXARGS; +/** Pointer to a vmmR0EntryExWrapper argument package. */ +typedef VMMR0ENTRYEXARGS *PVMMR0ENTRYEXARGS; + +/** + * This is just a longjmp wrapper function for VMMR0EntryEx calls. + * + * @returns VBox status code. 
+ * @param pvArgs The argument package + */ +static DECLCALLBACK(int) vmmR0EntryExWrapper(void *pvArgs) +{ + return vmmR0EntryExWorker(((PVMMR0ENTRYEXARGS)pvArgs)->pGVM, + ((PVMMR0ENTRYEXARGS)pvArgs)->pVM, + ((PVMMR0ENTRYEXARGS)pvArgs)->idCpu, + ((PVMMR0ENTRYEXARGS)pvArgs)->enmOperation, + ((PVMMR0ENTRYEXARGS)pvArgs)->pReq, + ((PVMMR0ENTRYEXARGS)pvArgs)->u64Arg, + ((PVMMR0ENTRYEXARGS)pvArgs)->pSession); +} + + +/** + * The Ring 0 entry point, called by the support library (SUP). + * + * @returns VBox status code. + * @param pGVM The global (ring-0) VM structure. + * @param pVM The cross context VM structure. + * @param idCpu Virtual CPU ID argument. Must be NIL_VMCPUID if pVM + * is NIL_RTR0PTR, and may be NIL_VMCPUID if it isn't + * @param enmOperation Which operation to execute. + * @param pReq Pointer to the SUPVMMR0REQHDR packet. Optional. + * @param u64Arg Some simple constant argument. + * @param pSession The session of the caller. + * @remarks Assume called with interrupts _enabled_. + */ +VMMR0DECL(int) VMMR0EntryEx(PGVM pGVM, PVM pVM, VMCPUID idCpu, VMMR0OPERATION enmOperation, + PSUPVMMR0REQHDR pReq, uint64_t u64Arg, PSUPDRVSESSION pSession) +{ + /* + * Requests that should only happen on the EMT thread will be + * wrapped in a setjmp so we can assert without causing trouble. + */ + if ( pVM != NULL + && pGVM != NULL + && idCpu < pGVM->cCpus + && pVM->pVMR0 != NULL) + { + switch (enmOperation) + { + /* These might/will be called before VMMR3Init. */ + case VMMR0_DO_GMM_INITIAL_RESERVATION: + case VMMR0_DO_GMM_UPDATE_RESERVATION: + case VMMR0_DO_GMM_ALLOCATE_PAGES: + case VMMR0_DO_GMM_FREE_PAGES: + case VMMR0_DO_GMM_BALLOONED_PAGES: + /* On the mac we might not have a valid jmp buf, so check these as well. 
*/ + case VMMR0_DO_VMMR0_INIT: + case VMMR0_DO_VMMR0_TERM: + { + PGVMCPU pGVCpu = &pGVM->aCpus[idCpu]; + PVMCPU pVCpu = &pVM->aCpus[idCpu]; + RTNATIVETHREAD hNativeThread = RTThreadNativeSelf(); + if (RT_LIKELY( pGVCpu->hEMT == hNativeThread + && pVCpu->hNativeThreadR0 == hNativeThread)) + { + if (!pVCpu->vmm.s.CallRing3JmpBufR0.pvSavedStack) + break; + + /** @todo validate this EMT claim... GVM knows. */ + VMMR0ENTRYEXARGS Args; + Args.pGVM = pGVM; + Args.pVM = pVM; + Args.idCpu = idCpu; + Args.enmOperation = enmOperation; + Args.pReq = pReq; + Args.u64Arg = u64Arg; + Args.pSession = pSession; + return vmmR0CallRing3SetJmpEx(&pVCpu->vmm.s.CallRing3JmpBufR0, vmmR0EntryExWrapper, &Args); + } + return VERR_VM_THREAD_NOT_EMT; + } + + default: + break; + } + } + return vmmR0EntryExWorker(pGVM, pVM, idCpu, enmOperation, pReq, u64Arg, pSession); +} + + +/** + * Checks whether we've armed the ring-0 long jump machinery. + * + * @returns @c true / @c false + * @param pVCpu The cross context virtual CPU structure. + * @thread EMT + * @sa VMMIsLongJumpArmed + */ +VMMR0_INT_DECL(bool) VMMR0IsLongJumpArmed(PVMCPU pVCpu) +{ +#ifdef RT_ARCH_X86 + return pVCpu->vmm.s.CallRing3JmpBufR0.eip + && !pVCpu->vmm.s.CallRing3JmpBufR0.fInRing3Call; +#else + return pVCpu->vmm.s.CallRing3JmpBufR0.rip + && !pVCpu->vmm.s.CallRing3JmpBufR0.fInRing3Call; +#endif +} + + +/** + * Checks whether we've done a ring-3 long jump. + * + * @returns @c true / @c false + * @param pVCpu The cross context virtual CPU structure. + * @thread EMT + */ +VMMR0_INT_DECL(bool) VMMR0IsInRing3LongJump(PVMCPU pVCpu) +{ + return pVCpu->vmm.s.CallRing3JmpBufR0.fInRing3Call; +} + + +/** + * Internal R0 logger worker: Flush logger. + * + * @param pLogger The logger instance to flush. + * @remark This function must be exported! + */ +VMMR0DECL(void) vmmR0LoggerFlush(PRTLOGGER pLogger) +{ +#ifdef LOG_ENABLED + /* + * Convert the pLogger into a VM handle and 'call' back to Ring-3. + * (This is a bit paranoid code.) 
+ */ + PVMMR0LOGGER pR0Logger = (PVMMR0LOGGER)((uintptr_t)pLogger - RT_UOFFSETOF(VMMR0LOGGER, Logger)); + if ( !VALID_PTR(pR0Logger) + || !VALID_PTR(pR0Logger + 1) + || pLogger->u32Magic != RTLOGGER_MAGIC) + { +# ifdef DEBUG + SUPR0Printf("vmmR0LoggerFlush: pLogger=%p!\n", pLogger); +# endif + return; + } + if (pR0Logger->fFlushingDisabled) + return; /* quietly */ + + PVM pVM = pR0Logger->pVM; + if ( !VALID_PTR(pVM) + || pVM->pVMR0 != pVM) + { +# ifdef DEBUG + SUPR0Printf("vmmR0LoggerFlush: pVM=%p! pVMR0=%p! pLogger=%p\n", pVM, pVM->pVMR0, pLogger); +# endif + return; + } + + PVMCPU pVCpu = VMMGetCpu(pVM); + if (pVCpu) + { + /* + * Check that the jump buffer is armed. + */ +# ifdef RT_ARCH_X86 + if ( !pVCpu->vmm.s.CallRing3JmpBufR0.eip + || pVCpu->vmm.s.CallRing3JmpBufR0.fInRing3Call) +# else + if ( !pVCpu->vmm.s.CallRing3JmpBufR0.rip + || pVCpu->vmm.s.CallRing3JmpBufR0.fInRing3Call) +# endif + { +# ifdef DEBUG + SUPR0Printf("vmmR0LoggerFlush: Jump buffer isn't armed!\n"); +# endif + return; + } + VMMRZCallRing3(pVM, pVCpu, VMMCALLRING3_VMM_LOGGER_FLUSH, 0); + } +# ifdef DEBUG + else + SUPR0Printf("vmmR0LoggerFlush: invalid VCPU context!\n"); +# endif +#else + NOREF(pLogger); +#endif /* LOG_ENABLED */ +} + +#ifdef LOG_ENABLED + +/** + * Disables flushing of the ring-0 debug log. + * + * @param pVCpu The cross context virtual CPU structure. + */ +VMMR0_INT_DECL(void) VMMR0LogFlushDisable(PVMCPU pVCpu) +{ + if (pVCpu->vmm.s.pR0LoggerR0) + pVCpu->vmm.s.pR0LoggerR0->fFlushingDisabled = true; + if (pVCpu->vmm.s.pR0RelLoggerR0) + pVCpu->vmm.s.pR0RelLoggerR0->fFlushingDisabled = true; +} + + +/** + * Enables flushing of the ring-0 debug log. + * + * @param pVCpu The cross context virtual CPU structure. 
+ */ +VMMR0_INT_DECL(void) VMMR0LogFlushEnable(PVMCPU pVCpu) +{ + if (pVCpu->vmm.s.pR0LoggerR0) + pVCpu->vmm.s.pR0LoggerR0->fFlushingDisabled = false; + if (pVCpu->vmm.s.pR0RelLoggerR0) + pVCpu->vmm.s.pR0RelLoggerR0->fFlushingDisabled = false; +} + + +/** + * Checks if log flushing is disabled or not. + * + * @param pVCpu The cross context virtual CPU structure. + */ +VMMR0_INT_DECL(bool) VMMR0IsLogFlushDisabled(PVMCPU pVCpu) +{ + if (pVCpu->vmm.s.pR0LoggerR0) + return pVCpu->vmm.s.pR0LoggerR0->fFlushingDisabled; + if (pVCpu->vmm.s.pR0RelLoggerR0) + return pVCpu->vmm.s.pR0RelLoggerR0->fFlushingDisabled; + return true; +} + +#endif /* LOG_ENABLED */ + +/** + * Override RTLogRelGetDefaultInstanceEx so we can do LogRel to VBox.log from EMTs in ring-0. + */ +DECLEXPORT(PRTLOGGER) RTLogRelGetDefaultInstanceEx(uint32_t fFlagsAndGroup) +{ + PGVMCPU pGVCpu = GVMMR0GetGVCpuByEMT(NIL_RTNATIVETHREAD); + if (pGVCpu) + { + PVMCPU pVCpu = pGVCpu->pVCpu; + if (RT_VALID_PTR(pVCpu)) + { + PVMMR0LOGGER pVmmLogger = pVCpu->vmm.s.pR0RelLoggerR0; + if (RT_VALID_PTR(pVmmLogger)) + { + if ( pVmmLogger->fCreated + && pVmmLogger->pVM == pGVCpu->pVM) + { + if (pVmmLogger->Logger.fFlags & RTLOGFLAGS_DISABLED) + return NULL; + uint16_t const fFlags = RT_LO_U16(fFlagsAndGroup); + uint16_t const iGroup = RT_HI_U16(fFlagsAndGroup); + if ( iGroup != UINT16_MAX + && ( ( pVmmLogger->Logger.afGroups[iGroup < pVmmLogger->Logger.cGroups ? iGroup : 0] + & (fFlags | (uint32_t)RTLOGGRPFLAGS_ENABLED)) + != (fFlags | (uint32_t)RTLOGGRPFLAGS_ENABLED))) + return NULL; + return &pVmmLogger->Logger; + } + } + } + } + return SUPR0GetDefaultLogRelInstanceEx(fFlagsAndGroup); +} + + +/** + * Jump back to ring-3 if we're the EMT and the longjmp is armed. + * + * @returns true if the breakpoint should be hit, false if it should be ignored. 
+ */ +DECLEXPORT(bool) RTCALL RTAssertShouldPanic(void) +{ +#if 0 + return true; +#else + PVM pVM = GVMMR0GetVMByEMT(NIL_RTNATIVETHREAD); + if (pVM) + { + PVMCPU pVCpu = VMMGetCpu(pVM); + + if (pVCpu) + { +#ifdef RT_ARCH_X86 + if ( pVCpu->vmm.s.CallRing3JmpBufR0.eip + && !pVCpu->vmm.s.CallRing3JmpBufR0.fInRing3Call) +#else + if ( pVCpu->vmm.s.CallRing3JmpBufR0.rip + && !pVCpu->vmm.s.CallRing3JmpBufR0.fInRing3Call) +#endif + { + int rc = VMMRZCallRing3(pVM, pVCpu, VMMCALLRING3_VM_R0_ASSERTION, 0); + return RT_FAILURE_NP(rc); + } + } + } +#ifdef RT_OS_LINUX + return true; +#else + return false; +#endif +#endif +} + + +/** + * Override this so we can push it up to ring-3. + * + * @param pszExpr Expression. Can be NULL. + * @param uLine Location line number. + * @param pszFile Location file name. + * @param pszFunction Location function name. + */ +DECLEXPORT(void) RTCALL RTAssertMsg1Weak(const char *pszExpr, unsigned uLine, const char *pszFile, const char *pszFunction) +{ + /* + * To the log. + */ + LogAlways(("\n!!R0-Assertion Failed!!\n" + "Expression: %s\n" + "Location : %s(%d) %s\n", + pszExpr, pszFile, uLine, pszFunction)); + + /* + * To the global VMM buffer. + */ + PVM pVM = GVMMR0GetVMByEMT(NIL_RTNATIVETHREAD); + if (pVM) + RTStrPrintf(pVM->vmm.s.szRing0AssertMsg1, sizeof(pVM->vmm.s.szRing0AssertMsg1), + "\n!!R0-Assertion Failed!!\n" + "Expression: %.*s\n" + "Location : %s(%d) %s\n", + sizeof(pVM->vmm.s.szRing0AssertMsg1) / 4 * 3, pszExpr, + pszFile, uLine, pszFunction); + + /* + * Continue the normal way. + */ + RTAssertMsg1(pszExpr, uLine, pszFile, pszFunction); +} + + +/** + * Callback for RTLogFormatV which writes to the ring-3 log port. + * See PFNLOGOUTPUT() for details. 
+ */ +static DECLCALLBACK(size_t) rtLogOutput(void *pv, const char *pachChars, size_t cbChars) +{ + for (size_t i = 0; i < cbChars; i++) + { + LogAlways(("%c", pachChars[i])); NOREF(pachChars); + } + + NOREF(pv); + return cbChars; +} + + +/** + * Override this so we can push it up to ring-3. + * + * @param pszFormat The format string. + * @param va Arguments. + */ +DECLEXPORT(void) RTCALL RTAssertMsg2WeakV(const char *pszFormat, va_list va) +{ + va_list vaCopy; + + /* + * Push the message to the loggers. + */ + PRTLOGGER pLog = RTLogGetDefaultInstance(); /* Don't initialize it here... */ + if (pLog) + { + va_copy(vaCopy, va); + RTLogFormatV(rtLogOutput, pLog, pszFormat, vaCopy); + va_end(vaCopy); + } + pLog = RTLogRelGetDefaultInstance(); + if (pLog) + { + va_copy(vaCopy, va); + RTLogFormatV(rtLogOutput, pLog, pszFormat, vaCopy); + va_end(vaCopy); + } + + /* + * Push it to the global VMM buffer. + */ + PVM pVM = GVMMR0GetVMByEMT(NIL_RTNATIVETHREAD); + if (pVM) + { + va_copy(vaCopy, va); + RTStrPrintfV(pVM->vmm.s.szRing0AssertMsg2, sizeof(pVM->vmm.s.szRing0AssertMsg2), pszFormat, vaCopy); + va_end(vaCopy); + } + + /* + * Continue the normal way. + */ + RTAssertMsg2V(pszFormat, va); +} + diff --git a/src/VBox/VMM/VMMR0/VMMR0.def b/src/VBox/VMM/VMMR0/VMMR0.def new file mode 100644 index 00000000..0735d86d --- /dev/null +++ b/src/VBox/VMM/VMMR0/VMMR0.def @@ -0,0 +1,117 @@ +; $Id: VMMR0.def $ +;; @file +; VMM Ring 0 DLL - Definition file. + +; +; Copyright (C) 2006-2019 Oracle Corporation +; +; This file is part of VirtualBox Open Source Edition (OSE), as +; available from http://www.virtualbox.org. This file is free software; +; you can redistribute it and/or modify it under the terms of the GNU +; General Public License (GPL) as published by the Free Software +; Foundation, in version 2 as it comes in the "COPYING" file of the +; VirtualBox OSE distribution. VirtualBox OSE is distributed in the +; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. 
+; + +LIBRARY VMMR0.r0 +EXPORTS + ; data + + ; code + PDMCritSectEnter + PDMCritSectEnterDebug + PDMCritSectIsOwner + PDMCritSectLeave + PDMHCCritSectScheduleExitEvent + PDMCritSectTryEnter + PDMCritSectTryEnterDebug + PDMQueueAlloc + PDMQueueInsert + PGMHandlerPhysicalPageTempOff + PGMShwMakePageWritable + PGMPhysSimpleWriteGCPhys + PGMPhysSimpleReadGCPtr + PGMPhysSimpleWriteGCPtr + PGMPhysReadGCPtr + PGMPhysWriteGCPtr + PGMPhysSimpleDirtyWriteGCPtr + IOMMMIOResetRegion + IOMMMIOMapMMIO2Page + RTLogDefaultInstance + RTLogDefaultInstanceEx + RTLogRelGetDefaultInstance + RTLogRelGetDefaultInstanceEx + RTLogLogger + RTLogLoggerEx + RTLogLoggerExV + RTTimeMilliTS + RTTraceBufAddMsgF + RTTraceBufAddPos + RTTraceBufAddPosMsgF + TMTimerFromMilli + TMTimerFromMicro + TMTimerFromNano + TMTimerGet + TMTimerGetFreq + TMTimerIsActive + TMTimerIsLockOwner + TMTimerLock + TMTimerSet + TMTimerSetRelative + TMTimerSetMillies + TMTimerSetMicro + TMTimerSetNano + TMTimerSetFrequencyHint + TMTimerStop + TMTimerUnlock + VMMGetSvnRev + vmmR0LoggerFlush + vmmR0LoggerWrapper + VMSetError + VMSetErrorV + + ; Internal Networking + IntNetR0Open + IntNetR0IfClose + IntNetR0IfGetBufferPtrs + IntNetR0IfSetPromiscuousMode + IntNetR0IfSetMacAddress + IntNetR0IfSetActive + IntNetR0IfSend + IntNetR0IfWait + + ; Network Shaper + PDMNsAllocateBandwidth + + ; runtime + RTAssertMsg1Weak + RTAssertMsg2Weak + RTAssertShouldPanic + RTCrc32 + RTOnceSlow + RTTimeNanoTSLegacySyncInvarNoDelta + RTTimeNanoTSLegacySyncInvarWithDelta + RTTimeNanoTSLegacyAsync + RTTimeNanoTSLFenceSyncInvarNoDelta + RTTimeNanoTSLFenceSyncInvarWithDelta + RTTimeNanoTSLFenceAsync + RTTimeSystemNanoTS + RTTimeNanoTS + ASMMultU64ByU32DivByU32 ; not-os2 + ASMAtomicXchgU8 ; not-x86 + ASMAtomicXchgU16 ; not-x86 + ASMBitFirstSet ; not-x86 + ASMNopPause ; not-x86 + nocrt_memchr + nocrt_memcmp + nocrt_memcpy + memcpy=nocrt_memcpy ; not-os2 + nocrt_memmove + nocrt_memset + memset=nocrt_memset ; not-os2 + nocrt_strcpy + nocrt_strcmp + 
nocrt_strchr
+    nocrt_strlen
+
diff --git a/src/VBox/VMM/VMMR0/VMMR0JmpA-amd64.asm b/src/VBox/VMM/VMMR0/VMMR0JmpA-amd64.asm
new file mode 100644
index 00000000..8735dfa6
--- /dev/null
+++ b/src/VBox/VMM/VMMR0/VMMR0JmpA-amd64.asm
@@ -0,0 +1,491 @@
+; $Id: VMMR0JmpA-amd64.asm $
+;; @file
+; VMM - R0 SetJmp / LongJmp routines for AMD64.
+;
+
+;
+; Copyright (C) 2006-2019 Oracle Corporation
+;
+; This file is part of VirtualBox Open Source Edition (OSE), as
+; available from http://www.virtualbox.org. This file is free software;
+; you can redistribute it and/or modify it under the terms of the GNU
+; General Public License (GPL) as published by the Free Software
+; Foundation, in version 2 as it comes in the "COPYING" file of the
+; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+;
+
+;*******************************************************************************
+;*  Header Files                                                               *
+;*******************************************************************************
+%define RT_ASM_WITH_SEH64
+%include "VBox/asmdefs.mac"
+%include "VMMInternal.mac"
+%include "VBox/err.mac"
+%include "VBox/param.mac"
+
+
+;*******************************************************************************
+;*  Defined Constants And Macros                                               *
+;*******************************************************************************
+; Magic value pushed by vmmR0CallRing3LongJmp (VBOX_STRICT only) and verified
+; when the saved stack is resumed.
+%define RESUME_MAGIC    07eadf00dh
+%define STACK_PADDING   0eeeeeeeeeeeeeeeeh
+
+;; Workaround for linux 4.6 fast/slow syscall stack depth difference.
+; STACK_FUZZ_SIZE is the tolerated plus/minus difference (in bytes) between
+; the stack pointer at resume time and the one recorded at long jump time.
+%ifdef VMM_R0_SWITCH_STACK
+ %define STACK_FUZZ_SIZE 0
+%else
+ %define STACK_FUZZ_SIZE 128
+%endif
+
+
+BEGINCODE
+
+
+;;
+; The setjmp variant used for calling Ring-3.
+;
+; This differs from the normal setjmp in that it will resume VMMRZCallRing3 if we're
+; in the middle of a ring-3 call. Another difference is the function pointer and
+; its arguments. This has to do with resuming code and the stack frame of the caller.
+;
+; vmmR0CallRing3SetJmp2 and vmmR0CallRing3SetJmpEx are additional global names
+; for this same entry point.
+;
+; @returns  VINF_SUCCESS on success or whatever is passed to vmmR0CallRing3LongJmp.
+;           VERR_VMM_SET_JMP_ABORTED_RESUME if the resume sanity checks fail.
+; @param    pJmpBuf msc:rcx gcc:rdi x86:[esp+0x04]     Our jmp_buf.
+; @param    pfn     msc:rdx gcc:rsi x86:[esp+0x08]     The function to be called when not resuming.
+; @param    pvUser1 msc:r8  gcc:rdx x86:[esp+0x0c]     The first argument of that function.
+; @param    pvUser2 msc:r9  gcc:rcx x86:[esp+0x10]     The second argument of that function.
+;
+BEGINPROC vmmR0CallRing3SetJmp
+GLOBALNAME vmmR0CallRing3SetJmp2
+GLOBALNAME vmmR0CallRing3SetJmpEx
+        ;
+        ; Save the registers.
+        ;
+        push    rbp
+        SEH64_PUSH_xBP
+        mov     rbp, rsp
+        SEH64_SET_FRAME_xBP 0
+ %ifdef ASM_CALL64_MSC
+        sub     rsp, 30h + STACK_FUZZ_SIZE ; (10h is used by resume (??), 20h for callee spill area)
+        SEH64_ALLOCATE_STACK 30h + STACK_FUZZ_SIZE
+SEH64_END_PROLOGUE
+        mov     r11, rdx                ; pfn
+        mov     rdx, rcx                ; pJmpBuf;
+ %else
+        sub     rsp, 10h + STACK_FUZZ_SIZE ; (10h is used by resume (??))
+        SEH64_ALLOCATE_STACK 10h + STACK_FUZZ_SIZE
+SEH64_END_PROLOGUE
+        mov     r8, rdx                 ; pvUser1 (save it like MSC)
+        mov     r9, rcx                 ; pvUser2 (save it like MSC)
+        mov     r11, rsi                ; pfn
+        mov     rdx, rdi                ; pJmpBuf
+ %endif
+        ; From here on both conventions agree: rdx=pJmpBuf, r11=pfn, r8=pvUser1, r9=pvUser2.
+        mov     [xDX + VMMR0JMPBUF.rbx], rbx
+ %ifdef ASM_CALL64_MSC
+        mov     [xDX + VMMR0JMPBUF.rsi], rsi
+        mov     [xDX + VMMR0JMPBUF.rdi], rdi
+ %endif
+        mov     [xDX + VMMR0JMPBUF.rbp], rbp
+        mov     [xDX + VMMR0JMPBUF.r12], r12
+        mov     [xDX + VMMR0JMPBUF.r13], r13
+        mov     [xDX + VMMR0JMPBUF.r14], r14
+        mov     [xDX + VMMR0JMPBUF.r15], r15
+        mov     xAX, [rbp + 8]          ; (not really necessary, except for validity check)
+        mov     [xDX + VMMR0JMPBUF.rip], xAX
+ %ifdef ASM_CALL64_MSC
+        lea     r10, [rsp + 20h]        ; must save the spill area
+ %else
+        lea     r10, [rsp]
+ %endif
+        mov     [xDX + VMMR0JMPBUF.rsp], r10
+ %ifdef RT_OS_WINDOWS
+        movdqa  [xDX + VMMR0JMPBUF.xmm6], xmm6
+        movdqa  [xDX + VMMR0JMPBUF.xmm7], xmm7
+        movdqa  [xDX + VMMR0JMPBUF.xmm8], xmm8
+        movdqa  [xDX + VMMR0JMPBUF.xmm9], xmm9
+        movdqa  [xDX + VMMR0JMPBUF.xmm10], xmm10
+        movdqa  [xDX + VMMR0JMPBUF.xmm11], xmm11
+        movdqa  [xDX + VMMR0JMPBUF.xmm12], xmm12
+        movdqa  [xDX + VMMR0JMPBUF.xmm13], xmm13
+        movdqa  [xDX + VMMR0JMPBUF.xmm14], xmm14
+        movdqa  [xDX + VMMR0JMPBUF.xmm15], xmm15
+ %endif
+        pushf
+        pop     xAX
+        mov     [xDX + VMMR0JMPBUF.rflags], xAX
+
+        ;
+        ; If we're not in a ring-3 call, call pfn and return.
+        ;
+        test    byte [xDX + VMMR0JMPBUF.fInRing3Call], 1
+        jnz     .resume
+
+ %ifdef VMM_R0_SWITCH_STACK
+        mov     r15, [xDX + VMMR0JMPBUF.pvSavedStack]
+        test    r15, r15
+        jz      .entry_error
+  %ifdef VBOX_STRICT
+        cmp     dword [r15], 0h
+        jne     .entry_error
+        mov     rdi, r15
+        mov     rcx, VMM_STACK_SIZE / 8
+        mov     rax, qword 0eeeeeeeffeeeeeeeh
+        ; NOTE(review): the fill value differs from STACK_PADDING and the prefix
+        ; is repne rather than rep - presumably intentional/harmless, confirm.
+        repne stosq
+        mov     [rdi - 10h], rbx
+  %endif
+        lea     r15, [r15 + VMM_STACK_SIZE - 40h]
+        mov     rsp, r15                ; Switch stack!
+ %endif ; VMM_R0_SWITCH_STACK
+
+        mov     r12, rdx                ; Save pJmpBuf.
+ %ifdef ASM_CALL64_MSC
+        mov     rcx, r8                 ; pvUser -> arg0
+        mov     rdx, r9                 ; pvUser2 -> arg1
+ %else
+        mov     rdi, r8                 ; pvUser -> arg0
+        mov     rsi, r9                 ; pvUser2 -> arg1
+ %endif
+        call    r11
+        mov     rdx, r12                ; Restore pJmpBuf
+
+ %ifdef VMM_R0_SWITCH_STACK
+  %ifdef VBOX_STRICT
+        mov     r15, [xDX + VMMR0JMPBUF.pvSavedStack]
+        mov     dword [r15], 0h         ; Reset the marker
+  %endif
+ %endif
+
+        ;
+        ; Return like in the long jump but clear eip, no shortcuts here.
+        ; eax holds the status to return (pfn's return value or an error code).
+        ;
+.proper_return:
+%ifdef RT_OS_WINDOWS
+        movdqa  xmm6,  [xDX + VMMR0JMPBUF.xmm6 ]
+        movdqa  xmm7,  [xDX + VMMR0JMPBUF.xmm7 ]
+        movdqa  xmm8,  [xDX + VMMR0JMPBUF.xmm8 ]
+        movdqa  xmm9,  [xDX + VMMR0JMPBUF.xmm9 ]
+        movdqa  xmm10, [xDX + VMMR0JMPBUF.xmm10]
+        movdqa  xmm11, [xDX + VMMR0JMPBUF.xmm11]
+        movdqa  xmm12, [xDX + VMMR0JMPBUF.xmm12]
+        movdqa  xmm13, [xDX + VMMR0JMPBUF.xmm13]
+        movdqa  xmm14, [xDX + VMMR0JMPBUF.xmm14]
+        movdqa  xmm15, [xDX + VMMR0JMPBUF.xmm15]
+%endif
+        mov     rbx, [xDX + VMMR0JMPBUF.rbx]
+%ifdef ASM_CALL64_MSC
+        mov     rsi, [xDX + VMMR0JMPBUF.rsi]
+        mov     rdi, [xDX + VMMR0JMPBUF.rdi]
+%endif
+        mov     r12, [xDX + VMMR0JMPBUF.r12]
+        mov     r13, [xDX + VMMR0JMPBUF.r13]
+        mov     r14, [xDX + VMMR0JMPBUF.r14]
+        mov     r15, [xDX + VMMR0JMPBUF.r15]
+        mov     rbp, [xDX + VMMR0JMPBUF.rbp]
+        and     qword [xDX + VMMR0JMPBUF.rip], byte 0 ; used for valid check.
+        mov     rsp, [xDX + VMMR0JMPBUF.rsp]
+        push    qword [xDX + VMMR0JMPBUF.rflags]
+        popf
+        leave
+        ret
+
+.entry_error:
+        mov     eax, VERR_VMM_SET_JMP_ERROR
+        jmp     .proper_return
+
+        ; NOTE(review): no jump to .stack_overflow is visible in this procedure.
+.stack_overflow:
+        mov     eax, VERR_VMM_SET_JMP_STACK_OVERFLOW
+        jmp     .proper_return
+
+        ;
+        ; Aborting resume.
+        ; Note! No need to restore XMM registers here since we haven't touched them yet.
+        ;
+.bad:
+        and     qword [xDX + VMMR0JMPBUF.rip], byte 0 ; used for valid check.
+        mov     rbx, [xDX + VMMR0JMPBUF.rbx]
+ %ifdef ASM_CALL64_MSC
+        mov     rsi, [xDX + VMMR0JMPBUF.rsi]
+        mov     rdi, [xDX + VMMR0JMPBUF.rdi]
+ %endif
+        mov     r12, [xDX + VMMR0JMPBUF.r12]
+        mov     r13, [xDX + VMMR0JMPBUF.r13]
+        mov     r14, [xDX + VMMR0JMPBUF.r14]
+        mov     r15, [xDX + VMMR0JMPBUF.r15]
+        mov     eax, VERR_VMM_SET_JMP_ABORTED_RESUME
+        leave
+        ret
+
+        ;
+        ; Resume the VMMRZCallRing3 call.
+        ;
+.resume:
+ %ifndef VMM_R0_SWITCH_STACK
+        ; Sanity checks incoming stack, applying fuzz if needed.
+        ; (r10 still holds the stack pointer value stored in VMMR0JMPBUF.rsp above.)
+        sub     r10, [xDX + VMMR0JMPBUF.SpCheck]
+        jz      .resume_stack_checked_out
+        add     r10, STACK_FUZZ_SIZE    ; plus/minus STACK_FUZZ_SIZE is fine.
+        cmp     r10, STACK_FUZZ_SIZE * 2
+        ja      .bad
+
+        mov     r10, [xDX + VMMR0JMPBUF.SpCheck]
+        mov     [xDX + VMMR0JMPBUF.rsp], r10 ; Must be updated in case of another long jump (used for save calc).
+
+.resume_stack_checked_out:
+        ; The saved size must be within bounds, 8-byte aligned and equal to SpCheck - SpResume.
+        mov     ecx, [xDX + VMMR0JMPBUF.cbSavedStack]
+        cmp     rcx, VMM_STACK_SIZE
+        ja      .bad
+        test    rcx, 7
+        jnz     .bad
+        mov     rdi, [xDX + VMMR0JMPBUF.SpCheck]
+        sub     rdi, [xDX + VMMR0JMPBUF.SpResume]
+        cmp     rcx, rdi
+        jne     .bad
+ %endif
+
+%ifdef VMM_R0_SWITCH_STACK
+        ; Switch stack.
+        mov     rsp, [xDX + VMMR0JMPBUF.SpResume]
+%else
+        ; Restore the stack.
+        mov     ecx, [xDX + VMMR0JMPBUF.cbSavedStack]
+        shr     ecx, 3
+        mov     rsi, [xDX + VMMR0JMPBUF.pvSavedStack]
+        mov     rdi, [xDX + VMMR0JMPBUF.SpResume]
+        mov     rsp, rdi
+        rep movsq
+%endif ; !VMM_R0_SWITCH_STACK
+        mov     byte [xDX + VMMR0JMPBUF.fInRing3Call], 0
+
+        ;
+        ; Continue where we left off.
+        ; The pops below mirror, in reverse order, the pushes done by
+        ; vmmR0CallRing3LongJmp; the final ret therefore returns to its caller.
+        ;
+%ifdef VBOX_STRICT
+        pop     rax                     ; magic
+        cmp     rax, RESUME_MAGIC
+        je      .magic_ok
+        ; Bad magic: store to address 0123h (presumably a deliberate fault - confirm).
+        mov     ecx, 0123h
+        mov     [ecx], edx
+.magic_ok:
+%endif
+%ifdef RT_OS_WINDOWS
+        movdqa  xmm6,  [rsp + 000h]
+        movdqa  xmm7,  [rsp + 010h]
+        movdqa  xmm8,  [rsp + 020h]
+        movdqa  xmm9,  [rsp + 030h]
+        movdqa  xmm10, [rsp + 040h]
+        movdqa  xmm11, [rsp + 050h]
+        movdqa  xmm12, [rsp + 060h]
+        movdqa  xmm13, [rsp + 070h]
+        movdqa  xmm14, [rsp + 080h]
+        movdqa  xmm15, [rsp + 090h]
+        add     rsp, 0a0h
+%endif
+        popf
+        pop     rbx
+%ifdef ASM_CALL64_MSC
+        pop     rsi
+        pop     rdi
+%endif
+        pop     r12
+        pop     r13
+        pop     r14
+        pop     r15
+        pop     rbp
+        xor     eax, eax                ; VINF_SUCCESS
+        ret
+ENDPROC vmmR0CallRing3SetJmp
+
+
+;;
+; Worker for VMMRZCallRing3.
+; This will save the stack and registers.
+;
+; On success control continues at the return point of the armed
+; vmmR0CallRing3SetJmp call with eax=rc. This call itself only returns later,
+; with VINF_SUCCESS, when vmmR0CallRing3SetJmp takes its .resume path.
+;
+; @returns  VERR_VMM_LONG_JMP_ERROR if the jump buffer isn't armed or a sanity
+;           check fails.
+; @param    pJmpBuf msc:rcx gcc:rdi x86:[ebp+8]     Pointer to the jump buffer.
+; @param    rc      msc:rdx gcc:rsi x86:[ebp+c]     The return code.
+;
+BEGINPROC vmmR0CallRing3LongJmp
+        ;
+        ; Save the registers on the stack.
+        ;
+        push    rbp
+        SEH64_PUSH_xBP
+        mov     rbp, rsp
+        SEH64_SET_FRAME_xBP 0
+        push    r15
+        SEH64_PUSH_GREG r15
+        push    r14
+        SEH64_PUSH_GREG r14
+        push    r13
+        SEH64_PUSH_GREG r13
+        push    r12
+        SEH64_PUSH_GREG r12
+%ifdef ASM_CALL64_MSC
+        push    rdi
+        SEH64_PUSH_GREG rdi
+        push    rsi
+        SEH64_PUSH_GREG rsi
+%endif
+        push    rbx
+        SEH64_PUSH_GREG rbx
+        pushf
+        SEH64_ALLOCATE_STACK 8
+%ifdef RT_OS_WINDOWS
+        sub     rsp, 0a0h
+        SEH64_ALLOCATE_STACK 0a0h
+        movdqa  [rsp + 000h], xmm6
+        movdqa  [rsp + 010h], xmm7
+        movdqa  [rsp + 020h], xmm8
+        movdqa  [rsp + 030h], xmm9
+        movdqa  [rsp + 040h], xmm10
+        movdqa  [rsp + 050h], xmm11
+        movdqa  [rsp + 060h], xmm12
+        movdqa  [rsp + 070h], xmm13
+        movdqa  [rsp + 080h], xmm14
+        movdqa  [rsp + 090h], xmm15
+%endif
+%ifdef VBOX_STRICT
+        push    RESUME_MAGIC
+        SEH64_ALLOCATE_STACK 8
+%endif
+SEH64_END_PROLOGUE
+
+        ;
+        ; Normalize the parameters.
+        ;
+%ifdef ASM_CALL64_MSC
+        mov     eax, edx                ; rc
+        mov     rdx, rcx                ; pJmpBuf
+%else
+        mov     rdx, rdi                ; pJmpBuf
+        mov     eax, esi                ; rc
+%endif
+
+        ;
+        ; Is the jump buffer armed?
+        ;
+        cmp     qword [xDX + VMMR0JMPBUF.rip], byte 0
+        je      .nok
+
+        ;
+        ; Sanity checks.
+        ;
+        mov     rdi, [xDX + VMMR0JMPBUF.pvSavedStack]
+        test    rdi, rdi                ; darwin may set this to 0.
+        jz      .nok
+        mov     [xDX + VMMR0JMPBUF.SpResume], rsp
+ %ifndef VMM_R0_SWITCH_STACK
+        ; rcx = number of stack bytes between here and the setjmp-time stack pointer.
+        mov     rsi, rsp
+        mov     rcx, [xDX + VMMR0JMPBUF.rsp]
+        sub     rcx, rsi
+
+        ; two sanity checks on the size.
+        cmp     rcx, VMM_STACK_SIZE     ; check max size.
+        jnbe    .nok
+
+        ;
+        ; Copy the stack
+        ;
+        test    ecx, 7                  ; check alignment
+        jnz     .nok
+        mov     [xDX + VMMR0JMPBUF.cbSavedStack], ecx
+        shr     ecx, 3
+        rep movsq
+
+ %endif ; !VMM_R0_SWITCH_STACK
+
+        ; Save a PC and return PC here to assist unwinding.
+.unwind_point:
+        lea     rcx, [.unwind_point wrt RIP]
+        mov     [xDX + VMMR0JMPBUF.SavedEipForUnwind], rcx
+        mov     rcx, [xDX + VMMR0JMPBUF.rbp]
+        lea     rcx, [rcx + 8]
+        mov     [xDX + VMMR0JMPBUF.UnwindRetPcLocation], rcx
+        mov     rcx, [rcx]
+        mov     [xDX + VMMR0JMPBUF.UnwindRetPcValue], rcx
+
+        ; Save RSP & RBP to enable stack dumps
+        mov     rcx, rbp
+        mov     [xDX + VMMR0JMPBUF.SavedEbp], rcx
+        sub     rcx, 8
+        mov     [xDX + VMMR0JMPBUF.SavedEsp], rcx
+
+        ; store the last pieces of info.
+        mov     rcx, [xDX + VMMR0JMPBUF.rsp]
+        mov     [xDX + VMMR0JMPBUF.SpCheck], rcx
+        mov     byte [xDX + VMMR0JMPBUF.fInRing3Call], 1
+
+        ;
+        ; Do the long jump.
+        ;
+%ifdef RT_OS_WINDOWS
+        movdqa  xmm6,  [xDX + VMMR0JMPBUF.xmm6 ]
+        movdqa  xmm7,  [xDX + VMMR0JMPBUF.xmm7 ]
+        movdqa  xmm8,  [xDX + VMMR0JMPBUF.xmm8 ]
+        movdqa  xmm9,  [xDX + VMMR0JMPBUF.xmm9 ]
+        movdqa  xmm10, [xDX + VMMR0JMPBUF.xmm10]
+        movdqa  xmm11, [xDX + VMMR0JMPBUF.xmm11]
+        movdqa  xmm12, [xDX + VMMR0JMPBUF.xmm12]
+        movdqa  xmm13, [xDX + VMMR0JMPBUF.xmm13]
+        movdqa  xmm14, [xDX + VMMR0JMPBUF.xmm14]
+        movdqa  xmm15, [xDX + VMMR0JMPBUF.xmm15]
+%endif
+        mov     rbx, [xDX + VMMR0JMPBUF.rbx]
+%ifdef ASM_CALL64_MSC
+        mov     rsi, [xDX + VMMR0JMPBUF.rsi]
+        mov     rdi, [xDX + VMMR0JMPBUF.rdi]
+%endif
+        mov     r12, [xDX + VMMR0JMPBUF.r12]
+        mov     r13, [xDX + VMMR0JMPBUF.r13]
+        mov     r14, [xDX + VMMR0JMPBUF.r14]
+        mov     r15, [xDX + VMMR0JMPBUF.r15]
+        mov     rbp, [xDX + VMMR0JMPBUF.rbp]
+        mov     rsp, [xDX + VMMR0JMPBUF.rsp]
+        push    qword [xDX + VMMR0JMPBUF.rflags]
+        popf
+        leave
+        ret
+
+        ;
+        ; Failure
+        ;
+.nok:
+%ifdef VBOX_STRICT
+        pop     rax                     ; magic
+        cmp     rax, RESUME_MAGIC
+        je      .magic_ok
+        ; Bad magic: store to address 0123h (presumably a deliberate fault - confirm).
+        mov     ecx, 0123h
+        mov     [rcx], edx
+.magic_ok:
+%endif
+        mov     eax, VERR_VMM_LONG_JMP_ERROR
+%ifdef RT_OS_WINDOWS
+        add     rsp, 0a0h               ; skip XMM registers since they are unmodified.
+%endif
+        popf
+        pop     rbx
+%ifdef ASM_CALL64_MSC
+        pop     rsi
+        pop     rdi
+%endif
+        pop     r12
+        pop     r13
+        pop     r14
+        pop     r15
+        leave
+        ret
+ENDPROC vmmR0CallRing3LongJmp
+
+
+;;
+; Internal R0 logger worker: Logger wrapper.
+;
+; In this AMD64 file the body is only a stub: three int3 instructions
+; followed by ret.
+;
+; @cproto VMMR0DECL(void) vmmR0LoggerWrapper(const char *pszFormat, ...)
+;
+BEGINPROC_EXPORTED vmmR0LoggerWrapper
+SEH64_END_PROLOGUE
+        int3
+        int3
+        int3
+        ret
+ENDPROC vmmR0LoggerWrapper
+
diff --git a/src/VBox/VMM/VMMR0/VMMR0JmpA-x86.asm b/src/VBox/VMM/VMMR0/VMMR0JmpA-x86.asm
new file mode 100644
index 00000000..3a89a244
--- /dev/null
+++ b/src/VBox/VMM/VMMR0/VMMR0JmpA-x86.asm
@@ -0,0 +1,401 @@
+; $Id: VMMR0JmpA-x86.asm $
+;; @file
+; VMM - R0 SetJmp / LongJmp routines for X86.
+;
+
+;
+; Copyright (C) 2006-2019 Oracle Corporation
+;
+; This file is part of VirtualBox Open Source Edition (OSE), as
+; available from http://www.virtualbox.org. This file is free software;
+; you can redistribute it and/or modify it under the terms of the GNU
+; General Public License (GPL) as published by the Free Software
+; Foundation, in version 2 as it comes in the "COPYING" file of the
+; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+;
+
+;*******************************************************************************
+;*  Header Files                                                               *
+;*******************************************************************************
+%include "VBox/asmdefs.mac"
+%include "VMMInternal.mac"
+%include "VBox/err.mac"
+%include "VBox/param.mac"
+
+
+;*******************************************************************************
+;*  Defined Constants And Macros                                               *
+;*******************************************************************************
+; Magic value pushed by vmmR0CallRing3LongJmp (VBOX_STRICT only) and verified
+; when the saved stack is resumed.
+%define RESUME_MAGIC    07eadf00dh
+; Fill pattern used for the switched-to stack in VBOX_STRICT builds.
+%define STACK_PADDING   0eeeeeeeeh
+
+
+; For vmmR0LoggerWrapper. (The other architecture(s) use(s) C99 variadic macros.)
+extern NAME(RTLogLogger)
+
+
+BEGINCODE
+
+
+;;
+; The setjmp variant used for calling Ring-3.
+;
+; This differs from the normal setjmp in that it will resume VMMRZCallRing3 if we're
+; in the middle of a ring-3 call. Another difference is the function pointer and
+; its arguments. This has to do with resuming code and the stack frame of the caller.
+;
+; vmmR0CallRing3SetJmp2 and vmmR0CallRing3SetJmpEx are additional global names
+; for this same entry point.
+;
+; @returns  VINF_SUCCESS on success or whatever is passed to vmmR0CallRing3LongJmp.
+;           VERR_VMM_SET_JMP_ABORTED_RESUME if the resume sanity checks fail.
+; @param    pJmpBuf msc:rcx gcc:rdi x86:[esp+0x04]     Our jmp_buf.
+; @param    pfn     msc:rdx gcc:rsi x86:[esp+0x08]     The function to be called when not resuming.
+; @param    pvUser1 msc:r8  gcc:rdx x86:[esp+0x0c]     The first argument of that function.
+; @param    pvUser2 msc:r9  gcc:rcx x86:[esp+0x10]     The second argument of that function.
+;
+BEGINPROC vmmR0CallRing3SetJmp
+GLOBALNAME vmmR0CallRing3SetJmp2
+GLOBALNAME vmmR0CallRing3SetJmpEx
+        ;
+        ; Save the registers.
+        ;
+        mov     edx, [esp + 4h]         ; pJmpBuf
+        mov     [xDX + VMMR0JMPBUF.ebx], ebx
+        mov     [xDX + VMMR0JMPBUF.esi], esi
+        mov     [xDX + VMMR0JMPBUF.edi], edi
+        mov     [xDX + VMMR0JMPBUF.ebp], ebp
+        mov     xAX, [esp]
+        mov     [xDX + VMMR0JMPBUF.eip], xAX
+        lea     ecx, [esp + 4]          ; (used in resume)
+        mov     [xDX + VMMR0JMPBUF.esp], ecx
+        pushf
+        pop     xAX
+        mov     [xDX + VMMR0JMPBUF.eflags], xAX
+
+        ;
+        ; If we're not in a ring-3 call, call pfn and return.
+        ;
+        test    byte [xDX + VMMR0JMPBUF.fInRing3Call], 1
+        jnz     .resume
+
+        mov     ebx, edx                ; pJmpBuf -> ebx (persistent reg)
+%ifdef VMM_R0_SWITCH_STACK
+        mov     esi, [ebx + VMMR0JMPBUF.pvSavedStack]
+        test    esi, esi
+        jz      .entry_error
+ %ifdef VBOX_STRICT
+        cmp     dword [esi], 0h
+        jne     .entry_error
+        mov     edx, esi
+        mov     edi, esi
+        mov     ecx, VMM_STACK_SIZE / 4
+        mov     eax, STACK_PADDING
+        repne stosd
+ %endif
+        ; esi = frame at the top of the alternative stack; the four dwords at
+        ; +10h..+1ch hold the markers and the pJmpBuf pointer checked by .resume.
+        lea     esi, [esi + VMM_STACK_SIZE - 32]
+        mov     [esi + 1ch], dword 0deadbeefh ; Marker 1.
+        mov     [esi + 18h], ebx        ; Save pJmpBuf pointer.
+        mov     [esi + 14h], dword 00c00ffeeh ; Marker 2.
+        mov     [esi + 10h], dword 0f00dbeefh ; Marker 3.
+        mov     edx, [esp + 10h]        ; pvArg2
+        mov     ecx, [esp + 0ch]        ; pvArg1
+        mov     eax, [esp + 08h]        ; pfn
+ %if 1 ; Use this to eat up some extra stack - handy for finding paths using lots of stack.
+  %define FRAME_OFFSET 0
+ %else
+  %define FRAME_OFFSET 1024
+ %endif
+        mov     [esi - FRAME_OFFSET + 04h], edx
+        mov     [esi - FRAME_OFFSET      ], ecx
+        lea     esp, [esi - FRAME_OFFSET] ; Switch stack!
+        call    eax
+        and     dword [esi + 1ch], byte 0 ; reset marker.
+
+ %ifdef VBOX_STRICT
+        ; Calc stack usage and check for overflows.
+        mov     edi, [ebx + VMMR0JMPBUF.pvSavedStack]
+        cmp     dword [edi], STACK_PADDING ; Check for obvious stack overflow.
+        jne     .stack_overflow
+        mov     esi, eax                ; save eax
+        mov     eax, STACK_PADDING
+        mov     ecx, VMM_STACK_SIZE / 4
+        cld
+        repe scasd
+        shl     ecx, 2                  ; *4
+        cmp     ecx, VMM_STACK_SIZE - 64 ; Less than 64 bytes left -> overflow as well.
+        mov     eax, esi                ; restore eax in case of overflow (esi remains used)
+        jae     .stack_overflow_almost
+
+        ; Update stack usage statistics.
+        cmp     ecx, [ebx + VMMR0JMPBUF.cbUsedMax] ; New max usage?
+        jle     .no_used_max
+        mov     [ebx + VMMR0JMPBUF.cbUsedMax], ecx
+.no_used_max:
+        ; To simplify the average stuff, just historize before we hit div errors.
+        inc     dword [ebx + VMMR0JMPBUF.cUsedTotal]
+        test    [ebx + VMMR0JMPBUF.cUsedTotal], dword 0c0000000h
+        jz      .no_historize
+        mov     dword [ebx + VMMR0JMPBUF.cUsedTotal], 2
+        mov     edi, [ebx + VMMR0JMPBUF.cbUsedAvg]
+        mov     [ebx + VMMR0JMPBUF.cbUsedTotal], edi
+        mov     dword [ebx + VMMR0JMPBUF.cbUsedTotal + 4], 0
+.no_historize:
+        add     [ebx + VMMR0JMPBUF.cbUsedTotal], ecx
+        adc     dword [ebx + VMMR0JMPBUF.cbUsedTotal + 4], 0
+        mov     eax, [ebx + VMMR0JMPBUF.cbUsedTotal]
+        mov     edx, [ebx + VMMR0JMPBUF.cbUsedTotal + 4]
+        mov     edi, [ebx + VMMR0JMPBUF.cUsedTotal]
+        div     edi
+        mov     [ebx + VMMR0JMPBUF.cbUsedAvg], eax
+
+        mov     eax, esi                ; restore eax (final, esi released)
+
+        mov     edi, [ebx + VMMR0JMPBUF.pvSavedStack]
+        mov     dword [edi], 0h         ; Reset the overflow marker.
+ %endif ; VBOX_STRICT
+
+%else  ; !VMM_R0_SWITCH_STACK
+        mov     ecx, [esp + 0ch]        ; pvArg1
+        mov     edx, [esp + 10h]        ; pvArg2
+        mov     eax, [esp + 08h]        ; pfn
+        sub     esp, 12                 ; align the stack on a 16-byte boundary.
+        mov     [esp      ], ecx
+        mov     [esp + 04h], edx
+        call    eax
+%endif ; !VMM_R0_SWITCH_STACK
+        mov     edx, ebx                ; pJmpBuf -> edx (volatile reg)
+
+        ;
+        ; Return like in the long jump but clear eip, no short cuts here.
+        ; eax holds the status to return (pfn's return value or an error code).
+        ;
+.proper_return:
+        mov     ebx, [xDX + VMMR0JMPBUF.ebx]
+        mov     esi, [xDX + VMMR0JMPBUF.esi]
+        mov     edi, [xDX + VMMR0JMPBUF.edi]
+        mov     ebp, [xDX + VMMR0JMPBUF.ebp]
+        mov     xCX, [xDX + VMMR0JMPBUF.eip]
+        and     dword [xDX + VMMR0JMPBUF.eip], byte 0 ; used for valid check.
+        mov     esp, [xDX + VMMR0JMPBUF.esp]
+        push    dword [xDX + VMMR0JMPBUF.eflags]
+        popf
+        jmp     xCX
+
+.entry_error:
+        mov     eax, VERR_VMM_SET_JMP_ERROR
+        jmp     .proper_return
+
+.stack_overflow:
+        mov     eax, VERR_VMM_SET_JMP_STACK_OVERFLOW
+        mov     edx, ebx
+        jmp     .proper_return
+
+.stack_overflow_almost:
+        mov     eax, VERR_VMM_SET_JMP_STACK_OVERFLOW
+        mov     edx, ebx
+        jmp     .proper_return
+
+        ;
+        ; Aborting resume.
+        ;
+.bad:
+        and     dword [xDX + VMMR0JMPBUF.eip], byte 0 ; used for valid check.
+        mov     edi, [xDX + VMMR0JMPBUF.edi]
+        mov     esi, [xDX + VMMR0JMPBUF.esi]
+        mov     ebx, [xDX + VMMR0JMPBUF.ebx]
+        mov     eax, VERR_VMM_SET_JMP_ABORTED_RESUME
+        ret
+
+        ;
+        ; Resume the VMMRZCallRing3 call.
+        ;
+.resume:
+        ; Sanity checks.
+%ifdef VMM_R0_SWITCH_STACK
+        mov     eax, [xDX + VMMR0JMPBUF.pvSavedStack]
+ %ifdef RT_STRICT
+        ; NOTE(review): the result of this compare is not consumed by any
+        ; branch before the next cmp overwrites the flags.
+        cmp     dword [eax], STACK_PADDING
+ %endif
+        lea     eax, [eax + VMM_STACK_SIZE - 32]
+        cmp     dword [eax + 1ch], 0deadbeefh ; Marker 1.
+        jne     .bad
+ %ifdef RT_STRICT
+        ; The marker frame is addressed via eax here; esi has not been loaded
+        ; on this path and still holds the caller's value.
+        cmp     [eax + 18h], edx        ; The saved pJmpBuf pointer.
+        jne     .bad
+        cmp     dword [eax + 14h], 00c00ffeeh ; Marker 2.
+        jne     .bad
+        cmp     dword [eax + 10h], 0f00dbeefh ; Marker 3.
+        jne     .bad
+ %endif
+%else  ; !VMM_R0_SWITCH_STACK
+        ; ecx still holds the stack pointer value stored in VMMR0JMPBUF.esp above.
+        cmp     ecx, [xDX + VMMR0JMPBUF.SpCheck]
+        jne     .bad
+.espCheck_ok:
+        ; The saved size must be within bounds, 4-byte aligned and equal to esp - SpResume.
+        mov     ecx, [xDX + VMMR0JMPBUF.cbSavedStack]
+        cmp     ecx, VMM_STACK_SIZE
+        ja      .bad
+        test    ecx, 3
+        jnz     .bad
+        mov     edi, [xDX + VMMR0JMPBUF.esp]
+        sub     edi, [xDX + VMMR0JMPBUF.SpResume]
+        cmp     ecx, edi
+        jne     .bad
+%endif
+
+%ifdef VMM_R0_SWITCH_STACK
+        ; Switch stack.
+        mov     esp, [xDX + VMMR0JMPBUF.SpResume]
+%else
+        ; Restore the stack.
+        mov     ecx, [xDX + VMMR0JMPBUF.cbSavedStack]
+        shr     ecx, 2
+        mov     esi, [xDX + VMMR0JMPBUF.pvSavedStack]
+        mov     edi, [xDX + VMMR0JMPBUF.SpResume]
+        mov     esp, edi
+        rep movsd
+%endif ; !VMM_R0_SWITCH_STACK
+        mov     byte [xDX + VMMR0JMPBUF.fInRing3Call], 0
+
+        ;
+        ; Continue where we left off.
+        ; The pops below mirror, in reverse order, the pushes done by
+        ; vmmR0CallRing3LongJmp; the final ret therefore returns to its caller.
+        ;
+%ifdef VBOX_STRICT
+        pop     eax                     ; magic
+        cmp     eax, RESUME_MAGIC
+        je      .magic_ok
+        ; Bad magic: store to address 0123h (presumably a deliberate fault - confirm).
+        mov     ecx, 0123h
+        mov     [ecx], edx
+.magic_ok:
+%endif
+        popf
+        pop     ebx
+        pop     esi
+        pop     edi
+        pop     ebp
+        xor     eax, eax                ; VINF_SUCCESS
+        ret
+ENDPROC vmmR0CallRing3SetJmp
+
+
+;;
+; Worker for VMMRZCallRing3.
+; This will save the stack and registers.
+;
+; On success control continues at the return address of the armed
+; vmmR0CallRing3SetJmp call with eax=rc. This call itself only returns later,
+; with VINF_SUCCESS, when vmmR0CallRing3SetJmp takes its .resume path.
+;
+; @returns  VERR_VMM_LONG_JMP_ERROR if the jump buffer isn't armed or a sanity
+;           check fails.
+; @param    pJmpBuf msc:rcx gcc:rdi x86:[ebp+8]     Pointer to the jump buffer.
+; @param    rc      msc:rdx gcc:rsi x86:[ebp+c]     The return code.
+;
+BEGINPROC vmmR0CallRing3LongJmp
+        ;
+        ; Save the registers on the stack.
+        ;
+        push    ebp
+        mov     ebp, esp
+        push    edi
+        push    esi
+        push    ebx
+        pushf
+%ifdef VBOX_STRICT
+        push    RESUME_MAGIC
+%endif
+
+        ;
+        ; Load parameters.
+        ;
+        mov     edx, [ebp + 08h]        ; pJmpBuf
+        mov     eax, [ebp + 0ch]        ; rc
+
+        ;
+        ; Is the jump buffer armed?
+        ;
+        cmp     dword [xDX + VMMR0JMPBUF.eip], byte 0
+        je      .nok
+
+        ;
+        ; Sanity checks.
+        ;
+        mov     edi, [xDX + VMMR0JMPBUF.pvSavedStack]
+        test    edi, edi                ; darwin may set this to 0.
+        jz      .nok
+        mov     [xDX + VMMR0JMPBUF.SpResume], esp
+%ifndef VMM_R0_SWITCH_STACK
+        ; ecx = number of stack bytes between here and the setjmp-time stack pointer.
+        mov     esi, esp
+        mov     ecx, [xDX + VMMR0JMPBUF.esp]
+        sub     ecx, esi
+
+        ; two sanity checks on the size.
+        cmp     ecx, VMM_STACK_SIZE     ; check max size.
+        jnbe    .nok
+
+        ;
+        ; Copy the stack.
+        ;
+        test    ecx, 3                  ; check alignment
+        jnz     .nok
+        mov     [xDX + VMMR0JMPBUF.cbSavedStack], ecx
+        shr     ecx, 2
+        rep movsd
+%endif ; !VMM_R0_SWITCH_STACK
+
+        ; Save a PC here to assist unwinding.
+.unwind_point:
+        mov     dword [xDX + VMMR0JMPBUF.SavedEipForUnwind], .unwind_point
+        mov     ecx, [xDX + VMMR0JMPBUF.ebp]
+        lea     ecx, [ecx + 4]
+        mov     [xDX + VMMR0JMPBUF.UnwindRetPcLocation], ecx
+
+        ; Save ESP & EBP to enable stack dumps
+        mov     ecx, ebp
+        mov     [xDX + VMMR0JMPBUF.SavedEbp], ecx
+        sub     ecx, 4
+        mov     [xDX + VMMR0JMPBUF.SavedEsp], ecx
+
+        ; store the last pieces of info.
+        mov     ecx, [xDX + VMMR0JMPBUF.esp]
+        mov     [xDX + VMMR0JMPBUF.SpCheck], ecx
+        mov     byte [xDX + VMMR0JMPBUF.fInRing3Call], 1
+
+        ;
+        ; Do the long jump.
+        ;
+        mov     ebx, [xDX + VMMR0JMPBUF.ebx]
+        mov     esi, [xDX + VMMR0JMPBUF.esi]
+        mov     edi, [xDX + VMMR0JMPBUF.edi]
+        mov     ebp, [xDX + VMMR0JMPBUF.ebp]
+        mov     ecx, [xDX + VMMR0JMPBUF.eip]
+        mov     [xDX + VMMR0JMPBUF.UnwindRetPcValue], ecx
+        mov     esp, [xDX + VMMR0JMPBUF.esp]
+        push    dword [xDX + VMMR0JMPBUF.eflags]
+        popf
+        jmp     ecx
+
+        ;
+        ; Failure
+        ;
+.nok:
+%ifdef VBOX_STRICT
+        pop     eax                     ; magic
+        cmp     eax, RESUME_MAGIC
+        je      .magic_ok
+        ; Bad magic: store to address 0123h (presumably a deliberate fault - confirm).
+        mov     ecx, 0123h
+        mov     [ecx], edx
+.magic_ok:
+%endif
+        popf
+        pop     ebx
+        pop     esi
+        pop     edi
+        mov     eax, VERR_VMM_LONG_JMP_ERROR
+        leave
+        ret
+ENDPROC vmmR0CallRing3LongJmp
+
+
+;;
+; Internal R0 logger worker: Logger wrapper.
+;
+; Pushes a zero as an extra leading argument and calls RTLogLogger, then
+; drops that argument again.
+;
+; @cproto VMMR0DECL(void) vmmR0LoggerWrapper(const char *pszFormat, ...)
+;
+EXPORTEDNAME vmmR0LoggerWrapper
+        push    0                       ; assumes we're the wrapper for a default instance.
+        call    NAME(RTLogLogger)
+        add     esp, byte 4
+        ret
+ENDPROC vmmR0LoggerWrapper
+
diff --git a/src/VBox/VMM/VMMR0/VMMR0TripleFaultHack.cpp b/src/VBox/VMM/VMMR0/VMMR0TripleFaultHack.cpp
new file mode 100644
index 00000000..bcafbd96
--- /dev/null
+++ b/src/VBox/VMM/VMMR0/VMMR0TripleFaultHack.cpp
@@ -0,0 +1,209 @@
+/* $Id: VMMR0TripleFaultHack.cpp $ */
+/** @file
+ * VMM - Host Context Ring 0, Triple Fault Debugging Hack.
+ *
+ * Only use this when desperate. May not work on all systems, esp. newer ones,
+ * since it requires BIOS support for the warm reset vector at 0467h.
+ */
+
+/*
+ * Copyright (C) 2011-2019 Oracle Corporation
+ *
+ * This file is part of VirtualBox Open Source Edition (OSE), as
+ * available from http://www.virtualbox.org. This file is free software;
+ * you can redistribute it and/or modify it under the terms of the GNU
+ * General Public License (GPL) as published by the Free Software
+ * Foundation, in version 2 as it comes in the "COPYING" file of the
+ * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
+ * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
+ */ + + +/********************************************************************************************************************************* +* Header Files * +*********************************************************************************************************************************/ +#define LOG_GROUP LOG_GROUP_VMM +#include <VBox/vmm/vmm.h> +#include "VMMInternal.h" +#include <VBox/param.h> + +#include <iprt/asm-amd64-x86.h> +#include <iprt/assert.h> +#include <iprt/memobj.h> +#include <iprt/mem.h> +#include <iprt/string.h> + + +/********************************************************************************************************************************* +* Global Variables * +*********************************************************************************************************************************/ +static RTR0MEMOBJ g_hMemPage0; +static RTR0MEMOBJ g_hMapPage0; +static uint8_t *g_pbPage0; + +static RTR0MEMOBJ g_hMemLowCore; +static RTR0MEMOBJ g_hMapLowCore; +static uint8_t *g_pbLowCore; +static RTHCPHYS g_HCPhysLowCore; + +/** @name For restoring memory we've overwritten. + * @{ */ +static uint32_t g_u32SavedVector; +static uint16_t g_u16SavedCadIndicator; +static void *g_pvSavedLowCore; +/** @} */ + + +/********************************************************************************************************************************* +* Internal Functions * +*********************************************************************************************************************************/ +/* VMMR0TripleFaultHackA.asm */ +DECLASM(void) vmmR0TripleFaultHackStart(void); +DECLASM(void) vmmR0TripleFaultHackEnd(void); +DECLASM(void) vmmR0TripleFaultHackTripleFault(void); + + +/** + * Initalizes the triple fault / boot hack. + * + * Always call vmmR0TripleFaultHackTerm to clean up, even when this call fails. + * + * @returns VBox status code. + */ +int vmmR0TripleFaultHackInit(void) +{ + /* + * Map the first page. 
+ */ + int rc = RTR0MemObjEnterPhys(&g_hMemPage0, 0, PAGE_SIZE, RTMEM_CACHE_POLICY_DONT_CARE); + AssertRCReturn(rc, rc); + rc = RTR0MemObjMapKernel(&g_hMapPage0, g_hMemPage0, (void *)-1, 0, RTMEM_PROT_READ | RTMEM_PROT_WRITE); + AssertRCReturn(rc, rc); + g_pbPage0 = (uint8_t *)RTR0MemObjAddress(g_hMapPage0); + LogRel(("0040:0067 = %04x:%04x\n", RT_MAKE_U16(g_pbPage0[0x467+2], g_pbPage0[0x467+3]), RT_MAKE_U16(g_pbPage0[0x467+0], g_pbPage0[0x467+1]) )); + + /* + * Allocate some "low core" memory. If that fails, just grab some memory. + */ + //rc = RTR0MemObjAllocPhys(&g_hMemLowCore, PAGE_SIZE, _1M - 1); + //__debugbreak(); + rc = RTR0MemObjEnterPhys(&g_hMemLowCore, 0x7000, PAGE_SIZE, RTMEM_CACHE_POLICY_DONT_CARE); + AssertRCReturn(rc, rc); + rc = RTR0MemObjMapKernel(&g_hMapLowCore, g_hMemLowCore, (void *)-1, 0, RTMEM_PROT_READ | RTMEM_PROT_WRITE); + AssertRCReturn(rc, rc); + g_pbLowCore = (uint8_t *)RTR0MemObjAddress(g_hMapLowCore); + g_HCPhysLowCore = RTR0MemObjGetPagePhysAddr(g_hMapLowCore, 0); + LogRel(("Low core at %RHp mapped at %p\n", g_HCPhysLowCore, g_pbLowCore)); + + /* + * Save memory we'll be overwriting. + */ + g_pvSavedLowCore = RTMemAlloc(PAGE_SIZE); + AssertReturn(g_pvSavedLowCore, VERR_NO_MEMORY); + memcpy(g_pvSavedLowCore, g_pbLowCore, PAGE_SIZE); + + g_u32SavedVector = RT_MAKE_U32_FROM_U8(g_pbPage0[0x467], g_pbPage0[0x467+1], g_pbPage0[0x467+2], g_pbPage0[0x467+3]); + g_u16SavedCadIndicator = RT_MAKE_U16(g_pbPage0[0x472], g_pbPage0[0x472+1]); + + /* + * Install the code. + */ + size_t cbCode = (uintptr_t)&vmmR0TripleFaultHackEnd - (uintptr_t)&vmmR0TripleFaultHackStart; + AssertLogRelReturn(cbCode <= PAGE_SIZE, VERR_OUT_OF_RANGE); + memcpy(g_pbLowCore, &vmmR0TripleFaultHackStart, cbCode); + + g_pbPage0[0x467+0] = 0x00; + g_pbPage0[0x467+1] = 0x70; + g_pbPage0[0x467+2] = 0x00; + g_pbPage0[0x467+3] = 0x00; + + g_pbPage0[0x472+0] = 0x34; + g_pbPage0[0x472+1] = 0x12; + + /* + * Configure the status port and cmos shutdown command. 
+ */ + uint32_t fSaved = ASMIntDisableFlags(); + + ASMOutU8(0x70, 0x0f); + ASMOutU8(0x71, 0x0a); + + ASMOutU8(0x70, 0x05); + ASMInU8(0x71); + + ASMReloadCR3(); + ASMWriteBackAndInvalidateCaches(); + + ASMSetFlags(fSaved); + +#if 1 /* For testing & debugging. */ + vmmR0TripleFaultHackTripleFault(); +#endif + + return VINF_SUCCESS; +} + + +/** + * Try undo the harm done by the init function. + * + * This may leave the system in an unstable state since we might have been + * hijacking memory below 1MB that is in use by the kernel. + */ +void vmmR0TripleFaultHackTerm(void) +{ + /* + * Restore overwritten memory. + */ + if ( g_pvSavedLowCore + && g_pbLowCore) + memcpy(g_pbLowCore, g_pvSavedLowCore, PAGE_SIZE); + + if (g_pbPage0) + { + g_pbPage0[0x467+0] = RT_BYTE1(g_u32SavedVector); + g_pbPage0[0x467+1] = RT_BYTE2(g_u32SavedVector); + g_pbPage0[0x467+2] = RT_BYTE3(g_u32SavedVector); + g_pbPage0[0x467+3] = RT_BYTE4(g_u32SavedVector); + + g_pbPage0[0x472+0] = RT_BYTE1(g_u16SavedCadIndicator); + g_pbPage0[0x472+1] = RT_BYTE2(g_u16SavedCadIndicator); + } + + /* + * Fix the CMOS. + */ + if (g_pvSavedLowCore) + { + uint32_t fSaved = ASMIntDisableFlags(); + + ASMOutU8(0x70, 0x0f); + ASMOutU8(0x71, 0x0a); + + ASMOutU8(0x70, 0x00); + ASMInU8(0x71); + + ASMReloadCR3(); + ASMWriteBackAndInvalidateCaches(); + + ASMSetFlags(fSaved); + } + + /* + * Release resources. 
+ */ + RTMemFree(g_pvSavedLowCore); + g_pvSavedLowCore = NULL; + + RTR0MemObjFree(g_hMemLowCore, true /*fFreeMappings*/); + g_hMemLowCore = NIL_RTR0MEMOBJ; + g_hMapLowCore = NIL_RTR0MEMOBJ; + g_pbLowCore = NULL; + g_HCPhysLowCore = NIL_RTHCPHYS; + + RTR0MemObjFree(g_hMemPage0, true /*fFreeMappings*/); + g_hMemPage0 = NIL_RTR0MEMOBJ; + g_hMapPage0 = NIL_RTR0MEMOBJ; + g_pbPage0 = NULL; +} + diff --git a/src/VBox/VMM/VMMR0/VMMR0TripleFaultHackA.asm b/src/VBox/VMM/VMMR0/VMMR0TripleFaultHackA.asm new file mode 100644 index 00000000..64817920 --- /dev/null +++ b/src/VBox/VMM/VMMR0/VMMR0TripleFaultHackA.asm @@ -0,0 +1,264 @@ +; $Id: VMMR0TripleFaultHackA.asm $ +;; @file +; VMM - Host Context Ring 0, Assembly Code for The Triple Fault Debugging Hack. +; + +; +; Copyright (C) 2011-2019 Oracle Corporation +; +; This file is part of VirtualBox Open Source Edition (OSE), as +; available from http://www.virtualbox.org. This file is free software; +; you can redistribute it and/or modify it under the terms of the GNU +; General Public License (GPL) as published by the Free Software +; Foundation, in version 2 as it comes in the "COPYING" file of the +; VirtualBox OSE distribution. VirtualBox OSE is distributed in the +; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind. +; + +;******************************************************************************* +;* Header Files * +;******************************************************************************* +%include "VBox/asmdefs.mac" + + +BEGINCODE +GLOBALNAME vmmR0TripleFaultHackStart +%define CALC_ADDR(a_Addr) ( (a_Addr) - NAME(vmmR0TripleFaultHackStart) + 07000h ) + + +BITS 16 +BEGINPROC vmmR0TripleFaultHack + ; Set up stack. + cli ; paranoia + mov sp, 0ffffh + mov ax, cs + mov ss, ax + mov ds, ax + mov es, ax + cld ; paranoia + + COM_INIT + + ; Beep and say hello to the post-reset world. 
+        call    NAME(vmmR0TripleFaultHackBeep)
+        mov     si, CALC_ADDR(.s_szHello)
+        call    NAME(vmmR0TripleFaultHackPrint)
+
+        ; Nothing more to do; halt forever.
+.forever:
+        hlt
+        jmp     .forever
+
+.s_szHello:
+        db      'Hello post-reset world', 0ah, 0dh, 0
+ENDPROC vmmR0TripleFaultHack
+
+;;
+; Prints a zero terminated string, one character at a time, using
+; vmmR0TripleFaultHackPrintCh.
+;
+; @param    ds:si   The zero terminated string.
+; @remarks  eax and esi are preserved.
+;
+;; ds:si = zero terminated string.
+BEGINPROC vmmR0TripleFaultHackPrint
+        push    eax
+        push    esi
+
+.outer_loop:
+        lodsb
+        cmp     al, 0
+        je      .done
+        call    NAME(vmmR0TripleFaultHackPrintCh)
+        jmp     .outer_loop
+
+.done:
+        pop     esi
+        pop     eax
+        ret
+ENDPROC vmmR0TripleFaultHackPrint
+
+
+;;
+; Writes one character to the port at VBOX_UART_BASE.
+;
+; Polls VBOX_UART_BASE + 5 for bit 20h before and after the write
+; (presumably the UART transmitter-empty status bit - confirm). Each poll
+; gives up after _1G iterations, and the character is written regardless.
+;
+; @param    al      The character to print.
+; @remarks  eax, edx and ecx are preserved.
+;
+;; al = char to print
+BEGINPROC vmmR0TripleFaultHackPrintCh
+        push    eax
+        push    edx
+        push    ecx
+        mov     ah, al                  ; save char.
+
+        ; Wait for status.
+        mov     ecx, _1G
+        mov     dx, VBOX_UART_BASE + 5
+.pre_status:
+        in      al, dx
+        test    al, 20h
+        jnz     .put_char
+        dec     ecx
+        jnz     .pre_status
+
+        ; Write the character.
+.put_char:
+        mov     al, ah
+        mov     dx, VBOX_UART_BASE
+        out     dx, al
+
+        ; Wait for status.
+        mov     ecx, _1G
+        mov     dx, VBOX_UART_BASE + 5
+.post_status:
+        in      al, dx
+        test    al, 20h
+        jnz     .done
+        dec     ecx
+        jnz     .post_status
+
+.done:
+        pop     ecx
+        pop     edx
+        pop     eax
+        ret
+ENDPROC vmmR0TripleFaultHackPrintCh
+
+;;
+; make a 440 BEEP.
+;
+; Programs the timer via ports 43h/42h, sets the two low bits of port 61h,
+; spins in a delay loop and then clears those bits again.
+;
+; @remarks  eax, edx and ecx are preserved.
+;
+BEGINPROC vmmR0TripleFaultHackBeep
+        push    eax
+        push    edx
+        push    ecx
+
+        ; program PIT(1) and stuff.
+        mov     al, 10110110b
+        out     43h, al
+        mov     ax, 0a79h               ; A = 440
+        out     42h, al                 ; low byte first...
+        shr     ax, 8
+        out     42h, al                 ; ...then the high byte.
+
+        in      al, 61h
+        or      al, 3
+        out     61h, al
+
+        ; delay
+        mov     ecx, _1G
+.delay:
+        ; inc + dec + dec: a net decrement of one per iteration.
+        inc     ecx
+        dec     ecx
+        dec     ecx
+        jnz     .delay
+
+        ; shut up speaker.
+        in      al, 61h
+        and     al, 11111100b
+        out     61h, al
+
+.done:
+        pop     ecx
+        pop     edx
+        pop     eax
+        ret
+ENDPROC vmmR0TripleFaultHackBeep
+
+
+; End of the code addressed via CALC_ADDR (i.e. relative to
+; vmmR0TripleFaultHackStart and a base of 07000h).
+GLOBALNAME vmmR0TripleFaultHackEnd
+
+
+
+
+;;;
+;;;
+;;;
+;;;
+;;;
+
+
+
+BITS ARCH_BITS
+
+;;
+; Waits until neither status bit 1 (KBD_STAT_OBF) nor bit 2 (KBD_STAT_IBF) is
+; set in port 64h, reading and discarding port 60h while KBD_STAT_OBF is set.
+;
+; @remarks  xAX is preserved.
+;
+BEGINPROC vmmR0TripleFaultHackKbdWait
+        push    xAX
+
+.check_status:
+        in      al, 64h
+        test    al, 1                   ; KBD_STAT_OBF
+        jnz     .read_data_and_status
+        test    al, 2                   ; KBD_STAT_IBF
+        jnz     .check_status
+
+        pop     xAX
+        ret
+
+.read_data_and_status:
+        in      al, 60h
+        jmp     .check_status
+ENDPROC vmmR0TripleFaultHackKbdWait
+
+
+;;
+; Writes a command to port 64h and reads the reply from port 60h.
+;
+; @returns  al = the data byte read.
+; @param    al      The command.
+; @remarks  Loops without a timeout until KBD_STAT_OBF is set.
+;
+BEGINPROC vmmR0TripleFaultHackKbdRead
+        out     64h, al                 ; Write the command.
+
+.check_status:
+        in      al, 64h
+        test    al, 1                   ; KBD_STAT_OBF
+        jz      .check_status
+
+        in      al, 60h                 ; Read the data.
+        ret
+ENDPROC vmmR0TripleFaultHackKbdRead
+
+
+;;
+; Writes a command to port 64h followed by a data byte to port 60h, waiting
+; via vmmR0TripleFaultHackKbdWait after each write.
+;
+; @param    al      The command.
+; @param    ah      The data byte.
+; @remarks  al and ah are swapped for the data write and swapped back before
+;           returning.
+;
+BEGINPROC vmmR0TripleFaultHackKbdWrite
+        out     64h, al                 ; Write the command.
+        call    NAME(vmmR0TripleFaultHackKbdWait)
+
+        xchg    al, ah
+        out     60h, al                 ; Write the data.
+        call    NAME(vmmR0TripleFaultHackKbdWait)
+        xchg    al, ah
+
+        ret
+ENDPROC vmmR0TripleFaultHackKbdWrite
+
+
+
+;;
+; Attempts to trigger the triple fault.
+;
+; Emits "Bye!" plus line feed and carriage return via COM_CHAR, then (in the
+; enabled "%if 1" variant) loads the IDT register from four zeroed stack slots
+; and executes int3. The two other variants (port 92h, keyboard controller
+; output port) are compiled out.
+;
+BEGINPROC vmmR0TripleFaultHackTripleFault
+        push    xAX
+        push    xSI
+
+        ; Four zero slots on the stack serve as the operand for lidt below.
+        xor     eax, eax
+        push    xAX
+        push    xAX
+        push    xAX
+        push    xAX
+
+        COM_CHAR 'B'
+        COM_CHAR 'y'
+        COM_CHAR 'e'
+        COM_CHAR '!'
+        COM_CHAR 0ah
+        COM_CHAR 0dh
+
+
+        ;call    NAME(vmmR0TripleFaultHackBeep32)
+%if 1
+        lidt    [xSP]
+%elif 0
+        in      al, 92h
+        or      al, 1
+        out     92h, al
+        in      al, 92h
+        cli
+        hlt
+%else
+        mov     al, 0d0h                ; KBD_CCMD_READ_OUTPORT
+        call    NAME(vmmR0TripleFaultHackKbdRead)
+        mov     ah, 0feh
+        and     ah, al
+        mov     al, 0d1h                ; KBD_CCMD_WRITE_OUTPORT
+        call    NAME(vmmR0TripleFaultHackKbdWrite)
+        cli
+        hlt
+%endif
+        int3
+
+        ; Only reached if the above did not reset the machine.
+        pop     xAX
+        pop     xAX
+        pop     xAX
+        pop     xAX
+
+        pop     xSI
+        pop     xAX
+        ret
+ENDPROC vmmR0TripleFaultHackTripleFault
+